-rw-r--r-- CREDITS 6
-rw-r--r-- Documentation/filesystems/gfs2.txt 43
-rw-r--r-- MAINTAINERS 18
-rw-r--r-- fs/Kconfig 2
-rw-r--r-- fs/Makefile 2
-rw-r--r-- fs/configfs/item.c 2
-rw-r--r-- fs/dlm/Kconfig 21
-rw-r--r-- fs/dlm/Makefile 19
-rw-r--r-- fs/dlm/ast.c 173
-rw-r--r-- fs/dlm/ast.h 26
-rw-r--r-- fs/dlm/config.c 789
-rw-r--r-- fs/dlm/config.h 42
-rw-r--r-- fs/dlm/debug_fs.c 387
-rw-r--r-- fs/dlm/dir.c 423
-rw-r--r-- fs/dlm/dir.h 30
-rw-r--r-- fs/dlm/dlm_internal.h 543
-rw-r--r-- fs/dlm/lock.c 3869
-rw-r--r-- fs/dlm/lock.h 62
-rw-r--r-- fs/dlm/lockspace.c 717
-rw-r--r-- fs/dlm/lockspace.h 25
-rw-r--r-- fs/dlm/lowcomms.c 1238
-rw-r--r-- fs/dlm/lowcomms.h 26
-rw-r--r-- fs/dlm/lvb_table.h 18
-rw-r--r-- fs/dlm/main.c 97
-rw-r--r-- fs/dlm/member.c 327
-rw-r--r-- fs/dlm/member.h 24
-rw-r--r-- fs/dlm/memory.c 116
-rw-r--r-- fs/dlm/memory.h 29
-rw-r--r-- fs/dlm/midcomms.c 140
-rw-r--r-- fs/dlm/midcomms.h 21
-rw-r--r-- fs/dlm/rcom.c 472
-rw-r--r-- fs/dlm/rcom.h 24
-rw-r--r-- fs/dlm/recover.c 765
-rw-r--r-- fs/dlm/recover.h 34
-rw-r--r-- fs/dlm/recoverd.c 290
-rw-r--r-- fs/dlm/recoverd.h 24
-rw-r--r-- fs/dlm/requestqueue.c 184
-rw-r--r-- fs/dlm/requestqueue.h 22
-rw-r--r-- fs/dlm/user.c 785
-rw-r--r-- fs/dlm/user.h 16
-rw-r--r-- fs/dlm/util.c 161
-rw-r--r-- fs/dlm/util.h 22
-rw-r--r-- fs/gfs2/Kconfig 44
-rw-r--r-- fs/gfs2/Makefile 10
-rw-r--r-- fs/gfs2/acl.c 313
-rw-r--r-- fs/gfs2/acl.h 37
-rw-r--r-- fs/gfs2/bmap.c 1236
-rw-r--r-- fs/gfs2/bmap.h 27
-rw-r--r-- fs/gfs2/daemon.c 196
-rw-r--r-- fs/gfs2/daemon.h 19
-rw-r--r-- fs/gfs2/dir.c 1976
-rw-r--r-- fs/gfs2/dir.h 73
-rw-r--r-- fs/gfs2/eaops.c 230
-rw-r--r-- fs/gfs2/eaops.h 29
-rw-r--r-- fs/gfs2/eattr.c 1548
-rw-r--r-- fs/gfs2/eattr.h 97
-rw-r--r-- fs/gfs2/format.h 21
-rw-r--r-- fs/gfs2/gfs2.h 31
-rw-r--r-- fs/gfs2/glock.c 2274
-rw-r--r-- fs/gfs2/glock.h 152
-rw-r--r-- fs/gfs2/glops.c 564
-rw-r--r-- fs/gfs2/glops.h 23
-rw-r--r-- fs/gfs2/incore.h 660
-rw-r--r-- fs/gfs2/inode.c 1344
-rw-r--r-- fs/gfs2/inode.h 56
-rw-r--r-- fs/gfs2/lm.c 244
-rw-r--r-- fs/gfs2/lm.h 41
-rw-r--r-- fs/gfs2/lm_interface.h 290
-rw-r--r-- fs/gfs2/locking.c 191
-rw-r--r-- fs/gfs2/locking/dlm/Makefile 3
-rw-r--r-- fs/gfs2/locking/dlm/lock.c 541
-rw-r--r-- fs/gfs2/locking/dlm/lock_dlm.h 188
-rw-r--r-- fs/gfs2/locking/dlm/main.c 64
-rw-r--r-- fs/gfs2/locking/dlm/mount.c 256
-rw-r--r-- fs/gfs2/locking/dlm/plock.c 302
-rw-r--r-- fs/gfs2/locking/dlm/sysfs.c 225
-rw-r--r-- fs/gfs2/locking/dlm/thread.c 359
-rw-r--r-- fs/gfs2/locking/nolock/Makefile 3
-rw-r--r-- fs/gfs2/locking/nolock/main.c 259
-rw-r--r-- fs/gfs2/log.c 603
-rw-r--r-- fs/gfs2/log.h 61
-rw-r--r-- fs/gfs2/lops.c 800
-rw-r--r-- fs/gfs2/lops.h 96
-rw-r--r-- fs/gfs2/lvb.c 45
-rw-r--r-- fs/gfs2/lvb.h 19
-rw-r--r-- fs/gfs2/main.c 127
-rw-r--r-- fs/gfs2/meta_io.c 779
-rw-r--r-- fs/gfs2/meta_io.h 74
-rw-r--r-- fs/gfs2/mount.c 214
-rw-r--r-- fs/gfs2/mount.h 15
-rw-r--r-- fs/gfs2/ondisk.c 308
-rw-r--r-- fs/gfs2/ops_address.c 794
-rw-r--r-- fs/gfs2/ops_address.h 18
-rw-r--r-- fs/gfs2/ops_dentry.c 123
-rw-r--r-- fs/gfs2/ops_dentry.h 15
-rw-r--r-- fs/gfs2/ops_export.c 293
-rw-r--r-- fs/gfs2/ops_export.h 19
-rw-r--r-- fs/gfs2/ops_file.c 812
-rw-r--r-- fs/gfs2/ops_file.h 20
-rw-r--r-- fs/gfs2/ops_fstype.c 980
-rw-r--r-- fs/gfs2/ops_fstype.h 16
-rw-r--r-- fs/gfs2/ops_inode.c 1165
-rw-r--r-- fs/gfs2/ops_inode.h 18
-rw-r--r-- fs/gfs2/ops_super.c 472
-rw-r--r-- fs/gfs2/ops_super.h 15
-rw-r--r-- fs/gfs2/ops_vm.c 188
-rw-r--r-- fs/gfs2/ops_vm.h 16
-rw-r--r-- fs/gfs2/quota.c 1286
-rw-r--r-- fs/gfs2/quota.h 32
-rw-r--r-- fs/gfs2/recovery.c 573
-rw-r--r-- fs/gfs2/recovery.h 32
-rw-r--r-- fs/gfs2/rgrp.c 1525
-rw-r--r-- fs/gfs2/rgrp.h 62
-rw-r--r-- fs/gfs2/super.c 979
-rw-r--r-- fs/gfs2/super.h 52
-rw-r--r-- fs/gfs2/sys.c 579
-rw-r--r-- fs/gfs2/sys.h 24
-rw-r--r-- fs/gfs2/trans.c 184
-rw-r--r-- fs/gfs2/trans.h 34
-rw-r--r-- fs/gfs2/util.c 245
-rw-r--r-- fs/gfs2/util.h 169
-rw-r--r-- include/linux/Kbuild 33
-rw-r--r-- include/linux/dlm.h 302
-rw-r--r-- include/linux/dlm_device.h 86
-rw-r--r-- include/linux/fs.h 3
-rw-r--r-- include/linux/gfs2_ondisk.h 443
-rw-r--r-- include/linux/iflags.h 102
-rw-r--r-- include/linux/kernel.h 1
-rw-r--r-- include/linux/lock_dlm_plock.h 41
-rw-r--r-- mm/filemap.c 3
-rw-r--r-- mm/readahead.c 1
131 files changed, 40835 insertions(+), 21 deletions(-)
diff --git a/CREDITS b/CREDITS
index 0fe904ebb7c7..30308027e30b 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3541,11 +3541,11 @@ S: Fargo, North Dakota 58122
 S: USA
 
 N: Steven Whitehouse
-E: SteveW@ACM.org
+E: steve@chygwyn.com
 W: http://www.chygwyn.com/~steve
-D: Linux DECnet project: http://www.sucs.swan.ac.uk/~rohan/DECnet/index.html
+D: Linux DECnet project
 D: Minor debugging of other networking protocols.
-D: Misc bug fixes and filesystem development
+D: Misc bug fixes and GFS2 filesystem development
 
 N: Hans-Joachim Widmaier
 E: hjw@zvw.de
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 000000000000..593004b6bbab
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,43 @@
+Global File System
+------------------
+
+http://sources.redhat.com/cluster/
+
+GFS is a cluster file system. It allows a cluster of computers to
+simultaneously use a block device that is shared between them (with FC,
+iSCSI, NBD, etc). GFS reads and writes to the block device like a local
+file system, but also uses a lock module to allow the computers to
+coordinate their I/O so that file system consistency is maintained. One
+of the nifty features of GFS is perfect consistency -- changes made to
+the file system on one machine show up immediately on all other machines
+in the cluster.
+
+GFS uses interchangeable inter-node locking mechanisms. Different lock
+modules can plug into GFS and each file system selects the appropriate
+lock module at mount time. Lock modules include:
+
+  lock_nolock -- allows gfs to be used as a local file system
+
+  lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
+              The dlm is found at linux/fs/dlm/
+
+In addition to interfacing with an external locking manager, a gfs lock
+module is responsible for interacting with external cluster management
+systems. Lock_dlm depends on user space cluster management systems found
+at the URL above.
+
+To use gfs as a local file system, no external clustering systems are
+needed, simply:
+
+  $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
+  $ mount -t gfs2 /dev/block_device /dir
+
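+To use gfs across a cluster, the file system is instead made with the
+lock_dlm module. As a rough sketch (the cluster name, file system name
+and journal count below are only example values -- see the man pages
+listed below for the details):
+
+  $ mkfs.gfs2 -p lock_dlm -t mycluster:myfs -j 8 /dev/block_device
+  $ mount -t gfs2 /dev/block_device /dir
+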
+GFS2 is not on-disk compatible with previous versions of GFS.
+
+The following man pages can be found at the URL above:
+  gfs2_fsck	to repair a filesystem
+  gfs2_grow	to expand a filesystem online
+  gfs2_jadd	to add journals to a filesystem online
+  gfs2_tool	to manipulate, examine and tune a filesystem
+  gfs2_quota	to examine and change quota values in a filesystem
+  mount.gfs2	to help mount(8) mount a filesystem
+  mkfs.gfs2	to make a filesystem
diff --git a/MAINTAINERS b/MAINTAINERS
index 3bab239e82fe..2d036ffd961f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -875,6 +875,16 @@ M: jack@suse.cz
 L: linux-kernel@vger.kernel.org
 S: Maintained
 
+DISTRIBUTED LOCK MANAGER
+P: Patrick Caulfield
+M: pcaulfie@redhat.com
+P: David Teigland
+M: teigland@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
 DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
 P: Tobias Ringstrom
 M: tori@unhappy.mine.nu
@@ -1135,6 +1145,14 @@ M: khc@pm.waw.pl
 W: http://www.kernel.org/pub/linux/utils/net/hdlc/
 S: Maintained
 
+GFS2 FILE SYSTEM
+P: Steven Whitehouse
+M: swhiteho@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
 GIGASET ISDN DRIVERS
 P: Hansjoerg Lipp
 M: hjlipp@web.de
diff --git a/fs/Kconfig b/fs/Kconfig
index 3f00a9faabcb..ddc7462ddb56 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -323,6 +323,7 @@ config FS_POSIX_ACL
 	default n
 
 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
 
 config OCFS2_FS
 	tristate "OCFS2 file system support (EXPERIMENTAL)"
@@ -1930,6 +1931,7 @@ source "fs/partitions/Kconfig"
 endmenu
 
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 
 endmenu
 
diff --git a/fs/Makefile b/fs/Makefile
index 89135428a539..64df11047ccc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
@@ -102,3 +103,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..490f85b3fa59
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,21 @@
+menu "Distributed Lock Manager"
+	depends on INET && EXPERIMENTAL
+
+config DLM
+	tristate "Distributed Lock Manager (DLM)"
+	depends on IPV6 || IPV6=n
+	depends on IP_SCTP
+	select CONFIGFS_FS
+	help
+	  A general purpose distributed lock manager for kernel or userspace
+	  applications.
+
+config DLM_DEBUG
+	bool "DLM debugging"
+	depends on DLM
+	help
+	  Under the debugfs mount point, the name of each lockspace will
+	  appear as a file in the "dlm" directory.  The output is the
+	  list of resources and locks the local node knows about.
+
+endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_DLM) += dlm.o
+dlm-y := ast.o \
+	config.o \
+	dir.o \
+	lock.o \
+	lockspace.o \
+	lowcomms.o \
+	main.o \
+	member.o \
+	memory.o \
+	midcomms.o \
+	rcom.o \
+	recover.o \
+	recoverd.o \
+	requestqueue.o \
+	user.o \
+	util.o
+dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
+
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lock.h"
+#include "user.h"
+
+#define WAKE_ASTS 0
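+
+/*
+ * Completion and blocking callbacks ("asts") are not run in the context
+ * that generates them: dlm_add_ast() queues the lkb on ast_queue (taking
+ * a reference) and wakes the "dlm_astd" kthread, which delivers the
+ * callbacks from process context in process_asts() and drops the
+ * reference afterwards.
+ */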
+
+static struct list_head ast_queue;
+static spinlock_t ast_queue_lock;
+static struct task_struct *astd_task;
+static unsigned long astd_wakeflags;
+static struct mutex astd_running;
+
+
+void dlm_del_ast(struct dlm_lkb *lkb)
+{
+	spin_lock(&ast_queue_lock);
+	if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
+		list_del(&lkb->lkb_astqueue);
+	spin_unlock(&ast_queue_lock);
+}
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type)
+{
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		dlm_user_add_ast(lkb, type);
+		return;
+	}
+	DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
+
+	spin_lock(&ast_queue_lock);
+	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+		kref_get(&lkb->lkb_ref);
+		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+	}
+	lkb->lkb_ast_type |= type;
+	spin_unlock(&ast_queue_lock);
+
+	set_bit(WAKE_ASTS, &astd_wakeflags);
+	wake_up_process(astd_task);
+}
+
+static void process_asts(void)
+{
+	struct dlm_ls *ls = NULL;
+	struct dlm_rsb *r = NULL;
+	struct dlm_lkb *lkb;
+	void (*cast) (long param);
+	void (*bast) (long param, int mode);
+	int type = 0, found, bmode;
+
+	for (;;) {
+		found = 0;
+		spin_lock(&ast_queue_lock);
+		list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+			r = lkb->lkb_resource;
+			ls = r->res_ls;
+
+			if (dlm_locking_stopped(ls))
+				continue;
+
+			list_del(&lkb->lkb_astqueue);
+			type = lkb->lkb_ast_type;
+			lkb->lkb_ast_type = 0;
+			found = 1;
+			break;
+		}
+		spin_unlock(&ast_queue_lock);
+
+		if (!found)
+			break;
+
+		cast = lkb->lkb_astaddr;
+		bast = lkb->lkb_bastaddr;
+		bmode = lkb->lkb_bastmode;
+
+		if ((type & AST_COMP) && cast)
+			cast(lkb->lkb_astparam);
+
+		/* FIXME: Is it safe to look at lkb_grmode here
+		   without doing a lock_rsb() ?
+		   Look at other checks in v1 to avoid basts. */
+
+		if ((type & AST_BAST) && bast)
+			if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
+				bast(lkb->lkb_astparam, bmode);
+
+		/* this removes the reference added by dlm_add_ast
+		   and may result in the lkb being freed */
+		dlm_put_lkb(lkb);
+
+		schedule();
+	}
+}
+
+static inline int no_asts(void)
+{
+	int ret;
+
+	spin_lock(&ast_queue_lock);
+	ret = list_empty(&ast_queue);
+	spin_unlock(&ast_queue_lock);
+	return ret;
+}
+
+static int dlm_astd(void *data)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!test_bit(WAKE_ASTS, &astd_wakeflags))
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		mutex_lock(&astd_running);
+		if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+			process_asts();
+		mutex_unlock(&astd_running);
+	}
+	return 0;
+}
+
+void dlm_astd_wake(void)
+{
+	if (!no_asts()) {
+		set_bit(WAKE_ASTS, &astd_wakeflags);
+		wake_up_process(astd_task);
+	}
+}
+
+int dlm_astd_start(void)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	INIT_LIST_HEAD(&ast_queue);
+	spin_lock_init(&ast_queue_lock);
+	mutex_init(&astd_running);
+
+	p = kthread_run(dlm_astd, NULL, "dlm_astd");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+		astd_task = p;
+	return error;
+}
+
+void dlm_astd_stop(void)
+{
+	kthread_stop(astd_task);
+}
+
+void dlm_astd_suspend(void)
+{
+	mutex_lock(&astd_running);
+}
+
+void dlm_astd_resume(void)
+{
+	mutex_unlock(&astd_running);
+}
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_del_ast(struct dlm_lkb *lkb);
+
+void dlm_astd_wake(void);
+int dlm_astd_start(void);
+void dlm_astd_stop(void);
+void dlm_astd_suspend(void);
+void dlm_astd_resume(void);
+
+#endif
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <net/sock.h>
+
+#include "config.h"
+#include "lowcomms.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
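+
+/*
+ * Illustrative only (the names and values are examples, not defaults):
+ * user space builds the tree above with ordinary mkdir/echo, e.g.
+ *
+ *   mkdir -p /config/dlm/mycluster/spaces/myfs/nodes/1
+ *   echo 1 > /config/dlm/mycluster/spaces/myfs/nodes/1/nodeid
+ *   mkdir -p /config/dlm/mycluster/comms/1
+ *   echo 1 > /config/dlm/mycluster/comms/1/nodeid
+ *   echo 1 > /config/dlm/mycluster/comms/1/local
+ */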
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct comm *local_comm;
+
+struct clusters;
+struct cluster;
+struct spaces;
+struct space;
+struct comms;
+struct comm;
+struct nodes;
+struct node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct comm *cm, char *buf);
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct node *nd, char *buf);
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_weight_read(struct node *nd, char *buf);
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+
+enum {
+	COMM_ATTR_NODEID = 0,
+	COMM_ATTR_LOCAL,
+	COMM_ATTR_ADDR,
+};
+
+struct comm_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct comm *, char *);
+	ssize_t (*store)(struct comm *, const char *, size_t);
+};
+
+static struct comm_attribute comm_attr_nodeid = {
+	.attr = { .ca_owner = THIS_MODULE,
+		  .ca_name = "nodeid",
+		  .ca_mode = S_IRUGO | S_IWUSR },
+	.show = comm_nodeid_read,
+	.store = comm_nodeid_write,
+};
+
+static struct comm_attribute comm_attr_local = {
+	.attr = { .ca_owner = THIS_MODULE,
+		  .ca_name = "local",
+		  .ca_mode = S_IRUGO | S_IWUSR },
+	.show = comm_local_read,
+	.store = comm_local_write,
+};
+
+static struct comm_attribute comm_attr_addr = {
+	.attr = { .ca_owner = THIS_MODULE,
+		  .ca_name = "addr",
+		  .ca_mode = S_IRUGO | S_IWUSR },
+	.store = comm_addr_write,
+};
+
+static struct configfs_attribute *comm_attrs[] = {
+	[COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+	[COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+	[COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+	NULL,
+};
+
+enum {
+	NODE_ATTR_NODEID = 0,
+	NODE_ATTR_WEIGHT,
+};
+
+struct node_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct node *, char *);
+	ssize_t (*store)(struct node *, const char *, size_t);
+};
+
+static struct node_attribute node_attr_nodeid = {
+	.attr = { .ca_owner = THIS_MODULE,
+		  .ca_name = "nodeid",
+		  .ca_mode = S_IRUGO | S_IWUSR },
+	.show = node_nodeid_read,
+	.store = node_nodeid_write,
+};
+
+static struct node_attribute node_attr_weight = {
+	.attr = { .ca_owner = THIS_MODULE,
+		  .ca_name = "weight",
+		  .ca_mode = S_IRUGO | S_IWUSR },
+	.show = node_weight_read,
+	.store = node_weight_write,
+};
+
+static struct configfs_attribute *node_attrs[] = {
+	[NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+	[NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+	NULL,
+};
+
+struct clusters {
+	struct configfs_subsystem subsys;
+};
+
+struct cluster {
+	struct config_group group;
+};
+
+struct spaces {
+	struct config_group ss_group;
+};
+
+struct space {
+	struct config_group group;
+	struct list_head members;
+	struct mutex members_lock;
+	int members_count;
+};
+
+struct comms {
+	struct config_group cs_group;
+};
+
+struct comm {
+	struct config_item item;
+	int nodeid;
+	int local;
+	int addr_count;
+	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+struct nodes {
+	struct config_group ns_group;
+};
+
+struct node {
+	struct config_item item;
+	struct list_head list; /* space->members */
+	int nodeid;
+	int weight;
+};
+
+static struct configfs_group_operations clusters_ops = {
+	.make_group = make_cluster,
+	.drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+	.release = release_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+	.make_group = make_space,
+	.drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+	.release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+	.make_item = make_comm,
+	.drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+	.release = release_comm,
+	.show_attribute = show_comm,
+	.store_attribute = store_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+	.make_item = make_node,
+	.drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+	.release = release_node,
+	.show_attribute = show_node,
+	.store_attribute = store_node,
+};
+
+static struct config_item_type clusters_type = {
+	.ct_group_ops = &clusters_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type cluster_type = {
+	.ct_item_ops = &cluster_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type spaces_type = {
+	.ct_group_ops = &spaces_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type space_type = {
+	.ct_item_ops = &space_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comms_type = {
+	.ct_group_ops = &comms_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comm_type = {
+	.ct_item_ops = &comm_ops,
+	.ct_attrs = comm_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type nodes_type = {
+	.ct_group_ops = &nodes_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type node_type = {
+	.ct_item_ops = &node_ops,
+	.ct_attrs = node_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct cluster *to_cluster(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct cluster, group) :
+		   NULL;
+}
+
+static struct space *to_space(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct space, group) : NULL;
+}
+
+static struct comm *to_comm(struct config_item *i)
+{
+	return i ? container_of(i, struct comm, item) : NULL;
+}
+
+static struct node *to_node(struct config_item *i)
+{
+	return i ? container_of(i, struct node, item) : NULL;
+}
+
+static struct config_group *make_cluster(struct config_group *g,
+					 const char *name)
+{
+	struct cluster *cl = NULL;
+	struct spaces *sps = NULL;
+	struct comms *cms = NULL;
+	void *gps = NULL;
+
+	cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+	cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+
+	if (!cl || !gps || !sps || !cms)
+		goto fail;
+
+	config_group_init_type_name(&cl->group, name, &cluster_type);
+	config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
+	config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
+
+	cl->group.default_groups = gps;
+	cl->group.default_groups[0] = &sps->ss_group;
+	cl->group.default_groups[1] = &cms->cs_group;
+	cl->group.default_groups[2] = NULL;
+
+	space_list = &sps->ss_group;
+	comm_list = &cms->cs_group;
+	return &cl->group;
+
+ fail:
+	kfree(cl);
+	kfree(gps);
+	kfree(sps);
+	kfree(cms);
+	return NULL;
+}
+
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+	struct cluster *cl = to_cluster(i);
+	struct config_item *tmp;
+	int j;
+
+	for (j = 0; cl->group.default_groups[j]; j++) {
+		tmp = &cl->group.default_groups[j]->cg_item;
+		cl->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	space_list = NULL;
+	comm_list = NULL;
+
+	config_item_put(i);
+}
+
+static void release_cluster(struct config_item *i)
+{
+	struct cluster *cl = to_cluster(i);
+	kfree(cl->group.default_groups);
+	kfree(cl);
+}
+
+static struct config_group *make_space(struct config_group *g,
+				       const char *name)
+{
+	struct space *sp = NULL;
+	struct nodes *nds = NULL;
+	void *gps = NULL;
+
+	sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+	nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+
+	if (!sp || !gps || !nds)
+		goto fail;
+
+	config_group_init_type_name(&sp->group, name, &space_type);
+	config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+
+	sp->group.default_groups = gps;
+	sp->group.default_groups[0] = &nds->ns_group;
+	sp->group.default_groups[1] = NULL;
+
+	INIT_LIST_HEAD(&sp->members);
+	mutex_init(&sp->members_lock);
+	sp->members_count = 0;
+	return &sp->group;
+
+ fail:
+	kfree(sp);
+	kfree(gps);
+	kfree(nds);
+	return NULL;
+}
+
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+	struct space *sp = to_space(i);
+	struct config_item *tmp;
+	int j;
+
+	/* assert list_empty(&sp->members) */
+
+	for (j = 0; sp->group.default_groups[j]; j++) {
+		tmp = &sp->group.default_groups[j]->cg_item;
+		sp->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	config_item_put(i);
+}
+
+static void release_space(struct config_item *i)
+{
+	struct space *sp = to_space(i);
+	kfree(sp->group.default_groups);
+	kfree(sp);
+}
+
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+	struct comm *cm;
+
+	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+	if (!cm)
+		return NULL;
+
+	config_item_init_type_name(&cm->item, name, &comm_type);
+	cm->nodeid = -1;
+	cm->local = 0;
+	cm->addr_count = 0;
+	return &cm->item;
+}
+
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+	struct comm *cm = to_comm(i);
+	if (local_comm == cm)
+		local_comm = NULL;
+	dlm_lowcomms_close(cm->nodeid);
+	while (cm->addr_count--)
+		kfree(cm->addr[cm->addr_count]);
+	config_item_put(i);
+}
+
+static void release_comm(struct config_item *i)
+{
+	struct comm *cm = to_comm(i);
+	kfree(cm);
+}
+
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+	struct space *sp = to_space(g->cg_item.ci_parent);
+	struct node *nd;
+
+	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+	if (!nd)
+		return NULL;
+
+	config_item_init_type_name(&nd->item, name, &node_type);
+	nd->nodeid = -1;
+	nd->weight = 1;  /* default weight of 1 if none is set */
+
+	mutex_lock(&sp->members_lock);
+	list_add(&nd->list, &sp->members);
+	sp->members_count++;
+	mutex_unlock(&sp->members_lock);
+
+	return &nd->item;
+}
+
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+	struct space *sp = to_space(g->cg_item.ci_parent);
+	struct node *nd = to_node(i);
+
+	mutex_lock(&sp->members_lock);
+	list_del(&nd->list);
+	sp->members_count--;
+	mutex_unlock(&sp->members_lock);
+
+	config_item_put(i);
+}
+
+static void release_node(struct config_item *i)
+{
+	struct node *nd = to_node(i);
+	kfree(nd);
+}
+
+static struct clusters clusters_root = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "dlm",
+				.ci_type = &clusters_type,
+			},
+		},
+	},
+};
+
+int dlm_config_init(void)
+{
+	config_group_init(&clusters_root.subsys.su_group);
+	init_MUTEX(&clusters_root.subsys.su_sem);
+	return configfs_register_subsystem(&clusters_root.subsys);
+}
+
+void dlm_config_exit(void)
+{
+	configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct comm *cm = to_comm(i);
+	struct comm_attribute *cma =
+		container_of(a, struct comm_attribute, attr);
+	return cma->show ? cma->show(cm, buf) : 0;
+}
+
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct comm *cm = to_comm(i);
+	struct comm_attribute *cma =
+		container_of(a, struct comm_attribute, attr);
+	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+}
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->nodeid);
+}
+
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+{
+	cm->nodeid = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+static ssize_t comm_local_read(struct comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->local);
+}
+
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+{
+	cm->local = simple_strtol(buf, NULL, 0);
+	if (cm->local && !local_comm)
+		local_comm = cm;
+	return len;
+}
+
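+/* the "addr" attribute is written as a binary struct sockaddr_storage,
+   not as a text address, so the write length must match exactly */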
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+{
+	struct sockaddr_storage *addr;
+
+	if (len != sizeof(struct sockaddr_storage))
+		return -EINVAL;
+
+	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+		return -ENOSPC;
+
+	addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	memcpy(addr, buf, len);
+	cm->addr[cm->addr_count++] = addr;
+	return len;
+}
+
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct node *nd = to_node(i);
+	struct node_attribute *nda =
+		container_of(a, struct node_attribute, attr);
+	return nda->show ? nda->show(nd, buf) : 0;
+}
+
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct node *nd = to_node(i);
+	struct node_attribute *nda =
+		container_of(a, struct node_attribute, attr);
+	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
+}
+
+static ssize_t node_nodeid_read(struct node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->nodeid);
+}
+
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+{
+	nd->nodeid = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+static ssize_t node_weight_read(struct node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->weight);
+}
+
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+{
+	nd->weight = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+static struct space *get_space(char *name)
+{
+	if (!space_list)
+		return NULL;
+	return to_space(config_group_find_obj(space_list, name));
+}
+
+static void put_space(struct space *sp)
+{
+	config_item_put(&sp->group.cg_item);
+}
+
+static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+{
+	struct config_item *i;
+	struct comm *cm = NULL;
+	int found = 0;
+
+	if (!comm_list)
+		return NULL;
+
+	down(&clusters_root.subsys.su_sem);
+
+	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+		cm = to_comm(i);
+
+		if (nodeid) {
+			if (cm->nodeid != nodeid)
+				continue;
+			found = 1;
+			break;
+		} else {
+			if (!cm->addr_count ||
+			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+				continue;
+			found = 1;
+			break;
+		}
+	}
+	up(&clusters_root.subsys.su_sem);
+
+	if (found)
+		config_item_get(i);
+	else
+		cm = NULL;
+	return cm;
+}
+
+static void put_comm(struct comm *cm)
+{
+	config_item_put(&cm->item);
+}
+
+/* caller must free mem */
+int dlm_nodeid_list(char *lsname, int **ids_out)
+{
+	struct space *sp;
+	struct node *nd;
+	int i = 0, rv = 0;
+	int *ids;
+
+	sp = get_space(lsname);
+	if (!sp)
+		return -EEXIST;
+
+	mutex_lock(&sp->members_lock);
+	if (!sp->members_count) {
+		rv = 0;
+		goto out;
+	}
+
+	ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+	if (!ids) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	rv = sp->members_count;
+	list_for_each_entry(nd, &sp->members, list)
+		ids[i++] = nd->nodeid;
+
+	if (rv != i)
+		printk("bad nodeid count %d %d\n", rv, i);
+
+	*ids_out = ids;
+ out:
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+	return rv;
+}
+
+int dlm_node_weight(char *lsname, int nodeid)
+{
+	struct space *sp;
+	struct node *nd;
+	int w = -EEXIST;
+
+	sp = get_space(lsname);
+	if (!sp)
+		goto out;
+
+	mutex_lock(&sp->members_lock);
+	list_for_each_entry(nd, &sp->members, list) {
+		if (nd->nodeid != nodeid)
+			continue;
+		w = nd->weight;
+		break;
+	}
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+ out:
+	return w;
+}
+
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
+{
+	struct comm *cm = get_comm(nodeid, NULL);
+	if (!cm)
+		return -EEXIST;
+	if (!cm->addr_count)
+		return -ENOENT;
+	memcpy(addr, cm->addr[0], sizeof(*addr));
+	put_comm(cm);
+	return 0;
+}
+
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+	struct comm *cm = get_comm(0, addr);
+	if (!cm)
+		return -EEXIST;
+	*nodeid = cm->nodeid;
+	put_comm(cm);
+	return 0;
+}
+
+int dlm_our_nodeid(void)
+{
+	return local_comm ? local_comm->nodeid : 0;
+}
+
+/* num 0 is first addr, num 1 is second addr */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+	if (!local_comm)
+		return -1;
+	if (num + 1 > local_comm->addr_count)
+		return -1;
+	memcpy(addr, local_comm->addr[num], sizeof(*addr));
+	return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT      21064
+#define DEFAULT_BUFFER_SIZE    4096
+#define DEFAULT_RSBTBL_SIZE     256
+#define DEFAULT_LKBTBL_SIZE    1024
+#define DEFAULT_DIRTBL_SIZE     512
+#define DEFAULT_RECOVER_TIMER     5
+#define DEFAULT_TOSS_SECS        10
+#define DEFAULT_SCAN_SECS         5
+
+struct dlm_config_info dlm_config = {
+	.tcp_port = DEFAULT_TCP_PORT,
+	.buffer_size = DEFAULT_BUFFER_SIZE,
+	.rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+	.lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+	.dirtbl_size = DEFAULT_DIRTBL_SIZE,
+	.recover_timer = DEFAULT_RECOVER_TIMER,
+	.toss_secs = DEFAULT_TOSS_SECS,
+	.scan_secs = DEFAULT_SCAN_SECS
+};
+
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+#define DLM_MAX_ADDR_COUNT 3
+
+struct dlm_config_info {
+	int tcp_port;
+	int buffer_size;
+	int rsbtbl_size;
+	int lkbtbl_size;
+	int dirtbl_size;
+	int recover_timer;
+	int toss_secs;
+	int scan_secs;
+};
+
+extern struct dlm_config_info dlm_config;
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+int dlm_node_weight(char *lsname, int nodeid);
+int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
+int dlm_our_nodeid(void);
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif /* __CONFIG_DOT_H__ */
+
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..9c3aeddc8667
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+
+#include "dlm_internal.h"
+
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN];
+static struct mutex debug_buf_lock;
+
+static struct dentry *dlm_root;
+
+struct rsb_iter {
+	int entry;
+	struct dlm_ls *ls;
+	struct list_head *next;
+	struct dlm_rsb *rsb;
+};
+
+/*
+ * dump all rsb's in the lockspace hash table
+ */
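+
+/*
+ * Reading the dump, assuming debugfs is mounted at /sys/kernel/debug:
+ *
+ *   cat /sys/kernel/debug/dlm/<lockspace>          - resources and locks
+ *   cat /sys/kernel/debug/dlm/<lockspace>_waiters  - lkb's on ls_waiters
+ */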
+
+static char *print_lockmode(int mode)
+{
+	switch (mode) {
+	case DLM_LOCK_IV:
+		return "--";
+	case DLM_LOCK_NL:
+		return "NL";
+	case DLM_LOCK_CR:
+		return "CR";
+	case DLM_LOCK_CW:
+		return "CW";
+	case DLM_LOCK_PR:
+		return "PR";
+	case DLM_LOCK_PW:
+		return "PW";
+	case DLM_LOCK_EX:
+		return "EX";
+	default:
+		return "??";
+	}
+}
+
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+		       struct dlm_rsb *res)
+{
+	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+	if (lkb->lkb_status == DLM_LKSTS_CONVERT
+	    || lkb->lkb_status == DLM_LKSTS_WAITING)
+		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+	if (lkb->lkb_nodeid) {
+		if (lkb->lkb_nodeid != res->res_nodeid)
+			seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+				   lkb->lkb_remid);
+		else
+			seq_printf(s, " Master: %08x", lkb->lkb_remid);
+	}
+
+	if (lkb->lkb_wait_type)
+		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+	seq_printf(s, "\n");
+}
+
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+
+	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+	for (i = 0; i < res->res_length; i++) {
+		if (isprint(res->res_name[i]))
+			seq_printf(s, "%c", res->res_name[i]);
+		else
+			seq_printf(s, "%c", '.');
+	}
+	if (res->res_nodeid > 0)
+		seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
+			   res->res_nodeid);
+	else if (res->res_nodeid == 0)
+		seq_printf(s, "\" \nMaster Copy\n");
+	else if (res->res_nodeid == -1)
+		seq_printf(s, "\" \nLooking up master (lkid %x)\n",
+			   res->res_first_lkid);
+	else
+		seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
+
+	/* Print the LVB: */
+	if (res->res_lvbptr) {
+		seq_printf(s, "LVB: ");
+		for (i = 0; i < lvblen; i++) {
+			if (i == lvblen / 2)
+				seq_printf(s, "\n ");
+			seq_printf(s, "%02x ",
+				   (unsigned char) res->res_lvbptr[i]);
+		}
+		if (rsb_flag(res, RSB_VALNOTVALID))
+			seq_printf(s, " (INVALID)");
+		seq_printf(s, "\n");
+	}
+
+	root_list = !list_empty(&res->res_root_list);
+	recover_list = !list_empty(&res->res_recover_list);
+
+	if (root_list || recover_list) {
+		seq_printf(s, "Recovery: root %d recover %d flags %lx "
+			   "count %d\n", root_list, recover_list,
+			   res->res_flags, res->res_recover_locks_count);
+	}
+
+	/* Print the locks attached to this resource */
+	seq_printf(s, "Granted Queue\n");
+	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Conversion Queue\n");
+	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Waiting Queue\n");
+	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	if (list_empty(&res->res_lookup))
+		goto out;
+
+	seq_printf(s, "Lookup Queue\n");
+	list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+		seq_printf(s, "%08x %s", lkb->lkb_id,
+			   print_lockmode(lkb->lkb_rqmode));
+		if (lkb->lkb_wait_type)
+			seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+		seq_printf(s, "\n");
+	}
+ out:
+	return 0;
+}
+
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+	struct dlm_ls *ls = ri->ls;
+	int i;
+
+	if (!ri->next) {
+ top:
+		/* Find the next non-empty hash bucket */
+		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+			read_lock(&ls->ls_rsbtbl[i].lock);
+			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+				ri->next = ls->ls_rsbtbl[i].list.next;
+				read_unlock(&ls->ls_rsbtbl[i].lock);
+				break;
+			}
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+		}
+		ri->entry = i;
+
+		if (ri->entry >= ls->ls_rsbtbl_size)
+			return 1;
+	} else {
+		i = ri->entry;
+		read_lock(&ls->ls_rsbtbl[i].lock);
+		ri->next = ri->next->next;
+		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+			/* End of list - move to next bucket */
+			ri->next = NULL;
+			ri->entry++;
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+			goto top;
+		}
+		read_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+	ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+
+	return 0;
+}
+
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+	kfree(ri);
+}
+
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+	struct rsb_iter *ri;
+
+	ri = kmalloc(sizeof *ri, GFP_KERNEL);
+	if (!ri)
+		return NULL;
+
+	ri->ls = ls;
+	ri->entry = 0;
+	ri->next = NULL;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct rsb_iter *ri;
+	loff_t n = *pos;
+
+	ri = rsb_iter_init(file->private);
+	if (!ri)
+		return NULL;
+
+	while (n--) {
+		if (rsb_iter_next(ri)) {
+			rsb_iter_free(ri);
+			return NULL;
+		}
+	}
+
+	return ri;
+}
+
+static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	(*pos)++;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	print_resource(ri->rsb, file);
+
+	return 0;
+}
+
+static struct seq_operations rsb_seq_ops = {
+	.start = rsb_seq_start,
+	.next = rsb_seq_next,
+	.stop = rsb_seq_stop,
+	.show = rsb_seq_show,
+};
+
+static int rsb_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &rsb_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->u.generic_ip;
+
+	return 0;
+}
+
+static struct file_operations rsb_fops = {
+	.owner = THIS_MODULE,
+	.open = rsb_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * dump lkb's on the ls_waiters list
+ */
+
+static int waiters_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->u.generic_ip;
+	return 0;
+}
+
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_ls *ls = file->private_data;
+	struct dlm_lkb *lkb;
+	size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
+
+	mutex_lock(&debug_buf_lock);
+	mutex_lock(&ls->ls_waiters_mutex);
+	memset(debug_buf, 0, sizeof(debug_buf));
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
+			       lkb->lkb_id, lkb->lkb_wait_type,
+			       lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+		if (ret >= len - pos)
+			break;
+		pos += ret;
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
+	mutex_unlock(&debug_buf_lock);
+	return rv;
+}
+
+static struct file_operations waiters_fops = {
+	.owner = THIS_MODULE,
+	.open = waiters_open,
+	.read = waiters_read
+};
+
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+	char name[DLM_LOCKSPACE_LEN+8];
+
+	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &rsb_fops);
+	if (!ls->ls_debug_rsb_dentry)
+		return -ENOMEM;
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  S_IFREG | S_IRUGO,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+	if (!ls->ls_debug_waiters_dentry) {
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	if (ls->ls_debug_rsb_dentry)
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+	if (ls->ls_debug_waiters_dentry)
+		debugfs_remove(ls->ls_debug_waiters_dentry);
+}
+
+int dlm_register_debugfs(void)
+{
+	mutex_init(&debug_buf_lock);
+	dlm_root = debugfs_create_dir("dlm", NULL);
+	return dlm_root ? 0 : -ENOMEM;
+}
+
+void dlm_unregister_debugfs(void)
+{
+	debugfs_remove(dlm_root);
+}
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+
+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+	spin_lock(&ls->ls_recover_list_lock);
+	list_add(&de->list, &ls->ls_recover_list);
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
+{
+	int found = 0;
+	struct dlm_direntry *de;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry(de, &ls->ls_recover_list, list) {
+		if (de->length == len) {
+			list_del(&de->list);
+			de->master_nodeid = 0;
+			memset(de->name, 0, len);
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	if (!found)
+		de = allocate_direntry(ls, len);
+	return de;
+}
+
+void dlm_clear_free_entries(struct dlm_ls *ls)
+{
+	struct dlm_direntry *de;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	while (!list_empty(&ls->ls_recover_list)) {
+		de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
+				list);
+		list_del(&de->list);
+		free_direntry(de);
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
+ * num_nodes to the hash value.  This value in the desired range is used as an
+ * offset into the sorted list of nodeid's to give the particular nodeid.
+ */
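+
+/*
+ * Worked example (illustrative values only): with hash 0x12345678 the
+ * directory selector is the upper 16 bits, 0x1234 (4660); with three
+ * nodes of equal weight, 4660 % 3 = 1, so the second nodeid in the
+ * sorted node array is the directory node for that resource.
+ */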
+
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+	struct list_head *tmp;
+	struct dlm_member *memb = NULL;
+	uint32_t node, n = 0;
+	int nodeid;
+
+	if (ls->ls_num_nodes == 1) {
+		nodeid = dlm_our_nodeid();
+		goto out;
+	}
+
+	if (ls->ls_node_array) {
+		node = (hash >> 16) % ls->ls_total_weight;
+		nodeid = ls->ls_node_array[node];
+		goto out;
+	}
+
+	/* make_member_array() failed to kmalloc ls_node_array... */
+
+	node = (hash >> 16) % ls->ls_num_nodes;
+
+	list_for_each(tmp, &ls->ls_nodes) {
+		if (n++ != node)
+			continue;
+		memb = list_entry(tmp, struct dlm_member, list);
+		break;
+	}
+
+	DLM_ASSERT(memb, printk("num_nodes=%u n=%u node=%u\n",
+				ls->ls_num_nodes, n, node););
+	nodeid = memb->nodeid;
+ out:
+	return nodeid;
+}
+
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+	return dlm_hash2nodeid(r->res_ls, r->res_hash);
+}
+
+static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
+{
+	uint32_t val;
+
+	val = jhash(name, len, 0);
+	val &= (ls->ls_dirtbl_size - 1);
+
+	return val;
+}
+
+static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+	uint32_t bucket;
+
+	bucket = dir_hash(ls, de->name, de->length);
+	list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+}
+
+static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
+					  int namelen, uint32_t bucket)
+{
+	struct dlm_direntry *de;
+
+	list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
+		if (de->length == namelen && !memcmp(name, de->name, namelen))
+			goto out;
+	}
+	de = NULL;
+ out:
+	return de;
+}
+
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name,
+			  int namelen)
+{
+	struct dlm_direntry *de;
+	uint32_t bucket;
+
+	bucket = dir_hash(ls, name, namelen);
+
+	write_lock(&ls->ls_dirtbl[bucket].lock);
+
+	de = search_bucket(ls, name, namelen, bucket);
+
+	if (!de) {
+		log_error(ls, "remove fr %u none", nodeid);
+		goto out;
+	}
+
+	if (de->master_nodeid != nodeid) {
+		log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
+		goto out;
+	}
+
+	list_del(&de->list);
+	free_direntry(de);
+ out:
+	write_unlock(&ls->ls_dirtbl[bucket].lock);
+}
+
+void dlm_dir_clear(struct dlm_ls *ls)
+{
+	struct list_head *head;
+	struct dlm_direntry *de;
+	int i;
+
+	DLM_ASSERT(list_empty(&ls->ls_recover_list), );
+
+	for (i = 0; i < ls->ls_dirtbl_size; i++) {
+		write_lock(&ls->ls_dirtbl[i].lock);
+		head = &ls->ls_dirtbl[i].list;
+		while (!list_empty(head)) {
+			de = list_entry(head->next, struct dlm_direntry, list);
+			list_del(&de->list);
+			put_free_de(ls, de);
+		}
+		write_unlock(&ls->ls_dirtbl[i].lock);
+	}
+}
+
+int dlm_recover_directory(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_direntry *de;
+	char *b, *last_name = NULL;
+	int error = -ENOMEM, last_len, count = 0;
+	uint16_t namelen;
+
+	log_debug(ls, "dlm_recover_directory");
+
+	if (dlm_no_directory(ls))
+		goto out_status;
+
+	dlm_dir_clear(ls);
+
+	last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
+	if (!last_name)
+		goto out;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		memset(last_name, 0, DLM_RESNAME_MAXLEN);
+		last_len = 0;
+
+		for (;;) {
+			error = dlm_recovery_stopped(ls);
+			if (error)
+				goto out_free;
+
+			error = dlm_rcom_names(ls, memb->nodeid,
+					       last_name, last_len);
+			if (error)
+				goto out_free;
+
+			schedule();
+
+			/*
+			 * pick namelen/name pairs out of received buffer
+			 */
+
+			b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
+
+			for (;;) {
+				memcpy(&namelen, b, sizeof(uint16_t));
+				namelen = be16_to_cpu(namelen);
+				b += sizeof(uint16_t);
+
+				/* namelen of 0xFFFF marks end of names for
+				   this node; namelen of 0 marks end of the
+				   buffer */
+
+				if (namelen == 0xFFFF)
+					goto done;
+				if (!namelen)
+					break;
+
+				error = -ENOMEM;
+				de = get_free_de(ls, namelen);
+				if (!de)
+					goto out_free;
+
+				de->master_nodeid = memb->nodeid;
+				de->length = namelen;
+				last_len = namelen;
+				memcpy(de->name, b, namelen);
+				memcpy(last_name, b, namelen);
+				b += namelen;
+
+				add_entry_to_hash(ls, de);
+				count++;
+			}
+		}
+	 done:
+		;
+	}
+
+ out_status:
+	error = 0;
+	dlm_set_recover_status(ls, DLM_RS_DIR);
+	log_debug(ls, "dlm_recover_directory %d entries", count);
+ out_free:
+	kfree(last_name);
+ out:
+	dlm_clear_free_entries(ls);
+	return error;
+}
+
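+/*
+ * Find the directory entry for the given resource name, creating it
+ * (owned by 'nodeid') if it does not exist.  The write lock is dropped
+ * around the allocation, so the bucket is searched again afterwards in
+ * case another thread added the same name in the meantime.
+ */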
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
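
/*
 * Editorial note: get_entry() above uses the classic optimistic-insert
 * pattern -- search under the bucket lock, drop the lock to allocate
 * (likely because allocate_direntry() may sleep), then retake the lock
 * and search again; if another thread added the entry in the meantime,
 * free ours and return theirs.  This avoids holding a rwlock across a
 * possibly-sleeping allocation at the cost of one extra search.
 */
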
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
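
/*
 * Editorial aside, not part of this patch: the producer side of the same
 * encoding.  The space check above reserves room for the length prefix,
 * the name itself, and a trailing 0x0000 end-of-block record, so a full
 * buffer always ends in a valid record.  A userspace sketch under the
 * same assumptions (pack_names_block() and struct name_rec are
 * hypothetical, invented for illustration):
 */

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* htons(), equivalent to cpu_to_be16() here */

struct name_rec {
	const char *name;
	uint16_t len;
};

/* returns bytes written; ends with 0x0000 (full) or 0xFFFF (all sent) */
static int pack_names_block(unsigned char *out, int outlen,
			    const struct name_rec *recs, int num)
{
	uint16_t be;
	int i, offset = 0;

	for (i = 0; i < num; i++) {
		if (offset + 2 * (int)sizeof(uint16_t) + recs[i].len > outlen) {
			be = 0;				/* end-of-block */
			memcpy(out + offset, &be, sizeof(be));
			return offset + (int)sizeof(be);
		}
		be = htons(recs[i].len);
		memcpy(out + offset, &be, sizeof(be));
		offset += sizeof(be);
		memcpy(out + offset, recs[i].name, recs[i].len);
		offset += recs[i].len;
	}
	be = 0xFFFF;		/* terminating record, byte-order invariant */
	memcpy(out + offset, &be, sizeof(be));
	return offset + (int)sizeof(be);
}
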
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/miscdevice.h>
39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h>
42
43#include <linux/dlm.h>
44
45#define DLM_LOCKSPACE_LEN 64
46
47/* Size of the temp buffer midcomms allocates on the stack.
48 We try to make this large enough so most messages fit.
49 FIXME: should sctp make this unnecessary? */
50
51#define DLM_INBUF_LEN 148
52
53struct dlm_ls;
54struct dlm_lkb;
55struct dlm_rsb;
56struct dlm_member;
57struct dlm_lkbtable;
58struct dlm_rsbtable;
59struct dlm_dirtable;
60struct dlm_direntry;
61struct dlm_recover;
62struct dlm_header;
63struct dlm_message;
64struct dlm_rcom;
65struct dlm_mhandle;
66
67#define log_print(fmt, args...) \
68 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
69#define log_error(ls, fmt, args...) \
70 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
71
72#define DLM_LOG_DEBUG
73#ifdef DLM_LOG_DEBUG
74#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
75#else
76#define log_debug(ls, fmt, args...)
77#endif
78
79#define DLM_ASSERT(x, do) \
80{ \
81 if (!(x)) \
82 { \
83 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
84 "DLM: assertion: \"%s\"\n" \
85 "DLM: time = %lu\n", \
86 __LINE__, __FILE__, #x, jiffies); \
87 {do} \
88 printk("\n"); \
89 BUG(); \
90 panic("DLM: Record message above and reboot.\n"); \
91 } \
92}
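
/*
 * Editorial usage note: the second argument of DLM_ASSERT() is an
 * arbitrary statement list spliced into the failure path just before
 * BUG(), which is why call sites in this patch carry a semicolon inside
 * the macro argument, e.g.
 *
 *	DLM_ASSERT(!rv, dlm_print_lkb(lkb););
 *
 * and why an empty second argument is legal:
 *
 *	DLM_ASSERT(list_empty(&ls->ls_recover_list), );
 */
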
93
94#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
95
96
97struct dlm_direntry {
98 struct list_head list;
99 uint32_t master_nodeid;
100 uint16_t length;
101 char name[1];
102};
103
104struct dlm_dirtable {
105 struct list_head list;
106 rwlock_t lock;
107};
108
109struct dlm_rsbtable {
110 struct list_head list;
111 struct list_head toss;
112 rwlock_t lock;
113};
114
115struct dlm_lkbtable {
116 struct list_head list;
117 rwlock_t lock;
118 uint16_t counter;
119};
120
121/*
122 * Lockspace member (per node in a ls)
123 */
124
125struct dlm_member {
126 struct list_head list;
127 int nodeid;
128 int weight;
129};
130
131/*
132 * Save and manage recovery state for a lockspace.
133 */
134
135struct dlm_recover {
136 struct list_head list;
137 int *nodeids;
138 int node_count;
139 uint64_t seq;
140};
141
142/*
143 * Pass input args to second stage locking function.
144 */
145
146struct dlm_args {
147 uint32_t flags;
148 void *astaddr;
149 long astparam;
150 void *bastaddr;
151 int mode;
152 struct dlm_lksb *lksb;
153};
154
155
156/*
157 * Lock block
158 *
159 * A lock can be one of three types:
160 *
161 * local copy lock is mastered locally
162 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
163 * process copy lock is mastered on a remote node
164 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
165 * master copy master node's copy of a lock owned by remote node
166 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
167 *
168 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
169 * dlm_unlock. The dlm does not modify these or use any private flags in
170 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
171 * are sent as-is to the remote master when the lock is remote.
172 *
173 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
174 * Some internal flags are shared between the master and process nodes;
175 * these shared flags are kept in the lower two bytes. One of these
176 * flags set on the master copy will be propagated to the process copy
177 * and v.v. Other internal flags are private to the master or process
178 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
179 *
180 * lkb_sbflags: status block flags. These flags are copied directly into
181 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
182 * ast. All defined in dlm.h with DLM_SBF_ prefix.
183 *
184 * lkb_status: the lock status indicates which rsb queue the lock is
185 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
186 *
187 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
188 * reply is needed. Only set when the lkb is on the lockspace waiters
189 * list awaiting a reply from a remote node.
190 *
191 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
192 * is a master copy, nodeid specifies the remote lock holder, when the
193 * lkb is a process copy, the nodeid specifies the lock master.
194 */
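
/*
 * Editorial example: the byte split described above shows up in the
 * DLM_IFL_ values below -- DLM_IFL_USER (0x00000001) and DLM_IFL_ORPHAN
 * (0x00000002) sit in the low two bytes and are shared between master
 * and process copies, while DLM_IFL_MSTCPY (0x00010000), DLM_IFL_RESEND
 * and DLM_IFL_DEAD sit in the high two bytes and stay node-private.  A
 * hypothetical helper (not in the patch) to isolate the shared part:
 *
 *	static inline uint32_t dlm_ifl_shared(uint32_t flags)
 *	{
 *		return flags & 0x0000ffff;
 *	}
 */
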
195
196/* lkb_ast_type */
197
198#define AST_COMP 1
199#define AST_BAST 2
200
201/* lkb_status */
202
203#define DLM_LKSTS_WAITING 1
204#define DLM_LKSTS_GRANTED 2
205#define DLM_LKSTS_CONVERT 3
206
207/* lkb_flags */
208
209#define DLM_IFL_MSTCPY 0x00010000
210#define DLM_IFL_RESEND 0x00020000
211#define DLM_IFL_DEAD 0x00040000
212#define DLM_IFL_USER 0x00000001
213#define DLM_IFL_ORPHAN 0x00000002
214
215struct dlm_lkb {
216 struct dlm_rsb *lkb_resource; /* the rsb */
217 struct kref lkb_ref;
218 int lkb_nodeid; /* copied from rsb */
219 int lkb_ownpid; /* pid of lock owner */
220 uint32_t lkb_id; /* our lock ID */
221 uint32_t lkb_remid; /* lock ID on remote partner */
222 uint32_t lkb_exflags; /* external flags from caller */
223 uint32_t lkb_sbflags; /* lksb flags */
224 uint32_t lkb_flags; /* internal flags */
225 uint32_t lkb_lvbseq; /* lvb sequence number */
226
227 int8_t lkb_status; /* granted, waiting, convert */
228 int8_t lkb_rqmode; /* requested lock mode */
229 int8_t lkb_grmode; /* granted lock mode */
230 int8_t lkb_bastmode; /* requested mode */
231 int8_t lkb_highbast; /* highest mode bast sent for */
232
233 int8_t lkb_wait_type; /* type of reply waiting for */
234 int8_t lkb_ast_type; /* type of ast queued for */
235
236 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
237 struct list_head lkb_statequeue; /* rsb g/c/w list */
238 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
239 struct list_head lkb_wait_reply; /* waiting for remote reply */
240 struct list_head lkb_astqueue; /* need ast to be sent */
241 struct list_head lkb_ownqueue; /* list of locks for a process */
242
243 char *lkb_lvbptr;
244 struct dlm_lksb *lkb_lksb; /* caller's status block */
245 void *lkb_astaddr; /* caller's ast function */
246 void *lkb_bastaddr; /* caller's bast function */
247 long lkb_astparam; /* caller's ast arg */
248};
249
250
251struct dlm_rsb {
252 struct dlm_ls *res_ls; /* the lockspace */
253 struct kref res_ref;
254 struct mutex res_mutex;
255 unsigned long res_flags;
256 int res_length; /* length of rsb name */
257 int res_nodeid;
258 uint32_t res_lvbseq;
259 uint32_t res_hash;
260 uint32_t res_bucket; /* rsbtbl */
261 unsigned long res_toss_time;
262 uint32_t res_first_lkid;
263 struct list_head res_lookup; /* lkbs waiting on first */
264 struct list_head res_hashchain; /* rsbtbl */
265 struct list_head res_grantqueue;
266 struct list_head res_convertqueue;
267 struct list_head res_waitqueue;
268
269 struct list_head res_root_list; /* used for recovery */
270 struct list_head res_recover_list; /* used for recovery */
271 int res_recover_locks_count;
272
273 char *res_lvbptr;
274 char res_name[1];
275};
276
277/* find_rsb() flags */
278
279#define R_MASTER 1 /* only return rsb if it's a master */
280#define R_CREATE 2 /* create/add rsb if not found */
281
282/* rsb_flags */
283
284enum rsb_flags {
285 RSB_MASTER_UNCERTAIN,
286 RSB_VALNOTVALID,
287 RSB_VALNOTVALID_PREV,
288 RSB_NEW_MASTER,
289 RSB_NEW_MASTER2,
290 RSB_RECOVER_CONVERT,
291 RSB_LOCKS_PURGED,
292};
293
294static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
295{
296 __set_bit(flag, &r->res_flags);
297}
298
299static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
300{
301 __clear_bit(flag, &r->res_flags);
302}
303
304static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
305{
306 return test_bit(flag, &r->res_flags);
307}
308
309
310/* dlm_header is first element of all structs sent between nodes */
311
312#define DLM_HEADER_MAJOR 0x00020000
313#define DLM_HEADER_MINOR 0x00000001
314
315#define DLM_MSG 1
316#define DLM_RCOM 2
317
318struct dlm_header {
319 uint32_t h_version;
320 uint32_t h_lockspace;
321 uint32_t h_nodeid; /* nodeid of sender */
322 uint16_t h_length;
323 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
324 uint8_t h_pad;
325};
326
327
328#define DLM_MSG_REQUEST 1
329#define DLM_MSG_CONVERT 2
330#define DLM_MSG_UNLOCK 3
331#define DLM_MSG_CANCEL 4
332#define DLM_MSG_REQUEST_REPLY 5
333#define DLM_MSG_CONVERT_REPLY 6
334#define DLM_MSG_UNLOCK_REPLY 7
335#define DLM_MSG_CANCEL_REPLY 8
336#define DLM_MSG_GRANT 9
337#define DLM_MSG_BAST 10
338#define DLM_MSG_LOOKUP 11
339#define DLM_MSG_REMOVE 12
340#define DLM_MSG_LOOKUP_REPLY 13
341
342struct dlm_message {
343 struct dlm_header m_header;
344 uint32_t m_type; /* DLM_MSG_ */
345 uint32_t m_nodeid;
346 uint32_t m_pid;
347 uint32_t m_lkid; /* lkid on sender */
348 uint32_t m_remid; /* lkid on receiver */
349 uint32_t m_parent_lkid;
350 uint32_t m_parent_remid;
351 uint32_t m_exflags;
352 uint32_t m_sbflags;
353 uint32_t m_flags;
354 uint32_t m_lvbseq;
355 uint32_t m_hash;
356 int m_status;
357 int m_grmode;
358 int m_rqmode;
359 int m_bastmode;
360 int m_asts;
361 int m_result; /* 0 or -EXXX */
362 char m_extra[0]; /* name or lvb */
363};
364
365
366#define DLM_RS_NODES 0x00000001
367#define DLM_RS_NODES_ALL 0x00000002
368#define DLM_RS_DIR 0x00000004
369#define DLM_RS_DIR_ALL 0x00000008
370#define DLM_RS_LOCKS 0x00000010
371#define DLM_RS_LOCKS_ALL 0x00000020
372#define DLM_RS_DONE 0x00000040
373#define DLM_RS_DONE_ALL 0x00000080
374
375#define DLM_RCOM_STATUS 1
376#define DLM_RCOM_NAMES 2
377#define DLM_RCOM_LOOKUP 3
378#define DLM_RCOM_LOCK 4
379#define DLM_RCOM_STATUS_REPLY 5
380#define DLM_RCOM_NAMES_REPLY 6
381#define DLM_RCOM_LOOKUP_REPLY 7
382#define DLM_RCOM_LOCK_REPLY 8
383
384struct dlm_rcom {
385 struct dlm_header rc_header;
386 uint32_t rc_type; /* DLM_RCOM_ */
387 int rc_result; /* multi-purpose */
388 uint64_t rc_id; /* match reply with request */
389 char rc_buf[0];
390};
391
392struct rcom_config {
393 uint32_t rf_lvblen;
394 uint32_t rf_lsflags;
395 uint64_t rf_unused;
396};
397
398struct rcom_lock {
399 uint32_t rl_ownpid;
400 uint32_t rl_lkid;
401 uint32_t rl_remid;
402 uint32_t rl_parent_lkid;
403 uint32_t rl_parent_remid;
404 uint32_t rl_exflags;
405 uint32_t rl_flags;
406 uint32_t rl_lvbseq;
407 int rl_result;
408 int8_t rl_rqmode;
409 int8_t rl_grmode;
410 int8_t rl_status;
411 int8_t rl_asts;
412 uint16_t rl_wait_type;
413 uint16_t rl_namelen;
414 char rl_name[DLM_RESNAME_MAXLEN];
415 char rl_lvb[0];
416};
417
418struct dlm_ls {
419 struct list_head ls_list; /* list of lockspaces */
420 dlm_lockspace_t *ls_local_handle;
421 uint32_t ls_global_id; /* global unique lockspace ID */
422 uint32_t ls_exflags;
423 int ls_lvblen;
424 int ls_count; /* reference count */
425 unsigned long ls_flags; /* LSFL_ */
426 struct kobject ls_kobj;
427
428 struct dlm_rsbtable *ls_rsbtbl;
429 uint32_t ls_rsbtbl_size;
430
431 struct dlm_lkbtable *ls_lkbtbl;
432 uint32_t ls_lkbtbl_size;
433
434 struct dlm_dirtable *ls_dirtbl;
435 uint32_t ls_dirtbl_size;
436
437 struct mutex ls_waiters_mutex;
438 struct list_head ls_waiters; /* lkbs needing a reply */
439
440 struct list_head ls_nodes; /* current nodes in ls */
441 struct list_head ls_nodes_gone; /* dead node list, recovery */
442 int ls_num_nodes; /* number of nodes in ls */
443 int ls_low_nodeid;
444 int ls_total_weight;
445 int *ls_node_array;
446
447 struct dlm_rsb ls_stub_rsb; /* for returning errors */
448 struct dlm_lkb ls_stub_lkb; /* for returning errors */
449 struct dlm_message ls_stub_ms; /* for faking a reply */
450
451 struct dentry *ls_debug_rsb_dentry; /* debugfs */
452 struct dentry *ls_debug_waiters_dentry; /* debugfs */
453
454 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
455 int ls_uevent_result;
456
457 struct miscdevice ls_device;
458
459 /* recovery related */
460
461 struct timer_list ls_timer;
462 struct task_struct *ls_recoverd_task;
463 struct mutex ls_recoverd_active;
464 spinlock_t ls_recover_lock;
465 uint32_t ls_recover_status; /* DLM_RS_ */
466 uint64_t ls_recover_seq;
467 struct dlm_recover *ls_recover_args;
468 struct rw_semaphore ls_in_recovery; /* block local requests */
469 struct list_head ls_requestqueue;/* queue remote requests */
470 struct mutex ls_requestqueue_mutex;
471 char *ls_recover_buf;
472 int ls_recover_nodeid; /* for debugging */
473 uint64_t ls_rcom_seq;
474 struct list_head ls_recover_list;
475 spinlock_t ls_recover_list_lock;
476 int ls_recover_list_count;
477 wait_queue_head_t ls_wait_general;
478 struct mutex ls_clear_proc_locks;
479
480 struct list_head ls_root_list; /* root resources */
481 struct rw_semaphore ls_root_sem; /* protect root_list */
482
483 int ls_namelen;
484 char ls_name[1];
485};
486
487#define LSFL_WORK 0
488#define LSFL_RUNNING 1
489#define LSFL_RECOVERY_STOP 2
490#define LSFL_RCOM_READY 3
491#define LSFL_UEVENT_WAIT 4
492
493/* much of this is just saving user space pointers associated with the
494 lock that we pass back to the user lib with an ast */
495
496struct dlm_user_args {
497 struct dlm_user_proc *proc; /* each process that opens the lockspace
498 device has private data
499 (dlm_user_proc) on the struct file,
500 the process's locks point back to it*/
501 struct dlm_lksb lksb;
502 int old_mode;
503 int update_user_lvb;
504 struct dlm_lksb __user *user_lksb;
505 void __user *castparam;
506 void __user *castaddr;
507 void __user *bastparam;
508 void __user *bastaddr;
509};
510
511#define DLM_PROC_FLAGS_CLOSING 1
512#define DLM_PROC_FLAGS_COMPAT 2
513
514/* locks list is kept so we can remove all a process's locks when it
515 exits (or orphan those that are persistent) */
516
517struct dlm_user_proc {
518 dlm_lockspace_t *lockspace;
519 unsigned long flags; /* DLM_PROC_FLAGS */
520 struct list_head asts;
521 spinlock_t asts_spin;
522 struct list_head locks;
523 spinlock_t locks_spin;
524 wait_queue_head_t wait;
525};
526
527static inline int dlm_locking_stopped(struct dlm_ls *ls)
528{
529 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
530}
531
532static inline int dlm_recovery_stopped(struct dlm_ls *ls)
533{
534 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
535}
536
537static inline int dlm_no_directory(struct dlm_ls *ls)
538{
539 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
540}
541
542#endif /* __DLM_INTERNAL_DOT_H__ */
543
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..67247f0b508a
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3869 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
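
/*
 * Editorial trace, not in the original comment: a request for a lock
 * mastered on a remote node runs
 *
 *	L: dlm_lock() -> request_lock() -> _request_lock() -> send_request()
 *	R:	receive_request() -> do_request() -> send_request_reply()
 *	L: receive_request_reply()
 *
 * while the same request made on the master node itself short-circuits
 * at stage 3 and calls do_request() directly.
 */
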
58#include <linux/types.h>
59#include "dlm_internal.h"
60#include <linux/dlm_device.h>
61#include "memory.h"
62#include "lowcomms.h"
63#include "requestqueue.h"
64#include "util.h"
65#include "dir.h"
66#include "member.h"
67#include "lockspace.h"
68#include "ast.h"
69#include "lock.h"
70#include "rcom.h"
71#include "recover.h"
72#include "lvb_table.h"
73#include "user.h"
74#include "config.h"
75
76static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
80static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms);
88
89/*
90 * Lock compatibility matrix - thanks Steve
91 * UN = Unlocked state. Not really a state, used as a flag
92 * PD = Padding. Used to make the matrix a nice power of two in size
93 * Other states are the same as the VMS DLM.
94 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
95 */
96
97static const int __dlm_compat_matrix[8][8] = {
98 /* UN NL CR CW PR PW EX PD */
99 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
100 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
101 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
102 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
103 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
104 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
105 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
106 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
107};
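
/*
 * Editorial worked example: the "+1" in the usage above maps DLM_LOCK_IV
 * (-1) onto row/column 0 (UN), DLM_LOCK_NL (0) onto 1, and so on up to
 * DLM_LOCK_EX (5) onto 6 (mode values from linux/dlm.h).  So "can PR be
 * granted alongside a granted CW?" is
 *
 *	__dlm_compat_matrix[DLM_LOCK_CW + 1][DLM_LOCK_PR + 1]
 *		== matrix[3][4] == 0		(not compatible)
 *
 * while two PR locks give matrix[4][4] == 1 (compatible).
 */
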
108
109/*
110 * This defines the direction of transfer of LVB data.
111 * Granted mode is the row; requested mode is the column.
112 * Usage: matrix[grmode+1][rqmode+1]
113 * 1 = LVB is returned to the caller
114 * 0 = LVB is written to the resource
115 * -1 = nothing happens to the LVB
116 */
117
118const int dlm_lvb_operations[8][8] = {
119 /* UN NL CR CW PR PW EX PD*/
120 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
121 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
122 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
123 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
124 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
125 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
126 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
127 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
128};
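
/*
 * Editorial worked example: a down-conversion EX->NL reads
 * dlm_lvb_operations[DLM_LOCK_EX + 1][DLM_LOCK_NL + 1] == matrix[6][1]
 * == 0, so the caller's LVB is written to the resource; an up-conversion
 * NL->EX reads matrix[1][6] == 1, so the resource's LVB is returned to
 * the caller.  set_lvb_lock() below acts on exactly these three cases
 * (b == 1, b == 0, b == -1).
 */
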
129
130#define modes_compat(gr, rq) \
131 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
132
133int dlm_modes_compat(int mode1, int mode2)
134{
135 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
136}
137
138/*
139 * Compatibility matrix for conversions with QUECVT set.
140 * Granted mode is the row; requested mode is the column.
141 * Usage: matrix[grmode+1][rqmode+1]
142 */
143
144static const int __quecvt_compat_matrix[8][8] = {
145 /* UN NL CR CW PR PW EX PD */
146 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
147 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
148 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
149 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
150 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
151 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
152 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
153 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
154};
155
156void dlm_print_lkb(struct dlm_lkb *lkb)
157{
158 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
159 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
160 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
161 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
162 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
163}
164
165void dlm_print_rsb(struct dlm_rsb *r)
166{
167 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
168 r->res_nodeid, r->res_flags, r->res_first_lkid,
169 r->res_recover_locks_count, r->res_name);
170}
171
172void dlm_dump_rsb(struct dlm_rsb *r)
173{
174 struct dlm_lkb *lkb;
175
176 dlm_print_rsb(r);
177
178 printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
179 list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
180 printk(KERN_ERR "rsb lookup list\n");
181 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
182 dlm_print_lkb(lkb);
183 printk(KERN_ERR "rsb grant queue:\n");
184 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
185 dlm_print_lkb(lkb);
186 printk(KERN_ERR "rsb convert queue:\n");
187 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
188 dlm_print_lkb(lkb);
189 printk(KERN_ERR "rsb wait queue:\n");
190 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
191 dlm_print_lkb(lkb);
192}
193
194/* Threads cannot use the lockspace while it's being recovered */
195
196static inline void lock_recovery(struct dlm_ls *ls)
197{
198 down_read(&ls->ls_in_recovery);
199}
200
201static inline void unlock_recovery(struct dlm_ls *ls)
202{
203 up_read(&ls->ls_in_recovery);
204}
205
206static inline int lock_recovery_try(struct dlm_ls *ls)
207{
208 return down_read_trylock(&ls->ls_in_recovery);
209}
210
211static inline int can_be_queued(struct dlm_lkb *lkb)
212{
213 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
214}
215
216static inline int force_blocking_asts(struct dlm_lkb *lkb)
217{
218 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
219}
220
221static inline int is_demoted(struct dlm_lkb *lkb)
222{
223 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
224}
225
226static inline int is_remote(struct dlm_rsb *r)
227{
228 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
229 return !!r->res_nodeid;
230}
231
232static inline int is_process_copy(struct dlm_lkb *lkb)
233{
234 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
235}
236
237static inline int is_master_copy(struct dlm_lkb *lkb)
238{
239 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
240 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
241 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
242}
243
244static inline int middle_conversion(struct dlm_lkb *lkb)
245{
246 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
247 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
248 return 1;
249 return 0;
250}
251
252static inline int down_conversion(struct dlm_lkb *lkb)
253{
254 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
255}
256
257static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
258{
259 if (is_master_copy(lkb))
260 return;
261
262 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
263
264 lkb->lkb_lksb->sb_status = rv;
265 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
266
267 dlm_add_ast(lkb, AST_COMP);
268}
269
270static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
271{
272 if (is_master_copy(lkb))
273 send_bast(r, lkb, rqmode);
274 else {
275 lkb->lkb_bastmode = rqmode;
276 dlm_add_ast(lkb, AST_BAST);
277 }
278}
279
280/*
281 * Basic operations on rsb's and lkb's
282 */
283
284static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
285{
286 struct dlm_rsb *r;
287
288 r = allocate_rsb(ls, len);
289 if (!r)
290 return NULL;
291
292 r->res_ls = ls;
293 r->res_length = len;
294 memcpy(r->res_name, name, len);
295 mutex_init(&r->res_mutex);
296
297 INIT_LIST_HEAD(&r->res_lookup);
298 INIT_LIST_HEAD(&r->res_grantqueue);
299 INIT_LIST_HEAD(&r->res_convertqueue);
300 INIT_LIST_HEAD(&r->res_waitqueue);
301 INIT_LIST_HEAD(&r->res_root_list);
302 INIT_LIST_HEAD(&r->res_recover_list);
303
304 return r;
305}
306
307static int search_rsb_list(struct list_head *head, char *name, int len,
308 unsigned int flags, struct dlm_rsb **r_ret)
309{
310 struct dlm_rsb *r;
311 int error = 0;
312
313 list_for_each_entry(r, head, res_hashchain) {
314 if (len == r->res_length && !memcmp(name, r->res_name, len))
315 goto found;
316 }
317 return -EBADR;
318
319 found:
320 if (r->res_nodeid && (flags & R_MASTER))
321 error = -ENOTBLK;
322 *r_ret = r;
323 return error;
324}
325
326static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
327 unsigned int flags, struct dlm_rsb **r_ret)
328{
329 struct dlm_rsb *r;
330 int error;
331
332 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
333 if (!error) {
334 kref_get(&r->res_ref);
335 goto out;
336 }
337 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
338 if (error)
339 goto out;
340
341 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
342
343 if (dlm_no_directory(ls))
344 goto out;
345
346 if (r->res_nodeid == -1) {
347 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
348 r->res_first_lkid = 0;
349 } else if (r->res_nodeid > 0) {
350 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
351 r->res_first_lkid = 0;
352 } else {
353 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
354 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
355 }
356 out:
357 *r_ret = r;
358 return error;
359}
360
361static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
362 unsigned int flags, struct dlm_rsb **r_ret)
363{
364 int error;
365 write_lock(&ls->ls_rsbtbl[b].lock);
366 error = _search_rsb(ls, name, len, b, flags, r_ret);
367 write_unlock(&ls->ls_rsbtbl[b].lock);
368 return error;
369}
370
371/*
372 * Find rsb in rsbtbl and potentially create/add one
373 *
374 * Delaying the release of rsb's has a similar benefit to applications keeping
375 * NL locks on an rsb, but without the guarantee that the cached master value
376 * will still be valid when the rsb is reused. Apps aren't always smart enough
377 * to keep NL locks on an rsb that they may lock again shortly; this can lead
378 * to excessive master lookups and removals if we don't delay the release.
379 *
380 * Searching for an rsb means looking through both the normal list and toss
381 * list. When found on the toss list the rsb is moved to the normal list with
382 * ref count of 1; when found on normal list the ref count is incremented.
383 */
384
385static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
386 unsigned int flags, struct dlm_rsb **r_ret)
387{
388 struct dlm_rsb *r, *tmp;
389 uint32_t hash, bucket;
390 int error = 0;
391
392 if (dlm_no_directory(ls))
393 flags |= R_CREATE;
394
395 hash = jhash(name, namelen, 0);
396 bucket = hash & (ls->ls_rsbtbl_size - 1);
397
398 error = search_rsb(ls, name, namelen, bucket, flags, &r);
399 if (!error)
400 goto out;
401
402 if (error == -EBADR && !(flags & R_CREATE))
403 goto out;
404
405 /* the rsb was found but wasn't a master copy */
406 if (error == -ENOTBLK)
407 goto out;
408
409 error = -ENOMEM;
410 r = create_rsb(ls, name, namelen);
411 if (!r)
412 goto out;
413
414 r->res_hash = hash;
415 r->res_bucket = bucket;
416 r->res_nodeid = -1;
417 kref_init(&r->res_ref);
418
419 /* With no directory, the master can be set immediately */
420 if (dlm_no_directory(ls)) {
421 int nodeid = dlm_dir_nodeid(r);
422 if (nodeid == dlm_our_nodeid())
423 nodeid = 0;
424 r->res_nodeid = nodeid;
425 }
426
427 write_lock(&ls->ls_rsbtbl[bucket].lock);
428 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
429 if (!error) {
430 write_unlock(&ls->ls_rsbtbl[bucket].lock);
431 free_rsb(r);
432 r = tmp;
433 goto out;
434 }
435 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
436 write_unlock(&ls->ls_rsbtbl[bucket].lock);
437 error = 0;
438 out:
439 *r_ret = r;
440 return error;
441}
442
443int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
444 unsigned int flags, struct dlm_rsb **r_ret)
445{
446 return find_rsb(ls, name, namelen, flags, r_ret);
447}
448
449/* This is only called to add a reference when the code already holds
450 a valid reference to the rsb, so there's no need for locking. */
451
452static inline void hold_rsb(struct dlm_rsb *r)
453{
454 kref_get(&r->res_ref);
455}
456
457void dlm_hold_rsb(struct dlm_rsb *r)
458{
459 hold_rsb(r);
460}
461
462static void toss_rsb(struct kref *kref)
463{
464 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
465 struct dlm_ls *ls = r->res_ls;
466
467 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
468 kref_init(&r->res_ref);
469 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
470 r->res_toss_time = jiffies;
471 if (r->res_lvbptr) {
472 free_lvb(r->res_lvbptr);
473 r->res_lvbptr = NULL;
474 }
475}
476
477/* When all references to the rsb are gone it's transferred to
478 the tossed list for later disposal. */
479
480static void put_rsb(struct dlm_rsb *r)
481{
482 struct dlm_ls *ls = r->res_ls;
483 uint32_t bucket = r->res_bucket;
484
485 write_lock(&ls->ls_rsbtbl[bucket].lock);
486 kref_put(&r->res_ref, toss_rsb);
487 write_unlock(&ls->ls_rsbtbl[bucket].lock);
488}
489
490void dlm_put_rsb(struct dlm_rsb *r)
491{
492 put_rsb(r);
493}
494
495/* See comment for unhold_lkb */
496
497static void unhold_rsb(struct dlm_rsb *r)
498{
499 int rv;
500 rv = kref_put(&r->res_ref, toss_rsb);
501 DLM_ASSERT(!rv, dlm_dump_rsb(r););
502}
503
504static void kill_rsb(struct kref *kref)
505{
506 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
507
508 /* All work is done after the return from kref_put() so we
509 can release the write_lock before the remove and free. */
510
511 DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
512 DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
513 DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
514 DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
515 DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
516 DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
517}
518
519/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
520 The rsb must exist as long as any lkb's for it do. */
521
522static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
523{
524 hold_rsb(r);
525 lkb->lkb_resource = r;
526}
527
528static void detach_lkb(struct dlm_lkb *lkb)
529{
530 if (lkb->lkb_resource) {
531 put_rsb(lkb->lkb_resource);
532 lkb->lkb_resource = NULL;
533 }
534}
535
536static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
537{
538 struct dlm_lkb *lkb, *tmp;
539 uint32_t lkid = 0;
540 uint16_t bucket;
541
542 lkb = allocate_lkb(ls);
543 if (!lkb)
544 return -ENOMEM;
545
546 lkb->lkb_nodeid = -1;
547 lkb->lkb_grmode = DLM_LOCK_IV;
548 kref_init(&lkb->lkb_ref);
549 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
550
551 get_random_bytes(&bucket, sizeof(bucket));
552 bucket &= (ls->ls_lkbtbl_size - 1);
553
554 write_lock(&ls->ls_lkbtbl[bucket].lock);
555
556 /* counter can roll over so we must verify lkid is not in use */
557
558 while (lkid == 0) {
559 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
560
561 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
562 lkb_idtbl_list) {
563 if (tmp->lkb_id != lkid)
564 continue;
565 lkid = 0;
566 break;
567 }
568 }
569
570 lkb->lkb_id = lkid;
571 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
572 write_unlock(&ls->ls_lkbtbl[bucket].lock);
573
574 *lkb_ret = lkb;
575 return 0;
576}
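
/*
 * Editorial aside: the lkid built above packs the lkbtbl bucket into the
 * low 16 bits and the per-bucket counter into the high 16 bits, which is
 * why __find_lkb()/find_lkb() below recover the bucket with
 * "lkid & 0xFFFF".  In plain C:
 *
 *	uint32_t lkid   = bucket | ((uint32_t)counter << 16);
 *	uint16_t bkt    = lkid & 0xFFFF;	(== bucket)
 *	uint16_t count  = lkid >> 16;		(== counter)
 */
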
577
578static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
579{
580 uint16_t bucket = lkid & 0xFFFF;
581 struct dlm_lkb *lkb;
582
583 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
584 if (lkb->lkb_id == lkid)
585 return lkb;
586 }
587 return NULL;
588}
589
590static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
591{
592 struct dlm_lkb *lkb;
593 uint16_t bucket = lkid & 0xFFFF;
594
595 if (bucket >= ls->ls_lkbtbl_size)
596 return -EBADSLT;
597
598 read_lock(&ls->ls_lkbtbl[bucket].lock);
599 lkb = __find_lkb(ls, lkid);
600 if (lkb)
601 kref_get(&lkb->lkb_ref);
602 read_unlock(&ls->ls_lkbtbl[bucket].lock);
603
604 *lkb_ret = lkb;
605 return lkb ? 0 : -ENOENT;
606}
607
608static void kill_lkb(struct kref *kref)
609{
610 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
611
612 /* All work is done after the return from kref_put() so we
613 can release the write_lock before the detach_lkb */
614
615 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
616}
617
618/* __put_lkb() is used when an lkb may not have an rsb attached to
619 it so we need to provide the lockspace explicitly */
620
621static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
622{
623 uint16_t bucket = lkb->lkb_id & 0xFFFF;
624
625 write_lock(&ls->ls_lkbtbl[bucket].lock);
626 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
627 list_del(&lkb->lkb_idtbl_list);
628 write_unlock(&ls->ls_lkbtbl[bucket].lock);
629
630 detach_lkb(lkb);
631
632 /* for local/process lkbs, lvbptr points to caller's lksb */
633 if (lkb->lkb_lvbptr && is_master_copy(lkb))
634 free_lvb(lkb->lkb_lvbptr);
635 free_lkb(lkb);
636 return 1;
637 } else {
638 write_unlock(&ls->ls_lkbtbl[bucket].lock);
639 return 0;
640 }
641}
642
643int dlm_put_lkb(struct dlm_lkb *lkb)
644{
645 struct dlm_ls *ls;
646
647 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
648 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
649
650 ls = lkb->lkb_resource->res_ls;
651 return __put_lkb(ls, lkb);
652}
653
654/* This is only called to add a reference when the code already holds
655 a valid reference to the lkb, so there's no need for locking. */
656
657static inline void hold_lkb(struct dlm_lkb *lkb)
658{
659 kref_get(&lkb->lkb_ref);
660}
661
662/* This is called when we need to remove a reference and are certain
663 it's not the last ref. e.g. del_lkb is always called between a
664 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
665 put_lkb would work fine, but would involve unnecessary locking */
666
667static inline void unhold_lkb(struct dlm_lkb *lkb)
668{
669 int rv;
670 rv = kref_put(&lkb->lkb_ref, kill_lkb);
671 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
672}
673
674static void lkb_add_ordered(struct list_head *new, struct list_head *head,
675 int mode)
676{
677 struct dlm_lkb *lkb = NULL;
678
679 list_for_each_entry(lkb, head, lkb_statequeue)
680 if (lkb->lkb_rqmode < mode)
681 break;
682
683 if (!lkb)
684 list_add_tail(new, head);
685 else
686 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
687}
688
689/* add/remove lkb to rsb's grant/convert/wait queue */
690
691static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
692{
693 kref_get(&lkb->lkb_ref);
694
695 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
696
697 lkb->lkb_status = status;
698
699 switch (status) {
700 case DLM_LKSTS_WAITING:
701 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
702 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
703 else
704 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
705 break;
706 case DLM_LKSTS_GRANTED:
707 /* convention says granted locks kept in order of grmode */
708 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
709 lkb->lkb_grmode);
710 break;
711 case DLM_LKSTS_CONVERT:
712 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
713 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
714 else
715 list_add_tail(&lkb->lkb_statequeue,
716 &r->res_convertqueue);
717 break;
718 default:
719 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
720 }
721}
722
723static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
724{
725 lkb->lkb_status = 0;
726 list_del(&lkb->lkb_statequeue);
727 unhold_lkb(lkb);
728}
729
730static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
731{
732 hold_lkb(lkb);
733 del_lkb(r, lkb);
734 add_lkb(r, lkb, sts);
735 unhold_lkb(lkb);
736}
737
738/* add/remove lkb from global waiters list of lkb's waiting for
739 a reply from a remote node */
740
741static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
742{
743 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
744
745 mutex_lock(&ls->ls_waiters_mutex);
746 if (lkb->lkb_wait_type) {
747 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
748 goto out;
749 }
750 lkb->lkb_wait_type = mstype;
751 kref_get(&lkb->lkb_ref);
752 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
753 out:
754 mutex_unlock(&ls->ls_waiters_mutex);
755}
756
757static int _remove_from_waiters(struct dlm_lkb *lkb)
758{
759 int error = 0;
760
761 if (!lkb->lkb_wait_type) {
762 log_print("remove_from_waiters error");
763 error = -EINVAL;
764 goto out;
765 }
766 lkb->lkb_wait_type = 0;
767 list_del(&lkb->lkb_wait_reply);
768 unhold_lkb(lkb);
769 out:
770 return error;
771}
772
773static int remove_from_waiters(struct dlm_lkb *lkb)
774{
775 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
776 int error;
777
778 mutex_lock(&ls->ls_waiters_mutex);
779 error = _remove_from_waiters(lkb);
780 mutex_unlock(&ls->ls_waiters_mutex);
781 return error;
782}
783
784static void dir_remove(struct dlm_rsb *r)
785{
786 int to_nodeid;
787
788 if (dlm_no_directory(r->res_ls))
789 return;
790
791 to_nodeid = dlm_dir_nodeid(r);
792 if (to_nodeid != dlm_our_nodeid())
793 send_remove(r);
794 else
795 dlm_dir_remove_entry(r->res_ls, to_nodeid,
796 r->res_name, r->res_length);
797}
798
799/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
800 found since they are in order of newest to oldest? */
801
802static int shrink_bucket(struct dlm_ls *ls, int b)
803{
804 struct dlm_rsb *r;
805 int count = 0, found;
806
807 for (;;) {
808 found = 0;
809 write_lock(&ls->ls_rsbtbl[b].lock);
810 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
811 res_hashchain) {
812 if (!time_after_eq(jiffies, r->res_toss_time +
813 dlm_config.toss_secs * HZ))
814 continue;
815 found = 1;
816 break;
817 }
818
819 if (!found) {
820 write_unlock(&ls->ls_rsbtbl[b].lock);
821 break;
822 }
823
824 if (kref_put(&r->res_ref, kill_rsb)) {
825 list_del(&r->res_hashchain);
826 write_unlock(&ls->ls_rsbtbl[b].lock);
827
828 if (is_master(r))
829 dir_remove(r);
830 free_rsb(r);
831 count++;
832 } else {
833 write_unlock(&ls->ls_rsbtbl[b].lock);
834 log_error(ls, "tossed rsb in use %s", r->res_name);
835 }
836 }
837
838 return count;
839}
840
841void dlm_scan_rsbs(struct dlm_ls *ls)
842{
843 int i;
844
845 if (dlm_locking_stopped(ls))
846 return;
847
848 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
849 shrink_bucket(ls, i);
850 cond_resched();
851 }
852}
853
854/* lkb is master or local copy */
855
856static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
857{
858 int b, len = r->res_ls->ls_lvblen;
859
860 /* b=1 lvb returned to caller
861 b=0 lvb written to rsb or invalidated
862 b=-1 do nothing */
863
864 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
865
866 if (b == 1) {
867 if (!lkb->lkb_lvbptr)
868 return;
869
870 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
871 return;
872
873 if (!r->res_lvbptr)
874 return;
875
876 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
877 lkb->lkb_lvbseq = r->res_lvbseq;
878
879 } else if (b == 0) {
880 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
881 rsb_set_flag(r, RSB_VALNOTVALID);
882 return;
883 }
884
885 if (!lkb->lkb_lvbptr)
886 return;
887
888 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
889 return;
890
891 if (!r->res_lvbptr)
892 r->res_lvbptr = allocate_lvb(r->res_ls);
893
894 if (!r->res_lvbptr)
895 return;
896
897 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
898 r->res_lvbseq++;
899 lkb->lkb_lvbseq = r->res_lvbseq;
900 rsb_clear_flag(r, RSB_VALNOTVALID);
901 }
902
903 if (rsb_flag(r, RSB_VALNOTVALID))
904 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
905}
906
907static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
908{
909 if (lkb->lkb_grmode < DLM_LOCK_PW)
910 return;
911
912 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
913 rsb_set_flag(r, RSB_VALNOTVALID);
914 return;
915 }
916
917 if (!lkb->lkb_lvbptr)
918 return;
919
920 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
921 return;
922
923 if (!r->res_lvbptr)
924 r->res_lvbptr = allocate_lvb(r->res_ls);
925
926 if (!r->res_lvbptr)
927 return;
928
929 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
930 r->res_lvbseq++;
931 rsb_clear_flag(r, RSB_VALNOTVALID);
932}
933
934/* lkb is process copy (pc) */
935
936static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
937 struct dlm_message *ms)
938{
939 int b;
940
941 if (!lkb->lkb_lvbptr)
942 return;
943
944 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
945 return;
946
947 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
948 if (b == 1) {
949 int len = receive_extralen(ms);
950 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
951 lkb->lkb_lvbseq = ms->m_lvbseq;
952 }
953}
954
955/* Manipulate lkb's on rsb's convert/granted/waiting queues
956 remove_lock -- used for unlock, removes lkb from granted
957 revert_lock -- used for cancel, moves lkb from convert to granted
958 grant_lock -- used for request and convert, adds lkb to granted or
959 moves lkb from convert or waiting to granted
960
961 Each of these is used for master or local copy lkb's. There is
962 also a _pc() variation used to make the corresponding change on
963 a process copy (pc) lkb. */
964
965static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
966{
967 del_lkb(r, lkb);
968 lkb->lkb_grmode = DLM_LOCK_IV;
969 /* this unhold undoes the original ref from create_lkb()
970 so this leads to the lkb being freed */
971 unhold_lkb(lkb);
972}
973
974static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
975{
976 set_lvb_unlock(r, lkb);
977 _remove_lock(r, lkb);
978}
979
980static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
981{
982 _remove_lock(r, lkb);
983}
984
985static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
986{
987 lkb->lkb_rqmode = DLM_LOCK_IV;
988
989 switch (lkb->lkb_status) {
990 case DLM_LKSTS_GRANTED:
991 break;
992 case DLM_LKSTS_CONVERT:
993 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
994 break;
995 case DLM_LKSTS_WAITING:
996 del_lkb(r, lkb);
997 lkb->lkb_grmode = DLM_LOCK_IV;
998 /* this unhold undoes the original ref from create_lkb()
999 so this leads to the lkb being freed */
1000 unhold_lkb(lkb);
1001 break;
1002 default:
1003 log_print("invalid status for revert %d", lkb->lkb_status);
1004 }
1005}
1006
1007static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
1008{
1009 revert_lock(r, lkb);
1010}
1011
1012static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1013{
1014 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
1015 lkb->lkb_grmode = lkb->lkb_rqmode;
1016 if (lkb->lkb_status)
1017 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
1018 else
1019 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
1020 }
1021
1022 lkb->lkb_rqmode = DLM_LOCK_IV;
1023}
1024
1025static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1026{
1027 set_lvb_lock(r, lkb);
1028 _grant_lock(r, lkb);
1029 lkb->lkb_highbast = 0;
1030}
1031
1032static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1033 struct dlm_message *ms)
1034{
1035 set_lvb_lock_pc(r, lkb, ms);
1036 _grant_lock(r, lkb);
1037}
1038
1039/* called by grant_pending_locks() which means an async grant message must
1040 be sent to the requesting node in addition to granting the lock if the
1041 lkb belongs to a remote node. */
1042
1043static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1044{
1045 grant_lock(r, lkb);
1046 if (is_master_copy(lkb))
1047 send_grant(r, lkb);
1048 else
1049 queue_cast(r, lkb, 0);
1050}
1051
1052static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1053{
1054 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1055 lkb_statequeue);
1056 if (lkb->lkb_id == first->lkb_id)
1057 return 1;
1058
1059 return 0;
1060}
1061
1062/* Check if the given lkb conflicts with another lkb on the queue. */
1063
1064static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1065{
1066 struct dlm_lkb *this;
1067
1068 list_for_each_entry(this, head, lkb_statequeue) {
1069 if (this == lkb)
1070 continue;
1071 if (!modes_compat(this, lkb))
1072 return 1;
1073 }
1074 return 0;
1075}
1076
1077/*
1078 * "A conversion deadlock arises with a pair of lock requests in the converting
1079 * queue for one resource. The granted mode of each lock blocks the requested
1080 * mode of the other lock."
1081 *
1082 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1083 * convert queue from being granted, then demote lkb (set grmode to NL).
1084 * This second form requires that we check for conv-deadlk even when
1085 * now == 0 in _can_be_granted().
1086 *
1087 * Example:
1088 * Granted Queue: empty
1089 * Convert Queue: NL->EX (first lock)
1090 * PR->EX (second lock)
1091 *
1092 * The first lock can't be granted because of the granted mode of the second
1093 * lock and the second lock can't be granted because it's not first in the
1094 * list. We demote the granted mode of the second lock (the lkb passed to this
1095 * function).
1096 *
1097 * After the resolution, the "grant pending" function needs to go back and try
1098 * to grant locks on the convert queue again since the first lock can now be
1099 * granted.
1100 */
1101
1102static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1103{
1104 struct dlm_lkb *this, *first = NULL, *self = NULL;
1105
1106 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1107 if (!first)
1108 first = this;
1109 if (this == lkb) {
1110 self = lkb;
1111 continue;
1112 }
1113
1114 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1115 return 1;
1116 }
1117
1118 /* if lkb is on the convert queue and is preventing the first
1119 from being granted, then there's deadlock and we demote lkb.
1120 multiple converting locks may need to do this before the first
1121 converting lock can be granted. */
1122
1123 if (self && self != first) {
1124 if (!modes_compat(lkb, first) &&
1125 !queue_conflict(&rsb->res_grantqueue, first))
1126 return 1;
1127 }
1128
1129 return 0;
1130}
1131
1132/*
1133 * Return 1 if the lock can be granted, 0 otherwise.
1134 * Also detect and resolve conversion deadlocks.
1135 *
1136 * lkb is the lock to be granted
1137 *
1138 * now is 1 if the function is being called in the context of the
1139 * immediate request, it is 0 if called later, after the lock has been
1140 * queued.
1141 *
1142 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1143 */
1144
1145static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1146{
1147 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1148
1149 /*
1150 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1151 * a new request for a NL mode lock being blocked.
1152 *
1153 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1154 * request, then it would be granted. In essence, the use of this flag
1155 * tells the Lock Manager to expedite this request by not considering
1156 * what may be in the CONVERTING or WAITING queues... As of this
1157 * writing, the EXPEDITE flag can be used only with new requests for NL
1158 * mode locks. This flag is not valid for conversion requests.
1159 *
1160 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1161 * conversion or used with a non-NL requested mode. We also know an
1162 * EXPEDITE request is always granted immediately, so now must always
1163 * be 1. The full condition to grant an expedite request: (now &&
1164 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1165 * therefore be shortened to just checking the flag.
1166 */
1167
1168 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1169 return 1;
1170
1171 /*
1172 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1173 * added to the remaining conditions.
1174 */
1175
1176 if (queue_conflict(&r->res_grantqueue, lkb))
1177 goto out;
1178
1179 /*
1180 * 6-3: By default, a conversion request is immediately granted if the
1181 * requested mode is compatible with the modes of all other granted
1182 * locks
1183 */
1184
1185 if (queue_conflict(&r->res_convertqueue, lkb))
1186 goto out;
1187
1188 /*
1189 * 6-5: But the default algorithm for deciding whether to grant or
1190 * queue conversion requests does not by itself guarantee that such
1191 * requests are serviced on a "first come first serve" basis. This, in
1192 * turn, can lead to a phenomenon known as "indefinite postponement".
1193 *
1194 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1195 * the system service employed to request a lock conversion. This flag
1196 * forces certain conversion requests to be queued, even if they are
1197 * compatible with the granted modes of other locks on the same
1198 * resource. Thus, the use of this flag results in conversion requests
1199 * being ordered on a "first come first serve" basis.
1200 *
1201 * DCT: This condition is all about new conversions being able to occur
1202 * "in place" while the lock remains on the granted queue (assuming
1203 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1204 * doesn't _have_ to go onto the convert queue where it's processed in
1205 * order. The "now" variable is necessary to distinguish converts
1206 * being received and processed for the first time now, because once a
1207 * convert is moved to the conversion queue the condition below applies
1208 * requiring fifo granting.
1209 */
1210
1211 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1212 return 1;
1213
1214 /*
1215 * The NOORDER flag is set to avoid the standard vms rules on grant
1216 * order.
1217 */
1218
1219 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1220 return 1;
1221
1222 /*
1223 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1224 * granted until all other conversion requests ahead of it are granted
1225 * and/or canceled.
1226 */
1227
1228 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1229 return 1;
1230
1231 /*
1232 * 6-4: By default, a new request is immediately granted only if all
1233 * three of the following conditions are satisfied when the request is
1234 * issued:
1235 * - The queue of ungranted conversion requests for the resource is
1236 * empty.
1237 * - The queue of ungranted new requests for the resource is empty.
1238 * - The mode of the new request is compatible with the most
1239 * restrictive mode of all granted locks on the resource.
1240 */
1241
1242 if (now && !conv && list_empty(&r->res_convertqueue) &&
1243 list_empty(&r->res_waitqueue))
1244 return 1;
1245
1246 /*
1247 * 6-4: Once a lock request is in the queue of ungranted new requests,
1248 * it cannot be granted until the queue of ungranted conversion
1249 * requests is empty, all ungranted new requests ahead of it are
1250 * granted and/or canceled, and it is compatible with the granted mode
1251 * of the most restrictive lock granted on the resource.
1252 */
1253
1254 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1255 first_in_list(lkb, &r->res_waitqueue))
1256 return 1;
1257
1258 out:
1259 /*
1260 * The following, enabled by CONVDEADLK, departs from VMS.
1261 */
1262
1263 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1264 conversion_deadlock_detect(r, lkb)) {
1265 lkb->lkb_grmode = DLM_LOCK_NL;
1266 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1267 }
1268
1269 return 0;
1270}
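
/*
 * Editorial sketch (not part of the original file): the conventional
 * VMS compatibility relation that queue_conflict() and modes_compat()
 * consult via __dlm_compat_matrix, shown here without the extra row
 * and column the real table reserves for DLM_LOCK_IV.  1 means the
 * two modes can be held on the resource at the same time.
 */
static const int vms_compat_sketch[6][6] = {
	      /*  NL  CR  CW  PR  PW  EX */
	/* NL */ { 1,  1,  1,  1,  1,  1 },
	/* CR */ { 1,  1,  1,  1,  1,  0 },
	/* CW */ { 1,  1,  1,  0,  0,  0 },
	/* PR */ { 1,  1,  0,  1,  0,  0 },
	/* PW */ { 1,  1,  0,  0,  0,  0 },
	/* EX */ { 1,  0,  0,  0,  0,  0 }
};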
1271
1272/*
1273 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1274 * simple way to provide a big optimization to applications that can use them.
1275 */
1276
1277static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1278{
1279 uint32_t flags = lkb->lkb_exflags;
1280 int rv;
1281 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1282
1283 rv = _can_be_granted(r, lkb, now);
1284 if (rv)
1285 goto out;
1286
1287 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1288 goto out;
1289
1290 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1291 alt = DLM_LOCK_PR;
1292 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1293 alt = DLM_LOCK_CW;
1294
1295 if (alt) {
1296 lkb->lkb_rqmode = alt;
1297 rv = _can_be_granted(r, lkb, now);
1298 if (rv)
1299 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1300 else
1301 lkb->lkb_rqmode = rqmode;
1302 }
1303 out:
1304 return rv;
1305}
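
/*
 * Editorial usage sketch for the alternate-mode flags handled above;
 * "ls" and the names below are illustrative, not part of this file.
 * The caller asks for PW but accepts PR: if the alternate mode was
 * granted, DLM_SBF_ALTMODE appears in sb_flags (set via lkb_sbflags
 * above) and the granted mode is PR rather than PW.
 */
static struct dlm_lksb alt_lksb;

static void alt_ast(void *astarg)
{
	struct dlm_lksb *lksb = astarg;

	if (!lksb->sb_status && (lksb->sb_flags & DLM_SBF_ALTMODE))
		; /* we hold PR, the alternate, not the PW we requested */
}

static int request_pw_or_pr(dlm_lockspace_t *ls)
{
	return dlm_lock(ls, DLM_LOCK_PW, &alt_lksb, DLM_LKF_ALTPR,
			"example", 7, 0, alt_ast, &alt_lksb, NULL);
}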
1306
1307static int grant_pending_convert(struct dlm_rsb *r, int high)
1308{
1309 struct dlm_lkb *lkb, *s;
1310 int hi, demoted, quit, grant_restart, demote_restart;
1311
1312 quit = 0;
1313 restart:
1314 grant_restart = 0;
1315 demote_restart = 0;
1316 hi = DLM_LOCK_IV;
1317
1318 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1319 demoted = is_demoted(lkb);
1320 if (can_be_granted(r, lkb, 0)) {
1321 grant_lock_pending(r, lkb);
1322 grant_restart = 1;
1323 } else {
1324 hi = max_t(int, lkb->lkb_rqmode, hi);
1325 if (!demoted && is_demoted(lkb))
1326 demote_restart = 1;
1327 }
1328 }
1329
1330 if (grant_restart)
1331 goto restart;
1332 if (demote_restart && !quit) {
1333 quit = 1;
1334 goto restart;
1335 }
1336
1337 return max_t(int, high, hi);
1338}
1339
1340static int grant_pending_wait(struct dlm_rsb *r, int high)
1341{
1342 struct dlm_lkb *lkb, *s;
1343
1344 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1345 if (can_be_granted(r, lkb, 0))
1346 grant_lock_pending(r, lkb);
1347 else
1348 high = max_t(int, lkb->lkb_rqmode, high);
1349 }
1350
1351 return high;
1352}
1353
1354static void grant_pending_locks(struct dlm_rsb *r)
1355{
1356 struct dlm_lkb *lkb, *s;
1357 int high = DLM_LOCK_IV;
1358
1359 DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
1360
1361 high = grant_pending_convert(r, high);
1362 high = grant_pending_wait(r, high);
1363
1364 if (high == DLM_LOCK_IV)
1365 return;
1366
1367 /*
1368 * If there are locks left on the wait/convert queue then send blocking
1369 * ASTs to granted locks based on the largest requested mode (high)
1370 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1371 */
1372
1373 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1374 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1375 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1376 queue_bast(r, lkb, high);
1377 lkb->lkb_highbast = high;
1378 }
1379 }
1380}
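
/*
 * Editorial worked example for the bast loop above: suppose a PR lock
 * is granted and an EX request has just been queued.  grant_pending_wait()
 * returns high == DLM_LOCK_EX, PR and EX are incompatible in the matrix,
 * so the PR holder receives a bast with mode EX and its lkb_highbast is
 * raised to EX; a second queued EX request then produces no duplicate
 * bast because lkb_highbast < high no longer holds.
 */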
1381
1382static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1383 struct dlm_lkb *lkb)
1384{
1385 struct dlm_lkb *gr;
1386
1387 list_for_each_entry(gr, head, lkb_statequeue) {
1388 if (gr->lkb_bastaddr &&
1389 gr->lkb_highbast < lkb->lkb_rqmode &&
1390 !modes_compat(gr, lkb)) {
1391 queue_bast(r, gr, lkb->lkb_rqmode);
1392 gr->lkb_highbast = lkb->lkb_rqmode;
1393 }
1394 }
1395}
1396
1397static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1398{
1399 send_bast_queue(r, &r->res_grantqueue, lkb);
1400}
1401
1402static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1403{
1404 send_bast_queue(r, &r->res_grantqueue, lkb);
1405 send_bast_queue(r, &r->res_convertqueue, lkb);
1406}
1407
1408/* set_master(r, lkb) -- set the master nodeid of a resource
1409
1410 The purpose of this function is to set the nodeid field in the given
1411 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1412 known, it can just be copied to the lkb and the function will return
1413 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1414 before it can be copied to the lkb.
1415
1416 When the rsb nodeid is being looked up remotely, the initial lkb
1417 causing the lookup is kept on the ls_waiters list waiting for the
1418	 lookup reply.  Other lkbs waiting for the same rsb lookup are kept
1419 on the rsb's res_lookup list until the master is verified.
1420
1421 Return values:
1422 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1423 1: the rsb master is not available and the lkb has been placed on
1424 a wait queue
1425*/
1426
1427static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1428{
1429 struct dlm_ls *ls = r->res_ls;
1430 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1431
1432 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1433 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1434 r->res_first_lkid = lkb->lkb_id;
1435 lkb->lkb_nodeid = r->res_nodeid;
1436 return 0;
1437 }
1438
1439 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1440 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1441 return 1;
1442 }
1443
1444 if (r->res_nodeid == 0) {
1445 lkb->lkb_nodeid = 0;
1446 return 0;
1447 }
1448
1449 if (r->res_nodeid > 0) {
1450 lkb->lkb_nodeid = r->res_nodeid;
1451 return 0;
1452 }
1453
1454 DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
1455
1456 dir_nodeid = dlm_dir_nodeid(r);
1457
1458 if (dir_nodeid != our_nodeid) {
1459 r->res_first_lkid = lkb->lkb_id;
1460 send_lookup(r, lkb);
1461 return 1;
1462 }
1463
1464 for (;;) {
1465 /* It's possible for dlm_scand to remove an old rsb for
1466		   this same resource from the toss list, for us to create
1467		   a new one, look up the master locally, and find that it
1468 already exists just before dlm_scand does the
1469 dir_remove() on the previous rsb. */
1470
1471 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1472 r->res_length, &ret_nodeid);
1473 if (!error)
1474 break;
1475 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1476 schedule();
1477 }
1478
1479 if (ret_nodeid == our_nodeid) {
1480 r->res_first_lkid = 0;
1481 r->res_nodeid = 0;
1482 lkb->lkb_nodeid = 0;
1483 } else {
1484 r->res_first_lkid = lkb->lkb_id;
1485 r->res_nodeid = ret_nodeid;
1486 lkb->lkb_nodeid = ret_nodeid;
1487 }
1488 return 0;
1489}
1490
1491static void process_lookup_list(struct dlm_rsb *r)
1492{
1493 struct dlm_lkb *lkb, *safe;
1494
1495 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1496 list_del(&lkb->lkb_rsb_lookup);
1497 _request_lock(r, lkb);
1498 schedule();
1499 }
1500}
1501
1502/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1503
1504static void confirm_master(struct dlm_rsb *r, int error)
1505{
1506 struct dlm_lkb *lkb;
1507
1508 if (!r->res_first_lkid)
1509 return;
1510
1511 switch (error) {
1512 case 0:
1513 case -EINPROGRESS:
1514 r->res_first_lkid = 0;
1515 process_lookup_list(r);
1516 break;
1517
1518 case -EAGAIN:
1519 /* the remote master didn't queue our NOQUEUE request;
1520 make a waiting lkb the first_lkid */
1521
1522 r->res_first_lkid = 0;
1523
1524 if (!list_empty(&r->res_lookup)) {
1525 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1526 lkb_rsb_lookup);
1527 list_del(&lkb->lkb_rsb_lookup);
1528 r->res_first_lkid = lkb->lkb_id;
1529 _request_lock(r, lkb);
1530 } else
1531 r->res_nodeid = -1;
1532 break;
1533
1534 default:
1535 log_error(r->res_ls, "confirm_master unknown error %d", error);
1536 }
1537}
1538
1539static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1540 int namelen, uint32_t parent_lkid, void *ast,
1541 void *astarg, void *bast, struct dlm_args *args)
1542{
1543 int rv = -EINVAL;
1544
1545 /* check for invalid arg usage */
1546
1547 if (mode < 0 || mode > DLM_LOCK_EX)
1548 goto out;
1549
1550 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1551 goto out;
1552
1553 if (flags & DLM_LKF_CANCEL)
1554 goto out;
1555
1556 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1557 goto out;
1558
1559 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1560 goto out;
1561
1562 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1563 goto out;
1564
1565 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1566 goto out;
1567
1568 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1569 goto out;
1570
1571 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1572 goto out;
1573
1574 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1575 goto out;
1576
1577 if (!ast || !lksb)
1578 goto out;
1579
1580 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1581 goto out;
1582
1583 /* parent/child locks not yet supported */
1584 if (parent_lkid)
1585 goto out;
1586
1587 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1588 goto out;
1589
1590	/* these args will be copied to the lkb in validate_lock_args;
1591	   this cannot be done now because, when converting locks, fields in
1592	   an active lkb cannot be modified before locking the rsb */
1593
1594 args->flags = flags;
1595 args->astaddr = ast;
1596 args->astparam = (long) astarg;
1597 args->bastaddr = bast;
1598 args->mode = mode;
1599 args->lksb = lksb;
1600 rv = 0;
1601 out:
1602 return rv;
1603}
1604
1605static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1606{
1607 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1608 DLM_LKF_FORCEUNLOCK))
1609 return -EINVAL;
1610
1611 args->flags = flags;
1612 args->astparam = (long) astarg;
1613 return 0;
1614}
1615
1616static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1617 struct dlm_args *args)
1618{
1619 int rv = -EINVAL;
1620
1621 if (args->flags & DLM_LKF_CONVERT) {
1622 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1623 goto out;
1624
1625 if (args->flags & DLM_LKF_QUECVT &&
1626 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1627 goto out;
1628
1629 rv = -EBUSY;
1630 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1631 goto out;
1632
1633 if (lkb->lkb_wait_type)
1634 goto out;
1635 }
1636
1637 lkb->lkb_exflags = args->flags;
1638 lkb->lkb_sbflags = 0;
1639 lkb->lkb_astaddr = args->astaddr;
1640 lkb->lkb_astparam = args->astparam;
1641 lkb->lkb_bastaddr = args->bastaddr;
1642 lkb->lkb_rqmode = args->mode;
1643 lkb->lkb_lksb = args->lksb;
1644 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1645 lkb->lkb_ownpid = (int) current->pid;
1646 rv = 0;
1647 out:
1648 return rv;
1649}
1650
1651static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1652{
1653 int rv = -EINVAL;
1654
1655 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1656 goto out;
1657
1658 if (args->flags & DLM_LKF_FORCEUNLOCK)
1659 goto out_ok;
1660
1661 if (args->flags & DLM_LKF_CANCEL &&
1662 lkb->lkb_status == DLM_LKSTS_GRANTED)
1663 goto out;
1664
1665 if (!(args->flags & DLM_LKF_CANCEL) &&
1666 lkb->lkb_status != DLM_LKSTS_GRANTED)
1667 goto out;
1668
1669 rv = -EBUSY;
1670 if (lkb->lkb_wait_type)
1671 goto out;
1672
1673 out_ok:
1674 lkb->lkb_exflags = args->flags;
1675 lkb->lkb_sbflags = 0;
1676 lkb->lkb_astparam = args->astparam;
1677
1678 rv = 0;
1679 out:
1680 return rv;
1681}
1682
1683/*
1684 * Four stage 4 varieties:
1685 * do_request(), do_convert(), do_unlock(), do_cancel()
1686	 * These are called on the master node for the given lock,
1687	 * always from the central locking logic.
1688 */
1689
1690static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1691{
1692 int error = 0;
1693
1694 if (can_be_granted(r, lkb, 1)) {
1695 grant_lock(r, lkb);
1696 queue_cast(r, lkb, 0);
1697 goto out;
1698 }
1699
1700 if (can_be_queued(lkb)) {
1701 error = -EINPROGRESS;
1702 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1703 send_blocking_asts(r, lkb);
1704 goto out;
1705 }
1706
1707 error = -EAGAIN;
1708 if (force_blocking_asts(lkb))
1709 send_blocking_asts_all(r, lkb);
1710 queue_cast(r, lkb, -EAGAIN);
1711
1712 out:
1713 return error;
1714}
1715
1716static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1717{
1718 int error = 0;
1719
1720 /* changing an existing lock may allow others to be granted */
1721
1722 if (can_be_granted(r, lkb, 1)) {
1723 grant_lock(r, lkb);
1724 queue_cast(r, lkb, 0);
1725 grant_pending_locks(r);
1726 goto out;
1727 }
1728
1729 if (can_be_queued(lkb)) {
1730 if (is_demoted(lkb))
1731 grant_pending_locks(r);
1732 error = -EINPROGRESS;
1733 del_lkb(r, lkb);
1734 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1735 send_blocking_asts(r, lkb);
1736 goto out;
1737 }
1738
1739 error = -EAGAIN;
1740 if (force_blocking_asts(lkb))
1741 send_blocking_asts_all(r, lkb);
1742 queue_cast(r, lkb, -EAGAIN);
1743
1744 out:
1745 return error;
1746}
1747
1748static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1749{
1750 remove_lock(r, lkb);
1751 queue_cast(r, lkb, -DLM_EUNLOCK);
1752 grant_pending_locks(r);
1753 return -DLM_EUNLOCK;
1754}
1755
1756/* FIXME: if revert_lock() finds that the lkb is granted, we should
1757 skip the queue_cast(ECANCEL). It indicates that the request/convert
1758 completed (and queued a normal ast) just before the cancel; we don't
1759 want to clobber the sb_result for the normal ast with ECANCEL. */
1760
1761static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1762{
1763 revert_lock(r, lkb);
1764 queue_cast(r, lkb, -DLM_ECANCEL);
1765 grant_pending_locks(r);
1766 return -DLM_ECANCEL;
1767}
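
/*
 * Editorial call-pipeline sketch for a request whose rsb is mastered
 * locally (for a remote master, stage 3 sends DLM_MSG_REQUEST instead,
 * and receive_request() runs do_request() on the master node):
 *
 *   dlm_lock()           stage 1, public entry point
 *     request_lock()     stage 2, validate args, find and lock the rsb
 *       _request_lock()  stage 3, set_master(), then local/remote split
 *         do_request()   stage 4, grant, queue (-EINPROGRESS) or -EAGAIN
 */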
1768
1769/*
1770 * Four stage 3 varieties:
1771 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1772 */
1773
1774/* add a new lkb to a possibly new rsb, called by requesting process */
1775
1776static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1777{
1778 int error;
1779
1780 /* set_master: sets lkb nodeid from r */
1781
1782 error = set_master(r, lkb);
1783 if (error < 0)
1784 goto out;
1785 if (error) {
1786 error = 0;
1787 goto out;
1788 }
1789
1790 if (is_remote(r))
1791 /* receive_request() calls do_request() on remote node */
1792 error = send_request(r, lkb);
1793 else
1794 error = do_request(r, lkb);
1795 out:
1796 return error;
1797}
1798
1799/* change some property of an existing lkb, e.g. mode */
1800
1801static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1802{
1803 int error;
1804
1805 if (is_remote(r))
1806 /* receive_convert() calls do_convert() on remote node */
1807 error = send_convert(r, lkb);
1808 else
1809 error = do_convert(r, lkb);
1810
1811 return error;
1812}
1813
1814/* remove an existing lkb from the granted queue */
1815
1816static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1817{
1818 int error;
1819
1820 if (is_remote(r))
1821 /* receive_unlock() calls do_unlock() on remote node */
1822 error = send_unlock(r, lkb);
1823 else
1824 error = do_unlock(r, lkb);
1825
1826 return error;
1827}
1828
1829/* remove an existing lkb from the convert or wait queue */
1830
1831static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1832{
1833 int error;
1834
1835 if (is_remote(r))
1836 /* receive_cancel() calls do_cancel() on remote node */
1837 error = send_cancel(r, lkb);
1838 else
1839 error = do_cancel(r, lkb);
1840
1841 return error;
1842}
1843
1844/*
1845 * Four stage 2 varieties:
1846 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1847 */
1848
1849static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1850 int len, struct dlm_args *args)
1851{
1852 struct dlm_rsb *r;
1853 int error;
1854
1855 error = validate_lock_args(ls, lkb, args);
1856 if (error)
1857 goto out;
1858
1859 error = find_rsb(ls, name, len, R_CREATE, &r);
1860 if (error)
1861 goto out;
1862
1863 lock_rsb(r);
1864
1865 attach_lkb(r, lkb);
1866 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1867
1868 error = _request_lock(r, lkb);
1869
1870 unlock_rsb(r);
1871 put_rsb(r);
1872
1873 out:
1874 return error;
1875}
1876
1877static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1878 struct dlm_args *args)
1879{
1880 struct dlm_rsb *r;
1881 int error;
1882
1883 r = lkb->lkb_resource;
1884
1885 hold_rsb(r);
1886 lock_rsb(r);
1887
1888 error = validate_lock_args(ls, lkb, args);
1889 if (error)
1890 goto out;
1891
1892 error = _convert_lock(r, lkb);
1893 out:
1894 unlock_rsb(r);
1895 put_rsb(r);
1896 return error;
1897}
1898
1899static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1900 struct dlm_args *args)
1901{
1902 struct dlm_rsb *r;
1903 int error;
1904
1905 r = lkb->lkb_resource;
1906
1907 hold_rsb(r);
1908 lock_rsb(r);
1909
1910 error = validate_unlock_args(lkb, args);
1911 if (error)
1912 goto out;
1913
1914 error = _unlock_lock(r, lkb);
1915 out:
1916 unlock_rsb(r);
1917 put_rsb(r);
1918 return error;
1919}
1920
1921static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1922 struct dlm_args *args)
1923{
1924 struct dlm_rsb *r;
1925 int error;
1926
1927 r = lkb->lkb_resource;
1928
1929 hold_rsb(r);
1930 lock_rsb(r);
1931
1932 error = validate_unlock_args(lkb, args);
1933 if (error)
1934 goto out;
1935
1936 error = _cancel_lock(r, lkb);
1937 out:
1938 unlock_rsb(r);
1939 put_rsb(r);
1940 return error;
1941}
1942
1943/*
1944 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1945 */
1946
1947int dlm_lock(dlm_lockspace_t *lockspace,
1948 int mode,
1949 struct dlm_lksb *lksb,
1950 uint32_t flags,
1951 void *name,
1952 unsigned int namelen,
1953 uint32_t parent_lkid,
1954 void (*ast) (void *astarg),
1955 void *astarg,
1956 void (*bast) (void *astarg, int mode))
1957{
1958 struct dlm_ls *ls;
1959 struct dlm_lkb *lkb;
1960 struct dlm_args args;
1961 int error, convert = flags & DLM_LKF_CONVERT;
1962
1963 ls = dlm_find_lockspace_local(lockspace);
1964 if (!ls)
1965 return -EINVAL;
1966
1967 lock_recovery(ls);
1968
1969 if (convert)
1970 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1971 else
1972 error = create_lkb(ls, &lkb);
1973
1974 if (error)
1975 goto out;
1976
1977 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1978 astarg, bast, &args);
1979 if (error)
1980 goto out_put;
1981
1982 if (convert)
1983 error = convert_lock(ls, lkb, &args);
1984 else
1985 error = request_lock(ls, lkb, name, namelen, &args);
1986
1987 if (error == -EINPROGRESS)
1988 error = 0;
1989 out_put:
1990 if (convert || error)
1991 __put_lkb(ls, lkb);
1992 if (error == -EAGAIN)
1993 error = 0;
1994 out:
1995 unlock_recovery(ls);
1996 dlm_put_lockspace(ls);
1997 return error;
1998}
1999
2000int dlm_unlock(dlm_lockspace_t *lockspace,
2001 uint32_t lkid,
2002 uint32_t flags,
2003 struct dlm_lksb *lksb,
2004 void *astarg)
2005{
2006 struct dlm_ls *ls;
2007 struct dlm_lkb *lkb;
2008 struct dlm_args args;
2009 int error;
2010
2011 ls = dlm_find_lockspace_local(lockspace);
2012 if (!ls)
2013 return -EINVAL;
2014
2015 lock_recovery(ls);
2016
2017 error = find_lkb(ls, lkid, &lkb);
2018 if (error)
2019 goto out;
2020
2021 error = set_unlock_args(flags, astarg, &args);
2022 if (error)
2023 goto out_put;
2024
2025 if (flags & DLM_LKF_CANCEL)
2026 error = cancel_lock(ls, lkb, &args);
2027 else
2028 error = unlock_lock(ls, lkb, &args);
2029
2030 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2031 error = 0;
2032 out_put:
2033 dlm_put_lkb(lkb);
2034 out:
2035 unlock_recovery(ls);
2036 dlm_put_lockspace(ls);
2037 return error;
2038}
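
/*
 * Editorial usage sketch for the two stage-1 entry points above.  The
 * lockspace comes from dlm_new_lockspace() and completion is delivered
 * asynchronously through the ast; all names here are illustrative.
 */
static struct dlm_lksb ex_lksb;

static void ex_ast(void *astarg)
{
	/* ex_lksb.sb_status now holds the final result, e.g. 0 or -EAGAIN */
}

static int ex_lock_unlock(dlm_lockspace_t *ls)
{
	int error;

	error = dlm_lock(ls, DLM_LOCK_EX, &ex_lksb, 0, "example", 7, 0,
			 ex_ast, NULL, NULL);
	if (error)
		return error;
	/* ... wait for ex_ast to run, use the resource ... */
	return dlm_unlock(ls, ex_lksb.sb_lkid, 0, &ex_lksb, NULL);
}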
2039
2040/*
2041 * send/receive routines for remote operations and replies
2042 *
2043 * send_args
2044 * send_common
2045 * send_request receive_request
2046 * send_convert receive_convert
2047 * send_unlock receive_unlock
2048 * send_cancel receive_cancel
2049 * send_grant receive_grant
2050 * send_bast receive_bast
2051 * send_lookup receive_lookup
2052 * send_remove receive_remove
2053 *
2054 * send_common_reply
2055 * receive_request_reply send_request_reply
2056 * receive_convert_reply send_convert_reply
2057 * receive_unlock_reply send_unlock_reply
2058 * receive_cancel_reply send_cancel_reply
2059 * receive_lookup_reply send_lookup_reply
2060 */
2061
2062static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2063 int to_nodeid, int mstype,
2064 struct dlm_message **ms_ret,
2065 struct dlm_mhandle **mh_ret)
2066{
2067 struct dlm_message *ms;
2068 struct dlm_mhandle *mh;
2069 char *mb;
2070 int mb_len = sizeof(struct dlm_message);
2071
2072 switch (mstype) {
2073 case DLM_MSG_REQUEST:
2074 case DLM_MSG_LOOKUP:
2075 case DLM_MSG_REMOVE:
2076 mb_len += r->res_length;
2077 break;
2078 case DLM_MSG_CONVERT:
2079 case DLM_MSG_UNLOCK:
2080 case DLM_MSG_REQUEST_REPLY:
2081 case DLM_MSG_CONVERT_REPLY:
2082 case DLM_MSG_GRANT:
2083 if (lkb && lkb->lkb_lvbptr)
2084 mb_len += r->res_ls->ls_lvblen;
2085 break;
2086 }
2087
2088 /* get_buffer gives us a message handle (mh) that we need to
2089 pass into lowcomms_commit and a message buffer (mb) that we
2090 write our data into */
2091
2092 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2093 if (!mh)
2094 return -ENOBUFS;
2095
2096 memset(mb, 0, mb_len);
2097
2098 ms = (struct dlm_message *) mb;
2099
2100 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2101 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2102 ms->m_header.h_nodeid = dlm_our_nodeid();
2103 ms->m_header.h_length = mb_len;
2104 ms->m_header.h_cmd = DLM_MSG;
2105
2106 ms->m_type = mstype;
2107
2108 *mh_ret = mh;
2109 *ms_ret = ms;
2110 return 0;
2111}
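
/*
 * Editorial layout sketch of the buffer built above: a fixed-size
 * struct dlm_message (whose first member is the dlm_header) followed
 * by the variable m_extra area carrying either the resource name
 * (REQUEST, LOOKUP, REMOVE) or the LVB (the convert/unlock/grant
 * family); receive_extralen() below recovers the extra length from
 * h_length.
 *
 *   |<------------- h_length == mb_len ------------->|
 *   | struct dlm_message (incl. m_header) |  m_extra  |
 */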
2112
2113/* further lowcomms enhancements or alternate implementations may make
2114 the return value from this function useful at some point */
2115
2116static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2117{
2118 dlm_message_out(ms);
2119 dlm_lowcomms_commit_buffer(mh);
2120 return 0;
2121}
2122
2123static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2124 struct dlm_message *ms)
2125{
2126 ms->m_nodeid = lkb->lkb_nodeid;
2127 ms->m_pid = lkb->lkb_ownpid;
2128 ms->m_lkid = lkb->lkb_id;
2129 ms->m_remid = lkb->lkb_remid;
2130 ms->m_exflags = lkb->lkb_exflags;
2131 ms->m_sbflags = lkb->lkb_sbflags;
2132 ms->m_flags = lkb->lkb_flags;
2133 ms->m_lvbseq = lkb->lkb_lvbseq;
2134 ms->m_status = lkb->lkb_status;
2135 ms->m_grmode = lkb->lkb_grmode;
2136 ms->m_rqmode = lkb->lkb_rqmode;
2137 ms->m_hash = r->res_hash;
2138
2139 /* m_result and m_bastmode are set from function args,
2140 not from lkb fields */
2141
2142 if (lkb->lkb_bastaddr)
2143 ms->m_asts |= AST_BAST;
2144 if (lkb->lkb_astaddr)
2145 ms->m_asts |= AST_COMP;
2146
2147 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2148 memcpy(ms->m_extra, r->res_name, r->res_length);
2149
2150 else if (lkb->lkb_lvbptr)
2151 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2153}
2154
2155static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2156{
2157 struct dlm_message *ms;
2158 struct dlm_mhandle *mh;
2159 int to_nodeid, error;
2160
2161 add_to_waiters(lkb, mstype);
2162
2163 to_nodeid = r->res_nodeid;
2164
2165 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2166 if (error)
2167 goto fail;
2168
2169 send_args(r, lkb, ms);
2170
2171 error = send_message(mh, ms);
2172 if (error)
2173 goto fail;
2174 return 0;
2175
2176 fail:
2177 remove_from_waiters(lkb);
2178 return error;
2179}
2180
2181static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2182{
2183 return send_common(r, lkb, DLM_MSG_REQUEST);
2184}
2185
2186static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2187{
2188 int error;
2189
2190 error = send_common(r, lkb, DLM_MSG_CONVERT);
2191
2192 /* down conversions go without a reply from the master */
2193 if (!error && down_conversion(lkb)) {
2194 remove_from_waiters(lkb);
2195 r->res_ls->ls_stub_ms.m_result = 0;
2196 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2197 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2198 }
2199
2200 return error;
2201}
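
/*
 * Editorial note on the short-circuit above: a down-conversion (e.g.
 * EX -> NL) can never conflict with other granted locks, so the master
 * would always grant it; fabricating the reply locally from ls_stub_ms
 * saves a message round trip.  Up-conversions still wait on ls_waiters
 * for a real DLM_MSG_CONVERT_REPLY.
 */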
2202
2203/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2204 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2205 that the master is still correct. */
2206
2207static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2208{
2209 return send_common(r, lkb, DLM_MSG_UNLOCK);
2210}
2211
2212static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2213{
2214 return send_common(r, lkb, DLM_MSG_CANCEL);
2215}
2216
2217static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2218{
2219 struct dlm_message *ms;
2220 struct dlm_mhandle *mh;
2221 int to_nodeid, error;
2222
2223 to_nodeid = lkb->lkb_nodeid;
2224
2225 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2226 if (error)
2227 goto out;
2228
2229 send_args(r, lkb, ms);
2230
2231 ms->m_result = 0;
2232
2233 error = send_message(mh, ms);
2234 out:
2235 return error;
2236}
2237
2238static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2239{
2240 struct dlm_message *ms;
2241 struct dlm_mhandle *mh;
2242 int to_nodeid, error;
2243
2244 to_nodeid = lkb->lkb_nodeid;
2245
2246 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2247 if (error)
2248 goto out;
2249
2250 send_args(r, lkb, ms);
2251
2252 ms->m_bastmode = mode;
2253
2254 error = send_message(mh, ms);
2255 out:
2256 return error;
2257}
2258
2259static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2260{
2261 struct dlm_message *ms;
2262 struct dlm_mhandle *mh;
2263 int to_nodeid, error;
2264
2265 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2266
2267 to_nodeid = dlm_dir_nodeid(r);
2268
2269 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2270 if (error)
2271 goto fail;
2272
2273 send_args(r, lkb, ms);
2274
2275 error = send_message(mh, ms);
2276 if (error)
2277 goto fail;
2278 return 0;
2279
2280 fail:
2281 remove_from_waiters(lkb);
2282 return error;
2283}
2284
2285static int send_remove(struct dlm_rsb *r)
2286{
2287 struct dlm_message *ms;
2288 struct dlm_mhandle *mh;
2289 int to_nodeid, error;
2290
2291 to_nodeid = dlm_dir_nodeid(r);
2292
2293 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2294 if (error)
2295 goto out;
2296
2297 memcpy(ms->m_extra, r->res_name, r->res_length);
2298 ms->m_hash = r->res_hash;
2299
2300 error = send_message(mh, ms);
2301 out:
2302 return error;
2303}
2304
2305static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2306 int mstype, int rv)
2307{
2308 struct dlm_message *ms;
2309 struct dlm_mhandle *mh;
2310 int to_nodeid, error;
2311
2312 to_nodeid = lkb->lkb_nodeid;
2313
2314 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2315 if (error)
2316 goto out;
2317
2318 send_args(r, lkb, ms);
2319
2320 ms->m_result = rv;
2321
2322 error = send_message(mh, ms);
2323 out:
2324 return error;
2325}
2326
2327static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2328{
2329 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2330}
2331
2332static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2333{
2334 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2335}
2336
2337static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2338{
2339 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2340}
2341
2342static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2343{
2344 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2345}
2346
2347static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2348 int ret_nodeid, int rv)
2349{
2350 struct dlm_rsb *r = &ls->ls_stub_rsb;
2351 struct dlm_message *ms;
2352 struct dlm_mhandle *mh;
2353 int error, nodeid = ms_in->m_header.h_nodeid;
2354
2355 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2356 if (error)
2357 goto out;
2358
2359 ms->m_lkid = ms_in->m_lkid;
2360 ms->m_result = rv;
2361 ms->m_nodeid = ret_nodeid;
2362
2363 error = send_message(mh, ms);
2364 out:
2365 return error;
2366}
2367
2368/* which args we save from a received message depends heavily on the type
2369 of message, unlike the send side where we can safely send everything about
2370 the lkb for any type of message */
2371
2372static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2373{
2374 lkb->lkb_exflags = ms->m_exflags;
2375 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2376 (ms->m_flags & 0x0000FFFF);
2377}
2378
2379static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2380{
2381 lkb->lkb_sbflags = ms->m_sbflags;
2382 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2383 (ms->m_flags & 0x0000FFFF);
2384}
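
/*
 * Editorial sketch of the lkb_flags split assumed by the two helpers
 * above: only the low 16 bits travel in ms->m_flags, while the high
 * 16 bits (internal flags such as DLM_IFL_MSTCPY, re-applied by the
 * receive paths) remain local to each node.
 *
 *   bit 31              16 15              0
 *      | node-local flags | wire flags     |
 */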
2385
2386static int receive_extralen(struct dlm_message *ms)
2387{
2388 return (ms->m_header.h_length - sizeof(struct dlm_message));
2389}
2390
2391static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2392 struct dlm_message *ms)
2393{
2394 int len;
2395
2396 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2397 if (!lkb->lkb_lvbptr)
2398 lkb->lkb_lvbptr = allocate_lvb(ls);
2399 if (!lkb->lkb_lvbptr)
2400 return -ENOMEM;
2401 len = receive_extralen(ms);
2402 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2403 }
2404 return 0;
2405}
2406
2407static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2408 struct dlm_message *ms)
2409{
2410 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2411 lkb->lkb_ownpid = ms->m_pid;
2412 lkb->lkb_remid = ms->m_lkid;
2413 lkb->lkb_grmode = DLM_LOCK_IV;
2414 lkb->lkb_rqmode = ms->m_rqmode;
2415 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2416 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2417
2418 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2419
2420 if (receive_lvb(ls, lkb, ms))
2421 return -ENOMEM;
2422
2423 return 0;
2424}
2425
2426static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2427 struct dlm_message *ms)
2428{
2429 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2430 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2431 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2432 lkb->lkb_id, lkb->lkb_remid);
2433 return -EINVAL;
2434 }
2435
2436 if (!is_master_copy(lkb))
2437 return -EINVAL;
2438
2439 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2440 return -EBUSY;
2441
2442 if (receive_lvb(ls, lkb, ms))
2443 return -ENOMEM;
2444
2445 lkb->lkb_rqmode = ms->m_rqmode;
2446 lkb->lkb_lvbseq = ms->m_lvbseq;
2447
2448 return 0;
2449}
2450
2451static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2452 struct dlm_message *ms)
2453{
2454 if (!is_master_copy(lkb))
2455 return -EINVAL;
2456 if (receive_lvb(ls, lkb, ms))
2457 return -ENOMEM;
2458 return 0;
2459}
2460
2461/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2462 uses to send a reply and that the remote end uses to process the reply. */
2463
2464static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2465{
2466 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2467 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2468 lkb->lkb_remid = ms->m_lkid;
2469}
2470
2471static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2472{
2473 struct dlm_lkb *lkb;
2474 struct dlm_rsb *r;
2475 int error, namelen;
2476
2477 error = create_lkb(ls, &lkb);
2478 if (error)
2479 goto fail;
2480
2481 receive_flags(lkb, ms);
2482 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2483 error = receive_request_args(ls, lkb, ms);
2484 if (error) {
2485 __put_lkb(ls, lkb);
2486 goto fail;
2487 }
2488
2489 namelen = receive_extralen(ms);
2490
2491 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2492 if (error) {
2493 __put_lkb(ls, lkb);
2494 goto fail;
2495 }
2496
2497 lock_rsb(r);
2498
2499 attach_lkb(r, lkb);
2500 error = do_request(r, lkb);
2501 send_request_reply(r, lkb, error);
2502
2503 unlock_rsb(r);
2504 put_rsb(r);
2505
2506 if (error == -EINPROGRESS)
2507 error = 0;
2508 if (error)
2509 dlm_put_lkb(lkb);
2510 return;
2511
2512 fail:
2513 setup_stub_lkb(ls, ms);
2514 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2515}
2516
2517static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2518{
2519 struct dlm_lkb *lkb;
2520 struct dlm_rsb *r;
2521 int error, reply = 1;
2522
2523 error = find_lkb(ls, ms->m_remid, &lkb);
2524 if (error)
2525 goto fail;
2526
2527 r = lkb->lkb_resource;
2528
2529 hold_rsb(r);
2530 lock_rsb(r);
2531
2532 receive_flags(lkb, ms);
2533 error = receive_convert_args(ls, lkb, ms);
2534 if (error)
2535 goto out;
2536 reply = !down_conversion(lkb);
2537
2538 error = do_convert(r, lkb);
2539 out:
2540 if (reply)
2541 send_convert_reply(r, lkb, error);
2542
2543 unlock_rsb(r);
2544 put_rsb(r);
2545 dlm_put_lkb(lkb);
2546 return;
2547
2548 fail:
2549 setup_stub_lkb(ls, ms);
2550 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2551}
2552
2553static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2554{
2555 struct dlm_lkb *lkb;
2556 struct dlm_rsb *r;
2557 int error;
2558
2559 error = find_lkb(ls, ms->m_remid, &lkb);
2560 if (error)
2561 goto fail;
2562
2563 r = lkb->lkb_resource;
2564
2565 hold_rsb(r);
2566 lock_rsb(r);
2567
2568 receive_flags(lkb, ms);
2569 error = receive_unlock_args(ls, lkb, ms);
2570 if (error)
2571 goto out;
2572
2573 error = do_unlock(r, lkb);
2574 out:
2575 send_unlock_reply(r, lkb, error);
2576
2577 unlock_rsb(r);
2578 put_rsb(r);
2579 dlm_put_lkb(lkb);
2580 return;
2581
2582 fail:
2583 setup_stub_lkb(ls, ms);
2584 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2585}
2586
2587static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2588{
2589 struct dlm_lkb *lkb;
2590 struct dlm_rsb *r;
2591 int error;
2592
2593 error = find_lkb(ls, ms->m_remid, &lkb);
2594 if (error)
2595 goto fail;
2596
2597 receive_flags(lkb, ms);
2598
2599 r = lkb->lkb_resource;
2600
2601 hold_rsb(r);
2602 lock_rsb(r);
2603
2604 error = do_cancel(r, lkb);
2605 send_cancel_reply(r, lkb, error);
2606
2607 unlock_rsb(r);
2608 put_rsb(r);
2609 dlm_put_lkb(lkb);
2610 return;
2611
2612 fail:
2613 setup_stub_lkb(ls, ms);
2614 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2615}
2616
2617static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2618{
2619 struct dlm_lkb *lkb;
2620 struct dlm_rsb *r;
2621 int error;
2622
2623 error = find_lkb(ls, ms->m_remid, &lkb);
2624 if (error) {
2625 log_error(ls, "receive_grant no lkb");
2626 return;
2627 }
2628 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2629
2630 r = lkb->lkb_resource;
2631
2632 hold_rsb(r);
2633 lock_rsb(r);
2634
2635 receive_flags_reply(lkb, ms);
2636 grant_lock_pc(r, lkb, ms);
2637 queue_cast(r, lkb, 0);
2638
2639 unlock_rsb(r);
2640 put_rsb(r);
2641 dlm_put_lkb(lkb);
2642}
2643
2644static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2645{
2646 struct dlm_lkb *lkb;
2647 struct dlm_rsb *r;
2648 int error;
2649
2650 error = find_lkb(ls, ms->m_remid, &lkb);
2651 if (error) {
2652 log_error(ls, "receive_bast no lkb");
2653 return;
2654 }
2655 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2656
2657 r = lkb->lkb_resource;
2658
2659 hold_rsb(r);
2660 lock_rsb(r);
2661
2662 queue_bast(r, lkb, ms->m_bastmode);
2663
2664 unlock_rsb(r);
2665 put_rsb(r);
2666 dlm_put_lkb(lkb);
2667}
2668
2669static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2670{
2671 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2672
2673 from_nodeid = ms->m_header.h_nodeid;
2674 our_nodeid = dlm_our_nodeid();
2675
2676 len = receive_extralen(ms);
2677
2678 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2679 if (dir_nodeid != our_nodeid) {
2680 log_error(ls, "lookup dir_nodeid %d from %d",
2681 dir_nodeid, from_nodeid);
2682 error = -EINVAL;
2683 ret_nodeid = -1;
2684 goto out;
2685 }
2686
2687 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2688
2689	/* Optimization: we're the master, so treat the lookup as a request */
2690 if (!error && ret_nodeid == our_nodeid) {
2691 receive_request(ls, ms);
2692 return;
2693 }
2694 out:
2695 send_lookup_reply(ls, ms, ret_nodeid, error);
2696}
2697
2698static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2699{
2700 int len, dir_nodeid, from_nodeid;
2701
2702 from_nodeid = ms->m_header.h_nodeid;
2703
2704 len = receive_extralen(ms);
2705
2706 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2707 if (dir_nodeid != dlm_our_nodeid()) {
2708 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2709 dir_nodeid, from_nodeid);
2710 return;
2711 }
2712
2713 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2714}
2715
2716static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2717{
2718 struct dlm_lkb *lkb;
2719 struct dlm_rsb *r;
2720 int error, mstype;
2721
2722 error = find_lkb(ls, ms->m_remid, &lkb);
2723 if (error) {
2724 log_error(ls, "receive_request_reply no lkb");
2725 return;
2726 }
2727 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2728
2729 mstype = lkb->lkb_wait_type;
2730 error = remove_from_waiters(lkb);
2731 if (error) {
2732 log_error(ls, "receive_request_reply not on waiters");
2733 goto out;
2734 }
2735
2736 /* this is the value returned from do_request() on the master */
2737 error = ms->m_result;
2738
2739 r = lkb->lkb_resource;
2740 hold_rsb(r);
2741 lock_rsb(r);
2742
2743 /* Optimization: the dir node was also the master, so it took our
2744	   lookup as a request and sent a request reply instead of a lookup reply */
2745 if (mstype == DLM_MSG_LOOKUP) {
2746 r->res_nodeid = ms->m_header.h_nodeid;
2747 lkb->lkb_nodeid = r->res_nodeid;
2748 }
2749
2750 switch (error) {
2751 case -EAGAIN:
2752 /* request would block (be queued) on remote master;
2753 the unhold undoes the original ref from create_lkb()
2754 so it leads to the lkb being freed */
2755 queue_cast(r, lkb, -EAGAIN);
2756 confirm_master(r, -EAGAIN);
2757 unhold_lkb(lkb);
2758 break;
2759
2760 case -EINPROGRESS:
2761 case 0:
2762 /* request was queued or granted on remote master */
2763 receive_flags_reply(lkb, ms);
2764 lkb->lkb_remid = ms->m_lkid;
2765 if (error)
2766 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2767 else {
2768 grant_lock_pc(r, lkb, ms);
2769 queue_cast(r, lkb, 0);
2770 }
2771 confirm_master(r, error);
2772 break;
2773
2774 case -EBADR:
2775 case -ENOTBLK:
2776 /* find_rsb failed to find rsb or rsb wasn't master */
2777 r->res_nodeid = -1;
2778 lkb->lkb_nodeid = -1;
2779 _request_lock(r, lkb);
2780 break;
2781
2782 default:
2783 log_error(ls, "receive_request_reply error %d", error);
2784 }
2785
2786 unlock_rsb(r);
2787 put_rsb(r);
2788 out:
2789 dlm_put_lkb(lkb);
2790}
2791
2792static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2793 struct dlm_message *ms)
2794{
2795 int error = ms->m_result;
2796
2797 /* this is the value returned from do_convert() on the master */
2798
2799 switch (error) {
2800 case -EAGAIN:
2801 /* convert would block (be queued) on remote master */
2802 queue_cast(r, lkb, -EAGAIN);
2803 break;
2804
2805 case -EINPROGRESS:
2806 /* convert was queued on remote master */
2807 del_lkb(r, lkb);
2808 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2809 break;
2810
2811 case 0:
2812 /* convert was granted on remote master */
2813 receive_flags_reply(lkb, ms);
2814 grant_lock_pc(r, lkb, ms);
2815 queue_cast(r, lkb, 0);
2816 break;
2817
2818 default:
2819 log_error(r->res_ls, "receive_convert_reply error %d", error);
2820 }
2821}
2822
2823static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2824{
2825 struct dlm_rsb *r = lkb->lkb_resource;
2826
2827 hold_rsb(r);
2828 lock_rsb(r);
2829
2830 __receive_convert_reply(r, lkb, ms);
2831
2832 unlock_rsb(r);
2833 put_rsb(r);
2834}
2835
2836static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2837{
2838 struct dlm_lkb *lkb;
2839 int error;
2840
2841 error = find_lkb(ls, ms->m_remid, &lkb);
2842 if (error) {
2843 log_error(ls, "receive_convert_reply no lkb");
2844 return;
2845 }
2846 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2847
2848 error = remove_from_waiters(lkb);
2849 if (error) {
2850 log_error(ls, "receive_convert_reply not on waiters");
2851 goto out;
2852 }
2853
2854 _receive_convert_reply(lkb, ms);
2855 out:
2856 dlm_put_lkb(lkb);
2857}
2858
2859static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2860{
2861 struct dlm_rsb *r = lkb->lkb_resource;
2862 int error = ms->m_result;
2863
2864 hold_rsb(r);
2865 lock_rsb(r);
2866
2867 /* this is the value returned from do_unlock() on the master */
2868
2869 switch (error) {
2870 case -DLM_EUNLOCK:
2871 receive_flags_reply(lkb, ms);
2872 remove_lock_pc(r, lkb);
2873 queue_cast(r, lkb, -DLM_EUNLOCK);
2874 break;
2875 default:
2876 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2877 }
2878
2879 unlock_rsb(r);
2880 put_rsb(r);
2881}
2882
2883static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2884{
2885 struct dlm_lkb *lkb;
2886 int error;
2887
2888 error = find_lkb(ls, ms->m_remid, &lkb);
2889 if (error) {
2890 log_error(ls, "receive_unlock_reply no lkb");
2891 return;
2892 }
2893 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2894
2895 error = remove_from_waiters(lkb);
2896 if (error) {
2897 log_error(ls, "receive_unlock_reply not on waiters");
2898 goto out;
2899 }
2900
2901 _receive_unlock_reply(lkb, ms);
2902 out:
2903 dlm_put_lkb(lkb);
2904}
2905
2906static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2907{
2908 struct dlm_rsb *r = lkb->lkb_resource;
2909 int error = ms->m_result;
2910
2911 hold_rsb(r);
2912 lock_rsb(r);
2913
2914 /* this is the value returned from do_cancel() on the master */
2915
2916 switch (error) {
2917 case -DLM_ECANCEL:
2918 receive_flags_reply(lkb, ms);
2919 revert_lock_pc(r, lkb);
2920 queue_cast(r, lkb, -DLM_ECANCEL);
2921 break;
2922 default:
2923 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2924 }
2925
2926 unlock_rsb(r);
2927 put_rsb(r);
2928}
2929
2930static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2931{
2932 struct dlm_lkb *lkb;
2933 int error;
2934
2935 error = find_lkb(ls, ms->m_remid, &lkb);
2936 if (error) {
2937 log_error(ls, "receive_cancel_reply no lkb");
2938 return;
2939 }
2940 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2941
2942 error = remove_from_waiters(lkb);
2943 if (error) {
2944 log_error(ls, "receive_cancel_reply not on waiters");
2945 goto out;
2946 }
2947
2948 _receive_cancel_reply(lkb, ms);
2949 out:
2950 dlm_put_lkb(lkb);
2951}
2952
2953static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2954{
2955 struct dlm_lkb *lkb;
2956 struct dlm_rsb *r;
2957 int error, ret_nodeid;
2958
2959 error = find_lkb(ls, ms->m_lkid, &lkb);
2960 if (error) {
2961 log_error(ls, "receive_lookup_reply no lkb");
2962 return;
2963 }
2964
2965 error = remove_from_waiters(lkb);
2966 if (error) {
2967 log_error(ls, "receive_lookup_reply not on waiters");
2968 goto out;
2969 }
2970
2971	/* this is the value returned by dlm_dir_lookup on the dir node
2972	   FIXME: will a non-zero error ever be returned? */
2973 error = ms->m_result;
2974
2975 r = lkb->lkb_resource;
2976 hold_rsb(r);
2977 lock_rsb(r);
2978
2979 ret_nodeid = ms->m_nodeid;
2980 if (ret_nodeid == dlm_our_nodeid()) {
2981 r->res_nodeid = 0;
2982 ret_nodeid = 0;
2983 r->res_first_lkid = 0;
2984 } else {
2985 /* set_master() will copy res_nodeid to lkb_nodeid */
2986 r->res_nodeid = ret_nodeid;
2987 }
2988
2989 _request_lock(r, lkb);
2990
2991 if (!ret_nodeid)
2992 process_lookup_list(r);
2993
2994 unlock_rsb(r);
2995 put_rsb(r);
2996 out:
2997 dlm_put_lkb(lkb);
2998}
2999
3000int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3001{
3002 struct dlm_message *ms = (struct dlm_message *) hd;
3003 struct dlm_ls *ls;
3004 int error;
3005
3006 if (!recovery)
3007 dlm_message_in(ms);
3008
3009 ls = dlm_find_lockspace_global(hd->h_lockspace);
3010 if (!ls) {
3011 log_print("drop message %d from %d for unknown lockspace %d",
3012 ms->m_type, nodeid, hd->h_lockspace);
3013 return -EINVAL;
3014 }
3015
3016 /* recovery may have just ended leaving a bunch of backed-up requests
3017 in the requestqueue; wait while dlm_recoverd clears them */
3018
3019 if (!recovery)
3020 dlm_wait_requestqueue(ls);
3021
3022 /* recovery may have just started while there were a bunch of
3023 in-flight requests -- save them in requestqueue to be processed
3024 after recovery. we can't let dlm_recvd block on the recovery
3025 lock. if dlm_recoverd is calling this function to clear the
3026 requestqueue, it needs to be interrupted (-EINTR) if another
3027 recovery operation is starting. */
3028
3029 while (1) {
3030 if (dlm_locking_stopped(ls)) {
3031 if (!recovery)
3032 dlm_add_requestqueue(ls, nodeid, hd);
3033 error = -EINTR;
3034 goto out;
3035 }
3036
3037 if (lock_recovery_try(ls))
3038 break;
3039 schedule();
3040 }
3041
3042 switch (ms->m_type) {
3043
3044 /* messages sent to a master node */
3045
3046 case DLM_MSG_REQUEST:
3047 receive_request(ls, ms);
3048 break;
3049
3050 case DLM_MSG_CONVERT:
3051 receive_convert(ls, ms);
3052 break;
3053
3054 case DLM_MSG_UNLOCK:
3055 receive_unlock(ls, ms);
3056 break;
3057
3058 case DLM_MSG_CANCEL:
3059 receive_cancel(ls, ms);
3060 break;
3061
3062 /* messages sent from a master node (replies to above) */
3063
3064 case DLM_MSG_REQUEST_REPLY:
3065 receive_request_reply(ls, ms);
3066 break;
3067
3068 case DLM_MSG_CONVERT_REPLY:
3069 receive_convert_reply(ls, ms);
3070 break;
3071
3072 case DLM_MSG_UNLOCK_REPLY:
3073 receive_unlock_reply(ls, ms);
3074 break;
3075
3076 case DLM_MSG_CANCEL_REPLY:
3077 receive_cancel_reply(ls, ms);
3078 break;
3079
3080 /* messages sent from a master node (only two types of async msg) */
3081
3082 case DLM_MSG_GRANT:
3083 receive_grant(ls, ms);
3084 break;
3085
3086 case DLM_MSG_BAST:
3087 receive_bast(ls, ms);
3088 break;
3089
3090 /* messages sent to a dir node */
3091
3092 case DLM_MSG_LOOKUP:
3093 receive_lookup(ls, ms);
3094 break;
3095
3096 case DLM_MSG_REMOVE:
3097 receive_remove(ls, ms);
3098 break;
3099
3100 /* messages sent from a dir node (remove has no reply) */
3101
3102 case DLM_MSG_LOOKUP_REPLY:
3103 receive_lookup_reply(ls, ms);
3104 break;
3105
3106 default:
3107 log_error(ls, "unknown message type %d", ms->m_type);
3108 }
3109
3110 unlock_recovery(ls);
3111 out:
3112 dlm_put_lockspace(ls);
3113 dlm_astd_wake();
3114 return 0;
3115}
3116
3118/*
3119 * Recovery related
3120 */
3121
3122static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3123{
3124 if (middle_conversion(lkb)) {
3125 hold_lkb(lkb);
3126 ls->ls_stub_ms.m_result = -EINPROGRESS;
3127 _remove_from_waiters(lkb);
3128 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3129
3130 /* Same special case as in receive_rcom_lock_args() */
3131 lkb->lkb_grmode = DLM_LOCK_IV;
3132 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3133 unhold_lkb(lkb);
3134
3135 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3136 lkb->lkb_flags |= DLM_IFL_RESEND;
3137 }
3138
3139 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3140 conversions are async; there's no reply from the remote master */
3141}
3142
3143/* A waiting lkb needs recovery if the master node has failed, or
3144 the master node is changing (only when no directory is used) */
3145
3146static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3147{
3148 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3149 return 1;
3150
3151 if (!dlm_no_directory(ls))
3152 return 0;
3153
3154 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3155 return 1;
3156
3157 return 0;
3158}
3159
3160/* Recovery for locks that are waiting for replies from nodes that are now
3161 gone. We can just complete unlocks and cancels by faking a reply from the
3162 dead node. Requests and up-conversions we flag to be resent after
3163 recovery. Down-conversions can just be completed with a fake reply like
3164 unlocks. Conversions between PR and CW need special attention. */
3165
3166void dlm_recover_waiters_pre(struct dlm_ls *ls)
3167{
3168 struct dlm_lkb *lkb, *safe;
3169
3170 mutex_lock(&ls->ls_waiters_mutex);
3171
3172 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3173 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3174 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3175
3176 /* all outstanding lookups, regardless of destination will be
3177 resent after recovery is done */
3178
3179 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3180 lkb->lkb_flags |= DLM_IFL_RESEND;
3181 continue;
3182 }
3183
3184 if (!waiter_needs_recovery(ls, lkb))
3185 continue;
3186
3187 switch (lkb->lkb_wait_type) {
3188
3189 case DLM_MSG_REQUEST:
3190 lkb->lkb_flags |= DLM_IFL_RESEND;
3191 break;
3192
3193 case DLM_MSG_CONVERT:
3194 recover_convert_waiter(ls, lkb);
3195 break;
3196
3197 case DLM_MSG_UNLOCK:
3198 hold_lkb(lkb);
3199 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3200 _remove_from_waiters(lkb);
3201 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3202 dlm_put_lkb(lkb);
3203 break;
3204
3205 case DLM_MSG_CANCEL:
3206 hold_lkb(lkb);
3207 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3208 _remove_from_waiters(lkb);
3209 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3210 dlm_put_lkb(lkb);
3211 break;
3212
3213 default:
3214 log_error(ls, "invalid lkb wait_type %d",
3215 lkb->lkb_wait_type);
3216 }
3217 schedule();
3218 }
3219 mutex_unlock(&ls->ls_waiters_mutex);
3220}
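
/*
 * Editorial worked example for the stub replies above: if node 3 dies
 * while this node waits for it to answer a DLM_MSG_UNLOCK, no reply
 * will ever arrive.  Filling ls_stub_ms.m_result with -DLM_EUNLOCK and
 * running _receive_unlock_reply() drives the normal reply path, so the
 * lkb is removed and the caller still gets its completion ast, exactly
 * as if the dead master had responded.
 */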
3221
3222static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3223{
3224 struct dlm_lkb *lkb;
3225 int rv = 0;
3226
3227 mutex_lock(&ls->ls_waiters_mutex);
3228 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3229 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3230 rv = lkb->lkb_wait_type;
3231 _remove_from_waiters(lkb);
3232 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3233 break;
3234 }
3235 }
3236 mutex_unlock(&ls->ls_waiters_mutex);
3237
3238 if (!rv)
3239 lkb = NULL;
3240 *lkb_ret = lkb;
3241 return rv;
3242}
3243
3244	/* Deal with lookups and lkbs marked RESEND from _pre.  We may now be the
3245 master or dir-node for r. Processing the lkb may result in it being placed
3246 back on waiters. */
3247
3248int dlm_recover_waiters_post(struct dlm_ls *ls)
3249{
3250 struct dlm_lkb *lkb;
3251 struct dlm_rsb *r;
3252 int error = 0, mstype;
3253
3254 while (1) {
3255 if (dlm_locking_stopped(ls)) {
3256 log_debug(ls, "recover_waiters_post aborted");
3257 error = -EINTR;
3258 break;
3259 }
3260
3261 mstype = remove_resend_waiter(ls, &lkb);
3262 if (!mstype)
3263 break;
3264
3265 r = lkb->lkb_resource;
3266
3267 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3268 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3269
3270 switch (mstype) {
3271
3272 case DLM_MSG_LOOKUP:
3273 hold_rsb(r);
3274 lock_rsb(r);
3275 _request_lock(r, lkb);
3276 if (is_master(r))
3277 confirm_master(r, 0);
3278 unlock_rsb(r);
3279 put_rsb(r);
3280 break;
3281
3282 case DLM_MSG_REQUEST:
3283 hold_rsb(r);
3284 lock_rsb(r);
3285 _request_lock(r, lkb);
3286 unlock_rsb(r);
3287 put_rsb(r);
3288 break;
3289
3290 case DLM_MSG_CONVERT:
3291 hold_rsb(r);
3292 lock_rsb(r);
3293 _convert_lock(r, lkb);
3294 unlock_rsb(r);
3295 put_rsb(r);
3296 break;
3297
3298 default:
3299 log_error(ls, "recover_waiters_post type %d", mstype);
3300 }
3301 }
3302
3303 return error;
3304}
3305
3306static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3307 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3308{
3309 struct dlm_ls *ls = r->res_ls;
3310 struct dlm_lkb *lkb, *safe;
3311
3312 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3313 if (test(ls, lkb)) {
3314 rsb_set_flag(r, RSB_LOCKS_PURGED);
3315 del_lkb(r, lkb);
3316 /* this put should free the lkb */
3317 if (!dlm_put_lkb(lkb))
3318 log_error(ls, "purged lkb not released");
3319 }
3320 }
3321}
3322
3323static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3324{
3325 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3326}
3327
3328static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3329{
3330 return is_master_copy(lkb);
3331}
3332
3333static void purge_dead_locks(struct dlm_rsb *r)
3334{
3335 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3336 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3337 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3338}
3339
3340void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3341{
3342 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3343 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3344 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3345}
3346
3347/* Get rid of locks held by nodes that are gone. */
3348
3349int dlm_purge_locks(struct dlm_ls *ls)
3350{
3351 struct dlm_rsb *r;
3352
3353 log_debug(ls, "dlm_purge_locks");
3354
3355 down_write(&ls->ls_root_sem);
3356 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3357 hold_rsb(r);
3358 lock_rsb(r);
3359 if (is_master(r))
3360 purge_dead_locks(r);
3361 unlock_rsb(r);
3362 unhold_rsb(r);
3363
3364 schedule();
3365 }
3366 up_write(&ls->ls_root_sem);
3367
3368 return 0;
3369}
3370
3371static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3372{
3373 struct dlm_rsb *r, *r_ret = NULL;
3374
3375 read_lock(&ls->ls_rsbtbl[bucket].lock);
3376 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3377 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3378 continue;
3379 hold_rsb(r);
3380 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3381 r_ret = r;
3382 break;
3383 }
3384 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3385 return r_ret;
3386}
3387
3388void dlm_grant_after_purge(struct dlm_ls *ls)
3389{
3390 struct dlm_rsb *r;
3391 int bucket = 0;
3392
3393 while (1) {
3394 r = find_purged_rsb(ls, bucket);
3395 if (!r) {
3396 if (bucket == ls->ls_rsbtbl_size - 1)
3397 break;
3398 bucket++;
3399 continue;
3400 }
3401 lock_rsb(r);
3402 if (is_master(r)) {
3403 grant_pending_locks(r);
3404 confirm_master(r, 0);
3405 }
3406 unlock_rsb(r);
3407 put_rsb(r);
3408 schedule();
3409 }
3410}
3411
3412static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3413 uint32_t remid)
3414{
3415 struct dlm_lkb *lkb;
3416
3417 list_for_each_entry(lkb, head, lkb_statequeue) {
3418 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3419 return lkb;
3420 }
3421 return NULL;
3422}
3423
3424static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3425 uint32_t remid)
3426{
3427 struct dlm_lkb *lkb;
3428
3429 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3430 if (lkb)
3431 return lkb;
3432 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3433 if (lkb)
3434 return lkb;
3435 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3436 if (lkb)
3437 return lkb;
3438 return NULL;
3439}
3440
3441static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3442 struct dlm_rsb *r, struct dlm_rcom *rc)
3443{
3444 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3445 int lvblen;
3446
3447 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3448 lkb->lkb_ownpid = rl->rl_ownpid;
3449 lkb->lkb_remid = rl->rl_lkid;
3450 lkb->lkb_exflags = rl->rl_exflags;
3451 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3452 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3453 lkb->lkb_lvbseq = rl->rl_lvbseq;
3454 lkb->lkb_rqmode = rl->rl_rqmode;
3455 lkb->lkb_grmode = rl->rl_grmode;
3456		/* don't set lkb_status because add_lkb wants to set it itself */
3457
3458 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3459 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3460
3461 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3462 lkb->lkb_lvbptr = allocate_lvb(ls);
3463 if (!lkb->lkb_lvbptr)
3464 return -ENOMEM;
3465 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3466 sizeof(struct rcom_lock);
3467 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3468 }
3469
3470 /* Conversions between PR and CW (middle modes) need special handling.
3471 The real granted mode of these converting locks cannot be determined
3472 until all locks have been rebuilt on the rsb (recover_conversion) */
3473
3474 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3475 rl->rl_status = DLM_LKSTS_CONVERT;
3476 lkb->lkb_grmode = DLM_LOCK_IV;
3477 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3478 }
3479
3480 return 0;
3481}
3482
3483/* This lkb may have been recovered in a previous aborted recovery so we need
3484 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3485 If so we just send back a standard reply. If not, we create a new lkb with
3486	   the given values and send back our lkid: we do this by returning the
3487	   rcom_lock struct we received, but with the remid field filled in. */
3488
3489int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3490{
3491 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3492 struct dlm_rsb *r;
3493 struct dlm_lkb *lkb;
3494 int error;
3495
3496 if (rl->rl_parent_lkid) {
3497 error = -EOPNOTSUPP;
3498 goto out;
3499 }
3500
3501 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3502 if (error)
3503 goto out;
3504
3505 lock_rsb(r);
3506
3507 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3508 if (lkb) {
3509 error = -EEXIST;
3510 goto out_remid;
3511 }
3512
3513 error = create_lkb(ls, &lkb);
3514 if (error)
3515 goto out_unlock;
3516
3517 error = receive_rcom_lock_args(ls, lkb, r, rc);
3518 if (error) {
3519 __put_lkb(ls, lkb);
3520 goto out_unlock;
3521 }
3522
3523 attach_lkb(r, lkb);
3524 add_lkb(r, lkb, rl->rl_status);
3525 error = 0;
3526
3527 out_remid:
3528 /* this is the new value returned to the lock holder for
3529 saving in its process-copy lkb */
3530 rl->rl_remid = lkb->lkb_id;
3531
3532 out_unlock:
3533 unlock_rsb(r);
3534 put_rsb(r);
3535 out:
3536 if (error)
3537 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3538 rl->rl_result = error;
3539 return error;
3540}
3541
3542int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3543{
3544 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3545 struct dlm_rsb *r;
3546 struct dlm_lkb *lkb;
3547 int error;
3548
3549 error = find_lkb(ls, rl->rl_lkid, &lkb);
3550 if (error) {
3551 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3552 return error;
3553 }
3554
3555 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3556
3557 error = rl->rl_result;
3558
3559 r = lkb->lkb_resource;
3560 hold_rsb(r);
3561 lock_rsb(r);
3562
3563 switch (error) {
3564 case -EEXIST:
3565 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3566 /* fall through */
3567 case 0:
3568 lkb->lkb_remid = rl->rl_remid;
3569 break;
3570 default:
3571 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3572 error, lkb->lkb_id);
3573 }
3574
3575 /* an ack for dlm_recover_locks() which waits for replies from
3576 all the locks it sends to new masters */
3577 dlm_recovered_lock(r);
3578
3579 unlock_rsb(r);
3580 put_rsb(r);
3581 dlm_put_lkb(lkb);
3582
3583 return 0;
3584}
3585
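/* receive_rcom_lock_args() above is the unpacking half of an exchange
   whose packing half lives in rcom.c.  A hedged sketch of that sender
   side, restricted to fields visible in this file (the helper name is
   illustrative, not the real API): */

static void pack_rcom_lock_sketch(struct dlm_lkb *lkb, struct rcom_lock *rl)
{
	rl->rl_ownpid  = lkb->lkb_ownpid;
	rl->rl_lkid    = lkb->lkb_id;	/* new master echoes it back in rl_remid */
	rl->rl_exflags = lkb->lkb_exflags;
	rl->rl_flags   = lkb->lkb_flags;
	rl->rl_lvbseq  = lkb->lkb_lvbseq;
	rl->rl_rqmode  = lkb->lkb_rqmode;
	rl->rl_grmode  = lkb->lkb_grmode;
	rl->rl_status  = lkb->lkb_status;
	rl->rl_remid   = 0;		/* filled in by dlm_recover_master_copy() */
}
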
3586int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3587 int mode, uint32_t flags, void *name, unsigned int namelen,
3588 uint32_t parent_lkid)
3589{
3590 struct dlm_lkb *lkb;
3591 struct dlm_args args;
3592 int error;
3593
3594 lock_recovery(ls);
3595
3596 error = create_lkb(ls, &lkb);
3597 if (error) {
3598 kfree(ua);
3599 goto out;
3600 }
3601
3602 if (flags & DLM_LKF_VALBLK) {
3603 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3604 if (!ua->lksb.sb_lvbptr) {
3605 kfree(ua);
3606 __put_lkb(ls, lkb);
3607 error = -ENOMEM;
3608 goto out;
3609 }
3610 }
3611
3612 /* After ua is attached to lkb it will be freed by free_lkb().
3613 When DLM_IFL_USER is set, the dlm knows that this is a userspace
3614 lock and that lkb_astparam is the dlm_user_args structure. */
3615
3616 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
3617 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
3618 lkb->lkb_flags |= DLM_IFL_USER;
3619 ua->old_mode = DLM_LOCK_IV;
3620
3621 if (error) {
3622 __put_lkb(ls, lkb);
3623 goto out;
3624 }
3625
3626 error = request_lock(ls, lkb, name, namelen, &args);
3627
3628 switch (error) {
3629 case 0:
3630 break;
3631 case -EINPROGRESS:
3632 error = 0;
3633 break;
3634 case -EAGAIN:
3635 error = 0;
3636 /* fall through */
3637 default:
3638 __put_lkb(ls, lkb);
3639 goto out;
3640 }
3641
3642 /* add this new lkb to the per-process list of locks */
3643 spin_lock(&ua->proc->locks_spin);
3644 kref_get(&lkb->lkb_ref);
3645 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3646 spin_unlock(&ua->proc->locks_spin);
3647 out:
3648 unlock_recovery(ls);
3649 return error;
3650}
3651
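/* A hedged sketch of how the character-device write path in user.c
   might drive the entry point above.  struct dlm_lock_params and its
   field names are assumptions about the userspace ABI, not definitions
   taken from this file; note dlm_user_request() frees ua itself on the
   early error paths: */

static int device_request_sketch(struct dlm_ls *ls, struct dlm_user_proc *proc,
				 struct dlm_lock_params *params)
{
	struct dlm_user_args *ua;

	ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
	if (!ua)
		return -ENOMEM;

	ua->proc = proc;
	ua->user_lksb = params->lksb;		/* user-space lksb pointer */
	ua->castparam = params->castparam;
	ua->castaddr = params->castaddr;
	ua->bastparam = params->bastparam;
	ua->bastaddr = params->bastaddr;

	return dlm_user_request(ls, ua, params->mode, params->flags,
				params->name, params->namelen, 0);
}
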
3652int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3653 int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
3654{
3655 struct dlm_lkb *lkb;
3656 struct dlm_args args;
3657 struct dlm_user_args *ua;
3658 int error;
3659
3660 lock_recovery(ls);
3661
3662 error = find_lkb(ls, lkid, &lkb);
3663 if (error)
3664 goto out;
3665
3666 /* user can change the params on its lock when it converts it, or
3667 add an lvb that didn't exist before */
3668
3669 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3670
3671 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3672 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3673 if (!ua->lksb.sb_lvbptr) {
3674 error = -ENOMEM;
3675 goto out_put;
3676 }
3677 }
3678 if (lvb_in && ua->lksb.sb_lvbptr)
3679 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3680
3681 ua->castparam = ua_tmp->castparam;
3682 ua->castaddr = ua_tmp->castaddr;
3683 ua->bastparam = ua_tmp->bastparam;
3684 ua->bastaddr = ua_tmp->bastaddr;
3685 ua->user_lksb = ua_tmp->user_lksb;
3686 ua->old_mode = lkb->lkb_grmode;
3687
3688 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
3689 ua, DLM_FAKE_USER_AST, &args);
3690 if (error)
3691 goto out_put;
3692
3693 error = convert_lock(ls, lkb, &args);
3694
3695 if (error == -EINPROGRESS || error == -EAGAIN)
3696 error = 0;
3697 out_put:
3698 dlm_put_lkb(lkb);
3699 out:
3700 unlock_recovery(ls);
3701 kfree(ua_tmp);
3702 return error;
3703}
3704
3705int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3706 uint32_t flags, uint32_t lkid, char *lvb_in)
3707{
3708 struct dlm_lkb *lkb;
3709 struct dlm_args args;
3710 struct dlm_user_args *ua;
3711 int error;
3712
3713 lock_recovery(ls);
3714
3715 error = find_lkb(ls, lkid, &lkb);
3716 if (error)
3717 goto out;
3718
3719 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3720
3721 if (lvb_in && ua->lksb.sb_lvbptr)
3722 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3723 ua->castparam = ua_tmp->castparam;
3724 ua->user_lksb = ua_tmp->user_lksb;
3725
3726 error = set_unlock_args(flags, ua, &args);
3727 if (error)
3728 goto out_put;
3729
3730 error = unlock_lock(ls, lkb, &args);
3731
3732 if (error == -DLM_EUNLOCK)
3733 error = 0;
3734 if (error)
3735 goto out_put;
3736
3737 spin_lock(&ua->proc->locks_spin);
3738 list_del_init(&lkb->lkb_ownqueue);
3739 spin_unlock(&ua->proc->locks_spin);
3740
3741 /* this removes the reference for the proc->locks list added by
3742 dlm_user_request */
3743 unhold_lkb(lkb);
3744 out_put:
3745 dlm_put_lkb(lkb);
3746 out:
3747 unlock_recovery(ls);
3748 return error;
3749}
3750
3751int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3752 uint32_t flags, uint32_t lkid)
3753{
3754 struct dlm_lkb *lkb;
3755 struct dlm_args args;
3756 struct dlm_user_args *ua;
3757 int error;
3758
3759 lock_recovery(ls);
3760
3761 error = find_lkb(ls, lkid, &lkb);
3762 if (error)
3763 goto out;
3764
3765 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3766 ua->castparam = ua_tmp->castparam;
3767 ua->user_lksb = ua_tmp->user_lksb;
3768
3769 error = set_unlock_args(flags, ua, &args);
3770 if (error)
3771 goto out_put;
3772
3773 error = cancel_lock(ls, lkb, &args);
3774
3775 if (error == -DLM_ECANCEL)
3776 error = 0;
3777 if (error)
3778 goto out_put;
3779
3780 /* this lkb was removed from the WAITING queue */
3781 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3782 spin_lock(&ua->proc->locks_spin);
3783 list_del_init(&lkb->lkb_ownqueue);
3784 spin_unlock(&ua->proc->locks_spin);
3785 unhold_lkb(lkb);
3786 }
3787 out_put:
3788 dlm_put_lkb(lkb);
3789 out:
3790 unlock_recovery(ls);
3791 return error;
3792}
3793
3794static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3795{
3796 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3797
3798 if (ua->lksb.sb_lvbptr)
3799 kfree(ua->lksb.sb_lvbptr);
3800 kfree(ua);
3801 lkb->lkb_astparam = (long)NULL;
3802
3803	/* TODO: propagate to master if needed */
3804 return 0;
3805}
3806
3807/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
3808 Regardless of what rsb queue the lock is on, it's removed and freed. */
3809
3810static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3811{
3812 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3813 struct dlm_args args;
3814 int error;
3815
3816 /* FIXME: we need to handle the case where the lkb is in limbo
3817 while the rsb is being looked up, currently we assert in
3818 _unlock_lock/is_remote because rsb nodeid is -1. */
3819
3820 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3821
3822 error = unlock_lock(ls, lkb, &args);
3823 if (error == -DLM_EUNLOCK)
3824 error = 0;
3825 return error;
3826}
3827
3828/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3829 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3830 which we clear here. */
3831
3832/* proc CLOSING flag is set so no more device_reads should look at proc->asts
3833 list, and no more device_writes should add lkb's to proc->locks list; so we
3834   shouldn't need to take asts_spin or locks_spin here. This assumes that
3835   device reads/writes/closes are serialized -- FIXME: we may need to serialize
3836   them ourselves. */
3837
3838void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3839{
3840 struct dlm_lkb *lkb, *safe;
3841
3842 lock_recovery(ls);
3843 mutex_lock(&ls->ls_clear_proc_locks);
3844
3845 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3846 if (lkb->lkb_ast_type) {
3847 list_del(&lkb->lkb_astqueue);
3848 unhold_lkb(lkb);
3849 }
3850
3851 list_del_init(&lkb->lkb_ownqueue);
3852
3853 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
3854 lkb->lkb_flags |= DLM_IFL_ORPHAN;
3855 orphan_proc_lock(ls, lkb);
3856 } else {
3857 lkb->lkb_flags |= DLM_IFL_DEAD;
3858 unlock_proc_lock(ls, lkb);
3859 }
3860
3861 /* this removes the reference for the proc->locks list
3862 added by dlm_user_request, it may result in the lkb
3863 being freed */
3864
3865 dlm_put_lkb(lkb);
3866 }
3867 mutex_unlock(&ls->ls_clear_proc_locks);
3868 unlock_recovery(ls);
3869}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb(struct dlm_rsb *r);
18void dlm_print_lkb(struct dlm_lkb *lkb);
19int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
20int dlm_modes_compat(int mode1, int mode2);
21int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
22 unsigned int flags, struct dlm_rsb **r_ret);
23void dlm_put_rsb(struct dlm_rsb *r);
24void dlm_hold_rsb(struct dlm_rsb *r);
25int dlm_put_lkb(struct dlm_lkb *lkb);
26void dlm_scan_rsbs(struct dlm_ls *ls);
27
28int dlm_purge_locks(struct dlm_ls *ls);
29void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
30void dlm_grant_after_purge(struct dlm_ls *ls);
31int dlm_recover_waiters_post(struct dlm_ls *ls);
32void dlm_recover_waiters_pre(struct dlm_ls *ls);
33int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
35
36int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
37 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
38int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
39 int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
40int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid);
44void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
45
46static inline int is_master(struct dlm_rsb *r)
47{
48 return !r->res_nodeid;
49}
50
51static inline void lock_rsb(struct dlm_rsb *r)
52{
53 mutex_lock(&r->res_mutex);
54}
55
56static inline void unlock_rsb(struct dlm_rsb *r)
57{
58 mutex_unlock(&r->res_mutex);
59}
60
61#endif
62
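/* A minimal sketch of the usage bracket these helpers support,
   mirroring the hold/lock ... unlock/put pattern used throughout
   lock.c (all names are declared above): */

static void inspect_rsb_example(struct dlm_rsb *r)
{
	dlm_hold_rsb(r);	/* take a reference so r can't be freed */
	lock_rsb(r);		/* serialize access through res_mutex */

	if (is_master(r))	/* res_nodeid == 0: we master this resource */
		dlm_dump_rsb(r);

	unlock_rsb(r);
	dlm_put_rsb(r);		/* drop the reference */
}
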
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..ff83f80e43eb
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct * scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return sprintf(buf, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return sprintf(buf, "%x\n", status);
82}
83
84static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
85{
86 return sprintf(buf, "%d\n", ls->ls_recover_nodeid);
87}
88
89struct dlm_attr {
90 struct attribute attr;
91 ssize_t (*show)(struct dlm_ls *, char *);
92 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
93};
94
95static struct dlm_attr dlm_attr_control = {
96 .attr = {.name = "control", .mode = S_IWUSR},
97 .store = dlm_control_store
98};
99
100static struct dlm_attr dlm_attr_event = {
101 .attr = {.name = "event_done", .mode = S_IWUSR},
102 .store = dlm_event_store
103};
104
105static struct dlm_attr dlm_attr_id = {
106 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
107 .show = dlm_id_show,
108 .store = dlm_id_store
109};
110
111static struct dlm_attr dlm_attr_recover_status = {
112 .attr = {.name = "recover_status", .mode = S_IRUGO},
113 .show = dlm_recover_status_show
114};
115
116static struct dlm_attr dlm_attr_recover_nodeid = {
117 .attr = {.name = "recover_nodeid", .mode = S_IRUGO},
118 .show = dlm_recover_nodeid_show
119};
120
121static struct attribute *dlm_attrs[] = {
122 &dlm_attr_control.attr,
123 &dlm_attr_event.attr,
124 &dlm_attr_id.attr,
125 &dlm_attr_recover_status.attr,
126 &dlm_attr_recover_nodeid.attr,
127 NULL,
128};
129
130static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
131 char *buf)
132{
133 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
134 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
135 return a->show ? a->show(ls, buf) : 0;
136}
137
138static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
139 const char *buf, size_t len)
140{
141 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
142 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
143 return a->store ? a->store(ls, buf, len) : len;
144}
145
146static struct sysfs_ops dlm_attr_ops = {
147 .show = dlm_attr_show,
148 .store = dlm_attr_store,
149};
150
151static struct kobj_type dlm_ktype = {
152 .default_attrs = dlm_attrs,
153 .sysfs_ops = &dlm_attr_ops,
154};
155
156static struct kset dlm_kset = {
157 .subsys = &kernel_subsys,
158 .kobj = {.name = "dlm",},
159 .ktype = &dlm_ktype,
160};
161
162static int kobject_setup(struct dlm_ls *ls)
163{
164 char lsname[DLM_LOCKSPACE_LEN];
165 int error;
166
167 memset(lsname, 0, DLM_LOCKSPACE_LEN);
168 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
169
170 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
171 if (error)
172 return error;
173
174 ls->ls_kobj.kset = &dlm_kset;
175 ls->ls_kobj.ktype = &dlm_ktype;
176 return 0;
177}
178
179static int do_uevent(struct dlm_ls *ls, int in)
180{
181 int error;
182
183 if (in)
184 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
185 else
186 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
187
188 error = wait_event_interruptible(ls->ls_uevent_wait,
189 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
190 if (error)
191 goto out;
192
193 error = ls->ls_uevent_result;
194 out:
195 return error;
196}
197
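/* A hedged sketch of the userspace half of this handshake: once the
   cluster manager has processed the online/offline uevent, it writes a
   result to the lockspace's event_done attribute, which lands in
   dlm_event_store() above and wakes do_uevent().  The sysfs path is
   inferred from the "dlm" kset registered under kernel_subsys below: */

#include <stdio.h>

static int ack_dlm_uevent(const char *lsname, int result)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/dlm/%s/event_done", lsname);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", result);	/* 0 reports success to do_uevent() */
	fclose(f);
	return 0;
}
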
198
199int dlm_lockspace_init(void)
200{
201 int error;
202
203 ls_count = 0;
204 mutex_init(&ls_lock);
205 INIT_LIST_HEAD(&lslist);
206 spin_lock_init(&lslist_lock);
207
208 error = kset_register(&dlm_kset);
209 if (error)
210 printk("dlm_lockspace_init: cannot register kset %d\n", error);
211 return error;
212}
213
214void dlm_lockspace_exit(void)
215{
216 kset_unregister(&dlm_kset);
217}
218
219static int dlm_scand(void *data)
220{
221 struct dlm_ls *ls;
222
223 while (!kthread_should_stop()) {
224 list_for_each_entry(ls, &lslist, ls_list)
225 dlm_scan_rsbs(ls);
226 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
227 }
228 return 0;
229}
230
231static int dlm_scand_start(void)
232{
233 struct task_struct *p;
234 int error = 0;
235
236 p = kthread_run(dlm_scand, NULL, "dlm_scand");
237 if (IS_ERR(p))
238 error = PTR_ERR(p);
239 else
240 scand_task = p;
241 return error;
242}
243
244static void dlm_scand_stop(void)
245{
246 kthread_stop(scand_task);
247}
248
249static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
250{
251 struct dlm_ls *ls;
252
253 spin_lock(&lslist_lock);
254
255 list_for_each_entry(ls, &lslist, ls_list) {
256 if (ls->ls_namelen == namelen &&
257 memcmp(ls->ls_name, name, namelen) == 0)
258 goto out;
259 }
260 ls = NULL;
261 out:
262 spin_unlock(&lslist_lock);
263 return ls;
264}
265
266struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
267{
268 struct dlm_ls *ls;
269
270 spin_lock(&lslist_lock);
271
272 list_for_each_entry(ls, &lslist, ls_list) {
273 if (ls->ls_global_id == id) {
274 ls->ls_count++;
275 goto out;
276 }
277 }
278 ls = NULL;
279 out:
280 spin_unlock(&lslist_lock);
281 return ls;
282}
283
284struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
285{
286 struct dlm_ls *ls;
287
288 spin_lock(&lslist_lock);
289 list_for_each_entry(ls, &lslist, ls_list) {
290 if (ls->ls_local_handle == lockspace) {
291 ls->ls_count++;
292 goto out;
293 }
294 }
295 ls = NULL;
296 out:
297 spin_unlock(&lslist_lock);
298 return ls;
299}
300
301struct dlm_ls *dlm_find_lockspace_device(int minor)
302{
303 struct dlm_ls *ls;
304
305 spin_lock(&lslist_lock);
306 list_for_each_entry(ls, &lslist, ls_list) {
307 if (ls->ls_device.minor == minor) {
308 ls->ls_count++;
309 goto out;
310 }
311 }
312 ls = NULL;
313 out:
314 spin_unlock(&lslist_lock);
315 return ls;
316}
317
318void dlm_put_lockspace(struct dlm_ls *ls)
319{
320 spin_lock(&lslist_lock);
321 ls->ls_count--;
322 spin_unlock(&lslist_lock);
323}
324
325static void remove_lockspace(struct dlm_ls *ls)
326{
327 for (;;) {
328 spin_lock(&lslist_lock);
329 if (ls->ls_count == 0) {
330 list_del(&ls->ls_list);
331 spin_unlock(&lslist_lock);
332 return;
333 }
334 spin_unlock(&lslist_lock);
335 ssleep(1);
336 }
337}
338
339static int threads_start(void)
340{
341 int error;
342
343	/* Thread which processes lock requests for all lockspaces */
344 error = dlm_astd_start();
345 if (error) {
346 log_print("cannot start dlm_astd thread %d", error);
347 goto fail;
348 }
349
350 error = dlm_scand_start();
351 if (error) {
352 log_print("cannot start dlm_scand thread %d", error);
353 goto astd_fail;
354 }
355
356	/* Thread for sending/receiving messages for all lockspaces */
357 error = dlm_lowcomms_start();
358 if (error) {
359 log_print("cannot start dlm lowcomms %d", error);
360 goto scand_fail;
361 }
362
363 return 0;
364
365 scand_fail:
366 dlm_scand_stop();
367 astd_fail:
368 dlm_astd_stop();
369 fail:
370 return error;
371}
372
373static void threads_stop(void)
374{
375 dlm_scand_stop();
376 dlm_lowcomms_stop();
377 dlm_astd_stop();
378}
379
380static int new_lockspace(char *name, int namelen, void **lockspace,
381 uint32_t flags, int lvblen)
382{
383 struct dlm_ls *ls;
384 int i, size, error = -ENOMEM;
385
386 if (namelen > DLM_LOCKSPACE_LEN)
387 return -EINVAL;
388
389 if (!lvblen || (lvblen % 8))
390 return -EINVAL;
391
392 if (!try_module_get(THIS_MODULE))
393 return -EINVAL;
394
395 ls = dlm_find_lockspace_name(name, namelen);
396 if (ls) {
397 *lockspace = ls;
398 module_put(THIS_MODULE);
399 return -EEXIST;
400 }
401
402 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
403 if (!ls)
404 goto out;
405 memcpy(ls->ls_name, name, namelen);
406 ls->ls_namelen = namelen;
407 ls->ls_exflags = flags;
408 ls->ls_lvblen = lvblen;
409 ls->ls_count = 0;
410 ls->ls_flags = 0;
411
412 size = dlm_config.rsbtbl_size;
413 ls->ls_rsbtbl_size = size;
414
415 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
416 if (!ls->ls_rsbtbl)
417 goto out_lsfree;
418 for (i = 0; i < size; i++) {
419 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
420 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
421 rwlock_init(&ls->ls_rsbtbl[i].lock);
422 }
423
424 size = dlm_config.lkbtbl_size;
425 ls->ls_lkbtbl_size = size;
426
427 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
428 if (!ls->ls_lkbtbl)
429 goto out_rsbfree;
430 for (i = 0; i < size; i++) {
431 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
432 rwlock_init(&ls->ls_lkbtbl[i].lock);
433 ls->ls_lkbtbl[i].counter = 1;
434 }
435
436 size = dlm_config.dirtbl_size;
437 ls->ls_dirtbl_size = size;
438
439 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
440 if (!ls->ls_dirtbl)
441 goto out_lkbfree;
442 for (i = 0; i < size; i++) {
443 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
444 rwlock_init(&ls->ls_dirtbl[i].lock);
445 }
446
447 INIT_LIST_HEAD(&ls->ls_waiters);
448 mutex_init(&ls->ls_waiters_mutex);
449
450 INIT_LIST_HEAD(&ls->ls_nodes);
451 INIT_LIST_HEAD(&ls->ls_nodes_gone);
452 ls->ls_num_nodes = 0;
453 ls->ls_low_nodeid = 0;
454 ls->ls_total_weight = 0;
455 ls->ls_node_array = NULL;
456
457 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
458 ls->ls_stub_rsb.res_ls = ls;
459
460 ls->ls_debug_rsb_dentry = NULL;
461 ls->ls_debug_waiters_dentry = NULL;
462
463 init_waitqueue_head(&ls->ls_uevent_wait);
464 ls->ls_uevent_result = 0;
465
466 ls->ls_recoverd_task = NULL;
467 mutex_init(&ls->ls_recoverd_active);
468 spin_lock_init(&ls->ls_recover_lock);
469 ls->ls_recover_status = 0;
470 ls->ls_recover_seq = 0;
471 ls->ls_recover_args = NULL;
472 init_rwsem(&ls->ls_in_recovery);
473 INIT_LIST_HEAD(&ls->ls_requestqueue);
474 mutex_init(&ls->ls_requestqueue_mutex);
475 mutex_init(&ls->ls_clear_proc_locks);
476
477 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
478 if (!ls->ls_recover_buf)
479 goto out_dirfree;
480
481 INIT_LIST_HEAD(&ls->ls_recover_list);
482 spin_lock_init(&ls->ls_recover_list_lock);
483 ls->ls_recover_list_count = 0;
484 ls->ls_local_handle = ls;
485 init_waitqueue_head(&ls->ls_wait_general);
486 INIT_LIST_HEAD(&ls->ls_root_list);
487 init_rwsem(&ls->ls_root_sem);
488
489 down_write(&ls->ls_in_recovery);
490
491 spin_lock(&lslist_lock);
492 list_add(&ls->ls_list, &lslist);
493 spin_unlock(&lslist_lock);
494
495 /* needs to find ls in lslist */
496 error = dlm_recoverd_start(ls);
497 if (error) {
498 log_error(ls, "can't start dlm_recoverd %d", error);
499 goto out_rcomfree;
500 }
501
502 dlm_create_debug_file(ls);
503
504 error = kobject_setup(ls);
505 if (error)
506 goto out_del;
507
508 error = kobject_register(&ls->ls_kobj);
509 if (error)
510 goto out_del;
511
512 error = do_uevent(ls, 1);
513 if (error)
514 goto out_unreg;
515
516 *lockspace = ls;
517 return 0;
518
519 out_unreg:
520 kobject_unregister(&ls->ls_kobj);
521 out_del:
522 dlm_delete_debug_file(ls);
523 dlm_recoverd_stop(ls);
524 out_rcomfree:
525 spin_lock(&lslist_lock);
526 list_del(&ls->ls_list);
527 spin_unlock(&lslist_lock);
528 kfree(ls->ls_recover_buf);
529 out_dirfree:
530 kfree(ls->ls_dirtbl);
531 out_lkbfree:
532 kfree(ls->ls_lkbtbl);
533 out_rsbfree:
534 kfree(ls->ls_rsbtbl);
535 out_lsfree:
536 kfree(ls);
537 out:
538 module_put(THIS_MODULE);
539 return error;
540}
541
542int dlm_new_lockspace(char *name, int namelen, void **lockspace,
543 uint32_t flags, int lvblen)
544{
545 int error = 0;
546
547 mutex_lock(&ls_lock);
548 if (!ls_count)
549 error = threads_start();
550 if (error)
551 goto out;
552
553 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
554 if (!error)
555 ls_count++;
556 out:
557 mutex_unlock(&ls_lock);
558 return error;
559}
560
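/* A hedged usage sketch: lvblen must be a nonzero multiple of 8 (see
   new_lockspace() above), and -EEXIST still hands back the existing
   lockspace pointer: */

static void *join_example_lockspace(void)
{
	void *ls = NULL;
	int error;

	error = dlm_new_lockspace("example", 7, &ls, 0, 32);
	if (error && error != -EEXIST)
		return NULL;
	return ls;	/* valid on both 0 and -EEXIST */
}
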
561/* Return 1 if the lockspace still has active remote locks,
562 * 2 if the lockspace still has active local locks,
563 * and 0 if it has none. */
564static int lockspace_busy(struct dlm_ls *ls)
565{
566 int i, lkb_found = 0;
567 struct dlm_lkb *lkb;
568
569 /* NOTE: We check the lockidtbl here rather than the resource table.
570 This is because there may be LKBs queued as ASTs that have been
571 unlinked from their RSBs and are pending deletion once the AST has
572 been delivered */
573
574 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
575 read_lock(&ls->ls_lkbtbl[i].lock);
576 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
577 lkb_found = 1;
578 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
579 lkb_idtbl_list) {
580 if (!lkb->lkb_nodeid) {
581 read_unlock(&ls->ls_lkbtbl[i].lock);
582 return 2;
583 }
584 }
585 }
586 read_unlock(&ls->ls_lkbtbl[i].lock);
587 }
588 return lkb_found;
589}
590
591static int release_lockspace(struct dlm_ls *ls, int force)
592{
593 struct dlm_lkb *lkb;
594 struct dlm_rsb *rsb;
595 struct list_head *head;
596 int i;
597 int busy = lockspace_busy(ls);
598
599 if (busy > force)
600 return -EBUSY;
601
602 if (force < 3)
603 do_uevent(ls, 0);
604
605 dlm_recoverd_stop(ls);
606
607 remove_lockspace(ls);
608
609 dlm_delete_debug_file(ls);
610
611 dlm_astd_suspend();
612
613 kfree(ls->ls_recover_buf);
614
615 /*
616 * Free direntry structs.
617 */
618
619 dlm_dir_clear(ls);
620 kfree(ls->ls_dirtbl);
621
622 /*
623 * Free all lkb's on lkbtbl[] lists.
624 */
625
626 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
627 head = &ls->ls_lkbtbl[i].list;
628 while (!list_empty(head)) {
629 lkb = list_entry(head->next, struct dlm_lkb,
630 lkb_idtbl_list);
631
632 list_del(&lkb->lkb_idtbl_list);
633
634 dlm_del_ast(lkb);
635
636 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
637 free_lvb(lkb->lkb_lvbptr);
638
639 free_lkb(lkb);
640 }
641 }
642 dlm_astd_resume();
643
644 kfree(ls->ls_lkbtbl);
645
646 /*
647 * Free all rsb's on rsbtbl[] lists
648 */
649
650 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
651 head = &ls->ls_rsbtbl[i].list;
652 while (!list_empty(head)) {
653 rsb = list_entry(head->next, struct dlm_rsb,
654 res_hashchain);
655
656 list_del(&rsb->res_hashchain);
657 free_rsb(rsb);
658 }
659
660 head = &ls->ls_rsbtbl[i].toss;
661 while (!list_empty(head)) {
662 rsb = list_entry(head->next, struct dlm_rsb,
663 res_hashchain);
664 list_del(&rsb->res_hashchain);
665 free_rsb(rsb);
666 }
667 }
668
669 kfree(ls->ls_rsbtbl);
670
671 /*
672 * Free structures on any other lists
673 */
674
675 kfree(ls->ls_recover_args);
676 dlm_clear_free_entries(ls);
677 dlm_clear_members(ls);
678 dlm_clear_members_gone(ls);
679 kfree(ls->ls_node_array);
680 kobject_unregister(&ls->ls_kobj);
681 kfree(ls);
682
683 mutex_lock(&ls_lock);
684 ls_count--;
685 if (!ls_count)
686 threads_stop();
687 mutex_unlock(&ls_lock);
688
689 module_put(THIS_MODULE);
690 return 0;
691}
692
693/*
694 * Called when a system has released all its locks and is not going to use the
695 * lockspace any longer. We free everything we're managing for this lockspace.
696 * Remaining nodes will go through the recovery process as if we'd died. The
697 * lockspace must continue to function as usual, participating in recoveries,
698 * until this returns.
699 *
700 * Force has 4 possible values:
701 * 0 - don't destroy lockspace if it has any LKBs
702 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
703 * 2 - destroy lockspace regardless of LKBs
704 * 3 - destroy lockspace as part of a forced shutdown
705 */
706
707int dlm_release_lockspace(void *lockspace, int force)
708{
709 struct dlm_ls *ls;
710
711 ls = dlm_find_lockspace_local(lockspace);
712 if (!ls)
713 return -EINVAL;
714 dlm_put_lockspace(ls);
715 return release_lockspace(ls, force);
716}
717
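/* A hedged teardown sketch using the force levels documented above:
   try a polite release first, then discard any remaining LKBs: */

static int leave_example_lockspace(void *ls)
{
	int error;

	error = dlm_release_lockspace(ls, 0);	/* -EBUSY if any LKBs remain */
	if (error == -EBUSY)
		error = dlm_release_lockspace(ls, 2);	/* discard them */
	return error;
}
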
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls);
23
24#endif /* __LOCKSPACE_DOT_H__ */
25
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..23f5ce12080b
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1238 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP addresses or
27 * whatever it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
130
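/* A short worked trace of the macros above, assuming a 4096-byte ring
   (the receive path below sizes it to PAGE_CACHE_SIZE): */

static void cbuf_example(void)
{
	struct cbuf cb;

	CBUF_INIT(&cb, 4096);	/* base = len = 0, mask = 4095 */
	CBUF_ADD(&cb, 100);	/* 100 bytes arrived; len = 100 */
	/* CBUF_DATA(&cb) == 100: offset where the next byte lands */
	CBUF_EAT(&cb, 60);	/* 60 bytes consumed; base = 60, len = 40 */
	/* CBUF_EMPTY(&cb) stays false until len drops back to 0 */
}
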
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!dlm_local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (dlm_local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255   Also pad out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!dlm_local_count)
264 return;
265
266 if (!port) {
267 if (dlm_local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = dlm_local_addr[0]->ss_family;
277 if (dlm_local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383			/* This seems to happen when we receive a connection
384 * too early... or something... anyway, it happens but
385 * we always seem to get a real message too, see
386 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461 /* We don't know which INIT failed, so clear the PENDING flags
462	 * on them all. If assoc_id is zero it will then try
463 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len,
552 MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
610
611/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 dlm_local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 dlm_local_addr[dlm_local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!dlm_local_count) {
663 init_local();
664 if (!dlm_local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
703 for (i = 0; i < dlm_local_count; i++) {
704 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
828
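/* A hedged sketch of the intended pairing of the two functions above
   (midcomms is the real caller; zeroing the buffer stands in for
   building an actual message): */

static int send_message_sketch(int nodeid, int len)
{
	char *buf;
	void *handle;

	handle = dlm_lowcomms_get_buffer(nodeid, len, GFP_KERNEL, &buf);
	if (!handle)
		return -ENOBUFS;

	memset(buf, 0, len);	/* build the real message here */

	/* queues the node on write_nodes and wakes the send thread */
	dlm_lowcomms_commit_buffer(handle);
	return 0;
}
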
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 len = e->len;
938 offset = e->offset;
939 BUG_ON(len == 0 && e->users == 0);
940 spin_unlock(&ni->writequeue_lock);
941 kmap(e->page);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066int dlm_lowcomms_close(int nodeid)
1067{
1068 struct nodeinfo *ni;
1069
1070 ni = nodeid2nodeinfo(nodeid, 0);
1071 if (!ni)
1072 return -1;
1073
1074 spin_lock(&ni->lock);
1075 if (ni->assoc_id) {
1076 ni->assoc_id = 0;
1077 /* Don't send shutdown here, sctp will just queue it
1078 till the node comes back up! */
1079 }
1080 spin_unlock(&ni->lock);
1081
1082 clean_one_writequeue(ni);
1083 clear_bit(NI_INIT_PENDING, &ni->flags);
1084 return 0;
1085}
1086
1087static int write_list_empty(void)
1088{
1089 int status;
1090
1091 spin_lock_bh(&write_nodes_lock);
1092 status = list_empty(&write_nodes);
1093 spin_unlock_bh(&write_nodes_lock);
1094
1095 return status;
1096}
1097
1098static int dlm_recvd(void *data)
1099{
1100 DECLARE_WAITQUEUE(wait, current);
1101
1102 while (!kthread_should_stop()) {
1103 int count = 0;
1104
1105 set_current_state(TASK_INTERRUPTIBLE);
1106 add_wait_queue(&lowcomms_recv_wait, &wait);
1107 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1108 schedule();
1109 remove_wait_queue(&lowcomms_recv_wait, &wait);
1110 set_current_state(TASK_RUNNING);
1111
1112 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1113 int ret;
1114
1115 do {
1116 ret = receive_from_sock();
1117
1118 /* Don't starve out everyone else */
1119 if (++count >= MAX_RX_MSG_COUNT) {
1120 schedule();
1121 count = 0;
1122 }
1123			} while (!kthread_should_stop() && ret >= 0);
1124 }
1125 schedule();
1126 }
1127
1128 return 0;
1129}
1130
1131static int dlm_sendd(void *data)
1132{
1133 DECLARE_WAITQUEUE(wait, current);
1134
1135 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1136
1137 while (!kthread_should_stop()) {
1138 set_current_state(TASK_INTERRUPTIBLE);
1139 if (write_list_empty())
1140 schedule();
1141 set_current_state(TASK_RUNNING);
1142
1143 if (sctp_con.eagain_flag) {
1144 sctp_con.eagain_flag = 0;
1145 refill_write_queue();
1146 }
1147 process_output_queue();
1148 }
1149
1150 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1151
1152 return 0;
1153}
1154
1155static void daemons_stop(void)
1156{
1157 kthread_stop(recv_task);
1158 kthread_stop(send_task);
1159}
1160
1161static int daemons_start(void)
1162{
1163 struct task_struct *p;
1164 int error;
1165
1166 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1167 error = IS_ERR(p);
1168 if (error) {
1169 log_print("can't start dlm_recvd %d", error);
1170 return error;
1171 }
1172 recv_task = p;
1173
1174 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1175 error = IS_ERR(p);
1176 if (error) {
1177 log_print("can't start dlm_sendd %d", error);
1178 kthread_stop(recv_task);
1179 return error;
1180 }
1181 send_task = p;
1182
1183 return 0;
1184}
1185
1186/*
1187 * This is quite likely to sleep...
1188 */
1189int dlm_lowcomms_start(void)
1190{
1191 int error;
1192
1193 error = init_sock();
1194 if (error)
1195 goto fail_sock;
1196 error = daemons_start();
1197 if (error)
1198 goto fail_sock;
1199 atomic_set(&accepting, 1);
1200 return 0;
1201
1202 fail_sock:
1203 close_connection();
1204 return error;
1205}
1206
1207/* Set all the activity flags to prevent any socket activity. */
1208
1209void dlm_lowcomms_stop(void)
1210{
1211 atomic_set(&accepting, 0);
1212 sctp_con.flags = 0x7;
1213 daemons_stop();
1214 clean_writequeues();
1215 close_connection();
1216 dealloc_nodeinfo();
1217 max_nodeid = 0;
1218}
1219
1220int dlm_lowcomms_init(void)
1221{
1222 init_waitqueue_head(&lowcomms_recv_wait);
1223 spin_lock_init(&write_nodes_lock);
1224 INIT_LIST_HEAD(&write_nodes);
1225 init_rwsem(&nodeinfo_lock);
1226 return 0;
1227}
1228
1229void dlm_lowcomms_exit(void)
1230{
1231 int i;
1232
1233 for (i = 0; i < dlm_local_count; i++)
1234 kfree(dlm_local_addr[i]);
1235 dlm_local_count = 0;
1236 dlm_local_nodeid = 0;
1237}
1238
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
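A note on the buffer pair above: dlm_lowcomms_get_buffer() reserves space inside a pending writequeue page and dlm_lowcomms_commit_buffer() hands the filled message back to be picked up by dlm_sendd. A minimal sketch of the calling pattern, modelled on create_rcom()/send_rcom() in rcom.c later in this patch; payload_len and the header fill are illustrative, not part of this API:

	struct dlm_mhandle *mh;
	char *mb;
	int mb_len = sizeof(struct dlm_header) + payload_len;

	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;
	memset(mb, 0, mb_len);
	/* fill in a struct dlm_header plus payload at mb */
	dlm_lowcomms_commit_buffer(mh);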
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "user.h"
18#include "memory.h"
19#include "lowcomms.h"
20#include "config.h"
21
22#ifdef CONFIG_DLM_DEBUG
23int dlm_register_debugfs(void);
24void dlm_unregister_debugfs(void);
25#else
26static inline int dlm_register_debugfs(void) { return 0; }
27static inline void dlm_unregister_debugfs(void) { }
28#endif
29
30static int __init init_dlm(void)
31{
32 int error;
33
34 error = dlm_memory_init();
35 if (error)
36 goto out;
37
38 error = dlm_lockspace_init();
39 if (error)
40 goto out_mem;
41
42 error = dlm_config_init();
43 if (error)
44 goto out_lockspace;
45
46 error = dlm_register_debugfs();
47 if (error)
48 goto out_config;
49
50 error = dlm_lowcomms_init();
51 if (error)
52 goto out_debug;
53
54 error = dlm_user_init();
55 if (error)
56 goto out_lowcomms;
57
58 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
59
60 return 0;
61
62 out_lowcomms:
63 dlm_lowcomms_exit();
64 out_debug:
65 dlm_unregister_debugfs();
66 out_config:
67 dlm_config_exit();
68 out_lockspace:
69 dlm_lockspace_exit();
70 out_mem:
71 dlm_memory_exit();
72 out:
73 return error;
74}
75
76static void __exit exit_dlm(void)
77{
78 dlm_user_exit();
79 dlm_lowcomms_exit();
80 dlm_config_exit();
81 dlm_memory_exit();
82 dlm_lockspace_exit();
83 dlm_unregister_debugfs();
84}
85
86module_init(init_dlm);
87module_exit(exit_dlm);
88
89MODULE_DESCRIPTION("Distributed Lock Manager");
90MODULE_AUTHOR("Red Hat, Inc.");
91MODULE_LICENSE("GPL");
92
93EXPORT_SYMBOL_GPL(dlm_new_lockspace);
94EXPORT_SYMBOL_GPL(dlm_release_lockspace);
95EXPORT_SYMBOL_GPL(dlm_lock);
96EXPORT_SYMBOL_GPL(dlm_unlock);
97
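The four exports above are the whole in-kernel API of the module; GFS2's lock_dlm module is one consumer. A minimal sketch of a caller, assuming the signatures this patch exports, dlm_new_lockspace(name, namelen, &lockspace, flags, lvblen) and dlm_release_lockspace(lockspace, force); the lockspace name and lvb length here are illustrative:

	dlm_lockspace_t *ls;
	int error;

	error = dlm_new_lockspace("example", strlen("example"), &ls, 0, 32);
	if (error)
		return error;
	/* ... dlm_lock()/dlm_unlock() against ls ... */
	dlm_release_lockspace(ls, 0);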
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
48
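/* On the FIXME above: list_add_tail(new, pos) inserts "new" immediately
 * before "pos", and when the loop falls through without breaking, tmp has
 * wrapped back to the list head, where inserting before the head is an
 * append.  So the open-coded else branch (and the !memb special case)
 * could collapse to a single line:
 *
 *	list_add_tail(newlist, tmp);
 */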
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
54	w = dlm_node_weight(ls->ls_name, nodeid);
55	if (w < 0)
56		return w;
57
58	memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
59	if (!memb)
60		return -ENOMEM;
61
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
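/* A worked example of the array built above, with illustrative weights:
 * members {1:w=1, 2:w=2, 3:w=0} give total = 3, node 3 contributes no
 * slots, and ls_node_array = [1, 2, 2], so a uniform hash over the array
 * picks node 2 twice as often as node 1.  If every weight were 0, the
 * all_zero fallback makes each node count once instead: [1, 2, 3]. */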
162
163/* send a status request to all members just to establish comms connections */
164
165static int ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 int error = 0;
169
170 list_for_each_entry(memb, &ls->ls_nodes, list) {
171 error = dlm_recovery_stopped(ls);
172 if (error)
173 break;
174 error = dlm_rcom_status(ls, memb->nodeid);
175 if (error)
176 break;
177 }
178 if (error)
179 log_debug(ls, "ping_members aborted %d last nodeid %d",
180 error, ls->ls_recover_nodeid);
181 return error;
182}
183
184int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
185{
186 struct dlm_member *memb, *safe;
187 int i, error, found, pos = 0, neg = 0, low = -1;
188
189 /* move departed members from ls_nodes to ls_nodes_gone */
190
191 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
192 found = 0;
193 for (i = 0; i < rv->node_count; i++) {
194 if (memb->nodeid == rv->nodeids[i]) {
195 found = 1;
196 break;
197 }
198 }
199
200 if (!found) {
201 neg++;
202 dlm_remove_member(ls, memb);
203 log_debug(ls, "remove member %d", memb->nodeid);
204 }
205 }
206
207 /* add new members to ls_nodes */
208
209 for (i = 0; i < rv->node_count; i++) {
210 if (dlm_is_member(ls, rv->nodeids[i]))
211 continue;
212 dlm_add_member(ls, rv->nodeids[i]);
213 pos++;
214 log_debug(ls, "add member %d", rv->nodeids[i]);
215 }
216
217 list_for_each_entry(memb, &ls->ls_nodes, list) {
218 if (low == -1 || memb->nodeid < low)
219 low = memb->nodeid;
220 }
221 ls->ls_low_nodeid = low;
222
223 make_member_array(ls);
224 dlm_set_recover_status(ls, DLM_RS_NODES);
225 *neg_out = neg;
226
227 error = ping_members(ls);
228 if (error)
229 goto out;
230
231 error = dlm_recover_members_wait(ls);
232 out:
233 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
234 return error;
235}
236
237/*
238 * Following called from lockspace.c
239 */
240
241int dlm_ls_stop(struct dlm_ls *ls)
242{
243 int new;
244
245 /*
246 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
247 * dlm_recovery_stopped()) and prevents any new locks from being
248 * processed (see RUNNING, dlm_locking_stopped()).
249 */
250
251 spin_lock(&ls->ls_recover_lock);
252 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
253 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
254 ls->ls_recover_seq++;
255 spin_unlock(&ls->ls_recover_lock);
256
257 /*
258 * This in_recovery lock does two things:
259 *
260 * 1) Keeps this function from returning until all threads are out
261	 * of locking routines and locking is truly stopped.
262 * 2) Keeps any new requests from being processed until it's unlocked
263 * when recovery is complete.
264 */
265
266 if (new)
267 down_write(&ls->ls_in_recovery);
268
269 /*
270 * The recoverd suspend/resume makes sure that dlm_recoverd (if
271 * running) has noticed the clearing of RUNNING above and quit
272 * processing the previous recovery. This will be true for all nodes
273 * before any nodes start the new recovery.
274 */
275
276 dlm_recoverd_suspend(ls);
277 ls->ls_recover_status = 0;
278 dlm_recoverd_resume(ls);
279 return 0;
280}
281
282int dlm_ls_start(struct dlm_ls *ls)
283{
284 struct dlm_recover *rv = NULL, *rv_old;
285 int *ids = NULL;
286 int error, count;
287
288 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
289 if (!rv)
290 return -ENOMEM;
291
292 error = count = dlm_nodeid_list(ls->ls_name, &ids);
293 if (error <= 0)
294 goto fail;
295
296 spin_lock(&ls->ls_recover_lock);
297
298 /* the lockspace needs to be stopped before it can be started */
299
300 if (!dlm_locking_stopped(ls)) {
301 spin_unlock(&ls->ls_recover_lock);
302 log_error(ls, "start ignored: lockspace running");
303 error = -EINVAL;
304 goto fail;
305 }
306
307 rv->nodeids = ids;
308 rv->node_count = count;
309 rv->seq = ++ls->ls_recover_seq;
310 rv_old = ls->ls_recover_args;
311 ls->ls_recover_args = rv;
312 spin_unlock(&ls->ls_recover_lock);
313
314 if (rv_old) {
315 kfree(rv_old->nodeids);
316 kfree(rv_old);
317 }
318
319 dlm_recoverd_kick(ls);
320 return 0;
321
322 fail:
323 kfree(rv);
324 kfree(ids);
325 return error;
326}
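/* The expected driving sequence, through the hooks in lockspace.c noted
 * above, is strictly stop-then-start on every membership change:
 *
 *	dlm_ls_stop(ls);     suspend locking, bump ls_recover_seq
 *	(the cluster manager updates the node list)
 *	dlm_ls_start(ls);    snapshot members, kick dlm_recoverd
 *
 * A start without an intervening stop is rejected above with -EINVAL. */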
327
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
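/* One possible shape for the FIXME above, sketched with hypothetical
 * names (this is not the current dlm_rsb layout): keep a small inline
 * buffer and fall back to kmalloc only for long names, as dentries do.
 *
 *	#define DLM_RSB_INLINE_NAME 32
 *	struct dlm_rsb_name {
 *		char *name;		points at inline_name or kmalloc'd
 *		char inline_name[DLM_RSB_INLINE_NAME];
 *	};
 *
 * allocate_rsb() would then pay for a second allocation only on long
 * names, and free_rsb() would kfree(name) only when it is external. */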
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 if (lkb->lkb_flags & DLM_IFL_USER) {
88 struct dlm_user_args *ua;
89 ua = (struct dlm_user_args *)lkb->lkb_astparam;
90 if (ua) {
91 if (ua->lksb.sb_lvbptr)
92 kfree(ua->lksb.sb_lvbptr);
93 kfree(ua);
94 }
95 }
96 kmem_cache_free(lkb_cache, lkb);
97}
98
99struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
100{
101 struct dlm_direntry *de;
102
103 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
104 printk("namelen = %d\n", namelen););
105
106 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
107 if (de)
108 memset(de, 0, sizeof(*de) + namelen);
109 return de;
110}
111
112void free_direntry(struct dlm_direntry *de)
113{
114 kfree(de);
115}
116
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take buffers from the "real" comms layer,
20 * split them up into messages and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
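/* copy_from_cb() treats "base" as a circular buffer of size "limit".
 * A worked example with illustrative numbers: limit = 4096, offset =
 * 4090, len = 16 copies the 6 bytes at base+4090..4095 first, then the
 * remaining 10 bytes from base+0, reassembling a message that wrapped
 * around the end of the receive buffer. */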
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in.  I doubt this will happen very often, but we
55 * need to be able to cope with it, and I don't want the task waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
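The return convention above matters to the caller: a positive return is the number of bytes consumed, 0 means a partial message was left in place for later, and a negative value is a protocol error. A minimal sketch of the consuming side, with a hypothetical ring-buffer struct cb standing in for the real caller (receive_from_sock() in lowcomms.c); limit is assumed to be a power of two, as the masking in the function above requires:

	ret = dlm_process_incoming_buffer(nodeid, cb.base, cb.offset,
					  cb.len, cb.limit);
	if (ret < 0)
		return ret;			/* drop the connection */
	cb.offset = (cb.offset + ret) & (cb.limit - 1);
	cb.len -= ret;				/* ret == 0: wait for more */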
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100 ls->ls_recover_nodeid = nodeid;
101
102 if (nodeid == dlm_our_nodeid()) {
103 rc = (struct dlm_rcom *) ls->ls_recover_buf;
104 rc->rc_result = dlm_recover_status(ls);
105 goto out;
106 }
107
108 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
109 if (error)
110 goto out;
111 rc->rc_id = ++ls->ls_rcom_seq;
112
113 send_rcom(ls, mh, rc);
114
115 error = dlm_wait_function(ls, &rcom_response);
116 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
117 if (error)
118 goto out;
119
120 rc = (struct dlm_rcom *) ls->ls_recover_buf;
121
122 if (rc->rc_result == -ESRCH) {
123 /* we pretend the remote lockspace exists with 0 status */
124 log_debug(ls, "remote node %d not ready", nodeid);
125 rc->rc_result = 0;
126 } else
127 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
128 nodeid);
129 /* the caller looks at rc_result for the remote recovery status */
130 out:
131 return error;
132}
133
134static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
135{
136 struct dlm_rcom *rc;
137 struct dlm_mhandle *mh;
138 int error, nodeid = rc_in->rc_header.h_nodeid;
139
140 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
141 sizeof(struct rcom_config), &rc, &mh);
142 if (error)
143 return;
144 rc->rc_id = rc_in->rc_id;
145 rc->rc_result = dlm_recover_status(ls);
146 make_config(ls, (struct rcom_config *) rc->rc_buf);
147
148 send_rcom(ls, mh, rc);
149}
150
151static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
152{
153 if (rc_in->rc_id != ls->ls_rcom_seq) {
154		log_debug(ls, "reject old reply %d got %llx wanted %llx",
155			  rc_in->rc_type, (unsigned long long) rc_in->rc_id, (unsigned long long) ls->ls_rcom_seq);
156 return;
157 }
158 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
159 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
160 wake_up(&ls->ls_wait_general);
161}
162
163static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
164{
165 receive_sync_reply(ls, rc_in);
166}
167
168int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
169{
170 struct dlm_rcom *rc;
171 struct dlm_mhandle *mh;
172 int error = 0, len = sizeof(struct dlm_rcom);
173
174 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
175 ls->ls_recover_nodeid = nodeid;
176
177 if (nodeid == dlm_our_nodeid()) {
178 dlm_copy_master_names(ls, last_name, last_len,
179 ls->ls_recover_buf + len,
180 dlm_config.buffer_size - len, nodeid);
181 goto out;
182 }
183
184 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
185 if (error)
186 goto out;
187 memcpy(rc->rc_buf, last_name, last_len);
188 rc->rc_id = ++ls->ls_rcom_seq;
189
190 send_rcom(ls, mh, rc);
191
192 error = dlm_wait_function(ls, &rcom_response);
193 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
194 out:
195 return error;
196}
197
198static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
199{
200 struct dlm_rcom *rc;
201 struct dlm_mhandle *mh;
202 int error, inlen, outlen;
203 int nodeid = rc_in->rc_header.h_nodeid;
204 uint32_t status = dlm_recover_status(ls);
205
206 /*
207 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
208 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
209 * It could only happen in rare cases where we get a late NAMES
210 * message from a previous instance of recovery.
211 */
212
213 if (!(status & DLM_RS_NODES)) {
214 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
215 return;
216 }
217
219 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
220 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
221
222 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
223 if (error)
224 return;
225 rc->rc_id = rc_in->rc_id;
226
227 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
228 nodeid);
229 send_rcom(ls, mh, rc);
230}
231
232static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
233{
234 receive_sync_reply(ls, rc_in);
235}
236
237int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
238{
239 struct dlm_rcom *rc;
240 struct dlm_mhandle *mh;
241 struct dlm_ls *ls = r->res_ls;
242 int error;
243
244 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
245 &rc, &mh);
246 if (error)
247 goto out;
248 memcpy(rc->rc_buf, r->res_name, r->res_length);
249 rc->rc_id = (unsigned long) r;
250
251 send_rcom(ls, mh, rc);
252 out:
253 return error;
254}
255
256static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
257{
258 struct dlm_rcom *rc;
259 struct dlm_mhandle *mh;
260 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
261 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
262
263 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
264 if (error)
265 return;
266
267 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
268 if (error)
269 ret_nodeid = error;
270 rc->rc_result = ret_nodeid;
271 rc->rc_id = rc_in->rc_id;
272
273 send_rcom(ls, mh, rc);
274}
275
276static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
277{
278 dlm_recover_master_reply(ls, rc_in);
279}
280
281static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
282 struct rcom_lock *rl)
283{
284 memset(rl, 0, sizeof(*rl));
285
286 rl->rl_ownpid = lkb->lkb_ownpid;
287 rl->rl_lkid = lkb->lkb_id;
288 rl->rl_exflags = lkb->lkb_exflags;
289 rl->rl_flags = lkb->lkb_flags;
290 rl->rl_lvbseq = lkb->lkb_lvbseq;
291 rl->rl_rqmode = lkb->lkb_rqmode;
292 rl->rl_grmode = lkb->lkb_grmode;
293 rl->rl_status = lkb->lkb_status;
294 rl->rl_wait_type = lkb->lkb_wait_type;
295
296 if (lkb->lkb_bastaddr)
297 rl->rl_asts |= AST_BAST;
298 if (lkb->lkb_astaddr)
299 rl->rl_asts |= AST_COMP;
300
301 rl->rl_namelen = r->res_length;
302 memcpy(rl->rl_name, r->res_name, r->res_length);
303
304 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
305 If so, receive_rcom_lock_args() won't take this copy. */
306
307 if (lkb->lkb_lvbptr)
308 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
309}
310
311int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
312{
313 struct dlm_ls *ls = r->res_ls;
314 struct dlm_rcom *rc;
315 struct dlm_mhandle *mh;
316 struct rcom_lock *rl;
317 int error, len = sizeof(struct rcom_lock);
318
319 if (lkb->lkb_lvbptr)
320 len += ls->ls_lvblen;
321
322 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
323 if (error)
324 goto out;
325
326 rl = (struct rcom_lock *) rc->rc_buf;
327 pack_rcom_lock(r, lkb, rl);
328 rc->rc_id = (unsigned long) r;
329
330 send_rcom(ls, mh, rc);
331 out:
332 return error;
333}
334
335static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
336{
337 struct dlm_rcom *rc;
338 struct dlm_mhandle *mh;
339 int error, nodeid = rc_in->rc_header.h_nodeid;
340
341 dlm_recover_master_copy(ls, rc_in);
342
343 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
344 sizeof(struct rcom_lock), &rc, &mh);
345 if (error)
346 return;
347
348 /* We send back the same rcom_lock struct we received, but
349 dlm_recover_master_copy() has filled in rl_remid and rl_result */
350
351 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
352 rc->rc_id = rc_in->rc_id;
353
354 send_rcom(ls, mh, rc);
355}
356
357static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
358{
359 uint32_t status = dlm_recover_status(ls);
360
361 if (!(status & DLM_RS_DIR)) {
362 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
363 rc_in->rc_header.h_nodeid);
364 return;
365 }
366
367 dlm_recover_process_copy(ls, rc_in);
368}
369
370static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
371{
372 struct dlm_rcom *rc;
373 struct dlm_mhandle *mh;
374 char *mb;
375 int mb_len = sizeof(struct dlm_rcom);
376
377 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
378 if (!mh)
379 return -ENOBUFS;
380 memset(mb, 0, mb_len);
381
382 rc = (struct dlm_rcom *) mb;
383
384 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
385 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
386 rc->rc_header.h_nodeid = dlm_our_nodeid();
387 rc->rc_header.h_length = mb_len;
388 rc->rc_header.h_cmd = DLM_RCOM;
389
390 rc->rc_type = DLM_RCOM_STATUS_REPLY;
391 rc->rc_id = rc_in->rc_id;
392 rc->rc_result = -ESRCH;
393
394 dlm_rcom_out(rc);
395 dlm_lowcomms_commit_buffer(mh);
396
397 return 0;
398}
399
400/* Called by dlm_recvd.  This is the recovery-only counterpart to
401   dlm_receive_message(); all special recovery comms arrive here. */
402
403void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
404{
405 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
406 struct dlm_ls *ls;
407
408 dlm_rcom_in(rc);
409
410 /* If the lockspace doesn't exist then still send a status message
411 back; it's possible that it just doesn't have its global_id yet. */
412
413 ls = dlm_find_lockspace_global(hd->h_lockspace);
414 if (!ls) {
415 log_print("lockspace %x from %d not found",
416 hd->h_lockspace, nodeid);
417 send_ls_not_ready(nodeid, rc);
418 return;
419 }
420
421 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
422 log_error(ls, "ignoring recovery message %x from %d",
423 rc->rc_type, nodeid);
424 goto out;
425 }
426
427 if (nodeid != rc->rc_header.h_nodeid) {
428 log_error(ls, "bad rcom nodeid %d from %d",
429 rc->rc_header.h_nodeid, nodeid);
430 goto out;
431 }
432
433 switch (rc->rc_type) {
434 case DLM_RCOM_STATUS:
435 receive_rcom_status(ls, rc);
436 break;
437
438 case DLM_RCOM_NAMES:
439 receive_rcom_names(ls, rc);
440 break;
441
442 case DLM_RCOM_LOOKUP:
443 receive_rcom_lookup(ls, rc);
444 break;
445
446 case DLM_RCOM_LOCK:
447 receive_rcom_lock(ls, rc);
448 break;
449
450 case DLM_RCOM_STATUS_REPLY:
451 receive_rcom_status_reply(ls, rc);
452 break;
453
454 case DLM_RCOM_NAMES_REPLY:
455 receive_rcom_names_reply(ls, rc);
456 break;
457
458 case DLM_RCOM_LOOKUP_REPLY:
459 receive_rcom_lookup_reply(ls, rc);
460 break;
461
462 case DLM_RCOM_LOCK_REPLY:
463 receive_rcom_lock_reply(ls, rc);
464 break;
465
466 default:
467 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
468 }
469 out:
470 dlm_put_lockspace(ls);
471}
472
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LSFL_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, they should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
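/* The calling idiom, exactly as dlm_rcom_status() and dlm_recover_masters()
 * use it elsewhere in this patch: hand in a cheap predicate and let the
 * timer/wakeup machinery above do the polling.
 *
 *	static int rcom_response(struct dlm_ls *ls)
 *	{
 *		return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
 *	}
 *	...
 *	error = dlm_wait_function(ls, &rcom_response);
 *
 * Note that -EINTR here always means "recovery was stopped", never a
 * signal. */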
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
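/* The status << 1 trick above assumes each X_ALL flag is defined as the
 * bit directly above X, e.g. (values assumed from dlm_internal.h, not
 * shown here) DLM_RS_NODES 0x01 / DLM_RS_NODES_ALL 0x02, and likewise
 * for DIR, LOCKS and DONE.  A worked round with nodes {1, 2, 3} where 1
 * is the low nodeid: 2 and 3 set DLM_RS_NODES locally; 1 polls everyone
 * (wait_status_all) and then sets DLM_RS_NODES_ALL; 2 and 3 poll only
 * node 1 (wait_status_low) until that bit appears.  The result is O(n)
 * rcom traffic per barrier instead of every node polling every other. */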
163
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
306 * Propagate the new master nodeid to locks
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
309 * rsb's to consider.
310 */
311
312static void set_new_master(struct dlm_rsb *r, int nodeid)
313{
314 lock_rsb(r);
315 r->res_nodeid = nodeid;
316 set_master_lkbs(r);
317 rsb_set_flag(r, RSB_NEW_MASTER);
318 rsb_set_flag(r, RSB_NEW_MASTER2);
319 unlock_rsb(r);
320}
321
322/*
323 * We do async lookups on rsb's that need new masters. The rsb's
324 * waiting for a lookup reply are kept on the recover_list.
325 */
326
327static int recover_master(struct dlm_rsb *r)
328{
329 struct dlm_ls *ls = r->res_ls;
330 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
331
332 dir_nodeid = dlm_dir_nodeid(r);
333
334 if (dir_nodeid == our_nodeid) {
335 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
336 r->res_length, &ret_nodeid);
337 if (error)
338 log_error(ls, "recover dir lookup error %d", error);
339
340 if (ret_nodeid == our_nodeid)
341 ret_nodeid = 0;
342 set_new_master(r, ret_nodeid);
343 } else {
344 recover_list_add(r);
345 error = dlm_send_rcom_lookup(r, dir_nodeid);
346 }
347
348 return error;
349}
350
351/*
352 * When not using a directory, most resource names will hash to a new static
353 * master nodeid and the resource will need to be remastered.
354 */
355
356static int recover_master_static(struct dlm_rsb *r)
357{
358 int master = dlm_dir_nodeid(r);
359
360 if (master == dlm_our_nodeid())
361 master = 0;
362
363 if (r->res_nodeid != master) {
364 if (is_master(r))
365 dlm_purge_mstcpy_locks(r);
366 set_new_master(r, master);
367 return 1;
368 }
369 return 0;
370}
371
372/*
373 * Go through local root resources and for each rsb which has a master which
374 * has departed, get the new master nodeid from the directory. The dir will
375 * assign mastery to the first node to look up the new master. That means
376 * we'll discover in this lookup if we're the new master of any rsb's.
377 *
378 * We fire off all the dir lookup requests individually and asynchronously to
379 * the correct dir node.
380 */
381
382int dlm_recover_masters(struct dlm_ls *ls)
383{
384 struct dlm_rsb *r;
385 int error = 0, count = 0;
386
387 log_debug(ls, "dlm_recover_masters");
388
389 down_read(&ls->ls_root_sem);
390 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
391 if (dlm_recovery_stopped(ls)) {
392 up_read(&ls->ls_root_sem);
393 error = -EINTR;
394 goto out;
395 }
396
397 if (dlm_no_directory(ls))
398 count += recover_master_static(r);
399 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
400 recover_master(r);
401 count++;
402 }
403
404 schedule();
405 }
406 up_read(&ls->ls_root_sem);
407
408 log_debug(ls, "dlm_recover_masters %d resources", count);
409
410 error = dlm_wait_function(ls, &recover_list_empty);
411 out:
412 if (error)
413 recover_list_clear(ls);
414 return error;
415}
416
417int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
418{
419 struct dlm_rsb *r;
420 int nodeid;
421
422 r = recover_list_find(ls, rc->rc_id);
423 if (!r) {
424 log_error(ls, "dlm_recover_master_reply no id %llx",
425 (unsigned long long)rc->rc_id);
426 goto out;
427 }
428
429 nodeid = rc->rc_result;
430 if (nodeid == dlm_our_nodeid())
431 nodeid = 0;
432
433 set_new_master(r, nodeid);
434 recover_list_del(r);
435
436 if (recover_list_empty(ls))
437 wake_up(&ls->ls_wait_general);
438 out:
439 return 0;
440}
441
442
443/* Lock recovery: rebuild the process-copy locks we hold on a
444 remastered rsb on the new rsb master.
445
446 dlm_recover_locks
447 recover_locks
448 recover_locks_queue
449 dlm_send_rcom_lock -> receive_rcom_lock
450 dlm_recover_master_copy
451 receive_rcom_lock_reply <-
452 dlm_recover_process_copy
453*/
454
455
456/*
457 * keep a count of the number of lkb's we send to the new master; when we get
458 * an equal number of replies then recovery for the rsb is done
459 */
460
461static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
462{
463 struct dlm_lkb *lkb;
464 int error = 0;
465
466 list_for_each_entry(lkb, head, lkb_statequeue) {
467 error = dlm_send_rcom_lock(r, lkb);
468 if (error)
469 break;
470 r->res_recover_locks_count++;
471 }
472
473 return error;
474}
475
476static int recover_locks(struct dlm_rsb *r)
477{
478 int error = 0;
479
480 lock_rsb(r);
481
482 DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
483
484 error = recover_locks_queue(r, &r->res_grantqueue);
485 if (error)
486 goto out;
487 error = recover_locks_queue(r, &r->res_convertqueue);
488 if (error)
489 goto out;
490 error = recover_locks_queue(r, &r->res_waitqueue);
491 if (error)
492 goto out;
493
494 if (r->res_recover_locks_count)
495 recover_list_add(r);
496 else
497 rsb_clear_flag(r, RSB_NEW_MASTER);
498 out:
499 unlock_rsb(r);
500 return error;
501}
502
503int dlm_recover_locks(struct dlm_ls *ls)
504{
505 struct dlm_rsb *r;
506 int error, count = 0;
507
508 log_debug(ls, "dlm_recover_locks");
509
510 down_read(&ls->ls_root_sem);
511 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
512 if (is_master(r)) {
513 rsb_clear_flag(r, RSB_NEW_MASTER);
514 continue;
515 }
516
517 if (!rsb_flag(r, RSB_NEW_MASTER))
518 continue;
519
520 if (dlm_recovery_stopped(ls)) {
521 error = -EINTR;
522 up_read(&ls->ls_root_sem);
523 goto out;
524 }
525
526 error = recover_locks(r);
527 if (error) {
528 up_read(&ls->ls_root_sem);
529 goto out;
530 }
531
532 count += r->res_recover_locks_count;
533 }
534 up_read(&ls->ls_root_sem);
535
536 log_debug(ls, "dlm_recover_locks %d locks", count);
537
538 error = dlm_wait_function(ls, &recover_list_empty);
539 out:
540 if (error)
541 recover_list_clear(ls);
542 else
543 dlm_set_recover_status(ls, DLM_RS_LOCKS);
544 return error;
545}
546
547void dlm_recovered_lock(struct dlm_rsb *r)
548{
549 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
550
551 r->res_recover_locks_count--;
552 if (!r->res_recover_locks_count) {
553 rsb_clear_flag(r, RSB_NEW_MASTER);
554 recover_list_del(r);
555 }
556
557 if (recover_list_empty(r->res_ls))
558 wake_up(&r->res_ls->ls_wait_general);
559}
560
561/*
562 * The lvb needs to be recovered on all master rsb's. This includes setting
563 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
564 * based on the lvb's of the locks held on the rsb.
565 *
566 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
567 * was already set prior to recovery, it's not cleared, regardless of locks.
568 *
569 * The LVB contents are only considered for changing when this is a new master
570 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
571 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
572 * from the lkb with the largest lvb sequence number.
573 */
574
575static void recover_lvb(struct dlm_rsb *r)
576{
577 struct dlm_lkb *lkb, *high_lkb = NULL;
578 uint32_t high_seq = 0;
579 int lock_lvb_exists = 0;
580 int big_lock_exists = 0;
581 int lvblen = r->res_ls->ls_lvblen;
582
583 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
584 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
585 continue;
586
587 lock_lvb_exists = 1;
588
589 if (lkb->lkb_grmode > DLM_LOCK_CR) {
590 big_lock_exists = 1;
591 goto setflag;
592 }
593
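 /* signed subtraction keeps this comparison wraparound-safe: an
    lvbseq that recently wrapped past high_seq still compares as
    newer */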
594 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
595 high_lkb = lkb;
596 high_seq = lkb->lkb_lvbseq;
597 }
598 }
599
600 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
601 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
602 continue;
603
604 lock_lvb_exists = 1;
605
606 if (lkb->lkb_grmode > DLM_LOCK_CR) {
607 big_lock_exists = 1;
608 goto setflag;
609 }
610
611 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
612 high_lkb = lkb;
613 high_seq = lkb->lkb_lvbseq;
614 }
615 }
616
617 setflag:
618 if (!lock_lvb_exists)
619 goto out;
620
621 if (!big_lock_exists)
622 rsb_set_flag(r, RSB_VALNOTVALID);
623
624 /* don't mess with the lvb unless we're the new master */
625 if (!rsb_flag(r, RSB_NEW_MASTER2))
626 goto out;
627
628 if (!r->res_lvbptr) {
629 r->res_lvbptr = allocate_lvb(r->res_ls);
630 if (!r->res_lvbptr)
631 goto out;
632 }
633
634 if (big_lock_exists) {
635 r->res_lvbseq = lkb->lkb_lvbseq;
636 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
637 } else if (high_lkb) {
638 r->res_lvbseq = high_lkb->lkb_lvbseq;
639 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
640 } else {
641 r->res_lvbseq = 0;
642 memset(r->res_lvbptr, 0, lvblen);
643 }
644 out:
645 return;
646}
647
648/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
649 converting PR->CW or CW->PR need to have their lkb_grmode set. */
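/* (Why, as best the code shows: PR and CW are each compatible with
   themselves but not with each other, so a PR<->CW conversion cannot be
   granted in place. Such in-flight conversions arrive at the new master
   with grmode IV, and the granted mode is reconstructed below from
   another granted PR/CW lock, or from the lock's own rqmode.) */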
650
651static void recover_conversion(struct dlm_rsb *r)
652{
653 struct dlm_lkb *lkb;
654 int grmode = -1;
655
656 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
657 if (lkb->lkb_grmode == DLM_LOCK_PR ||
658 lkb->lkb_grmode == DLM_LOCK_CW) {
659 grmode = lkb->lkb_grmode;
660 break;
661 }
662 }
663
664 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
665 if (lkb->lkb_grmode != DLM_LOCK_IV)
666 continue;
667 if (grmode == -1)
668 lkb->lkb_grmode = lkb->lkb_rqmode;
669 else
670 lkb->lkb_grmode = grmode;
671 }
672}
673
674/* We've become the new master for this rsb and waiting/converting locks may
675 need to be granted in dlm_grant_after_purge() due to locks that may have
676 existed from a removed node. */
677
678static void set_locks_purged(struct dlm_rsb *r)
679{
680 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
681 rsb_set_flag(r, RSB_LOCKS_PURGED);
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 if (rsb_flag(r, RSB_NEW_MASTER2))
698 set_locks_purged(r);
699 recover_lvb(r);
700 count++;
701 }
702 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
703 rsb_clear_flag(r, RSB_NEW_MASTER2);
704 unlock_rsb(r);
705 }
706 up_read(&ls->ls_root_sem);
707
708 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
709}
710
711/* Create a single list of all root rsb's to be used during recovery */
712
713int dlm_create_root_list(struct dlm_ls *ls)
714{
715 struct dlm_rsb *r;
716 int i, error = 0;
717
718 down_write(&ls->ls_root_sem);
719 if (!list_empty(&ls->ls_root_list)) {
720 log_error(ls, "root list not empty");
721 error = -EINVAL;
722 goto out;
723 }
724
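 /* take a reference on each rsb so it can't go away while it sits on
    the root list; the matching put is in dlm_release_root_list() */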
725 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
726 read_lock(&ls->ls_rsbtbl[i].lock);
727 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
728 list_add(&r->res_root_list, &ls->ls_root_list);
729 dlm_hold_rsb(r);
730 }
731 read_unlock(&ls->ls_rsbtbl[i].lock);
732 }
733 out:
734 up_write(&ls->ls_root_sem);
735 return error;
736}
737
738void dlm_release_root_list(struct dlm_ls *ls)
739{
740 struct dlm_rsb *r, *safe;
741
742 down_write(&ls->ls_root_sem);
743 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
744 list_del_init(&r->res_root_list);
745 dlm_put_rsb(r);
746 }
747 up_write(&ls->ls_root_sem);
748}
749
750void dlm_clear_toss_list(struct dlm_ls *ls)
751{
752 struct dlm_rsb *r, *safe;
753 int i;
754
755 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
756 write_lock(&ls->ls_rsbtbl[i].lock);
757 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
758 res_hashchain) {
759 list_del(&r->res_hashchain);
760 free_rsb(r);
761 }
762 write_unlock(&ls->ls_rsbtbl[i].lock);
763 }
764}
765
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48	log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202	log_debug(ls, "recover %llx done: %u ms",
203		  (unsigned long long)rv->seq, jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210	log_debug(ls, "recover %llx error %d", (unsigned long long)rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237 if (!ls) {
238 log_print("dlm_recoverd: no lockspace %p", arg);
239 return -1;
240 }
241
242 while (!kthread_should_stop()) {
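 /* set TASK_INTERRUPTIBLE before testing LSFL_WORK so a wakeup from
    dlm_recoverd_kick() between the test and schedule() isn't lost */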
243 set_current_state(TASK_INTERRUPTIBLE);
244 if (!test_bit(LSFL_WORK, &ls->ls_flags))
245 schedule();
246 set_current_state(TASK_RUNNING);
247
248 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
249 do_ls_recovery(ls);
250 }
251
252 dlm_put_lockspace(ls);
253 return 0;
254}
255
256void dlm_recoverd_kick(struct dlm_ls *ls)
257{
258 set_bit(LSFL_WORK, &ls->ls_flags);
259 wake_up_process(ls->ls_recoverd_task);
260}
261
262int dlm_recoverd_start(struct dlm_ls *ls)
263{
264 struct task_struct *p;
265 int error = 0;
266
267 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
268 if (IS_ERR(p))
269 error = PTR_ERR(p);
270 else
271 ls->ls_recoverd_task = p;
272 return error;
273}
274
275void dlm_recoverd_stop(struct dlm_ls *ls)
276{
277 kthread_stop(ls->ls_recoverd_task);
278}
279
280void dlm_recoverd_suspend(struct dlm_ls *ls)
281{
282 wake_up(&ls->ls_wait_general);
283 mutex_lock(&ls->ls_recoverd_active);
284}
285
286void dlm_recoverd_resume(struct dlm_ls *ls)
287{
288 mutex_unlock(&ls->ls_recoverd_active);
289}
290
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
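 /* variable length: a full dlm_header of h_length bytes is copied in
    here; allocated as sizeof(struct rq_entry) + length below */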
23 char request[1];
24};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43		log_print("dlm_add_requestqueue: out of memory");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..494d00ac014e
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,785 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/init.h>
11#include <linux/wait.h>
12#include <linux/module.h>
13#include <linux/file.h>
14#include <linux/fs.h>
15#include <linux/poll.h>
16#include <linux/signal.h>
17#include <linux/spinlock.h>
18#include <linux/dlm.h>
19#include <linux/dlm_device.h>
20
21#include "dlm_internal.h"
22#include "lockspace.h"
23#include "lock.h"
24#include "lvb_table.h"
25
26static const char *name_prefix = "dlm";
27static struct miscdevice ctl_device;
28static struct file_operations device_fops;
29
30#ifdef CONFIG_COMPAT
31
32struct dlm_lock_params32 {
33 __u8 mode;
34 __u8 namelen;
35 __u16 flags;
36 __u32 lkid;
37 __u32 parent;
38
39 __u32 castparam;
40 __u32 castaddr;
41 __u32 bastparam;
42 __u32 bastaddr;
43 __u32 lksb;
44
45 char lvb[DLM_USER_LVB_LEN];
46 char name[0];
47};
48
49struct dlm_write_request32 {
50 __u32 version[3];
51 __u8 cmd;
52 __u8 is64bit;
53 __u8 unused[2];
54
55 union {
56 struct dlm_lock_params32 lock;
57 struct dlm_lspace_params lspace;
58 } i;
59};
60
61struct dlm_lksb32 {
62 __u32 sb_status;
63 __u32 sb_lkid;
64 __u8 sb_flags;
65 __u32 sb_lvbptr;
66};
67
68struct dlm_lock_result32 {
69 __u32 length;
70 __u32 user_astaddr;
71 __u32 user_astparam;
72 __u32 user_lksb;
73 struct dlm_lksb32 lksb;
74 __u8 bast_mode;
75 __u8 unused[3];
76 /* Offsets may be zero if no data is present */
77 __u32 lvb_offset;
78};
79
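/* 32-bit userspace passes pointers as __u32; they are widened through
   long here so the same values can be handed back in dlm_lock_result32 */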
80static void compat_input(struct dlm_write_request *kb,
81 struct dlm_write_request32 *kb32)
82{
83 kb->version[0] = kb32->version[0];
84 kb->version[1] = kb32->version[1];
85 kb->version[2] = kb32->version[2];
86
87 kb->cmd = kb32->cmd;
88 kb->is64bit = kb32->is64bit;
89 if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
90 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
91 kb->i.lspace.flags = kb32->i.lspace.flags;
92 kb->i.lspace.minor = kb32->i.lspace.minor;
93 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
94 } else {
95 kb->i.lock.mode = kb32->i.lock.mode;
96 kb->i.lock.namelen = kb32->i.lock.namelen;
97 kb->i.lock.flags = kb32->i.lock.flags;
98 kb->i.lock.lkid = kb32->i.lock.lkid;
99 kb->i.lock.parent = kb32->i.lock.parent;
100 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
101 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
102 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
103 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
104 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
105 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
106 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
107 }
108}
109
110static void compat_output(struct dlm_lock_result *res,
111 struct dlm_lock_result32 *res32)
112{
113	/* res->length was already computed by the caller using the 32-bit
114	   struct size, so it is copied through unchanged below */
115 res32->user_astaddr = (__u32)(long)res->user_astaddr;
116 res32->user_astparam = (__u32)(long)res->user_astparam;
117 res32->user_lksb = (__u32)(long)res->user_lksb;
118 res32->bast_mode = res->bast_mode;
119
120 res32->lvb_offset = res->lvb_offset;
121 res32->length = res->length;
122
123 res32->lksb.sb_status = res->lksb.sb_status;
124 res32->lksb.sb_flags = res->lksb.sb_flags;
125 res32->lksb.sb_lkid = res->lksb.sb_lkid;
126 res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
127}
128#endif
129
130
131void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
132{
133 struct dlm_ls *ls;
134 struct dlm_user_args *ua;
135 struct dlm_user_proc *proc;
136 int remove_ownqueue = 0;
137
138 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
139 lkb before dealing with it. We need to check this
140 flag before taking ls_clear_proc_locks mutex because if
141 it's set, dlm_clear_proc_locks() holds the mutex. */
142
143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
144 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
145 return;
146 }
147
148 ls = lkb->lkb_resource->res_ls;
149 mutex_lock(&ls->ls_clear_proc_locks);
150
151 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
152 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
153 lkb->ua so we can't try to use it. */
154
155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
156 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
157 goto out;
158 }
159
160 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
161 ua = (struct dlm_user_args *)lkb->lkb_astparam;
162 proc = ua->proc;
163
164 if (type == AST_BAST && ua->bastaddr == NULL)
165 goto out;
166
167 spin_lock(&proc->asts_spin);
168 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
169 kref_get(&lkb->lkb_ref);
170 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
171 lkb->lkb_ast_type |= type;
172 wake_up_interruptible(&proc->wait);
173 }
174
175 /* noqueue requests that fail may need to be removed from the
176 proc's locks list, there should be a better way of detecting
177 this situation than checking all these things... */
178
179 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
181 remove_ownqueue = 1;
182
183 /* We want to copy the lvb to userspace when the completion
184 ast is read if the status is 0, the lock has an lvb and
185 lvb_ops says we should. We could probably have set_lvb_lock()
186 set update_user_lvb instead and not need old_mode */
187
188 if ((lkb->lkb_ast_type & AST_COMP) &&
189 (lkb->lkb_lksb->sb_status == 0) &&
190 lkb->lkb_lksb->sb_lvbptr &&
191 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
192 ua->update_user_lvb = 1;
193 else
194 ua->update_user_lvb = 0;
195
196 spin_unlock(&proc->asts_spin);
197
198 if (remove_ownqueue) {
199 spin_lock(&ua->proc->locks_spin);
200 list_del_init(&lkb->lkb_ownqueue);
201 spin_unlock(&ua->proc->locks_spin);
202 dlm_put_lkb(lkb);
203 }
204 out:
205 mutex_unlock(&ls->ls_clear_proc_locks);
206}
207
208static int device_user_lock(struct dlm_user_proc *proc,
209 struct dlm_lock_params *params)
210{
211 struct dlm_ls *ls;
212 struct dlm_user_args *ua;
213 int error = -ENOMEM;
214
215 ls = dlm_find_lockspace_local(proc->lockspace);
216 if (!ls)
217 return -ENOENT;
218
219 if (!params->castaddr || !params->lksb) {
220 error = -EINVAL;
221 goto out;
222 }
223
224 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
225 if (!ua)
226 goto out;
227 ua->proc = proc;
228 ua->user_lksb = params->lksb;
229 ua->castparam = params->castparam;
230 ua->castaddr = params->castaddr;
231 ua->bastparam = params->bastparam;
232 ua->bastaddr = params->bastaddr;
233
234 if (params->flags & DLM_LKF_CONVERT)
235 error = dlm_user_convert(ls, ua,
236 params->mode, params->flags,
237 params->lkid, params->lvb);
238 else {
239 error = dlm_user_request(ls, ua,
240 params->mode, params->flags,
241 params->name, params->namelen,
242 params->parent);
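 /* on success the new lkid is handed back to userspace as the
    positive return value of the write() */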
243 if (!error)
244 error = ua->lksb.sb_lkid;
245 }
246 out:
247 dlm_put_lockspace(ls);
248 return error;
249}
250
251static int device_user_unlock(struct dlm_user_proc *proc,
252 struct dlm_lock_params *params)
253{
254 struct dlm_ls *ls;
255 struct dlm_user_args *ua;
256 int error = -ENOMEM;
257
258 ls = dlm_find_lockspace_local(proc->lockspace);
259 if (!ls)
260 return -ENOENT;
261
262 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
263 if (!ua)
264 goto out;
265 ua->proc = proc;
266 ua->user_lksb = params->lksb;
267 ua->castparam = params->castparam;
268 ua->castaddr = params->castaddr;
269
270 if (params->flags & DLM_LKF_CANCEL)
271 error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
272 else
273 error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
274 params->lvb);
275 out:
276 dlm_put_lockspace(ls);
277 return error;
278}
279
280static int device_create_lockspace(struct dlm_lspace_params *params)
281{
282 dlm_lockspace_t *lockspace;
283 struct dlm_ls *ls;
284 int error, len;
285
286 if (!capable(CAP_SYS_ADMIN))
287 return -EPERM;
288
289 error = dlm_new_lockspace(params->name, strlen(params->name),
290 &lockspace, 0, DLM_USER_LVB_LEN);
291 if (error)
292 return error;
293
294 ls = dlm_find_lockspace_local(lockspace);
295 if (!ls)
296 return -ENOENT;
297
298 error = -ENOMEM;
299 len = strlen(params->name) + strlen(name_prefix) + 2;
300 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
301 if (!ls->ls_device.name)
302 goto fail;
303 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
304 params->name);
305 ls->ls_device.fops = &device_fops;
306 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
307
308 error = misc_register(&ls->ls_device);
309 if (error) {
310 kfree(ls->ls_device.name);
311 goto fail;
312 }
313
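 /* success: return the new misc device minor to userspace as the
    positive result of the write() */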
314 error = ls->ls_device.minor;
315 dlm_put_lockspace(ls);
316 return error;
317
318 fail:
319 dlm_put_lockspace(ls);
320 dlm_release_lockspace(lockspace, 0);
321 return error;
322}
323
324static int device_remove_lockspace(struct dlm_lspace_params *params)
325{
326 dlm_lockspace_t *lockspace;
327 struct dlm_ls *ls;
328 int error;
329
330 if (!capable(CAP_SYS_ADMIN))
331 return -EPERM;
332
333 ls = dlm_find_lockspace_device(params->minor);
334 if (!ls)
335 return -ENOENT;
336
337 error = misc_deregister(&ls->ls_device);
338 if (error) {
339 dlm_put_lockspace(ls);
340 goto out;
341 }
342 kfree(ls->ls_device.name);
343
344 lockspace = ls->ls_local_handle;
345
346 /* dlm_release_lockspace waits for references to go to zero,
347 so all processes will need to close their device for the ls
348	   before the release will proceed */
349
350 dlm_put_lockspace(ls);
351 error = dlm_release_lockspace(lockspace, 0);
352out:
353 return error;
354}
355
356/* Check the user's version matches ours */
357static int check_version(struct dlm_write_request *req)
358{
359 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
360 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
361 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
362
363 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
364 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
365 current->comm,
366 current->pid,
367 req->version[0],
368 req->version[1],
369 req->version[2],
370 DLM_DEVICE_VERSION_MAJOR,
371 DLM_DEVICE_VERSION_MINOR,
372 DLM_DEVICE_VERSION_PATCH);
373 return -EINVAL;
374 }
375 return 0;
376}
377
378/*
379 * device_write
380 *
381 * device_user_lock
382 * dlm_user_request -> request_lock
383 * dlm_user_convert -> convert_lock
384 *
385 * device_user_unlock
386 * dlm_user_unlock -> unlock_lock
387 * dlm_user_cancel -> cancel_lock
388 *
389 * device_create_lockspace
390 * dlm_new_lockspace
391 *
392 * device_remove_lockspace
393 * dlm_release_lockspace
394 */
395
396/* a write to a lockspace device is a lock or unlock request, a write
397 to the control device is to create/remove a lockspace */
398
399static ssize_t device_write(struct file *file, const char __user *buf,
400 size_t count, loff_t *ppos)
401{
402 struct dlm_user_proc *proc = file->private_data;
403 struct dlm_write_request *kbuf;
404 sigset_t tmpsig, allsigs;
405 int error;
406
407#ifdef CONFIG_COMPAT
408 if (count < sizeof(struct dlm_write_request32))
409#else
410 if (count < sizeof(struct dlm_write_request))
411#endif
412 return -EINVAL;
413
414 kbuf = kmalloc(count, GFP_KERNEL);
415 if (!kbuf)
416 return -ENOMEM;
417
418 if (copy_from_user(kbuf, buf, count)) {
419 error = -EFAULT;
420 goto out_free;
421 }
422
423 if (check_version(kbuf)) {
424 error = -EBADE;
425 goto out_free;
426 }
427
428#ifdef CONFIG_COMPAT
429 if (!kbuf->is64bit) {
430 struct dlm_write_request32 *k32buf;
431 k32buf = (struct dlm_write_request32 *)kbuf;
432 kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
433 sizeof(struct dlm_write_request32)), GFP_KERNEL);
434		if (!kbuf) {
			kfree(k32buf); /* don't leak the original 32-bit copy */
435			return -ENOMEM;
		}
436
437 if (proc)
438 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
439 compat_input(kbuf, k32buf);
440 kfree(k32buf);
441 }
442#endif
443
444 /* do we really need this? can a write happen after a close? */
445	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
446	    (!proc || test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
		/* the control device has no proc and can't take lock
		   requests; also free kbuf rather than leaking it here */
		error = -EINVAL;
447		goto out_free;
	}
448
449 sigfillset(&allsigs);
450 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
451
452 error = -EINVAL;
453
454 switch (kbuf->cmd)
455 {
456 case DLM_USER_LOCK:
457 if (!proc) {
458 log_print("no locking on control device");
459 goto out_sig;
460 }
461 error = device_user_lock(proc, &kbuf->i.lock);
462 break;
463
464 case DLM_USER_UNLOCK:
465 if (!proc) {
466 log_print("no locking on control device");
467 goto out_sig;
468 }
469 error = device_user_unlock(proc, &kbuf->i.lock);
470 break;
471
472 case DLM_USER_CREATE_LOCKSPACE:
473 if (proc) {
474 log_print("create/remove only on control device");
475 goto out_sig;
476 }
477 error = device_create_lockspace(&kbuf->i.lspace);
478 break;
479
480 case DLM_USER_REMOVE_LOCKSPACE:
481 if (proc) {
482 log_print("create/remove only on control device");
483 goto out_sig;
484 }
485 error = device_remove_lockspace(&kbuf->i.lspace);
486 break;
487
488 default:
489		log_print("Unknown command passed to DLM device: %d",
490 kbuf->cmd);
491 }
492
493 out_sig:
494 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
495 recalc_sigpending();
496 out_free:
497 kfree(kbuf);
498 return error;
499}
500
501/* Every process that opens the lockspace device has its own "proc" structure
502 hanging off the open file that's used to keep track of locks owned by the
503 process and asts that need to be delivered to the process. */
504
505static int device_open(struct inode *inode, struct file *file)
506{
507 struct dlm_user_proc *proc;
508 struct dlm_ls *ls;
509
510 ls = dlm_find_lockspace_device(iminor(inode));
511 if (!ls)
512 return -ENOENT;
513
514 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
515 if (!proc) {
516 dlm_put_lockspace(ls);
517 return -ENOMEM;
518 }
519
520 proc->lockspace = ls->ls_local_handle;
521 INIT_LIST_HEAD(&proc->asts);
522 INIT_LIST_HEAD(&proc->locks);
523 spin_lock_init(&proc->asts_spin);
524 spin_lock_init(&proc->locks_spin);
525 init_waitqueue_head(&proc->wait);
526 file->private_data = proc;
527
528 return 0;
529}
530
531static int device_close(struct inode *inode, struct file *file)
532{
533 struct dlm_user_proc *proc = file->private_data;
534 struct dlm_ls *ls;
535 sigset_t tmpsig, allsigs;
536
537 ls = dlm_find_lockspace_local(proc->lockspace);
538 if (!ls)
539 return -ENOENT;
540
541 sigfillset(&allsigs);
542 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
543
544 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
545
546 dlm_clear_proc_locks(ls, proc);
547
548 /* at this point no more lkb's should exist for this lockspace,
549 so there's no chance of dlm_user_add_ast() being called and
550 looking for lkb->ua->proc */
551
552 kfree(proc);
553 file->private_data = NULL;
554
555 dlm_put_lockspace(ls);
556 dlm_put_lockspace(ls); /* for the find in device_open() */
557
558 /* FIXME: AUTOFREE: if this ls is no longer used do
559 device_remove_lockspace() */
560
561 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
562 recalc_sigpending();
563
564 return 0;
565}
566
567static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
568 int bmode, char __user *buf, size_t count)
569{
570#ifdef CONFIG_COMPAT
571 struct dlm_lock_result32 result32;
572#endif
573 struct dlm_lock_result result;
574 void *resultptr;
575	int error = 0;
576 int len;
577 int struct_len;
578
579 memset(&result, 0, sizeof(struct dlm_lock_result));
580 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
581 result.user_lksb = ua->user_lksb;
582
583 /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
584 in a conversion unless the conversion is successful. See code
585 in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
586 notes that a new blocking AST address and parameter are set even if
587 the conversion fails, so maybe we should just do that. */
588
589 if (type == AST_BAST) {
590 result.user_astaddr = ua->bastaddr;
591 result.user_astparam = ua->bastparam;
592 result.bast_mode = bmode;
593 } else {
594 result.user_astaddr = ua->castaddr;
595 result.user_astparam = ua->castparam;
596 }
597
598#ifdef CONFIG_COMPAT
599 if (compat)
600 len = sizeof(struct dlm_lock_result32);
601 else
602#endif
603 len = sizeof(struct dlm_lock_result);
604 struct_len = len;
605
606 /* copy lvb to userspace if there is one, it's been updated, and
607 the user buffer has space for it */
608
609 if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
610 count >= len + DLM_USER_LVB_LEN) {
611 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
612 DLM_USER_LVB_LEN)) {
613 error = -EFAULT;
614 goto out;
615 }
616
617 result.lvb_offset = len;
618 len += DLM_USER_LVB_LEN;
619 }
620
621 result.length = len;
622 resultptr = &result;
623#ifdef CONFIG_COMPAT
624 if (compat) {
625 compat_output(&result, &result32);
626 resultptr = &result32;
627 }
628#endif
629
630 if (copy_to_user(buf, resultptr, struct_len))
631 error = -EFAULT;
632 else
633 error = len;
634 out:
635 return error;
636}
637
638/* a read returns a single ast described in a struct dlm_lock_result */
639
640static ssize_t device_read(struct file *file, char __user *buf, size_t count,
641 loff_t *ppos)
642{
643 struct dlm_user_proc *proc = file->private_data;
644 struct dlm_lkb *lkb;
645 struct dlm_user_args *ua;
646 DECLARE_WAITQUEUE(wait, current);
648	int error, type = 0, bmode = 0, removed = 0;
648
649#ifdef CONFIG_COMPAT
650 if (count < sizeof(struct dlm_lock_result32))
651#else
652 if (count < sizeof(struct dlm_lock_result))
653#endif
654 return -EINVAL;
655
656 /* do we really need this? can a read happen after a close? */
657 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
658 return -EINVAL;
659
660 spin_lock(&proc->asts_spin);
661 if (list_empty(&proc->asts)) {
662 if (file->f_flags & O_NONBLOCK) {
663 spin_unlock(&proc->asts_spin);
664 return -EAGAIN;
665 }
666
667 add_wait_queue(&proc->wait, &wait);
668
669 repeat:
670 set_current_state(TASK_INTERRUPTIBLE);
671 if (list_empty(&proc->asts) && !signal_pending(current)) {
672 spin_unlock(&proc->asts_spin);
673 schedule();
674 spin_lock(&proc->asts_spin);
675 goto repeat;
676 }
677 set_current_state(TASK_RUNNING);
678 remove_wait_queue(&proc->wait, &wait);
679
680 if (signal_pending(current)) {
681 spin_unlock(&proc->asts_spin);
682 return -ERESTARTSYS;
683 }
684 }
685
686 if (list_empty(&proc->asts)) {
687 spin_unlock(&proc->asts_spin);
688 return -EAGAIN;
689 }
690
691 /* there may be both completion and blocking asts to return for
692 the lkb, don't remove lkb from asts list unless no asts remain */
693
694 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
695
696 if (lkb->lkb_ast_type & AST_COMP) {
697 lkb->lkb_ast_type &= ~AST_COMP;
698 type = AST_COMP;
699 } else if (lkb->lkb_ast_type & AST_BAST) {
700 lkb->lkb_ast_type &= ~AST_BAST;
701 type = AST_BAST;
702 bmode = lkb->lkb_bastmode;
703 }
704
705 if (!lkb->lkb_ast_type) {
706 list_del(&lkb->lkb_astqueue);
707 removed = 1;
708 }
709 spin_unlock(&proc->asts_spin);
710
711 ua = (struct dlm_user_args *)lkb->lkb_astparam;
712 error = copy_result_to_user(ua,
713 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
714 type, bmode, buf, count);
715
716 /* removes reference for the proc->asts lists added by
717 dlm_user_add_ast() and may result in the lkb being freed */
718 if (removed)
719 dlm_put_lkb(lkb);
720
721 return error;
722}
723
724static unsigned int device_poll(struct file *file, poll_table *wait)
725{
726 struct dlm_user_proc *proc = file->private_data;
727
728 poll_wait(file, &proc->wait, wait);
729
730 spin_lock(&proc->asts_spin);
731 if (!list_empty(&proc->asts)) {
732 spin_unlock(&proc->asts_spin);
733 return POLLIN | POLLRDNORM;
734 }
735 spin_unlock(&proc->asts_spin);
736 return 0;
737}
738
739static int ctl_device_open(struct inode *inode, struct file *file)
740{
741 file->private_data = NULL;
742 return 0;
743}
744
745static int ctl_device_close(struct inode *inode, struct file *file)
746{
747 return 0;
748}
749
750static struct file_operations device_fops = {
751 .open = device_open,
752 .release = device_close,
753 .read = device_read,
754 .write = device_write,
755 .poll = device_poll,
756 .owner = THIS_MODULE,
757};
758
759static struct file_operations ctl_device_fops = {
760 .open = ctl_device_open,
761 .release = ctl_device_close,
762 .write = device_write,
763 .owner = THIS_MODULE,
764};
765
766int dlm_user_init(void)
767{
768 int error;
769
770 ctl_device.name = "dlm-control";
771 ctl_device.fops = &ctl_device_fops;
772 ctl_device.minor = MISC_DYNAMIC_MINOR;
773
774 error = misc_register(&ctl_device);
775 if (error)
776 log_print("misc_register failed for control device");
777
778 return error;
779}
780
781void dlm_user_exit(void)
782{
783 misc_deregister(&ctl_device);
784}
785
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__
11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void);
14void dlm_user_exit(void);
15
16#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
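/* Everything on the wire is little-endian; the helpers below convert
   each header, message, and rcom field between host and wire order. */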
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
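 /* keep a host-order copy of rc_type: it's byte-swapped below but
    still needed to pick the right payload conversion */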
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL
4 select FS_POSIX_ACL
5 help
6 A cluster filesystem.
7
8 Allows a cluster of computers to simultaneously use a block device
9	  that is shared between them (with FC, iSCSI, NBD, etc.). GFS reads
10 and writes to the block device like a local filesystem, but also uses
11	  a lock module to allow the computers to coordinate their I/O so
12 filesystem consistency is maintained. One of the nifty features of
13 GFS is perfect consistency -- changes made to the filesystem on one
14 machine show up immediately on all other machines in the cluster.
15
16 To use the GFS2 filesystem, you will need to enable one or more of
17 the below locking modules. Documentation and utilities for GFS2 can
18 be found here: http://sources.redhat.com/cluster
19
20config GFS2_FS_LOCKING_NOLOCK
21 tristate "GFS2 \"nolock\" locking module"
22 depends on GFS2_FS
23 help
24 Single node locking module for GFS2.
25
26 Use this module if you want to use GFS2 on a single node without
27 its clustering features. You can still take advantage of the
28 large file support, and upgrade to running a full cluster later on
29 if required.
30
31 If you will only be using GFS2 in cluster mode, you do not need this
32 module.
33
34config GFS2_FS_LOCKING_DLM
35 tristate "GFS2 DLM locking module"
36 depends on GFS2_FS
37 select DLM
38 help
39	  Multiple node locking module for GFS2.
40
41 Most users of GFS2 will require this module. It provides the locking
42 interface between GFS2 and the DLM, which is required to use GFS2
43 in a cluster environment.
44
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..b92852b66629
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o lvb.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o
7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..399317841501
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,313 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "trans.h"
29#include "util.h"
30
31#define ACL_ACCESS 1
32#define ACL_DEFAULT 0
33
34int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
35 struct gfs2_ea_request *er,
36 int *remove, mode_t *mode)
37{
38 struct posix_acl *acl;
39 int error;
40
41 error = gfs2_acl_validate_remove(ip, access);
42 if (error)
43 return error;
44
45 if (!er->er_data)
46 return -EINVAL;
47
48 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
49 if (IS_ERR(acl))
50 return PTR_ERR(acl);
51 if (!acl) {
52 *remove = 1;
53 return 0;
54 }
55
56 error = posix_acl_valid(acl);
57 if (error)
58 goto out;
59
60 if (access) {
61 error = posix_acl_equiv_mode(acl, mode);
62 if (!error)
63 *remove = 1;
64 else if (error > 0)
65 error = 0;
66 }
67
68 out:
69 posix_acl_release(acl);
70
71 return error;
72}
73
74int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
75{
76 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
77 return -EOPNOTSUPP;
78 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
79 return -EPERM;
80 if (S_ISLNK(ip->i_di.di_mode))
81 return -EOPNOTSUPP;
82 if (!access && !S_ISDIR(ip->i_di.di_mode))
83 return -EACCES;
84
85 return 0;
86}
87
88static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
89 struct gfs2_ea_location *el, char **data, unsigned int *len)
90{
91 struct gfs2_ea_request er;
92 struct gfs2_ea_location el_this;
93 int error;
94
95 if (!ip->i_di.di_eattr)
96 return 0;
97
98 memset(&er, 0, sizeof(struct gfs2_ea_request));
99 if (access) {
100 er.er_name = GFS2_POSIX_ACL_ACCESS;
101 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
102 } else {
103 er.er_name = GFS2_POSIX_ACL_DEFAULT;
104 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
105 }
106 er.er_type = GFS2_EATYPE_SYS;
107
108 if (!el)
109 el = &el_this;
110
111 error = gfs2_ea_find(ip, &er, el);
112 if (error)
113 return error;
114 if (!el->el_ea)
115 return 0;
116 if (!GFS2_EA_DATA_LEN(el->el_ea))
117 goto out;
118
119 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
120 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
121 error = -ENOMEM;
122 if (!er.er_data)
123 goto out;
124
125 error = gfs2_ea_get_copy(ip, el, er.er_data);
126 if (error)
127 goto out_kfree;
128
129 if (acl) {
130 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
131 if (IS_ERR(*acl))
132 error = PTR_ERR(*acl);
133 }
134
135 out_kfree:
136 if (error || !data)
137 kfree(er.er_data);
138 else {
139 *data = er.er_data;
140 *len = er.er_data_len;
141 }
142
143 out:
144 if (error || el == &el_this)
145 brelse(el->el_bh);
146
147 return error;
148}
149
150/**
151 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
152 * @inode: the file we want to do something to
153 * @mask: what we want to do
154 *
155 * Returns: errno
156 */
157
158int gfs2_check_acl_locked(struct inode *inode, int mask)
159{
160 struct posix_acl *acl = NULL;
161 int error;
162
163 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
164 if (error)
165 return error;
166
167 if (acl) {
168 error = posix_acl_permission(inode, acl, mask);
169 posix_acl_release(acl);
170 return error;
171 }
172
173 return -EAGAIN;
174}
175
176int gfs2_check_acl(struct inode *inode, int mask)
177{
178 struct gfs2_inode *ip = GFS2_I(inode);
179 struct gfs2_holder i_gh;
180 int error;
181
182 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
183 if (!error) {
184 error = gfs2_check_acl_locked(inode, mask);
185 gfs2_glock_dq_uninit(&i_gh);
186 }
187
188 return error;
189}
190
191static int munge_mode(struct gfs2_inode *ip, mode_t mode)
192{
193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
194 struct buffer_head *dibh;
195 int error;
196
197 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
198 if (error)
199 return error;
200
201 error = gfs2_meta_inode_buffer(ip, &dibh);
202 if (!error) {
203 gfs2_assert_withdraw(sdp,
204 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
205 ip->i_di.di_mode = mode;
206 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
207 gfs2_dinode_out(&ip->i_di, dibh->b_data);
208 brelse(dibh);
209 }
210
211 gfs2_trans_end(sdp);
212
213 return 0;
214}
215
216int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
217{
218 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
219 struct posix_acl *acl = NULL, *clone;
220 struct gfs2_ea_request er;
221 mode_t mode = ip->i_di.di_mode;
222 int error;
223
224 if (!sdp->sd_args.ar_posix_acl)
225 return 0;
226 if (S_ISLNK(ip->i_di.di_mode))
227 return 0;
228
229 memset(&er, 0, sizeof(struct gfs2_ea_request));
230 er.er_type = GFS2_EATYPE_SYS;
231
232 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
233 &er.er_data, &er.er_data_len);
234 if (error)
235 return error;
236 if (!acl) {
237 mode &= ~current->fs->umask;
238 if (mode != ip->i_di.di_mode)
239 error = munge_mode(ip, mode);
240 return error;
241 }
242
243 clone = posix_acl_clone(acl, GFP_KERNEL);
244 error = -ENOMEM;
245 if (!clone)
246 goto out;
247 posix_acl_release(acl);
248 acl = clone;
249
250 if (S_ISDIR(ip->i_di.di_mode)) {
251 er.er_name = GFS2_POSIX_ACL_DEFAULT;
252 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
253 error = gfs2_system_eaops.eo_set(ip, &er);
254 if (error)
255 goto out;
256 }
257
258 error = posix_acl_create_masq(acl, &mode);
259 if (error < 0)
260 goto out;
261 if (error > 0) {
262 er.er_name = GFS2_POSIX_ACL_ACCESS;
263 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
264 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
265 er.er_mode = mode;
266 er.er_flags = GFS2_ERF_MODE;
267 error = gfs2_system_eaops.eo_set(ip, &er);
268 if (error)
269 goto out;
270 } else
271 munge_mode(ip, mode);
272
273 out:
274 posix_acl_release(acl);
275 kfree(er.er_data);
276 return error;
277}
278
279int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
280{
281 struct posix_acl *acl = NULL, *clone;
282 struct gfs2_ea_location el;
283 char *data;
284 unsigned int len;
285 int error;
286
287 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
288 if (error)
289 return error;
290 if (!acl)
291 return gfs2_setattr_simple(ip, attr);
292
293 clone = posix_acl_clone(acl, GFP_KERNEL);
294 error = -ENOMEM;
295 if (!clone)
296 goto out;
297 posix_acl_release(acl);
298 acl = clone;
299
300 error = posix_acl_chmod_masq(acl, attr->ia_mode);
301 if (!error) {
302 posix_acl_to_xattr(acl, data, len);
303 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
304 }
305
306 out:
307 posix_acl_release(acl);
308 brelse(el.el_bh);
309 kfree(data);
310
311 return error;
312}
313
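The least obvious rule encoded in gfs2_acl_create() above is that a default
ACL on the parent directory displaces the process umask: with no default ACL
the new inode's mode is masked by the umask (mode &= ~current->fs->umask),
otherwise posix_acl_create_masq() derives the permission bits from the ACL.
A hedged standalone sketch of just that branch (userspace C; the two
arguments after umask_bits are stand-ins for the parent's default ACL and
the posix_acl_create_masq() result):

    /* Illustrative sketch of the mode selection in gfs2_acl_create(). */
    #include <sys/types.h>

    static mode_t effective_create_mode(mode_t requested, mode_t umask_bits,
                                        int have_default_acl,
                                        mode_t acl_masked_mode)
    {
            if (!have_default_acl)
                    return requested & ~umask_bits; /* umask applies */
            return acl_masked_mode; /* the default ACL overrides the umask */
    }
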
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..067105786eaa
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,37 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
14#define GFS2_POSIX_ACL_ACCESS_LEN 16
15#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
16#define GFS2_POSIX_ACL_DEFAULT_LEN 17
17
18#define GFS2_ACL_IS_ACCESS(name, len) \
19 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
20 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
21
22#define GFS2_ACL_IS_DEFAULT(name, len) \
23 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
24 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
25
26struct gfs2_ea_request;
27
28int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
29 struct gfs2_ea_request *er,
30 int *remove, mode_t *mode);
31int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
32int gfs2_check_acl_locked(struct inode *inode, int mask);
33int gfs2_check_acl(struct inode *inode, int mask);
34int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
35int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
36
37#endif /* __ACL_DOT_H__ */
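These name-matching macros let the system-namespace extended attribute code
dispatch between the two POSIX ACL names. A hedged sketch of the expected
caller shape (illustrative fragment; er, ip, remove and mode are assumed to
be in scope as in acl.c above, and the final else is an assumption about how
unknown system names would be rejected):

    if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len))
            error = gfs2_acl_validate_set(ip, 1, er, &remove, &mode);
    else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))
            error = gfs2_acl_validate_set(ip, 0, er, &remove, &mode);
    else
            error = -EOPNOTSUPP;
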
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..d20d41e1c028
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1236 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "dir.h"
29#include "util.h"
30#include "ops_address.h"
31
32/* This doesn't need to be that large as max 64 bit pointers in a 4k
33 * block is 512, so __u16 is fine for that. It saves stack space to
34 * keep it small.
35 */
36struct metapath {
37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
38};
39
40typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
41 struct buffer_head *bh, uint64_t *top,
42 uint64_t *bottom, unsigned int height,
43 void *data);
44
45struct strip_mine {
46 int sm_first;
47 unsigned int sm_height;
48};
49
50/**
51 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
52 * @ip: the inode
53 * @dibh: the dinode buffer
54 * @block: the block number that was allocated
55 * @page: any locked page held by the caller process
56 *
57 * Returns: errno
58 */
59
60static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
61 uint64_t block, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 struct inode *inode = &ip->i_inode;
65 struct buffer_head *bh;
66 int release = 0;
67
68 if (!page || page->index) {
69 page = grab_cache_page(inode->i_mapping, 0);
70 if (!page)
71 return -ENOMEM;
72 release = 1;
73 }
74
75 if (!PageUptodate(page)) {
76 void *kaddr = kmap(page);
77
78 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
79 ip->i_di.di_size);
80 memset(kaddr + ip->i_di.di_size, 0,
81 PAGE_CACHE_SIZE - ip->i_di.di_size);
82 kunmap(page);
83
84 SetPageUptodate(page);
85 }
86
87 if (!page_has_buffers(page))
88 create_empty_buffers(page, 1 << inode->i_blkbits,
89 (1 << BH_Uptodate));
90
91 bh = page_buffers(page);
92
93 if (!buffer_mapped(bh))
94 map_bh(bh, inode->i_sb, block);
95
96 set_buffer_uptodate(bh);
97 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip))
98 gfs2_trans_add_bh(ip->i_gl, bh, 0);
99 mark_buffer_dirty(bh);
100
101 if (release) {
102 unlock_page(page);
103 page_cache_release(page);
104 }
105
106 return 0;
107}
108
109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff
112 * @page: an optional locked page to receive the unstuffed data; if it is
113 * NULL (or not page 0), gfs2_unstuffer_page() grabs page 0 itself
114 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way.
117 *
118 * Returns: errno
119 */
120
121int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
122{
123 struct buffer_head *bh, *dibh;
124 uint64_t block = 0;
125 int isdir = gfs2_is_dir(ip);
126 int error;
127
128 down_write(&ip->i_rw_mutex);
129
130 error = gfs2_meta_inode_buffer(ip, &dibh);
131 if (error)
132 goto out;
133
134 if (ip->i_di.di_size) {
135 /* Get a free block, fill it with the stuffed data,
136 and write it out to disk */
137
138 if (isdir) {
139 block = gfs2_alloc_meta(ip);
140
141 error = gfs2_dir_get_new_buffer(ip, block, &bh);
142 if (error)
143 goto out_brelse;
144 gfs2_buffer_copy_tail(bh,
145 sizeof(struct gfs2_meta_header),
146 dibh, sizeof(struct gfs2_dinode));
147 brelse(bh);
148 } else {
149 block = gfs2_alloc_data(ip);
150
151 error = gfs2_unstuffer_page(ip, dibh, block, page);
152 if (error)
153 goto out_brelse;
154 }
155 }
156
157 /* Set up the pointer to the new block */
158
159 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
160
161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
162
163 if (ip->i_di.di_size) {
164 *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) =
165 cpu_to_be64(block);
166 ip->i_di.di_blocks++;
167 }
168
169 ip->i_di.di_height = 1;
170
171 gfs2_dinode_out(&ip->i_di, dibh->b_data);
172
173 out_brelse:
174 brelse(dibh);
175
176 out:
177 up_write(&ip->i_rw_mutex);
178
179 return error;
180}
181
182/**
183 * calc_tree_height - Calculate the height of a metadata tree
184 * @ip: The GFS2 inode
185 * @size: The proposed size of the file
186 *
187 * Work out how tall a metadata tree needs to be in order to accommodate a
188 * file of a particular size. If size is less than the current size of
189 * the inode, then the current size of the inode is used instead of the
190 * supplied one.
191 *
192 * Returns: the height the tree should be
193 */
194
195static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
196{
197 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
198 uint64_t *arr;
199 unsigned int max, height;
200
201 if (ip->i_di.di_size > size)
202 size = ip->i_di.di_size;
203
204 if (gfs2_is_dir(ip)) {
205 arr = sdp->sd_jheightsize;
206 max = sdp->sd_max_jheight;
207 } else {
208 arr = sdp->sd_heightsize;
209 max = sdp->sd_max_height;
210 }
211
212 for (height = 0; height < max; height++)
213 if (arr[height] >= size)
214 break;
215
216 return height;
217}
218
219/**
220 * build_height - Build a metadata tree of the requested height
221 * @ip: The GFS2 inode
222 * @height: The height to build to
223 *
224 *
225 * Returns: errno
226 */
227
228static int build_height(struct inode *inode, unsigned height)
229{
230 struct gfs2_inode *ip = GFS2_I(inode);
231 unsigned new_height = height - ip->i_di.di_height;
232 struct buffer_head *dibh;
233 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
234 int error;
235 u64 *bp;
236 u64 bn;
237 unsigned n;
238
239 if (height <= ip->i_di.di_height)
240 return 0;
241
242 error = gfs2_meta_inode_buffer(ip, &dibh);
243 if (error)
244 return error;
245
246 for(n = 0; n < new_height; n++) {
247 bn = gfs2_alloc_meta(ip);
248 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
249 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
250 }
251
252 n = 0;
253 bn = blocks[0]->b_blocknr;
254 if (new_height > 1) {
255 for(; n < new_height-1; n++) {
256 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
257 GFS2_FORMAT_IN);
258 gfs2_buffer_clear_tail(blocks[n],
259 sizeof(struct gfs2_meta_header));
260 bp = (u64 *)(blocks[n]->b_data +
261 sizeof(struct gfs2_meta_header));
262 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
263 brelse(blocks[n]);
264 blocks[n] = NULL;
265 }
266 }
267 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
268 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
269 dibh, sizeof(struct gfs2_dinode));
270 brelse(blocks[n]);
271 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
272 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
273 bp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
274 *bp = cpu_to_be64(bn);
275 ip->i_di.di_height += new_height;
276 ip->i_di.di_blocks += new_height;
277 gfs2_dinode_out(&ip->i_di, dibh->b_data);
278 brelse(dibh);
279 return error;
280}
281
282/**
283 * find_metapath - Find path through the metadata tree
284 * @ip: The inode pointer
285 * @block: The disk block to look up
286 * @mp: The metapath to return the result in
287 *
288 * This routine returns a struct metapath structure that defines a path
289 * through the metadata of inode "ip" to get to block "block".
290 *
291 * Example:
292 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
293 * filesystem with a blocksize of 4096.
294 *
295 * find_metapath() would return a struct metapath structure whose
296 * mp_list is set to: mp_list[0] = 0, mp_list[1] = 48, and
297 * mp_list[2] = 165.
298 *
299 * That means that in order to get to the block containing the byte at
300 * offset 101342453, we would load the indirect block pointed to by pointer
301 * 0 in the dinode. We would then load the indirect block pointed to by
302 * pointer 48 in that indirect block. We would then load the data block
303 * pointed to by pointer 165 in that indirect block.
304 *
305 * ----------------------------------------
306 * | Dinode | |
307 * | | 4|
308 * | |0 1 2 3 4 5 9|
309 * | | 6|
310 * ----------------------------------------
311 * |
312 * |
313 * V
314 * ----------------------------------------
315 * | Indirect Block |
316 * | 5|
317 * | 4 4 4 4 4 5 5 1|
318 * |0 5 6 7 8 9 0 1 2|
319 * ----------------------------------------
320 * |
321 * |
322 * V
323 * ----------------------------------------
324 * | Indirect Block |
325 * | 1 1 1 1 1 5|
326 * | 6 6 6 6 6 1|
327 * |0 3 4 5 6 7 2|
328 * ----------------------------------------
329 * |
330 * |
331 * V
332 * ----------------------------------------
333 * | Data block containing offset |
334 * | 101342453 |
335 * | |
336 * | |
337 * ----------------------------------------
338 *
339 */
340
341static void find_metapath(struct gfs2_inode *ip, uint64_t block,
342 struct metapath *mp)
343{
344 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
345 uint64_t b = block;
346 unsigned int i;
347
348 for (i = ip->i_di.di_height; i--;)
349 mp->mp_list[i] = (__u16)do_div(b, sdp->sd_inptrs);
350
351}
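/* Illustrative expansion of the loop above (assumptions: 4096-byte blocks
 * and a 24-byte gfs2_meta_header, so an indirect block holds
 * (4096 - 24) / 8 = 509 pointers): for a height-3 file, logical block b
 * decomposes as
 *
 *	mp->mp_list[2] = b % 509;  b /= 509;
 *	mp->mp_list[1] = b % 509;  b /= 509;
 *	mp->mp_list[0] = b % 509;
 *
 * i.e. mp_list[] holds the base-sd_inptrs "digits" of b, most significant
 * digit first.
 */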
352
353/**
354 * metapointer - Return pointer to start of metadata in a buffer
355 * @bh: The buffer
356 * @height: The metadata height (0 = dinode)
357 * @mp: The metapath
358 *
359 * Return a pointer to the block number of the next height of the metadata
360 * tree given a buffer containing the pointer to the current height of the
361 * metadata tree.
362 */
363
364static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
365 unsigned int height, const struct metapath *mp)
366{
367 unsigned int head_size = (height > 0) ?
368 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
369 u64 *ptr;
370 *boundary = 0;
371 ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
372 if (ptr + 1 == (u64*)(bh->b_data + bh->b_size))
373 *boundary = 1;
374 return ptr;
375}
376
377/**
378 * lookup_block - Get the next metadata block in metadata tree
379 * @ip: The GFS2 inode
380 * @bh: Buffer containing the pointers to metadata blocks
381 * @height: The height of the tree (0 = dinode)
382 * @mp: The metapath
383 * @create: Non-zero if we may create a new metadata block
384 * @new: Used to indicate if we did create a new metadata block
385 * @block: the returned disk block number
386 *
387 * Given a metatree, complete to a particular height, checks to see if the next
388 * height of the tree exists. If not, the next height of the tree is created.
389 * The block number of the next height of the metadata tree is returned.
390 *
391 */
392
393static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
394 unsigned int height, struct metapath *mp, int create,
395 int *new, uint64_t *block)
396{
397 int boundary;
398 uint64_t *ptr = metapointer(bh, &boundary, height, mp);
399
400 if (*ptr) {
401 *block = be64_to_cpu(*ptr);
402 return boundary;
403 }
404
405 *block = 0;
406
407 if (!create)
408 return 0;
409
410 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
411 *block = gfs2_alloc_data(ip);
412 else
413 *block = gfs2_alloc_meta(ip);
414
415 gfs2_trans_add_bh(ip->i_gl, bh, 1);
416
417 *ptr = cpu_to_be64(*block);
418 ip->i_di.di_blocks++;
419
420 *new = 1;
421 return 0;
422}
423
424/**
425 * gfs2_block_pointers - Map a block from an inode to a disk block
426 * @inode: The inode
427 * @lblock: The logical block number
428 * @new: Value/Result argument (1 = may create/did create new blocks)
429 * @boundary: gets set if we've hit a block boundary
430 * @mp: metapath to use
431 *
432 * Find the block number on the current device which corresponds to an
433 * inode's block. If the block had to be created, "new" will be set.
434 *
435 * Returns: the final level's metadata buffer, NULL, or an ERR_PTR(errno)
436 */
437
438static struct buffer_head *gfs2_block_pointers(struct inode *inode, u64 lblock,
439 int *new, u64 *dblock,
440 int *boundary,
441 struct metapath *mp)
442{
443 struct gfs2_inode *ip = GFS2_I(inode);
444 struct gfs2_sbd *sdp = GFS2_SB(inode);
445 struct buffer_head *bh;
446 int create = *new;
447 unsigned int bsize;
448 unsigned int height;
449 unsigned int end_of_metadata;
450 unsigned int x;
451 int error = 0;
452
453 *new = 0;
454 *dblock = 0;
455
456 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
457 goto out;
458
459 bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
460
461 height = calc_tree_height(ip, (lblock + 1) * bsize);
462 if (ip->i_di.di_height < height) {
463 if (!create)
464 goto out;
465
466 error = build_height(inode, height);
467 if (error)
468 goto out;
469 }
470
471 find_metapath(ip, lblock, mp);
472 end_of_metadata = ip->i_di.di_height - 1;
473
474 error = gfs2_meta_inode_buffer(ip, &bh);
475 if (error)
476 goto out;
477
478 for (x = 0; x < end_of_metadata; x++) {
479 lookup_block(ip, bh, x, mp, create, new, dblock);
480 brelse(bh);
481 if (!*dblock)
482 goto out;
483
484 error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh);
485 if (error)
486 goto out;
487 }
488
489 *boundary = lookup_block(ip, bh, end_of_metadata, mp, create, new, dblock);
490 if (*new) {
491 struct buffer_head *dibh;
492 error = gfs2_meta_inode_buffer(ip, &dibh);
493 if (!error) {
494 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
495 gfs2_dinode_out(&ip->i_di, dibh->b_data);
496 brelse(dibh);
497 }
498 }
499 return bh;
500out:
501 return ERR_PTR(error);
502}
503
504
505static inline void bmap_lock(struct inode *inode, int create)
506{
507 struct gfs2_inode *ip = GFS2_I(inode);
508 if (create)
509 down_write(&ip->i_rw_mutex);
510 else
511 down_read(&ip->i_rw_mutex);
512}
513
514static inline void bmap_unlock(struct inode *inode, int create)
515{
516 struct gfs2_inode *ip = GFS2_I(inode);
517 if (create)
518 up_write(&ip->i_rw_mutex);
519 else
520 up_read(&ip->i_rw_mutex);
521}
522
523int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary)
524{
525 struct metapath mp;
526 struct buffer_head *bh;
527 int create = *new;
528
529 bmap_lock(inode, create);
530 bh = gfs2_block_pointers(inode, lblock, new, dblock, boundary, &mp);
531 bmap_unlock(inode, create);
532 if (!bh)
533 return 0;
534 if (IS_ERR(bh))
535 return PTR_ERR(bh);
536 brelse(bh);
537 return 0;
538}
539
540int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
541{
542 struct gfs2_inode *ip = GFS2_I(inode);
543 struct gfs2_sbd *sdp = GFS2_SB(inode);
544 struct metapath mp;
545 struct buffer_head *bh;
546 int boundary;
547 int create = *new;
548
549 BUG_ON(!extlen);
550 BUG_ON(!dblock);
551 BUG_ON(!new);
552
553 bmap_lock(inode, create);
554 bh = gfs2_block_pointers(inode, lblock, new, dblock, &boundary, &mp);
555 *extlen = 1;
556
557 if (bh && !IS_ERR(bh) && *dblock && !*new) {
558 u64 tmp_dblock;
559 int tmp_new;
560 unsigned int nptrs;
561 unsigned end_of_metadata = ip->i_di.di_height - 1;
562
563 nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs;
564 while (++mp.mp_list[end_of_metadata] < nptrs) {
565 lookup_block(ip, bh, end_of_metadata, &mp, 0, &tmp_new, &tmp_dblock);
566 if (*dblock + *extlen != tmp_dblock)
567 break;
568 (*extlen)++;
569 }
570 }
571 bmap_unlock(inode, create);
572 if (!bh)
573 return 0;
574 if (IS_ERR(bh))
575 return PTR_ERR(bh);
576 brelse(bh);
577 return 0;
578}
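/* Typical caller pattern (gfs2_write_alloc_required() below does exactly
 * this): walk a logical block range one extent at a time, advancing by the
 * run length that gfs2_extent_map() reports back:
 *
 *	for (; lblock < lblock_stop; lblock += extlen) {
 *		error = gfs2_extent_map(inode, lblock, &new, &dblock, &extlen);
 *		if (error)
 *			return error;
 *		...
 *	}
 */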
579
580/**
581 * recursive_scan - recursively scan through the end of a file
582 * @ip: the inode
583 * @dibh: the dinode buffer
584 * @mp: the path through the metadata to the point to start
585 * @height: the height the recursion is at
586 * @block: the indirect block to look at
587 * @first: 1 if this is the first block
588 * @bc: the call to make for each piece of metadata
589 * @data: data opaque to this function to pass to @bc
590 *
591 * When this is first called @height and @block should be zero and
592 * @first should be 1.
593 *
594 * Returns: errno
595 */
596
597static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
598 struct metapath *mp, unsigned int height,
599 uint64_t block, int first, block_call_t bc,
600 void *data)
601{
602 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
603 struct buffer_head *bh = NULL;
604 uint64_t *top, *bottom;
605 uint64_t bn;
606 int error;
607 int mh_size = sizeof(struct gfs2_meta_header);
608
609 if (!height) {
610 error = gfs2_meta_inode_buffer(ip, &bh);
611 if (error)
612 return error;
613 dibh = bh;
614
615 top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
616 mp->mp_list[0];
617 bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
618 sdp->sd_diptrs;
619 } else {
620 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
621 if (error)
622 return error;
623
624 top = (uint64_t *)(bh->b_data + mh_size) +
625 ((first) ? mp->mp_list[height] : 0);
626
627 bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs;
628 }
629
630 error = bc(ip, dibh, bh, top, bottom, height, data);
631 if (error)
632 goto out;
633
634 if (height < ip->i_di.di_height - 1)
635 for (; top < bottom; top++, first = 0) {
636 if (!*top)
637 continue;
638
639 bn = be64_to_cpu(*top);
640
641 error = recursive_scan(ip, dibh, mp, height + 1, bn,
642 first, bc, data);
643 if (error)
644 break;
645 }
646
647 out:
648 brelse(bh);
649
650 return error;
651}
652
653/**
654 * do_strip - Look for a particular layer of the file and strip it off
655 * @ip: the inode
656 * @dibh: the dinode buffer
657 * @bh: A buffer of pointers
658 * @top: The first pointer in the buffer
659 * @bottom: One more than the last pointer
660 * @height: the height this buffer is at
661 * @data: a pointer to a struct strip_mine
662 *
663 * Returns: errno
664 */
665
666static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
667 struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
668 unsigned int height, void *data)
669{
670 struct strip_mine *sm = data;
671 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
672 struct gfs2_rgrp_list rlist;
673 uint64_t bn, bstart;
674 uint32_t blen;
675 uint64_t *p;
676 unsigned int rg_blocks = 0;
677 int metadata;
678 unsigned int revokes = 0;
679 int x;
680 int error;
681
682 if (!*top)
683 sm->sm_first = 0;
684
685 if (height != sm->sm_height)
686 return 0;
687
688 if (sm->sm_first) {
689 top++;
690 sm->sm_first = 0;
691 }
692
693 metadata = (height != ip->i_di.di_height - 1);
694 if (metadata)
695 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
696
697 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
698 if (error)
699 return error;
700
701 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
702 bstart = 0;
703 blen = 0;
704
705 for (p = top; p < bottom; p++) {
706 if (!*p)
707 continue;
708
709 bn = be64_to_cpu(*p);
710
711 if (bstart + blen == bn)
712 blen++;
713 else {
714 if (bstart)
715 gfs2_rlist_add(sdp, &rlist, bstart);
716
717 bstart = bn;
718 blen = 1;
719 }
720 }
721
722 if (bstart)
723 gfs2_rlist_add(sdp, &rlist, bstart);
724 else
725 goto out; /* Nothing to do */
726
727 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
728
729 for (x = 0; x < rlist.rl_rgrps; x++) {
730 struct gfs2_rgrpd *rgd;
731 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
732 rg_blocks += rgd->rd_ri.ri_length;
733 }
734
735 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
736 if (error)
737 goto out_rlist;
738
739 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
740 RES_INDIRECT + RES_STATFS + RES_QUOTA,
741 revokes);
742 if (error)
743 goto out_rg_gunlock;
744
745 down_write(&ip->i_rw_mutex);
746
747 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
748 gfs2_trans_add_bh(ip->i_gl, bh, 1);
749
750 bstart = 0;
751 blen = 0;
752
753 for (p = top; p < bottom; p++) {
754 if (!*p)
755 continue;
756
757 bn = be64_to_cpu(*p);
758
759 if (bstart + blen == bn)
760 blen++;
761 else {
762 if (bstart) {
763 if (metadata)
764 gfs2_free_meta(ip, bstart, blen);
765 else
766 gfs2_free_data(ip, bstart, blen);
767 }
768
769 bstart = bn;
770 blen = 1;
771 }
772
773 *p = 0;
774 if (!ip->i_di.di_blocks)
775 gfs2_consist_inode(ip);
776 ip->i_di.di_blocks--;
777 }
778 if (bstart) {
779 if (metadata)
780 gfs2_free_meta(ip, bstart, blen);
781 else
782 gfs2_free_data(ip, bstart, blen);
783 }
784
785 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
786
787 gfs2_dinode_out(&ip->i_di, dibh->b_data);
788
789 up_write(&ip->i_rw_mutex);
790
791 gfs2_trans_end(sdp);
792
793 out_rg_gunlock:
794 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
795
796 out_rlist:
797 gfs2_rlist_free(&rlist);
798
799 out:
800 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
801
802 return error;
803}
804
805/**
806 * do_grow - Make a file look bigger than it is
807 * @ip: the inode
808 * @size: the size to set the file to
809 *
810 * Called with an exclusive lock on @ip.
811 *
812 * Returns: errno
813 */
814
815static int do_grow(struct gfs2_inode *ip, uint64_t size)
816{
817 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
818 struct gfs2_alloc *al;
819 struct buffer_head *dibh;
820 unsigned int h;
821 int error;
822
823 al = gfs2_alloc_get(ip);
824
825 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
826 if (error)
827 goto out;
828
829 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
830 if (error)
831 goto out_gunlock_q;
832
833 al->al_requested = sdp->sd_max_height + RES_DATA;
834
835 error = gfs2_inplace_reserve(ip);
836 if (error)
837 goto out_gunlock_q;
838
839 error = gfs2_trans_begin(sdp,
840 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
841 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
842 if (error)
843 goto out_ipres;
844
845 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
846 if (gfs2_is_stuffed(ip)) {
847 error = gfs2_unstuff_dinode(ip, NULL);
848 if (error)
849 goto out_end_trans;
850 }
851
852 h = calc_tree_height(ip, size);
853 if (ip->i_di.di_height < h) {
854 down_write(&ip->i_rw_mutex);
855 error = build_height(&ip->i_inode, h);
856 up_write(&ip->i_rw_mutex);
857 if (error)
858 goto out_end_trans;
859 }
860 }
861
862 ip->i_di.di_size = size;
863 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
864
865 error = gfs2_meta_inode_buffer(ip, &dibh);
866 if (error)
867 goto out_end_trans;
868
869 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
870 gfs2_dinode_out(&ip->i_di, dibh->b_data);
871 brelse(dibh);
872
873 out_end_trans:
874 gfs2_trans_end(sdp);
875
876 out_ipres:
877 gfs2_inplace_release(ip);
878
879 out_gunlock_q:
880 gfs2_quota_unlock(ip);
881
882 out:
883 gfs2_alloc_put(ip);
884
885 return error;
886}
887
888
889/**
890 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
891 *
892 * This is partly borrowed from ext3.
893 */
894static int gfs2_block_truncate_page(struct address_space *mapping)
895{
896 struct inode *inode = mapping->host;
897 struct gfs2_inode *ip = GFS2_I(inode);
898 struct gfs2_sbd *sdp = GFS2_SB(inode);
899 loff_t from = inode->i_size;
900 unsigned long index = from >> PAGE_CACHE_SHIFT;
901 unsigned offset = from & (PAGE_CACHE_SIZE-1);
902 unsigned blocksize, iblock, length, pos;
903 struct buffer_head *bh;
904 struct page *page;
905 void *kaddr;
906 int err;
907
908 page = grab_cache_page(mapping, index);
909 if (!page)
910 return 0;
911
912 blocksize = inode->i_sb->s_blocksize;
913 length = blocksize - (offset & (blocksize - 1));
914 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
915
916 if (!page_has_buffers(page))
917 create_empty_buffers(page, blocksize, 0);
918
919 /* Find the buffer that contains "offset" */
920 bh = page_buffers(page);
921 pos = blocksize;
922 while (offset >= pos) {
923 bh = bh->b_this_page;
924 iblock++;
925 pos += blocksize;
926 }
927
928 err = 0;
929
930 if (!buffer_mapped(bh)) {
931 gfs2_get_block(inode, iblock, bh, 0);
932 /* unmapped? It's a hole - nothing to do */
933 if (!buffer_mapped(bh))
934 goto unlock;
935 }
936
937 /* Ok, it's mapped. Make sure it's up-to-date */
938 if (PageUptodate(page))
939 set_buffer_uptodate(bh);
940
941 if (!buffer_uptodate(bh)) {
942 err = -EIO;
943 ll_rw_block(READ, 1, &bh);
944 wait_on_buffer(bh);
945 /* Uhhuh. Read error. Complain and punt. */
946 if (!buffer_uptodate(bh))
947 goto unlock;
948 }
949
950 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
951 gfs2_trans_add_bh(ip->i_gl, bh, 0);
952
953 kaddr = kmap_atomic(page, KM_USER0);
954 memset(kaddr + offset, 0, length);
955 flush_dcache_page(page);
956 kunmap_atomic(kaddr, KM_USER0);
957
958unlock:
959 unlock_page(page);
960 page_cache_release(page);
961 return err;
962}
963
964static int trunc_start(struct gfs2_inode *ip, uint64_t size)
965{
966 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
967 struct buffer_head *dibh;
968 int journaled = gfs2_is_jdata(ip);
969 int error;
970
971 error = gfs2_trans_begin(sdp,
972 RES_DINODE + ((journaled) ? RES_JDATA : 0), 0);
973 if (error)
974 return error;
975
976 error = gfs2_meta_inode_buffer(ip, &dibh);
977 if (error)
978 goto out;
979
980 if (gfs2_is_stuffed(ip)) {
981 ip->i_di.di_size = size;
982 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
983 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
984 gfs2_dinode_out(&ip->i_di, dibh->b_data);
985 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
986 error = 1; /* tell do_shrink the truncate is already complete */
987
988 } else {
989 if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1))
990 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
991
992 if (!error) {
993 ip->i_di.di_size = size;
994 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
995 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
996 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
997 gfs2_dinode_out(&ip->i_di, dibh->b_data);
998 }
999 }
1000
1001 brelse(dibh);
1002
1003 out:
1004 gfs2_trans_end(sdp);
1005
1006 return error;
1007}
1008
1009static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size)
1010{
1011 unsigned int height = ip->i_di.di_height;
1012 uint64_t lblock;
1013 struct metapath mp;
1014 int error;
1015
1016 if (!size)
1017 lblock = 0;
1018 else
1019 lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
1020
1021 find_metapath(ip, lblock, &mp);
1022 gfs2_alloc_get(ip);
1023
1024 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1025 if (error)
1026 goto out;
1027
1028 while (height--) {
1029 struct strip_mine sm;
1030 sm.sm_first = !!size;
1031 sm.sm_height = height;
1032
1033 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
1034 if (error)
1035 break;
1036 }
1037
1038 gfs2_quota_unhold(ip);
1039
1040 out:
1041 gfs2_alloc_put(ip);
1042 return error;
1043}
1044
1045static int trunc_end(struct gfs2_inode *ip)
1046{
1047 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1048 struct buffer_head *dibh;
1049 int error;
1050
1051 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1052 if (error)
1053 return error;
1054
1055 down_write(&ip->i_rw_mutex);
1056
1057 error = gfs2_meta_inode_buffer(ip, &dibh);
1058 if (error)
1059 goto out;
1060
1061 if (!ip->i_di.di_size) {
1062 ip->i_di.di_height = 0;
1063 ip->i_di.di_goal_meta =
1064 ip->i_di.di_goal_data =
1065 ip->i_num.no_addr;
1066 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1067 }
1068 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1069 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
1070
1071 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1072 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1073 brelse(dibh);
1074
1075 out:
1076 up_write(&ip->i_rw_mutex);
1077
1078 gfs2_trans_end(sdp);
1079
1080 return error;
1081}
1082
1083/**
1084 * do_shrink - make a file smaller
1085 * @ip: the inode
1086 * @size: the size to make the file
1087 *
1088 *
1089 * Called with an exclusive lock on @ip.
1090 *
1091 * Returns: errno
1092 */
1093
1094static int do_shrink(struct gfs2_inode *ip, uint64_t size)
1095{
1096 int error;
1097
1098 error = trunc_start(ip, size);
1099 if (error < 0)
1100 return error;
1101 if (error > 0)
1102 return 0;
1103
1104 error = trunc_dealloc(ip, size);
1105 if (!error)
1106 error = trunc_end(ip);
1107
1108 return error;
1109}
1110
1111/**
1112 * gfs2_truncatei - make a file a given size
1113 * @ip: the inode
1114 * @size: the size to make the file
1115 *
1116 *
1117 * The file size can grow, shrink, or stay the same size.
1118 *
1119 * Returns: errno
1120 */
1121
1122int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size)
1123{
1124 int error;
1125
1126 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
1127 return -EINVAL;
1128
1129 if (size > ip->i_di.di_size)
1130 error = do_grow(ip, size);
1131 else
1132 error = do_shrink(ip, size);
1133
1134 return error;
1135}
1136
1137int gfs2_truncatei_resume(struct gfs2_inode *ip)
1138{
1139 int error;
1140 error = trunc_dealloc(ip, ip->i_di.di_size);
1141 if (!error)
1142 error = trunc_end(ip);
1143 return error;
1144}
1145
1146int gfs2_file_dealloc(struct gfs2_inode *ip)
1147{
1148 return trunc_dealloc(ip, 0);
1149}
1150
1151/**
1152 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1153 * @ip: the file
1154 * @len: the number of bytes to be written to the file
1155 * @data_blocks: returns the number of data blocks required
1156 * @ind_blocks: returns the number of indirect blocks required
1157 *
1158 */
1159
1160void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1161 unsigned int *data_blocks, unsigned int *ind_blocks)
1162{
1163 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1164 unsigned int tmp;
1165
1166 if (gfs2_is_dir(ip)) {
1167 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1168 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1169 } else {
1170 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1171 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1172 }
1173
1174 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1175 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1176 *ind_blocks += tmp;
1177 }
1178}
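/* Worked example under assumed geometry (4096-byte blocks, sd_inptrs = 509,
 * sd_diptrs a few hundred): writing len = 4 MiB to a regular file gives
 *
 *	data_blocks = (4194304 >> 12) + 3 = 1027
 *	ind_blocks += DIV_ROUND_UP(1027, 509) = 3	(as 1027 > sd_diptrs)
 *
 * on top of the fixed 3 * (sd_max_height - 1) term; the estimate errs on
 * the side of reserving too much rather than too little.
 */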
1179
1180/**
1181 * gfs2_write_alloc_required - figure out if a write will require an allocation
1182 * @ip: the file being written to
1183 * @offset: the offset to write to
1184 * @len: the number of bytes being written
1185 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1186 *
1187 * Returns: errno
1188 */
1189
1190int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
1191 unsigned int len, int *alloc_required)
1192{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 uint64_t lblock, lblock_stop, dblock;
1195 uint32_t extlen;
1196 int new = 0;
1197 int error = 0;
1198
1199 *alloc_required = 0;
1200
1201 if (!len)
1202 return 0;
1203
1204 if (gfs2_is_stuffed(ip)) {
1205 if (offset + len >
1206 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1207 *alloc_required = 1;
1208 return 0;
1209 }
1210
1211 if (gfs2_is_dir(ip)) {
1212 unsigned int bsize = sdp->sd_jbsize;
1213 lblock = offset;
1214 do_div(lblock, bsize);
1215 lblock_stop = offset + len + bsize - 1;
1216 do_div(lblock_stop, bsize);
1217 } else {
1218 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1219 lblock = offset >> shift;
1220 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1221 }
1222
1223 for (; lblock < lblock_stop; lblock += extlen) {
1224 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
1225 if (error)
1226 return error;
1227
1228 if (!dblock) {
1229 *alloc_required = 1;
1230 return 0;
1231 }
1232 }
1233
1234 return 0;
1235}
1236
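One calling convention worth noting before the header below: gfs2_block_map()
and gfs2_extent_map() use *new in both directions. On input a non-zero value
permits allocation; on output it reports whether a block actually was
allocated. A hedged usage sketch (illustrative caller; init_new_block() is a
placeholder, not a GFS2 function):

    u64 dblock;
    int boundary;
    int new = 1;    /* in: allocation allowed */
    int error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
    if (error)
            return error;
    if (new)        /* out: a block was just allocated */
            init_new_block(inode, dblock);
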
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..1a265412f7ee
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
14int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary);
15int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
16
17int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size);
18int gfs2_truncatei_resume(struct gfs2_inode *ip);
19int gfs2_file_dealloc(struct gfs2_inode *ip);
20
21void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
22 unsigned int *data_blocks,
23 unsigned int *ind_blocks);
24int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
25 unsigned int len, int *alloc_required);
26
27#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..1453605c8f32
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "daemon.h"
23#include "glock.h"
24#include "log.h"
25#include "quota.h"
26#include "recovery.h"
27#include "super.h"
28#include "util.h"
29
30/* This uses schedule_timeout() instead of msleep() so that the daemons
31 can be woken before their timeouts expire when unmounting, and the
32 user's unmount doesn't sit there forever.
33
34 The kthread functions used to start these daemons block and flush signals. */
35
36/**
37 * gfs2_scand - Look for cached glocks and inodes to toss from memory
38 * @sdp: Pointer to GFS2 superblock
39 *
40 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
41 * See gfs2_glockd()
42 */
43
44int gfs2_scand(void *data)
45{
46 struct gfs2_sbd *sdp = data;
47 unsigned long t;
48
49 while (!kthread_should_stop()) {
50 gfs2_scand_internal(sdp);
51 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
52 schedule_timeout_interruptible(t);
53 }
54
55 return 0;
56}
57
58/**
59 * gfs2_glockd - Reclaim unused glock structures
60 * @sdp: Pointer to GFS2 superblock
61 *
62 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
63 * Number of daemons can be set by user, with num_glockd mount option.
64 */
65
66int gfs2_glockd(void *data)
67{
68 struct gfs2_sbd *sdp = data;
69
70 while (!kthread_should_stop()) {
71 while (atomic_read(&sdp->sd_reclaim_count))
72 gfs2_reclaim_glock(sdp);
73
74 wait_event_interruptible(sdp->sd_reclaim_wq,
75 (atomic_read(&sdp->sd_reclaim_count) ||
76 kthread_should_stop()));
77 }
78
79 return 0;
80}
81
82/**
83 * gfs2_recoverd - Recover dead machine's journals
84 * @sdp: Pointer to GFS2 superblock
85 *
86 */
87
88int gfs2_recoverd(void *data)
89{
90 struct gfs2_sbd *sdp = data;
91 unsigned long t;
92
93 while (!kthread_should_stop()) {
94 gfs2_check_journals(sdp);
95 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
96 schedule_timeout_interruptible(t);
97 }
98
99 return 0;
100}
101
102/**
103 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
104 * @sdp: Pointer to GFS2 superblock
105 *
106 * Also, periodically check to make sure that we're using the most recent
107 * journal index.
108 */
109
110int gfs2_logd(void *data)
111{
112 struct gfs2_sbd *sdp = data;
113 struct gfs2_holder ji_gh;
114 unsigned long t;
115
116 while (!kthread_should_stop()) {
117 /* Advance the log tail */
118
119 t = sdp->sd_log_flush_time +
120 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
121
122 gfs2_ail1_empty(sdp, DIO_ALL);
123
124 if (time_after_eq(jiffies, t)) {
125 gfs2_log_flush(sdp, NULL);
126 sdp->sd_log_flush_time = jiffies;
127 }
128
129 /* Check for latest journal index */
130
131 t = sdp->sd_jindex_refresh_time +
132 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
133
134 if (time_after_eq(jiffies, t)) {
135 if (!gfs2_jindex_hold(sdp, &ji_gh))
136 gfs2_glock_dq_uninit(&ji_gh);
137 sdp->sd_jindex_refresh_time = jiffies;
138 }
139
140 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
141 schedule_timeout_interruptible(t);
142 }
143
144 return 0;
145}
146
147/**
148 * gfs2_quotad - Write cached quota changes into the quota file
149 * @sdp: Pointer to GFS2 superblock
150 *
151 */
152
153int gfs2_quotad(void *data)
154{
155 struct gfs2_sbd *sdp = data;
156 unsigned long t;
157 int error;
158
159 while (!kthread_should_stop()) {
160 /* Update the master statfs file */
161
162 t = sdp->sd_statfs_sync_time +
163 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
164
165 if (time_after_eq(jiffies, t)) {
166 error = gfs2_statfs_sync(sdp);
167 if (error &&
168 error != -EROFS &&
169 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
170 fs_err(sdp, "quotad: (1) error=%d\n", error);
171 sdp->sd_statfs_sync_time = jiffies;
172 }
173
174 /* Update quota file */
175
176 t = sdp->sd_quota_sync_time +
177 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
178
179 if (time_after_eq(jiffies, t)) {
180 error = gfs2_quota_sync(sdp);
181 if (error &&
182 error != -EROFS &&
183 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
184 fs_err(sdp, "quotad: (2) error=%d\n", error);
185 sdp->sd_quota_sync_time = jiffies;
186 }
187
188 gfs2_quota_scan(sdp);
189
190 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
191 schedule_timeout_interruptible(t);
192 }
193
194 return 0;
195}
196
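All five daemons above share one loop shape, which is what the file-head
comment is getting at: the sleep must be interruptible so that a stop
request at unmount wakes the thread instead of waiting out a full period.
A minimal sketch of the shared pattern (illustrative; do_periodic_work()
and gt_example_secs are placeholders, not real GFS2 names):

    static int example_daemon(void *data)
    {
            struct gfs2_sbd *sdp = data;
            unsigned long t;

            while (!kthread_should_stop()) {
                    do_periodic_work(sdp);  /* placeholder for the work */
                    t = gfs2_tune_get(sdp, gt_example_secs) * HZ;
                    schedule_timeout_interruptible(t);
            }
            return 0;
    }
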
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..aa93eb6f668e
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18
19#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..76a23c172eeb
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1976 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 * "Extendible Hashing" by Fagin, et al in
13 * __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, ie
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_inum.no_addr is set to zero to designate it as deleted. When
28 * allocating a dirent, gfs2_dirent_alloc iterates through the dirents in a
29 * block. If the first dirent has (de_inum.no_addr == 0) and de_rec_len is
30 * large enough, this first dirent is allocated. Otherwise it must go through
31 * all the 'used' dirents searching for one in which the amount of total space
32 * minus the amount of used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled unless it is already at the maximum size which is hard coded into
52 * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
53 * but never before the maximum hash table size has been reached.
54 */
55
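/*
 * Illustrative sketch (not part of this file) of the de_rec_len walk the
 * comment above describes. Interior deleted entries never show up because
 * a live predecessor absorbed their record space; a deleted first entry is
 * skipped via the no_addr check:
 *
 *	const char *end = bh->b_data + bh->b_size;
 *	struct gfs2_dirent *dent = first_dirent(bh);	(hypothetical helper)
 *
 *	while ((const char *)dent < end) {
 *		if (dent->de_inum.no_addr)
 *			process(dent);			(hypothetical consumer)
 *		dent = (struct gfs2_dirent *)((char *)dent +
 *					      be16_to_cpu(dent->de_rec_len));
 *	}
 */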
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/buffer_head.h>
60#include <linux/sort.h>
61#include <linux/gfs2_ondisk.h>
62#include <linux/crc32.h>
63#include <linux/vmalloc.h>
64
65#include "gfs2.h"
66#include "lm_interface.h"
67#include "incore.h"
68#include "dir.h"
69#include "glock.h"
70#include "inode.h"
71#include "meta_io.h"
72#include "quota.h"
73#include "rgrp.h"
74#include "trans.h"
75#include "bmap.h"
76#include "util.h"
77
78#define IS_LEAF 1 /* Hashed (leaf) directory */
79#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
80
81#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1)
82#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
83
84typedef int (*leaf_call_t) (struct gfs2_inode *dip,
85 uint32_t index, uint32_t len, uint64_t leaf_no,
86 void *data);
87
88
89int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
90 struct buffer_head **bhp)
91{
92 struct buffer_head *bh;
93
94 bh = gfs2_meta_new(ip->i_gl, block);
95 gfs2_trans_add_bh(ip->i_gl, bh, 1);
96 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
97 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
98 *bhp = bh;
99 return 0;
100}
101
102static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, uint64_t block,
103 struct buffer_head **bhp)
104{
105 struct buffer_head *bh;
106 int error;
107
108 error = gfs2_meta_read(ip->i_gl, block, DIO_START | DIO_WAIT, &bh);
109 if (error)
110 return error;
111 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
112 brelse(bh);
113 return -EIO;
114 }
115 *bhp = bh;
116 return 0;
117}
118
119static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
120 unsigned int offset, unsigned int size)
121
122{
123 struct buffer_head *dibh;
124 int error;
125
126 error = gfs2_meta_inode_buffer(ip, &dibh);
127 if (error)
128 return error;
129
130 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
132 if (ip->i_di.di_size < offset + size)
133 ip->i_di.di_size = offset + size;
134 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
135 gfs2_dinode_out(&ip->i_di, dibh->b_data);
136
137 brelse(dibh);
138
139 return size;
140}
141
142
143
144/**
145 * gfs2_dir_write_data - Write directory information to the inode
146 * @ip: The GFS2 inode
147 * @buf: The buffer containing information to be written
148 * @offset: The file offset to start writing at
149 * @size: The amount of data to write
150 *
151 * Returns: The number of bytes correctly written or error code
152 */
153static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
154 uint64_t offset, unsigned int size)
155{
156 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
157 struct buffer_head *dibh;
158 uint64_t lblock, dblock;
159 uint32_t extlen = 0;
160 unsigned int o;
161 int copied = 0;
162 int error = 0;
163
164 if (!size)
165 return 0;
166
167 if (gfs2_is_stuffed(ip) &&
168 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
169 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
170 size);
171
172 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
173 return -EINVAL;
174
175 if (gfs2_is_stuffed(ip)) {
176 error = gfs2_unstuff_dinode(ip, NULL);
177 if (error)
178 return error;
179 }
180
181 lblock = offset;
182 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
183
184 while (copied < size) {
185 unsigned int amount;
186 struct buffer_head *bh;
187 int new;
188
189 amount = size - copied;
190 if (amount > sdp->sd_sb.sb_bsize - o)
191 amount = sdp->sd_sb.sb_bsize - o;
192
193 if (!extlen) {
194 new = 1;
195 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
196 &dblock, &extlen);
197 if (error)
198 goto fail;
199 error = -EIO;
200 if (gfs2_assert_withdraw(sdp, dblock))
201 goto fail;
202 }
203
204 if (amount == sdp->sd_jbsize || new)
205 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
206 else
207 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
208
209 if (error)
210 goto fail;
211
212 gfs2_trans_add_bh(ip->i_gl, bh, 1);
213 memcpy(bh->b_data + o, buf, amount);
214 brelse(bh);
215 if (error)
216 goto fail;
217
218 buf += amount;
219 copied += amount;
220 lblock++;
221 dblock++;
222 extlen--;
223
224 o = sizeof(struct gfs2_meta_header);
225 }
226
227out:
228 error = gfs2_meta_inode_buffer(ip, &dibh);
229 if (error)
230 return error;
231
232 if (ip->i_di.di_size < offset + copied)
233 ip->i_di.di_size = offset + copied;
234 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
235
236 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
237 gfs2_dinode_out(&ip->i_di, dibh->b_data);
238 brelse(dibh);
239
240 return copied;
241fail:
242 if (copied)
243 goto out;
244 return error;
245}
246
247static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
248 unsigned int offset, unsigned int size)
249{
250 struct buffer_head *dibh;
251 int error;
252
253 error = gfs2_meta_inode_buffer(ip, &dibh);
254 if (!error) {
255 offset += sizeof(struct gfs2_dinode);
256 memcpy(buf, dibh->b_data + offset, size);
257 brelse(dibh);
258 }
259
260 return (error) ? error : size;
261}
262
263
264/**
265 * gfs2_dir_read_data - Read data from a directory inode
266 * @ip: The GFS2 Inode
267 * @buf: The buffer to place result into
268 * @offset: File offset to begin reading from
269 * @size: Amount of data to transfer
270 *
271 * Returns: The amount of data actually copied or the error
272 */
273static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf,
274 uint64_t offset, unsigned int size)
275{
276 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
277 uint64_t lblock, dblock;
278 uint32_t extlen = 0;
279 unsigned int o;
280 int copied = 0;
281 int error = 0;
282
283 if (offset >= ip->i_di.di_size)
284 return 0;
285
286 if ((offset + size) > ip->i_di.di_size)
287 size = ip->i_di.di_size - offset;
288
289 if (!size)
290 return 0;
291
292 if (gfs2_is_stuffed(ip))
293 return gfs2_dir_read_stuffed(ip, buf, (unsigned int)offset,
294 size);
295
296 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
297 return -EINVAL;
298
299 lblock = offset;
300 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
301
302 while (copied < size) {
303 unsigned int amount;
304 struct buffer_head *bh;
305 int new;
306
307 amount = size - copied;
308 if (amount > sdp->sd_sb.sb_bsize - o)
309 amount = sdp->sd_sb.sb_bsize - o;
310
311 if (!extlen) {
312 new = 0;
313 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
314 &dblock, &extlen);
315 if (error)
316 goto fail;
317 }
318
319 if (extlen > 1)
320 gfs2_meta_ra(ip->i_gl, dblock, extlen);
321
322 if (dblock) {
323 if (new)
324 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
325 else
326 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
327 if (error)
328 goto fail;
329 dblock++;
330 extlen--;
331 } else
332 bh = NULL;
333
334 if (bh) /* a hole in the file reads back as zeroes */
335 memcpy(buf, bh->b_data + o, amount);
336 else
337 memset(buf, 0, amount);
338 brelse(bh); /* brelse(NULL) is a no-op */
339 buf += amount;
340 copied += amount;
341 lblock++;
342
343 o = sizeof(struct gfs2_meta_header);
344 }
345
346 return copied;
347fail:
348 return (copied) ? copied : error;
349}
350
351typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
352 const struct qstr *name,
353 void *opaque);
354
355static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
356 const struct qstr *name, int ret)
357{
358 if (dent->de_inum.no_addr != 0 &&
359 be32_to_cpu(dent->de_hash) == name->hash &&
360 be16_to_cpu(dent->de_name_len) == name->len &&
361 memcmp((char *)(dent+1), name->name, name->len) == 0)
362 return ret;
363 return 0;
364}
365
366static int gfs2_dirent_find(const struct gfs2_dirent *dent,
367 const struct qstr *name,
368 void *opaque)
369{
370 return __gfs2_dirent_find(dent, name, 1);
371}
372
373static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
374 const struct qstr *name,
375 void *opaque)
376{
377 return __gfs2_dirent_find(dent, name, 2);
378}
379
380/*
381 * name->name holds ptr to start of block.
382 * name->len holds size of block.
383 */
384static int gfs2_dirent_last(const struct gfs2_dirent *dent,
385 const struct qstr *name,
386 void *opaque)
387{
388 const char *start = name->name;
389 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
390 if (name->len == (end - start))
391 return 1;
392 return 0;
393}
394
395static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
396 const struct qstr *name,
397 void *opaque)
398{
399 unsigned required = GFS2_DIRENT_SIZE(name->len);
400 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
401 unsigned totlen = be16_to_cpu(dent->de_rec_len);
402
403 if (!dent->de_inum.no_addr)
404 actual = GFS2_DIRENT_SIZE(0);
405 if ((totlen - actual) >= required)
406 return 1;
407 return 0;
408}
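/*
 * Annotation (not part of the original patch): the layout assumed by
 * the space check above. A record can be longer than the entry it
 * holds, and the slack can host a new entry:
 *
 *	|<---------------- de_rec_len (totlen) ---------------->|
 *	| entry in use (actual)  |         free space           |
 *
 * The new name fits if (totlen - actual) >= GFS2_DIRENT_SIZE(name->len);
 * an unused record (de_inum.no_addr == 0) only reserves an empty
 * dirent's worth of space.
 */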
409
410struct dirent_gather {
411 const struct gfs2_dirent **pdent;
412 unsigned offset;
413};
414
415static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
416 const struct qstr *name,
417 void *opaque)
418{
419 struct dirent_gather *g = opaque;
420 if (dent->de_inum.no_addr) {
421 g->pdent[g->offset++] = dent;
422 }
423 return 0;
424}
425
426/*
427 * Other possible things to check:
428 * - Inode located within filesystem size (and on valid block)
429 * - Valid directory entry type
430 * Not sure how heavyweight we want to make this... could also check
431 * that the hash is correct, for example, but that would take a lot of extra time.
432 * For now the most important thing is to check that the various sizes
433 * are correct.
434 */
435static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
436 unsigned int size, unsigned int len, int first)
437{
438 const char *msg = "gfs2_dirent too small";
439 if (unlikely(size < sizeof(struct gfs2_dirent)))
440 goto error;
441 msg = "gfs2_dirent misaligned";
442 if (unlikely(offset & 0x7))
443 goto error;
444 msg = "gfs2_dirent points beyond end of block";
445 if (unlikely(offset + size > len))
446 goto error;
447 msg = "zero inode number";
448 if (unlikely(!first && !dent->de_inum.no_addr))
449 goto error;
450 msg = "name length is greater than space in dirent";
451 if (dent->de_inum.no_addr &&
452 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
453 size))
454 goto error;
455 return 0;
456error:
457 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
458 first ? "first in block" : "not first in block");
459 return -EIO;
460}
461
462static int gfs2_dirent_offset(const void *buf)
463{
464 const struct gfs2_meta_header *h = buf;
465 int offset;
466
467 BUG_ON(buf == NULL);
468
469 switch(be32_to_cpu(h->mh_type)) {
470 case GFS2_METATYPE_LF:
471 offset = sizeof(struct gfs2_leaf);
472 break;
473 case GFS2_METATYPE_DI:
474 offset = sizeof(struct gfs2_dinode);
475 break;
476 default:
477 goto wrong_type;
478 }
479 return offset;
480wrong_type:
481 printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
482 be32_to_cpu(h->mh_type));
483 return -1;
484}
485
486static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode,
487 void *buf,
488 unsigned int len, gfs2_dscan_t scan,
489 const struct qstr *name,
490 void *opaque)
491{
492 struct gfs2_dirent *dent, *prev;
493 unsigned offset;
494 unsigned size;
495 int ret = 0;
496
497 ret = gfs2_dirent_offset(buf);
498 if (ret < 0)
499 goto consist_inode;
500
501 offset = ret;
502 prev = NULL;
503 dent = (struct gfs2_dirent *)(buf + offset);
504 size = be16_to_cpu(dent->de_rec_len);
505 if (gfs2_check_dirent(dent, offset, size, len, 1))
506 goto consist_inode;
507 do {
508 ret = scan(dent, name, opaque);
509 if (ret)
510 break;
511 offset += size;
512 if (offset == len)
513 break;
514 prev = dent;
515 dent = (struct gfs2_dirent *)(buf + offset);
516 size = be16_to_cpu(dent->de_rec_len);
517 if (gfs2_check_dirent(dent, offset, size, len, 0))
518 goto consist_inode;
519 } while(1);
520
521 switch(ret) {
522 case 0:
523 return NULL;
524 case 1:
525 return dent;
526 case 2:
527 return prev ? prev : dent;
528 default:
529 BUG_ON(ret > 0);
530 return ERR_PTR(ret);
531 }
532
533consist_inode:
534 gfs2_consist_inode(GFS2_I(inode));
535 return ERR_PTR(-EIO);
536}
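/*
 * Annotation (not part of the original patch): the gfs2_dscan_t
 * contract used by gfs2_dirent_scan() above. A callback returns 0 to
 * keep scanning, 1 (or 2) to stop and yield the current (or previous)
 * dirent, or a negative errno to abort the scan. The callback below is
 * a hypothetical sketch, not used anywhere in the patch: it would count
 * the live entries in one block through the opaque pointer.
 */
static int example_count_dirents(const struct gfs2_dirent *dent,
				 const struct qstr *name,
				 void *opaque)
{
	unsigned int *count = opaque;

	if (dent->de_inum.no_addr)	/* skip unused entries */
		(*count)++;
	return 0;			/* 0 == keep scanning */
}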
537
538
539/**
540 * dirent_first - Return the first dirent
541 * @dip: the directory
542 * @bh: The buffer
543 * @dent: Pointer to list of dirents
544 *
545 * Return the first dirent, whether @bh points to a leaf block or a stuffed dinode
546 *
547 * Returns: IS_LEAF, IS_DINODE, or -errno
548 */
549
550static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
551 struct gfs2_dirent **dent)
552{
553 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
554
555 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
556 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
557 return -EIO;
558 *dent = (struct gfs2_dirent *)(bh->b_data +
559 sizeof(struct gfs2_leaf));
560 return IS_LEAF;
561 } else {
562 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
563 return -EIO;
564 *dent = (struct gfs2_dirent *)(bh->b_data +
565 sizeof(struct gfs2_dinode));
566 return IS_DINODE;
567 }
568}
569
570/**
571 * dirent_next - Next dirent
572 * @dip: the directory
573 * @bh: The buffer
574 * @dent: Pointer to list of dirents
575 *
576 * Returns: 0 on success, error code otherwise
577 */
578
579static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
580 struct gfs2_dirent **dent)
581{
582 struct gfs2_dirent *tmp, *cur;
583 char *bh_end;
584 uint16_t cur_rec_len;
585
586 cur = *dent;
587 bh_end = bh->b_data + bh->b_size;
588 cur_rec_len = be16_to_cpu(cur->de_rec_len);
589
590 if ((char *)cur + cur_rec_len >= bh_end) {
591 if ((char *)cur + cur_rec_len > bh_end) {
592 gfs2_consist_inode(dip);
593 return -EIO;
594 }
595 return -ENOENT;
596 }
597
598 tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len);
599
600 if ((char *)tmp + be16_to_cpu(tmp->de_rec_len) > bh_end) {
601 gfs2_consist_inode(dip);
602 return -EIO;
603 }
604
605 if (cur_rec_len == 0) {
606 gfs2_consist_inode(dip);
607 return -EIO;
608 }
609
610 /* Only the first dent could ever have de_inum.no_addr == 0 */
611 if (!tmp->de_inum.no_addr) {
612 gfs2_consist_inode(dip);
613 return -EIO;
614 }
615
616 *dent = tmp;
617
618 return 0;
619}
620
621/**
622 * dirent_del - Delete a dirent
623 * @dip: The GFS2 inode
624 * @bh: The buffer
625 * @prev: The previous dirent
626 * @cur: The current dirent
627 *
628 */
629
630static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
631 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
632{
633 uint16_t cur_rec_len, prev_rec_len;
634
635 if (!cur->de_inum.no_addr) {
636 gfs2_consist_inode(dip);
637 return;
638 }
639
640 gfs2_trans_add_bh(dip->i_gl, bh, 1);
641
642 /* If there is no prev entry, this is the first entry in the block.
643 The de_rec_len is already as big as it needs to be. Just zero
644 out the inode number and return. */
645
646 if (!prev) {
647 cur->de_inum.no_addr = 0; /* No endianness worries */
648 return;
649 }
650
651 /* Combine this dentry with the previous one. */
652
653 prev_rec_len = be16_to_cpu(prev->de_rec_len);
654 cur_rec_len = be16_to_cpu(cur->de_rec_len);
655
656 if ((char *)prev + prev_rec_len != (char *)cur)
657 gfs2_consist_inode(dip);
658 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
659 gfs2_consist_inode(dip);
660
661 prev_rec_len += cur_rec_len;
662 prev->de_rec_len = cpu_to_be16(prev_rec_len);
663}
664
665/*
666 * Takes a dent from which to grab space as an argument. Returns the
667 * newly created dent.
668 */
669static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
670 struct gfs2_dirent *dent,
671 const struct qstr *name,
672 struct buffer_head *bh)
673{
674 struct gfs2_inode *ip = GFS2_I(inode);
675 struct gfs2_dirent *ndent;
676 unsigned offset = 0, totlen;
677
678 if (dent->de_inum.no_addr)
679 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
680 totlen = be16_to_cpu(dent->de_rec_len);
681 BUG_ON(offset + name->len > totlen);
682 gfs2_trans_add_bh(ip->i_gl, bh, 1);
683 ndent = (struct gfs2_dirent *)((char *)dent + offset);
684 dent->de_rec_len = cpu_to_be16(offset);
685 gfs2_qstr2dirent(name, totlen - offset, ndent);
686 return ndent;
687}
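/*
 * Annotation (not part of the original patch): how the split above
 * works. When the donor entry is in use, its record is shortened to
 * cover just itself and the tail becomes the new entry:
 *
 *	before:	|<------------ dent->de_rec_len ------------->|
 *		| dent (in use)  |         free space          |
 *	after:	|<--- offset --->|<----- totlen - offset ----->|
 *		| dent (in use)  | ndent (new entry)           |
 *
 * An unused donor (de_inum.no_addr == 0) is overwritten in place,
 * since offset stays 0 and ndent aliases dent.
 */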
688
689static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
690 struct buffer_head *bh,
691 const struct qstr *name)
692{
693 struct gfs2_dirent *dent;
694 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
695 gfs2_dirent_find_space, name, NULL);
696 if (!dent || IS_ERR(dent))
697 return dent;
698 return gfs2_init_dirent(inode, dent, name, bh);
699}
700
701static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
702 struct buffer_head **bhp)
703{
704 int error;
705
706 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp);
707 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
708 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
709 error = -EIO;
710 }
711
712 return error;
713}
714
715/**
716 * get_leaf_nr - Get a leaf number associated with the index
717 * @dip: The GFS2 inode
718 * @index: hash table index of the targeted leaf
719 * @leaf_out: filled in with the leaf block number
720 *
721 * Returns: 0 on success, error code otherwise
722 */
723
724static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index,
725 uint64_t *leaf_out)
726{
727 uint64_t leaf_no;
728 int error;
729
730 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
731 index * sizeof(uint64_t),
732 sizeof(uint64_t));
733 if (error != sizeof(uint64_t))
734 return (error < 0) ? error : -EIO;
735
736 *leaf_out = be64_to_cpu(leaf_no);
737
738 return 0;
739}
740
741static int get_first_leaf(struct gfs2_inode *dip, uint32_t index,
742 struct buffer_head **bh_out)
743{
744 uint64_t leaf_no;
745 int error;
746
747 error = get_leaf_nr(dip, index, &leaf_no);
748 if (!error)
749 error = get_leaf(dip, leaf_no, bh_out);
750
751 return error;
752}
753
754static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
755 const struct qstr *name,
756 gfs2_dscan_t scan,
757 struct buffer_head **pbh)
758{
759 struct buffer_head *bh;
760 struct gfs2_dirent *dent;
761 struct gfs2_inode *ip = GFS2_I(inode);
762 int error;
763
764 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
765 struct gfs2_leaf *leaf;
766 unsigned hsize = 1 << ip->i_di.di_depth;
767 unsigned index;
768 u64 ln;
769 if (hsize * sizeof(u64) != ip->i_di.di_size) {
770 gfs2_consist_inode(ip);
771 return ERR_PTR(-EIO);
772 }
773
774 index = name->hash >> (32 - ip->i_di.di_depth);
775 error = get_first_leaf(ip, index, &bh);
776 if (error)
777 return ERR_PTR(error);
778 do {
779 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
780 scan, name, NULL);
781 if (dent)
782 goto got_dent;
783 leaf = (struct gfs2_leaf *)bh->b_data;
784 ln = be64_to_cpu(leaf->lf_next);
785 brelse(bh);
786 if (!ln)
787 break;
788
789 error = get_leaf(ip, ln, &bh);
790 } while(!error);
791
792 return error ? ERR_PTR(error) : NULL;
793 }
794
796 error = gfs2_meta_inode_buffer(ip, &bh);
797 if (error)
798 return ERR_PTR(error);
799 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
800got_dent:
801 if (unlikely(dent == NULL || IS_ERR(dent))) {
802 brelse(bh);
803 bh = NULL;
804 }
805 *pbh = bh;
806 return dent;
807}
808
809static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
810{
811 struct gfs2_inode *ip = GFS2_I(inode);
812 u64 bn = gfs2_alloc_meta(ip);
813 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
814 struct gfs2_leaf *leaf;
815 struct gfs2_dirent *dent;
816 struct qstr name = { .name = "", .len = 0, .hash = 0 };
817 if (!bh)
818 return NULL;
819
820 gfs2_trans_add_bh(ip->i_gl, bh, 1);
821 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
822 leaf = (struct gfs2_leaf *)bh->b_data;
823 leaf->lf_depth = cpu_to_be16(depth);
824 leaf->lf_entries = cpu_to_be16(0);
825 leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
826 leaf->lf_next = cpu_to_be64(0);
827 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
828 dent = (struct gfs2_dirent *)(leaf+1);
829 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
830 *pbh = bh;
831 return leaf;
832}
833
834/**
835 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
836 * @dip: The GFS2 inode
837 *
838 * Returns: 0 on success, error code otherwise
839 */
840
841static int dir_make_exhash(struct inode *inode)
842{
843 struct gfs2_inode *dip = GFS2_I(inode);
844 struct gfs2_sbd *sdp = GFS2_SB(inode);
845 struct gfs2_dirent *dent;
846 struct qstr args;
847 struct buffer_head *bh, *dibh;
848 struct gfs2_leaf *leaf;
849 int y;
850 uint32_t x;
851 uint64_t *lp, bn;
852 int error;
853
854 error = gfs2_meta_inode_buffer(dip, &dibh);
855 if (error)
856 return error;
857
858 /* Turn over a new leaf */
859
860 leaf = new_leaf(inode, &bh, 0);
861 if (!leaf)
862 return -ENOSPC;
863 bn = bh->b_blocknr;
864
865 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
866 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
867
868 /* Copy dirents */
869
870 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
871 sizeof(struct gfs2_dinode));
872
873 /* Find last entry */
874
875 x = 0;
876 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
877 sizeof(struct gfs2_leaf);
878 args.name = bh->b_data;
879 dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
880 gfs2_dirent_last, &args, NULL);
881 if (!dent) {
882 brelse(bh);
883 brelse(dibh);
884 return -EIO;
885 }
886 if (IS_ERR(dent)) {
887 brelse(bh);
888 brelse(dibh);
889 return PTR_ERR(dent);
890 }
891
892 /* Adjust the last dirent's record length
893 (Remember that dent still points to the last entry.) */
894
895 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
896 sizeof(struct gfs2_dinode) -
897 sizeof(struct gfs2_leaf));
898
899 brelse(bh);
900
901 /* We're done with the new leaf block, now set up the new
902 hash table. */
903
904 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
905 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
906
907 lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
908
909 for (x = sdp->sd_hash_ptrs; x--; lp++)
910 *lp = cpu_to_be64(bn);
911
912 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
913 dip->i_di.di_blocks++;
914 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
915 dip->i_di.di_payload_format = 0;
916
917 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
918 dip->i_di.di_depth = y;
919
920 gfs2_dinode_out(&dip->i_di, dibh->b_data);
921
922 brelse(dibh);
923
924 return 0;
925}
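/*
 * Annotation (not part of the original patch): after the conversion
 * above, the dinode's data area holds a hash table of sd_hash_ptrs
 * block pointers, all initially aimed at the single new leaf, and
 * di_depth is set so that 1 << di_depth == sd_hash_ptrs. Lookups then
 * index the table with the top di_depth bits of the name hash, as in
 * gfs2_dirent_search().
 */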
926
927/**
928 * dir_split_leaf - Split a leaf block into two
929 * @inode: the directory inode
930 * @name: the name being added, whose hash selects the leaf to split
931 *
933 * Returns: 0 on success, error code on failure
934 */
935
936static int dir_split_leaf(struct inode *inode, const struct qstr *name)
937{
938 struct gfs2_inode *dip = GFS2_I(inode);
939 struct buffer_head *nbh, *obh, *dibh;
940 struct gfs2_leaf *nleaf, *oleaf;
941 struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
942 uint32_t start, len, half_len, divider;
943 uint64_t bn, *lp, leaf_no;
944 uint32_t index;
945 int x, moved = 0;
946 int error;
947
948 index = name->hash >> (32 - dip->i_di.di_depth);
949 error = get_leaf_nr(dip, index, &leaf_no);
950 if (error)
951 return error;
952
953 /* Get the old leaf block */
954 error = get_leaf(dip, leaf_no, &obh);
955 if (error)
956 return error;
957
958 oleaf = (struct gfs2_leaf *)obh->b_data;
959 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
960 brelse(obh);
961 return 1; /* can't split */
962 }
963
964 gfs2_trans_add_bh(dip->i_gl, obh, 1);
965
966 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
967 if (!nleaf) {
968 brelse(obh);
969 return -ENOSPC;
970 }
971 bn = nbh->b_blocknr;
972
973 /* Compute the start and len of leaf pointers in the hash table. */
974 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
975 half_len = len >> 1;
976 if (!half_len) {
977 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
978 gfs2_consist_inode(dip);
979 error = -EIO;
980 goto fail_brelse;
981 }
982
983 start = (index & ~(len - 1));
984
985 /* Change the pointers.
986 Don't bother distinguishing stuffed from non-stuffed.
987 This code is complicated enough already. */
988 lp = kmalloc(half_len * sizeof(uint64_t), GFP_NOFS | __GFP_NOFAIL);
990 for (x = 0; x < half_len; x++)
991 lp[x] = cpu_to_be64(bn);
992
993 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(uint64_t),
994 half_len * sizeof(uint64_t));
995 if (error != half_len * sizeof(uint64_t)) {
996 if (error >= 0)
997 error = -EIO;
998 goto fail_lpfree;
999 }
1000
1001 kfree(lp);
1002
1003 /* Compute the divider */
1004 divider = (start + half_len) << (32 - dip->i_di.di_depth);
1005
1006 /* Copy the entries */
1007 dirent_first(dip, obh, &dent);
1008
1009 do {
1010 next = dent;
1011 if (dirent_next(dip, obh, &next))
1012 next = NULL;
1013
1014 if (dent->de_inum.no_addr &&
1015 be32_to_cpu(dent->de_hash) < divider) {
1016 struct qstr str;
1017 str.name = (char*)(dent+1);
1018 str.len = be16_to_cpu(dent->de_name_len);
1019 str.hash = be32_to_cpu(dent->de_hash);
1020 new = gfs2_dirent_alloc(inode, nbh, &str);
1021 if (IS_ERR(new)) {
1022 error = PTR_ERR(new);
1023 break;
1024 }
1025
1026 new->de_inum = dent->de_inum; /* No endian worries */
1027 new->de_type = dent->de_type; /* No endian worries */
1028 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1029
1030 dirent_del(dip, obh, prev, dent);
1031
1032 if (!oleaf->lf_entries)
1033 gfs2_consist_inode(dip);
1034 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1035
1036 if (!prev)
1037 prev = dent;
1038
1039 moved = 1;
1040 } else {
1041 prev = dent;
1042 }
1043 dent = next;
1044 } while (dent);
1045
1046 oleaf->lf_depth = nleaf->lf_depth;
1047
1048 error = gfs2_meta_inode_buffer(dip, &dibh);
1049 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1050 dip->i_di.di_blocks++;
1051 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1052 brelse(dibh);
1053 }
1054
1055 brelse(obh);
1056 brelse(nbh);
1057
1058 return error;
1059
1060fail_lpfree:
1061 kfree(lp);
1062
1063fail_brelse:
1064 brelse(obh);
1065 brelse(nbh);
1066 return error;
1067}
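/*
 * Annotation (not part of the original patch): dir_split_leaf() above
 * redirects the lower half_len of the len hash-table pointers that
 * aimed at the old leaf to the new leaf, then migrates every entry
 * whose hash falls below the divider. Both leaves end up one level
 * deeper (lf_depth + 1). Once lf_depth reaches di_depth a leaf owns a
 * single pointer and cannot be split further, which is what the
 * "return 1" early exit reports back to gfs2_dir_add().
 */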
1068
1069/**
1070 * dir_double_exhash - Double size of ExHash table
1071 * @dip: The GFS2 dinode
1072 *
1073 * Returns: 0 on success, error code on failure
1074 */
1075
1076static int dir_double_exhash(struct gfs2_inode *dip)
1077{
1078 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1079 struct buffer_head *dibh;
1080 uint32_t hsize;
1081 uint64_t *buf;
1082 uint64_t *from, *to;
1083 uint64_t block;
1084 int x;
1085 int error = 0;
1086
1087 hsize = 1 << dip->i_di.di_depth;
1088 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1089 gfs2_consist_inode(dip);
1090 return -EIO;
1091 }
1092
1093 /* Allocate both the "from" and "to" buffers in one big chunk */
1094
1095 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1096
1097 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1098 error = gfs2_dir_read_data(dip, (char *)buf,
1099 block * sdp->sd_hash_bsize,
1100 sdp->sd_hash_bsize);
1101 if (error != sdp->sd_hash_bsize) {
1102 if (error >= 0)
1103 error = -EIO;
1104 goto fail;
1105 }
1106
1107 from = buf;
1108 to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
1109
1110 for (x = sdp->sd_hash_ptrs; x--; from++) {
1111 *to++ = *from; /* No endianness worries */
1112 *to++ = *from;
1113 }
1114
1115 error = gfs2_dir_write_data(dip,
1116 (char *)buf + sdp->sd_hash_bsize,
1117 block * sdp->sd_sb.sb_bsize,
1118 sdp->sd_sb.sb_bsize);
1119 if (error != sdp->sd_sb.sb_bsize) {
1120 if (error >= 0)
1121 error = -EIO;
1122 goto fail;
1123 }
1124 }
1125
1126 kfree(buf);
1127
1128 error = gfs2_meta_inode_buffer(dip, &dibh);
1129 if (!gfs2_assert_withdraw(sdp, !error)) {
1130 dip->i_di.di_depth++;
1131 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1132 brelse(dibh);
1133 }
1134
1135 return error;
1136
1137 fail:
1138 kfree(buf);
1139
1140 return error;
1141}
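/*
 * Annotation (not part of the original patch): doubling writes each
 * existing hash-table pointer out twice, so a table of N pointers
 * becomes 2N pointers to the same leaves and di_depth grows by one.
 * Every leaf then has twice as many slots aimed at it, which makes
 * another dir_split_leaf() attempt possible.
 */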
1142
1143/**
1144 * compare_dents - compare directory entries by hash value
1145 * @a: first dent
1146 * @b: second dent
1147 *
1148 * When comparing the hash entries of @a to @b:
1149 * gt: returns 1
1150 * lt: returns -1
1151 * eq: returns 0
1152 */
1153
1154static int compare_dents(const void *a, const void *b)
1155{
1156 struct gfs2_dirent *dent_a, *dent_b;
1157 uint32_t hash_a, hash_b;
1158 int ret = 0;
1159
1160 dent_a = *(struct gfs2_dirent **)a;
1161 hash_a = be32_to_cpu(dent_a->de_hash);
1162
1163 dent_b = *(struct gfs2_dirent **)b;
1164 hash_b = be32_to_cpu(dent_b->de_hash);
1165
1166 if (hash_a > hash_b)
1167 ret = 1;
1168 else if (hash_a < hash_b)
1169 ret = -1;
1170 else {
1171 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1172 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1173
1174 if (len_a > len_b)
1175 ret = 1;
1176 else if (len_a < len_b)
1177 ret = -1;
1178 else
1179 ret = memcmp((char *)(dent_a + 1),
1180 (char *)(dent_b + 1),
1181 len_a);
1182 }
1183
1184 return ret;
1185}
1186
1187/**
1188 * do_filldir_main - read out directory entries
1189 * @dip: The GFS2 inode
1190 * @offset: The offset in the file to read from
1191 * @opaque: opaque data to pass to filldir
1192 * @filldir: The function to pass entries to
1193 * @darr: an array of struct gfs2_dirent pointers to read
1194 * @entries: the number of entries in darr
1195 * @copied: pointer to int that's non-zero if an entry has been copied out
1196 *
1197 * Jump through some hoops to make sure that if there are hash collisions,
1198 * they are read out at the beginning of a buffer. We want to minimize
1199 * the possibility that they will fall into different readdir buffers or
1200 * that someone will want to seek to that location.
1201 *
1202 * Returns: errno, >0 on exception from filldir
1203 */
1204
1205static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset,
1206 void *opaque, gfs2_filldir_t filldir,
1207 const struct gfs2_dirent **darr, uint32_t entries,
1208 int *copied)
1209{
1210 const struct gfs2_dirent *dent, *dent_next;
1211 struct gfs2_inum inum;
1212 uint64_t off, off_next;
1213 unsigned int x, y;
1214 int run = 0;
1215 int error = 0;
1216
1217 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1218
1219 dent_next = darr[0];
1220 off_next = be32_to_cpu(dent_next->de_hash);
1221 off_next = gfs2_disk_hash2offset(off_next);
1222
1223 for (x = 0, y = 1; x < entries; x++, y++) {
1224 dent = dent_next;
1225 off = off_next;
1226
1227 if (y < entries) {
1228 dent_next = darr[y];
1229 off_next = be32_to_cpu(dent_next->de_hash);
1230 off_next = gfs2_disk_hash2offset(off_next);
1231
1232 if (off < *offset)
1233 continue;
1234 *offset = off;
1235
1236 if (off_next == off) {
1237 if (*copied && !run)
1238 return 1;
1239 run = 1;
1240 } else
1241 run = 0;
1242 } else {
1243 if (off < *offset)
1244 continue;
1245 *offset = off;
1246 }
1247
1248 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1249
1250 error = filldir(opaque, (char *)(dent + 1),
1251 be16_to_cpu(dent->de_name_len),
1252 off, &inum,
1253 be16_to_cpu(dent->de_type));
1254 if (error)
1255 return 1;
1256
1257 *copied = 1;
1258 }
1259
1260 /* Increment the *offset by one, so the next time we come into the
1261 do_filldir function, we get the next entry instead of the last one in the
1262 current leaf */
1263
1264 (*offset)++;
1265
1266 return 0;
1267}
1268
1269static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1270 gfs2_filldir_t filldir, int *copied,
1271 unsigned *depth, u64 leaf_no)
1272{
1273 struct gfs2_inode *ip = GFS2_I(inode);
1274 struct buffer_head *bh;
1275 struct gfs2_leaf *lf;
1276 unsigned entries = 0;
1277 unsigned leaves = 0;
1278 const struct gfs2_dirent **darr, *dent;
1279 struct dirent_gather g;
1280 struct buffer_head **larr;
1281 int leaf = 0;
1282 int error, i;
1283 u64 lfn = leaf_no;
1284
1285 do {
1286 error = get_leaf(ip, lfn, &bh);
1287 if (error)
1288 goto out;
1289 lf = (struct gfs2_leaf *)bh->b_data;
1290 if (leaves == 0)
1291 *depth = be16_to_cpu(lf->lf_depth);
1292 entries += be16_to_cpu(lf->lf_entries);
1293 leaves++;
1294 lfn = be64_to_cpu(lf->lf_next);
1295 brelse(bh);
1296 } while(lfn);
1297
1298 if (!entries)
1299 return 0;
1300
1301 error = -ENOMEM;
1302 larr = vmalloc((leaves + entries) * sizeof(void*));
1303 if (!larr)
1304 goto out;
1305 darr = (const struct gfs2_dirent **)(larr + leaves);
1306 g.pdent = darr;
1307 g.offset = 0;
1308 lfn = leaf_no;
1309
1310 do {
1311 error = get_leaf(ip, lfn, &bh);
1312 if (error)
1313 goto out_kfree;
1314 lf = (struct gfs2_leaf *)bh->b_data;
1315 lfn = be64_to_cpu(lf->lf_next);
1316 if (lf->lf_entries) {
1317 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1318 gfs2_dirent_gather, NULL, &g);
1319 if (IS_ERR(dent)) {
1320 error = PTR_ERR(dent);
1321 goto out_kfree;
1322 }
1323 error = 0;
1324 larr[leaf++] = bh;
1325 } else {
1326 brelse(bh);
1327 }
1328 } while(lfn);
1329
1330 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1331 entries, copied);
1332out_kfree:
1333 for(i = 0; i < leaf; i++)
1334 brelse(larr[i]);
1335 vfree(larr);
1336out:
1337 return error;
1338}
1339
1340/**
1341 * dir_e_read - Reads the entries from a directory into a filldir buffer
1342 * @dip: dinode pointer
1343 * @offset: the hash of the last entry read shifted to the right once
1344 * @opaque: buffer for the filldir function to fill
1345 * @filldir: points to the filldir function to use
1346 *
1347 * Returns: errno
1348 */
1349
1350static int dir_e_read(struct inode *inode, uint64_t *offset, void *opaque,
1351 gfs2_filldir_t filldir)
1352{
1353 struct gfs2_inode *dip = GFS2_I(inode);
1354 struct gfs2_sbd *sdp = GFS2_SB(inode);
1355 uint32_t hsize, len = 0;
1356 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1357 uint32_t hash, index;
1358 uint64_t *lp;
1359 int copied = 0;
1360 int error = 0;
1361 unsigned depth = 0;
1362
1363 hsize = 1 << dip->i_di.di_depth;
1364 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1365 gfs2_consist_inode(dip);
1366 return -EIO;
1367 }
1368
1369 hash = gfs2_dir_offset2hash(*offset);
1370 index = hash >> (32 - dip->i_di.di_depth);
1371
1372 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1373 if (!lp)
1374 return -ENOMEM;
1375
1376 while (index < hsize) {
1377 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1378 ht_offset = index - lp_offset;
1379
1380 if (ht_offset_cur != ht_offset) {
1381 error = gfs2_dir_read_data(dip, (char *)lp,
1382 ht_offset * sizeof(uint64_t),
1383 sdp->sd_hash_bsize);
1384 if (error != sdp->sd_hash_bsize) {
1385 if (error >= 0)
1386 error = -EIO;
1387 goto out;
1388 }
1389 ht_offset_cur = ht_offset;
1390 }
1391
1392 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1393 &copied, &depth,
1394 be64_to_cpu(lp[lp_offset]));
1395 if (error)
1396 break;
1397
1398 len = 1 << (dip->i_di.di_depth - depth);
1399 index = (index & ~(len - 1)) + len;
1400 }
1401
1402out:
1403 kfree(lp);
1404 if (error > 0)
1405 error = 0;
1406 return error;
1407}
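/*
 * Annotation (not part of the original patch): a leaf at depth d owns
 * a run of 1 << (di_depth - d) consecutive hash-table slots, so after
 * emitting a leaf chain the loop above advances index to the start of
 * the next run instead of revisiting aliases of the leaf it just read.
 */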
1408
1409int gfs2_dir_read(struct inode *inode, uint64_t *offset, void *opaque,
1410 gfs2_filldir_t filldir)
1411{
1412 struct gfs2_inode *dip = GFS2_I(inode);
1413 struct dirent_gather g;
1414 const struct gfs2_dirent **darr, *dent;
1415 struct buffer_head *dibh;
1416 int copied = 0;
1417 int error;
1418
1419 if (!dip->i_di.di_entries)
1420 return 0;
1421
1422 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1423 return dir_e_read(inode, offset, opaque, filldir);
1424
1425 if (!gfs2_is_stuffed(dip)) {
1426 gfs2_consist_inode(dip);
1427 return -EIO;
1428 }
1429
1430 error = gfs2_meta_inode_buffer(dip, &dibh);
1431 if (error)
1432 return error;
1433
1434 error = -ENOMEM;
1435 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1436 GFP_KERNEL);
1437 if (darr) {
1438 g.pdent = darr;
1439 g.offset = 0;
1440 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1441 gfs2_dirent_gather, NULL, &g);
1442 if (IS_ERR(dent)) {
1443 error = PTR_ERR(dent);
1444 goto out;
1445 }
1446 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1447 dip->i_di.di_entries, &copied);
1448out:
1449 kfree(darr);
1450 }
1451
1452 if (error > 0)
1453 error = 0;
1454
1455 brelse(dibh);
1456
1457 return error;
1458}
1459
1460/**
1461 * gfs2_dir_search - Search a directory
1462 * @dir: The directory to search
1463 * @name: The name to look up
1464 * @inum: if found, filled in with the entry's inode number
1465 * @type: if found, filled in with the entry's type
1465 *
1466 * This routine searches a directory for a file or another directory.
1467 * Assumes a glock is held on dip.
1468 *
1469 * Returns: errno
1470 */
1471
1472int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1473 struct gfs2_inum *inum, unsigned int *type)
1474{
1475 struct buffer_head *bh;
1476 struct gfs2_dirent *dent;
1477
1478 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1479 if (dent) {
1480 if (IS_ERR(dent))
1481 return PTR_ERR(dent);
1482 if (inum)
1483 gfs2_inum_in(inum, (char *)&dent->de_inum);
1484 if (type)
1485 *type = be16_to_cpu(dent->de_type);
1486 brelse(bh);
1487 return 0;
1488 }
1489 return -ENOENT;
1490}
1491
1492static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1493{
1494 struct buffer_head *bh, *obh;
1495 struct gfs2_inode *ip = GFS2_I(inode);
1496 struct gfs2_leaf *leaf, *oleaf;
1497 int error;
1498 u32 index;
1499 u64 bn;
1500
1501 index = name->hash >> (32 - ip->i_di.di_depth);
1502 error = get_first_leaf(ip, index, &obh);
1503 if (error)
1504 return error;
1505 do {
1506 oleaf = (struct gfs2_leaf *)obh->b_data;
1507 bn = be64_to_cpu(oleaf->lf_next);
1508 if (!bn)
1509 break;
1510 brelse(obh);
1511 error = get_leaf(ip, bn, &obh);
1512 if (error)
1513 return error;
1514 } while(1);
1515
1516 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1517
1518 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1519 if (!leaf) {
1520 brelse(obh);
1521 return -ENOSPC;
1522 }
1523 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1524 brelse(bh);
1525 brelse(obh);
1526
1527 error = gfs2_meta_inode_buffer(ip, &bh);
1528 if (error)
1529 return error;
1530 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1531 ip->i_di.di_blocks++;
1532 gfs2_dinode_out(&ip->i_di, bh->b_data);
1533 brelse(bh);
1534 return 0;
1535}
1536
1537/**
1538 * gfs2_dir_add - Add new filename into directory
1539 * @dip: The GFS2 inode
1540 * @filename: The new name
1541 * @inode: The inode number of the entry
1542 * @type: The type of the entry
1543 *
1544 * Returns: 0 on success, error code on failure
1545 */
1546
1547int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1548 const struct gfs2_inum *inum, unsigned type)
1549{
1550 struct gfs2_inode *ip = GFS2_I(inode);
1551 struct buffer_head *bh;
1552 struct gfs2_dirent *dent;
1553 struct gfs2_leaf *leaf;
1554 int error;
1555
1556 while(1) {
1557 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1558 &bh);
1559 if (dent) {
1560 if (IS_ERR(dent))
1561 return PTR_ERR(dent);
1562 dent = gfs2_init_dirent(inode, dent, name, bh);
1563 gfs2_inum_out(inum, (char *)&dent->de_inum);
1564 dent->de_type = cpu_to_be16(type);
1565 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1566 leaf = (struct gfs2_leaf *)bh->b_data;
1567 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1568 }
1569 brelse(bh);
1570 error = gfs2_meta_inode_buffer(ip, &bh);
1571 if (error)
1572 break;
1573 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1574 ip->i_di.di_entries++;
1575 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1576 gfs2_dinode_out(&ip->i_di, bh->b_data);
1577 brelse(bh);
1578 error = 0;
1579 break;
1580 }
1581 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1582 error = dir_make_exhash(inode);
1583 if (error)
1584 break;
1585 continue;
1586 }
1587 error = dir_split_leaf(inode, name);
1588 if (error == 0)
1589 continue;
1590 if (error < 0)
1591 break;
1592 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1593 error = dir_double_exhash(ip);
1594 if (error)
1595 break;
1596 error = dir_split_leaf(inode, name);
1597 if (error < 0)
1598 break;
1599 if (error == 0)
1600 continue;
1601 }
1602 error = dir_new_leaf(inode, name);
1603 if (!error)
1604 continue;
1605 error = -ENOSPC;
1606 break;
1607 }
1608 return error;
1609}
1610
1612/**
1613 * gfs2_dir_del - Delete a directory entry
1614 * @dip: The GFS2 inode
1615 * @filename: The filename
1616 *
1617 * Returns: 0 on success, error code on failure
1618 */
1619
1620int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1621{
1622 struct gfs2_dirent *dent, *prev = NULL;
1623 struct buffer_head *bh;
1624 int error;
1625
1626 /* Returns _either_ the entry (if it's first in the block) or the
1627 previous entry otherwise */
1628 dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1629 if (!dent) {
1630 gfs2_consist_inode(dip);
1631 return -EIO;
1632 }
1633 if (IS_ERR(dent)) {
1634 gfs2_consist_inode(dip);
1635 return PTR_ERR(dent);
1636 }
1637 /* If not first in block, adjust pointers accordingly */
1638 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1639 prev = dent;
1640 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1641 }
1642
1643 dirent_del(dip, bh, prev, dent);
1644 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1645 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1646 u16 entries = be16_to_cpu(leaf->lf_entries);
1647 if (!entries)
1648 gfs2_consist_inode(dip);
1649 leaf->lf_entries = cpu_to_be16(--entries);
1650 }
1651 brelse(bh);
1652
1653 error = gfs2_meta_inode_buffer(dip, &bh);
1654 if (error)
1655 return error;
1656
1657 if (!dip->i_di.di_entries)
1658 gfs2_consist_inode(dip);
1659 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1660 dip->i_di.di_entries--;
1661 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1662 gfs2_dinode_out(&dip->i_di, bh->b_data);
1663 brelse(bh);
1664 mark_inode_dirty(&dip->i_inode);
1665
1666 return error;
1667}
1668
1669/**
1670 * gfs2_dir_mvino - Change inode number of directory entry
1671 * @dip: The GFS2 inode
1672 * @filename: the name of the entry to update
1673 * @inum: the new inode number for the entry
1674 * @new_type: the new type for the entry
1674 *
1675 * This routine changes the inode number of a directory entry. It's used
1676 * by rename to change ".." when a directory is moved.
1677 * Assumes a glock is held on dip.
1678 *
1679 * Returns: errno
1680 */
1681
1682int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1683 struct gfs2_inum *inum, unsigned int new_type)
1684{
1685 struct buffer_head *bh;
1686 struct gfs2_dirent *dent;
1687 int error;
1688
1689 dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1690 if (!dent) {
1691 gfs2_consist_inode(dip);
1692 return -EIO;
1693 }
1694 if (IS_ERR(dent))
1695 return PTR_ERR(dent);
1696
1697 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1698 gfs2_inum_out(inum, (char *)&dent->de_inum);
1699 dent->de_type = cpu_to_be16(new_type);
1700
1701 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1702 brelse(bh);
1703 error = gfs2_meta_inode_buffer(dip, &bh);
1704 if (error)
1705 return error;
1706 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1707 }
1708
1709 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1710 gfs2_dinode_out(&dip->i_di, bh->b_data);
1711 brelse(bh);
1712 return 0;
1713}
1714
1715/**
1716 * foreach_leaf - call a function for each leaf in a directory
1717 * @dip: the directory
1718 * @lc: the function to call for each leaf
1719 * @data: private data to pass to it
1720 *
1721 * Returns: errno
1722 */
1723
1724static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1725{
1726 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1727 struct buffer_head *bh;
1728 struct gfs2_leaf *leaf;
1729 uint32_t hsize, len;
1730 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1731 uint32_t index = 0;
1732 uint64_t *lp;
1733 uint64_t leaf_no;
1734 int error = 0;
1735
1736 hsize = 1 << dip->i_di.di_depth;
1737 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1738 gfs2_consist_inode(dip);
1739 return -EIO;
1740 }
1741
1742 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1743 if (!lp)
1744 return -ENOMEM;
1745
1746 while (index < hsize) {
1747 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1748 ht_offset = index - lp_offset;
1749
1750 if (ht_offset_cur != ht_offset) {
1751 error = gfs2_dir_read_data(dip, (char *)lp,
1752 ht_offset * sizeof(uint64_t),
1753 sdp->sd_hash_bsize);
1754 if (error != sdp->sd_hash_bsize) {
1755 if (error >= 0)
1756 error = -EIO;
1757 goto out;
1758 }
1759 ht_offset_cur = ht_offset;
1760 }
1761
1762 leaf_no = be64_to_cpu(lp[lp_offset]);
1763 if (leaf_no) {
1764 error = get_leaf(dip, leaf_no, &bh);
1765 if (error)
1766 goto out;
1767 leaf = (struct gfs2_leaf *)bh->b_data;
1768 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1769 brelse(bh);
1770
1771 error = lc(dip, index, len, leaf_no, data);
1772 if (error)
1773 goto out;
1774
1775 index = (index & ~(len - 1)) + len;
1776 } else
1777 index++;
1778 }
1779
1780 if (index != hsize) {
1781 gfs2_consist_inode(dip);
1782 error = -EIO;
1783 }
1784
1785out:
1786 kfree(lp);
1787
1788 return error;
1789}
1790
1791/**
1792 * leaf_dealloc - Deallocate a directory leaf
1793 * @dip: the directory
1794 * @index: the hash table offset in the directory
1795 * @len: the number of pointers to this leaf
1796 * @leaf_no: the leaf number
1797 * @data: not used
1798 *
1799 * Returns: errno
1800 */
1801
1802static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len,
1803 uint64_t leaf_no, void *data)
1804{
1805 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1806 struct gfs2_leaf *tmp_leaf;
1807 struct gfs2_rgrp_list rlist;
1808 struct buffer_head *bh, *dibh;
1809 uint64_t blk, nblk;
1810 unsigned int rg_blocks = 0, l_blocks = 0;
1811 char *ht;
1812 unsigned int x, size = len * sizeof(uint64_t);
1813 int error;
1814
1815 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1816
1817 ht = kzalloc(size, GFP_KERNEL);
1818 if (!ht)
1819 return -ENOMEM;
1820
1821 gfs2_alloc_get(dip);
1822
1823 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1824 if (error)
1825 goto out;
1826
1827 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1828 if (error)
1829 goto out_qs;
1830
1831 /* Count the number of leaves */
1832
1833 for (blk = leaf_no; blk; blk = nblk) {
1834 error = get_leaf(dip, blk, &bh);
1835 if (error)
1836 goto out_rlist;
1837 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1838 nblk = be64_to_cpu(tmp_leaf->lf_next);
1839 brelse(bh);
1840
1841 gfs2_rlist_add(sdp, &rlist, blk);
1842 l_blocks++;
1843 }
1844
1845 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1846
1847 for (x = 0; x < rlist.rl_rgrps; x++) {
1848 struct gfs2_rgrpd *rgd;
1849 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1850 rg_blocks += rgd->rd_ri.ri_length;
1851 }
1852
1853 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1854 if (error)
1855 goto out_rlist;
1856
1857 error = gfs2_trans_begin(sdp,
1858 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1859 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1860 if (error)
1861 goto out_rg_gunlock;
1862
1863 for (blk = leaf_no; blk; blk = nblk) {
1864 error = get_leaf(dip, blk, &bh);
1865 if (error)
1866 goto out_end_trans;
1867 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1868 nblk = be64_to_cpu(tmp_leaf->lf_next);
1869 brelse(bh);
1870
1871 gfs2_free_meta(dip, blk, 1);
1872
1873 if (!dip->i_di.di_blocks)
1874 gfs2_consist_inode(dip);
1875 dip->i_di.di_blocks--;
1876 }
1877
1878 error = gfs2_dir_write_data(dip, ht, index * sizeof(uint64_t), size);
1879 if (error != size) {
1880 if (error >= 0)
1881 error = -EIO;
1882 goto out_end_trans;
1883 }
1884
1885 error = gfs2_meta_inode_buffer(dip, &dibh);
1886 if (error)
1887 goto out_end_trans;
1888
1889 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1890 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1891 brelse(dibh);
1892
1893 out_end_trans:
1894 gfs2_trans_end(sdp);
1895
1896 out_rg_gunlock:
1897 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1898
1899 out_rlist:
1900 gfs2_rlist_free(&rlist);
1901 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1902
1903 out_qs:
1904 gfs2_quota_unhold(dip);
1905
1906 out:
1907 gfs2_alloc_put(dip);
1908 kfree(ht);
1909
1910 return error;
1911}
1912
1913/**
1914 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1915 * @dip: the directory
1916 *
1917 * Dealloc all on-disk directory leaves to FREEMETA state
1918 * Change on-disk inode type to "regular file"
1919 *
1920 * Returns: errno
1921 */
1922
1923int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1924{
1925 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1926 struct buffer_head *bh;
1927 int error;
1928
1929 /* Dealloc on-disk leaves to FREEMETA state */
1930 error = foreach_leaf(dip, leaf_dealloc, NULL);
1931 if (error)
1932 return error;
1933
1934 /* Make this a regular file in case we crash.
1935 (We don't want to free these blocks a second time.) */
1936
1937 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1938 if (error)
1939 return error;
1940
1941 error = gfs2_meta_inode_buffer(dip, &bh);
1942 if (!error) {
1943 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1944 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1945 cpu_to_be32(S_IFREG);
1946 brelse(bh);
1947 }
1948
1949 gfs2_trans_end(sdp);
1950
1951 return error;
1952}
1953
1954/**
1955 * gfs2_diradd_alloc_required - find out if adding an entry will require an allocation
1956 * @inode: the directory being added to
1957 * @name: the filename that's going to be added
1958 *
1959 * Returns: 1 if alloc required, 0 if not, -ve on error
1960 */
1961
1962int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1963{
1964 struct gfs2_dirent *dent;
1965 struct buffer_head *bh;
1966
1967 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1968 if (!dent)
1969 return 1;
1971 if (IS_ERR(dent))
1972 return PTR_ERR(dent);
1973 brelse(bh);
1974 return 0;
1975}
1976
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..173403095eb2
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,73 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13/**
14 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
15 * @opaque: opaque data used by the function
16 * @name: the name of the directory entry
17 * @length: the length of the name
18 * @offset: the entry's offset in the directory
19 * @inum: the inode number the entry points to
20 * @type: the type of inode the entry points to
21 *
22 * Returns: 0 on success, 1 if buffer full
23 */
24
25typedef int (*gfs2_filldir_t) (void *opaque,
26 const char *name, unsigned int length,
27 uint64_t offset,
28 struct gfs2_inum *inum, unsigned int type);
29
30int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
31 struct gfs2_inum *inum, unsigned int *type);
32int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
33 const struct gfs2_inum *inum, unsigned int type);
34int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
35int gfs2_dir_read(struct inode *inode, uint64_t * offset, void *opaque,
36 gfs2_filldir_t filldir);
37int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
38 struct gfs2_inum *new_inum, unsigned int new_type);
39
40int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
41
42int gfs2_diradd_alloc_required(struct inode *dir,
43 const struct qstr *filename);
44int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
45 struct buffer_head **bhp);
46
47static inline uint32_t gfs2_disk_hash(const char *data, int len)
48{
49 return crc32_le(0xFFFFFFFF, data, len) ^ 0xFFFFFFFF;
50}
51
52
53static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
54{
55 name->name = fname;
56 name->len = strlen(fname);
57 name->hash = gfs2_disk_hash(name->name, name->len);
58}
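/*
 * Annotation (not part of the original patch): a caller builds the
 * qstr first and then hands it to the lookup helpers declared above.
 * A minimal sketch (the file name is hypothetical):
 *
 *	struct qstr q;
 *	struct gfs2_inum inum;
 *	unsigned int type;
 *	int error;
 *
 *	gfs2_str2qstr(&q, "somefile");
 *	error = gfs2_dir_search(dir, &q, &inum, &type);
 */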
59
60/* N.B. This probably ought to take inum & type as args as well */
61static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
62{
63 dent->de_inum.no_addr = cpu_to_be64(0);
64 dent->de_inum.no_formal_ino = cpu_to_be64(0);
65 dent->de_hash = cpu_to_be32(name->hash);
66 dent->de_rec_len = cpu_to_be16(reclen);
67 dent->de_name_len = cpu_to_be16(name->len);
68 dent->de_type = cpu_to_be16(0);
69 memset(dent->__pad, 0, sizeof(dent->__pad));
70 memcpy((char*)(dent+1), name->name, name->len);
71}
72
73#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..3ace242f2b16
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "util.h"
26
27/**
28 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
29 * @name: ea name, possibly prefixed with its type
30 *
31 * Returns: GFS2_EATYPE_XXX
32 */
33
34unsigned int gfs2_ea_name2type(const char *name, char **truncated_name)
35{
36 unsigned int type;
37
38 if (strncmp(name, "system.", 7) == 0) {
39 type = GFS2_EATYPE_SYS;
40 if (truncated_name)
41 *truncated_name = strchr(name, '.') + 1;
42 } else if (strncmp(name, "user.", 5) == 0) {
43 type = GFS2_EATYPE_USR;
44 if (truncated_name)
45 *truncated_name = strchr(name, '.') + 1;
46 } else if (strncmp(name, "security.", 9) == 0) {
47 type = GFS2_EATYPE_SECURITY;
48 if (truncated_name)
49 *truncated_name = strchr(name, '.') + 1;
50 } else {
51 type = GFS2_EATYPE_UNUSED;
52 if (truncated_name)
53 *truncated_name = NULL;
54 }
55
56 return type;
57}
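/*
 * Annotation (not part of the original patch): examples of the mapping
 * performed above (attribute names are hypothetical):
 *
 *	"user.frob"    -> GFS2_EATYPE_USR,      truncated name "frob"
 *	"security.evm" -> GFS2_EATYPE_SECURITY, truncated name "evm"
 *	"frob"         -> GFS2_EATYPE_UNUSED,   truncated name NULL
 */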
58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
102 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
103 !capable(CAP_SYS_ADMIN))
104 return -EPERM;
105
106 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
107 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP;
110
113 return gfs2_ea_get_i(ip, er);
114}
115
116static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
117{
118 int remove = 0;
119 int error;
120
121 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
122 if (!(er->er_flags & GFS2_ERF_MODE)) {
123 er->er_mode = ip->i_di.di_mode;
124 er->er_flags |= GFS2_ERF_MODE;
125 }
126 error = gfs2_acl_validate_set(ip, 1, er,
127 &remove, &er->er_mode);
128 if (error)
129 return error;
130 error = gfs2_ea_set_i(ip, er);
131 if (error)
132 return error;
133 if (remove)
134 gfs2_ea_remove_i(ip, er);
135 return 0;
136
137 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
138 error = gfs2_acl_validate_set(ip, 0, er,
139 &remove, NULL);
140 if (error)
141 return error;
142 if (!remove)
143 error = gfs2_ea_set_i(ip, er);
144 else {
145 error = gfs2_ea_remove_i(ip, er);
146 if (error == -ENODATA)
147 error = 0;
148 }
149 return error;
150 }
151
152 return -EPERM;
153}
154
155static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
156{
157 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
158 int error = gfs2_acl_validate_remove(ip, 1);
159 if (error)
160 return error;
161
162 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
163 int error = gfs2_acl_validate_remove(ip, 0);
164 if (error)
165 return error;
166
167 } else
168 return -EPERM;
169
170 return gfs2_ea_remove_i(ip, er);
171}
172
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove,
207 .eo_name = "user",
208};
209
210struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove,
214 .eo_name = "system",
215};
216
217static struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove,
221 .eo_name = "security",
222};
223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL,
226 &gfs2_user_eaops,
227 &gfs2_system_eaops,
228 &gfs2_security_eaops,
229};
230
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..3dece17e3116
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14
15struct gfs2_eattr_operations {
16 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
17 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 char *eo_name;
20};
21
22unsigned int gfs2_ea_name2type(const char *name, char **truncated_name);
23
24extern struct gfs2_eattr_operations gfs2_system_eaops;
25
26extern struct gfs2_eattr_operations *gfs2_ea_ops[];
27
28#endif /* __EAOPS_DOT_H__ */
29
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..96736932260f
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1548 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33/**
34 * ea_calc_size - returns the actual number of bytes the request will take up
35 * (not counting any unstuffed data blocks)
36 * @sdp: the filesystem
37 * @er: the extended attribute request
38 * @size: filled in with the computed size
39 *
40 * Returns: 1 if the EA should be stuffed
41 */
42
43static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
44 unsigned int *size)
45{
46 *size = GFS2_EAREQ_SIZE_STUFFED(er);
47 if (*size <= sdp->sd_jbsize)
48 return 1;
49
50 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
51
52 return 0;
53}
54
55static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
56{
57 unsigned int size;
58
59 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
60 return -ERANGE;
61
62 ea_calc_size(sdp, er, &size);
63
64 /* This can only happen with 512 byte blocks */
65 if (size > sdp->sd_jbsize)
66 return -ERANGE;
67
68 return 0;
69}
70
71typedef int (*ea_call_t) (struct gfs2_inode *ip,
72 struct buffer_head *bh,
73 struct gfs2_ea_header *ea,
74 struct gfs2_ea_header *prev,
75 void *private);
76
77static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
78 ea_call_t ea_call, void *data)
79{
80 struct gfs2_ea_header *ea, *prev = NULL;
81 int error = 0;
82
83 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
84 return -EIO;
85
86 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
87 if (!GFS2_EA_REC_LEN(ea))
88 goto fail;
89 if (!(bh->b_data <= (char *)ea &&
90 (char *)GFS2_EA2NEXT(ea) <=
91 bh->b_data + bh->b_size))
92 goto fail;
93 if (!GFS2_EATYPE_VALID(ea->ea_type))
94 goto fail;
95
96 error = ea_call(ip, bh, ea, prev, data);
97 if (error)
98 return error;
99
100 if (GFS2_EA_IS_LAST(ea)) {
101 if ((char *)GFS2_EA2NEXT(ea) !=
102 bh->b_data + bh->b_size)
103 goto fail;
104 break;
105 }
106 }
107
108 return error;
109
110 fail:
111 gfs2_consist_inode(ip);
112 return -EIO;
113}
114
115static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
116{
117 struct buffer_head *bh, *eabh;
118 uint64_t *eablk, *end;
119 int error;
120
121 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
122 DIO_START | DIO_WAIT, &bh);
123 if (error)
124 return error;
125
126 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
127 error = ea_foreach_i(ip, bh, ea_call, data);
128 goto out;
129 }
130
131 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
132 error = -EIO;
133 goto out;
134 }
135
136 eablk = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header));
137 end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
138
139 for (; eablk < end; eablk++) {
140 uint64_t bn;
141
142 if (!*eablk)
143 break;
144 bn = be64_to_cpu(*eablk);
145
146 error = gfs2_meta_read(ip->i_gl, bn, DIO_START | DIO_WAIT,
147 &eabh);
148 if (error)
149 break;
150 error = ea_foreach_i(ip, eabh, ea_call, data);
151 brelse(eabh);
152 if (error)
153 break;
154 }
155 out:
156 brelse(bh);
157
158 return error;
159}
160
161struct ea_find {
162 struct gfs2_ea_request *ef_er;
163 struct gfs2_ea_location *ef_el;
164};
165
166static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
167 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
168 void *private)
169{
170 struct ea_find *ef = private;
171 struct gfs2_ea_request *er = ef->ef_er;
172
173 if (ea->ea_type == GFS2_EATYPE_UNUSED)
174 return 0;
175
176 if (ea->ea_type == er->er_type) {
177 if (ea->ea_name_len == er->er_name_len &&
178 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
179 struct gfs2_ea_location *el = ef->ef_el;
180 get_bh(bh);
181 el->el_bh = bh;
182 el->el_ea = ea;
183 el->el_prev = prev;
184 return 1;
185 }
186 }
187
188#if 0
189 else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) &&
190 er->er_type == GFS2_EATYPE_SYS)
191 return 1;
192#endif
193
194 return 0;
195}
196
197int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
198 struct gfs2_ea_location *el)
199{
200 struct ea_find ef;
201 int error;
202
203 ef.ef_er = er;
204 ef.ef_el = el;
205
206 memset(el, 0, sizeof(struct gfs2_ea_location));
207
208 error = ea_foreach(ip, ea_find_i, &ef);
209 if (error > 0)
210 return 0;
211
212 return error;
213}
214
215/**
216 * ea_dealloc_unstuffed - free the data blocks of an unstuffed ea
217 * @ip: the inode
218 * @bh: the buffer head holding the ea
219 * @ea: the ea whose data blocks are being freed
220 * @prev: the previous ea in the block, or NULL
221 * @private: if non-NULL, mark the ea unused instead of merging it away
222 *
223 * Takes advantage of the fact that all of an ea's unstuffed blocks are
224 * allocated from the same resource group, though this may not always
225 * be true.
226 *
227 * Returns: errno
228 */
229
230static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
231 struct gfs2_ea_header *ea,
232 struct gfs2_ea_header *prev, void *private)
233{
234 int *leave = private;
235 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
236 struct gfs2_rgrpd *rgd;
237 struct gfs2_holder rg_gh;
238 struct buffer_head *dibh;
239 uint64_t *dataptrs, bn = 0;
240 uint64_t bstart = 0;
241 unsigned int blen = 0;
242 unsigned int blks = 0;
243 unsigned int x;
244 int error;
245
246 if (GFS2_EA_IS_STUFFED(ea))
247 return 0;
248
249 dataptrs = GFS2_EA2DATAPTRS(ea);
250 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++)
251 if (*dataptrs) {
252 blks++;
253 bn = be64_to_cpu(*dataptrs);
254 }
255 if (!blks)
256 return 0;
257
258 rgd = gfs2_blk2rgrpd(sdp, bn);
259 if (!rgd) {
260 gfs2_consist_inode(ip);
261 return -EIO;
262 }
263
264 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
265 if (error)
266 return error;
267
268 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length +
269 RES_DINODE + RES_EATTR + RES_STATFS +
270 RES_QUOTA, blks);
271 if (error)
272 goto out_gunlock;
273
274 gfs2_trans_add_bh(ip->i_gl, bh, 1);
275
276 dataptrs = GFS2_EA2DATAPTRS(ea);
277 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
278 if (!*dataptrs)
279 break;
280 bn = be64_to_cpu(*dataptrs);
281
282 if (bstart + blen == bn)
283 blen++;
284 else {
285 if (bstart)
286 gfs2_free_meta(ip, bstart, blen);
287 bstart = bn;
288 blen = 1;
289 }
290
291 *dataptrs = 0;
292 if (!ip->i_di.di_blocks)
293 gfs2_consist_inode(ip);
294 ip->i_di.di_blocks--;
295 }
296 if (bstart)
297 gfs2_free_meta(ip, bstart, blen);
298
299 if (prev && !leave) {
300 uint32_t len;
301
302 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
303 prev->ea_rec_len = cpu_to_be32(len);
304
305 if (GFS2_EA_IS_LAST(ea))
306 prev->ea_flags |= GFS2_EAFLAG_LAST;
307 } else {
308 ea->ea_type = GFS2_EATYPE_UNUSED;
309 ea->ea_num_ptrs = 0;
310 }
311
312 error = gfs2_meta_inode_buffer(ip, &dibh);
313 if (!error) {
314 ip->i_di.di_ctime = get_seconds();
315 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
316 gfs2_dinode_out(&ip->i_di, dibh->b_data);
317 brelse(dibh);
318 }
319
320 gfs2_trans_end(sdp);
321
322 out_gunlock:
323 gfs2_glock_dq_uninit(&rg_gh);
324
325 return error;
326}
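
The bstart/blen loop above is a run-length coalescing pattern that reappears in ea_dealloc_indirect further down: consecutive block numbers are merged so that each extent costs a single gfs2_free_meta call. A userspace model with made-up block numbers:

#include <stdint.h>
#include <stdio.h>

static void free_extent(uint64_t start, unsigned int len)
{
	printf("free %u block(s) starting at %llu\n", len,
	       (unsigned long long)start);
}

int main(void)
{
	uint64_t blocks[] = { 100, 101, 102, 200, 201, 300 }; /* example data */
	uint64_t bstart = 0;
	unsigned int blen = 0;
	unsigned int x;

	for (x = 0; x < sizeof(blocks) / sizeof(blocks[0]); x++) {
		uint64_t bn = blocks[x];

		if (bstart + blen == bn) {
			blen++;                 /* extends the current run */
		} else {
			if (bstart)
				free_extent(bstart, blen);
			bstart = bn;            /* start a new run */
			blen = 1;
		}
	}
	if (bstart)
		free_extent(bstart, blen);      /* flush the last run */
	return 0;
}

This prints three extents (100/3, 200/2, 300/1) instead of six single-block frees.
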
327
328static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
329 struct gfs2_ea_header *ea,
330 struct gfs2_ea_header *prev, int leave)
331{
332 struct gfs2_alloc *al;
333 int error;
334
335 al = gfs2_alloc_get(ip);
336
337 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
338 if (error)
339 goto out_alloc;
340
341 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
342 if (error)
343 goto out_quota;
344
345 error = ea_dealloc_unstuffed(ip,
346 bh, ea, prev,
347 (leave) ? &error : NULL);
348
349 gfs2_glock_dq_uninit(&al->al_ri_gh);
350
351 out_quota:
352 gfs2_quota_unhold(ip);
353
354 out_alloc:
355 gfs2_alloc_put(ip);
356
357 return error;
358}
359
360struct ea_list {
361 struct gfs2_ea_request *ei_er;
362 unsigned int ei_size;
363};
364
365static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
366 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
367 void *private)
368{
369 struct ea_list *ei = private;
370 struct gfs2_ea_request *er = ei->ei_er;
371 unsigned int ea_size = gfs2_ea_strlen(ea);
372
373 if (ea->ea_type == GFS2_EATYPE_UNUSED)
374 return 0;
375
376 if (er->er_data_len) {
377 char *prefix = NULL;
378 unsigned int l = 0;
379 char c = 0;
380
381 if (ei->ei_size + ea_size > er->er_data_len)
382 return -ERANGE;
383
384 switch (ea->ea_type) {
385 case GFS2_EATYPE_USR:
386 prefix = "user.";
387 l = 5;
388 break;
389 case GFS2_EATYPE_SYS:
390 prefix = "system.";
391 l = 7;
392 break;
393 case GFS2_EATYPE_SECURITY:
394 prefix = "security.";
395 l = 9;
396 break;
397 }
398
399 BUG_ON(l == 0);
400
401 memcpy(er->er_data + ei->ei_size, prefix, l);
402 memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
403 ea->ea_name_len);
404 memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
405 }
406
407 ei->ei_size += ea_size;
408
409 return 0;
410}
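
ea_list_i emits prefix + raw name + NUL for each attribute, concatenated into er_data; that is the listxattr(2) buffer format, and the length it advances by per entry is exactly gfs2_ea_strlen(). A userspace sketch walking such a buffer (the contents are a made-up example):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* "user." + "foo" + NUL is 9 bytes, "security." + "selinux" + NUL
	   is 17 bytes; ei_size would end up at 26 for these two entries. */
	const char buf[] = "user.foo\0security.selinux\0";
	size_t len = sizeof(buf) - 1, off;

	for (off = 0; off < len; off += strlen(buf + off) + 1)
		printf("attr: %s\n", buf + off);
	return 0;
}
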
411
412/**
413 * gfs2_ea_list - list the names of all of an inode's extended attributes
414 * @ip: the inode
415 * @er: the request; er_data/er_data_len describe the output buffer
416 *
417 * Returns: actual size of data on success, -errno on error
418 */
419
420int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
421{
422 struct gfs2_holder i_gh;
423 int error;
424
425 if (!er->er_data || !er->er_data_len) {
426 er->er_data = NULL;
427 er->er_data_len = 0;
428 }
429
430 error = gfs2_glock_nq_init(ip->i_gl,
431 LM_ST_SHARED, LM_FLAG_ANY,
432 &i_gh);
433 if (error)
434 return error;
435
436 if (ip->i_di.di_eattr) {
437 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
438
439 error = ea_foreach(ip, ea_list_i, &ei);
440 if (!error)
441 error = ei.ei_size;
442 }
443
444 gfs2_glock_dq_uninit(&i_gh);
445
446 return error;
447}
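
Because ea_list_i copies only when er_data_len is non-zero but always accumulates ei_size, gfs2_ea_list supports the usual two-pass xattr idiom. A hypothetical caller (list_all_eas is not in the source and error handling is abbreviated):

static int list_all_eas(struct gfs2_inode *ip, char *buf, unsigned int bufsize)
{
	struct gfs2_ea_request er = { .er_data = NULL, .er_data_len = 0 };
	int size = gfs2_ea_list(ip, &er);      /* pass 1: size only */

	if (size < 0)
		return size;
	if ((unsigned int)size > bufsize)
		return -ERANGE;

	er.er_data = buf;
	er.er_data_len = size;
	return gfs2_ea_list(ip, &er);          /* pass 2: copy the names */
}
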
448
449/**
450 * ea_get_unstuffed - copy the unstuffed data into the request
451 * buffer
452 * @ip: the inode
453 * @ea: the ea whose data is being read
454 * @data: the destination buffer
455 *
456 * Returns: errno
457 */
458
459static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
460 char *data)
461{
462 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
463 struct buffer_head **bh;
464 unsigned int amount = GFS2_EA_DATA_LEN(ea);
465 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
466 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
467 unsigned int x;
468 int error = 0;
469
470 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
471 if (!bh)
472 return -ENOMEM;
473
474 for (x = 0; x < nptrs; x++) {
475 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
476 DIO_START, bh + x);
477 if (error) {
478 while (x--)
479 brelse(bh[x]);
480 goto out;
481 }
482 dataptrs++;
483 }
484
485 for (x = 0; x < nptrs; x++) {
486 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
487 if (error) {
488 for (; x < nptrs; x++)
489 brelse(bh[x]);
490 goto out;
491 }
492 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
493 for (; x < nptrs; x++)
494 brelse(bh[x]);
495 error = -EIO;
496 goto out;
497 }
498
499 memcpy(data,
500 bh[x]->b_data + sizeof(struct gfs2_meta_header),
501 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
502
503 amount -= sdp->sd_jbsize;
504 data += sdp->sd_jbsize;
505
506 brelse(bh[x]);
507 }
508
509 out:
510 kfree(bh);
511
512 return error;
513}
514
515int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
516 char *data)
517{
518 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
519 memcpy(data,
520 GFS2_EA2DATA(el->el_ea),
521 GFS2_EA_DATA_LEN(el->el_ea));
522 return 0;
523 } else
524 return ea_get_unstuffed(ip, el->el_ea, data);
525}
526
527/**
528 * gfs2_ea_get_i - find an extended attribute and copy out its data
529 * @ip: the inode
530 * @er: the request structure
531 *
532 * Returns: actual size of data on success, -errno on error
533 */
534
535int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
536{
537 struct gfs2_ea_location el;
538 int error;
539
540 if (!ip->i_di.di_eattr)
541 return -ENODATA;
542
543 error = gfs2_ea_find(ip, er, &el);
544 if (error)
545 return error;
546 if (!el.el_ea)
547 return -ENODATA;
548
549 if (er->er_data_len) {
550 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
551 error = -ERANGE;
552 else
553 error = gfs2_ea_get_copy(ip, &el, er->er_data);
554 }
555 if (!error)
556 error = GFS2_EA_DATA_LEN(el.el_ea);
557
558 brelse(el.el_bh);
559
560 return error;
561}
562
563/**
564 * gfs2_ea_get - lock the inode and get an extended attribute
565 * @ip: the inode
566 * @er: the request structure
567 *
568 * Returns: actual size of data on success, -errno on error
569 */
570
571int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
572{
573 struct gfs2_holder i_gh;
574 int error;
575
576 if (!er->er_name_len ||
577 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
578 return -EINVAL;
579 if (!er->er_data || !er->er_data_len) {
580 er->er_data = NULL;
581 er->er_data_len = 0;
582 }
583
584 error = gfs2_glock_nq_init(ip->i_gl,
585 LM_ST_SHARED, LM_FLAG_ANY,
586 &i_gh);
587 if (error)
588 return error;
589
590 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
591
592 gfs2_glock_dq_uninit(&i_gh);
593
594 return error;
595}
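
For reference, a getxattr-style caller would fill a gfs2_ea_request along these lines before calling gfs2_ea_get. The helper below is hypothetical (the real call sites live in eaops.c and the inode ops); passing er_data_len of zero turns the call into a pure size probe, per gfs2_ea_get_i above.

static int get_user_foo(struct gfs2_inode *ip, char *buf, unsigned int bufsize)
{
	struct gfs2_ea_request er = {
		.er_name = "foo",            /* name without the "user." prefix */
		.er_name_len = 3,
		.er_type = GFS2_EATYPE_USR,
		.er_data = buf,
		.er_data_len = bufsize,
	};
	return gfs2_ea_get(ip, &er);         /* size on success, -errno on error */
}
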
596
597/**
598 * ea_alloc_blk - allocates a new block for extended attributes.
599 * @ip: A pointer to the inode that's getting extended attributes
600 * @bhp: the new buffer head is returned here
601 *
602 * Returns: errno
603 */
604
605static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
606{
607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
608 struct gfs2_ea_header *ea;
609 uint64_t block;
610
611 block = gfs2_alloc_meta(ip);
612
613 *bhp = gfs2_meta_new(ip->i_gl, block);
614 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
615 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
616 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
617
618 ea = GFS2_EA_BH2FIRST(*bhp);
619 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
620 ea->ea_type = GFS2_EATYPE_UNUSED;
621 ea->ea_flags = GFS2_EAFLAG_LAST;
622 ea->ea_num_ptrs = 0;
623
624 ip->i_di.di_blocks++;
625
626 return 0;
627}
628
629/**
630 * ea_write - writes the request info to an ea, creating new blocks if
631 * necessary
632 * @ip: inode that is being modified
633 * @ea: the location of the new ea in a block
634 * @er: the write request
635 *
636 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
637 *
638 * Returns: errno
639 */
640
641static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
642 struct gfs2_ea_request *er)
643{
644 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
645
646 ea->ea_data_len = cpu_to_be32(er->er_data_len);
647 ea->ea_name_len = er->er_name_len;
648 ea->ea_type = er->er_type;
649 ea->__pad = 0;
650
651 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
652
653 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
654 ea->ea_num_ptrs = 0;
655 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
656 } else {
657 uint64_t *dataptr = GFS2_EA2DATAPTRS(ea);
658 const char *data = er->er_data;
659 unsigned int data_len = er->er_data_len;
660 unsigned int copy;
661 unsigned int x;
662
663 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
664 for (x = 0; x < ea->ea_num_ptrs; x++) {
665 struct buffer_head *bh;
666 uint64_t block;
667 int mh_size = sizeof(struct gfs2_meta_header);
668
669 block = gfs2_alloc_meta(ip);
670
671 bh = gfs2_meta_new(ip->i_gl, block);
672 gfs2_trans_add_bh(ip->i_gl, bh, 1);
673 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
674
675 ip->i_di.di_blocks++;
676
677 copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize :
678 data_len;
679 memcpy(bh->b_data + mh_size, data, copy);
680 if (copy < sdp->sd_jbsize)
681 memset(bh->b_data + mh_size + copy, 0,
682 sdp->sd_jbsize - copy);
683
684 *dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr);
685 data += copy;
686 data_len -= copy;
687
688 brelse(bh);
689 }
690
691 gfs2_assert_withdraw(sdp, !data_len);
692 }
693
694 return 0;
695}
696
697typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
698 struct gfs2_ea_request *er,
699 void *private);
700
701static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
702 unsigned int blks,
703 ea_skeleton_call_t skeleton_call,
704 void *private)
705{
706 struct gfs2_alloc *al;
707 struct buffer_head *dibh;
708 int error;
709
710 al = gfs2_alloc_get(ip);
711
712 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
713 if (error)
714 goto out;
715
716 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
717 if (error)
718 goto out_gunlock_q;
719
720 al->al_requested = blks;
721
722 error = gfs2_inplace_reserve(ip);
723 if (error)
724 goto out_gunlock_q;
725
726 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
727 blks + al->al_rgd->rd_ri.ri_length +
728 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
729 if (error)
730 goto out_ipres;
731
732 error = skeleton_call(ip, er, private);
733 if (error)
734 goto out_end_trans;
735
736 error = gfs2_meta_inode_buffer(ip, &dibh);
737 if (!error) {
738 if (er->er_flags & GFS2_ERF_MODE) {
739 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
740 (ip->i_di.di_mode & S_IFMT) ==
741 (er->er_mode & S_IFMT));
742 ip->i_di.di_mode = er->er_mode;
743 }
744 ip->i_di.di_ctime = get_seconds();
745 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
746 gfs2_dinode_out(&ip->i_di, dibh->b_data);
747 brelse(dibh);
748 }
749
750 out_end_trans:
751 gfs2_trans_end(GFS2_SB(&ip->i_inode));
752
753 out_ipres:
754 gfs2_inplace_release(ip);
755
756 out_gunlock_q:
757 gfs2_quota_unlock(ip);
758
759 out:
760 gfs2_alloc_put(ip);
761
762 return error;
763}
764
765static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
766 void *private)
767{
768 struct buffer_head *bh;
769 int error;
770
771 error = ea_alloc_blk(ip, &bh);
772 if (error)
773 return error;
774
775 ip->i_di.di_eattr = bh->b_blocknr;
776 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
777
778 brelse(bh);
779
780 return error;
781}
782
783/**
784 * ea_init - initializes a new eattr block
785 * @ip: the inode
786 * @er: the request being written into the new block
787 *
788 * Returns: errno
789 */
790
791static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
792{
793 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
794 unsigned int blks = 1;
795
796 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
797 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
798
799 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
800}
801
802static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
803{
804 uint32_t ea_size = GFS2_EA_SIZE(ea);
805 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
806 ea_size);
807 uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size;
808 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
809
810 ea->ea_rec_len = cpu_to_be32(ea_size);
811 ea->ea_flags ^= last;
812
813 new->ea_rec_len = cpu_to_be32(new_size);
814 new->ea_flags = last;
815
816 return new;
817}
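
ea_split_ea shrinks an oversized record to its used size and turns the leftover space into a fresh record, migrating GFS2_EAFLAG_LAST to the new tail. A userspace model with made-up sizes showing the before/after arithmetic:

#include <stdint.h>
#include <stdio.h>

#define EAFLAG_LAST 0x01

struct rec { uint32_t rec_len; uint8_t flags; };   /* toy ea header */

int main(void)
{
	struct rec ea = { 104, EAFLAG_LAST };   /* example: 104-byte record */
	uint32_t used = 40;                     /* GFS2_EA_SIZE() of its contents */
	uint8_t last = ea.flags & EAFLAG_LAST;

	struct rec new_rec = { ea.rec_len - used, last };
	ea.rec_len = used;
	ea.flags ^= last;                       /* clears LAST iff it was set */

	printf("old: len=%u flags=%u  new: len=%u flags=%u\n",
	       ea.rec_len, ea.flags, new_rec.rec_len, new_rec.flags);
	return 0;
}
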
818
819static void ea_set_remove_stuffed(struct gfs2_inode *ip,
820 struct gfs2_ea_location *el)
821{
822 struct gfs2_ea_header *ea = el->el_ea;
823 struct gfs2_ea_header *prev = el->el_prev;
824 uint32_t len;
825
826 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
827
828 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
829 ea->ea_type = GFS2_EATYPE_UNUSED;
830 return;
831 } else if (GFS2_EA2NEXT(prev) != ea) {
832 prev = GFS2_EA2NEXT(prev);
833 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
834 }
835
836 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
837 prev->ea_rec_len = cpu_to_be32(len);
838
839 if (GFS2_EA_IS_LAST(ea))
840 prev->ea_flags |= GFS2_EAFLAG_LAST;
841}
842
843struct ea_set {
844 int ea_split;
845
846 struct gfs2_ea_request *es_er;
847 struct gfs2_ea_location *es_el;
848
849 struct buffer_head *es_bh;
850 struct gfs2_ea_header *es_ea;
851};
852
853static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
854 struct gfs2_ea_header *ea, struct ea_set *es)
855{
856 struct gfs2_ea_request *er = es->es_er;
857 struct buffer_head *dibh;
858 int error;
859
860 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
861 if (error)
862 return error;
863
864 gfs2_trans_add_bh(ip->i_gl, bh, 1);
865
866 if (es->ea_split)
867 ea = ea_split_ea(ea);
868
869 ea_write(ip, ea, er);
870
871 if (es->es_el)
872 ea_set_remove_stuffed(ip, es->es_el);
873
874 error = gfs2_meta_inode_buffer(ip, &dibh);
875 if (error)
876 goto out;
877
878 if (er->er_flags & GFS2_ERF_MODE) {
879 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
880 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
881 ip->i_di.di_mode = er->er_mode;
882 }
883 ip->i_di.di_ctime = get_seconds();
884 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
885 gfs2_dinode_out(&ip->i_di, dibh->b_data);
886 brelse(dibh);
887 out:
888 gfs2_trans_end(GFS2_SB(&ip->i_inode));
889
890 return error;
891}
892
893static int ea_set_simple_alloc(struct gfs2_inode *ip,
894 struct gfs2_ea_request *er, void *private)
895{
896 struct ea_set *es = private;
897 struct gfs2_ea_header *ea = es->es_ea;
898 int error;
899
900 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
901
902 if (es->ea_split)
903 ea = ea_split_ea(ea);
904
905 error = ea_write(ip, ea, er);
906 if (error)
907 return error;
908
909 if (es->es_el)
910 ea_set_remove_stuffed(ip, es->es_el);
911
912 return 0;
913}
914
915static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
916 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
917 void *private)
918{
919 struct ea_set *es = private;
920 unsigned int size;
921 int stuffed;
922 int error;
923
924 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
925
926 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
927 if (GFS2_EA_REC_LEN(ea) < size)
928 return 0;
929 if (!GFS2_EA_IS_STUFFED(ea)) {
930 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
931 if (error)
932 return error;
933 }
934 es->ea_split = 0;
935 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
936 es->ea_split = 1;
937 else
938 return 0;
939
940 if (stuffed) {
941 error = ea_set_simple_noalloc(ip, bh, ea, es);
942 if (error)
943 return error;
944 } else {
945 unsigned int blks;
946
947 es->es_bh = bh;
948 es->es_ea = ea;
949 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
950 GFS2_SB(&ip->i_inode)->sd_jbsize);
951
952 error = ea_alloc_skeleton(ip, es->es_er, blks,
953 ea_set_simple_alloc, es);
954 if (error)
955 return error;
956 }
957
958 return 1;
959}
960
961static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
962 void *private)
963{
964 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
965 struct buffer_head *indbh, *newbh;
966 uint64_t *eablk;
967 int error;
968 int mh_size = sizeof(struct gfs2_meta_header);
969
970 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
971 uint64_t *end;
972
973 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
974 DIO_START | DIO_WAIT, &indbh);
975 if (error)
976 return error;
977
978 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
979 error = -EIO;
980 goto out;
981 }
982
983 eablk = (uint64_t *)(indbh->b_data + mh_size);
984 end = eablk + sdp->sd_inptrs;
985
986 for (; eablk < end; eablk++)
987 if (!*eablk)
988 break;
989
990 if (eablk == end) {
991 error = -ENOSPC;
992 goto out;
993 }
994
995 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
996 } else {
997 uint64_t blk;
998
999 blk = gfs2_alloc_meta(ip);
1000
1001 indbh = gfs2_meta_new(ip->i_gl, blk);
1002 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1003 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
1004 gfs2_buffer_clear_tail(indbh, mh_size);
1005
1006 eablk = (uint64_t *)(indbh->b_data + mh_size);
1007 *eablk = cpu_to_be64(ip->i_di.di_eattr);
1008 ip->i_di.di_eattr = blk;
1009 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
1010 ip->i_di.di_blocks++;
1011
1012 eablk++;
1013 }
1014
1015 error = ea_alloc_blk(ip, &newbh);
1016 if (error)
1017 goto out;
1018
1019 *eablk = cpu_to_be64((uint64_t)newbh->b_blocknr);
1020 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
1021 brelse(newbh);
1022 if (error)
1023 goto out;
1024
1025 if (private)
1026 ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private);
1027
1028 out:
1029 brelse(indbh);
1030
1031 return error;
1032}
1033
1034static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1035 struct gfs2_ea_location *el)
1036{
1037 struct ea_set es;
1038 unsigned int blks = 2;
1039 int error;
1040
1041 memset(&es, 0, sizeof(struct ea_set));
1042 es.es_er = er;
1043 es.es_el = el;
1044
1045 error = ea_foreach(ip, ea_set_simple, &es);
1046 if (error > 0)
1047 return 0;
1048 if (error)
1049 return error;
1050
1051 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1052 blks++;
1053 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1054 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1055
1056 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1057}
1058
1059static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1060 struct gfs2_ea_location *el)
1061{
1062 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1063 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1064 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
1065 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1066 }
1067
1068 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1069}
1070
1071int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1072{
1073 struct gfs2_ea_location el;
1074 int error;
1075
1076 if (!ip->i_di.di_eattr) {
1077 if (er->er_flags & XATTR_REPLACE)
1078 return -ENODATA;
1079 return ea_init(ip, er);
1080 }
1081
1082 error = gfs2_ea_find(ip, er, &el);
1083 if (error)
1084 return error;
1085
1086 if (el.el_ea) {
1087 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1088 brelse(el.el_bh);
1089 return -EPERM;
1090 }
1091
1092 error = -EEXIST;
1093 if (!(er->er_flags & XATTR_CREATE)) {
1094 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1095 error = ea_set_i(ip, er, &el);
1096 if (!error && unstuffed)
1097 ea_set_remove_unstuffed(ip, &el);
1098 }
1099
1100 brelse(el.el_bh);
1101 } else {
1102 error = -ENODATA;
1103 if (!(er->er_flags & XATTR_REPLACE))
1104 error = ea_set_i(ip, er, NULL);
1105 }
1106
1107 return error;
1108}
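
The flag handling above matches the standard setxattr(2) semantics; summarizing the branches just taken (an existing attribute on an append-only inode fails with -EPERM before any of this applies):

/*
 *                    attr exists      attr absent
 *  XATTR_CREATE      -EEXIST          create
 *  XATTR_REPLACE     replace          -ENODATA
 *  neither flag      replace          create
 */
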
1109
1110int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1111{
1112 struct gfs2_holder i_gh;
1113 int error;
1114
1115 if (!er->er_name_len ||
1116 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1117 return -EINVAL;
1118 if (!er->er_data || !er->er_data_len) {
1119 er->er_data = NULL;
1120 er->er_data_len = 0;
1121 }
1122 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1123 if (error)
1124 return error;
1125
1126 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1127 if (error)
1128 return error;
1129
1130 if (IS_IMMUTABLE(&ip->i_inode))
1131 error = -EPERM;
1132 else
1133 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1134
1135 gfs2_glock_dq_uninit(&i_gh);
1136
1137 return error;
1138}
1139
1140static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1141{
1142 struct gfs2_ea_header *ea = el->el_ea;
1143 struct gfs2_ea_header *prev = el->el_prev;
1144 struct buffer_head *dibh;
1145 int error;
1146
1147 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1148 if (error)
1149 return error;
1150
1151 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1152
1153 if (prev) {
1154 uint32_t len;
1155
1156 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1157 prev->ea_rec_len = cpu_to_be32(len);
1158
1159 if (GFS2_EA_IS_LAST(ea))
1160 prev->ea_flags |= GFS2_EAFLAG_LAST;
1161 } else
1162 ea->ea_type = GFS2_EATYPE_UNUSED;
1163
1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1165 if (!error) {
1166 ip->i_di.di_ctime = get_seconds();
1167 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1168 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1169 brelse(dibh);
1170 }
1171
1172 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1173
1174 return error;
1175}
1176
1177int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1178{
1179 struct gfs2_ea_location el;
1180 int error;
1181
1182 if (!ip->i_di.di_eattr)
1183 return -ENODATA;
1184
1185 error = gfs2_ea_find(ip, er, &el);
1186 if (error)
1187 return error;
1188 if (!el.el_ea)
1189 return -ENODATA;
1190
1191 if (GFS2_EA_IS_STUFFED(el.el_ea))
1192 error = ea_remove_stuffed(ip, &el);
1193 else
1194 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1195 0);
1196
1197 brelse(el.el_bh);
1198
1199 return error;
1200}
1201
1202/**
1203 * gfs2_ea_remove - remove an extended attribute from an inode
1204 * @ip: pointer to the inode of the target file
1205 * @er: request information
1206 *
1207 * Returns: errno
1208 */
1209
1210int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1211{
1212 struct gfs2_holder i_gh;
1213 int error;
1214
1215 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1216 return -EINVAL;
1217
1218 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1219 if (error)
1220 return error;
1221
1222 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1223 error = -EPERM;
1224 else
1225 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1226
1227 gfs2_glock_dq_uninit(&i_gh);
1228
1229 return error;
1230}
1231
1232static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1233 struct gfs2_ea_header *ea, char *data)
1234{
1235 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1236 struct buffer_head **bh;
1237 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1238 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1239 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
1240 unsigned int x;
1241 int error;
1242
1243 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1244 if (!bh)
1245 return -ENOMEM;
1246
1247 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1248 if (error)
1249 goto out;
1250
1251 for (x = 0; x < nptrs; x++) {
1252 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
1253 DIO_START, bh + x);
1254 if (error) {
1255 while (x--)
1256 brelse(bh[x]);
1257 goto fail;
1258 }
1259 dataptrs++;
1260 }
1261
1262 for (x = 0; x < nptrs; x++) {
1263 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
1264 if (error) {
1265 for (; x < nptrs; x++)
1266 brelse(bh[x]);
1267 goto fail;
1268 }
1269 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1270 for (; x < nptrs; x++)
1271 brelse(bh[x]);
1272 error = -EIO;
1273 goto fail;
1274 }
1275
1276 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1277
1278 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header),
1279 data,
1280 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1281
1282 amount -= sdp->sd_jbsize;
1283 data += sdp->sd_jbsize;
1284
1285 brelse(bh[x]);
1286 }
1287
1288 out:
1289 kfree(bh);
1290
1291 return error;
1292
1293 fail:
1294 gfs2_trans_end(sdp);
1295 kfree(bh);
1296
1297 return error;
1298}
1299
1300int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1301 struct iattr *attr, char *data)
1302{
1303 struct buffer_head *dibh;
1304 int error;
1305
1306 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1307 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1308 if (error)
1309 return error;
1310
1311 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1312 memcpy(GFS2_EA2DATA(el->el_ea),
1313 data,
1314 GFS2_EA_DATA_LEN(el->el_ea));
1315 } else
1316 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1317
1318 if (error)
1319 return error;
1320
1321 error = gfs2_meta_inode_buffer(ip, &dibh);
1322 if (!error) {
1323 error = inode_setattr(&ip->i_inode, attr);
1324 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1325 gfs2_inode_attr_out(ip);
1326 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1327 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1328 brelse(dibh);
1329 }
1330
1331 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1332
1333 return error;
1334}
1335
1336static int ea_dealloc_indirect(struct gfs2_inode *ip)
1337{
1338 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1339 struct gfs2_rgrp_list rlist;
1340 struct buffer_head *indbh, *dibh;
1341 uint64_t *eablk, *end;
1342 unsigned int rg_blocks = 0;
1343 uint64_t bstart = 0;
1344 unsigned int blen = 0;
1345 unsigned int blks = 0;
1346 unsigned int x;
1347 int error;
1348
1349 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1350
1351 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
1352 DIO_START | DIO_WAIT, &indbh);
1353 if (error)
1354 return error;
1355
1356 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1357 error = -EIO;
1358 goto out;
1359 }
1360
1361 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1362 end = eablk + sdp->sd_inptrs;
1363
1364 for (; eablk < end; eablk++) {
1365 uint64_t bn;
1366
1367 if (!*eablk)
1368 break;
1369 bn = be64_to_cpu(*eablk);
1370
1371 if (bstart + blen == bn)
1372 blen++;
1373 else {
1374 if (bstart)
1375 gfs2_rlist_add(sdp, &rlist, bstart);
1376 bstart = bn;
1377 blen = 1;
1378 }
1379 blks++;
1380 }
1381 if (bstart)
1382 gfs2_rlist_add(sdp, &rlist, bstart);
1383 else
1384 goto out;
1385
1386 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1387
1388 for (x = 0; x < rlist.rl_rgrps; x++) {
1389 struct gfs2_rgrpd *rgd;
1390 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1391 rg_blocks += rgd->rd_ri.ri_length;
1392 }
1393
1394 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1395 if (error)
1396 goto out_rlist_free;
1397
1398 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
1399 RES_INDIRECT + RES_STATFS +
1400 RES_QUOTA, blks);
1401 if (error)
1402 goto out_gunlock;
1403
1404 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1405
1406 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1407 bstart = 0;
1408 blen = 0;
1409
1410 for (; eablk < end; eablk++) {
1411 uint64_t bn;
1412
1413 if (!*eablk)
1414 break;
1415 bn = be64_to_cpu(*eablk);
1416
1417 if (bstart + blen == bn)
1418 blen++;
1419 else {
1420 if (bstart)
1421 gfs2_free_meta(ip, bstart, blen);
1422 bstart = bn;
1423 blen = 1;
1424 }
1425
1426 *eablk = 0;
1427 if (!ip->i_di.di_blocks)
1428 gfs2_consist_inode(ip);
1429 ip->i_di.di_blocks--;
1430 }
1431 if (bstart)
1432 gfs2_free_meta(ip, bstart, blen);
1433
1434 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1435
1436 error = gfs2_meta_inode_buffer(ip, &dibh);
1437 if (!error) {
1438 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1439 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1440 brelse(dibh);
1441 }
1442
1443 gfs2_trans_end(sdp);
1444
1445 out_gunlock:
1446 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1447
1448 out_rlist_free:
1449 gfs2_rlist_free(&rlist);
1450
1451 out:
1452 brelse(indbh);
1453
1454 return error;
1455}
1456
1457static int ea_dealloc_block(struct gfs2_inode *ip)
1458{
1459 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1460 struct gfs2_alloc *al = &ip->i_alloc;
1461 struct gfs2_rgrpd *rgd;
1462 struct buffer_head *dibh;
1463 int error;
1464
1465 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1466 if (!rgd) {
1467 gfs2_consist_inode(ip);
1468 return -EIO;
1469 }
1470
1471 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1472 &al->al_rgd_gh);
1473 if (error)
1474 return error;
1475
1476 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE +
1477 RES_STATFS + RES_QUOTA, 1);
1478 if (error)
1479 goto out_gunlock;
1480
1481 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1482
1483 ip->i_di.di_eattr = 0;
1484 if (!ip->i_di.di_blocks)
1485 gfs2_consist_inode(ip);
1486 ip->i_di.di_blocks--;
1487
1488 error = gfs2_meta_inode_buffer(ip, &dibh);
1489 if (!error) {
1490 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1491 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1492 brelse(dibh);
1493 }
1494
1495 gfs2_trans_end(sdp);
1496
1497 out_gunlock:
1498 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1499
1500 return error;
1501}
1502
1503/**
1504 * gfs2_ea_dealloc - deallocate the extended attribute fork
1505 * @ip: the inode
1506 *
1507 * Returns: errno
1508 */
1509
1510int gfs2_ea_dealloc(struct gfs2_inode *ip)
1511{
1512 struct gfs2_alloc *al;
1513 int error;
1514
1515 al = gfs2_alloc_get(ip);
1516
1517 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1518 if (error)
1519 goto out_alloc;
1520
1521 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
1522 if (error)
1523 goto out_quota;
1524
1525 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1526 if (error)
1527 goto out_rindex;
1528
1529 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1530 error = ea_dealloc_indirect(ip);
1531 if (error)
1532 goto out_rindex;
1533 }
1534
1535 error = ea_dealloc_block(ip);
1536
1537 out_rindex:
1538 gfs2_glock_dq_uninit(&al->al_ri_gh);
1539
1540 out_quota:
1541 gfs2_quota_unhold(ip);
1542
1543 out_alloc:
1544 gfs2_alloc_put(ip);
1545
1546 return error;
1547}
1548
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ae199692e51d
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,97 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
13#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
14#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
15
16#define GFS2_EA_SIZE(ea) \
17ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
18 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
19 (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8)
20
21#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
22#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
23
24#define GFS2_EAREQ_SIZE_STUFFED(er) \
25ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
26
27#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
29 sizeof(uint64_t) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
30
31#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
32#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
33
34#define GFS2_EA2DATAPTRS(ea) \
35((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
36
37#define GFS2_EA2NEXT(ea) \
38((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
39
40#define GFS2_EA_BH2FIRST(bh) \
41((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
42
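
Taken together, the macros above encode the layout of one record inside an EA block; the sketch below assumes a 12-byte gfs2_ea_header and a 3-byte name purely for concreteness:

/*
 *  +----------------------+  GFS2_EA_BH2FIRST: just past the meta header
 *  | gfs2_ea_header (12B) |
 *  +----------------------+  GFS2_EA2NAME: header + 1     (offset +12)
 *  | name (ea_name_len)   |  e.g. "foo" occupies +12..+14
 *  +----------------------+  GFS2_EA2DATA: name + len     (offset +15)
 *  | stuffed data, or     |  GFS2_EA2DATAPTRS 8-aligns the name first
 *  | uint64_t dataptrs    |  (offset +20 here), one per data block
 *  +----------------------+  GFS2_EA2NEXT: start + GFS2_EA_REC_LEN
 */
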
43#define GFS2_ERF_MODE 0x80000000
44
45struct gfs2_ea_request {
46 char *er_name;
47 char *er_data;
48 unsigned int er_name_len;
49 unsigned int er_data_len;
50 unsigned int er_type; /* GFS2_EATYPE_... */
51 int er_flags;
52 mode_t er_mode;
53};
54
55struct gfs2_ea_location {
56 struct buffer_head *el_bh;
57 struct gfs2_ea_header *el_ea;
58 struct gfs2_ea_header *el_prev;
59};
60
61int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
62int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
63int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
64
65int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
66int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
68int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69
70int gfs2_ea_dealloc(struct gfs2_inode *ip);
71
72/* Exported to acl.c */
73
74int gfs2_ea_find(struct gfs2_inode *ip,
75 struct gfs2_ea_request *er,
76 struct gfs2_ea_location *el);
77int gfs2_ea_get_copy(struct gfs2_inode *ip,
78 struct gfs2_ea_location *el,
79 char *data);
80int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
81 struct iattr *attr, char *data);
82
83static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
84{
85 switch (ea->ea_type) {
86 case GFS2_EATYPE_USR:
87 return (5 + (ea->ea_name_len + 1));
88 case GFS2_EATYPE_SYS:
89 return (7 + (ea->ea_name_len + 1));
90 case GFS2_EATYPE_SECURITY:
91 return (9 + (ea->ea_name_len + 1));
92 default:
93 return (0);
94 }
95}
96
97#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/format.h b/fs/gfs2/format.h
new file mode 100644
index 000000000000..239f0c3553fc
--- /dev/null
+++ b/fs/gfs2/format.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __FORMAT_DOT_H__
11#define __FORMAT_DOT_H__
12
13static const uint32_t gfs2_old_fs_formats[] = {
14 0
15};
16
17static const uint32_t gfs2_old_multihost_formats[] = {
18 0
19};
20
21#endif /* __FORMAT_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..6edbd551a4c0
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..ef713dbff601
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2274 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kref.h>
19#include <linux/kallsyms.h>
20#include <linux/gfs2_ondisk.h>
21#include <asm/uaccess.h>
22
23#include "gfs2.h"
24#include "lm_interface.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "super.h"
34#include "util.h"
35
36/* Must be kept in sync with the beginning of struct gfs2_glock */
37struct glock_plug {
38 struct list_head gl_list;
39 unsigned long gl_flags;
40};
41
42struct greedy {
43 struct gfs2_holder gr_gh;
44 struct work_struct gr_work;
45};
46
47typedef void (*glock_examiner) (struct gfs2_glock * gl);
48
49static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
50static int dump_glock(struct gfs2_glock *gl);
51
52/**
53 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
54 * @actual: the current state of the lock
55 * @requested: the lock state that was requested by the caller
56 * @flags: the modifier flags passed in by the caller
57 *
58 * Returns: 1 if the locks are compatible, 0 otherwise
59 */
60
61static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
62 int flags)
63{
64 if (actual == requested)
65 return 1;
66
67 if (flags & GL_EXACT)
68 return 0;
69
70 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
71 return 1;
72
73 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
74 return 1;
75
76 return 0;
77}
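
A userspace model of relaxed_state_ok for eyeballing the compatibility rules; ok() and the ST_*/F_* names are local stand-ins for the kernel's states and flags:

#include <stdio.h>

enum { ST_UNLOCKED, ST_SHARED, ST_EXCLUSIVE };   /* models LM_ST_* */
#define F_EXACT 1                                /* models GL_EXACT */
#define F_ANY   2                                /* models LM_FLAG_ANY */

static int ok(int actual, int requested, int flags)
{
	if (actual == requested)
		return 1;
	if (flags & F_EXACT)
		return 0;
	if (actual == ST_EXCLUSIVE && requested == ST_SHARED)
		return 1;
	if (actual != ST_UNLOCKED && (flags & F_ANY))
		return 1;
	return 0;
}

int main(void)
{
	printf("EX satisfies SH request: %d\n", ok(ST_EXCLUSIVE, ST_SHARED, 0));       /* 1 */
	printf("...but not with EXACT:   %d\n", ok(ST_EXCLUSIVE, ST_SHARED, F_EXACT)); /* 0 */
	printf("SH satisfies EX request: %d\n", ok(ST_SHARED, ST_EXCLUSIVE, 0));       /* 0 */
	printf("any held lock with ANY:  %d\n", ok(ST_SHARED, ST_EXCLUSIVE, F_ANY));   /* 1 */
	return 0;
}
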
78
79/**
80 * gl_hash() - Turn a lock name into a hash bucket number
81 * @name: The lock name
82 *
83 * Returns: The number of the corresponding hash bucket
84 */
85
86static unsigned int gl_hash(struct lm_lockname *name)
87{
88 unsigned int h;
89
90 h = jhash(&name->ln_number, sizeof(uint64_t), 0);
91 h = jhash(&name->ln_type, sizeof(unsigned int), h);
92 h &= GFS2_GL_HASH_MASK;
93
94 return h;
95}
96
97/**
98 * glock_free() - Perform a few checks and then release struct gfs2_glock
99 * @gl: The glock to release
100 *
101 * Also calls lock module to release its internal structure for this glock.
102 *
103 */
104
105static void glock_free(struct gfs2_glock *gl)
106{
107 struct gfs2_sbd *sdp = gl->gl_sbd;
108 struct inode *aspace = gl->gl_aspace;
109
110 gfs2_lm_put_lock(sdp, gl->gl_lock);
111
112 if (aspace)
113 gfs2_aspace_put(aspace);
114
115 kmem_cache_free(gfs2_glock_cachep, gl);
116}
117
118/**
119 * gfs2_glock_hold() - increment reference count on glock
120 * @gl: The glock to hold
121 *
122 */
123
124void gfs2_glock_hold(struct gfs2_glock *gl)
125{
126 kref_get(&gl->gl_ref);
127}
128
129/* All work is done after the return from kref_put() so we
130 can release the write_lock before the free. */
131
132static void kill_glock(struct kref *kref)
133{
134 struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref);
135 struct gfs2_sbd *sdp = gl->gl_sbd;
136
137 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
138 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
139 gfs2_assert(sdp, list_empty(&gl->gl_holders));
140 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
141 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
142 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
143}
144
145/**
146 * gfs2_glock_put() - Decrement reference count on glock
147 * @gl: The glock to put
148 *
149 */
150
151int gfs2_glock_put(struct gfs2_glock *gl)
152{
153 struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket;
154 int rv = 0;
155
156 write_lock(&bucket->hb_lock);
157 if (kref_put(&gl->gl_ref, kill_glock)) {
158 list_del_init(&gl->gl_list);
159 write_unlock(&bucket->hb_lock);
160 BUG_ON(spin_is_locked(&gl->gl_spin));
161 glock_free(gl);
162 rv = 1;
163 goto out;
164 }
165 write_unlock(&bucket->hb_lock);
166out:
167 return rv;
168}
169
170/**
171 * queue_empty - check to see if a glock's queue is empty
172 * @gl: the glock
173 * @head: the head of the queue to check
174 *
175 * This function protects the list in the event that a process already
176 * has a holder on the list and is adding a second holder for itself.
177 * The glmutex lock is what generally prevents processes from working
178 * on the same glock at once, but the special case of adding a second
179 * holder for yourself ("recursive" locking) doesn't involve locking
180 * glmutex, making the spin lock necessary.
181 *
182 * Returns: 1 if the queue is empty
183 */
184
185static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
186{
187 int empty;
188 spin_lock(&gl->gl_spin);
189 empty = list_empty(head);
190 spin_unlock(&gl->gl_spin);
191 return empty;
192}
193
194/**
195 * search_bucket() - Find struct gfs2_glock by lock number
196 * @bucket: the bucket to search
197 * @name: The lock name
198 *
199 * Returns: NULL, or the struct gfs2_glock with the requested number
200 */
201
202static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket,
203 struct lm_lockname *name)
204{
205 struct gfs2_glock *gl;
206
207 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
208 if (test_bit(GLF_PLUG, &gl->gl_flags))
209 continue;
210 if (!lm_name_equal(&gl->gl_name, name))
211 continue;
212
213 kref_get(&gl->gl_ref);
214
215 return gl;
216 }
217
218 return NULL;
219}
220
221/**
222 * gfs2_glock_find() - Find glock by lock number
223 * @sdp: The GFS2 superblock
224 * @name: The lock name
225 *
226 * Returns: NULL, or the struct gfs2_glock with the requested number
227 */
228
229static struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
230 struct lm_lockname *name)
231{
232 struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
233 struct gfs2_glock *gl;
234
235 read_lock(&bucket->hb_lock);
236 gl = search_bucket(bucket, name);
237 read_unlock(&bucket->hb_lock);
238
239 return gl;
240}
241
242/**
243 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
244 * @sdp: The GFS2 superblock
245 * @number: the lock number
246 * @glops: The glock_operations to use
247 * @create: If 0, don't create the glock if it doesn't exist
248 * @glp: the glock is returned here
249 *
250 * This does not lock a glock, just finds/creates structures for one.
251 *
252 * Returns: errno
253 */
254
255int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number,
256 struct gfs2_glock_operations *glops, int create,
257 struct gfs2_glock **glp)
258{
259 struct lm_lockname name;
260 struct gfs2_glock *gl, *tmp;
261 struct gfs2_gl_hash_bucket *bucket;
262 int error;
263
264 name.ln_number = number;
265 name.ln_type = glops->go_type;
266 bucket = &sdp->sd_gl_hash[gl_hash(&name)];
267
268 read_lock(&bucket->hb_lock);
269 gl = search_bucket(bucket, &name);
270 read_unlock(&bucket->hb_lock);
271
272 if (gl || !create) {
273 *glp = gl;
274 return 0;
275 }
276
277 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
278 if (!gl)
279 return -ENOMEM;
280
281 memset(gl, 0, sizeof(struct gfs2_glock));
282
283 INIT_LIST_HEAD(&gl->gl_list);
284 gl->gl_name = name;
285 kref_init(&gl->gl_ref);
286
287 spin_lock_init(&gl->gl_spin);
288
289 gl->gl_state = LM_ST_UNLOCKED;
290 gl->gl_owner = NULL;
291 gl->gl_ip = 0;
292 INIT_LIST_HEAD(&gl->gl_holders);
293 INIT_LIST_HEAD(&gl->gl_waiters1);
294 INIT_LIST_HEAD(&gl->gl_waiters2);
295 INIT_LIST_HEAD(&gl->gl_waiters3);
296
297 gl->gl_ops = glops;
298
299 gl->gl_bucket = bucket;
300 INIT_LIST_HEAD(&gl->gl_reclaim);
301
302 gl->gl_sbd = sdp;
303
304 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
305 INIT_LIST_HEAD(&gl->gl_ail_list);
306
307 /* If this glock protects actual on-disk data or metadata blocks,
308 create a VFS inode to manage the pages/buffers holding them. */
309 if (glops == &gfs2_inode_glops ||
310 glops == &gfs2_rgrp_glops) {
311 gl->gl_aspace = gfs2_aspace_get(sdp);
312 if (!gl->gl_aspace) {
313 error = -ENOMEM;
314 goto fail;
315 }
316 }
317
318 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
319 if (error)
320 goto fail_aspace;
321
322 write_lock(&bucket->hb_lock);
323 tmp = search_bucket(bucket, &name);
324 if (tmp) {
325 write_unlock(&bucket->hb_lock);
326 glock_free(gl);
327 gl = tmp;
328 } else {
329 list_add_tail(&gl->gl_list, &bucket->hb_list);
330 write_unlock(&bucket->hb_lock);
331 }
332
333 *glp = gl;
334
335 return 0;
336
337 fail_aspace:
338 if (gl->gl_aspace)
339 gfs2_aspace_put(gl->gl_aspace);
340
341 fail:
342 kmem_cache_free(gfs2_glock_cachep, gl);
343
344 return error;
345}
346
347/**
348 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
349 * @gl: the glock
350 * @state: the state we're requesting
351 * @flags: the modifier flags
352 * @gh: the holder structure
353 *
354 */
355
356void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
357 struct gfs2_holder *gh)
358{
359 INIT_LIST_HEAD(&gh->gh_list);
360 gh->gh_gl = gl;
361 gh->gh_ip = (unsigned long)__builtin_return_address(0);
362 gh->gh_owner = current;
363 gh->gh_state = state;
364 gh->gh_flags = flags;
365 gh->gh_error = 0;
366 gh->gh_iflags = 0;
367 init_completion(&gh->gh_wait);
368
369 if (gh->gh_state == LM_ST_EXCLUSIVE)
370 gh->gh_flags |= GL_LOCAL_EXCL;
371
372 gfs2_glock_hold(gl);
373}
374
375/**
376 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
377 * @state: the state we're requesting
378 * @flags: the modifier flags
379 * @gh: the holder structure
380 *
381 * Don't mess with the glock.
382 *
383 */
384
385void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
386{
387 gh->gh_state = state;
388 gh->gh_flags = flags;
389 if (gh->gh_state == LM_ST_EXCLUSIVE)
390 gh->gh_flags |= GL_LOCAL_EXCL;
391
392 gh->gh_iflags &= 1 << HIF_ALLOCED;
393 gh->gh_ip = (unsigned long)__builtin_return_address(0);
394}
395
396/**
397 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
398 * @gh: the holder structure
399 *
400 */
401
402void gfs2_holder_uninit(struct gfs2_holder *gh)
403{
404 gfs2_glock_put(gh->gh_gl);
405 gh->gh_gl = NULL;
406 gh->gh_ip = 0;
407}
408
409/**
410 * gfs2_holder_get - get a struct gfs2_holder structure
411 * @gl: the glock
412 * @state: the state we're requesting
413 * @flags: the modifier flags
414 * @gfp_flags: memory allocation flags for the holder
415 *
416 * Figure out how big an impact this function has. Either:
417 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
418 * 2) Leave it like it is
419 *
420 * Returns: the holder structure, NULL on ENOMEM
421 */
422
423static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
424 unsigned int state,
425 int flags, gfp_t gfp_flags)
426{
427 struct gfs2_holder *gh;
428
429 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
430 if (!gh)
431 return NULL;
432
433 gfs2_holder_init(gl, state, flags, gh);
434 set_bit(HIF_ALLOCED, &gh->gh_iflags);
435 gh->gh_ip = (unsigned long)__builtin_return_address(0);
436 return gh;
437}
438
439/**
440 * gfs2_holder_put - get rid of a struct gfs2_holder structure
441 * @gh: the holder structure
442 *
443 */
444
445static void gfs2_holder_put(struct gfs2_holder *gh)
446{
447 gfs2_holder_uninit(gh);
448 kfree(gh);
449}
450
451/**
452 * rq_mutex - process a mutex request in the queue
453 * @gh: the glock holder
454 *
455 * Returns: 1 if the queue is blocked
456 */
457
458static int rq_mutex(struct gfs2_holder *gh)
459{
460 struct gfs2_glock *gl = gh->gh_gl;
461
462 list_del_init(&gh->gh_list);
463 /* gh->gh_error never examined. */
464 set_bit(GLF_LOCK, &gl->gl_flags);
465 complete(&gh->gh_wait);
466
467 return 1;
468}
469
470/**
471 * rq_promote - process a promote request in the queue
472 * @gh: the glock holder
473 *
474 * Acquire a new inter-node lock, or change a lock state to more restrictive.
475 *
476 * Returns: 1 if the queue is blocked
477 */
478
479static int rq_promote(struct gfs2_holder *gh)
480{
481 struct gfs2_glock *gl = gh->gh_gl;
482 struct gfs2_sbd *sdp = gl->gl_sbd;
483 struct gfs2_glock_operations *glops = gl->gl_ops;
484
485 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
486 if (list_empty(&gl->gl_holders)) {
487 gl->gl_req_gh = gh;
488 set_bit(GLF_LOCK, &gl->gl_flags);
489 spin_unlock(&gl->gl_spin);
490
491 if (atomic_read(&sdp->sd_reclaim_count) >
492 gfs2_tune_get(sdp, gt_reclaim_limit) &&
493 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
494 gfs2_reclaim_glock(sdp);
495 gfs2_reclaim_glock(sdp);
496 }
497
498 glops->go_xmote_th(gl, gh->gh_state,
499 gh->gh_flags);
500
501 spin_lock(&gl->gl_spin);
502 }
503 return 1;
504 }
505
506 if (list_empty(&gl->gl_holders)) {
507 set_bit(HIF_FIRST, &gh->gh_iflags);
508 set_bit(GLF_LOCK, &gl->gl_flags);
509 } else {
510 struct gfs2_holder *next_gh;
511 if (gh->gh_flags & GL_LOCAL_EXCL)
512 return 1;
513 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
514 gh_list);
515 if (next_gh->gh_flags & GL_LOCAL_EXCL)
516 return 1;
517 }
518
519 list_move_tail(&gh->gh_list, &gl->gl_holders);
520 gh->gh_error = 0;
521 set_bit(HIF_HOLDER, &gh->gh_iflags);
522
523 complete(&gh->gh_wait);
524
525 return 0;
526}
527
528/**
529 * rq_demote - process a demote request in the queue
530 * @gh: the glock holder
531 *
532 * Returns: 1 if the queue is blocked
533 */
534
535static int rq_demote(struct gfs2_holder *gh)
536{
537 struct gfs2_glock *gl = gh->gh_gl;
538 struct gfs2_glock_operations *glops = gl->gl_ops;
539
540 if (!list_empty(&gl->gl_holders))
541 return 1;
542
543 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
544 list_del_init(&gh->gh_list);
545 gh->gh_error = 0;
546 spin_unlock(&gl->gl_spin);
547 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
548 gfs2_holder_put(gh);
549 else
550 complete(&gh->gh_wait);
551 spin_lock(&gl->gl_spin);
552 } else {
553 gl->gl_req_gh = gh;
554 set_bit(GLF_LOCK, &gl->gl_flags);
555 spin_unlock(&gl->gl_spin);
556
557 if (gh->gh_state == LM_ST_UNLOCKED ||
558 gl->gl_state != LM_ST_EXCLUSIVE)
559 glops->go_drop_th(gl);
560 else
561 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
562
563 spin_lock(&gl->gl_spin);
564 }
565
566 return 0;
567}
568
569/**
570 * rq_greedy - process a queued request to drop greedy status
571 * @gh: the glock holder
572 *
573 * Returns: 1 if the queue is blocked
574 */
575
576static int rq_greedy(struct gfs2_holder *gh)
577{
578 struct gfs2_glock *gl = gh->gh_gl;
579
580 list_del_init(&gh->gh_list);
581 /* gh->gh_error never examined. */
582 clear_bit(GLF_GREEDY, &gl->gl_flags);
583 spin_unlock(&gl->gl_spin);
584
585 gfs2_holder_uninit(gh);
586 kfree(container_of(gh, struct greedy, gr_gh));
587
588 spin_lock(&gl->gl_spin);
589
590 return 0;
591}
592
593/**
594 * run_queue - process holder structures on a glock
595 * @gl: the glock
596 *
597 */
598static void run_queue(struct gfs2_glock *gl)
599{
600 struct gfs2_holder *gh;
601 int blocked = 1;
602
603 for (;;) {
604 if (test_bit(GLF_LOCK, &gl->gl_flags))
605 break;
606
607 if (!list_empty(&gl->gl_waiters1)) {
608 gh = list_entry(gl->gl_waiters1.next,
609 struct gfs2_holder, gh_list);
610
611 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
612 blocked = rq_mutex(gh);
613 else
614 gfs2_assert_warn(gl->gl_sbd, 0);
615
616 } else if (!list_empty(&gl->gl_waiters2) &&
617 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
618 gh = list_entry(gl->gl_waiters2.next,
619 struct gfs2_holder, gh_list);
620
621 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
622 blocked = rq_demote(gh);
623 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
624 blocked = rq_greedy(gh);
625 else
626 gfs2_assert_warn(gl->gl_sbd, 0);
627
628 } else if (!list_empty(&gl->gl_waiters3)) {
629 gh = list_entry(gl->gl_waiters3.next,
630 struct gfs2_holder, gh_list);
631
632 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
633 blocked = rq_promote(gh);
634 else
635 gfs2_assert_warn(gl->gl_sbd, 0);
636
637 } else
638 break;
639
640 if (blocked)
641 break;
642 }
643}
644
645/**
646 * gfs2_glmutex_lock - acquire a local lock on a glock
647 * @gl: the glock
648 *
649 * Gives caller exclusive access to manipulate a glock structure.
650 */
651
652static void gfs2_glmutex_lock(struct gfs2_glock *gl)
653{
654 struct gfs2_holder gh;
655
656 gfs2_holder_init(gl, 0, 0, &gh);
657 set_bit(HIF_MUTEX, &gh.gh_iflags);
658
659 spin_lock(&gl->gl_spin);
660 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
661 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
662 else {
663 gl->gl_owner = current;
664 gl->gl_ip = (unsigned long)__builtin_return_address(0);
665 complete(&gh.gh_wait);
666 }
667 spin_unlock(&gl->gl_spin);
668
669 wait_for_completion(&gh.gh_wait);
670 gfs2_holder_uninit(&gh);
671}
672
673/**
674 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
675 * @gl: the glock
676 *
677 * Returns: 1 if the glock is acquired
678 */
679
680static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
681{
682 int acquired = 1;
683
684 spin_lock(&gl->gl_spin);
685 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
686 acquired = 0;
687 else {
688 gl->gl_owner = current;
689 gl->gl_ip = (unsigned long)__builtin_return_address(0);
690 }
691 spin_unlock(&gl->gl_spin);
692
693 return acquired;
694}
695
696/**
697 * gfs2_glmutex_unlock - release a local lock on a glock
698 * @gl: the glock
699 *
700 */
701
702static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
703{
704 spin_lock(&gl->gl_spin);
705 clear_bit(GLF_LOCK, &gl->gl_flags);
706 gl->gl_owner = NULL;
707 gl->gl_ip = 0;
708 run_queue(gl);
709 BUG_ON(!spin_is_locked(&gl->gl_spin));
710 spin_unlock(&gl->gl_spin);
711}
712
713/**
714 * handle_callback - add a demote request to a lock's queue
715 * @gl: the glock
716 * @state: the state the caller wants us to change to
717 *
718 * Note: This may fail silently if we are out of memory.
719 */
720
721static void handle_callback(struct gfs2_glock *gl, unsigned int state)
722{
723 struct gfs2_holder *gh, *new_gh = NULL;
724
725restart:
726 spin_lock(&gl->gl_spin);
727
728 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
729 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
730 gl->gl_req_gh != gh) {
731 if (gh->gh_state != state)
732 gh->gh_state = LM_ST_UNLOCKED;
733 goto out;
734 }
735 }
736
737 if (new_gh) {
738 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
739 new_gh = NULL;
740 } else {
741 spin_unlock(&gl->gl_spin);
742
743 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
744 if (!new_gh)
745 return;
746 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
747 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
748
749 goto restart;
750 }
751
752out:
753 spin_unlock(&gl->gl_spin);
754
755 if (new_gh)
756 gfs2_holder_put(new_gh);
757}
758
759void gfs2_glock_inode_squish(struct inode *inode)
760{
761 struct gfs2_holder gh;
762 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
763 gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
764 set_bit(HIF_DEMOTE, &gh.gh_iflags);
765 spin_lock(&gl->gl_spin);
766 gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
767 list_add_tail(&gh.gh_list, &gl->gl_waiters2);
768 run_queue(gl);
769 spin_unlock(&gl->gl_spin);
770 wait_for_completion(&gh.gh_wait);
771 gfs2_holder_uninit(&gh);
772}
773
774/**
775 * state_change - record that the glock is now in a different state
776 * @gl: the glock
777 * @new_state: the new state
778 *
779 */
780
781static void state_change(struct gfs2_glock *gl, unsigned int new_state)
782{
783 int held1, held2;
784
785 held1 = (gl->gl_state != LM_ST_UNLOCKED);
786 held2 = (new_state != LM_ST_UNLOCKED);
787
788 if (held1 != held2) {
789 if (held2)
790 gfs2_glock_hold(gl);
791 else
792 gfs2_glock_put(gl);
793 }
794
795 gl->gl_state = new_state;
796}
797
798/**
799 * xmote_bh - Called after the lock module is done acquiring a lock
800 * @gl: The glock in question
801 * @ret: the int returned from the lock module
802 *
803 */
804
805static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
806{
807 struct gfs2_sbd *sdp = gl->gl_sbd;
808 struct gfs2_glock_operations *glops = gl->gl_ops;
809 struct gfs2_holder *gh = gl->gl_req_gh;
810 int prev_state = gl->gl_state;
811 int op_done = 1;
812
813 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
814 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
815 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
816
817 state_change(gl, ret & LM_OUT_ST_MASK);
818
819 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
820 if (glops->go_inval)
821 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
822 } else if (gl->gl_state == LM_ST_DEFERRED) {
823 /* We might not want to do this here.
824 Look at moving to the inode glops. */
825 if (glops->go_inval)
826 glops->go_inval(gl, DIO_DATA);
827 }
828
829 /* Deal with each possible exit condition */
830
831 if (!gh)
832 gl->gl_stamp = jiffies;
833
834 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
835 spin_lock(&gl->gl_spin);
836 list_del_init(&gh->gh_list);
837 gh->gh_error = -EIO;
838 spin_unlock(&gl->gl_spin);
839
840 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
841 spin_lock(&gl->gl_spin);
842 list_del_init(&gh->gh_list);
843 if (gl->gl_state == gh->gh_state ||
844 gl->gl_state == LM_ST_UNLOCKED)
845 gh->gh_error = 0;
846 else {
847 if (gfs2_assert_warn(sdp, gh->gh_flags &
848 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
849 fs_warn(sdp, "ret = 0x%.8X\n", ret);
850 gh->gh_error = GLR_TRYFAILED;
851 }
852 spin_unlock(&gl->gl_spin);
853
854 if (ret & LM_OUT_CANCELED)
855 handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
856
857 } else if (ret & LM_OUT_CANCELED) {
858 spin_lock(&gl->gl_spin);
859 list_del_init(&gh->gh_list);
860 gh->gh_error = GLR_CANCELED;
861 spin_unlock(&gl->gl_spin);
862
863 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
864 spin_lock(&gl->gl_spin);
865 list_move_tail(&gh->gh_list, &gl->gl_holders);
866 gh->gh_error = 0;
867 set_bit(HIF_HOLDER, &gh->gh_iflags);
868 spin_unlock(&gl->gl_spin);
869
870 set_bit(HIF_FIRST, &gh->gh_iflags);
871
872 op_done = 0;
873
874 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
875 spin_lock(&gl->gl_spin);
876 list_del_init(&gh->gh_list);
877 gh->gh_error = GLR_TRYFAILED;
878 spin_unlock(&gl->gl_spin);
879
880 } else {
881 if (gfs2_assert_withdraw(sdp, 0) == -1)
882 fs_err(sdp, "ret = 0x%.8X\n", ret);
883 }
884
885 if (glops->go_xmote_bh)
886 glops->go_xmote_bh(gl);
887
888 if (op_done) {
889 spin_lock(&gl->gl_spin);
890 gl->gl_req_gh = NULL;
891 gl->gl_req_bh = NULL;
892 clear_bit(GLF_LOCK, &gl->gl_flags);
893 run_queue(gl);
894 spin_unlock(&gl->gl_spin);
895 }
896
897 gfs2_glock_put(gl);
898
899 if (gh) {
900 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
901 gfs2_holder_put(gh);
902 else
903 complete(&gh->gh_wait);
904 }
905}
906
907/**
908 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
909 * @gl: The glock in question
910 * @state: the requested state
911 * @flags: modifier flags to the lock call
912 *
913 */
914
915void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
916{
917 struct gfs2_sbd *sdp = gl->gl_sbd;
918 struct gfs2_glock_operations *glops = gl->gl_ops;
919 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
920 LM_FLAG_NOEXP | LM_FLAG_ANY |
921 LM_FLAG_PRIORITY);
922 unsigned int lck_ret;
923
924 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
925 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
926 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
927 gfs2_assert_warn(sdp, state != gl->gl_state);
928
929 if (gl->gl_state == LM_ST_EXCLUSIVE) {
930 if (glops->go_sync)
931 glops->go_sync(gl,
932 DIO_METADATA | DIO_DATA | DIO_RELEASE);
933 }
934
935 gfs2_glock_hold(gl);
936 gl->gl_req_bh = xmote_bh;
937
938 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state,
939 lck_flags);
940
941 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
942 return;
943
944 if (lck_ret & LM_OUT_ASYNC)
945 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
946 else
947 xmote_bh(gl, lck_ret);
948}
949
950/**
951 * drop_bh - Called after a lock module unlock completes
952 * @gl: the glock
953 * @ret: the return status
954 *
955 * Wakes up the process waiting on the struct gfs2_holder (if any)
956 * Drops the reference on the glock that the top half took out
957 *
958 */
959
960static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
961{
962 struct gfs2_sbd *sdp = gl->gl_sbd;
963 struct gfs2_glock_operations *glops = gl->gl_ops;
964 struct gfs2_holder *gh = gl->gl_req_gh;
965
966 clear_bit(GLF_PREFETCH, &gl->gl_flags);
967
968 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
969 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
970 gfs2_assert_warn(sdp, !ret);
971
972 state_change(gl, LM_ST_UNLOCKED);
973
974 if (glops->go_inval)
975 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
976
977 if (gh) {
978 spin_lock(&gl->gl_spin);
979 list_del_init(&gh->gh_list);
980 gh->gh_error = 0;
981 spin_unlock(&gl->gl_spin);
982 }
983
984 if (glops->go_drop_bh)
985 glops->go_drop_bh(gl);
986
987 spin_lock(&gl->gl_spin);
988 gl->gl_req_gh = NULL;
989 gl->gl_req_bh = NULL;
990 clear_bit(GLF_LOCK, &gl->gl_flags);
991 run_queue(gl);
992 spin_unlock(&gl->gl_spin);
993
994 gfs2_glock_put(gl);
995
996 if (gh) {
997 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
998 gfs2_holder_put(gh);
999 else
1000 complete(&gh->gh_wait);
1001 }
1002}
1003
1004/**
1005 * gfs2_glock_drop_th - call into the lock module to unlock a lock
1006 * @gl: the glock
1007 *
1008 */
1009
1010void gfs2_glock_drop_th(struct gfs2_glock *gl)
1011{
1012 struct gfs2_sbd *sdp = gl->gl_sbd;
1013 struct gfs2_glock_operations *glops = gl->gl_ops;
1014 unsigned int ret;
1015
1016 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1017 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1018 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1019
1020 if (gl->gl_state == LM_ST_EXCLUSIVE) {
1021 if (glops->go_sync)
1022 glops->go_sync(gl,
1023 DIO_METADATA | DIO_DATA | DIO_RELEASE);
1024 }
1025
1026 gfs2_glock_hold(gl);
1027 gl->gl_req_bh = drop_bh;
1028
1029 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1030
1031 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1032 return;
1033
1034 if (!ret)
1035 drop_bh(gl, ret);
1036 else
1037 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1038}
1039
1040/**
1041 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1042 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1043 *
1044 * Don't cancel GL_NOCANCEL requests.
1045 */
1046
1047static void do_cancels(struct gfs2_holder *gh)
1048{
1049 struct gfs2_glock *gl = gh->gh_gl;
1050
1051 spin_lock(&gl->gl_spin);
1052
1053 while (gl->gl_req_gh != gh &&
1054 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1055 !list_empty(&gh->gh_list)) {
1056 if (gl->gl_req_bh &&
1057 !(gl->gl_req_gh &&
1058 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1059 spin_unlock(&gl->gl_spin);
1060 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1061 msleep(100);
1062 spin_lock(&gl->gl_spin);
1063 } else {
1064 spin_unlock(&gl->gl_spin);
1065 msleep(100);
1066 spin_lock(&gl->gl_spin);
1067 }
1068 }
1069
1070 spin_unlock(&gl->gl_spin);
1071}
1072
1073/**
1074 * glock_wait_internal - wait on a glock acquisition
1075 * @gh: the glock holder
1076 *
1077 * Returns: 0 on success
1078 */
1079
1080static int glock_wait_internal(struct gfs2_holder *gh)
1081{
1082 struct gfs2_glock *gl = gh->gh_gl;
1083 struct gfs2_sbd *sdp = gl->gl_sbd;
1084 struct gfs2_glock_operations *glops = gl->gl_ops;
1085
1086 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1087 return -EIO;
1088
1089 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1090 spin_lock(&gl->gl_spin);
1091 if (gl->gl_req_gh != gh &&
1092 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1093 !list_empty(&gh->gh_list)) {
1094 list_del_init(&gh->gh_list);
1095 gh->gh_error = GLR_TRYFAILED;
1096 run_queue(gl);
1097 spin_unlock(&gl->gl_spin);
1098 return gh->gh_error;
1099 }
1100 spin_unlock(&gl->gl_spin);
1101 }
1102
1103 if (gh->gh_flags & LM_FLAG_PRIORITY)
1104 do_cancels(gh);
1105
1106 wait_for_completion(&gh->gh_wait);
1107
1108 if (gh->gh_error)
1109 return gh->gh_error;
1110
1111 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1112 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state,
1113 gh->gh_state,
1114 gh->gh_flags));
1115
1116 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1117 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1118
1119 if (glops->go_lock) {
1120 gh->gh_error = glops->go_lock(gh);
1121 if (gh->gh_error) {
1122 spin_lock(&gl->gl_spin);
1123 list_del_init(&gh->gh_list);
1124 spin_unlock(&gl->gl_spin);
1125 }
1126 }
1127
1128 spin_lock(&gl->gl_spin);
1129 gl->gl_req_gh = NULL;
1130 gl->gl_req_bh = NULL;
1131 clear_bit(GLF_LOCK, &gl->gl_flags);
1132 run_queue(gl);
1133 spin_unlock(&gl->gl_spin);
1134 }
1135
1136 return gh->gh_error;
1137}
1138
1139static inline struct gfs2_holder *
1140find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1141{
1142 struct gfs2_holder *gh;
1143
1144 list_for_each_entry(gh, head, gh_list) {
1145 if (gh->gh_owner == owner)
1146 return gh;
1147 }
1148
1149 return NULL;
1150}
1151
1152/**
1153 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1154 * @gh: the holder structure to add
1155 *
1156 */
1157
1158static void add_to_queue(struct gfs2_holder *gh)
1159{
1160 struct gfs2_glock *gl = gh->gh_gl;
1161 struct gfs2_holder *existing;
1162
1163 BUG_ON(!gh->gh_owner);
1164
1165 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1166 if (existing) {
1167 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1168 printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
1169 printk(KERN_INFO "lock type : %d lock state : %d\n",
1170 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
1171 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1172 printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
1173 printk(KERN_INFO "lock type : %d lock state : %d\n",
1174 gl->gl_name.ln_type, gl->gl_state);
1175 BUG();
1176 }
1177
1178 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1179 if (existing) {
1180 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1181 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1182 BUG();
1183 }
1184
1185 if (gh->gh_flags & LM_FLAG_PRIORITY)
1186 list_add(&gh->gh_list, &gl->gl_waiters3);
1187 else
1188 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1189}
1190
1191/**
1192 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1193 * @gh: the holder structure
1194 *
1195 * if (gh->gh_flags & GL_ASYNC), this never returns an error (except -EIO after shutdown)
1196 *
1197 * Returns: 0, GLR_TRYFAILED, or errno on failure
1198 */
1199
1200int gfs2_glock_nq(struct gfs2_holder *gh)
1201{
1202 struct gfs2_glock *gl = gh->gh_gl;
1203 struct gfs2_sbd *sdp = gl->gl_sbd;
1204 int error = 0;
1205
1206restart:
1207 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1208 set_bit(HIF_ABORTED, &gh->gh_iflags);
1209 return -EIO;
1210 }
1211
1212 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1213
1214 spin_lock(&gl->gl_spin);
1215 add_to_queue(gh);
1216 run_queue(gl);
1217 spin_unlock(&gl->gl_spin);
1218
1219 if (!(gh->gh_flags & GL_ASYNC)) {
1220 error = glock_wait_internal(gh);
1221 if (error == GLR_CANCELED) {
1222 msleep(100);
1223 goto restart;
1224 }
1225 }
1226
1227 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1228
1229 if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
1230 dump_glock(gl);
1231
1232 return error;
1233}
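/*
 * Illustrative sketch (editorial, not part of the original commit):
 * the common synchronous holder life cycle around gfs2_glock_nq().
 * "gl" is assumed to have come from gfs2_glock_get() or similar.
 *
 *	struct gfs2_holder gh;
 *	int error;
 *
 *	gfs2_holder_init(gl, LM_ST_SHARED, 0, &gh);
 *	error = gfs2_glock_nq(&gh);
 *	if (!error) {
 *		... access data protected by the glock ...
 *		gfs2_glock_dq(&gh);
 *	}
 *	gfs2_holder_uninit(&gh);
 */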
1234
1235/**
1236 * gfs2_glock_poll - poll to see if an async request has been completed
1237 * @gh: the holder
1238 *
1239 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1240 */
1241
1242int gfs2_glock_poll(struct gfs2_holder *gh)
1243{
1244 struct gfs2_glock *gl = gh->gh_gl;
1245 int ready = 0;
1246
1247 spin_lock(&gl->gl_spin);
1248
1249 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1250 ready = 1;
1251 else if (list_empty(&gh->gh_list)) {
1252 if (gh->gh_error == GLR_CANCELED) {
1253 spin_unlock(&gl->gl_spin);
1254 msleep(100);
1255 if (gfs2_glock_nq(gh))
1256 return 1;
1257 return 0;
1258 } else
1259 ready = 1;
1260 }
1261
1262 spin_unlock(&gl->gl_spin);
1263
1264 return ready;
1265}
1266
1267/**
1268 * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
1269 * @gh: the holder structure
1270 *
1271 * Returns: 0, GLR_TRYFAILED, or errno on failure
1272 */
1273
1274int gfs2_glock_wait(struct gfs2_holder *gh)
1275{
1276 int error;
1277
1278 error = glock_wait_internal(gh);
1279 if (error == GLR_CANCELED) {
1280 msleep(100);
1281 gh->gh_flags &= ~GL_ASYNC;
1282 error = gfs2_glock_nq(gh);
1283 }
1284
1285 return error;
1286}
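/*
 * Illustrative sketch (editorial): how GL_ASYNC, gfs2_glock_poll() and
 * gfs2_glock_wait() combine. With GL_ASYNC, gfs2_glock_nq() queues the
 * request and returns immediately; completion is observed via poll and
 * wait. The 10 ms sleep is an arbitrary choice for the sketch.
 *
 *	struct gfs2_holder gh;
 *	int error;
 *
 *	gfs2_holder_init(gl, LM_ST_EXCLUSIVE, GL_ASYNC, &gh);
 *	gfs2_glock_nq(&gh);
 *	... overlap other work here ...
 *	while (!gfs2_glock_poll(&gh))
 *		msleep(10);
 *	error = gfs2_glock_wait(&gh);
 */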
1287
1288/**
1289 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1290 * @gh: the glock holder
1291 *
1292 */
1293
1294void gfs2_glock_dq(struct gfs2_holder *gh)
1295{
1296 struct gfs2_glock *gl = gh->gh_gl;
1297 struct gfs2_glock_operations *glops = gl->gl_ops;
1298
1299 if (gh->gh_flags & GL_SYNC)
1300 set_bit(GLF_SYNC, &gl->gl_flags);
1301
1302 if (gh->gh_flags & GL_NOCACHE)
1303 handle_callback(gl, LM_ST_UNLOCKED);
1304
1305 gfs2_glmutex_lock(gl);
1306
1307 spin_lock(&gl->gl_spin);
1308 list_del_init(&gh->gh_list);
1309
1310 if (list_empty(&gl->gl_holders)) {
1311 spin_unlock(&gl->gl_spin);
1312
1313 if (glops->go_unlock)
1314 glops->go_unlock(gh);
1315
1316 if (test_bit(GLF_SYNC, &gl->gl_flags)) {
1317 if (glops->go_sync)
1318 glops->go_sync(gl, DIO_METADATA | DIO_DATA);
1319 }
1320
1321 gl->gl_stamp = jiffies;
1322
1323 spin_lock(&gl->gl_spin);
1324 }
1325
1326 clear_bit(GLF_LOCK, &gl->gl_flags);
1327 run_queue(gl);
1328 spin_unlock(&gl->gl_spin);
1329}
1330
1331/**
1332 * gfs2_glock_prefetch - Try to prefetch a glock
1333 * @gl: the glock
1334 * @state: the state to prefetch in
1335 * @flags: flags passed to go_xmote_th()
1336 *
1337 */
1338
1339static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1340 int flags)
1341{
1342 struct gfs2_glock_operations *glops = gl->gl_ops;
1343
1344 spin_lock(&gl->gl_spin);
1345
1346 if (test_bit(GLF_LOCK, &gl->gl_flags) ||
1347 !list_empty(&gl->gl_holders) ||
1348 !list_empty(&gl->gl_waiters1) ||
1349 !list_empty(&gl->gl_waiters2) ||
1350 !list_empty(&gl->gl_waiters3) ||
1351 relaxed_state_ok(gl->gl_state, state, flags)) {
1352 spin_unlock(&gl->gl_spin);
1353 return;
1354 }
1355
1356 set_bit(GLF_PREFETCH, &gl->gl_flags);
1357 set_bit(GLF_LOCK, &gl->gl_flags);
1358 spin_unlock(&gl->gl_spin);
1359
1360 glops->go_xmote_th(gl, state, flags);
1361}
1362
1363static void greedy_work(void *data)
1364{
1365 struct greedy *gr = data;
1366 struct gfs2_holder *gh = &gr->gr_gh;
1367 struct gfs2_glock *gl = gh->gh_gl;
1368 struct gfs2_glock_operations *glops = gl->gl_ops;
1369
1370 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1371
1372 if (glops->go_greedy)
1373 glops->go_greedy(gl);
1374
1375 spin_lock(&gl->gl_spin);
1376
1377 if (list_empty(&gl->gl_waiters2)) {
1378 clear_bit(GLF_GREEDY, &gl->gl_flags);
1379 spin_unlock(&gl->gl_spin);
1380 gfs2_holder_uninit(gh);
1381 kfree(gr);
1382 } else {
1383 gfs2_glock_hold(gl);
1384 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1385 run_queue(gl);
1386 spin_unlock(&gl->gl_spin);
1387 gfs2_glock_put(gl);
1388 }
1389}
1390
1391/**
1392 * gfs2_glock_be_greedy - hold on to a glock greedily for a while
1393 * @gl: the glock
1394 * @time: the delay, in jiffies, before the greedy period ends
1395 *
1396 * Returns: 0 if go_greedy will be called, 1 otherwise
1397 */
1398
1399int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1400{
1401 struct greedy *gr;
1402 struct gfs2_holder *gh;
1403
1404 if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
1405 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1406 return 1;
1407
1408 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1409 if (!gr) {
1410 clear_bit(GLF_GREEDY, &gl->gl_flags);
1411 return 1;
1412 }
1413 gh = &gr->gr_gh;
1414
1415 gfs2_holder_init(gl, 0, 0, gh);
1416 set_bit(HIF_GREEDY, &gh->gh_iflags);
1417 INIT_WORK(&gr->gr_work, greedy_work, gr);
1418
1419 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1420 schedule_delayed_work(&gr->gr_work, time);
1421
1422 return 0;
1423}
1424
1425/**
1426 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
1427 * @gh: the holder structure
1428 *
1429 */
1430
1431void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1432{
1433 gfs2_glock_dq(gh);
1434 gfs2_holder_uninit(gh);
1435}
1436
1437/**
1438 * gfs2_glock_nq_num - acquire a glock based on lock number
1439 * @sdp: the filesystem
1440 * @number: the lock number
1441 * @glops: the glock operations for the type of glock
1442 * @state: the state to acquire the glock in
1443 * @flags: modifier flags for the acquisition
1444 * @gh: the struct gfs2_holder
1445 *
1446 * Returns: errno
1447 */
1448
1449int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number,
1450 struct gfs2_glock_operations *glops, unsigned int state,
1451 int flags, struct gfs2_holder *gh)
1452{
1453 struct gfs2_glock *gl;
1454 int error;
1455
1456 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1457 if (!error) {
1458 error = gfs2_glock_nq_init(gl, state, flags, gh);
1459 gfs2_glock_put(gl);
1460 }
1461
1462 return error;
1463}
1464
1465/**
1466 * glock_compare - Compare two struct gfs2_glock structures for sorting
1467 * @arg_a: the first structure
1468 * @arg_b: the second structure
1469 *
1470 */
1471
1472static int glock_compare(const void *arg_a, const void *arg_b)
1473{
1474 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1475 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1476 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1477 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1478 int ret = 0;
1479
1480 if (a->ln_number > b->ln_number)
1481 ret = 1;
1482 else if (a->ln_number < b->ln_number)
1483 ret = -1;
1484 else {
1485 if (gh_a->gh_state == LM_ST_SHARED &&
1486 gh_b->gh_state == LM_ST_EXCLUSIVE)
1487 ret = 1;
1488 else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
1489 (gh_b->gh_flags & GL_LOCAL_EXCL))
1490 ret = 1;
1491 }
1492
1493 return ret;
1494}
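/*
 * Editorial note: this comparator imposes a single global order on
 * holders: ascending lock number first, then (for equal numbers)
 * exclusive before shared and GL_LOCAL_EXCL before plain requests.
 * Because nq_m_sync() sorts every batch into this order before
 * acquiring, two tasks that both need, say, locks 5 and 9 will both
 * take 5 first, so they cannot deadlock against each other.
 */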
1495
1496/**
1497 * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
1498 * @num_gh: the number of structures
1499 * @ghs: an array of struct gfs2_holder structures
1500 *
1501 * Returns: 0 on success (all glocks acquired),
1502 * errno on failure (no glocks acquired)
1503 */
1504
1505static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1506 struct gfs2_holder **p)
1507{
1508 unsigned int x;
1509 int error = 0;
1510
1511 for (x = 0; x < num_gh; x++)
1512 p[x] = &ghs[x];
1513
1514 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1515
1516 for (x = 0; x < num_gh; x++) {
1517 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1518
1519 error = gfs2_glock_nq(p[x]);
1520 if (error) {
1521 while (x--)
1522 gfs2_glock_dq(p[x]);
1523 break;
1524 }
1525 }
1526
1527 return error;
1528}
1529
1530/**
1531 * gfs2_glock_nq_m - acquire multiple glocks
1532 * @num_gh: the number of structures
1533 * @ghs: an array of struct gfs2_holder structures
1534 *
1535 * Figure out how big an impact this function has. Either:
1536 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1537 * 2) Forget async stuff and just call nq_m_sync()
1538 * 3) Leave it like it is
1539 *
1540 * Returns: 0 on success (all glocks acquired),
1541 * errno on failure (no glocks acquired)
1542 */
1543
1544int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1545{
1546 int *e;
1547 unsigned int x;
1548 int borked = 0, serious = 0;
1549 int error = 0;
1550
1551 if (!num_gh)
1552 return 0;
1553
1554 if (num_gh == 1) {
1555 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1556 return gfs2_glock_nq(ghs);
1557 }
1558
1559 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1560 if (!e)
1561 return -ENOMEM;
1562
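	/*
	 * Editorial note: "e" is deliberately allocated with pointer-sized
	 * elements: it is used below both as an array of per-holder error
	 * codes and, via the cast in the nq_m_sync() call, as the scratch
	 * array of holder pointers.
	 */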
1563 for (x = 0; x < num_gh; x++) {
1564 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1565 error = gfs2_glock_nq(&ghs[x]);
1566 if (error) {
1567 borked = 1;
1568 serious = error;
1569 num_gh = x;
1570 break;
1571 }
1572 }
1573
1574 for (x = 0; x < num_gh; x++) {
1575 error = e[x] = glock_wait_internal(&ghs[x]);
1576 if (error) {
1577 borked = 1;
1578 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1579 serious = error;
1580 }
1581 }
1582
1583 if (!borked) {
1584 kfree(e);
1585 return 0;
1586 }
1587
1588 for (x = 0; x < num_gh; x++)
1589 if (!e[x])
1590 gfs2_glock_dq(&ghs[x]);
1591
1592 if (serious)
1593 error = serious;
1594 else {
1595 for (x = 0; x < num_gh; x++)
1596 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1597 &ghs[x]);
1598 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1599 }
1600
1601 kfree(e);
1602
1603 return error;
1604}
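/*
 * Illustrative sketch (editorial): taking two glocks atomically with
 * gfs2_glock_nq_m(); either every holder is acquired or none is.
 * "gl_a" and "gl_b" are placeholders for glocks obtained elsewhere.
 *
 *	struct gfs2_holder ghs[2];
 *	int error;
 *
 *	gfs2_holder_init(gl_a, LM_ST_EXCLUSIVE, 0, &ghs[0]);
 *	gfs2_holder_init(gl_b, LM_ST_EXCLUSIVE, 0, &ghs[1]);
 *	error = gfs2_glock_nq_m(2, ghs);
 *	if (!error) {
 *		...
 *		gfs2_glock_dq_m(2, ghs);
 *	}
 *	gfs2_holder_uninit(&ghs[0]);
 *	gfs2_holder_uninit(&ghs[1]);
 */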
1605
1606/**
1607 * gfs2_glock_dq_m - release multiple glocks
1608 * @num_gh: the number of structures
1609 * @ghs: an array of struct gfs2_holder structures
1610 *
1611 */
1612
1613void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1614{
1615 unsigned int x;
1616
1617 for (x = 0; x < num_gh; x++)
1618 gfs2_glock_dq(&ghs[x]);
1619}
1620
1621/**
1622 * gfs2_glock_dq_uninit_m - release multiple glocks
1623 * @num_gh: the number of structures
1624 * @ghs: an array of struct gfs2_holder structures
1625 *
1626 */
1627
1628void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1629{
1630 unsigned int x;
1631
1632 for (x = 0; x < num_gh; x++)
1633 gfs2_glock_dq_uninit(&ghs[x]);
1634}
1635
1636/**
1637 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1638 * @sdp: the filesystem
1639 * @number: the lock number
1640 * @glops: the glock operations for the type of glock
1641 * @state: the state to acquire the glock in
1642 * @flags: modifier flags for the acquisition
1643 *
1644 * Returns: nothing; prefetch failures are silently ignored
1645 */
1646
1647void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
1648 struct gfs2_glock_operations *glops,
1649 unsigned int state, int flags)
1650{
1651 struct gfs2_glock *gl;
1652 int error;
1653
1654 if (atomic_read(&sdp->sd_reclaim_count) <
1655 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1656 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1657 if (!error) {
1658 gfs2_glock_prefetch(gl, state, flags);
1659 gfs2_glock_put(gl);
1660 }
1661 }
1662}
1663
1664/**
1665 * gfs2_lvb_hold - attach an LVB to a glock
1666 * @gl: The glock in question
1667 *
1668 */
1669
1670int gfs2_lvb_hold(struct gfs2_glock *gl)
1671{
1672 int error;
1673
1674 gfs2_glmutex_lock(gl);
1675
1676 if (!atomic_read(&gl->gl_lvb_count)) {
1677 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1678 if (error) {
1679 gfs2_glmutex_unlock(gl);
1680 return error;
1681 }
1682 gfs2_glock_hold(gl);
1683 }
1684 atomic_inc(&gl->gl_lvb_count);
1685
1686 gfs2_glmutex_unlock(gl);
1687
1688 return 0;
1689}
1690
1691/**
1692 * gfs2_lvb_unhold - detach an LVB from a glock
1693 * @gl: The glock in question
1694 *
1695 */
1696
1697void gfs2_lvb_unhold(struct gfs2_glock *gl)
1698{
1699 gfs2_glock_hold(gl);
1700 gfs2_glmutex_lock(gl);
1701
1702 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1703 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1704 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1705 gl->gl_lvb = NULL;
1706 gfs2_glock_put(gl);
1707 }
1708
1709 gfs2_glmutex_unlock(gl);
1710 gfs2_glock_put(gl);
1711}
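/*
 * Illustrative sketch (editorial): gfs2_lvb_hold()/gfs2_lvb_unhold()
 * are strictly paired; a successful hold pins both the lock value
 * block and a glock reference, so gl->gl_lvb stays valid in between.
 *
 *	if (!gfs2_lvb_hold(gl)) {
 *		... read or update the buffer at gl->gl_lvb ...
 *		gfs2_lvb_unhold(gl);
 *	}
 */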
1712
1713#if 0
1714void gfs2_lvb_sync(struct gfs2_glock *gl)
1715{
1716 gfs2_glmutex_lock(gl);
1717
1718 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count));
1719 if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl)))
1720 gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1721
1722 gfs2_glmutex_unlock(gl);
1723}
1724#endif /* 0 */
1725
1726static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1727 unsigned int state)
1728{
1729 struct gfs2_glock *gl;
1730
1731 gl = gfs2_glock_find(sdp, name);
1732 if (!gl)
1733 return;
1734
1735 if (gl->gl_ops->go_callback)
1736 gl->gl_ops->go_callback(gl, state);
1737 handle_callback(gl, state);
1738
1739 spin_lock(&gl->gl_spin);
1740 run_queue(gl);
1741 spin_unlock(&gl->gl_spin);
1742
1743 gfs2_glock_put(gl);
1744}
1745
1746/**
1747 * gfs2_glock_cb - Callback used by locking module
1748 * @fsdata: Pointer to the superblock
1749 * @type: Type of callback
1750 * @data: Type dependent data pointer
1751 *
1752 * Called by the locking module when it wants to tell us something.
1753 * Either we need to drop a lock, one of our ASYNC requests completed, or
1754 * a journal from another client needs to be recovered.
1755 */
1756
1757void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
1758{
1759 struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata;
1760
1761 switch (type) {
1762 case LM_CB_NEED_E:
1763 blocking_cb(sdp, data, LM_ST_UNLOCKED);
1764 return;
1765
1766 case LM_CB_NEED_D:
1767 blocking_cb(sdp, data, LM_ST_DEFERRED);
1768 return;
1769
1770 case LM_CB_NEED_S:
1771 blocking_cb(sdp, data, LM_ST_SHARED);
1772 return;
1773
1774 case LM_CB_ASYNC: {
1775 struct lm_async_cb *async = data;
1776 struct gfs2_glock *gl;
1777
1778 gl = gfs2_glock_find(sdp, &async->lc_name);
1779 if (gfs2_assert_warn(sdp, gl))
1780 return;
1781 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1782 gl->gl_req_bh(gl, async->lc_ret);
1783 gfs2_glock_put(gl);
1784 return;
1785 }
1786
1787 case LM_CB_NEED_RECOVERY:
1788 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1789 if (sdp->sd_recoverd_process)
1790 wake_up_process(sdp->sd_recoverd_process);
1791 return;
1792
1793 case LM_CB_DROPLOCKS:
1794 gfs2_gl_hash_clear(sdp, NO_WAIT);
1795 gfs2_quota_scan(sdp);
1796 return;
1797
1798 default:
1799 gfs2_assert_warn(sdp, 0);
1800 return;
1801 }
1802}
1803
1804/**
1805 * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an
1806 * iopen glock from memory
1807 * @io_gl: the iopen glock
1808 * @state: the state into which the glock should be put
1809 *
1810 */
1811
1812void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state)
1813{
1814
1815 if (state != LM_ST_UNLOCKED)
1816 return;
1817 /* FIXME: remove this? */
1818}
1819
1820/**
1821 * demote_ok - Check to see if it's ok to unlock a glock
1822 * @gl: the glock
1823 *
1824 * Returns: 1 if it's ok
1825 */
1826
1827static int demote_ok(struct gfs2_glock *gl)
1828{
1829 struct gfs2_sbd *sdp = gl->gl_sbd;
1830 struct gfs2_glock_operations *glops = gl->gl_ops;
1831 int demote = 1;
1832
1833 if (test_bit(GLF_STICKY, &gl->gl_flags))
1834 demote = 0;
1835 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1836 demote = time_after_eq(jiffies,
1837 gl->gl_stamp +
1838 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1839 else if (glops->go_demote_ok)
1840 demote = glops->go_demote_ok(gl);
1841
1842 return demote;
1843}
1844
1845/**
1846 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1847 * @gl: the glock
1848 *
1849 */
1850
1851void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1852{
1853 struct gfs2_sbd *sdp = gl->gl_sbd;
1854
1855 spin_lock(&sdp->sd_reclaim_lock);
1856 if (list_empty(&gl->gl_reclaim)) {
1857 gfs2_glock_hold(gl);
1858 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1859 atomic_inc(&sdp->sd_reclaim_count);
1860 }
1861 spin_unlock(&sdp->sd_reclaim_lock);
1862
1863 wake_up(&sdp->sd_reclaim_wq);
1864}
1865
1866/**
1867 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1868 * @sdp: the filesystem
1869 *
1870 * Called from the gfs2_glockd() glock reclaim daemon, or when promoting a
1871 * different glock and we notice that there are a lot of glocks in the
1872 * reclaim list.
1873 *
1874 */
1875
1876void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1877{
1878 struct gfs2_glock *gl;
1879
1880 spin_lock(&sdp->sd_reclaim_lock);
1881 if (list_empty(&sdp->sd_reclaim_list)) {
1882 spin_unlock(&sdp->sd_reclaim_lock);
1883 return;
1884 }
1885 gl = list_entry(sdp->sd_reclaim_list.next,
1886 struct gfs2_glock, gl_reclaim);
1887 list_del_init(&gl->gl_reclaim);
1888 spin_unlock(&sdp->sd_reclaim_lock);
1889
1890 atomic_dec(&sdp->sd_reclaim_count);
1891 atomic_inc(&sdp->sd_reclaimed);
1892
1893 if (gfs2_glmutex_trylock(gl)) {
1894 if (queue_empty(gl, &gl->gl_holders) &&
1895 gl->gl_state != LM_ST_UNLOCKED &&
1896 demote_ok(gl))
1897 handle_callback(gl, LM_ST_UNLOCKED);
1898 gfs2_glmutex_unlock(gl);
1899 }
1900
1901 gfs2_glock_put(gl);
1902}
1903
1904/**
1905 * examine_bucket - Call a function for each glock in a hash bucket
1906 * @examiner: the function
1907 * @sdp: the filesystem
1908 * @bucket: the bucket
1909 *
1910 * Returns: 1 if the bucket has entries
1911 */
1912
1913static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1914 struct gfs2_gl_hash_bucket *bucket)
1915{
1916 struct glock_plug plug;
1917 struct list_head *tmp;
1918 struct gfs2_glock *gl;
1919 int entries;
1920
1921 /* Add "plug" to end of bucket list, work back up list from there */
1922 memset(&plug.gl_flags, 0, sizeof(unsigned long));
1923 set_bit(GLF_PLUG, &plug.gl_flags);
1924
1925 write_lock(&bucket->hb_lock);
1926 list_add(&plug.gl_list, &bucket->hb_list);
1927 write_unlock(&bucket->hb_lock);
1928
1929 for (;;) {
1930 write_lock(&bucket->hb_lock);
1931
1932 for (;;) {
1933 tmp = plug.gl_list.next;
1934
1935 if (tmp == &bucket->hb_list) {
1936 list_del(&plug.gl_list);
1937 entries = !list_empty(&bucket->hb_list);
1938 write_unlock(&bucket->hb_lock);
1939 return entries;
1940 }
1941 gl = list_entry(tmp, struct gfs2_glock, gl_list);
1942
1943 /* Move plug up list */
1944 list_move(&plug.gl_list, &gl->gl_list);
1945
1946 if (test_bit(GLF_PLUG, &gl->gl_flags))
1947 continue;
1948
1949 /* examiner() must glock_put() */
1950 gfs2_glock_hold(gl);
1951
1952 break;
1953 }
1954
1955 write_unlock(&bucket->hb_lock);
1956
1957 examiner(gl);
1958 }
1959}
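/*
 * Editorial note on the "plug" technique above: the dummy plug entry
 * remembers the iteration position inside the bucket list, so hb_lock
 * can be dropped while examiner() runs (it may sleep and must call
 * gfs2_glock_put()). Each pass re-takes the lock, moves the plug past
 * the next real glock, and hands that glock to the examiner; entries
 * with GLF_PLUG set are other walkers' markers and are skipped.
 */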
1960
1961/**
1962 * scan_glock - look at a glock and see if we can reclaim it
1963 * @gl: the glock to look at
1964 *
1965 */
1966
1967static void scan_glock(struct gfs2_glock *gl)
1968{
1969 if (gl->gl_ops == &gfs2_inode_glops)
1970 goto out;
1971
1972 if (gfs2_glmutex_trylock(gl)) {
1973 if (queue_empty(gl, &gl->gl_holders) &&
1974 gl->gl_state != LM_ST_UNLOCKED &&
1975 demote_ok(gl))
1976 goto out_schedule;
1977 gfs2_glmutex_unlock(gl);
1978 }
1979out:
1980 gfs2_glock_put(gl);
1981 return;
1982
1983out_schedule:
1984 gfs2_glmutex_unlock(gl);
1985 gfs2_glock_schedule_for_reclaim(gl);
1986 gfs2_glock_put(gl);
1987}
1988
1989/**
1990 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1991 * @sdp: the filesystem
1992 *
1993 */
1994
1995void gfs2_scand_internal(struct gfs2_sbd *sdp)
1996{
1997 unsigned int x;
1998
1999 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2000 examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
2001 cond_resched();
2002 }
2003}
2004
2005/**
2006 * clear_glock - look at a glock and see if we can free it from the glock cache
2007 * @gl: the glock to look at
2008 *
2009 */
2010
2011static void clear_glock(struct gfs2_glock *gl)
2012{
2013 struct gfs2_sbd *sdp = gl->gl_sbd;
2014 int released;
2015
2016 spin_lock(&sdp->sd_reclaim_lock);
2017 if (!list_empty(&gl->gl_reclaim)) {
2018 list_del_init(&gl->gl_reclaim);
2019 atomic_dec(&sdp->sd_reclaim_count);
2020 spin_unlock(&sdp->sd_reclaim_lock);
2021 released = gfs2_glock_put(gl);
2022 gfs2_assert(sdp, !released);
2023 } else {
2024 spin_unlock(&sdp->sd_reclaim_lock);
2025 }
2026
2027 if (gfs2_glmutex_trylock(gl)) {
2028 if (queue_empty(gl, &gl->gl_holders) &&
2029 gl->gl_state != LM_ST_UNLOCKED)
2030 handle_callback(gl, LM_ST_UNLOCKED);
2031
2032 gfs2_glmutex_unlock(gl);
2033 }
2034
2035 gfs2_glock_put(gl);
2036}
2037
2038/**
2039 * gfs2_gl_hash_clear - Empty out the glock hash table
2040 * @sdp: the filesystem
2041 * @wait: wait until it's all gone
2042 *
2043 * Called when unmounting the filesystem, or when inter-node lock manager
2044 * requests DROPLOCKS because it is running out of capacity.
2045 */
2046
2047void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
2048{
2049 unsigned long t;
2050 unsigned int x;
2051 int cont;
2052
2053 t = jiffies;
2054
2055 for (;;) {
2056 cont = 0;
2057
2058 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
2059 if (examine_bucket(clear_glock, sdp,
2060 &sdp->sd_gl_hash[x]))
2061 cont = 1;
2062
2063 if (!wait || !cont)
2064 break;
2065
2066 if (time_after_eq(jiffies,
2067 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2068 fs_warn(sdp, "Unmount seems to be stalled. "
2069 "Dumping lock state...\n");
2070 gfs2_dump_lockstate(sdp);
2071 t = jiffies;
2072 }
2073
2074 invalidate_inodes(sdp->sd_vfs);
2075 msleep(10);
2076 }
2077}
2078
2079/*
2080 * Diagnostic routines to help debug distributed deadlock
2081 */
2082
2083/**
2084 * dump_holder - print information about a glock holder
2085 * @str: a string naming the type of holder
2086 * @gh: the glock holder
2087 *
2088 * Returns: 0 on success, -ENOBUFS when we run out of space
2089 */
2090
2091static int dump_holder(char *str, struct gfs2_holder *gh)
2092{
2093 unsigned int x;
2094 int error = -ENOBUFS;
2095
2096 printk(KERN_INFO " %s\n", str);
2097 printk(KERN_INFO " owner = %ld\n",
2098 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2099 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2100 printk(KERN_INFO " gh_flags =");
2101 for (x = 0; x < 32; x++)
2102 if (gh->gh_flags & (1 << x))
2103 printk(" %u", x);
2104 printk(" \n");
2105 printk(KERN_INFO " error = %d\n", gh->gh_error);
2106 printk(KERN_INFO " gh_iflags =");
2107 for (x = 0; x < 32; x++)
2108 if (test_bit(x, &gh->gh_iflags))
2109 printk(" %u", x);
2110 printk(" \n");
2111 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2112
2113 error = 0;
2114
2115 return error;
2116}
2117
2118/**
2119 * dump_inode - print information about an inode
2120 * @ip: the inode
2121 *
2122 * Returns: 0 on success, -ENOBUFS when we run out of space
2123 */
2124
2125static int dump_inode(struct gfs2_inode *ip)
2126{
2127 unsigned int x;
2128 int error = -ENOBUFS;
2129
2130 printk(KERN_INFO " Inode:\n");
2131 printk(KERN_INFO " num = %llu %llu\n",
2132 (unsigned long long)ip->i_num.no_formal_ino,
2133 (unsigned long long)ip->i_num.no_addr);
2134 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2135 printk(KERN_INFO " i_flags =");
2136 for (x = 0; x < 32; x++)
2137 if (test_bit(x, &ip->i_flags))
2138 printk(" %u", x);
2139 printk(" \n");
2140
2141 error = 0;
2142
2143 return error;
2144}
2145
2146/**
2147 * dump_glock - print information about a glock
2148 * @gl: the glock
2149 *
2151 * Returns: 0 on success, -ENOBUFS when we run out of space
2152 */
2153
2154static int dump_glock(struct gfs2_glock *gl)
2155{
2156 struct gfs2_holder *gh;
2157 unsigned int x;
2158 int error = -ENOBUFS;
2159
2160 spin_lock(&gl->gl_spin);
2161
2162 printk(KERN_INFO "Glock 0x%p (%u, %llu)\n",
2163 gl,
2164 gl->gl_name.ln_type,
2165 (unsigned long long)gl->gl_name.ln_number);
2166 printk(KERN_INFO " gl_flags =");
2167 for (x = 0; x < 32; x++)
2168 if (test_bit(x, &gl->gl_flags))
2169 printk(" %u", x);
2170 printk(" \n");
2171 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount));
2172 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2173 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner ? gl->gl_owner->comm : "none");
2174 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
2175 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2176 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2177 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2178 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2179 printk(KERN_INFO " le = %s\n",
2180 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2181 printk(KERN_INFO " reclaim = %s\n",
2182 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2183 if (gl->gl_aspace)
2184 printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n",
2185 gl->gl_aspace,
2186 gl->gl_aspace->i_mapping->nrpages);
2187 else
2188 printk(KERN_INFO " aspace = no\n");
2189 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2190 if (gl->gl_req_gh) {
2191 error = dump_holder("Request", gl->gl_req_gh);
2192 if (error)
2193 goto out;
2194 }
2195 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2196 error = dump_holder("Holder", gh);
2197 if (error)
2198 goto out;
2199 }
2200 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2201 error = dump_holder("Waiter1", gh);
2202 if (error)
2203 goto out;
2204 }
2205 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2206 error = dump_holder("Waiter2", gh);
2207 if (error)
2208 goto out;
2209 }
2210 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2211 error = dump_holder("Waiter3", gh);
2212 if (error)
2213 goto out;
2214 }
2215 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2216 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2217 list_empty(&gl->gl_holders)) {
2218 error = dump_inode(gl->gl_object);
2219 if (error)
2220 goto out;
2221 } else {
2222 error = -ENOBUFS;
2223 printk(KERN_INFO " Inode: busy\n");
2224 }
2225 }
2226
2227 error = 0;
2228
2229 out:
2230 spin_unlock(&gl->gl_spin);
2231
2232 return error;
2233}
2234
2235/**
2236 * gfs2_dump_lockstate - print out the current lockstate
2237 * @sdp: the filesystem
2238 *
2239 * The lock state is dumped to the console.
2241 *
2242 */
2243
2244static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2245{
2246 struct gfs2_gl_hash_bucket *bucket;
2247 struct gfs2_glock *gl;
2248 unsigned int x;
2249 int error = 0;
2250
2251 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2252 bucket = &sdp->sd_gl_hash[x];
2253
2254 read_lock(&bucket->hb_lock);
2255
2256 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
2257 if (test_bit(GLF_PLUG, &gl->gl_flags))
2258 continue;
2259
2260 error = dump_glock(gl);
2261 if (error)
2262 break;
2263 }
2264
2265 read_unlock(&bucket->hb_lock);
2266
2267 if (error)
2268 break;
2269 }
2270
2272 return error;
2273}
2274
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..fdf58db44ae3
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,152 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13/* Flags for lock requests; used in gfs2_holder gh_flag field.
14 From lm_interface.h:
15#define LM_FLAG_TRY 0x00000001
16#define LM_FLAG_TRY_1CB 0x00000002
17#define LM_FLAG_NOEXP 0x00000004
18#define LM_FLAG_ANY 0x00000008
19#define LM_FLAG_PRIORITY 0x00000010 */
20
21#define GL_LOCAL_EXCL 0x00000020
22#define GL_ASYNC 0x00000040
23#define GL_EXACT 0x00000080
24#define GL_SKIP 0x00000100
25#define GL_ATIME 0x00000200
26#define GL_NOCACHE 0x00000400
27#define GL_SYNC 0x00000800
28#define GL_NOCANCEL 0x00001000
29#define GL_AOP 0x00004000
30#define GL_DUMP 0x00008000
31
32#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14
34
35static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{
37 struct gfs2_holder *gh;
38 int locked = 0;
39
40 /* Look in glock's list of holders for one with current task as owner */
41 spin_lock(&gl->gl_spin);
42 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
43 if (gh->gh_owner == current) {
44 locked = 1;
45 break;
46 }
47 }
48 spin_unlock(&gl->gl_spin);
49
50 return locked;
51}
52
53static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
54{
55 return (gl->gl_state == LM_ST_EXCLUSIVE);
56}
57
58static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
59{
60 return (gl->gl_state == LM_ST_DEFERRED);
61}
62
63static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
64{
65 return (gl->gl_state == LM_ST_SHARED);
66}
67
68static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
69{
70 int ret;
71 spin_lock(&gl->gl_spin);
72 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
73 spin_unlock(&gl->gl_spin);
74 return ret;
75}
76
77int gfs2_glock_get(struct gfs2_sbd *sdp,
78 uint64_t number, struct gfs2_glock_operations *glops,
79 int create, struct gfs2_glock **glp);
80void gfs2_glock_hold(struct gfs2_glock *gl);
81int gfs2_glock_put(struct gfs2_glock *gl);
82void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
83 struct gfs2_holder *gh);
84void gfs2_holder_reinit(unsigned int state, unsigned flags,
85 struct gfs2_holder *gh);
86void gfs2_holder_uninit(struct gfs2_holder *gh);
87
88void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
89void gfs2_glock_drop_th(struct gfs2_glock *gl);
90
91int gfs2_glock_nq(struct gfs2_holder *gh);
92int gfs2_glock_poll(struct gfs2_holder *gh);
93int gfs2_glock_wait(struct gfs2_holder *gh);
94void gfs2_glock_dq(struct gfs2_holder *gh);
95
96int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
97
98void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
99int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
100 uint64_t number, struct gfs2_glock_operations *glops,
101 unsigned int state, int flags, struct gfs2_holder *gh);
102
103int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
104void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
105void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
106
107void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
108 struct gfs2_glock_operations *glops,
109 unsigned int state, int flags);
110void gfs2_glock_inode_squish(struct inode *inode);
111
112/**
113 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
114 * @gl: the glock
115 * @state: the state we're requesting
116 * @flags: the modifier flags
117 * @gh: the holder structure
118 *
119 * Returns: 0, GLR_*, or errno
120 */
121
122static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
123 unsigned int state, int flags,
124 struct gfs2_holder *gh)
125{
126 int error;
127
128 gfs2_holder_init(gl, state, flags, gh);
129
130 error = gfs2_glock_nq(gh);
131 if (error)
132 gfs2_holder_uninit(gh);
133
134 return error;
135}
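/*
 * Illustrative sketch (editorial): gfs2_glock_nq_init() pairs with
 * gfs2_glock_dq_uninit() above to give the common one-shot pattern:
 *
 *	struct gfs2_holder gh;
 *	int error;
 *
 *	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
 *	if (error)
 *		return error;
 *	... critical section ...
 *	gfs2_glock_dq_uninit(&gh);
 */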
136
137/* Lock Value Block functions */
138
139int gfs2_lvb_hold(struct gfs2_glock *gl);
140void gfs2_lvb_unhold(struct gfs2_glock *gl);
141
142void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data);
143
144void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state);
145
146void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
147void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
148
149void gfs2_scand_internal(struct gfs2_sbd *sdp);
150void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
151
152#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..75d4c50cff45
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,564 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "bmap.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "recovery.h"
27#include "rgrp.h"
28#include "util.h"
29
30
31/**
32 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
33 * @gl: the glock
34 *
35 */
36
37static void gfs2_pte_inval(struct gfs2_glock *gl)
38{
39 struct gfs2_inode *ip;
40 struct inode *inode;
41
42 ip = gl->gl_object;
43 if (!ip || !S_ISREG(ip->i_di.di_mode))
44 return;
45 inode = &ip->i_inode;
46
47 if (!test_bit(GIF_PAGED, &ip->i_flags))
48 return;
49
50 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
51
52 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
53 set_bit(GLF_DIRTY, &gl->gl_flags);
54
55 clear_bit(GIF_SW_PAGED, &ip->i_flags);
56}
57
58/**
59 * gfs2_page_inval - Invalidate all pages associated with a glock
60 * @gl: the glock
61 *
62 */
63
64static void gfs2_page_inval(struct gfs2_glock *gl)
65{
66 struct gfs2_inode *ip;
67 struct inode *inode;
68
69 ip = gl->gl_object;
70 if (!ip || !S_ISREG(ip->i_di.di_mode))
71 return;
72 inode = &ip->i_inode;
73
74 truncate_inode_pages(inode->i_mapping, 0);
75 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
76 clear_bit(GIF_PAGED, &ip->i_flags);
77}
78
79/**
80 * gfs2_page_sync - Sync the data pages (not metadata) associated with a glock
81 * @gl: the glock
82 * @flags: DIO_START | DIO_WAIT
83 *
84 * Syncs data (not metadata) for a regular file.
85 * No-op for all other types.
86 */
87
88static void gfs2_page_sync(struct gfs2_glock *gl, int flags)
89{
90 struct gfs2_inode *ip;
91 struct inode *inode;
92 struct address_space *mapping;
93 int error = 0;
94
95 ip = gl->gl_object;
96 if (!ip || !S_ISREG(ip->i_di.di_mode))
97 return;
98 inode = &ip->i_inode;
99
100 mapping = inode->i_mapping;
101
102 if (flags & DIO_START)
103 filemap_fdatawrite(mapping);
104 if (!error && (flags & DIO_WAIT))
105 error = filemap_fdatawait(mapping);
106
107 /* Put back any errors cleared by filemap_fdatawait()
108 so they can be caught by someone who can pass them
109 up to user space. */
110
111 if (error == -ENOSPC)
112 set_bit(AS_ENOSPC, &mapping->flags);
113 else if (error)
114 set_bit(AS_EIO, &mapping->flags);
115
116}
117
118/**
119 * meta_go_sync - sync out the metadata for this glock
120 * @gl: the glock
121 * @flags: DIO_*
122 *
123 * Called when demoting or unlocking an EX glock. We must flush
124 * to disk all dirty buffers/pages relating to this glock, and must
125 * not return to the caller to demote/unlock the glock until I/O is complete.
126 */
127
128static void meta_go_sync(struct gfs2_glock *gl, int flags)
129{
130 if (!(flags & DIO_METADATA))
131 return;
132
133 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
134 gfs2_log_flush(gl->gl_sbd, gl);
135 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
136 if (flags & DIO_RELEASE)
137 gfs2_ail_empty_gl(gl);
138 }
139
140 clear_bit(GLF_SYNC, &gl->gl_flags);
141}
142
143/**
144 * meta_go_inval - invalidate the metadata for this glock
145 * @gl: the glock
146 * @flags:
147 *
148 */
149
150static void meta_go_inval(struct gfs2_glock *gl, int flags)
151{
152 if (!(flags & DIO_METADATA))
153 return;
154
155 gfs2_meta_inval(gl);
156 gl->gl_vn++;
157}
158
159/**
160 * inode_go_xmote_th - promote/demote a glock
161 * @gl: the glock
162 * @state: the requested state
163 * @flags:
164 *
165 */
166
167static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
168 int flags)
169{
170 if (gl->gl_state != LM_ST_UNLOCKED)
171 gfs2_pte_inval(gl);
172 gfs2_glock_xmote_th(gl, state, flags);
173}
174
175/**
176 * inode_go_xmote_bh - After promoting/demoting a glock
177 * @gl: the glock
178 *
179 */
180
181static void inode_go_xmote_bh(struct gfs2_glock *gl)
182{
183 struct gfs2_holder *gh = gl->gl_req_gh;
184 struct buffer_head *bh;
185 int error;
186
187 if (gl->gl_state != LM_ST_UNLOCKED &&
188 (!gh || !(gh->gh_flags & GL_SKIP))) {
189 error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START,
190 &bh);
191 if (!error)
192 brelse(bh);
193 }
194}
195
196/**
197 * inode_go_drop_th - unlock a glock
198 * @gl: the glock
199 *
200 * Invoked from rq_demote().
201 * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for
202 * too long) is being purged from our node's glock cache; we're dropping it.
203 */
204
205static void inode_go_drop_th(struct gfs2_glock *gl)
206{
207 gfs2_pte_inval(gl);
208 gfs2_glock_drop_th(gl);
209}
210
211/**
212 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
213 * @gl: the glock protecting the inode
214 * @flags:
215 *
216 */
217
218static void inode_go_sync(struct gfs2_glock *gl, int flags)
219{
220 int meta = (flags & DIO_METADATA);
221 int data = (flags & DIO_DATA);
222
223 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
224 if (meta && data) {
225 gfs2_page_sync(gl, flags | DIO_START);
226 gfs2_log_flush(gl->gl_sbd, gl);
227 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
228 gfs2_page_sync(gl, flags | DIO_WAIT);
229 clear_bit(GLF_DIRTY, &gl->gl_flags);
230 } else if (meta) {
231 gfs2_log_flush(gl->gl_sbd, gl);
232 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
233 } else if (data)
234 gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT);
235 if (flags & DIO_RELEASE)
236 gfs2_ail_empty_gl(gl);
237 }
238
239 clear_bit(GLF_SYNC, &gl->gl_flags);
240}
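/*
 * Editorial note on the ordering above: data page writeback is started
 * first, the log is then flushed so journaled metadata is safe on
 * disk, the metadata buffers are synced, and only then do we wait for
 * the data pages; with DIO_RELEASE the AIL is also emptied before the
 * lock can be granted to another node.
 */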
241
242/**
243 * inode_go_inval - prepare an inode glock to be released
244 * @gl: the glock
245 * @flags:
246 *
247 */
248
249static void inode_go_inval(struct gfs2_glock *gl, int flags)
250{
251 int meta = (flags & DIO_METADATA);
252 int data = (flags & DIO_DATA);
253
254 if (meta) {
255 gfs2_meta_inval(gl);
256 gl->gl_vn++;
257 }
258 if (data)
259 gfs2_page_inval(gl);
260}
261
262/**
263 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
264 * @gl: the glock
265 *
266 * Returns: 1 if it's ok
267 */
268
269static int inode_go_demote_ok(struct gfs2_glock *gl)
270{
271 struct gfs2_sbd *sdp = gl->gl_sbd;
272 int demote = 0;
273
274 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
275 demote = 1;
276 else if (!sdp->sd_args.ar_localcaching &&
277 time_after_eq(jiffies, gl->gl_stamp +
278 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
279 demote = 1;
280
281 return demote;
282}
283
284/**
285 * inode_go_lock - operation done after an inode lock is locked by a process
286 * @gl: the glock
287 * @flags:
288 *
289 * Returns: errno
290 */
291
292static int inode_go_lock(struct gfs2_holder *gh)
293{
294 struct gfs2_glock *gl = gh->gh_gl;
295 struct gfs2_inode *ip = gl->gl_object;
296 int error = 0;
297
298 if (!ip)
299 return 0;
300
301 if (ip->i_vn != gl->gl_vn) {
302 error = gfs2_inode_refresh(ip);
303 if (error)
304 return error;
305 gfs2_inode_attr_in(ip);
306 }
307
308 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
309 (gl->gl_state == LM_ST_EXCLUSIVE) &&
310 (gh->gh_flags & GL_LOCAL_EXCL))
311 error = gfs2_truncatei_resume(ip);
312
313 return error;
314}
315
316/**
317 * inode_go_unlock - operation done before an inode lock is unlocked by a
318 * process
319 * @gl: the glock
320 * @flags:
321 *
322 */
323
324static void inode_go_unlock(struct gfs2_holder *gh)
325{
326 struct gfs2_glock *gl = gh->gh_gl;
327 struct gfs2_inode *ip = gl->gl_object;
328
329 if (ip) {
330 if (test_bit(GLF_DIRTY, &gl->gl_flags))
331 gfs2_inode_attr_in(ip);
332
333 gfs2_meta_cache_flush(ip);
334 }
335}
336
337/**
338 * inode_greedy -
339 * @gl: the glock
340 *
341 */
342
343static void inode_greedy(struct gfs2_glock *gl)
344{
345 struct gfs2_sbd *sdp = gl->gl_sbd;
346 struct gfs2_inode *ip = gl->gl_object;
347 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
348 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
349 unsigned int new_time;
350
351 spin_lock(&ip->i_spin);
352
353 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
354 new_time = ip->i_greedy + quantum;
355 if (new_time > max)
356 new_time = max;
357 } else {
358 new_time = ip->i_greedy - quantum;
359 if (!new_time || new_time > max)
360 new_time = 1;
361 }
362
363 ip->i_greedy = new_time;
364
365 spin_unlock(&ip->i_spin);
366
367 iput(&ip->i_inode);
368}
369
370/**
371 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
372 * @gl: the glock
373 *
374 * Returns: 1 if it's ok
375 */
376
377static int rgrp_go_demote_ok(struct gfs2_glock *gl)
378{
379 return !gl->gl_aspace->i_mapping->nrpages;
380}
381
382/**
383 * rgrp_go_lock - operation done after an rgrp lock is locked by
384 * a first holder on this node.
385 * @gl: the glock
386 * @flags:
387 *
388 * Returns: errno
389 */
390
391static int rgrp_go_lock(struct gfs2_holder *gh)
392{
393 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
394}
395
396/**
397 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
398 * a last holder on this node.
399 * @gl: the glock
400 * @flags:
401 *
402 */
403
404static void rgrp_go_unlock(struct gfs2_holder *gh)
405{
406 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
407}
408
409/**
410 * trans_go_xmote_th - promote/demote the transaction glock
411 * @gl: the glock
412 * @state: the requested state
413 * @flags:
414 *
415 */
416
417static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
418 int flags)
419{
420 struct gfs2_sbd *sdp = gl->gl_sbd;
421
422 if (gl->gl_state != LM_ST_UNLOCKED &&
423 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
424 gfs2_meta_syncfs(sdp);
425 gfs2_log_shutdown(sdp);
426 }
427
428 gfs2_glock_xmote_th(gl, state, flags);
429}
430
431/**
432 * trans_go_xmote_bh - After promoting/demoting the transaction glock
433 * @gl: the glock
434 *
435 */
436
437static void trans_go_xmote_bh(struct gfs2_glock *gl)
438{
439 struct gfs2_sbd *sdp = gl->gl_sbd;
440 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
441 struct gfs2_glock *j_gl = ip->i_gl;
442 struct gfs2_log_header head;
443 int error;
444
445 if (gl->gl_state != LM_ST_UNLOCKED &&
446 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
447 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
448 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
449
450 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
451 if (error)
452 gfs2_consist(sdp);
453 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
454 gfs2_consist(sdp);
455
456 /* Initialize the head of the log */
457 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
458 sdp->sd_log_sequence = head.lh_sequence + 1;
459 gfs2_log_pointers_init(sdp, head.lh_blkno);
460 }
461 }
462}
463
464/**
465 * trans_go_drop_th - unlock the transaction glock
466 * @gl: the glock
467 *
468 * We want to sync the device even with localcaching. Remember
469 * that localcaching journal replay only marks buffers dirty.
470 */
471
472static void trans_go_drop_th(struct gfs2_glock *gl)
473{
474 struct gfs2_sbd *sdp = gl->gl_sbd;
475
476 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
477 gfs2_meta_syncfs(sdp);
478 gfs2_log_shutdown(sdp);
479 }
480
481 gfs2_glock_drop_th(gl);
482}
483
484/**
485 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
486 * @gl: the glock
487 *
488 * Returns: 1 if it's ok
489 */
490
491static int quota_go_demote_ok(struct gfs2_glock *gl)
492{
493 return !atomic_read(&gl->gl_lvb_count);
494}
495
496struct gfs2_glock_operations gfs2_meta_glops = {
497 .go_xmote_th = gfs2_glock_xmote_th,
498 .go_drop_th = gfs2_glock_drop_th,
499 .go_type = LM_TYPE_META
500};
501
502struct gfs2_glock_operations gfs2_inode_glops = {
503 .go_xmote_th = inode_go_xmote_th,
504 .go_xmote_bh = inode_go_xmote_bh,
505 .go_drop_th = inode_go_drop_th,
506 .go_sync = inode_go_sync,
507 .go_inval = inode_go_inval,
508 .go_demote_ok = inode_go_demote_ok,
509 .go_lock = inode_go_lock,
510 .go_unlock = inode_go_unlock,
511 .go_greedy = inode_greedy,
512 .go_type = LM_TYPE_INODE
513};
514
515struct gfs2_glock_operations gfs2_rgrp_glops = {
516 .go_xmote_th = gfs2_glock_xmote_th,
517 .go_drop_th = gfs2_glock_drop_th,
518 .go_sync = meta_go_sync,
519 .go_inval = meta_go_inval,
520 .go_demote_ok = rgrp_go_demote_ok,
521 .go_lock = rgrp_go_lock,
522 .go_unlock = rgrp_go_unlock,
523 .go_type = LM_TYPE_RGRP
524};
525
526struct gfs2_glock_operations gfs2_trans_glops = {
527 .go_xmote_th = trans_go_xmote_th,
528 .go_xmote_bh = trans_go_xmote_bh,
529 .go_drop_th = trans_go_drop_th,
530 .go_type = LM_TYPE_NONDISK
531};
532
533struct gfs2_glock_operations gfs2_iopen_glops = {
534 .go_xmote_th = gfs2_glock_xmote_th,
535 .go_drop_th = gfs2_glock_drop_th,
536 .go_callback = gfs2_iopen_go_callback,
537 .go_type = LM_TYPE_IOPEN
538};
539
540struct gfs2_glock_operations gfs2_flock_glops = {
541 .go_xmote_th = gfs2_glock_xmote_th,
542 .go_drop_th = gfs2_glock_drop_th,
543 .go_type = LM_TYPE_FLOCK
544};
545
546struct gfs2_glock_operations gfs2_nondisk_glops = {
547 .go_xmote_th = gfs2_glock_xmote_th,
548 .go_drop_th = gfs2_glock_drop_th,
549 .go_type = LM_TYPE_NONDISK
550};
551
552struct gfs2_glock_operations gfs2_quota_glops = {
553 .go_xmote_th = gfs2_glock_xmote_th,
554 .go_drop_th = gfs2_glock_drop_th,
555 .go_demote_ok = quota_go_demote_ok,
556 .go_type = LM_TYPE_QUOTA
557};
558
559struct gfs2_glock_operations gfs2_journal_glops = {
560 .go_xmote_th = gfs2_glock_xmote_th,
561 .go_drop_th = gfs2_glock_drop_th,
562 .go_type = LM_TYPE_JOURNAL
563};
564
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..5c1e9491024f
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13extern struct gfs2_glock_operations gfs2_meta_glops;
14extern struct gfs2_glock_operations gfs2_inode_glops;
15extern struct gfs2_glock_operations gfs2_rgrp_glops;
16extern struct gfs2_glock_operations gfs2_trans_glops;
17extern struct gfs2_glock_operations gfs2_iopen_glops;
18extern struct gfs2_glock_operations gfs2_flock_glops;
19extern struct gfs2_glock_operations gfs2_nondisk_glops;
20extern struct gfs2_glock_operations gfs2_quota_glops;
21extern struct gfs2_glock_operations gfs2_journal_glops;
22
23#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..77f0903d2f3e
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,660 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#define DIO_FORCE 0x00000001
14#define DIO_CLEAN 0x00000002
15#define DIO_DIRTY 0x00000004
16#define DIO_START 0x00000008
17#define DIO_WAIT 0x00000010
18#define DIO_METADATA 0x00000020
19#define DIO_DATA 0x00000040
20#define DIO_RELEASE 0x00000080
21#define DIO_ALL 0x00000100
22
23struct gfs2_log_operations;
24struct gfs2_log_element;
25struct gfs2_bitmap;
26struct gfs2_rgrpd;
27struct gfs2_bufdata;
28struct gfs2_glock_operations;
29struct gfs2_holder;
30struct gfs2_glock;
31struct gfs2_alloc;
32struct gfs2_inode;
33struct gfs2_file;
34struct gfs2_revoke;
35struct gfs2_revoke_replay;
36struct gfs2_quota_data;
37struct gfs2_log_buf;
38struct gfs2_trans;
39struct gfs2_ail;
40struct gfs2_jdesc;
41struct gfs2_args;
42struct gfs2_tune;
43struct gfs2_gl_hash_bucket;
44struct gfs2_sbd;
45
46typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
47
48/*
49 * Structure of operations that are associated with each
50 * type of element in the log.
51 */
52
53struct gfs2_log_operations {
54 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
55 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
56 void (*lo_before_commit) (struct gfs2_sbd *sdp);
57 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
58 void (*lo_before_scan) (struct gfs2_jdesc *jd,
59 struct gfs2_log_header *head, int pass);
60 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
61 struct gfs2_log_descriptor *ld, __be64 *ptr,
62 int pass);
63 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
64 const char *lo_name;
65};
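/*
 * Editorial note, not part of the original patch: the first four hooks
 * run while building and flushing the log, the *_scan hooks run during
 * journal replay, once per pass.  The hooks are optional, so a minimal
 * element type could plausibly look like:
 *
 *	static void example_add(struct gfs2_sbd *sdp,
 *				struct gfs2_log_element *le) { }
 *
 *	static const struct gfs2_log_operations example_lops = {
 *		.lo_add  = example_add,
 *		.lo_name = "example",
 *	};
 */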
66
67struct gfs2_log_element {
68 struct list_head le_list;
69 const struct gfs2_log_operations *le_ops;
70};
71
72struct gfs2_bitmap {
73 struct buffer_head *bi_bh;
74 char *bi_clone;
75 uint32_t bi_offset;
76 uint32_t bi_start;
77 uint32_t bi_len;
78};
79
80struct gfs2_rgrpd {
81 struct list_head rd_list; /* Link with superblock */
82 struct list_head rd_list_mru;
83 struct list_head rd_recent; /* Recently used rgrps */
84 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
85 struct gfs2_rindex rd_ri;
86 struct gfs2_rgrp rd_rg;
87 uint64_t rd_rg_vn;
88 struct gfs2_bitmap *rd_bits;
89 unsigned int rd_bh_count;
90 struct mutex rd_mutex;
91 uint32_t rd_free_clone;
92 struct gfs2_log_element rd_le;
93 uint32_t rd_last_alloc_data;
94 uint32_t rd_last_alloc_meta;
95 struct gfs2_sbd *rd_sbd;
96};
97
98enum gfs2_state_bits {
99 BH_Pinned = BH_PrivateStart,
100 BH_Escaped = BH_PrivateStart + 1,
101};
102
103BUFFER_FNS(Pinned, pinned)
104TAS_BUFFER_FNS(Pinned, pinned)
105BUFFER_FNS(Escaped, escaped)
106TAS_BUFFER_FNS(Escaped, escaped)
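/*
 * Editorial note, not part of the original patch: BUFFER_FNS() and
 * TAS_BUFFER_FNS() come from <linux/buffer_head.h> and expand to the
 * usual bit helpers, so the four lines above provide, among others:
 *
 *	set_buffer_pinned(bh);
 *	clear_buffer_pinned(bh);
 *	if (buffer_pinned(bh))
 *		...
 *	if (test_set_buffer_escaped(bh))	(test_and_set_bit)
 *		...
 */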
107
108struct gfs2_bufdata {
109 struct buffer_head *bd_bh;
110 struct gfs2_glock *bd_gl;
111
112 struct list_head bd_list_tr;
113 struct gfs2_log_element bd_le;
114
115 struct gfs2_ail *bd_ail;
116 struct list_head bd_ail_st_list;
117 struct list_head bd_ail_gl_list;
118};
119
120struct gfs2_glock_operations {
121 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
122 int flags);
123 void (*go_xmote_bh) (struct gfs2_glock * gl);
124 void (*go_drop_th) (struct gfs2_glock * gl);
125 void (*go_drop_bh) (struct gfs2_glock * gl);
126 void (*go_sync) (struct gfs2_glock * gl, int flags);
127 void (*go_inval) (struct gfs2_glock * gl, int flags);
128 int (*go_demote_ok) (struct gfs2_glock * gl);
129 int (*go_lock) (struct gfs2_holder * gh);
130 void (*go_unlock) (struct gfs2_holder * gh);
131 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
132 void (*go_greedy) (struct gfs2_glock * gl);
133 int go_type;
134};
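/*
 * Editorial note, not part of the original patch: the glock core
 * dispatches through this vtable and treats most hooks as optional.
 * A call site checking for an absent hook might look like:
 *
 *	static int glock_demote_ok(struct gfs2_glock *gl)
 *	{
 *		const struct gfs2_glock_operations *ops = gl->gl_ops;
 *
 *		return ops->go_demote_ok ? ops->go_demote_ok(gl) : 1;
 *	}
 *
 * with types that omit the hook (e.g. gfs2_meta_glops) defaulting to
 * "ok to demote".
 */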
135
136enum {
137 /* Actions */
138 HIF_MUTEX = 0,
139 HIF_PROMOTE = 1,
140 HIF_DEMOTE = 2,
141 HIF_GREEDY = 3,
142
143 /* States */
144 HIF_ALLOCED = 4,
145 HIF_DEALLOC = 5,
146 HIF_HOLDER = 6,
147 HIF_FIRST = 7,
148 HIF_ABORTED = 9,
149};
150
151struct gfs2_holder {
152 struct list_head gh_list;
153
154 struct gfs2_glock *gh_gl;
155 struct task_struct *gh_owner;
156 unsigned int gh_state;
157 unsigned gh_flags;
158
159 int gh_error;
160 unsigned long gh_iflags;
161 struct completion gh_wait;
162 unsigned long gh_ip;
163};
164
165enum {
166 GLF_PLUG = 0,
167 GLF_LOCK = 1,
168 GLF_STICKY = 2,
169 GLF_PREFETCH = 3,
170 GLF_SYNC = 4,
171 GLF_DIRTY = 5,
172 GLF_SKIP_WAITERS2 = 6,
173 GLF_GREEDY = 7,
174};
175
176struct gfs2_glock {
177 struct list_head gl_list;
178 unsigned long gl_flags; /* GLF_... */
179 struct lm_lockname gl_name;
180 struct kref gl_ref;
181
182 spinlock_t gl_spin;
183
184 unsigned int gl_state;
185 struct task_struct *gl_owner;
186 unsigned long gl_ip;
187 struct list_head gl_holders;
188 struct list_head gl_waiters1; /* HIF_MUTEX */
189 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
190 struct list_head gl_waiters3; /* HIF_PROMOTE */
191
192 struct gfs2_glock_operations *gl_ops;
193
194 struct gfs2_holder *gl_req_gh;
195 gfs2_glop_bh_t gl_req_bh;
196
197 lm_lock_t *gl_lock;
198 char *gl_lvb;
199 atomic_t gl_lvb_count;
200
201 uint64_t gl_vn;
202 unsigned long gl_stamp;
203 void *gl_object;
204
205 struct gfs2_gl_hash_bucket *gl_bucket;
206 struct list_head gl_reclaim;
207
208 struct gfs2_sbd *gl_sbd;
209
210 struct inode *gl_aspace;
211 struct gfs2_log_element gl_le;
212 struct list_head gl_ail_list;
213 atomic_t gl_ail_count;
214};
215
216struct gfs2_alloc {
217 /* Quota stuff */
218
219 struct gfs2_quota_data *al_qd[4];
220 struct gfs2_holder al_qd_ghs[4];
221 unsigned int al_qd_num;
222
223 u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
224 u32 al_alloced; /* Filled in by gfs2_alloc_*() */
225
226 /* Filled in by gfs2_inplace_reserve() */
227
228 unsigned int al_line;
229 char *al_file;
230 struct gfs2_holder al_ri_gh;
231 struct gfs2_holder al_rgd_gh;
232 struct gfs2_rgrpd *al_rgd;
233
234};
235
236enum {
237 GIF_QD_LOCKED = 1,
238 GIF_PAGED = 2,
239 GIF_SW_PAGED = 3,
240};
241
242struct gfs2_inode {
243 struct inode i_inode;
244 struct gfs2_inum i_num;
245
246 unsigned long i_flags; /* GIF_... */
247
248 uint64_t i_vn;
249 struct gfs2_dinode i_di; /* To be replaced by ref to block */
250
251 struct gfs2_glock *i_gl; /* Move into i_gh? */
252 struct gfs2_holder i_iopen_gh;
253 struct gfs2_holder i_gh; /* for prepare/commit_write only */
254 struct gfs2_alloc i_alloc;
255 uint64_t i_last_rg_alloc;
256
257 spinlock_t i_spin;
258 struct rw_semaphore i_rw_mutex;
259 unsigned int i_greedy;
260 unsigned long i_last_pfault;
261
262 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
263};
264
265/*
266 * Since i_inode is the first element of struct gfs2_inode,
267 * this is effectively a cast.
268 */
269static inline struct gfs2_inode *GFS2_I(struct inode *inode)
270{
271 return container_of(inode, struct gfs2_inode, i_inode);
272}
273
274/* To be removed? */
275static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
276{
277 return inode->i_sb->s_fs_info;
278}
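/*
 * Editorial note, not part of the original patch: because i_inode sits
 * at offset zero, GFS2_I() compiles down to a pointer cast, and the
 * reverse direction is simply &ip->i_inode.  Typical round trip in a
 * VFS method (sketch):
 *
 *	struct gfs2_inode *ip = GFS2_I(inode);
 *	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 */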
279
280enum {
281 GFF_DID_DIRECT_ALLOC = 0,
282 GFF_EXLOCK = 1,
283};
284
285struct gfs2_file {
286 unsigned long f_flags; /* GFF_... */
287 struct mutex f_fl_mutex;
288 struct gfs2_holder f_fl_gh;
289};
290
291struct gfs2_revoke {
292 struct gfs2_log_element rv_le;
293 uint64_t rv_blkno;
294};
295
296struct gfs2_revoke_replay {
297 struct list_head rr_list;
298 uint64_t rr_blkno;
299 unsigned int rr_where;
300};
301
302enum {
303 QDF_USER = 0,
304 QDF_CHANGE = 1,
305 QDF_LOCKED = 2,
306};
307
308struct gfs2_quota_lvb {
309 uint32_t qb_magic;
310 uint32_t __pad;
311 uint64_t qb_limit; /* Hard limit of # blocks to alloc */
312 uint64_t qb_warn; /* Warn user when alloc is above this # */
313 int64_t qb_value; /* Current # blocks allocated */
314};
315
316struct gfs2_quota_data {
317 struct list_head qd_list;
318 unsigned int qd_count;
319
320 uint32_t qd_id;
321 unsigned long qd_flags; /* QDF_... */
322
323 int64_t qd_change;
324 int64_t qd_change_sync;
325
326 unsigned int qd_slot;
327 unsigned int qd_slot_count;
328
329 struct buffer_head *qd_bh;
330 struct gfs2_quota_change *qd_bh_qc;
331 unsigned int qd_bh_count;
332
333 struct gfs2_glock *qd_gl;
334 struct gfs2_quota_lvb qd_qb;
335
336 uint64_t qd_sync_gen;
337 unsigned long qd_last_warn;
338 unsigned long qd_last_touched;
339};
340
341struct gfs2_log_buf {
342 struct list_head lb_list;
343 struct buffer_head *lb_bh;
344 struct buffer_head *lb_real;
345};
346
347struct gfs2_trans {
348 unsigned long tr_ip;
349
350 unsigned int tr_blocks;
351 unsigned int tr_revokes;
352 unsigned int tr_reserved;
353
354 struct gfs2_holder tr_t_gh;
355
356 int tr_touched;
357
358 unsigned int tr_num_buf;
359 unsigned int tr_num_buf_new;
360 unsigned int tr_num_buf_rm;
361 struct list_head tr_list_buf;
362
363 unsigned int tr_num_revoke;
364 unsigned int tr_num_revoke_rm;
365};
366
367struct gfs2_ail {
368 struct list_head ai_list;
369
370 unsigned int ai_first;
371 struct list_head ai_ail1_list;
372 struct list_head ai_ail2_list;
373
374 uint64_t ai_sync_gen;
375};
376
377struct gfs2_jdesc {
378 struct list_head jd_list;
379
380 struct inode *jd_inode;
381 unsigned int jd_jid;
382 int jd_dirty;
383
384 unsigned int jd_blocks;
385};
386
387#define GFS2_GLOCKD_DEFAULT 1
388#define GFS2_GLOCKD_MAX 16
389
390#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
391#define GFS2_QUOTA_OFF 0
392#define GFS2_QUOTA_ACCOUNT 1
393#define GFS2_QUOTA_ON 2
394
395#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
396#define GFS2_DATA_WRITEBACK 1
397#define GFS2_DATA_ORDERED 2
398
399struct gfs2_args {
400 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
401 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
402 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
403 int ar_spectator; /* Don't get a journal because we're always RO */
404 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
405 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
406 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
407 int ar_debug; /* Oops on errors instead of trying to be graceful */
408 int ar_upgrade; /* Upgrade ondisk/multihost format */
409 unsigned int ar_num_glockd; /* Number of glockd threads */
410 int ar_posix_acl; /* Enable posix acls */
411 int ar_quota; /* off/account/on */
412 int ar_suiddir; /* suiddir support */
413 int ar_data; /* ordered/writeback */
414};
415
416struct gfs2_tune {
417 spinlock_t gt_spin;
418
419 unsigned int gt_ilimit;
420 unsigned int gt_ilimit_tries;
421 unsigned int gt_ilimit_min;
422 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
423 unsigned int gt_incore_log_blocks;
424 unsigned int gt_log_flush_secs;
425 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
426
427 unsigned int gt_scand_secs;
428 unsigned int gt_recoverd_secs;
429 unsigned int gt_logd_secs;
430 unsigned int gt_quotad_secs;
431
432 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
433 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
434 unsigned int gt_quota_scale_num; /* Numerator */
435 unsigned int gt_quota_scale_den; /* Denominator */
436 unsigned int gt_quota_cache_secs;
437 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
438 unsigned int gt_atime_quantum; /* Min secs between atime updates */
439 unsigned int gt_new_files_jdata;
440 unsigned int gt_new_files_directio;
441 unsigned int gt_max_atomic_write; /* Split big writes into this size */
442 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
443 unsigned int gt_lockdump_size;
444 unsigned int gt_stall_secs; /* Detects trouble! */
445 unsigned int gt_complain_secs;
446 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
447 unsigned int gt_entries_per_readdir;
448 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
449 unsigned int gt_greedy_default;
450 unsigned int gt_greedy_quantum;
451 unsigned int gt_greedy_max;
452 unsigned int gt_statfs_quantum;
453 unsigned int gt_statfs_slow;
454};
455
456struct gfs2_gl_hash_bucket {
457 rwlock_t hb_lock;
458 struct list_head hb_list;
459};
460
461enum {
462 SDF_JOURNAL_CHECKED = 0,
463 SDF_JOURNAL_LIVE = 1,
464 SDF_SHUTDOWN = 2,
465 SDF_NOATIME = 3,
466};
467
468#define GFS2_GL_HASH_SHIFT 13
469#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
470#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
471#define GFS2_FSNAME_LEN 256
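/*
 * Editorial sketch, not part of the original patch: with a power-of-two
 * table, GFS2_GL_HASH_MASK keeps the low GFS2_GL_HASH_SHIFT bits of a
 * hash, so bucket selection reduces to:
 *
 *	bucket = &sdp->sd_gl_hash[hash & GFS2_GL_HASH_MASK];
 *
 * e.g. hash 0x12345 & 0x1fff = 0x0345 out of 8192 buckets.
 */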
472
473struct gfs2_sbd {
474 struct super_block *sd_vfs;
475 struct super_block *sd_vfs_meta;
476 struct kobject sd_kobj;
477 unsigned long sd_flags; /* SDF_... */
478 struct gfs2_sb sd_sb;
479
480 /* Constants computed on mount */
481
482 uint32_t sd_fsb2bb;
483 uint32_t sd_fsb2bb_shift;
484 uint32_t sd_diptrs; /* Number of pointers in a dinode */
485	uint32_t sd_inptrs;	/* Number of pointers in an indirect block */
486 uint32_t sd_jbsize; /* Size of a journaled data block */
487 uint32_t sd_hash_bsize; /* sizeof(exhash block) */
488 uint32_t sd_hash_bsize_shift;
489 uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
490 uint32_t sd_qc_per_block;
491 uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */
492 uint32_t sd_max_height; /* Max height of a file's metadata tree */
493 uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT];
494 uint32_t sd_max_jheight; /* Max height of journaled file's meta tree */
495 uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT];
496
497 struct gfs2_args sd_args; /* Mount arguments */
498 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
499
500 /* Lock Stuff */
501
502 struct lm_lockstruct sd_lockstruct;
503 struct gfs2_gl_hash_bucket sd_gl_hash[GFS2_GL_HASH_SIZE];
504 struct list_head sd_reclaim_list;
505 spinlock_t sd_reclaim_lock;
506 wait_queue_head_t sd_reclaim_wq;
507 atomic_t sd_reclaim_count;
508 struct gfs2_holder sd_live_gh;
509 struct gfs2_glock *sd_rename_gl;
510 struct gfs2_glock *sd_trans_gl;
511
512 /* Inode Stuff */
513
514 struct inode *sd_master_dir;
515 struct inode *sd_jindex;
516 struct inode *sd_inum_inode;
517 struct inode *sd_statfs_inode;
518 struct inode *sd_ir_inode;
519 struct inode *sd_sc_inode;
520 struct inode *sd_qc_inode;
521 struct inode *sd_rindex;
522 struct inode *sd_quota_inode;
523
524 /* Inum stuff */
525
526 struct mutex sd_inum_mutex;
527
528 /* StatFS stuff */
529
530 spinlock_t sd_statfs_spin;
531 struct mutex sd_statfs_mutex;
532 struct gfs2_statfs_change sd_statfs_master;
533 struct gfs2_statfs_change sd_statfs_local;
534 unsigned long sd_statfs_sync_time;
535
536 /* Resource group stuff */
537
538 uint64_t sd_rindex_vn;
539 spinlock_t sd_rindex_spin;
540 struct mutex sd_rindex_mutex;
541 struct list_head sd_rindex_list;
542 struct list_head sd_rindex_mru_list;
543 struct list_head sd_rindex_recent_list;
544 struct gfs2_rgrpd *sd_rindex_forward;
545 unsigned int sd_rgrps;
546
547 /* Journal index stuff */
548
549 struct list_head sd_jindex_list;
550 spinlock_t sd_jindex_spin;
551 struct mutex sd_jindex_mutex;
552 unsigned int sd_journals;
553 unsigned long sd_jindex_refresh_time;
554
555 struct gfs2_jdesc *sd_jdesc;
556 struct gfs2_holder sd_journal_gh;
557 struct gfs2_holder sd_jinode_gh;
558
559 struct gfs2_holder sd_ir_gh;
560 struct gfs2_holder sd_sc_gh;
561 struct gfs2_holder sd_qc_gh;
562
563 /* Daemon stuff */
564
565 struct task_struct *sd_scand_process;
566 struct task_struct *sd_recoverd_process;
567 struct task_struct *sd_logd_process;
568 struct task_struct *sd_quotad_process;
569 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
570 unsigned int sd_glockd_num;
571
572 /* Quota stuff */
573
574 struct list_head sd_quota_list;
575 atomic_t sd_quota_count;
576 spinlock_t sd_quota_spin;
577 struct mutex sd_quota_mutex;
578
579 unsigned int sd_quota_slots;
580 unsigned int sd_quota_chunks;
581 unsigned char **sd_quota_bitmap;
582
583 uint64_t sd_quota_sync_gen;
584 unsigned long sd_quota_sync_time;
585
586 /* Log stuff */
587
588 spinlock_t sd_log_lock;
589
590 unsigned int sd_log_blks_reserved;
591 unsigned int sd_log_commited_buf;
592 unsigned int sd_log_commited_revoke;
593
594 unsigned int sd_log_num_gl;
595 unsigned int sd_log_num_buf;
596 unsigned int sd_log_num_revoke;
597 unsigned int sd_log_num_rg;
598 unsigned int sd_log_num_databuf;
599 unsigned int sd_log_num_jdata;
600 unsigned int sd_log_num_hdrs;
601
602 struct list_head sd_log_le_gl;
603 struct list_head sd_log_le_buf;
604 struct list_head sd_log_le_revoke;
605 struct list_head sd_log_le_rg;
606 struct list_head sd_log_le_databuf;
607
608 unsigned int sd_log_blks_free;
609 struct mutex sd_log_reserve_mutex;
610
611 uint64_t sd_log_sequence;
612 unsigned int sd_log_head;
613 unsigned int sd_log_tail;
614 int sd_log_idle;
615
616 unsigned long sd_log_flush_time;
617 struct rw_semaphore sd_log_flush_lock;
618 struct list_head sd_log_flush_list;
619
620 unsigned int sd_log_flush_head;
621 uint64_t sd_log_flush_wrapped;
622
623 struct list_head sd_ail1_list;
624 struct list_head sd_ail2_list;
625 uint64_t sd_ail_sync_gen;
626
627 /* Replay stuff */
628
629 struct list_head sd_revoke_list;
630 unsigned int sd_replay_tail;
631
632 unsigned int sd_found_blocks;
633 unsigned int sd_found_revokes;
634 unsigned int sd_replayed_blocks;
635
636 /* For quiescing the filesystem */
637
638 struct gfs2_holder sd_freeze_gh;
639 struct mutex sd_freeze_lock;
640 unsigned int sd_freeze_count;
641
642 /* Counters */
643
644 atomic_t sd_glock_count;
645 atomic_t sd_glock_held_count;
646 atomic_t sd_inode_count;
647 atomic_t sd_reclaimed;
648
649 char sd_fsname[GFS2_FSNAME_LEN];
650 char sd_table_name[GFS2_FSNAME_LEN];
651 char sd_proto_name[GFS2_FSNAME_LEN];
652
653 /* Debugging crud */
654
655 unsigned long sd_last_warning;
656 struct vfsmount *sd_gfs2mnt;
657};
658
659#endif /* __INCORE_DOT_H__ */
660
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..9fb340984b29
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1344 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "acl.h"
24#include "bmap.h"
25#include "dir.h"
26#include "eattr.h"
27#include "glock.h"
28#include "glops.h"
29#include "inode.h"
30#include "log.h"
31#include "meta_io.h"
32#include "ops_address.h"
33#include "ops_file.h"
34#include "ops_inode.h"
35#include "quota.h"
36#include "rgrp.h"
37#include "trans.h"
38#include "util.h"
39
40/**
41 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
42 * @ip: The GFS2 inode (with embedded disk inode data)
43 * (the Linux VFS inode is embedded in @ip as @ip->i_inode)
44 *
45 */
46
47void gfs2_inode_attr_in(struct gfs2_inode *ip)
48{
49 struct inode *inode = &ip->i_inode;
50
51 inode->i_ino = ip->i_num.no_addr;
52
53 switch (ip->i_di.di_mode & S_IFMT) {
54 case S_IFBLK:
55 case S_IFCHR:
56 inode->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
57 break;
58 default:
59 inode->i_rdev = 0;
60 break;
61	}
62
63 inode->i_mode = ip->i_di.di_mode;
64 inode->i_nlink = ip->i_di.di_nlink;
65 inode->i_uid = ip->i_di.di_uid;
66 inode->i_gid = ip->i_di.di_gid;
67 i_size_write(inode, ip->i_di.di_size);
68 inode->i_atime.tv_sec = ip->i_di.di_atime;
69 inode->i_mtime.tv_sec = ip->i_di.di_mtime;
70 inode->i_ctime.tv_sec = ip->i_di.di_ctime;
71 inode->i_atime.tv_nsec = 0;
72 inode->i_mtime.tv_nsec = 0;
73 inode->i_ctime.tv_nsec = 0;
74 inode->i_blksize = PAGE_SIZE;
75 inode->i_blocks = ip->i_di.di_blocks <<
76 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
77
78 if (ip->i_di.di_flags & GFS2_DIF_IMMUTABLE)
79 inode->i_flags |= S_IMMUTABLE;
80 else
81 inode->i_flags &= ~S_IMMUTABLE;
82
83 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY)
84 inode->i_flags |= S_APPEND;
85 else
86 inode->i_flags &= ~S_APPEND;
87}
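/*
 * Editorial example, not part of the original patch: di_blocks counts
 * filesystem blocks while i_blocks counts 512-byte basic blocks, hence
 * the shift above.  With sb_bsize_shift = 12 (4096-byte blocks) and
 * GFS2_BASIC_BLOCK_SHIFT = 9:
 *
 *	i_blocks = di_blocks << (12 - 9) = di_blocks * 8
 */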
88
89/**
90 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
91 * @ip: The GFS2 inode
92 *
93 * Only copy out the attributes that we want the VFS layer
94 * to be able to modify.
95 */
96
97void gfs2_inode_attr_out(struct gfs2_inode *ip)
98{
99 struct inode *inode = &ip->i_inode;
100
101 gfs2_assert_withdraw(GFS2_SB(inode),
102 (ip->i_di.di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
103 ip->i_di.di_mode = inode->i_mode;
104 ip->i_di.di_uid = inode->i_uid;
105 ip->i_di.di_gid = inode->i_gid;
106 ip->i_di.di_atime = inode->i_atime.tv_sec;
107 ip->i_di.di_mtime = inode->i_mtime.tv_sec;
108 ip->i_di.di_ctime = inode->i_ctime.tv_sec;
109}
110
111static int iget_test(struct inode *inode, void *opaque)
112{
113 struct gfs2_inode *ip = GFS2_I(inode);
114 struct gfs2_inum *inum = opaque;
115
116 if (ip && ip->i_num.no_addr == inum->no_addr)
117 return 1;
118
119 return 0;
120}
121
122static int iget_set(struct inode *inode, void *opaque)
123{
124 struct gfs2_inode *ip = GFS2_I(inode);
125 struct gfs2_inum *inum = opaque;
126
127 ip->i_num = *inum;
128 return 0;
129}
130
131struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
132{
133 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
134 iget_test, inum);
135}
136
137static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
138{
139 return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
140 iget_test, iget_set, inum);
141}
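/*
 * Editorial note, not part of the original patch: iget5_locked() hashes
 * on no_formal_ino but relies on iget_test() above for the real match
 * on no_addr, so hash collisions between inums are still resolved
 * correctly.  The usual calling pattern is:
 *
 *	inode = gfs2_iget(sb, inum);
 *	if (inode->i_state & I_NEW) {
 *		(initialise the new in-core inode;
 *		 iget_set() has already stamped i_num)
 *		unlock_new_inode(inode);
 *	}
 *
 * which is exactly what gfs2_inode_lookup() below does.
 */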
142
143/**
144 * gfs2_inode_lookup - Lookup an inode
145 * @sb: The super block
146 * @inum: The inode number
147 * @type: The type of the inode
148 *
149 * Returns: A VFS inode, or an error
150 */
151
152struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
153{
154 struct inode *inode = gfs2_iget(sb, inum);
155 struct gfs2_inode *ip = GFS2_I(inode);
156 struct gfs2_glock *io_gl;
157 int error;
158
159 if (inode->i_state & I_NEW) {
160 struct gfs2_sbd *sdp = GFS2_SB(inode);
161 umode_t mode = DT2IF(type);
162 inode->u.generic_ip = ip;
163 inode->i_mode = mode;
164
165 if (S_ISREG(mode)) {
166 inode->i_op = &gfs2_file_iops;
167 inode->i_fop = &gfs2_file_fops;
168 inode->i_mapping->a_ops = &gfs2_file_aops;
169 } else if (S_ISDIR(mode)) {
170 inode->i_op = &gfs2_dir_iops;
171 inode->i_fop = &gfs2_dir_fops;
172 } else if (S_ISLNK(mode)) {
173 inode->i_op = &gfs2_symlink_iops;
174 } else {
175 inode->i_op = &gfs2_dev_iops;
176 }
177
178 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
179 if (unlikely(error))
180 goto fail;
181 ip->i_gl->gl_object = ip;
182
183 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
184 if (unlikely(error))
185 goto fail_put;
186
187 ip->i_vn = ip->i_gl->gl_vn - 1;
188 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
189 if (unlikely(error))
190 goto fail_iopen;
191
192 gfs2_glock_put(io_gl);
193 unlock_new_inode(inode);
194 }
195
196 return inode;
197fail_iopen:
198 gfs2_glock_put(io_gl);
199fail_put:
200 ip->i_gl->gl_object = NULL;
201 gfs2_glock_put(ip->i_gl);
202fail:
203 iput(inode);
204 return ERR_PTR(error);
205}
206
207/**
208 * gfs2_inode_refresh - Refresh the incore copy of the dinode
209 * @ip: The GFS2 inode
210 *
211 * Returns: errno
212 */
213
214int gfs2_inode_refresh(struct gfs2_inode *ip)
215{
216 struct buffer_head *dibh;
217 int error;
218
219 error = gfs2_meta_inode_buffer(ip, &dibh);
220 if (error)
221 return error;
222
223 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
224 brelse(dibh);
225 return -EIO;
226 }
227
228 gfs2_dinode_in(&ip->i_di, dibh->b_data);
229
230 brelse(dibh);
231
232 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
233 if (gfs2_consist_inode(ip))
234 gfs2_dinode_print(&ip->i_di);
235 return -EIO;
236 }
237 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
238 return -ESTALE;
239
240 ip->i_vn = ip->i_gl->gl_vn;
241
242 return 0;
243}
244
245int gfs2_dinode_dealloc(struct gfs2_inode *ip)
246{
247 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
248 struct gfs2_alloc *al;
249 struct gfs2_rgrpd *rgd;
250 int error;
251
252 if (ip->i_di.di_blocks != 1) {
253 if (gfs2_consist_inode(ip))
254 gfs2_dinode_print(&ip->i_di);
255 return -EIO;
256 }
257
258 al = gfs2_alloc_get(ip);
259
260 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
261 if (error)
262 goto out;
263
264 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
265 if (error)
266 goto out_qs;
267
268 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
269 if (!rgd) {
270 gfs2_consist_inode(ip);
271 error = -EIO;
272 goto out_rindex_relse;
273 }
274
275 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
276 &al->al_rgd_gh);
277 if (error)
278 goto out_rindex_relse;
279
280 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
281 if (error)
282 goto out_rg_gunlock;
283
284 gfs2_trans_add_gl(ip->i_gl);
285
286 gfs2_free_di(rgd, ip);
287
288 gfs2_trans_end(sdp);
289 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
290
291out_rg_gunlock:
292 gfs2_glock_dq_uninit(&al->al_rgd_gh);
293out_rindex_relse:
294 gfs2_glock_dq_uninit(&al->al_ri_gh);
295out_qs:
296 gfs2_quota_unhold(ip);
297out:
298 gfs2_alloc_put(ip);
299 return error;
300}
301
302/**
303 * gfs2_change_nlink - Change nlink count on inode
304 * @ip: The GFS2 inode
305 * @diff: The change in the nlink count required
306 *
307 * Returns: errno
308 */
309
310int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
311{
312 struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
313 struct buffer_head *dibh;
314 uint32_t nlink;
315 int error;
316
317 BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
318 nlink = ip->i_di.di_nlink + diff;
319
320 /* If we are reducing the nlink count, but the new value ends up being
321 bigger than the old one, we must have underflowed. */
322 if (diff < 0 && nlink > ip->i_di.di_nlink) {
323 if (gfs2_consist_inode(ip))
324 gfs2_dinode_print(&ip->i_di);
325 return -EIO;
326 }
327
328 error = gfs2_meta_inode_buffer(ip, &dibh);
329 if (error)
330 return error;
331
332 ip->i_di.di_nlink = nlink;
333 ip->i_di.di_ctime = get_seconds();
334 ip->i_inode.i_nlink = nlink;
335
336 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
337 gfs2_dinode_out(&ip->i_di, dibh->b_data);
338 brelse(dibh);
339 mark_inode_dirty(&ip->i_inode);
340
341 if (ip->i_di.di_nlink == 0) {
342 struct gfs2_rgrpd *rgd;
343 struct gfs2_holder ri_gh, rg_gh;
344
345 error = gfs2_rindex_hold(sdp, &ri_gh);
346 if (error)
347 goto out;
348 error = -EIO;
349 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
350 if (!rgd)
351 goto out_norgrp;
352 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
353 if (error)
354 goto out_norgrp;
355
356 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
357 gfs2_glock_dq_uninit(&rg_gh);
358out_norgrp:
359 gfs2_glock_dq_uninit(&ri_gh);
360 }
361out:
362 return error;
363}
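/*
 * Editorial example, not part of the original patch, of the underflow
 * check in gfs2_change_nlink(): nlink is a uint32_t, so with
 * di_nlink == 0 and diff == -1,
 *
 *	nlink = 0 + (uint32_t)-1 = 0xffffffff > 0
 *
 * i.e. "diff < 0 && nlink > di_nlink" catches a count that would have
 * gone negative.
 */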
364
365struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
366{
367 struct qstr qstr;
368 gfs2_str2qstr(&qstr, name);
369 return gfs2_lookupi(dip, &qstr, 1, NULL);
370}
371
373/**
374 * gfs2_lookupi - Look up a filename in a directory and return its inode
375 * @dir: The directory inode to search
376 * @name: The name of the inode to look for
377 * @is_root: If 1, ignore the caller's permissions
378 * @nd: The VFS nameidata, passed through to the lookup (may be NULL)
379 *
380 * There will always be a vnode (Linux VFS inode) for the @dir inode unless
381 * @is_root is true.
382 *
383 * Returns: the inode, NULL if the name was not found, or an ERR_PTR on error
384 */
385
386struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
387 int is_root, struct nameidata *nd)
388
389{
390 struct super_block *sb = dir->i_sb;
391 struct gfs2_inode *dip = GFS2_I(dir);
392 struct gfs2_holder d_gh;
393 struct gfs2_inum inum;
394 unsigned int type;
395 int error = 0;
396 struct inode *inode = NULL;
397
398 if (!name->len || name->len > GFS2_FNAMESIZE)
399 return ERR_PTR(-ENAMETOOLONG);
400
401 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
402 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
403 dir == sb->s_root->d_inode)) {
404 igrab(dir);
405 return dir;
406 }
407
408 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
409 if (error)
410 return ERR_PTR(error);
411
412 if (!is_root) {
413 error = permission(dir, MAY_EXEC, NULL);
414 if (error)
415 goto out;
416 }
417
418 error = gfs2_dir_search(dir, name, &inum, &type);
419 if (error)
420 goto out;
421
422 inode = gfs2_inode_lookup(sb, &inum, type);
423
424out:
425 gfs2_glock_dq_uninit(&d_gh);
426 if (error == -ENOENT)
427 return NULL;
428 return inode;
429}
430
431static int pick_formal_ino_1(struct gfs2_sbd *sdp, uint64_t *formal_ino)
432{
433 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
434 struct buffer_head *bh;
435 struct gfs2_inum_range ir;
436 int error;
437
438 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
439 if (error)
440 return error;
441 mutex_lock(&sdp->sd_inum_mutex);
442
443 error = gfs2_meta_inode_buffer(ip, &bh);
444 if (error) {
445 mutex_unlock(&sdp->sd_inum_mutex);
446 gfs2_trans_end(sdp);
447 return error;
448 }
449
450 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
451
452 if (ir.ir_length) {
453 *formal_ino = ir.ir_start++;
454 ir.ir_length--;
455 gfs2_trans_add_bh(ip->i_gl, bh, 1);
456 gfs2_inum_range_out(&ir,
457 bh->b_data + sizeof(struct gfs2_dinode));
458 brelse(bh);
459 mutex_unlock(&sdp->sd_inum_mutex);
460 gfs2_trans_end(sdp);
461 return 0;
462 }
463
464 brelse(bh);
465
466 mutex_unlock(&sdp->sd_inum_mutex);
467 gfs2_trans_end(sdp);
468
469 return 1;
470}
471
472static int pick_formal_ino_2(struct gfs2_sbd *sdp, uint64_t *formal_ino)
473{
474 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
475 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
476 struct gfs2_holder gh;
477 struct buffer_head *bh;
478 struct gfs2_inum_range ir;
479 int error;
480
481 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
482 if (error)
483 return error;
484
485 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
486 if (error)
487 goto out;
488 mutex_lock(&sdp->sd_inum_mutex);
489
490 error = gfs2_meta_inode_buffer(ip, &bh);
491 if (error)
492 goto out_end_trans;
493
494 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
495
496 if (!ir.ir_length) {
497 struct buffer_head *m_bh;
498 uint64_t x, y;
499
500 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
501 if (error)
502 goto out_brelse;
503
504 x = *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode));
505 x = y = be64_to_cpu(x);
506 ir.ir_start = x;
507 ir.ir_length = GFS2_INUM_QUANTUM;
508 x += GFS2_INUM_QUANTUM;
509 if (x < y)
510 gfs2_consist_inode(m_ip);
511 x = cpu_to_be64(x);
512 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
513 *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
514
515 brelse(m_bh);
516 }
517
518 *formal_ino = ir.ir_start++;
519 ir.ir_length--;
520
521 gfs2_trans_add_bh(ip->i_gl, bh, 1);
522 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
523
524out_brelse:
525 brelse(bh);
526out_end_trans:
527 mutex_unlock(&sdp->sd_inum_mutex);
528 gfs2_trans_end(sdp);
529out:
530 gfs2_glock_dq_uninit(&gh);
531 return error;
532}
533
534static int pick_formal_ino(struct gfs2_sbd *sdp, uint64_t *inum)
535{
536 int error;
537
538 error = pick_formal_ino_1(sdp, inum);
539 if (error <= 0)
540 return error;
541
542 error = pick_formal_ino_2(sdp, inum);
543
544 return error;
545}
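/*
 * Editorial note, not part of the original patch: formal inode numbers
 * are handed out in two tiers.  pick_formal_ino_1() consumes this
 * node's private range under sd_inum_mutex alone; a return of 1 means
 * the range is empty, and pick_formal_ino_2() then takes the
 * cluster-wide inum glock and refills it:
 *
 *	ir.ir_start  = global_counter;
 *	ir.ir_length = GFS2_INUM_QUANTUM;
 *	global_counter += GFS2_INUM_QUANTUM;	(wrap => gfs2_consist_inode)
 *
 * so the expensive cluster lock is taken once per GFS2_INUM_QUANTUM
 * allocations rather than once per inode.
 */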
546
547/**
548 * create_ok - OK to create a new on-disk inode here?
549 * @dip: Directory in which dinode is to be created
550 * @name: Name of new dinode
551 * @mode: The mode of the new dinode
552 *
553 * Returns: errno
554 */
555
556static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
557 unsigned int mode)
558{
559 int error;
560
561 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
562 if (error)
563 return error;
564
565 /* Don't create entries in an unlinked directory */
566 if (!dip->i_di.di_nlink)
567 return -EPERM;
568
569 error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
570 switch (error) {
571 case -ENOENT:
572 error = 0;
573 break;
574 case 0:
575 return -EEXIST;
576 default:
577 return error;
578 }
579
580 if (dip->i_di.di_entries == (uint32_t)-1)
581 return -EFBIG;
582 if (S_ISDIR(mode) && dip->i_di.di_nlink == (uint32_t)-1)
583 return -EMLINK;
584
585 return 0;
586}
587
588static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
589 unsigned int *uid, unsigned int *gid)
590{
591 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
592 (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) {
593 if (S_ISDIR(*mode))
594 *mode |= S_ISUID;
595 else if (dip->i_di.di_uid != current->fsuid)
596 *mode &= ~07111;
597 *uid = dip->i_di.di_uid;
598 } else
599 *uid = current->fsuid;
600
601 if (dip->i_di.di_mode & S_ISGID) {
602 if (S_ISDIR(*mode))
603 *mode |= S_ISGID;
604 *gid = dip->i_di.di_gid;
605 } else
606 *gid = current->fsgid;
607}
608
609static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
610 u64 *generation)
611{
612 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
613 int error;
614
615 gfs2_alloc_get(dip);
616
617 dip->i_alloc.al_requested = RES_DINODE;
618 error = gfs2_inplace_reserve(dip);
619 if (error)
620 goto out;
621
622 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
623 if (error)
624 goto out_ipreserv;
625
626 inum->no_addr = gfs2_alloc_di(dip, generation);
627
628 gfs2_trans_end(sdp);
629
630out_ipreserv:
631 gfs2_inplace_release(dip);
632out:
633 gfs2_alloc_put(dip);
634 return error;
635}
636
637/**
638 * init_dinode - Fill in a new dinode structure
639 * @dip: the directory this inode is being created in
640 * @gl: The glock covering the new inode
641 * @inum: the inode number
642 * @mode: the file permissions
643 * @uid: The uid of the new dinode
644 * @gid: The gid of the new dinode
645 * @generation: The generation number of the new dinode
646 */
647
648static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
649 const struct gfs2_inum *inum, unsigned int mode,
650 unsigned int uid, unsigned int gid,
651 const u64 *generation)
652{
653 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
654 struct gfs2_dinode *di;
655 struct buffer_head *dibh;
656
657 dibh = gfs2_meta_new(gl, inum->no_addr);
658 gfs2_trans_add_bh(gl, dibh, 1);
659 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
660 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
661 di = (struct gfs2_dinode *)dibh->b_data;
662
663 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
664 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
665 di->di_mode = cpu_to_be32(mode);
666 di->di_uid = cpu_to_be32(uid);
667 di->di_gid = cpu_to_be32(gid);
668 di->di_nlink = cpu_to_be32(0);
669 di->di_size = cpu_to_be64(0);
670 di->di_blocks = cpu_to_be64(1);
671 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
672 di->di_major = di->di_minor = cpu_to_be32(0);
673 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
674 di->di_generation = cpu_to_be64(*generation);
675 di->di_flags = cpu_to_be32(0);
676
677 if (S_ISREG(mode)) {
678 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
679 gfs2_tune_get(sdp, gt_new_files_jdata))
680 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
681 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
682 gfs2_tune_get(sdp, gt_new_files_directio))
683 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
684 } else if (S_ISDIR(mode)) {
685 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
686 GFS2_DIF_INHERIT_DIRECTIO);
687 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
688 GFS2_DIF_INHERIT_JDATA);
689 }
690
691 di->__pad1 = 0;
692 di->di_payload_format = cpu_to_be32(0);
693 di->di_height = cpu_to_be32(0);
694 di->__pad2 = 0;
695 di->__pad3 = 0;
696 di->di_depth = cpu_to_be16(0);
697 di->di_entries = cpu_to_be32(0);
698 memset(&di->__pad4, 0, sizeof(di->__pad4));
699 di->di_eattr = cpu_to_be64(0);
700 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
701
702 brelse(dibh);
703}
704
705static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
706 unsigned int mode, const struct gfs2_inum *inum,
707 const u64 *generation)
708{
709 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
710 unsigned int uid, gid;
711 int error;
712
713 munge_mode_uid_gid(dip, &mode, &uid, &gid);
714 gfs2_alloc_get(dip);
715
716 error = gfs2_quota_lock(dip, uid, gid);
717 if (error)
718 goto out;
719
720 error = gfs2_quota_check(dip, uid, gid);
721 if (error)
722 goto out_quota;
723
724 error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
725 if (error)
726 goto out_quota;
727
728 init_dinode(dip, gl, inum, mode, uid, gid, generation);
729 gfs2_quota_change(dip, +1, uid, gid);
730 gfs2_trans_end(sdp);
731
732out_quota:
733 gfs2_quota_unlock(dip);
734out:
735 gfs2_alloc_put(dip);
736 return error;
737}
738
739static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
740 struct gfs2_inode *ip)
741{
742 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
743 struct gfs2_alloc *al;
744 int alloc_required;
745 struct buffer_head *dibh;
746 int error;
747
748 al = gfs2_alloc_get(dip);
749
750 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
751 if (error)
752 goto fail;
753
754 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
755 if (alloc_required < 0)
756 goto fail;
757 if (alloc_required) {
758 error = gfs2_quota_check(dip, dip->i_di.di_uid,
759 dip->i_di.di_gid);
760 if (error)
761 goto fail_quota_locks;
762
763 al->al_requested = sdp->sd_max_dirres;
764
765 error = gfs2_inplace_reserve(dip);
766 if (error)
767 goto fail_quota_locks;
768
769 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
770 al->al_rgd->rd_ri.ri_length +
771 2 * RES_DINODE +
772 RES_STATFS + RES_QUOTA, 0);
773 if (error)
774 goto fail_ipreserv;
775 } else {
776 error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
777 if (error)
778 goto fail_quota_locks;
779 }
780
781 error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
782 if (error)
783 goto fail_end_trans;
784
785 error = gfs2_meta_inode_buffer(ip, &dibh);
786 if (error)
787 goto fail_end_trans;
788 ip->i_di.di_nlink = 1;
789 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
790 gfs2_dinode_out(&ip->i_di, dibh->b_data);
791 brelse(dibh);
792 return 0;
793
794fail_end_trans:
795 gfs2_trans_end(sdp);
796
797fail_ipreserv:
798 if (dip->i_alloc.al_rgd)
799 gfs2_inplace_release(dip);
800
801fail_quota_locks:
802 gfs2_quota_unlock(dip);
803
804fail:
805 gfs2_alloc_put(dip);
806 return error;
807}
808
809/**
810 * gfs2_createi - Create a new inode
811 * @ghs: An array of two holders
812 * @name: The name of the new file
813 * @mode: the permissions on the new inode
814 *
815 * @ghs[0] is an initialized holder for the directory
816 * @ghs[1] is the holder for the inode lock
817 *
818 * On success, the glocks on both the directory and the new file are held,
819 * a transaction has been started, and an inplace reservation is held as
820 * well.
821 *
822 * Returns: The new inode, or an ERR_PTR on failure
823 */
824
825struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
826 unsigned int mode)
827{
828 struct inode *inode;
829 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
830 struct inode *dir = &dip->i_inode;
831 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
832 struct gfs2_inum inum;
833 int error;
834 u64 generation;
835
836 if (!name->len || name->len > GFS2_FNAMESIZE)
837 return ERR_PTR(-ENAMETOOLONG);
838
839 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
840 error = gfs2_glock_nq(ghs);
841 if (error)
842 goto fail;
843
844 error = create_ok(dip, name, mode);
845 if (error)
846 goto fail_gunlock;
847
848 error = pick_formal_ino(sdp, &inum.no_formal_ino);
849 if (error)
850 goto fail_gunlock;
851
852 error = alloc_dinode(dip, &inum, &generation);
853 if (error)
854 goto fail_gunlock;
855
856 if (inum.no_addr < dip->i_num.no_addr) {
857 gfs2_glock_dq(ghs);
858
859 error = gfs2_glock_nq_num(sdp, inum.no_addr,
860 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
861 GL_SKIP, ghs + 1);
862 if (error) {
863 return ERR_PTR(error);
864 }
865
866 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
867 error = gfs2_glock_nq(ghs);
868 if (error) {
869 gfs2_glock_dq_uninit(ghs + 1);
870 return ERR_PTR(error);
871 }
872
873 error = create_ok(dip, name, mode);
874 if (error)
875 goto fail_gunlock2;
876 } else {
877 error = gfs2_glock_nq_num(sdp, inum.no_addr,
878 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
879 GL_SKIP, ghs + 1);
880 if (error)
881 goto fail_gunlock;
882 }
883
884 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
885 if (error)
886 goto fail_gunlock2;
887
888 inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
889	if (IS_ERR(inode)) {
890		error = PTR_ERR(inode); goto fail_gunlock2; }
891
892 error = gfs2_inode_refresh(GFS2_I(inode));
893 if (error)
894 goto fail_iput;
895
896 error = gfs2_acl_create(dip, GFS2_I(inode));
897 if (error)
898 goto fail_iput;
899
900 error = link_dinode(dip, name, GFS2_I(inode));
901 if (error)
902 goto fail_iput;
903
904 if (!inode)
905 return ERR_PTR(-ENOMEM);
906 return inode;
907
908fail_iput:
909 iput(inode);
910fail_gunlock2:
911 gfs2_glock_dq_uninit(ghs + 1);
912fail_gunlock:
913 gfs2_glock_dq(ghs);
914fail:
915 return ERR_PTR(error);
916}
917
918/**
919 * gfs2_rmdiri - Remove a directory
920 * @dip: The parent directory of the directory to be removed
921 * @name: The name of the directory to be removed
922 * @ip: The GFS2 inode of the directory to be removed
923 *
924 * Assumes Glocks on dip and ip are held
925 *
926 * Returns: errno
927 */
928
929int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
930 struct gfs2_inode *ip)
931{
932 struct qstr dotname;
933 int error;
934
935 if (ip->i_di.di_entries != 2) {
936 if (gfs2_consist_inode(ip))
937 gfs2_dinode_print(&ip->i_di);
938 return -EIO;
939 }
940
941 error = gfs2_dir_del(dip, name);
942 if (error)
943 return error;
944
945 error = gfs2_change_nlink(dip, -1);
946 if (error)
947 return error;
948
949 gfs2_str2qstr(&dotname, ".");
950 error = gfs2_dir_del(ip, &dotname);
951 if (error)
952 return error;
953
954 gfs2_str2qstr(&dotname, "..");
955 error = gfs2_dir_del(ip, &dotname);
956 if (error)
957 return error;
958
959 error = gfs2_change_nlink(ip, -2);
960 if (error)
961 return error;
962
963 return error;
964}
965
966/**
967 * gfs2_unlink_ok - check to see that an inode is still in a directory
968 * @dip: the directory
969 * @name: the name of the file
970 * @ip: the inode
971 *
972 * Assumes that the lock on (at least) @dip is held.
973 *
974 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
975 */
976
977int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
978 struct gfs2_inode *ip)
979{
980 struct gfs2_inum inum;
981 unsigned int type;
982 int error;
983
984 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
985 return -EPERM;
986
987 if ((dip->i_di.di_mode & S_ISVTX) &&
988 dip->i_di.di_uid != current->fsuid &&
989 ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
990 return -EPERM;
991
992 if (IS_APPEND(&dip->i_inode))
993 return -EPERM;
994
995 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
996 if (error)
997 return error;
998
999 error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
1000 if (error)
1001 return error;
1002
1003 if (!gfs2_inum_equal(&inum, &ip->i_num))
1004 return -ENOENT;
1005
1006 if (IF2DT(ip->i_di.di_mode) != type) {
1007 gfs2_consist_inode(dip);
1008 return -EIO;
1009 }
1010
1011 return 0;
1012}
1013
1014/**
1015 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1016 * @this: the directory being moved
1017 * @to: the directory it is being moved into
1018 *
1019 * Follow @to back to the root and make sure we don't encounter @this.
1020 * Assumes we already hold the rename lock.
1021 *
1022 * Returns: errno
1023 */
1024
1025int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1026{
1027 struct inode *dir = &to->i_inode;
1028 struct super_block *sb = dir->i_sb;
1029 struct inode *tmp;
1030 struct qstr dotdot;
1031 int error = 0;
1032
1033 gfs2_str2qstr(&dotdot, "..");
1034
1035 igrab(dir);
1036
1037 for (;;) {
1038 if (dir == &this->i_inode) {
1039 error = -EINVAL;
1040 break;
1041 }
1042 if (dir == sb->s_root->d_inode) {
1043 error = 0;
1044 break;
1045 }
1046
1047 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1048 if (IS_ERR(tmp)) {
1049 error = PTR_ERR(tmp);
1050 break;
1051 }
1052
1053 iput(dir);
1054 dir = tmp;
1055 }
1056
1057 iput(dir);
1058
1059 return error;
1060}
1061
1062/**
1063 * gfs2_readlinki - return the contents of a symlink
1064 * @ip: the symlink's inode
1065 * @buf: a pointer to the buffer to be filled
1066 * @len: a pointer to the length of @buf
1067 *
1068 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1069 * to be freed by the caller.
1070 *
1071 * Returns: errno
1072 */
1073
1074int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1075{
1076 struct gfs2_holder i_gh;
1077 struct buffer_head *dibh;
1078 unsigned int x;
1079 int error;
1080
1081 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1082 error = gfs2_glock_nq_atime(&i_gh);
1083 if (error) {
1084 gfs2_holder_uninit(&i_gh);
1085 return error;
1086 }
1087
1088 if (!ip->i_di.di_size) {
1089 gfs2_consist_inode(ip);
1090 error = -EIO;
1091 goto out;
1092 }
1093
1094 error = gfs2_meta_inode_buffer(ip, &dibh);
1095 if (error)
1096 goto out;
1097
1098 x = ip->i_di.di_size + 1;
1099 if (x > *len) {
1100 *buf = kmalloc(x, GFP_KERNEL);
1101 if (!*buf) {
1102 error = -ENOMEM;
1103 goto out_brelse;
1104 }
1105 }
1106
1107 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1108 *len = x;
1109
1110out_brelse:
1111 brelse(dibh);
1112out:
1113 gfs2_glock_dq_uninit(&i_gh);
1114 return error;
1115}
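/*
 * Editorial usage sketch, not part of the original patch, of the
 * gfs2_readlinki() buffer contract (hypothetical caller):
 *
 *	char array[64], *buf = array;
 *	unsigned int len = sizeof(array);
 *
 *	error = gfs2_readlinki(ip, &buf, &len);
 *	if (!error && buf != array)
 *		kfree(buf);	(only if gfs2_readlinki() kmalloc'ed)
 */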
1116
1117/**
1118 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1119 * conditionally update the inode's atime
1120 * @gh: the holder to acquire
1121 *
1122 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1123 * Updates the atime if the difference between the current time and the
1124 * inode's current atime is greater than an interval specified at mount time.
1125 *
1126 * Returns: errno
1127 */
1128
1129int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1130{
1131 struct gfs2_glock *gl = gh->gh_gl;
1132 struct gfs2_sbd *sdp = gl->gl_sbd;
1133 struct gfs2_inode *ip = gl->gl_object;
1134 int64_t curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1135 unsigned int state;
1136 int flags;
1137 int error;
1138
1139 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1140 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1141 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1142 return -EINVAL;
1143
1144 state = gh->gh_state;
1145 flags = gh->gh_flags;
1146
1147 error = gfs2_glock_nq(gh);
1148 if (error)
1149 return error;
1150
1151 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1152 (sdp->sd_vfs->s_flags & MS_RDONLY))
1153 return 0;
1154
1155 curtime = get_seconds();
1156 if (curtime - ip->i_di.di_atime >= quantum) {
1157 gfs2_glock_dq(gh);
1158 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1159 gh);
1160 error = gfs2_glock_nq(gh);
1161 if (error)
1162 return error;
1163
1164 /* Verify that atime hasn't been updated while we were
1165	   trying to get the exclusive lock. */
1166
1167 curtime = get_seconds();
1168 if (curtime - ip->i_di.di_atime >= quantum) {
1169 struct buffer_head *dibh;
1170
1171 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1172 if (error == -EROFS)
1173 return 0;
1174 if (error)
1175 goto fail;
1176
1177 error = gfs2_meta_inode_buffer(ip, &dibh);
1178 if (error)
1179 goto fail_end_trans;
1180
1181 ip->i_di.di_atime = curtime;
1182
1183 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1184 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1185 brelse(dibh);
1186
1187 gfs2_trans_end(sdp);
1188 }
1189
1190 /* If someone else has asked for the glock,
1191 unlock and let them have it. Then reacquire
1192 in the original state. */
1193 if (gfs2_glock_is_blocking(gl)) {
1194 gfs2_glock_dq(gh);
1195 gfs2_holder_reinit(state, flags, gh);
1196 return gfs2_glock_nq(gh);
1197 }
1198 }
1199
1200 return 0;
1201
1202fail_end_trans:
1203 gfs2_trans_end(sdp);
1204fail:
1205 gfs2_glock_dq(gh);
1206 return error;
1207}
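/*
 * Editorial example, not part of the original patch: with a
 * hypothetical gt_atime_quantum of 3600s and di_atime == 0, a read at
 * t=1000 keeps the shared lock (1000 - 0 < 3600) and starts no
 * transaction; a read at t=4000 upgrades to LM_ST_EXCLUSIVE, re-checks
 * the quantum under the exclusive lock, and writes di_atime once.
 */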
1208
1209/**
1210 * glock_compare_atime - Compare two struct gfs2_glock structures for sort
1211 * @arg_a: the first structure
1212 * @arg_b: the second structure
1213 *
1214 * Returns: 1 if A > B
1215 * -1 if A < B
1216 * 0 if A = B
1217 */
1218
1219static int glock_compare_atime(const void *arg_a, const void *arg_b)
1220{
1221 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1222 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1223 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1224 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1225 int ret = 0;
1226
1227 if (a->ln_number > b->ln_number)
1228 ret = 1;
1229 else if (a->ln_number < b->ln_number)
1230 ret = -1;
1231 else {
1232 if (gh_a->gh_state == LM_ST_SHARED &&
1233 gh_b->gh_state == LM_ST_EXCLUSIVE)
1234 ret = 1;
1235 else if (gh_a->gh_state == LM_ST_SHARED &&
1236 (gh_b->gh_flags & GL_ATIME))
1237 ret = 1;
1238 }
1239
1240 return ret;
1241}
1242
1243/**
1244 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1245 * atime update
1246 * @num_gh: the number of structures
1247 * @ghs: an array of struct gfs2_holder structures
1248 *
1249 * Returns: 0 on success (all glocks acquired),
1250 * errno on failure (no glocks acquired)
1251 */
1252
1253int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1254{
1255 struct gfs2_holder **p;
1256 unsigned int x;
1257 int error = 0;
1258
1259 if (!num_gh)
1260 return 0;
1261
1262 if (num_gh == 1) {
1263 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1264 if (ghs->gh_flags & GL_ATIME)
1265 error = gfs2_glock_nq_atime(ghs);
1266 else
1267 error = gfs2_glock_nq(ghs);
1268 return error;
1269 }
1270
1271 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1272 if (!p)
1273 return -ENOMEM;
1274
1275 for (x = 0; x < num_gh; x++)
1276 p[x] = &ghs[x];
1277
1278	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime, NULL);
1279
1280 for (x = 0; x < num_gh; x++) {
1281 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1282
1283 if (p[x]->gh_flags & GL_ATIME)
1284 error = gfs2_glock_nq_atime(p[x]);
1285 else
1286 error = gfs2_glock_nq(p[x]);
1287
1288 if (error) {
1289 while (x--)
1290 gfs2_glock_dq(p[x]);
1291 break;
1292 }
1293 }
1294
1295 kfree(p);
1296 return error;
1297}
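/*
 * Editorial usage sketch, not part of the original patch (hypothetical
 * caller): sorting by lock number above imposes a global acquisition
 * order, the usual deadlock-avoidance trick when taking several glocks
 * at once:
 *
 *	struct gfs2_holder ghs[2];
 *
 *	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &ghs[0]);
 *	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &ghs[1]);
 *	error = gfs2_glock_nq_m_atime(2, ghs);
 */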
1298
1300static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1302{
1303 struct buffer_head *dibh;
1304 int error;
1305
1306 error = gfs2_meta_inode_buffer(ip, &dibh);
1307 if (!error) {
1308 error = inode_setattr(&ip->i_inode, attr);
1309 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1310 gfs2_inode_attr_out(ip);
1311
1312 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1313 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1314 brelse(dibh);
1315 }
1316 return error;
1317}
1318
1319/**
1320 * gfs2_setattr_simple - Set attributes on an inode under a transaction
1321 * @ip: The GFS2 inode
1322 * @attr: The attributes to set
1323 *
1324 * Called with a reference on the vnode.
1325 *
1326 * Returns: errno
1327 */
1328
1329int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1330{
1331 int error;
1332
1333 if (current->journal_info)
1334 return __gfs2_setattr_simple(ip, attr);
1335
1336 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
1337 if (error)
1338 return error;
1339
1340 error = __gfs2_setattr_simple(ip, attr);
1341 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1342 return error;
1343}
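/*
 * Editorial note, not part of the original patch:
 * current->journal_info is non-NULL while a GFS2 transaction is open,
 * which is how gfs2_setattr_simple() nests -- a caller already inside
 * a transaction (sketch):
 *
 *	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
 *	...
 *	error = gfs2_setattr_simple(ip, attr);	(reuses the open trans)
 *	gfs2_trans_end(sdp);
 *
 * skips the inner trans_begin/trans_end pair automatically.
 */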
1344
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..8bb8b559bcea
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
31struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
32
33int gfs2_inode_refresh(struct gfs2_inode *ip);
34
35int gfs2_dinode_dealloc(struct gfs2_inode *inode);
36int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
37struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
38 int is_root, struct nameidata *nd);
39struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
40 unsigned int mode);
41int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
42 struct gfs2_inode *ip);
43int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
44 struct gfs2_inode *ip);
45int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
46int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
47
48int gfs2_glock_nq_atime(struct gfs2_holder *gh);
49int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
50
51int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
52
53struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
54
55#endif /* __INODE_DOT_H__ */
56
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..f45c0ffd1c35
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,244 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "lm.h"
23#include "super.h"
24#include "util.h"
25#include "lvb.h"
26
27/**
28 * gfs2_lm_mount - mount a locking protocol
29 * @sdp: the filesystem
 30 * @silent: if 1, don't complain if the FS isn't a GFS2 filesystem
32 *
33 * Returns: errno
34 */
35
36int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
37{
38 char *proto = sdp->sd_proto_name;
39 char *table = sdp->sd_table_name;
40 int flags = 0;
41 int error;
42
43 if (sdp->sd_args.ar_spectator)
44 flags |= LM_MFLAG_SPECTATOR;
45
46 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
47
48 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
49 gfs2_glock_cb, sdp,
50 GFS2_MIN_LVB_SIZE, flags,
51 &sdp->sd_lockstruct, &sdp->sd_kobj);
52 if (error) {
53 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
54 proto, table, sdp->sd_args.ar_hostdata);
55 goto out;
56 }
57
58 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
59 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
60 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
61 GFS2_MIN_LVB_SIZE)) {
62 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
63 goto out;
64 }
65
66 if (sdp->sd_args.ar_spectator)
67 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
68 else
69 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
70 sdp->sd_lockstruct.ls_jid);
71
72 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
73
74 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
75 !sdp->sd_args.ar_ignore_local_fs) {
76 sdp->sd_args.ar_localflocks = 1;
77 sdp->sd_args.ar_localcaching = 1;
78 }
79
80 out:
81 return error;
82}
83
84void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
85{
86 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
87 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
88 sdp->sd_lockstruct.ls_lockspace);
89}
90
91void gfs2_lm_unmount(struct gfs2_sbd *sdp)
92{
93 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
94 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
95}
96
97int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
98{
99 va_list args;
100
101 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
102 return 0;
103
104 va_start(args, fmt);
105 vprintk(fmt, args);
106 va_end(args);
107
108 fs_err(sdp, "about to withdraw from the cluster\n");
109 BUG_ON(sdp->sd_args.ar_debug);
110
111
112 fs_err(sdp, "waiting for outstanding I/O\n");
113
114 /* FIXME: suspend dm device so outstanding bios complete
115 and all further I/O requests fail */
116
117 fs_err(sdp, "telling LM to withdraw\n");
118 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
119 fs_err(sdp, "withdrawn\n");
120 dump_stack();
121
122 return -1;
123}
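
The rest of gfs2 reaches this path through its consistency-check helpers; an illustrative caller, following the pattern of the assert macros in util.c (the condition name is made up):

	static void example_check(struct gfs2_sbd *sdp, int corrupt)
	{
		if (unlikely(corrupt))
			gfs2_lm_withdraw(sdp,
				"GFS2: fsid=%s: fatal: invalid metadata block\n",
				sdp->sd_fsname);
	}
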
124
125int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
126 lm_lock_t **lockp)
127{
128 int error;
129 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
130 error = -EIO;
131 else
132 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
133 sdp->sd_lockstruct.ls_lockspace, name, lockp);
134 return error;
135}
136
137void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock)
138{
139 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
140 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
141}
142
143unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
144 unsigned int cur_state, unsigned int req_state,
145 unsigned int flags)
146{
147 int ret;
148 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
149 ret = 0;
150 else
151 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
152 cur_state,
153 req_state, flags);
154 return ret;
155}
156
157unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
158 unsigned int cur_state)
159{
160 int ret;
161 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
162 ret = 0;
163 else
164 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
165 return ret;
166}
167
168void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock)
169{
170 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
171 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
172}
173
174int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp)
175{
176 int error;
177 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
178 error = -EIO;
179 else
180 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
181 return error;
182}
183
184void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
185{
186 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
187 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
188}
189
190#if 0
191void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
192{
193 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
194 sdp->sd_lockstruct.ls_ops->lm_sync_lvb(lock, lvb);
195}
196#endif /* 0 */
197
198int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
203 error = -EIO;
204 else
205 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
206 sdp->sd_lockstruct.ls_lockspace,
207 name, file, fl);
208 return error;
209}
210
211int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
212 struct file *file, int cmd, struct file_lock *fl)
213{
214 int error;
215 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
216 error = -EIO;
217 else
218 error = sdp->sd_lockstruct.ls_ops->lm_plock(
219 sdp->sd_lockstruct.ls_lockspace,
220 name, file, cmd, fl);
221 return error;
222}
223
224int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
225 struct file *file, struct file_lock *fl)
226{
227 int error;
228 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
229 error = -EIO;
230 else
231 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
232 sdp->sd_lockstruct.ls_lockspace,
233 name, file, fl);
234 return error;
235}
236
237void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
238 unsigned int message)
239{
240 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
241 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
242 sdp->sd_lockstruct.ls_lockspace, jid, message);
243}
244
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..e821101d19c0
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
14void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
16int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
17__attribute__ ((format(printf, 2, 3)));
18int gfs2_lm_get_lock(struct gfs2_sbd *sdp,
19 struct lm_lockname *name, lm_lock_t **lockp);
20void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock);
21unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
22 unsigned int cur_state, unsigned int req_state,
23 unsigned int flags);
24unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
25 unsigned int cur_state);
26void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock);
27int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp);
28void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
29int gfs2_lm_plock_get(struct gfs2_sbd *sdp,
30 struct lm_lockname *name,
31 struct file *file, struct file_lock *fl);
32int gfs2_lm_plock(struct gfs2_sbd *sdp,
33 struct lm_lockname *name,
34 struct file *file, int cmd, struct file_lock *fl);
35int gfs2_lm_punlock(struct gfs2_sbd *sdp,
36 struct lm_lockname *name,
37 struct file *file, struct file_lock *fl);
38void gfs2_lm_recovery_done(struct gfs2_sbd *sdp,
39 unsigned int jid, unsigned int message);
40
41#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/lm_interface.h b/fs/gfs2/lm_interface.h
new file mode 100644
index 000000000000..1da95a55f768
--- /dev/null
+++ b/fs/gfs2/lm_interface.h
@@ -0,0 +1,290 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_INTERFACE_DOT_H__
11#define __LM_INTERFACE_DOT_H__
12
13/*
14 * Opaque handles represent the lock module's lockspace structure, the lock
15 * module's lock structures, and GFS's file system (superblock) structure.
16 */
17
18typedef void lm_lockspace_t;
19typedef void lm_lock_t;
20typedef void lm_fsdata_t;
21
22typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
23 void *data);
24
25/*
26 * lm_mount() flags
27 *
28 * LM_MFLAG_SPECTATOR
29 * GFS is asking to join the filesystem's lockspace, but it doesn't want to
30 * modify the filesystem. The lock module shouldn't assign a journal to the FS
31 * mount. It shouldn't send recovery callbacks to the FS mount. If the node
32 * dies or withdraws, all locks can be wiped immediately.
33 */
34
35#define LM_MFLAG_SPECTATOR 0x00000001
36
37/*
38 * lm_lockstruct flags
39 *
40 * LM_LSFLAG_LOCAL
41 * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS
42 * can make single-node optimizations.
43 */
44
45#define LM_LSFLAG_LOCAL 0x00000001
46
47/*
48 * lm_lockname types
49 */
50
51#define LM_TYPE_RESERVED 0x00
52#define LM_TYPE_NONDISK 0x01
53#define LM_TYPE_INODE 0x02
54#define LM_TYPE_RGRP 0x03
55#define LM_TYPE_META 0x04
56#define LM_TYPE_IOPEN 0x05
57#define LM_TYPE_FLOCK 0x06
58#define LM_TYPE_PLOCK 0x07
59#define LM_TYPE_QUOTA 0x08
60#define LM_TYPE_JOURNAL 0x09
61
62/*
63 * lm_lock() states
64 *
65 * SHARED is compatible with SHARED, not with DEFERRED or EX.
66 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
67 */
68
69#define LM_ST_UNLOCKED 0
70#define LM_ST_EXCLUSIVE 1
71#define LM_ST_DEFERRED 2
72#define LM_ST_SHARED 3
73
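
The compatibility rules above reduce to a small predicate; a sketch for illustration only (UNLOCKED conflicts with nothing, SHARED and DEFERRED are each compatible only with themselves, EXCLUSIVE with nothing):

	static int lm_states_compatible(unsigned int a, unsigned int b)
	{
		if (a == LM_ST_UNLOCKED || b == LM_ST_UNLOCKED)
			return 1;
		return a == b && a != LM_ST_EXCLUSIVE;
	}
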
74/*
75 * lm_lock() flags
76 *
77 * LM_FLAG_TRY
78 * Don't wait to acquire the lock if it can't be granted immediately.
79 *
80 * LM_FLAG_TRY_1CB
81 * Send one blocking callback if TRY is set and the lock is not granted.
82 *
83 * LM_FLAG_NOEXP
84 * GFS sets this flag on lock requests it makes while doing journal recovery.
85 * These special requests should not be blocked by the recovery the way
86 * ordinary locks would be.
87 *
88 * LM_FLAG_ANY
89 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
90 * also be granted in SHARED. The preferred state is whichever is compatible
91 * with other granted locks, or the specified state if no other locks exist.
92 *
93 * LM_FLAG_PRIORITY
94 * Override fairness considerations. Suppose a lock is held in a shared state
95 * and there is a pending request for the deferred state. A shared lock
96 * request with the priority flag would be allowed to bypass the deferred
97 * request and directly join the other shared lock. A shared lock request
98 * without the priority flag might be forced to wait until the deferred
99 * request had acquired and released the lock.
100 */
101
102#define LM_FLAG_TRY 0x00000001
103#define LM_FLAG_TRY_1CB 0x00000002
104#define LM_FLAG_NOEXP 0x00000004
105#define LM_FLAG_ANY 0x00000008
106#define LM_FLAG_PRIORITY 0x00000010
107
108/*
109 * lm_lock() and lm_async_cb return flags
110 *
111 * LM_OUT_ST_MASK
112 * Masks the lower two bits of lock state in the returned value.
113 *
114 * LM_OUT_CACHEABLE
115 * The lock hasn't been released so GFS can continue to cache data for it.
116 *
117 * LM_OUT_CANCELED
118 * The lock request was canceled.
119 *
120 * LM_OUT_ASYNC
121 * The result of the request will be returned in an LM_CB_ASYNC callback.
122 */
123
124#define LM_OUT_ST_MASK 0x00000003
125#define LM_OUT_CACHEABLE 0x00000004
126#define LM_OUT_CANCELED 0x00000008
127#define LM_OUT_ASYNC 0x00000080
128#define LM_OUT_ERROR 0x00000100
129
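
How a caller might decode an lm_lock() return value with these masks; an illustrative sketch only (with lock_dlm, requests normally complete asynchronously):

	static void example_try_lock(struct gfs2_sbd *sdp, lm_lock_t *lock)
	{
		unsigned int result;

		result = gfs2_lm_lock(sdp, lock, LM_ST_UNLOCKED,
				      LM_ST_SHARED, LM_FLAG_TRY);
		if (result & LM_OUT_ASYNC) {
			/* the grant arrives in an LM_CB_ASYNC callback */
		} else if (result & LM_OUT_CANCELED) {
			/* the TRY request was not granted immediately */
		} else if ((result & LM_OUT_ST_MASK) == LM_ST_SHARED) {
			/* granted synchronously in the requested state */
		}
	}
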
130/*
131 * lm_callback_t types
132 *
133 * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S
134 * Blocking callback, a remote node is requesting the given lock in
135 * EXCLUSIVE, DEFERRED, or SHARED.
136 *
137 * LM_CB_NEED_RECOVERY
138 * The given journal needs to be recovered.
139 *
140 * LM_CB_DROPLOCKS
141 * Reduce the number of cached locks.
142 *
143 * LM_CB_ASYNC
144 * The given lock has been granted.
145 */
146
147#define LM_CB_NEED_E 257
148#define LM_CB_NEED_D 258
149#define LM_CB_NEED_S 259
150#define LM_CB_NEED_RECOVERY 260
151#define LM_CB_DROPLOCKS 261
152#define LM_CB_ASYNC 262
153
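
A sketch of a filesystem-side callback dispatching on these types (gfs2's real handler is gfs2_glock_cb in glock.c; the payload notes are inferred from the callers in this patch, e.g. recover_store() in locking/dlm/sysfs.c):

	static void example_cb(lm_fsdata_t *fsdata, unsigned int type,
			       void *data)
	{
		switch (type) {
		case LM_CB_NEED_E:
		case LM_CB_NEED_D:
		case LM_CB_NEED_S:
			/* data identifies the lock a remote node wants
			   (lock_dlm passes the struct lm_lockname) */
			break;
		case LM_CB_NEED_RECOVERY:
			/* data: pointer to the journal id to recover */
			break;
		case LM_CB_ASYNC:
			/* data: struct lm_async_cb with name and result */
			break;
		case LM_CB_DROPLOCKS:
			/* no payload; shrink the cached-lock count */
			break;
		}
	}
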
154/*
155 * lm_recovery_done() messages
156 */
157
158#define LM_RD_GAVEUP 308
159#define LM_RD_SUCCESS 309
160
161
162struct lm_lockname {
163 uint64_t ln_number;
164 unsigned int ln_type;
165};
166
167#define lm_name_equal(name1, name2) \
168 (((name1)->ln_number == (name2)->ln_number) && \
169 ((name1)->ln_type == (name2)->ln_type))
170
171struct lm_async_cb {
172 struct lm_lockname lc_name;
173 int lc_ret;
174};
175
176struct lm_lockstruct;
177
178struct lm_lockops {
179 char lm_proto_name[256];
180
181 /*
182 * Mount/Unmount
183 */
184
185 int (*lm_mount) (char *table_name, char *host_data,
186 lm_callback_t cb, lm_fsdata_t *fsdata,
187 unsigned int min_lvb_size, int flags,
188 struct lm_lockstruct *lockstruct,
189 struct kobject *fskobj);
190
191 void (*lm_others_may_mount) (lm_lockspace_t *lockspace);
192
193 void (*lm_unmount) (lm_lockspace_t *lockspace);
194
195 void (*lm_withdraw) (lm_lockspace_t *lockspace);
196
197 /*
198 * Lock oriented operations
199 */
200
201 int (*lm_get_lock) (lm_lockspace_t *lockspace,
202 struct lm_lockname *name, lm_lock_t **lockp);
203
204 void (*lm_put_lock) (lm_lock_t *lock);
205
206 unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
207 unsigned int req_state, unsigned int flags);
208
209 unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);
210
211 void (*lm_cancel) (lm_lock_t *lock);
212
213 int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
214 void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
215 void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);
216
217 /*
218 * Posix Lock oriented operations
219 */
220
221 int (*lm_plock_get) (lm_lockspace_t *lockspace,
222 struct lm_lockname *name,
223 struct file *file, struct file_lock *fl);
224
225 int (*lm_plock) (lm_lockspace_t *lockspace,
226 struct lm_lockname *name,
227 struct file *file, int cmd, struct file_lock *fl);
228
229 int (*lm_punlock) (lm_lockspace_t *lockspace,
230 struct lm_lockname *name,
231 struct file *file, struct file_lock *fl);
232
233 /*
234 * Client oriented operations
235 */
236
237 void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
238 unsigned int message);
239
240 struct module *lm_owner;
241};
242
243/*
244 * lm_mount() return values
245 *
246 * ls_jid - the journal ID this node should use
247 * ls_first - this node is the first to mount the file system
248 * ls_lvb_size - size in bytes of lock value blocks
249 * ls_lockspace - lock module's context for this file system
250 * ls_ops - lock module's functions
251 * ls_flags - lock module features
252 */
253
254struct lm_lockstruct {
255 unsigned int ls_jid;
256 unsigned int ls_first;
257 unsigned int ls_lvb_size;
258 lm_lockspace_t *ls_lockspace;
259 struct lm_lockops *ls_ops;
260 int ls_flags;
261};
262
263void __init gfs2_init_lmh(void);
264
265/*
266 * Lock module bottom interface. A lock module makes itself available to GFS
267 * with these functions.
268 */
269
270int gfs2_register_lockproto(struct lm_lockops *proto);
271
272void gfs2_unregister_lockproto(struct lm_lockops *proto);
273
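
A minimal sketch of a lock module plugging in ("lock_example" is a made-up protocol name; the real patterns are in locking/dlm/main.c and locking/nolock/main.c):

	static struct lm_lockops example_ops = {
		.lm_proto_name = "lock_example",
		/* ... mount, lock and LVB operations ... */
		.lm_owner = THIS_MODULE,
	};

	static int __init example_init(void)
	{
		return gfs2_register_lockproto(&example_ops);
	}

	static void __exit example_exit(void)
	{
		gfs2_unregister_lockproto(&example_ops);
	}
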
274/*
275 * Lock module top interface. GFS calls these functions when mounting or
276 * unmounting a file system.
277 */
278
279int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
280 lm_callback_t cb, lm_fsdata_t *fsdata,
281 unsigned int min_lvb_size, int flags,
282 struct lm_lockstruct *lockstruct,
283 struct kobject *fskobj);
284
285void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct);
286
287void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct);
288
289#endif /* __LM_INTERFACE_DOT_H__ */
290
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..ded1ef6c81e7
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19
20#include "lm_interface.h"
21
22struct lmh_wrapper {
23 struct list_head lw_list;
24 struct lm_lockops *lw_ops;
25};
26
27/* List of registered low-level locking protocols. A file system selects one
28 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
29
30static struct list_head lmh_list;
31static struct mutex lmh_lock;
32
33/**
34 * gfs2_register_lockproto - Register a low-level locking protocol
35 * @proto: the protocol definition
36 *
37 * Returns: 0 on success, -EXXX on failure
38 */
39
40int gfs2_register_lockproto(struct lm_lockops *proto)
41{
42 struct lmh_wrapper *lw;
43
44 mutex_lock(&lmh_lock);
45
46 list_for_each_entry(lw, &lmh_list, lw_list) {
47 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
48 mutex_unlock(&lmh_lock);
49 printk(KERN_INFO "GFS2: protocol %s already exists\n",
50 proto->lm_proto_name);
51 return -EEXIST;
52 }
53 }
54
55 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
56 if (!lw) {
57 mutex_unlock(&lmh_lock);
58 return -ENOMEM;
59 }
60
61 lw->lw_ops = proto;
62 list_add(&lw->lw_list, &lmh_list);
63
64 mutex_unlock(&lmh_lock);
65
66 return 0;
67}
68
69/**
70 * gfs2_unregister_lockproto - Unregister a low-level locking protocol
71 * @proto: the protocol definition
72 *
73 */
74
75void gfs2_unregister_lockproto(struct lm_lockops *proto)
76{
77 struct lmh_wrapper *lw;
78
79 mutex_lock(&lmh_lock);
80
81 list_for_each_entry(lw, &lmh_list, lw_list) {
82 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
83 list_del(&lw->lw_list);
84 mutex_unlock(&lmh_lock);
85 kfree(lw);
86 return;
87 }
88 }
89
90 mutex_unlock(&lmh_lock);
91
92 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
93 proto->lm_proto_name);
94}
95
96/**
97 * gfs2_mount_lockproto - Mount a lock protocol
98 * @proto_name: the name of the protocol
99 * @table_name: the name of the lock space
100 * @host_data: data specific to this host
101 * @cb: the callback to the code using the lock module
102 * @fsdata: data to pass back with the callback
103 * @min_lvb_size: the minimum LVB size that the caller can deal with
104 * @flags: LM_MFLAG_*
105 * @lockstruct: a structure returned describing the mount
106 *
107 * Returns: 0 on success, -EXXX on failure
108 */
109
110int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
111 lm_callback_t cb, lm_fsdata_t *fsdata,
112 unsigned int min_lvb_size, int flags,
113 struct lm_lockstruct *lockstruct,
114 struct kobject *fskobj)
115{
116 struct lmh_wrapper *lw = NULL;
117 int try = 0;
118 int error, found;
119
120retry:
121 mutex_lock(&lmh_lock);
122
123 found = 0;
124 list_for_each_entry(lw, &lmh_list, lw_list) {
125 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
126 found = 1;
127 break;
128 }
129 }
130
131 if (!found) {
132 if (!try && capable(CAP_SYS_MODULE)) {
133 try = 1;
134 mutex_unlock(&lmh_lock);
135 request_module(proto_name);
136 goto retry;
137 }
138 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
139 error = -ENOENT;
140 goto out;
141 }
142
143 if (!try_module_get(lw->lw_ops->lm_owner)) {
144 try = 0;
145 mutex_unlock(&lmh_lock);
146 msleep(1000);
147 goto retry;
148 }
149
150 error = lw->lw_ops->lm_mount(table_name, host_data, cb, fsdata,
151 min_lvb_size, flags, lockstruct, fskobj);
152 if (error)
153 module_put(lw->lw_ops->lm_owner);
154out:
155 mutex_unlock(&lmh_lock);
156 return error;
157}
158
159void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
160{
161 mutex_lock(&lmh_lock);
162 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
163 if (lockstruct->ls_ops->lm_owner)
164 module_put(lockstruct->ls_ops->lm_owner);
165 mutex_unlock(&lmh_lock);
166}
167
168/**
169 * gfs2_withdraw_lockproto - abnormally unmount a lock module
170 * @lockstruct: the lockstruct passed into mount
171 *
172 */
173
174void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
175{
176 mutex_lock(&lmh_lock);
177 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
178 if (lockstruct->ls_ops->lm_owner)
179 module_put(lockstruct->ls_ops->lm_owner);
180 mutex_unlock(&lmh_lock);
181}
182
183void __init gfs2_init_lmh(void)
184{
185 mutex_init(&lmh_lock);
186 INIT_LIST_HEAD(&lmh_list);
187}
188
189EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
190EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
191
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..a9733ff80371
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..f769eac1a34a
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,541 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
26static inline void gdlm_ast(void *astarg)
27{
28 queue_complete(astarg);
29}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) {
45 list_add_tail(&lp->blist, &ls->blocking);
46 lp->bast_mode = mode;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait);
51}
52
53void gdlm_queue_delayed(struct gdlm_lock *lp)
54{
55 struct gdlm_ls *ls = lp->ls;
56
57 spin_lock(&ls->async_lock);
58 list_add_tail(&lp->delay_list, &ls->delayed);
59 spin_unlock(&ls->async_lock);
60}
61
62/* convert gfs lock-state to dlm lock-mode */
63
64static int16_t make_mode(int16_t lmstate)
65{
66 switch (lmstate) {
67 case LM_ST_UNLOCKED:
68 return DLM_LOCK_NL;
69 case LM_ST_EXCLUSIVE:
70 return DLM_LOCK_EX;
71 case LM_ST_DEFERRED:
72 return DLM_LOCK_CW;
73 case LM_ST_SHARED:
74 return DLM_LOCK_PR;
75 }
76 gdlm_assert(0, "unknown LM state %d", lmstate);
77 return -1;
78}
79
80/* convert dlm lock-mode to gfs lock-state */
81
82int16_t gdlm_make_lmstate(int16_t dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98
99/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
101
102static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
103{
104 int16_t cur = make_mode(cur_state);
105 if (lp->cur != DLM_LOCK_IV)
106 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
107}
108
109static inline unsigned int make_flags(struct gdlm_lock *lp,
110 unsigned int gfs_flags,
111 int16_t cur, int16_t req)
112{
113 unsigned int lkf = 0;
114
115 if (gfs_flags & LM_FLAG_TRY)
116 lkf |= DLM_LKF_NOQUEUE;
117
118 if (gfs_flags & LM_FLAG_TRY_1CB) {
119 lkf |= DLM_LKF_NOQUEUE;
120 lkf |= DLM_LKF_NOQUEUEBAST;
121 }
122
123 if (gfs_flags & LM_FLAG_PRIORITY) {
124 lkf |= DLM_LKF_NOORDER;
125 lkf |= DLM_LKF_HEADQUE;
126 }
127
128 if (gfs_flags & LM_FLAG_ANY) {
129 if (req == DLM_LOCK_PR)
130 lkf |= DLM_LKF_ALTCW;
131 else if (req == DLM_LOCK_CW)
132 lkf |= DLM_LKF_ALTPR;
133 }
134
135 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK;
144 }
145
146 if (lp->lvb)
147 lkf |= DLM_LKF_VALBLK;
148
149 return lkf;
150}
151
152/* make_strname - convert GFS lock numbers to a string */
153
154static inline void make_strname(struct lm_lockname *lockname,
155 struct gdlm_strname *str)
156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
158 (unsigned long long)lockname->ln_number);
159 str->namelen = GDLM_STRNAME_BYTES;
160}
161
162static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
163 struct gdlm_lock **lpp)
164{
165 struct gdlm_lock *lp;
166
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
168 if (!lp)
169 return -ENOMEM;
170
171 lp->lockname = *name;
172 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL;
175 lp->hold_null = NULL;
176 init_completion(&lp->ast_wait);
177 INIT_LIST_HEAD(&lp->clist);
178 INIT_LIST_HEAD(&lp->blist);
179 INIT_LIST_HEAD(&lp->delay_list);
180
181 spin_lock(&ls->async_lock);
182 list_add(&lp->all_list, &ls->all_locks);
183 ls->all_locks_count++;
184 spin_unlock(&ls->async_lock);
185
186 *lpp = lp;
187 return 0;
188}
189
190void gdlm_delete_lp(struct gdlm_lock *lp)
191{
192 struct gdlm_ls *ls = lp->ls;
193
194 spin_lock(&ls->async_lock);
195 if (!list_empty(&lp->clist))
196 list_del_init(&lp->clist);
197 if (!list_empty(&lp->blist))
198 list_del_init(&lp->blist);
199 if (!list_empty(&lp->delay_list))
200 list_del_init(&lp->delay_list);
201 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
202 (unsigned long long)lp->lockname.ln_number);
203 list_del_init(&lp->all_list);
204 ls->all_locks_count--;
205 spin_unlock(&ls->async_lock);
206
207 kfree(lp);
208}
209
210int gdlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
211 lm_lock_t **lockp)
212{
213 struct gdlm_lock *lp;
214 int error;
215
216 error = gdlm_create_lp((struct gdlm_ls *) lockspace, name, &lp);
217
218 *lockp = (lm_lock_t *) lp;
219 return error;
220}
221
222void gdlm_put_lock(lm_lock_t *lock)
223{
224 gdlm_delete_lp((struct gdlm_lock *) lock);
225}
226
227unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{
229 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1;
232
233 /*
234 * When recovery is in progress, delay lock requests so they are
235 * submitted once recovery is done. Requests for recovery (NOEXP) and
236 * unlocks can pass.
237 */
238
239 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
240 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
241 gdlm_queue_delayed(lp);
242 return LM_OUT_ASYNC;
243 }
244
245 /*
246 * Submit the actual lock request.
247 */
248
249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0;
251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags);
255
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
257 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
258 lp->cur, lp->req, lp->lkf);
259
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp,
262 bast ? gdlm_bast : NULL);
263
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN;
266 queue_complete(lp);
267 error = 0;
268 }
269
270 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags);
275 return LM_OUT_ERROR;
276 }
277 return LM_OUT_ASYNC;
278}
279
280static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
281{
282 struct gdlm_ls *ls = lp->ls;
283 unsigned int lkf = 0;
284 int error;
285
286 set_bit(LFL_DLM_UNLOCK, &lp->flags);
287 set_bit(LFL_ACTIVE, &lp->flags);
288
289 if (lp->lvb)
290 lkf = DLM_LKF_VALBLK;
291
292 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
293 (unsigned long long)lp->lockname.ln_number,
294 lp->lksb.sb_lkid, lp->cur, lkf);
295
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297
298 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags);
303 return LM_OUT_ERROR;
304 }
305 return LM_OUT_ASYNC;
306}
307
308unsigned int gdlm_lock(lm_lock_t *lock, unsigned int cur_state,
309 unsigned int req_state, unsigned int flags)
310{
311 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
312
313 clear_bit(LFL_DLM_CANCEL, &lp->flags);
314 if (flags & LM_FLAG_NOEXP)
315 set_bit(LFL_NOBLOCK, &lp->flags);
316
317 check_cur_state(lp, cur_state);
318 lp->req = make_mode(req_state);
319 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
320
321 return gdlm_do_lock(lp);
322}
323
324unsigned int gdlm_unlock(lm_lock_t *lock, unsigned int cur_state)
325{
326 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
327
328 clear_bit(LFL_DLM_CANCEL, &lp->flags);
329 if (lp->cur == DLM_LOCK_IV)
330 return 0;
331 return gdlm_do_unlock(lp);
332}
333
334void gdlm_cancel(lm_lock_t *lock)
335{
336 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
337 struct gdlm_ls *ls = lp->ls;
338 int error, delay_list = 0;
339
340 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
341 return;
342
343 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
344 (unsigned long long)lp->lockname.ln_number, lp->flags);
345
346 spin_lock(&ls->async_lock);
347 if (!list_empty(&lp->delay_list)) {
348 list_del_init(&lp->delay_list);
349 delay_list = 1;
350 }
351 spin_unlock(&ls->async_lock);
352
353 if (delay_list) {
354 set_bit(LFL_CANCEL, &lp->flags);
355 set_bit(LFL_ACTIVE, &lp->flags);
356 queue_complete(lp);
357 return;
358 }
359
360 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
361 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
362 log_info("gdlm_cancel skip %x,%llx flags %lx",
363 lp->lockname.ln_type,
364 (unsigned long long)lp->lockname.ln_number, lp->flags);
365 return;
366 }
367
368 /* the lock is blocked in the dlm */
369
370 set_bit(LFL_DLM_CANCEL, &lp->flags);
371 set_bit(LFL_ACTIVE, &lp->flags);
372
373 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
374 NULL, lp);
375
376 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
377 lp->lockname.ln_type,
378 (unsigned long long)lp->lockname.ln_number, lp->flags);
379
380 if (error == -EBUSY)
381 clear_bit(LFL_DLM_CANCEL, &lp->flags);
382}
383
384static int gdlm_add_lvb(struct gdlm_lock *lp)
385{
386 char *lvb;
387
388 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
389 if (!lvb)
390 return -ENOMEM;
391
392 lp->lksb.sb_lvbptr = lvb;
393 lp->lvb = lvb;
394 return 0;
395}
396
397static void gdlm_del_lvb(struct gdlm_lock *lp)
398{
399 kfree(lp->lvb);
400 lp->lvb = NULL;
401 lp->lksb.sb_lvbptr = NULL;
402}
403
404/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
405 the completion) because gfs won't call hold_lvb() during a callback (from
406 the context of a lock_dlm thread). */
407
408static int hold_null_lock(struct gdlm_lock *lp)
409{
410 struct gdlm_lock *lpn = NULL;
411 int error;
412
413 if (lp->hold_null) {
414 printk(KERN_INFO "lock_dlm: lvb already held\n");
415 return 0;
416 }
417
418 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
419 if (error)
420 goto out;
421
422 lpn->lksb.sb_lvbptr = junk_lvb;
423 lpn->lvb = junk_lvb;
424
425 lpn->req = DLM_LOCK_NL;
426 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
427 set_bit(LFL_NOBAST, &lpn->flags);
428 set_bit(LFL_INLOCK, &lpn->flags);
429
430 init_completion(&lpn->ast_wait);
431 gdlm_do_lock(lpn);
432 wait_for_completion(&lpn->ast_wait);
433 error = lpn->lksb.sb_status;
434 if (error) {
435 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
436 error);
437 gdlm_delete_lp(lpn);
438 lpn = NULL;
439 }
440 out:
441 lp->hold_null = lpn;
442 return error;
443}
444
445/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
446 the completion) because gfs may call unhold_lvb() during a callback (from
447 the context of a lock_dlm thread) which could cause a deadlock since the
448 other lock_dlm thread could be engaged in recovery. */
449
450static void unhold_null_lock(struct gdlm_lock *lp)
451{
452 struct gdlm_lock *lpn = lp->hold_null;
453
454 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
455 (unsigned long long)lp->lockname.ln_number);
456 lpn->lksb.sb_lvbptr = NULL;
457 lpn->lvb = NULL;
458 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
459 gdlm_do_unlock(lpn);
460 lp->hold_null = NULL;
461}
462
463/* Acquire a NL lock because gfs requires the value block to remain
464 intact on the resource while the lvb is "held" even if it's holding no locks
465 on the resource. */
466
467int gdlm_hold_lvb(lm_lock_t *lock, char **lvbp)
468{
469 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
470 int error;
471
472 error = gdlm_add_lvb(lp);
473 if (error)
474 return error;
475
476 *lvbp = lp->lvb;
477
478 error = hold_null_lock(lp);
479 if (error)
480 gdlm_del_lvb(lp);
481
482 return error;
483}
484
485void gdlm_unhold_lvb(lm_lock_t *lock, char *lvb)
486{
487 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
488
489 unhold_null_lock(lp);
490 gdlm_del_lvb(lp);
491}
492
493void gdlm_sync_lvb(lm_lock_t *lock, char *lvb)
494{
495 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
496
497 if (lp->cur != DLM_LOCK_EX)
498 return;
499
500 init_completion(&lp->ast_wait);
501 set_bit(LFL_SYNC_LVB, &lp->flags);
502
503 lp->req = DLM_LOCK_EX;
504 lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
505
506 gdlm_do_lock(lp);
507 wait_for_completion(&lp->ast_wait);
508}
509
510void gdlm_submit_delayed(struct gdlm_ls *ls)
511{
512 struct gdlm_lock *lp, *safe;
513
514 spin_lock(&ls->async_lock);
515 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
516 list_del_init(&lp->delay_list);
517 list_add_tail(&lp->delay_list, &ls->submit);
518 }
519 spin_unlock(&ls->async_lock);
520 wake_up(&ls->thread_wait);
521}
522
523int gdlm_release_all_locks(struct gdlm_ls *ls)
524{
525 struct gdlm_lock *lp, *safe;
526 int count = 0;
527
528 spin_lock(&ls->async_lock);
529 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
530 list_del_init(&lp->all_list);
531
532 if (lp->lvb && lp->lvb != junk_lvb)
533 kfree(lp->lvb);
534 kfree(lp);
535 count++;
536 }
537 spin_unlock(&ls->async_lock);
538
539 return count;
540}
541
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..530c2f542584
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,188 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include "../../lm_interface.h"
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler). */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
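
A worked example with illustrative values: an inode lock (type 0x2) on block 0x1234 produces the fixed-width, space-padded name below (see make_strname() in lock.c):

	char name[GDLM_STRNAME_BYTES + 1];	/* +1 for sprintf's NUL */

	sprintf(name, "%8x%16llx", 0x2 /* LM_TYPE_INODE */,
		(unsigned long long)0x1234);
	/* name == "       2            1234"  (8 + 16 hex chars) */
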
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 uint32_t id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 lm_fsdata_t *fsdata;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed;
78 struct list_head submit;
79 struct list_head all_locks;
80 uint32_t all_locks_count;
81 wait_queue_head_t wait_control;
82 struct task_struct *thread1;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88};
89
90enum {
91 LFL_NOBLOCK = 0,
92 LFL_NOCACHE = 1,
93 LFL_DLM_UNLOCK = 2,
94 LFL_DLM_CANCEL = 3,
95 LFL_SYNC_LVB = 4,
96 LFL_FORCE_PROMOTE = 5,
97 LFL_REREQUEST = 6,
98 LFL_ACTIVE = 7,
99 LFL_INLOCK = 8,
100 LFL_CANCEL = 9,
101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12,
104};
105
106struct gdlm_lock {
107 struct gdlm_ls *ls;
108 struct lm_lockname lockname;
109 char *lvb;
110 struct dlm_lksb lksb;
111
112 int16_t cur;
113 int16_t req;
114 int16_t prev_req;
115 uint32_t lkf; /* dlm flags DLM_LKF_ */
116 unsigned long flags; /* lock_dlm flags LFL_ */
117
118 int bast_mode; /* protected by async_lock */
119 struct completion ast_wait;
120
121 struct list_head clist; /* complete */
122 struct list_head blist; /* blocking */
123 struct list_head delay_list; /* delayed */
124 struct list_head all_list; /* all locks for the fs */
125 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
126};
127
128#define gdlm_assert(assertion, fmt, args...) \
129do { \
130 if (unlikely(!(assertion))) { \
131 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
132 "lock_dlm: " fmt "\n", \
133 #assertion, ##args); \
134 BUG(); \
135 } \
136} while (0)
137
138#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
139#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
140#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
141#ifdef LOCK_DLM_LOG_DEBUG
142#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
143#else
144#define log_debug(fmt, arg...)
145#endif
146
147/* sysfs.c */
148
149int gdlm_sysfs_init(void);
150void gdlm_sysfs_exit(void);
151int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
152void gdlm_kobject_release(struct gdlm_ls *);
153
154/* thread.c */
155
156int gdlm_init_threads(struct gdlm_ls *);
157void gdlm_release_threads(struct gdlm_ls *);
158
159/* lock.c */
160
161int16_t gdlm_make_lmstate(int16_t);
162void gdlm_queue_delayed(struct gdlm_lock *);
163void gdlm_submit_delayed(struct gdlm_ls *);
164int gdlm_release_all_locks(struct gdlm_ls *);
165void gdlm_delete_lp(struct gdlm_lock *);
166unsigned int gdlm_do_lock(struct gdlm_lock *);
167
168int gdlm_get_lock(lm_lockspace_t *, struct lm_lockname *, lm_lock_t **);
169void gdlm_put_lock(lm_lock_t *);
170unsigned int gdlm_lock(lm_lock_t *, unsigned int, unsigned int, unsigned int);
171unsigned int gdlm_unlock(lm_lock_t *, unsigned int);
172void gdlm_cancel(lm_lock_t *);
173int gdlm_hold_lvb(lm_lock_t *, char **);
174void gdlm_unhold_lvb(lm_lock_t *, char *);
175void gdlm_sync_lvb(lm_lock_t *, char *);
176
177/* plock.c */
178
179int gdlm_plock_init(void);
180void gdlm_plock_exit(void);
181int gdlm_plock(lm_lockspace_t *, struct lm_lockname *, struct file *, int,
182 struct file_lock *);
183int gdlm_plock_get(lm_lockspace_t *, struct lm_lockname *, struct file *,
184 struct file_lock *);
185int gdlm_punlock(lm_lockspace_t *, struct lm_lockname *, struct file *,
186 struct file_lock *);
187#endif
188
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..870a1cd99f57
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19static int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs2_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs2_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs2_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51static void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs2_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..3caeafc02a1b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,256 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, lm_fsdata_t *fsdata,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->fsdata = fsdata;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strstr(buf, ":");
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
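
table_name arrives in cluster:filesystem form (gfs2 passes its lock table mount argument through); a worked example of the split above, with made-up names:

	char buf[256] = "alpha:gfs01";
	char *p = strstr(buf, ":");

	*p++ = '\0';
	/* ls->clustername <- "alpha", ls->fsname <- "gfs01" */
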
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%u", nodir);
111
112 } else {
113 log_error("unknown option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
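
host_data is a colon-separated list of key=value options supplied by the userspace mount helper; an illustrative input for the parser above (the values are made up):

	int nodir = 0;
	error = make_args(ls, "jid=0:id=3141:first=1:nodir=0", &nodir);
	/* ls->jid == 0, ls->id == 3141, ls->first == 1, nodir == 0 */
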
121
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, lm_fsdata_t *fsdata,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, fsdata, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = gdlm_kobject_setup(ls, fskobj);
147 if (error)
148 goto out_thread;
149
150 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
151 &ls->dlm_lockspace,
152 nodir ? DLM_LSFL_NODIR : 0,
153 GDLM_LVB_SIZE);
154 if (error) {
155 log_error("dlm_new_lockspace error %d", error);
156 goto out_kobj;
157 }
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167 out_kobj:
168 gdlm_kobject_release(ls);
169 out_thread:
170 gdlm_release_threads(ls);
171 out_free:
172 kfree(ls);
173 out:
174 return error;
175}
176
177static void gdlm_unmount(lm_lockspace_t *lockspace)
178{
179 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197 out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
205 ls->recover_jid_done = jid;
206 ls->recover_jid_status = message;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
208}
209
210static void gdlm_others_may_mount(lm_lockspace_t *lockspace)
211{
212 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
213 ls->first_done = 1;
214 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
215}
216
217/* Userspace gets the offline uevent, blocks new gfs locks on
218 other mounters, and lets us know (sets WITHDRAW flag). Then,
219 userspace leaves the mount group while we leave the lockspace. */
220
221static void gdlm_withdraw(lm_lockspace_t *lockspace)
222{
223 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
224
225 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
226
227 wait_event_interruptible(ls->wait_control,
228 test_bit(DFL_WITHDRAW, &ls->flags));
229
230 dlm_release_lockspace(ls->dlm_lockspace, 2);
231 gdlm_release_threads(ls);
232 gdlm_release_all_locks(ls);
233 gdlm_kobject_release(ls);
234}
235
236struct lm_lockops gdlm_ops = {
237 .lm_proto_name = "lock_dlm",
238 .lm_mount = gdlm_mount,
239 .lm_others_may_mount = gdlm_others_may_mount,
240 .lm_unmount = gdlm_unmount,
241 .lm_withdraw = gdlm_withdraw,
242 .lm_get_lock = gdlm_get_lock,
243 .lm_put_lock = gdlm_put_lock,
244 .lm_lock = gdlm_lock,
245 .lm_unlock = gdlm_unlock,
246 .lm_plock = gdlm_plock,
247 .lm_punlock = gdlm_punlock,
248 .lm_plock_get = gdlm_plock_get,
249 .lm_cancel = gdlm_cancel,
250 .lm_hold_lvb = gdlm_hold_lvb,
251 .lm_unhold_lvb = gdlm_unhold_lvb,
252 .lm_sync_lvb = gdlm_sync_lvb,
253 .lm_recovery_done = gdlm_recovery_done,
254 .lm_owner = THIS_MODULE,
255};
256
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..1acb2519f439
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,302 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = fl->fl_pid;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80 op->info.owner = (__u64)(long) fl->fl_owner;
81
82 send_op(op);
83 wait_event(recv_wq, (op->done != 0));
84
85 spin_lock(&ops_lock);
86 if (!list_empty(&op->list)) {
87 printk(KERN_INFO "plock op on list\n");
88 list_del(&op->list);
89 }
90 spin_unlock(&ops_lock);
91
92 rv = op->info.rv;
93
94 if (!rv) {
95 if (posix_lock_file_wait(file, fl) < 0)
96 log_error("gdlm_plock: vfs lock error %x,%llx",
97 name->ln_type,
98 (unsigned long long)name->ln_number);
99 }
100
101 kfree(op);
102 return rv;
103}
104
105int gdlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
106 struct file *file, struct file_lock *fl)
107{
108 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
109 struct plock_op *op;
110 int rv;
111
112 op = kzalloc(sizeof(*op), GFP_KERNEL);
113 if (!op)
114 return -ENOMEM;
115
116 if (posix_lock_file_wait(file, fl) < 0)
117 log_error("gdlm_punlock: vfs unlock error %x,%llx",
118 name->ln_type, (unsigned long long)name->ln_number);
119
120 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
121 op->info.pid = fl->fl_pid;
122 op->info.fsid = ls->id;
123 op->info.number = name->ln_number;
124 op->info.start = fl->fl_start;
125 op->info.end = fl->fl_end;
126 op->info.owner = (__u64)(long) fl->fl_owner;
127
128 send_op(op);
129 wait_event(recv_wq, (op->done != 0));
130
131 spin_lock(&ops_lock);
132 if (!list_empty(&op->list)) {
133 printk(KERN_INFO "punlock op on list\n");
134 list_del(&op->list);
135 }
136 spin_unlock(&ops_lock);
137
138 rv = op->info.rv;
139
140 kfree(op);
141 return rv;
142}
143
144int gdlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
145 struct file *file, struct file_lock *fl)
146{
147 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
148 struct plock_op *op;
149 int rv;
150
151 op = kzalloc(sizeof(*op), GFP_KERNEL);
152 if (!op)
153 return -ENOMEM;
154
155 op->info.optype = GDLM_PLOCK_OP_GET;
156 op->info.pid = fl->fl_pid;
157 op->info.ex = (fl->fl_type == F_WRLCK);
158 op->info.fsid = ls->id;
159 op->info.number = name->ln_number;
160 op->info.start = fl->fl_start;
161 op->info.end = fl->fl_end;
162
163 send_op(op);
164 wait_event(recv_wq, (op->done != 0));
165
166 spin_lock(&ops_lock);
167 if (!list_empty(&op->list)) {
168 printk(KERN_INFO "plock_get op on list\n");
169 list_del(&op->list);
170 }
171 spin_unlock(&ops_lock);
172
173 rv = op->info.rv;
174
175 if (rv == 0)
176 fl->fl_type = F_UNLCK;
177 else if (rv > 0) {
178 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
179 fl->fl_pid = op->info.pid;
180 fl->fl_start = op->info.start;
181 fl->fl_end = op->info.end;
182 }
183
184 kfree(op);
185 return rv;
186}
187
188/* a read copies out one plock request from the send list */
189static ssize_t dev_read(struct file *file, char __user *u, size_t count,
190 loff_t *ppos)
191{
192 struct gdlm_plock_info info;
193 struct plock_op *op = NULL;
194
195 if (count < sizeof(info))
196 return -EINVAL;
197
198 spin_lock(&ops_lock);
199 if (!list_empty(&send_list)) {
200 op = list_entry(send_list.next, struct plock_op, list);
201 list_move(&op->list, &recv_list);
202 memcpy(&info, &op->info, sizeof(info));
203 }
204 spin_unlock(&ops_lock);
205
206 if (!op)
207 return -EAGAIN;
208
209 if (copy_to_user(u, &info, sizeof(info)))
210 return -EFAULT;
211 return sizeof(info);
212}
213
214/* a write copies in one plock result that should match a plock_op
215 on the recv list */
216static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
217 loff_t *ppos)
218{
219 struct gdlm_plock_info info;
220 struct plock_op *op;
221 int found = 0;
222
223 if (count != sizeof(info))
224 return -EINVAL;
225
226 if (copy_from_user(&info, u, sizeof(info)))
227 return -EFAULT;
228
229 if (check_version(&info))
230 return -EINVAL;
231
232 spin_lock(&ops_lock);
233 list_for_each_entry(op, &recv_list, list) {
234 if (op->info.fsid == info.fsid &&
235 op->info.number == info.number &&
236 op->info.owner == info.owner) {
237 list_del_init(&op->list);
238 found = 1;
239 op->done = 1;
240 memcpy(&op->info, &info, sizeof(info));
241 break;
242 }
243 }
244 spin_unlock(&ops_lock);
245
246 if (found)
247 wake_up(&recv_wq);
248 else
249 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
250 (unsigned long long)info.number);
251 return count;
252}
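
dev_read() and dev_write() together form a request/reply protocol with a userspace daemon. A sketch of the daemon side; the device node path (derived from GDLM_PLOCK_MISC_NAME) and the resolve_plock() helper are assumptions for illustration:

	struct gdlm_plock_info info;
	int fd = open("/dev/misc/lock_dlm_plock", O_RDWR); /* assumed path */

	for (;;) {
		if (read(fd, &info, sizeof(info)) != sizeof(info))
			continue;		/* -EAGAIN: send_list empty */
		info.rv = resolve_plock(&info);	/* hypothetical helper */
		write(fd, &info, sizeof(info));	/* matched by fsid/number/owner */
	}
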
253
254static unsigned int dev_poll(struct file *file, poll_table *wait)
255{
256 poll_wait(file, &send_wq, wait);
257
258 spin_lock(&ops_lock);
259 if (!list_empty(&send_list)) {
260 spin_unlock(&ops_lock);
261 return POLLIN | POLLRDNORM;
262 }
263 spin_unlock(&ops_lock);
264 return 0;
265}
266
267static struct file_operations dev_fops = {
268 .read = dev_read,
269 .write = dev_write,
270 .poll = dev_poll,
271 .owner = THIS_MODULE
272};
273
274static struct miscdevice plock_dev_misc = {
275 .minor = MISC_DYNAMIC_MINOR,
276 .name = GDLM_PLOCK_MISC_NAME,
277 .fops = &dev_fops
278};
279
280int gdlm_plock_init(void)
281{
282 int rv;
283
284 spin_lock_init(&ops_lock);
285 INIT_LIST_HEAD(&send_list);
286 INIT_LIST_HEAD(&recv_list);
287 init_waitqueue_head(&send_wq);
288 init_waitqueue_head(&recv_wq);
289
290 rv = misc_register(&plock_dev_misc);
291 if (rv)
292 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
293 rv);
294 return rv;
295}
296
297void gdlm_plock_exit(void)
298{
299 if (misc_deregister(&plock_dev_misc) < 0)
300 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
301}
302
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..0d8bd0806dba
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,225 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else
46 ret = -EINVAL;
47 return ret;
48}
49
50static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
51{
52 ssize_t ret;
53 int val = 0;
54
55 if (test_bit(DFL_WITHDRAW, &ls->flags))
56 val = 1;
57 ret = sprintf(buf, "%d\n", val);
58 return ret;
59}
60
61static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
62{
63 ssize_t ret = len;
64 int val;
65
66 val = simple_strtol(buf, NULL, 0);
67
68 if (val == 1)
69 set_bit(DFL_WITHDRAW, &ls->flags);
70 else
71 ret = -EINVAL;
72 wake_up(&ls->wait_control);
73 return ret;
74}
75
76static ssize_t id_show(struct gdlm_ls *ls, char *buf)
77{
78 return sprintf(buf, "%u\n", ls->id);
79}
80
81static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
82{
83 return sprintf(buf, "%d\n", ls->jid);
84}
85
86static ssize_t first_show(struct gdlm_ls *ls, char *buf)
87{
88 return sprintf(buf, "%d\n", ls->first);
89}
90
91static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
92{
93 return sprintf(buf, "%d\n", ls->first_done);
94}
95
96static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
97{
98 return sprintf(buf, "%d\n", ls->recover_jid);
99}
100
101static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
102{
103 ls->recover_jid = simple_strtol(buf, NULL, 0);
104 ls->fscb(ls->fsdata, LM_CB_NEED_RECOVERY, &ls->recover_jid);
105 return len;
106}
107
108static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
109{
110 return sprintf(buf, "%d\n", ls->recover_jid_done);
111}
112
113static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
114{
115 return sprintf(buf, "%d\n", ls->recover_jid_status);
116}
117
118struct gdlm_attr {
119 struct attribute attr;
120 ssize_t (*show)(struct gdlm_ls *, char *);
121 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
122};
123
124#define GDLM_ATTR(_name,_mode,_show,_store) \
125static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
126
127GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
128GDLM_ATTR(block, 0644, block_show, block_store);
129GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
130GDLM_ATTR(id, 0444, id_show, NULL);
131GDLM_ATTR(jid, 0444, jid_show, NULL);
132GDLM_ATTR(first, 0444, first_show, NULL);
133GDLM_ATTR(first_done, 0444, first_done_show, NULL);
134GDLM_ATTR(recover, 0644, recover_show, recover_store);
135GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
136GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
137
138static struct attribute *gdlm_attrs[] = {
139 &gdlm_attr_proto_name.attr,
140 &gdlm_attr_block.attr,
141 &gdlm_attr_withdraw.attr,
142 &gdlm_attr_id.attr,
143 &gdlm_attr_jid.attr,
144 &gdlm_attr_first.attr,
145 &gdlm_attr_first_done.attr,
146 &gdlm_attr_recover.attr,
147 &gdlm_attr_recover_done.attr,
148 &gdlm_attr_recover_status.attr,
149 NULL,
150};
151
152static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
153 char *buf)
154{
155 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
156 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
157 return a->show ? a->show(ls, buf) : 0;
158}
159
160static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
161 const char *buf, size_t len)
162{
163 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
164 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
165 return a->store ? a->store(ls, buf, len) : len;
166}
167
168static struct sysfs_ops gdlm_attr_ops = {
169 .show = gdlm_attr_show,
170 .store = gdlm_attr_store,
171};
172
173static struct kobj_type gdlm_ktype = {
174 .default_attrs = gdlm_attrs,
175 .sysfs_ops = &gdlm_attr_ops,
176};
177
178static struct kset gdlm_kset = {
179 .subsys = &kernel_subsys,
180 .kobj = {.name = "lock_dlm",},
181 .ktype = &gdlm_ktype,
182};
183
184int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
185{
186 int error;
187
188 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
189 if (error) {
190 log_error("can't set kobj name %d", error);
191 return error;
192 }
193
194 ls->kobj.kset = &gdlm_kset;
195 ls->kobj.ktype = &gdlm_ktype;
196 ls->kobj.parent = fskobj;
197
198 error = kobject_register(&ls->kobj);
199 if (error)
200 log_error("can't register kobj %d", error);
201
202 return error;
203}
204
205void gdlm_kobject_release(struct gdlm_ls *ls)
206{
207 kobject_unregister(&ls->kobj);
208}
209
210int gdlm_sysfs_init(void)
211{
212 int error;
213
214 error = kset_register(&gdlm_kset);
215 if (error)
216 printk("lock_dlm: cannot register kset %d\n", error);
217
218 return error;
219}
220
221void gdlm_sysfs_exit(void)
222{
223 kset_unregister(&gdlm_kset);
224}
225
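Each mounted filesystem ends up with a lock_module directory under its filesystem kobject, and cluster tooling drives mount and recovery handshakes by reading and writing these attributes (block, withdraw, recover, and so on). As a hedged illustration, a control daemon could ask for recovery of journal 1 as below; the parent sysfs path is an assumption about where fskobj gets registered, so treat it as a placeholder:

    /* Illustrative only: writing a journal id to "recover" lands in
     * recover_store() above, which forwards it to the filesystem via
     * LM_CB_NEED_RECOVERY.  The directory prefix is a placeholder. */
    #include <stdio.h>

    static int request_recovery(const char *fsdir, int jid)
    {
    	char path[256];
    	FILE *f;

    	snprintf(path, sizeof(path), "%s/lock_module/recover", fsdir);
    	f = fopen(path, "w");
    	if (!f)
    		return -1;
    	fprintf(f, "%d\n", jid);	/* parsed by simple_strtol() */
    	return fclose(f);
    }

    int main(void)
    {
    	return request_recovery("/sys/fs/gfs2/mycluster:myfs", 1) ? 1 : 0;
    }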
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..489235b2edba
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %d", bast_mode);
42 }
43
44 ls->fscb(ls->fsdata, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 int16_t prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type,
58 (unsigned long long)lp->lockname.ln_number,
59 lp->flags);
60
61 lp->req = lp->cur;
62 acb.lc_ret |= LM_OUT_CANCELED;
63 if (lp->cur == DLM_LOCK_IV)
64 lp->lksb.sb_lkid = 0;
65 goto out;
66 }
67
68 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
69 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
70 log_info("unlock sb_status %d %x,%llx flags %lx",
71 lp->lksb.sb_status, lp->lockname.ln_type,
72 (unsigned long long)lp->lockname.ln_number,
73 lp->flags);
74 return;
75 }
76
77 lp->cur = DLM_LOCK_IV;
78 lp->req = DLM_LOCK_IV;
79 lp->lksb.sb_lkid = 0;
80
81 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
82 gdlm_delete_lp(lp);
83 return;
84 }
85 goto out;
86 }
87
88 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
89 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
90
91 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
92 if (lp->req == DLM_LOCK_PR)
93 lp->req = DLM_LOCK_CW;
94 else if (lp->req == DLM_LOCK_CW)
95 lp->req = DLM_LOCK_PR;
96 }
97
98 /*
99 * A canceled lock request. The lock was just taken off the delayed
100 * list and was never even submitted to dlm.
101 */
102
103 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
104 log_info("complete internal cancel %x,%llx",
105 lp->lockname.ln_type,
106 (unsigned long long)lp->lockname.ln_number);
107 lp->req = lp->cur;
108 acb.lc_ret |= LM_OUT_CANCELED;
109 goto out;
110 }
111
112 /*
113 * An error occurred.
114 */
115
116 if (lp->lksb.sb_status) {
117 /* a "normal" error */
118 if ((lp->lksb.sb_status == -EAGAIN) &&
119 (lp->lkf & DLM_LKF_NOQUEUE)) {
120 lp->req = lp->cur;
121 if (lp->cur == DLM_LOCK_IV)
122 lp->lksb.sb_lkid = 0;
123 goto out;
124 }
125
126 /* this could only happen with cancels I think */
127 log_info("ast sb_status %d %x,%llx flags %lx",
128 lp->lksb.sb_status, lp->lockname.ln_type,
129 (unsigned long long)lp->lockname.ln_number,
130 lp->flags);
131 return;
132 }
133
134 /*
135 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
136 */
137
138 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
139 complete(&lp->ast_wait);
140 return;
141 }
142
143 /*
144 * A lock has been demoted to NL because it initially completed during
145 * BLOCK_LOCKS. Now it must be requested in the originally requested
146 * mode.
147 */
148
149 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
150 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
151 lp->lockname.ln_type,
152 (unsigned long long)lp->lockname.ln_number);
153 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
154 lp->lockname.ln_type,
155 (unsigned long long)lp->lockname.ln_number);
156
157 lp->cur = DLM_LOCK_NL;
158 lp->req = lp->prev_req;
159 lp->prev_req = DLM_LOCK_IV;
160 lp->lkf &= ~DLM_LKF_CONVDEADLK;
161
162 set_bit(LFL_NOCACHE, &lp->flags);
163
164 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
165 !test_bit(LFL_NOBLOCK, &lp->flags))
166 gdlm_queue_delayed(lp);
167 else
168 queue_submit(lp);
169 return;
170 }
171
172 /*
173 * A request is granted during dlm recovery. It may be granted
174 * because the locks of a failed node were cleared. In that case,
175 * there may be inconsistent data beneath this lock and we must wait
176 * for recovery to complete to use it. When gfs recovery is done this
177 * granted lock will be converted to NL and then reacquired in this
178 * granted state.
179 */
180
181 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
182 !test_bit(LFL_NOBLOCK, &lp->flags) &&
183 lp->req != DLM_LOCK_NL) {
184
185 lp->cur = lp->req;
186 lp->prev_req = lp->req;
187 lp->req = DLM_LOCK_NL;
188 lp->lkf |= DLM_LKF_CONVERT;
189 lp->lkf &= ~DLM_LKF_CONVDEADLK;
190
191 log_debug("rereq %x,%llx id %x %d,%d",
192 lp->lockname.ln_type,
193 (unsigned long long)lp->lockname.ln_number,
194 lp->lksb.sb_lkid, lp->cur, lp->req);
195
196 set_bit(LFL_REREQUEST, &lp->flags);
197 queue_submit(lp);
198 return;
199 }
200
201 /*
202 * DLM demoted the lock to NL before it was granted so GFS must be
203 * told it cannot cache data for this lock.
204 */
205
206 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
207 set_bit(LFL_NOCACHE, &lp->flags);
208
209 out:
210 /*
211 * This is an internal lock_dlm lock
212 */
213
214 if (test_bit(LFL_INLOCK, &lp->flags)) {
215 clear_bit(LFL_NOBLOCK, &lp->flags);
216 lp->cur = lp->req;
217 complete(&lp->ast_wait);
218 return;
219 }
220
221 /*
222 * Normal completion of a lock request. Tell GFS it now has the lock.
223 */
224
225 clear_bit(LFL_NOBLOCK, &lp->flags);
226 lp->cur = lp->req;
227
228 acb.lc_name = lp->lockname;
229 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
230
231 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
232 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
233 acb.lc_ret |= LM_OUT_CACHEABLE;
234
235 ls->fscb(ls->fsdata, LM_CB_ASYNC, &acb);
236}
237
238static inline int no_work(struct gdlm_ls *ls, int blocking)
239{
240 int ret;
241
242 spin_lock(&ls->async_lock);
243 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
244 if (ret && blocking)
245 ret = list_empty(&ls->blocking);
246 spin_unlock(&ls->async_lock);
247
248 return ret;
249}
250
251static inline int check_drop(struct gdlm_ls *ls)
252{
253 if (!ls->drop_locks_count)
254 return 0;
255
256 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
257 ls->drop_time = jiffies;
258 if (ls->all_locks_count >= ls->drop_locks_count)
259 return 1;
260 }
261 return 0;
262}
263
264static int gdlm_thread(void *data)
265{
266 struct gdlm_ls *ls = (struct gdlm_ls *) data;
267 struct gdlm_lock *lp = NULL;
268 int blist = 0;
269 uint8_t complete, blocking, submit, drop;
270 DECLARE_WAITQUEUE(wait, current);
271
272 /* Only thread1 is allowed to do blocking callbacks since gfs
273 may wait for a completion callback within a blocking cb. */
274
275 if (current == ls->thread1)
276 blist = 1;
277
278 while (!kthread_should_stop()) {
279 set_current_state(TASK_INTERRUPTIBLE);
280 add_wait_queue(&ls->thread_wait, &wait);
281 if (no_work(ls, blist))
282 schedule();
283 remove_wait_queue(&ls->thread_wait, &wait);
284 set_current_state(TASK_RUNNING);
285
286 complete = blocking = submit = drop = 0;
287
288 spin_lock(&ls->async_lock);
289
290 if (blist && !list_empty(&ls->blocking)) {
291 lp = list_entry(ls->blocking.next, struct gdlm_lock,
292 blist);
293 list_del_init(&lp->blist);
294 blocking = lp->bast_mode;
295 lp->bast_mode = 0;
296 } else if (!list_empty(&ls->complete)) {
297 lp = list_entry(ls->complete.next, struct gdlm_lock,
298 clist);
299 list_del_init(&lp->clist);
300 complete = 1;
301 } else if (!list_empty(&ls->submit)) {
302 lp = list_entry(ls->submit.next, struct gdlm_lock,
303 delay_list);
304 list_del_init(&lp->delay_list);
305 submit = 1;
306 }
307
308 drop = check_drop(ls);
309 spin_unlock(&ls->async_lock);
310
311 if (complete)
312 process_complete(lp);
313
314 else if (blocking)
315 process_blocking(lp, blocking);
316
317 else if (submit)
318 gdlm_do_lock(lp);
319
320 if (drop)
321 ls->fscb(ls->fsdata, LM_CB_DROPLOCKS, NULL);
322
323 schedule();
324 }
325
326 return 0;
327}
328
329int gdlm_init_threads(struct gdlm_ls *ls)
330{
331 struct task_struct *p;
332 int error;
333
334 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
335 if (IS_ERR(p)) {
336 error = PTR_ERR(p);
337 log_error("can't start lock_dlm1 thread %d", error);
338 return error;
339 }
340 ls->thread1 = p;
341
342 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
343 if (IS_ERR(p)) {
344 error = PTR_ERR(p);
345 log_error("can't start lock_dlm2 thread %d", error);
346 kthread_stop(ls->thread1);
347 return error;
348 }
349 ls->thread2 = p;
350
351 return 0;
352}
353
354void gdlm_release_threads(struct gdlm_ls *ls)
355{
356 kthread_stop(ls->thread1);
357 kthread_stop(ls->thread2);
358}
359
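check_drop() above leans on the kernel's time_after(), which stays correct across a jiffies wraparound because the comparison is done on the signed difference rather than on the raw values. A standalone sketch of the idiom; the macro is redeclared locally and the tick values are invented for the demonstration:

    /* Wraparound-safe deadline check, mirroring the kernel's
     * time_after() as used by check_drop().  Values are contrived so
     * that drop_time + period wraps past zero. */
    #include <stdio.h>

    #define time_after(a, b)  ((long)((b) - (a)) < 0)

    int main(void)
    {
    	unsigned long drop_time = (unsigned long)-16; /* 16 ticks before wrap */
    	unsigned long period = 32;                    /* drop_locks_period * HZ */
    	unsigned long jiffies = 24;                   /* counter already wrapped */

    	/* the deadline drop_time + period is 16 after the wrap; jiffies
    	   is 24, so the period has expired and this prints 1 */
    	printf("expired: %d\n", time_after(jiffies, drop_time + period));
    	return 0;
    }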
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..cdadf956c831
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..748aa5d33641
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,259 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17
18#include "../../lm_interface.h"
19
20struct nolock_lockspace {
21 unsigned int nl_lvb_size;
22};
23
24static struct lm_lockops nolock_ops;
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, lm_fsdata_t *fsdata,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj)
31{
32 char *c;
33 unsigned int jid;
34 struct nolock_lockspace *nl;
35
36 c = strstr(host_data, "jid=");
37 if (!c)
38 jid = 0;
39 else {
40 c += 4;
41 sscanf(c, "%u", &jid);
42 }
43
44 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
45 if (!nl)
46 return -ENOMEM;
47
48 nl->nl_lvb_size = min_lvb_size;
49
50 lockstruct->ls_jid = jid;
51 lockstruct->ls_first = 1;
52 lockstruct->ls_lvb_size = min_lvb_size;
53 lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
54 lockstruct->ls_ops = &nolock_ops;
55 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
56
57 return 0;
58}
59
60static void nolock_others_may_mount(lm_lockspace_t *lockspace)
61{
62}
63
64static void nolock_unmount(lm_lockspace_t *lockspace)
65{
66 struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
67 kfree(nl);
68}
69
70static void nolock_withdraw(lm_lockspace_t *lockspace)
71{
72}
73
74/**
75 * nolock_get_lock - get a lm_lock_t given a description of the lock
76 * @lockspace: the lockspace the lock lives in
77 * @name: the name of the lock
78 * @lockp: return the lm_lock_t here
79 *
80 * Returns: 0 on success, -EXXX on failure
81 */
82
83static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
84 lm_lock_t **lockp)
85{
86 *lockp = (lm_lock_t *)lockspace;
87 return 0;
88}
89
90/**
91 * nolock_put_lock - get rid of a lock structure
92 * @lock: the lock to throw away
93 *
94 */
95
96static void nolock_put_lock(lm_lock_t *lock)
97{
98}
99
100/**
101 * nolock_lock - acquire a lock
102 * @lock: the lock to manipulate
103 * @cur_state: the current state
104 * @req_state: the requested state
105 * @flags: modifier flags
106 *
107 * Returns: A bitmap of LM_OUT_*
108 */
109
110static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state,
111 unsigned int req_state, unsigned int flags)
112{
113 return req_state | LM_OUT_CACHEABLE;
114}
115
116/**
117 * nolock_unlock - unlock a lock
118 * @lock: the lock to manipulate
119 * @cur_state: the current state
120 *
121 * Returns: 0
122 */
123
124static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
125{
126 return 0;
127}
128
129static void nolock_cancel(lm_lock_t *lock)
130{
131}
132
133/**
134 * nolock_hold_lvb - hold on to a lock value block
135 * @lock: the lock the LVB is associated with
136 * @lvbp: return the lm_lvb_t here
137 *
138 * Returns: 0 on success, -EXXX on failure
139 */
140
141static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
142{
143 struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
144 int error = 0;
145
146 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
147 if (!*lvbp)
148 error = -ENOMEM;
149
150 return error;
151}
152
153/**
154 * nolock_unhold_lvb - release a LVB
155 * @lock: the lock the LVB is associated with
156 * @lvb: the lock value block
157 *
158 */
159
160static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
161{
162 kfree(lvb);
163}
164
165/**
166 * nolock_sync_lvb - sync out the value of a lvb
167 * @lock: the lock the LVB is associated with
168 * @lvb: the lock value block
169 *
170 */
171
172static void nolock_sync_lvb(lm_lock_t *lock, char *lvb)
173{
174}
175
176static int nolock_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
177 struct file *file, struct file_lock *fl)
178{
179 struct file_lock tmp;
180 int ret;
181
182 ret = posix_test_lock(file, fl, &tmp);
183 fl->fl_type = F_UNLCK;
184 if (ret)
185 memcpy(fl, &tmp, sizeof(struct file_lock));
186
187 return 0;
188}
189
190static int nolock_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error;
194 error = posix_lock_file_wait(file, fl);
195 return error;
196}
197
198static int nolock_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 error = posix_lock_file_wait(file, fl);
203 return error;
204}
205
206static void nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
207 unsigned int message)
208{
209}
210
211static struct lm_lockops nolock_ops = {
212 .lm_proto_name = "lock_nolock",
213 .lm_mount = nolock_mount,
214 .lm_others_may_mount = nolock_others_may_mount,
215 .lm_unmount = nolock_unmount,
216 .lm_withdraw = nolock_withdraw,
217 .lm_get_lock = nolock_get_lock,
218 .lm_put_lock = nolock_put_lock,
219 .lm_lock = nolock_lock,
220 .lm_unlock = nolock_unlock,
221 .lm_cancel = nolock_cancel,
222 .lm_hold_lvb = nolock_hold_lvb,
223 .lm_unhold_lvb = nolock_unhold_lvb,
224 .lm_sync_lvb = nolock_sync_lvb,
225 .lm_plock_get = nolock_plock_get,
226 .lm_plock = nolock_plock,
227 .lm_punlock = nolock_punlock,
228 .lm_recovery_done = nolock_recovery_done,
229 .lm_owner = THIS_MODULE,
230};
231
232static int __init init_nolock(void)
233{
234 int error;
235
236 error = gfs2_register_lockproto(&nolock_ops);
237 if (error) {
238 printk(KERN_WARNING
239 "lock_nolock: can't register protocol: %d\n", error);
240 return error;
241 }
242
243 printk(KERN_INFO
244 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
245 return 0;
246}
247
248static void __exit exit_nolock(void)
249{
250 gfs2_unregister_lockproto(&nolock_ops);
251}
252
253module_init(init_nolock);
254module_exit(exit_nolock);
255
256MODULE_DESCRIPTION("GFS Nolock Locking Module");
257MODULE_AUTHOR("Red Hat, Inc.");
258MODULE_LICENSE("GPL");
259
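With no other nodes to coordinate with, nolock_mount() only needs to pull an optional journal id out of the hostdata mount string before granting everything unconditionally. The "jid=" parsing step in isolation, as a runnable sketch with a made-up hostdata value:

    /* Standalone rerun of the "jid=" parsing done in nolock_mount();
     * the sample host_data string is invented. */
    #include <stdio.h>
    #include <string.h>

    static unsigned int parse_jid(const char *host_data)
    {
    	unsigned int jid = 0;
    	const char *c = strstr(host_data, "jid=");

    	if (c)
    		sscanf(c + 4, "%u", &jid);	/* same as c += 4; sscanf(c, ...) */
    	return jid;
    }

    int main(void)
    {
    	printf("jid=%u\n", parse_jid("lockproto=lock_nolock:jid=3"));	/* 3 */
    	return 0;
    }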
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..af728cb3b327
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,603 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "util.h"
27#include "dir.h"
28
29#define PULL 1
30
31/**
32 * gfs2_struct2blk - compute the number of log descriptor blocks needed
33 * @sdp: the filesystem
34 * @nstruct: the number of structures
35 * @ssize: the size of the structures
36 *
37 * Compute the number of log descriptor blocks needed to hold a certain number
38 * of structures of a certain size.
39 *
40 * Returns: the number of blocks needed (minimum is always 1)
41 */
42
43unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
44 unsigned int ssize)
45{
46 unsigned int blks;
47 unsigned int first, second;
48
49 blks = 1;
50 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) /
51 ssize;
52
53 if (nstruct > first) {
54 second = (sdp->sd_sb.sb_bsize -
55 sizeof(struct gfs2_meta_header)) / ssize;
56 blks += DIV_ROUND_UP(nstruct - first, second);
57 }
58
59 return blks;
60}
61
62void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
63{
64 struct list_head *head = &sdp->sd_ail1_list;
65 uint64_t sync_gen;
66 struct list_head *first, *tmp;
67 struct gfs2_ail *first_ai, *ai;
68
69 gfs2_log_lock(sdp);
70 if (list_empty(head)) {
71 gfs2_log_unlock(sdp);
72 return;
73 }
74 sync_gen = sdp->sd_ail_sync_gen++;
75
76 first = head->prev;
77 first_ai = list_entry(first, struct gfs2_ail, ai_list);
78 first_ai->ai_sync_gen = sync_gen;
79 gfs2_ail1_start_one(sdp, first_ai);
80
81 if (flags & DIO_ALL)
82 first = NULL;
83
84 for (;;) {
85 if (first && (head->prev != first ||
86 gfs2_ail1_empty_one(sdp, first_ai, 0)))
87 break;
88
89 for (tmp = head->prev; tmp != head; tmp = tmp->prev) {
90 ai = list_entry(tmp, struct gfs2_ail, ai_list);
91 if (ai->ai_sync_gen >= sync_gen)
92 continue;
93 ai->ai_sync_gen = sync_gen;
94 gfs2_ail1_start_one(sdp, ai);
95 break;
96 }
97
98 if (tmp == head)
99 break;
100 }
101
102 gfs2_log_unlock(sdp);
103}
104
105int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
106{
107 struct gfs2_ail *ai, *s;
108 int ret;
109
110 gfs2_log_lock(sdp);
111
112 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
113 if (gfs2_ail1_empty_one(sdp, ai, flags))
114 list_move(&ai->ai_list, &sdp->sd_ail2_list);
115 else if (!(flags & DIO_ALL))
116 break;
117 }
118
119 ret = list_empty(&sdp->sd_ail1_list);
120
121 gfs2_log_unlock(sdp);
122
123 return ret;
124}
125
126static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
127{
128 struct gfs2_ail *ai, *safe;
129 unsigned int old_tail = sdp->sd_log_tail;
130 int wrap = (new_tail < old_tail);
131 int a, b, rm;
132
133 gfs2_log_lock(sdp);
134
135 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
136 a = (old_tail <= ai->ai_first);
137 b = (ai->ai_first < new_tail);
138 rm = (wrap) ? (a || b) : (a && b);
139 if (!rm)
140 continue;
141
142 gfs2_ail2_empty_one(sdp, ai);
143 list_del(&ai->ai_list);
144 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
145 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
146 kfree(ai);
147 }
148
149 gfs2_log_unlock(sdp);
150}
151
152/**
153 * gfs2_log_reserve - Make a log reservation
154 * @sdp: The GFS2 superblock
155 * @blks: The number of blocks to reserve
156 *
157 * Returns: errno
158 */
159
160int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
161{
162 unsigned int try = 0;
163
164 if (gfs2_assert_warn(sdp, blks) ||
165 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
166 return -EINVAL;
167
168 mutex_lock(&sdp->sd_log_reserve_mutex);
169 gfs2_log_lock(sdp);
170 while(sdp->sd_log_blks_free <= blks) {
171 gfs2_log_unlock(sdp);
172 gfs2_ail1_empty(sdp, 0);
173 gfs2_log_flush(sdp, NULL);
174
175 if (try++)
176 gfs2_ail1_start(sdp, 0);
177 gfs2_log_lock(sdp);
178 }
179 sdp->sd_log_blks_free -= blks;
180 /* printk(KERN_INFO "reserved %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
181 gfs2_log_unlock(sdp);
182 mutex_unlock(&sdp->sd_log_reserve_mutex);
183
184 down_read(&sdp->sd_log_flush_lock);
185
186 return 0;
187}
188
189/**
190 * gfs2_log_release - Release a given number of log blocks
191 * @sdp: The GFS2 superblock
192 * @blks: The number of blocks
193 *
194 */
195
196void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
197{
198
199 gfs2_log_lock(sdp);
200 sdp->sd_log_blks_free += blks;
201 /* printk(KERN_INFO "released %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
202 gfs2_assert_withdraw(sdp,
203 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
204 gfs2_log_unlock(sdp);
205 up_read(&sdp->sd_log_flush_lock);
206}
207
208static uint64_t log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
209{
210 int new = 0;
211 uint64_t dbn;
212 int error;
213 int bdy;
214
215 error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, &new, &dbn, &bdy);
216 if (error || !dbn) {
217 printk(KERN_INFO "error=%d, dbn=%llu lbn=%u\n", error, (unsigned long long)dbn, lbn);
218 }
219 gfs2_assert_withdraw(sdp, !error && dbn);
220
221 return dbn;
222}
223
224/**
225 * log_distance - Compute distance between two journal blocks
226 * @sdp: The GFS2 superblock
227 * @newer: The most recent journal block of the pair
228 * @older: The older journal block of the pair
229 *
230 * Compute the distance (in the journal direction) between two
231 * blocks in the journal
232 *
233 * Returns: the distance in blocks
234 */
235
236static inline unsigned int log_distance(struct gfs2_sbd *sdp,
237 unsigned int newer,
238 unsigned int older)
239{
240 int dist;
241
242 dist = newer - older;
243 if (dist < 0)
244 dist += sdp->sd_jdesc->jd_blocks;
245
246 return dist;
247}
248
249static unsigned int current_tail(struct gfs2_sbd *sdp)
250{
251 struct gfs2_ail *ai;
252 unsigned int tail;
253
254 gfs2_log_lock(sdp);
255
256 if (list_empty(&sdp->sd_ail1_list))
257 tail = sdp->sd_log_head;
258 else {
259 ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail,
260 ai_list);
261 tail = ai->ai_first;
262 }
263
264 gfs2_log_unlock(sdp);
265
266 return tail;
267}
268
269static inline void log_incr_head(struct gfs2_sbd *sdp)
270{
271 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
272 gfs2_assert_withdraw(sdp,
273 sdp->sd_log_flush_head == sdp->sd_log_head);
274
275 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
276 sdp->sd_log_flush_head = 0;
277 sdp->sd_log_flush_wrapped = 1;
278 }
279}
280
281/**
282 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
283 * @sdp: The GFS2 superblock
284 *
285 * Returns: the buffer_head
286 */
287
288struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
289{
290 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
291 struct gfs2_log_buf *lb;
292 struct buffer_head *bh;
293
294 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
295 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
296
297 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
298 lock_buffer(bh);
299 memset(bh->b_data, 0, bh->b_size);
300 set_buffer_uptodate(bh);
301 clear_buffer_dirty(bh);
302 unlock_buffer(bh);
303
304 log_incr_head(sdp);
305
306 return bh;
307}
308
309/**
310 * gfs2_log_fake_buf - Build a fake buffer_head to write a metadata buffer to the log
311 * @sdp: the filesystem
312 * @real: the buffer_head holding the metadata to be written
313 *
314 * Returns: the fake buffer_head that aliases @real's data
315 */
316
317struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
318 struct buffer_head *real)
319{
320 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
321 struct gfs2_log_buf *lb;
322 struct buffer_head *bh;
323
324 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
325 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
326 lb->lb_real = real;
327
328 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
329 atomic_set(&bh->b_count, 1);
330 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
331 set_bh_page(bh, real->b_page, bh_offset(real));
332 bh->b_blocknr = blkno;
333 bh->b_size = sdp->sd_sb.sb_bsize;
334 bh->b_bdev = sdp->sd_vfs->s_bdev;
335
336 log_incr_head(sdp);
337
338 return bh;
339}
340
341static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
342{
343 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
344
345 ail2_empty(sdp, new_tail);
346
347 gfs2_log_lock(sdp);
348 sdp->sd_log_blks_free += dist - ((pull) ? 1 : 0);
349 /* printk(KERN_INFO "pull tail refunding %u blocks (%u left) pull=%d\n", dist - ((pull) ? 1 : 0), sdp->sd_log_blks_free, pull); */
350 gfs2_assert_withdraw(sdp,
351 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
352 gfs2_log_unlock(sdp);
353
354 sdp->sd_log_tail = new_tail;
355}
356
357/**
358 * log_write_header - Write a journal header into the journal
359 * @sdp: The GFS2 superblock
360 * @flags: log header flags (e.g. GFS2_LOG_HEAD_UNMOUNT)
361 * @pull: nonzero if this header also pulls the log tail
362 */
363
364static void log_write_header(struct gfs2_sbd *sdp, uint32_t flags, int pull)
365{
366 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
367 struct buffer_head *bh;
368 struct gfs2_log_header *lh;
369 unsigned int tail;
370 uint32_t hash;
371
372 /* printk(KERN_INFO "log write header start (flags=%08x, pull=%d)\n", flags, pull); */
373
374 bh = sb_getblk(sdp->sd_vfs, blkno);
375 lock_buffer(bh);
376 memset(bh->b_data, 0, bh->b_size);
377 set_buffer_uptodate(bh);
378 clear_buffer_dirty(bh);
379 unlock_buffer(bh);
380
381 gfs2_ail1_empty(sdp, 0);
382 tail = current_tail(sdp);
383
384 lh = (struct gfs2_log_header *)bh->b_data;
385 memset(lh, 0, sizeof(struct gfs2_log_header));
386 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
387 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
388 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
389 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
390 lh->lh_flags = cpu_to_be32(flags);
391 lh->lh_tail = cpu_to_be32(tail);
392 lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
393 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
394 lh->lh_hash = cpu_to_be32(hash);
395
396 set_buffer_dirty(bh);
397 if (sync_dirty_buffer(bh))
398 gfs2_io_error_bh(sdp, bh);
399 brelse(bh);
400
401 if (sdp->sd_log_tail != tail)
402 log_pull_tail(sdp, tail, pull);
403 else
404 gfs2_assert_withdraw(sdp, !pull);
405
406 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
407 log_incr_head(sdp);
408
409 /* printk(KERN_INFO "log write header out\n"); */
410}
411
412static void log_flush_commit(struct gfs2_sbd *sdp)
413{
414 struct list_head *head = &sdp->sd_log_flush_list;
415 struct gfs2_log_buf *lb;
416 struct buffer_head *bh;
417#if 0
418 unsigned int d;
419
420 d = log_distance(sdp, sdp->sd_log_flush_head, sdp->sd_log_head);
421
422 gfs2_assert_withdraw(sdp, d + 1 == sdp->sd_log_blks_reserved);
423#endif
424
425 while (!list_empty(head)) {
426 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
427 list_del(&lb->lb_list);
428 bh = lb->lb_bh;
429
430 wait_on_buffer(bh);
431 if (!buffer_uptodate(bh))
432 gfs2_io_error_bh(sdp, bh);
433 if (lb->lb_real) {
434 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
435 schedule();
436 free_buffer_head(bh);
437 } else
438 brelse(bh);
439 kfree(lb);
440 }
441
442 log_write_header(sdp, 0, 0);
443}
444
445/**
446 * gfs2_log_flush - flush incore transaction(s)
447 * @sdp: the filesystem
448 * @gl: The glock structure to flush. If NULL, flush the whole incore log
449 *
450 */
451
452void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
453{
454 struct gfs2_ail *ai;
455
456 down_write(&sdp->sd_log_flush_lock);
457
458 if (gl) {
459 gfs2_log_lock(sdp);
460 if (list_empty(&gl->gl_le.le_list)) {
461 gfs2_log_unlock(sdp);
462 up_write(&sdp->sd_log_flush_lock);
463 return;
464 }
465 gfs2_log_unlock(sdp);
466 }
467
468 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
469 INIT_LIST_HEAD(&ai->ai_ail1_list);
470 INIT_LIST_HEAD(&ai->ai_ail2_list);
471
472 gfs2_assert_withdraw(sdp,
473 sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
474 gfs2_assert_withdraw(sdp,
475 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
476
477 sdp->sd_log_flush_head = sdp->sd_log_head;
478 sdp->sd_log_flush_wrapped = 0;
479 ai->ai_first = sdp->sd_log_flush_head;
480
481 lops_before_commit(sdp);
482 if (!list_empty(&sdp->sd_log_flush_list))
483 log_flush_commit(sdp);
484 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
485 log_write_header(sdp, 0, PULL);
486 lops_after_commit(sdp, ai);
487 sdp->sd_log_head = sdp->sd_log_flush_head;
488
489 /* printk(KERN_INFO "sd_log_num_hdrs %u\n", sdp->sd_log_num_hdrs); */
490 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
491
492 sdp->sd_log_blks_reserved =
493 sdp->sd_log_commited_buf =
494 sdp->sd_log_num_hdrs =
495 sdp->sd_log_commited_revoke = 0;
496
497 gfs2_log_lock(sdp);
498 if (!list_empty(&ai->ai_ail1_list)) {
499 list_add(&ai->ai_list, &sdp->sd_ail1_list);
500 ai = NULL;
501 }
502 gfs2_log_unlock(sdp);
503
504 sdp->sd_vfs->s_dirt = 0;
505 up_write(&sdp->sd_log_flush_lock);
506
507 kfree(ai);
508}
509
510static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
511{
512 unsigned int reserved = 0;
513 unsigned int old;
514
515 gfs2_log_lock(sdp);
516
517 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
518 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
519 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
520 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
521
522 if (sdp->sd_log_commited_buf)
523 reserved += sdp->sd_log_commited_buf;
524 if (sdp->sd_log_commited_revoke)
525 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
526 sizeof(uint64_t));
527 if (reserved)
528 reserved++;
529
530 old = sdp->sd_log_blks_free;
531 sdp->sd_log_blks_free += tr->tr_reserved -
532 (reserved - sdp->sd_log_blks_reserved);
533
534 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
535 gfs2_assert_withdraw(sdp,
536 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
537 sdp->sd_log_num_hdrs);
538
539 sdp->sd_log_blks_reserved = reserved;
540
541 gfs2_log_unlock(sdp);
542}
543
544/**
545 * gfs2_log_commit - Commit a transaction to the log
546 * @sdp: the filesystem
547 * @tr: the transaction
548 *
549 * Merges the transaction into the incore log, flushing it if it grows too large.
550 */
551
552void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
553{
554 log_refund(sdp, tr);
555 lops_incore_commit(sdp, tr);
556
557 sdp->sd_vfs->s_dirt = 1;
558 up_read(&sdp->sd_log_flush_lock);
559
560 gfs2_log_lock(sdp);
561 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
562 gfs2_log_unlock(sdp);
563 gfs2_log_flush(sdp, NULL);
564 } else
565 gfs2_log_unlock(sdp);
566}
567
568/**
569 * gfs2_log_shutdown - write a shutdown header into a journal
570 * @sdp: the filesystem
571 *
572 */
573
574void gfs2_log_shutdown(struct gfs2_sbd *sdp)
575{
576 down_write(&sdp->sd_log_flush_lock);
577
578 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
579 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
580 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
581 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
582 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
583 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
584 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
585 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
586 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
587
588 sdp->sd_log_flush_head = sdp->sd_log_head;
589 sdp->sd_log_flush_wrapped = 0;
590
591 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
592
593 /* printk(KERN_INFO "sd_log_blks_free %u, sd_jdesc->jd_blocks %u\n", sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); */
594 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
595 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
596 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
597
598 sdp->sd_log_head = sdp->sd_log_flush_head;
599 sdp->sd_log_tail = sdp->sd_log_head;
600
601 up_write(&sdp->sd_log_flush_lock);
602}
603
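gfs2_struct2blk() packs the first run of entries behind a full gfs2_log_descriptor and each later run behind a bare gfs2_meta_header, so the answer is one block plus a DIV_ROUND_UP over whatever did not fit in the first. A userspace rerun of the arithmetic; the two header sizes are assumed stand-ins, since the real values come from the structures in gfs2_ondisk.h:

    /* Recomputes gfs2_struct2blk() with assumed header sizes; 64 and 24
     * stand in for sizeof(struct gfs2_log_descriptor) and
     * sizeof(struct gfs2_meta_header) purely for illustration. */
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    static unsigned int struct2blk(unsigned int bsize, unsigned int nstruct,
                                   unsigned int ssize)
    {
    	const unsigned int desc_hdr = 64;	/* assumed descriptor size */
    	const unsigned int meta_hdr = 24;	/* assumed meta header size */
    	unsigned int blks = 1;
    	unsigned int first = (bsize - desc_hdr) / ssize;

    	if (nstruct > first) {
    		unsigned int second = (bsize - meta_hdr) / ssize;
    		blks += DIV_ROUND_UP(nstruct - first, second);
    	}
    	return blks;
    }

    int main(void)
    {
    	/* revoke entries are one uint64_t (8 bytes) each, as in
    	   revoke_lo_before_commit(): 1000 revokes in 4k blocks */
    	printf("%u blocks\n", struct2blk(4096, 1000, 8));	/* prints 2 */
    	return 0;
    }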
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..8cfd0f1d29f8
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,61 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13/**
14 * gfs2_log_lock - acquire the right to mess with the log manager
15 * @sdp: the filesystem
16 *
17 */
18
19static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
20{
21 spin_lock(&sdp->sd_log_lock);
22}
23
24/**
25 * gfs2_log_unlock - release the right to mess with the log manager
26 * @sdp: the filesystem
27 *
28 */
29
30static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
31{
32 spin_unlock(&sdp->sd_log_lock);
33}
34
35static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
36 unsigned int value)
37{
38 if (++value == sdp->sd_jdesc->jd_blocks) {
39 value = 0;
40 }
41 sdp->sd_log_head = sdp->sd_log_tail = value;
42}
43
44unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
45 unsigned int ssize);
46
47void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
48int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
49
50int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
51void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
52
53struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
54struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
55 struct buffer_head *real);
56void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
57void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
58
59void gfs2_log_shutdown(struct gfs2_sbd *sdp);
60
61#endif /* __LOG_DOT_H__ */
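The journal is treated throughout as a circular buffer of jd_blocks blocks: gfs2_log_pointers_init() above wraps an incremented index back to zero, and log.c's log_distance() measures separation modulo the journal size in the same spirit. A tiny illustration of that distance calculation with an invented journal size:

    /* Mirrors log_distance() from log.c: distance between two journal
     * block indices, taken modulo the journal size (1024 is invented). */
    #include <stdio.h>

    static unsigned int log_distance(unsigned int jd_blocks,
                                     unsigned int newer, unsigned int older)
    {
    	int dist = newer - older;

    	if (dist < 0)
    		dist += jd_blocks;
    	return dist;
    }

    int main(void)
    {
    	/* head wrapped past the end of a 1024-block journal */
    	printf("%u\n", log_distance(1024, 10, 1000));	/* prints 34 */
    	return 0;
    }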
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..08de8b7fb316
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,800 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "glock.h"
21#include "log.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "rgrp.h"
26#include "trans.h"
27#include "util.h"
28
29static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
30{
31 struct gfs2_glock *gl;
32 struct gfs2_trans *tr = current->journal_info;
33
34 tr->tr_touched = 1;
35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44
45 gfs2_log_lock(sdp);
46 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp);
49}
50
51static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 struct list_head *head = &sdp->sd_log_le_gl;
54 struct gfs2_glock *gl;
55
56 while (!list_empty(head)) {
57 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
58 list_del_init(&gl->gl_le.le_list);
59 sdp->sd_log_num_gl--;
60
61 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
62 gfs2_glock_put(gl);
63 }
64 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
65}
66
67static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
68{
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr;
71
72 if (!list_empty(&bd->bd_list_tr))
73 return;
74
75 tr = current->journal_info;
76 tr->tr_touched = 1;
77 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
79
80 if (!list_empty(&le->le_list))
81 return;
82
83 gfs2_trans_add_gl(bd->bd_gl);
84
85 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf);
91 gfs2_log_unlock(sdp);
92
93 tr->tr_num_buf_new++;
94}
95
96static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
97{
98 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd;
100
101 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--;
105 }
106 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107}
108
109static void buf_lo_before_commit(struct gfs2_sbd *sdp)
110{
111 struct buffer_head *bh;
112 struct gfs2_log_descriptor *ld;
113 struct gfs2_bufdata *bd1 = NULL, *bd2;
114 unsigned int total = sdp->sd_log_num_buf;
115 unsigned int offset = sizeof(struct gfs2_log_descriptor);
116 unsigned int limit;
117 unsigned int num;
118 unsigned n;
119 __be64 *ptr;
120
121 offset += (sizeof(__be64) - 1);
122 offset &= ~(sizeof(__be64) - 1);
123 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
124 /* for 4k blocks, limit = 503 */
125
126 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
127 while(total) {
128 num = total;
129 if (total > limit)
130 num = limit;
131 bh = gfs2_log_get_buf(sdp);
132 sdp->sd_log_num_hdrs++;
133 ld = (struct gfs2_log_descriptor *)bh->b_data;
134 ptr = (__be64 *)(bh->b_data + offset);
135 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
136 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
137 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
138 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
139 ld->ld_length = cpu_to_be32(num + 1);
140 ld->ld_data1 = cpu_to_be32(num);
141 ld->ld_data2 = cpu_to_be32(0);
142 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
143
144 n = 0;
145 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
146 bd_le.le_list) {
147 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
148 if (++n >= num)
149 break;
150 }
151
152 set_buffer_dirty(bh);
153 ll_rw_block(WRITE, 1, &bh);
154
155 n = 0;
156 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
157 bd_le.le_list) {
158 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
159 set_buffer_dirty(bh);
160 ll_rw_block(WRITE, 1, &bh);
161 if (++n >= num)
162 break;
163 }
164
165 total -= num;
166 }
167}
168
169static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
170{
171 struct list_head *head = &sdp->sd_log_le_buf;
172 struct gfs2_bufdata *bd;
173
174 while (!list_empty(head)) {
175 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
176 list_del_init(&bd->bd_le.le_list);
177 sdp->sd_log_num_buf--;
178
179 gfs2_unpin(sdp, bd->bd_bh, ai);
180 }
181 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
182}
183
184static void buf_lo_before_scan(struct gfs2_jdesc *jd,
185 struct gfs2_log_header *head, int pass)
186{
187 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
188
189 if (pass != 0)
190 return;
191
192 sdp->sd_found_blocks = 0;
193 sdp->sd_replayed_blocks = 0;
194}
195
196static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
197 struct gfs2_log_descriptor *ld, __be64 *ptr,
198 int pass)
199{
200 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
201 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
202 struct gfs2_glock *gl = ip->i_gl;
203 unsigned int blks = be32_to_cpu(ld->ld_data1);
204 struct buffer_head *bh_log, *bh_ip;
205 uint64_t blkno;
206 int error = 0;
207
208 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
209 return 0;
210
211 gfs2_replay_incr_blk(sdp, &start);
212
213 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
214 blkno = be64_to_cpu(*ptr++);
215
216 sdp->sd_found_blocks++;
217
218 if (gfs2_revoke_check(sdp, blkno, start))
219 continue;
220
221 error = gfs2_replay_read_block(jd, start, &bh_log);
222 if (error)
223 return error;
224
225 bh_ip = gfs2_meta_new(gl, blkno);
226 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
227
228 if (gfs2_meta_check(sdp, bh_ip))
229 error = -EIO;
230 else
231 mark_buffer_dirty(bh_ip);
232
233 brelse(bh_log);
234 brelse(bh_ip);
235
236 if (error)
237 break;
238
239 sdp->sd_replayed_blocks++;
240 }
241
242 return error;
243}
244
245static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
246{
247 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
248 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
249
250 if (error) {
251 gfs2_meta_sync(ip->i_gl,
252 DIO_START | DIO_WAIT);
253 return;
254 }
255 if (pass != 1)
256 return;
257
258 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
259
260 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
261 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
262}
263
264static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
265{
266 struct gfs2_trans *tr;
267
268 tr = current->journal_info;
269 tr->tr_touched = 1;
270 tr->tr_num_revoke++;
271
272 gfs2_log_lock(sdp);
273 sdp->sd_log_num_revoke++;
274 list_add(&le->le_list, &sdp->sd_log_le_revoke);
275 gfs2_log_unlock(sdp);
276}
277
278static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
279{
280 struct gfs2_log_descriptor *ld;
281 struct gfs2_meta_header *mh;
282 struct buffer_head *bh;
283 unsigned int offset;
284 struct list_head *head = &sdp->sd_log_le_revoke;
285 struct gfs2_revoke *rv;
286
287 if (!sdp->sd_log_num_revoke)
288 return;
289
290 bh = gfs2_log_get_buf(sdp);
291 ld = (struct gfs2_log_descriptor *)bh->b_data;
292 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
293 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
294 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
295 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
296 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
297 sizeof(uint64_t)));
298 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
299 ld->ld_data2 = cpu_to_be32(0);
300 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
301 offset = sizeof(struct gfs2_log_descriptor);
302
303 while (!list_empty(head)) {
304 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
305 list_del_init(&rv->rv_le.le_list);
306 sdp->sd_log_num_revoke--;
307
308 if (offset + sizeof(uint64_t) > sdp->sd_sb.sb_bsize) {
309 set_buffer_dirty(bh);
310 ll_rw_block(WRITE, 1, &bh);
311
312 bh = gfs2_log_get_buf(sdp);
313 mh = (struct gfs2_meta_header *)bh->b_data;
314 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
315 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
316 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
317 offset = sizeof(struct gfs2_meta_header);
318 }
319
320 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
321 kfree(rv);
322
323 offset += sizeof(uint64_t);
324 }
325 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
326
327 set_buffer_dirty(bh);
328 ll_rw_block(WRITE, 1, &bh);
329}
330
331static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
332 struct gfs2_log_header *head, int pass)
333{
334 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
335
336 if (pass != 0)
337 return;
338
339 sdp->sd_found_revokes = 0;
340 sdp->sd_replay_tail = head->lh_tail;
341}
342
343static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
344 struct gfs2_log_descriptor *ld, __be64 *ptr,
345 int pass)
346{
347 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
348 unsigned int blks = be32_to_cpu(ld->ld_length);
349 unsigned int revokes = be32_to_cpu(ld->ld_data1);
350 struct buffer_head *bh;
351 unsigned int offset;
352 uint64_t blkno;
353 int first = 1;
354 int error;
355
356 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
357 return 0;
358
359 offset = sizeof(struct gfs2_log_descriptor);
360
361 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
362 error = gfs2_replay_read_block(jd, start, &bh);
363 if (error)
364 return error;
365
366 if (!first)
367 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
368
369 while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) {
370 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
371
372 error = gfs2_revoke_add(sdp, blkno, start);
373 if (error < 0)
374 return error;
375 else if (error)
376 sdp->sd_found_revokes++;
377
378 if (!--revokes)
379 break;
380 offset += sizeof(uint64_t);
381 }
382
383 brelse(bh);
384 offset = sizeof(struct gfs2_meta_header);
385 first = 0;
386 }
387
388 return 0;
389}
390
391static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
392{
393 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
394
395 if (error) {
396 gfs2_revoke_clean(sdp);
397 return;
398 }
399 if (pass != 1)
400 return;
401
402 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
403 jd->jd_jid, sdp->sd_found_revokes);
404
405 gfs2_revoke_clean(sdp);
406}
407
408static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
409{
410 struct gfs2_rgrpd *rgd;
411 struct gfs2_trans *tr = current->journal_info;
412
413 tr->tr_touched = 1;
414
415 if (!list_empty(&le->le_list))
416 return;
417
418 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
419 gfs2_rgrp_bh_hold(rgd);
420
421 gfs2_log_lock(sdp);
422 sdp->sd_log_num_rg++;
423 list_add(&le->le_list, &sdp->sd_log_le_rg);
424 gfs2_log_unlock(sdp);
425}
426
427static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
428{
429 struct list_head *head = &sdp->sd_log_le_rg;
430 struct gfs2_rgrpd *rgd;
431
432 while (!list_empty(head)) {
433 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
434 list_del_init(&rgd->rd_le.le_list);
435 sdp->sd_log_num_rg--;
436
437 gfs2_rgrp_repolish_clones(rgd);
438 gfs2_rgrp_bh_put(rgd);
439 }
440 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
441}
442
443/**
444 * databuf_lo_add - Add a databuf to the transaction.
445 *
446 * This is used in two distinct cases:
447 * i) In ordered write mode
448 * We put the data buffer on a list so that we can ensure that it is
449 * synced to disk at the right time
450 * ii) In journaled data mode
451 * We need to journal the data block in the same way as metadata in
452 * the functions above. The difference is that here we have a tag
453 * which is two __be64's being the block number (as per meta data)
454 * and a flag which says whether the data block needs escaping or
455 * not. This means we need a new log entry for each 251 or so data
456 * blocks, which isn't an enormous overhead but twice as much as
457 * for normal metadata blocks.
458 */
459static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
460{
461 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
462 struct gfs2_trans *tr = current->journal_info;
463 struct address_space *mapping = bd->bd_bh->b_page->mapping;
464 struct gfs2_inode *ip = GFS2_I(mapping->host);
465
466 tr->tr_touched = 1;
467 if (list_empty(&bd->bd_list_tr) &&
468 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
469 tr->tr_num_buf++;
470 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
471 gfs2_pin(sdp, bd->bd_bh);
472 tr->tr_num_buf_new++;
473 }
474 gfs2_trans_add_gl(bd->bd_gl);
475 gfs2_log_lock(sdp);
476 if (list_empty(&le->le_list)) {
477 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
478 sdp->sd_log_num_jdata++;
479 sdp->sd_log_num_databuf++;
480 list_add(&le->le_list, &sdp->sd_log_le_databuf);
481 }
482 gfs2_log_unlock(sdp);
483}
484
485static int gfs2_check_magic(struct buffer_head *bh)
486{
487 struct page *page = bh->b_page;
488 void *kaddr;
489 __be32 *ptr;
490 int rv = 0;
491
492 kaddr = kmap_atomic(page, KM_USER0);
493 ptr = kaddr + bh_offset(bh);
494 if (*ptr == cpu_to_be32(GFS2_MAGIC))
495 rv = 1;
496 kunmap_atomic(page, KM_USER0);
497
498 return rv;
499}
500
501/**
502 * databuf_lo_before_commit - Scan the data buffers, writing as we go
503 *
504 * Here we scan through the lists of buffers and make the assumption
505 * that any buffer that has been pinned is being journaled, and that
506 * any unpinned buffer is an ordered write data buffer and therefore
507 * will be written back rather than journaled.
508 */
509static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
510{
511 LIST_HEAD(started);
512 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
513 struct buffer_head *bh = NULL;
514 unsigned int offset = sizeof(struct gfs2_log_descriptor);
515 struct gfs2_log_descriptor *ld;
516 unsigned int limit;
517 unsigned int total_dbuf = sdp->sd_log_num_databuf;
518 unsigned int total_jdata = sdp->sd_log_num_jdata;
519 unsigned int num, n;
520 __be64 *ptr = NULL;
521
522 offset += (2*sizeof(__be64) - 1);
523 offset &= ~(2*sizeof(__be64) - 1);
524 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
525
526 /*
527 * Start writing ordered buffers, write journaled buffers
528 * into the log along with a header
529 */
530 gfs2_log_lock(sdp);
531 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
532 bd_le.le_list);
533 while(total_dbuf) {
534 num = total_jdata;
535 if (num > limit)
536 num = limit;
537 n = 0;
538 list_for_each_entry_safe_continue(bd1, bdt,
539 &sdp->sd_log_le_databuf,
540 bd_le.le_list) {
541 /* An ordered write buffer */
542 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
543 list_move(&bd1->bd_le.le_list, &started);
544 if (bd1 == bd2) {
545 bd2 = NULL;
546 bd2 = list_prepare_entry(bd2,
547 &sdp->sd_log_le_databuf,
548 bd_le.le_list);
549 }
550 total_dbuf--;
551 if (bd1->bd_bh) {
552 get_bh(bd1->bd_bh);
553 if (buffer_dirty(bd1->bd_bh)) {
554 gfs2_log_unlock(sdp);
555 wait_on_buffer(bd1->bd_bh);
556 ll_rw_block(WRITE, 1,
557 &bd1->bd_bh);
558 gfs2_log_lock(sdp);
559 }
560 brelse(bd1->bd_bh);
561 continue;
562 }
563 continue;
564 } else if (bd1->bd_bh) { /* A journaled buffer */
565 int magic;
566 gfs2_log_unlock(sdp);
567 if (!bh) {
568 bh = gfs2_log_get_buf(sdp);
569 sdp->sd_log_num_hdrs++;
570 ld = (struct gfs2_log_descriptor *)
571 bh->b_data;
572 ptr = (__be64 *)(bh->b_data + offset);
573 ld->ld_header.mh_magic =
574 cpu_to_be32(GFS2_MAGIC);
575 ld->ld_header.mh_type =
576 cpu_to_be32(GFS2_METATYPE_LD);
577 ld->ld_header.mh_format =
578 cpu_to_be32(GFS2_FORMAT_LD);
579 ld->ld_type =
580 cpu_to_be32(GFS2_LOG_DESC_JDATA);
581 ld->ld_length = cpu_to_be32(num + 1);
582 ld->ld_data1 = cpu_to_be32(num);
583 ld->ld_data2 = cpu_to_be32(0);
584 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
585 }
586 magic = gfs2_check_magic(bd1->bd_bh);
587 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
588 *ptr++ = cpu_to_be64((__u64)magic);
589 clear_buffer_escaped(bd1->bd_bh);
590 if (unlikely(magic != 0))
591 set_buffer_escaped(bd1->bd_bh);
592 gfs2_log_lock(sdp);
593			if (++n >= num)
594 break;
595 }
596 }
597 gfs2_log_unlock(sdp);
598 if (bh) {
599 set_buffer_dirty(bh);
600 ll_rw_block(WRITE, 1, &bh);
601 bh = NULL;
602 }
603 n = 0;
604 gfs2_log_lock(sdp);
605 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
606 bd_le.le_list) {
607 if (!bd2->bd_bh)
608 continue;
609 /* copy buffer if it needs escaping */
610 gfs2_log_unlock(sdp);
611 if (unlikely(buffer_escaped(bd2->bd_bh))) {
612 void *kaddr;
613 struct page *page = bd2->bd_bh->b_page;
614 bh = gfs2_log_get_buf(sdp);
615 kaddr = kmap_atomic(page, KM_USER0);
616 memcpy(bh->b_data,
617 kaddr + bh_offset(bd2->bd_bh),
618 sdp->sd_sb.sb_bsize);
619 kunmap_atomic(page, KM_USER0);
620 *(__be32 *)bh->b_data = 0;
621 } else {
622 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
623 }
624 set_buffer_dirty(bh);
625 ll_rw_block(WRITE, 1, &bh);
626 gfs2_log_lock(sdp);
627 if (++n >= num)
628 break;
629 }
630 bh = NULL;
631 total_dbuf -= num;
632 total_jdata -= num;
633 }
634 gfs2_log_unlock(sdp);
635
636 /* Wait on all ordered buffers */
637 while (!list_empty(&started)) {
638 gfs2_log_lock(sdp);
639 bd1 = list_entry(started.next, struct gfs2_bufdata,
640 bd_le.le_list);
641 list_del_init(&bd1->bd_le.le_list);
642 sdp->sd_log_num_databuf--;
643 bh = bd1->bd_bh;
644 if (bh) {
645 bh->b_private = NULL;
646 get_bh(bh);
647 gfs2_log_unlock(sdp);
648 wait_on_buffer(bh);
649 brelse(bh);
650 } else
651 gfs2_log_unlock(sdp);
652
653 kmem_cache_free(gfs2_bufdata_cachep, bd1);
654 }
655
656 /* We've removed all the ordered write bufs here, so only jdata left */
657 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
658}
659
660static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
661 struct gfs2_log_descriptor *ld,
662 __be64 *ptr, int pass)
663{
664 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
665 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
666 struct gfs2_glock *gl = ip->i_gl;
667 unsigned int blks = be32_to_cpu(ld->ld_data1);
668 struct buffer_head *bh_log, *bh_ip;
669 uint64_t blkno;
670 uint64_t esc;
671 int error = 0;
672
673 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
674 return 0;
675
676 gfs2_replay_incr_blk(sdp, &start);
677 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
678 blkno = be64_to_cpu(*ptr++);
679 esc = be64_to_cpu(*ptr++);
680
681 sdp->sd_found_blocks++;
682
683 if (gfs2_revoke_check(sdp, blkno, start))
684 continue;
685
686 error = gfs2_replay_read_block(jd, start, &bh_log);
687 if (error)
688 return error;
689
690 bh_ip = gfs2_meta_new(gl, blkno);
691 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
692
693 /* Unescape */
694 if (esc) {
695 __be32 *eptr = (__be32 *)bh_ip->b_data;
696 *eptr = cpu_to_be32(GFS2_MAGIC);
697 }
698 mark_buffer_dirty(bh_ip);
699
700 brelse(bh_log);
701 brelse(bh_ip);
702 if (error)
703 break;
704
705 sdp->sd_replayed_blocks++;
706 }
707
708 return error;
709}
710
711/* FIXME: sort out accounting for log blocks etc. */
712
713static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
714{
715 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
716 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
717
718 if (error) {
719 gfs2_meta_sync(ip->i_gl,
720 DIO_START | DIO_WAIT);
721 return;
722 }
723 if (pass != 1)
724 return;
725
726 /* data sync? */
727 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
728
729 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
730 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
731}
732
733static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
734{
735 struct list_head *head = &sdp->sd_log_le_databuf;
736 struct gfs2_bufdata *bd;
737
738 while (!list_empty(head)) {
739 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
740 list_del_init(&bd->bd_le.le_list);
741 sdp->sd_log_num_databuf--;
742 sdp->sd_log_num_jdata--;
743 gfs2_unpin(sdp, bd->bd_bh, ai);
744 }
745 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
746 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
747}
748
749
750const struct gfs2_log_operations gfs2_glock_lops = {
751 .lo_add = glock_lo_add,
752 .lo_after_commit = glock_lo_after_commit,
753 .lo_name = "glock"
754};
755
756const struct gfs2_log_operations gfs2_buf_lops = {
757 .lo_add = buf_lo_add,
758 .lo_incore_commit = buf_lo_incore_commit,
759 .lo_before_commit = buf_lo_before_commit,
760 .lo_after_commit = buf_lo_after_commit,
761 .lo_before_scan = buf_lo_before_scan,
762 .lo_scan_elements = buf_lo_scan_elements,
763 .lo_after_scan = buf_lo_after_scan,
764 .lo_name = "buf"
765};
766
767const struct gfs2_log_operations gfs2_revoke_lops = {
768 .lo_add = revoke_lo_add,
769 .lo_before_commit = revoke_lo_before_commit,
770 .lo_before_scan = revoke_lo_before_scan,
771 .lo_scan_elements = revoke_lo_scan_elements,
772 .lo_after_scan = revoke_lo_after_scan,
773 .lo_name = "revoke"
774};
775
776const struct gfs2_log_operations gfs2_rg_lops = {
777 .lo_add = rg_lo_add,
778 .lo_after_commit = rg_lo_after_commit,
779 .lo_name = "rg"
780};
781
782const struct gfs2_log_operations gfs2_databuf_lops = {
783 .lo_add = databuf_lo_add,
784 .lo_incore_commit = buf_lo_incore_commit,
785 .lo_before_commit = databuf_lo_before_commit,
786 .lo_after_commit = databuf_lo_after_commit,
787 .lo_scan_elements = databuf_lo_scan_elements,
788 .lo_after_scan = databuf_lo_after_scan,
789 .lo_name = "databuf"
790};
791
792const struct gfs2_log_operations *gfs2_log_ops[] = {
793 &gfs2_glock_lops,
794 &gfs2_buf_lops,
795 &gfs2_revoke_lops,
796 &gfs2_rg_lops,
797 &gfs2_databuf_lops,
798 NULL
799};
800
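The escape dance above exists because a journaled data block whose first word happens to equal GFS2_MAGIC would be indistinguishable from log metadata during replay: databuf_lo_before_commit() zeroes that word in the log copy and records a flag in the descriptor entry, and databuf_lo_scan_elements() restores it when the block is written back in place. A minimal userspace sketch of that round trip, assuming htonl() as a stand-in for cpu_to_be32() (the buffer handling here is illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* htonl() stands in for cpu_to_be32() */

#define GFS2_MAGIC 0x01161970

/* Before commit: copy the data block into the log; escape if needed. */
static int escape_for_log(uint32_t *log_copy, const uint32_t *data,
			  size_t nwords)
{
	memcpy(log_copy, data, nwords * sizeof(uint32_t));
	if (log_copy[0] == htonl(GFS2_MAGIC)) {
		log_copy[0] = 0;	/* zero the magic in the log copy */
		return 1;		/* record the escape in the descriptor */
	}
	return 0;
}

/* On replay: undo the escape before writing the block back in place. */
static void unescape_on_replay(uint32_t *data, int escaped)
{
	if (escaped)
		data[0] = htonl(GFS2_MAGIC);
}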
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..8a1029d3d389
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,96 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13extern const struct gfs2_log_operations gfs2_glock_lops;
14extern const struct gfs2_log_operations gfs2_buf_lops;
15extern const struct gfs2_log_operations gfs2_revoke_lops;
16extern const struct gfs2_log_operations gfs2_rg_lops;
17extern const struct gfs2_log_operations gfs2_databuf_lops;
18
19extern const struct gfs2_log_operations *gfs2_log_ops[];
20
21static inline void lops_init_le(struct gfs2_log_element *le,
22 const struct gfs2_log_operations *lops)
23{
24 INIT_LIST_HEAD(&le->le_list);
25 le->le_ops = lops;
26}
27
28static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
29{
30 if (le->le_ops->lo_add)
31 le->le_ops->lo_add(sdp, le);
32}
33
34static inline void lops_incore_commit(struct gfs2_sbd *sdp,
35 struct gfs2_trans *tr)
36{
37 int x;
38 for (x = 0; gfs2_log_ops[x]; x++)
39 if (gfs2_log_ops[x]->lo_incore_commit)
40 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
41}
42
43static inline void lops_before_commit(struct gfs2_sbd *sdp)
44{
45 int x;
46 for (x = 0; gfs2_log_ops[x]; x++)
47 if (gfs2_log_ops[x]->lo_before_commit)
48 gfs2_log_ops[x]->lo_before_commit(sdp);
49}
50
51static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 int x;
54 for (x = 0; gfs2_log_ops[x]; x++)
55 if (gfs2_log_ops[x]->lo_after_commit)
56 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
57}
58
59static inline void lops_before_scan(struct gfs2_jdesc *jd,
60 struct gfs2_log_header *head,
61 unsigned int pass)
62{
63 int x;
64 for (x = 0; gfs2_log_ops[x]; x++)
65 if (gfs2_log_ops[x]->lo_before_scan)
66 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
67}
68
69static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
70 struct gfs2_log_descriptor *ld,
71 __be64 *ptr,
72 unsigned int pass)
73{
74 int x, error;
75 for (x = 0; gfs2_log_ops[x]; x++)
76 if (gfs2_log_ops[x]->lo_scan_elements) {
77 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
78 ld, ptr, pass);
79 if (error)
80 return error;
81 }
82
83 return 0;
84}
85
86static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
87 unsigned int pass)
88{
89 int x;
90 for (x = 0; gfs2_log_ops[x]; x++)
91		if (gfs2_log_ops[x]->lo_after_scan)
92 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
93}
94
95#endif /* __LOPS_DOT_H__ */
96
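These inline helpers are the whole dispatch layer: every journaled object embeds a struct gfs2_log_element, binds it to one of the ops vectors with lops_init_le(), and enters the current transaction through lops_add(). A sketch of the calling pattern, simplified from gfs2_attach_bufdata() in meta_io.c below (locking elided):

static void attach_to_log(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd,
			  int meta)
{
	/* Bind the element to the "buf" or "databuf" ops vector ... */
	lops_init_le(&bd->bd_le, meta ? &gfs2_buf_lops : &gfs2_databuf_lops);
	/* ... then lo_add() (buf_lo_add()/databuf_lo_add()) hooks it into
	   the current transaction's lists. */
	lops_add(sdp, &bd->bd_le);
}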
diff --git a/fs/gfs2/lvb.c b/fs/gfs2/lvb.c
new file mode 100644
index 000000000000..e88e9cce14e7
--- /dev/null
+++ b/fs/gfs2/lvb.c
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "lvb.h"
21
22#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
23 struct->member);
24
25void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb)
26{
27 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
28
29 qb->qb_magic = be32_to_cpu(str->qb_magic);
30 qb->qb_limit = be64_to_cpu(str->qb_limit);
31 qb->qb_warn = be64_to_cpu(str->qb_warn);
32 qb->qb_value = be64_to_cpu(str->qb_value);
33}
34
35void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb)
36{
37 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
38
39 str->qb_magic = cpu_to_be32(qb->qb_magic);
40 str->qb_limit = cpu_to_be64(qb->qb_limit);
41 str->qb_warn = cpu_to_be64(qb->qb_warn);
42 str->qb_value = cpu_to_be64(qb->qb_value);
43}
44
45
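Both converters run over the same fixed big-endian layout, so a lock value block written on one node reads back identically on any other, whatever the host byte order. A small sketch of the round trip (the field values are illustrative):

static void quota_lvb_round_trip(void)
{
	char lvb[GFS2_MIN_LVB_SIZE];	/* stands in for the DLM-supplied LVB */
	struct gfs2_quota_lvb out = {
		.qb_magic = GFS2_MAGIC,
		.qb_limit = 1000,
	};
	struct gfs2_quota_lvb in;

	gfs2_quota_lvb_out(&out, lvb);	/* cpu order -> big-endian block */
	gfs2_quota_lvb_in(&in, lvb);	/* big-endian block -> cpu order */
	/* in.qb_limit == 1000 on both little- and big-endian hosts */
}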
diff --git a/fs/gfs2/lvb.h b/fs/gfs2/lvb.h
new file mode 100644
index 000000000000..1b1a8b75219a
--- /dev/null
+++ b/fs/gfs2/lvb.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LVB_DOT_H__
11#define __LVB_DOT_H__
12
13#define GFS2_MIN_LVB_SIZE 32
14
15void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb);
16void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb);
17
18#endif /* __LVB_DOT_H__ */
19
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..c112943ee8c1
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "ops_fstype.h"
23#include "sys.h"
24#include "util.h"
25
26static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
27{
28 struct gfs2_inode *ip = foo;
29 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
30 SLAB_CTOR_CONSTRUCTOR) {
31 inode_init_once(&ip->i_inode);
32 spin_lock_init(&ip->i_spin);
33 init_rwsem(&ip->i_rw_mutex);
34 memset(ip->i_cache, 0, sizeof(ip->i_cache));
35 }
36}
37
38/**
39 * init_gfs2_fs - Register GFS2 as a filesystem
40 *
41 * Returns: 0 on success, error code on failure
42 */
43
44static int __init init_gfs2_fs(void)
45{
46 int error;
47
48 gfs2_init_lmh();
49
50 error = gfs2_sys_init();
51 if (error)
52 return error;
53
54 error = -ENOMEM;
55
56 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
57 sizeof(struct gfs2_glock),
58 0, 0, NULL, NULL);
59 if (!gfs2_glock_cachep)
60 goto fail;
61
62 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
63 sizeof(struct gfs2_inode),
64 0, (SLAB_RECLAIM_ACCOUNT|
65 SLAB_PANIC|SLAB_MEM_SPREAD),
66 gfs2_init_inode_once, NULL);
67 if (!gfs2_inode_cachep)
68 goto fail;
69
70 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
71 sizeof(struct gfs2_bufdata),
72 0, 0, NULL, NULL);
73 if (!gfs2_bufdata_cachep)
74 goto fail;
75
76 error = register_filesystem(&gfs2_fs_type);
77 if (error)
78 goto fail;
79
80 error = register_filesystem(&gfs2meta_fs_type);
81 if (error)
82 goto fail_unregister;
83
84	printk(KERN_INFO "GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
85
86 return 0;
87
88fail_unregister:
89 unregister_filesystem(&gfs2_fs_type);
90fail:
91 if (gfs2_bufdata_cachep)
92 kmem_cache_destroy(gfs2_bufdata_cachep);
93
94 if (gfs2_inode_cachep)
95 kmem_cache_destroy(gfs2_inode_cachep);
96
97 if (gfs2_glock_cachep)
98 kmem_cache_destroy(gfs2_glock_cachep);
99
100 gfs2_sys_uninit();
101 return error;
102}
103
104/**
105 * exit_gfs2_fs - Unregister the file system
106 *
107 */
108
109static void __exit exit_gfs2_fs(void)
110{
111 unregister_filesystem(&gfs2_fs_type);
112 unregister_filesystem(&gfs2meta_fs_type);
113
114 kmem_cache_destroy(gfs2_bufdata_cachep);
115 kmem_cache_destroy(gfs2_inode_cachep);
116 kmem_cache_destroy(gfs2_glock_cachep);
117
118 gfs2_sys_uninit();
119}
120
121MODULE_DESCRIPTION("Global File System");
122MODULE_AUTHOR("Red Hat, Inc.");
123MODULE_LICENSE("GPL");
124
125module_init(init_gfs2_fs);
126module_exit(exit_gfs2_fs);
127
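One detail worth noting in the cache setup above: a slab constructor runs once when an object is first created on a slab page, not on every allocation, so gfs2_init_inode_once() may only initialize state that stays valid across free/alloc cycles. A sketch of the same pattern, mirroring the era's kmem_cache API used above (the struct and names are illustrative):

struct example {
	int cached_state;
};

static kmem_cache_t *example_cachep;

static void example_init_once(void *foo, kmem_cache_t *cachep,
			      unsigned long flags)
{
	struct example *e = foo;

	/* Runs once per object lifetime, not once per allocation */
	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
		e->cached_state = 0;
}

/* example_cachep = kmem_cache_create("example", sizeof(struct example),
 *				       0, 0, example_init_once, NULL); */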
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..502864b24196
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,779 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21
22#include "gfs2.h"
23#include "lm_interface.h"
24#include "incore.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "log.h"
29#include "lops.h"
30#include "meta_io.h"
31#include "rgrp.h"
32#include "trans.h"
33#include "util.h"
34#include "ops_address.h"
35
36#define buffer_busy(bh) \
37((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
38#define buffer_in_io(bh) \
39((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
40
41static int aspace_get_block(struct inode *inode, sector_t lblock,
42 struct buffer_head *bh_result, int create)
43{
44 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
45 return -EOPNOTSUPP;
46}
47
48static int gfs2_aspace_writepage(struct page *page,
49 struct writeback_control *wbc)
50{
51 return block_write_full_page(page, aspace_get_block, wbc);
52}
53
54static const struct address_space_operations aspace_aops = {
55 .writepage = gfs2_aspace_writepage,
56 .releasepage = gfs2_releasepage,
57};
58
59/**
60 * gfs2_aspace_get - Create and initialize an address-space inode
61 * @sdp: the filesystem the aspace is in
62 *
63 * Right now an aspace is just a bare struct inode. Maybe Linux
64 * will supply a more lightweight address space construct (that works)
65 * in the future.
66 *
67 * Make sure pages/buffers in this aspace aren't in high memory.
68 *
69 * Returns: the aspace
70 */
71
72struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
73{
74 struct inode *aspace;
75
76 aspace = new_inode(sdp->sd_vfs);
77 if (aspace) {
78 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
79 aspace->i_mapping->a_ops = &aspace_aops;
80 aspace->i_size = ~0ULL;
81 aspace->u.generic_ip = NULL;
82 insert_inode_hash(aspace);
83 }
84 return aspace;
85}
86
87void gfs2_aspace_put(struct inode *aspace)
88{
89 remove_inode_hash(aspace);
90 iput(aspace);
91}
92
93/**
94 * gfs2_ail1_start_one - Start I/O on a part of the AIL
95 * @sdp: the filesystem
96 * @ai: the AIL transaction
97 *
98 */
99
100void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
101{
102 struct gfs2_bufdata *bd, *s;
103 struct buffer_head *bh;
104 int retry;
105
106 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
107
108 do {
109 retry = 0;
110
111 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
112 bd_ail_st_list) {
113 bh = bd->bd_bh;
114
115 gfs2_assert(sdp, bd->bd_ail == ai);
116
117 if (!buffer_busy(bh)) {
118 if (!buffer_uptodate(bh)) {
119 gfs2_log_unlock(sdp);
120 gfs2_io_error_bh(sdp, bh);
121 gfs2_log_lock(sdp);
122 }
123 list_move(&bd->bd_ail_st_list,
124 &ai->ai_ail2_list);
125 continue;
126 }
127
128 if (!buffer_dirty(bh))
129 continue;
130
131 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
132
133 gfs2_log_unlock(sdp);
134 wait_on_buffer(bh);
135 ll_rw_block(WRITE, 1, &bh);
136 gfs2_log_lock(sdp);
137
138 retry = 1;
139 break;
140 }
141 } while (retry);
142}
143
144/**
145 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
146 * @sdp: the filesystem
147 * @ai: the AIL entry
148 *
149 */
150
151int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
152{
153 struct gfs2_bufdata *bd, *s;
154 struct buffer_head *bh;
155
156 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
157 bd_ail_st_list) {
158 bh = bd->bd_bh;
159
160 gfs2_assert(sdp, bd->bd_ail == ai);
161
162 if (buffer_busy(bh)) {
163 if (flags & DIO_ALL)
164 continue;
165 else
166 break;
167 }
168
169 if (!buffer_uptodate(bh))
170 gfs2_io_error_bh(sdp, bh);
171
172 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
173 }
174
175 return list_empty(&ai->ai_ail1_list);
176}
177
178/**
179 * gfs2_ail2_empty_one - Remove the synced buffers from an AIL transaction
180 * @sdp: the filesystem
181 * @ai: the AIL entry
182 *
183 */
184
185void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
186{
187 struct list_head *head = &ai->ai_ail2_list;
188 struct gfs2_bufdata *bd;
189
190 while (!list_empty(head)) {
191 bd = list_entry(head->prev, struct gfs2_bufdata,
192 bd_ail_st_list);
193 gfs2_assert(sdp, bd->bd_ail == ai);
194 bd->bd_ail = NULL;
195 list_del(&bd->bd_ail_st_list);
196 list_del(&bd->bd_ail_gl_list);
197 atomic_dec(&bd->bd_gl->gl_ail_count);
198 brelse(bd->bd_bh);
199 }
200}
201
202/**
203 * gfs2_ail_empty_gl - remove all buffers for a given lock from the AIL
204 * @gl: the glock
205 *
206 * None of the buffers should be dirty, locked, or pinned.
207 */
208
209void gfs2_ail_empty_gl(struct gfs2_glock *gl)
210{
211 struct gfs2_sbd *sdp = gl->gl_sbd;
212 unsigned int blocks;
213 struct list_head *head = &gl->gl_ail_list;
214 struct gfs2_bufdata *bd;
215 struct buffer_head *bh;
216 uint64_t blkno;
217 int error;
218
219 blocks = atomic_read(&gl->gl_ail_count);
220 if (!blocks)
221 return;
222
223 error = gfs2_trans_begin(sdp, 0, blocks);
224 if (gfs2_assert_withdraw(sdp, !error))
225 return;
226
227 gfs2_log_lock(sdp);
228 while (!list_empty(head)) {
229 bd = list_entry(head->next, struct gfs2_bufdata,
230 bd_ail_gl_list);
231 bh = bd->bd_bh;
232 blkno = bh->b_blocknr;
233 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
234
235 bd->bd_ail = NULL;
236 list_del(&bd->bd_ail_st_list);
237 list_del(&bd->bd_ail_gl_list);
238 atomic_dec(&gl->gl_ail_count);
239 brelse(bh);
240 gfs2_log_unlock(sdp);
241
242 gfs2_trans_add_revoke(sdp, blkno);
243
244 gfs2_log_lock(sdp);
245 }
246 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
247 gfs2_log_unlock(sdp);
248
249 gfs2_trans_end(sdp);
250 gfs2_log_flush(sdp, NULL);
251}
252
253/**
254 * gfs2_meta_inval - Invalidate all buffers associated with a glock
255 * @gl: the glock
256 *
257 */
258
259void gfs2_meta_inval(struct gfs2_glock *gl)
260{
261 struct gfs2_sbd *sdp = gl->gl_sbd;
262 struct inode *aspace = gl->gl_aspace;
263 struct address_space *mapping = gl->gl_aspace->i_mapping;
264
265 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
266
267 atomic_inc(&aspace->i_writecount);
268 truncate_inode_pages(mapping, 0);
269 atomic_dec(&aspace->i_writecount);
270
271 gfs2_assert_withdraw(sdp, !mapping->nrpages);
272}
273
274/**
275 * gfs2_meta_sync - Sync all buffers associated with a glock
276 * @gl: The glock
277 * @flags: DIO_START | DIO_WAIT
278 *
279 */
280
281void gfs2_meta_sync(struct gfs2_glock *gl, int flags)
282{
283 struct address_space *mapping = gl->gl_aspace->i_mapping;
284 int error = 0;
285
286 if (flags & DIO_START)
287 filemap_fdatawrite(mapping);
288 if (!error && (flags & DIO_WAIT))
289 error = filemap_fdatawait(mapping);
290
291 if (error)
292 gfs2_io_error(gl->gl_sbd);
293}
294
295/**
296 * getbuf - Get a buffer with a given address space
297 * @sdp: the filesystem
298 * @aspace: the address space
299 * @blkno: the block number (filesystem scope)
300 * @create: 1 if the buffer should be created
301 *
302 * Returns: the buffer
303 */
304
305static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
306 uint64_t blkno, int create)
307{
308 struct page *page;
309 struct buffer_head *bh;
310 unsigned int shift;
311 unsigned long index;
312 unsigned int bufnum;
313
314 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
315 index = blkno >> shift; /* convert block to page */
316 bufnum = blkno - (index << shift); /* block buf index within page */
317
318 if (create) {
319 for (;;) {
320 page = grab_cache_page(aspace->i_mapping, index);
321 if (page)
322 break;
323 yield();
324 }
325 } else {
326 page = find_lock_page(aspace->i_mapping, index);
327 if (!page)
328 return NULL;
329 }
330
331 if (!page_has_buffers(page))
332 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
333
334 /* Locate header for our buffer within our page */
335 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
336 /* Do nothing */;
337 get_bh(bh);
338
339 if (!buffer_mapped(bh))
340 map_bh(bh, sdp->sd_vfs, blkno);
341
342 unlock_page(page);
343 mark_page_accessed(page);
344 page_cache_release(page);
345
346 return bh;
347}
348
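/*
 * Worked example of the index arithmetic in getbuf() above (illustrative
 * sizes): with PAGE_CACHE_SHIFT == 12 (4K pages) and sb_bsize_shift == 10
 * (1K blocks), shift == 2, so each page caches four blocks. For
 * blkno == 11: index == 11 >> 2 == 2 (the third page) and
 * bufnum == 11 - (2 << 2) == 3, i.e. the fourth buffer_head on that
 * page's list.
 */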
349static void meta_prep_new(struct buffer_head *bh)
350{
351 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
352
353 lock_buffer(bh);
354 clear_buffer_dirty(bh);
355 set_buffer_uptodate(bh);
356 unlock_buffer(bh);
357
358 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
359}
360
361/**
362 * gfs2_meta_new - Get a block
363 * @gl: The glock associated with this block
364 * @blkno: The block number
365 *
366 * Returns: The buffer
367 */
368
369struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno)
370{
371 struct buffer_head *bh;
372 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
373 meta_prep_new(bh);
374 return bh;
375}
376
377/**
378 * gfs2_meta_read - Read a block from disk
379 * @gl: The glock covering the block
380 * @blkno: The block number
381 * @flags: flags to gfs2_meta_reread()
382 * @bhp: the place where the buffer is returned (NULL on failure)
383 *
384 * Returns: errno
385 */
386
387int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, int flags,
388 struct buffer_head **bhp)
389{
390 int error;
391
392 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
393 error = gfs2_meta_reread(gl->gl_sbd, *bhp, flags);
394 if (error)
395 brelse(*bhp);
396
397 return error;
398}
399
400/**
401 * gfs2_meta_reread - Reread a block from disk
402 * @sdp: the filesystem
403 * @bh: The block to read
404 * @flags: Flags that control the read
405 *
406 * Returns: errno
407 */
408
409int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags)
410{
411 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
412 return -EIO;
413
414 if (flags & DIO_FORCE)
415 clear_buffer_uptodate(bh);
416
417 if ((flags & DIO_START) && !buffer_uptodate(bh))
418 ll_rw_block(READ, 1, &bh);
419
420 if (flags & DIO_WAIT) {
421 wait_on_buffer(bh);
422
423 if (!buffer_uptodate(bh)) {
424 struct gfs2_trans *tr = current->journal_info;
425 if (tr && tr->tr_touched)
426 gfs2_io_error_bh(sdp, bh);
427 return -EIO;
428 }
429 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
430 return -EIO;
431 }
432
433 return 0;
434}
435
436/**
437 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
438 * @gl: the glock the buffer belongs to
439 * @bh: The buffer to be attached to
440 * @meta: Flag to indicate whether it is metadata or not
441 */
442
443void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
444 int meta)
445{
446 struct gfs2_bufdata *bd;
447
448 if (meta)
449 lock_page(bh->b_page);
450
451 if (bh->b_private) {
452 if (meta)
453 unlock_page(bh->b_page);
454 return;
455 }
456
457	bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
458 memset(bd, 0, sizeof(struct gfs2_bufdata));
459 bd->bd_bh = bh;
460 bd->bd_gl = gl;
461
462 INIT_LIST_HEAD(&bd->bd_list_tr);
463 if (meta) {
464 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
465 } else {
466 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
467 }
468 bh->b_private = bd;
469
470 if (meta)
471 unlock_page(bh->b_page);
472}
473
474/**
475 * gfs2_pin - Pin a buffer in memory
476 * @sdp: the filesystem the buffer belongs to
477 * @bh: The buffer to be pinned
478 *
479 */
480
481void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
482{
483 struct gfs2_bufdata *bd = bh->b_private;
484
485 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
486
487 if (test_set_buffer_pinned(bh))
488 gfs2_assert_withdraw(sdp, 0);
489
490 wait_on_buffer(bh);
491
492	/* If this buffer is in the AIL and it has already been written
493	   back to its in-place disk block, remove it from the AIL. */
494
495 gfs2_log_lock(sdp);
496 if (bd->bd_ail && !buffer_in_io(bh))
497 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
498 gfs2_log_unlock(sdp);
499
500 clear_buffer_dirty(bh);
501 wait_on_buffer(bh);
502
503 if (!buffer_uptodate(bh))
504 gfs2_io_error_bh(sdp, bh);
505
506 get_bh(bh);
507}
508
509/**
510 * gfs2_unpin - Unpin a buffer
511 * @sdp: the filesystem the buffer belongs to
512 * @bh: The buffer to unpin
513 * @ai:
514 *
515 */
516
517void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
518 struct gfs2_ail *ai)
519{
520 struct gfs2_bufdata *bd = bh->b_private;
521
522 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
523
524 if (!buffer_pinned(bh))
525 gfs2_assert_withdraw(sdp, 0);
526
527 mark_buffer_dirty(bh);
528 clear_buffer_pinned(bh);
529
530 gfs2_log_lock(sdp);
531 if (bd->bd_ail) {
532 list_del(&bd->bd_ail_st_list);
533 brelse(bh);
534 } else {
535 struct gfs2_glock *gl = bd->bd_gl;
536 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
537 atomic_inc(&gl->gl_ail_count);
538 }
539 bd->bd_ail = ai;
540 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
541 gfs2_log_unlock(sdp);
542}
543
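/*
 * Putting gfs2_pin()/gfs2_unpin() together (a simplified reading of the
 * code above): a journaled buffer is pinned when it joins a transaction,
 * written to the log at commit, then unpinned, which redirties it and
 * parks it on the AIL's ai_ail1_list. Once gfs2_ail1_start_one() has
 * written it back in place it migrates to ai_ail2_list, at which point
 * the log space it occupied can be reclaimed.
 */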
544/**
544 * gfs2_meta_wipe - ensure an inode's buffers are no longer dirty or pinned
546 * @ip: the inode who owns the buffers
547 * @bstart: the first buffer in the run
548 * @blen: the number of buffers in the run
549 *
550 */
551
552void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
553{
554 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
555 struct inode *aspace = ip->i_gl->gl_aspace;
556 struct buffer_head *bh;
557
558 while (blen) {
559 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
560 if (bh) {
561 struct gfs2_bufdata *bd = bh->b_private;
562
563 if (test_clear_buffer_pinned(bh)) {
564 struct gfs2_trans *tr = current->journal_info;
565 gfs2_log_lock(sdp);
566 list_del_init(&bd->bd_le.le_list);
567 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
568 sdp->sd_log_num_buf--;
569 gfs2_log_unlock(sdp);
570 tr->tr_num_buf_rm++;
571 brelse(bh);
572 }
573 if (bd) {
574 gfs2_log_lock(sdp);
575 if (bd->bd_ail) {
576 uint64_t blkno = bh->b_blocknr;
577 bd->bd_ail = NULL;
578 list_del(&bd->bd_ail_st_list);
579 list_del(&bd->bd_ail_gl_list);
580 atomic_dec(&bd->bd_gl->gl_ail_count);
581 brelse(bh);
582 gfs2_log_unlock(sdp);
583 gfs2_trans_add_revoke(sdp, blkno);
584 } else
585 gfs2_log_unlock(sdp);
586 }
587
588 lock_buffer(bh);
589 clear_buffer_dirty(bh);
590 clear_buffer_uptodate(bh);
591 unlock_buffer(bh);
592
593 brelse(bh);
594 }
595
596 bstart++;
597 blen--;
598 }
599}
600
601/**
602 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
603 * @ip: The GFS2 inode
604 *
605 * This releases buffers that are in the most-recently-used array of
606 * blocks used for indirect block addressing for this inode.
607 */
608
609void gfs2_meta_cache_flush(struct gfs2_inode *ip)
610{
611 struct buffer_head **bh_slot;
612 unsigned int x;
613
614 spin_lock(&ip->i_spin);
615
616 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
617 bh_slot = &ip->i_cache[x];
618 if (!*bh_slot)
619 break;
620 brelse(*bh_slot);
621 *bh_slot = NULL;
622 }
623
624 spin_unlock(&ip->i_spin);
625}
626
627/**
628 * gfs2_meta_indirect_buffer - Get a metadata buffer
629 * @ip: The GFS2 inode
630 * @height: The level of this buf in the metadata (indir addr) tree (if any)
631 * @num: The block number (device relative) of the buffer
632 * @new: Non-zero if we may create a new buffer
633 * @bhp: the buffer is returned here
634 *
635 * Try to use the gfs2_inode's MRU metadata tree cache.
636 *
637 * Returns: errno
638 */
639
640int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
641 int new, struct buffer_head **bhp)
642{
643 struct buffer_head *bh, **bh_slot = ip->i_cache + height;
644 int error;
645
646 spin_lock(&ip->i_spin);
647 bh = *bh_slot;
648 if (bh) {
649 if (bh->b_blocknr == num)
650 get_bh(bh);
651 else
652 bh = NULL;
653 }
654 spin_unlock(&ip->i_spin);
655
656 if (bh) {
657 if (new)
658 meta_prep_new(bh);
659 else {
660 error = gfs2_meta_reread(GFS2_SB(&ip->i_inode), bh,
661 DIO_START | DIO_WAIT);
662 if (error) {
663 brelse(bh);
664 return error;
665 }
666 }
667 } else {
668 if (new)
669 bh = gfs2_meta_new(ip->i_gl, num);
670 else {
671 error = gfs2_meta_read(ip->i_gl, num,
672 DIO_START | DIO_WAIT, &bh);
673 if (error)
674 return error;
675 }
676
677 spin_lock(&ip->i_spin);
678 if (*bh_slot != bh) {
679 brelse(*bh_slot);
680 *bh_slot = bh;
681 get_bh(bh);
682 }
683 spin_unlock(&ip->i_spin);
684 }
685
686 if (new) {
687 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), height)) {
688 brelse(bh);
689 return -EIO;
690 }
691 gfs2_trans_add_bh(ip->i_gl, bh, 1);
692 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
693 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
694
695 } else if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh,
696 (height) ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)) {
697 brelse(bh);
698 return -EIO;
699 }
700
701 *bhp = bh;
702
703 return 0;
704}
705
706/**
707 * gfs2_meta_ra - start readahead on an extent of a file
708 * @gl: the glock the blocks belong to
709 * @dblock: the starting disk block
710 * @extlen: the number of blocks in the extent
711 *
712 */
713
714void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen)
715{
716 struct gfs2_sbd *sdp = gl->gl_sbd;
717 struct inode *aspace = gl->gl_aspace;
718 struct buffer_head *first_bh, *bh;
719 uint32_t max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
720 sdp->sd_sb.sb_bsize_shift;
721 int error;
722
723 if (!extlen || !max_ra)
724 return;
725 if (extlen > max_ra)
726 extlen = max_ra;
727
728 first_bh = getbuf(sdp, aspace, dblock, CREATE);
729
730 if (buffer_uptodate(first_bh))
731 goto out;
732 if (!buffer_locked(first_bh)) {
733 error = gfs2_meta_reread(sdp, first_bh, DIO_START);
734 if (error)
735 goto out;
736 }
737
738 dblock++;
739 extlen--;
740
741 while (extlen) {
742 bh = getbuf(sdp, aspace, dblock, CREATE);
743
744 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
745 error = gfs2_meta_reread(sdp, bh, DIO_START);
746 brelse(bh);
747 if (error)
748 goto out;
749 } else
750 brelse(bh);
751
752 dblock++;
753 extlen--;
754
755 if (buffer_uptodate(first_bh))
756 break;
757 }
758
759 out:
760 brelse(first_bh);
761}
762
763/**
764 * gfs2_meta_syncfs - sync all the buffers in a filesystem
765 * @sdp: the filesystem
766 *
767 */
768
769void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
770{
771 gfs2_log_flush(sdp, NULL);
772 for (;;) {
773 gfs2_ail1_start(sdp, DIO_ALL);
774 if (gfs2_ail1_empty(sdp, DIO_ALL))
775 break;
776 msleep(10);
777 }
778}
779
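gfs2_meta_ra() issues asynchronous reads (DIO_START without DIO_WAIT) for up to gt_max_readahead bytes of metadata, so a caller about to walk a contiguous run of blocks can overlap the I/O with its own work. A hedged usage sketch (the caller and extent are assumptions, not code from the patch):

static int walk_extent_start(struct gfs2_glock *gl, uint64_t dblock,
			     uint32_t extlen, struct buffer_head **bhp)
{
	/* Kick off asynchronous reads for the whole run first ... */
	gfs2_meta_ra(gl, dblock, extlen);
	/* ... so the synchronous read of the first block mostly hits cache. */
	return gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bhp);
}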
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..951814e86272
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,74 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13static inline void gfs2_buffer_clear(struct buffer_head *bh)
14{
15 memset(bh->b_data, 0, bh->b_size);
16}
17
18static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
19{
20 BUG_ON(head > bh->b_size);
21 memset(bh->b_data + head, 0, bh->b_size - head);
22}
23
24static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
25 int to_head,
26 struct buffer_head *from_bh,
27 int from_head)
28{
29 BUG_ON(from_head < to_head);
30 memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
31 from_bh->b_size - from_head);
32 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
33 0, from_head - to_head);
34}
35
36struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
37void gfs2_aspace_put(struct inode *aspace);
38
39void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
40int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags);
41void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
42void gfs2_ail_empty_gl(struct gfs2_glock *gl);
43
44void gfs2_meta_inval(struct gfs2_glock *gl);
45void gfs2_meta_sync(struct gfs2_glock *gl, int flags);
46
47struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno);
48int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno,
49 int flags, struct buffer_head **bhp);
50int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags);
51
52void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
53 int meta);
54void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
55void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
56 struct gfs2_ail *ai);
57
58void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
59
60void gfs2_meta_cache_flush(struct gfs2_inode *ip);
61int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
62 int new, struct buffer_head **bhp);
63
64static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
65 struct buffer_head **bhp)
66{
67 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
68}
69
70void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen);
71void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
72
73#endif /* __DIO_DOT_H__ */
74
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..0d4b230785af
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "mount.h"
21#include "sys.h"
22#include "util.h"
23
24/**
25 * gfs2_mount_args - Parse mount options
26 * @sdp: the filesystem
27 * @data_arg: the mount options string
28 *
29 * Returns: errno
30 */
31
32int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
33{
34 struct gfs2_args *args = &sdp->sd_args;
35 char *data = data_arg;
36 char *options, *o, *v;
37 int error = 0;
38
39 if (!remount) {
40 /* If someone preloaded options, use those instead */
41 spin_lock(&gfs2_sys_margs_lock);
42 if (gfs2_sys_margs) {
43 data = gfs2_sys_margs;
44 gfs2_sys_margs = NULL;
45 }
46 spin_unlock(&gfs2_sys_margs_lock);
47
48 /* Set some defaults */
49 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
50 args->ar_quota = GFS2_QUOTA_DEFAULT;
51 args->ar_data = GFS2_DATA_DEFAULT;
52 }
53
54 /* Split the options into tokens with the "," character and
55 process them */
56
57 for (options = data; (o = strsep(&options, ",")); ) {
58 if (!*o)
59 continue;
60
61 v = strchr(o, '=');
62 if (v)
63 *v++ = 0;
64
65 if (!strcmp(o, "lockproto")) {
66 if (!v)
67 goto need_value;
68 if (remount && strcmp(v, args->ar_lockproto))
69 goto cant_remount;
70 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
71 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
72 }
73
74 else if (!strcmp(o, "locktable")) {
75 if (!v)
76 goto need_value;
77 if (remount && strcmp(v, args->ar_locktable))
78 goto cant_remount;
79 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
80 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
81 }
82
83 else if (!strcmp(o, "hostdata")) {
84 if (!v)
85 goto need_value;
86 if (remount && strcmp(v, args->ar_hostdata))
87 goto cant_remount;
88 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
89 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
90 }
91
92 else if (!strcmp(o, "spectator")) {
93 if (remount && !args->ar_spectator)
94 goto cant_remount;
95 args->ar_spectator = 1;
96 sdp->sd_vfs->s_flags |= MS_RDONLY;
97 }
98
99 else if (!strcmp(o, "ignore_local_fs")) {
100 if (remount && !args->ar_ignore_local_fs)
101 goto cant_remount;
102 args->ar_ignore_local_fs = 1;
103 }
104
105 else if (!strcmp(o, "localflocks")) {
106 if (remount && !args->ar_localflocks)
107 goto cant_remount;
108 args->ar_localflocks = 1;
109 }
110
111 else if (!strcmp(o, "localcaching")) {
112 if (remount && !args->ar_localcaching)
113 goto cant_remount;
114 args->ar_localcaching = 1;
115 }
116
117 else if (!strcmp(o, "debug"))
118 args->ar_debug = 1;
119
120 else if (!strcmp(o, "nodebug"))
121 args->ar_debug = 0;
122
123 else if (!strcmp(o, "upgrade")) {
124 if (remount && !args->ar_upgrade)
125 goto cant_remount;
126 args->ar_upgrade = 1;
127 }
128
129 else if (!strcmp(o, "num_glockd")) {
130 unsigned int x;
131 if (!v)
132 goto need_value;
133 sscanf(v, "%u", &x);
134 if (remount && x != args->ar_num_glockd)
135 goto cant_remount;
136 if (!x || x > GFS2_GLOCKD_MAX) {
137 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
138 GFS2_GLOCKD_MAX, x);
139 error = -EINVAL;
140 break;
141 }
142 args->ar_num_glockd = x;
143 }
144
145 else if (!strcmp(o, "acl")) {
146 args->ar_posix_acl = 1;
147 sdp->sd_vfs->s_flags |= MS_POSIXACL;
148 }
149
150 else if (!strcmp(o, "noacl")) {
151 args->ar_posix_acl = 0;
152 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
153 }
154
155 else if (!strcmp(o, "quota")) {
156 if (!v)
157 goto need_value;
158 if (!strcmp(v, "off"))
159 args->ar_quota = GFS2_QUOTA_OFF;
160 else if (!strcmp(v, "account"))
161 args->ar_quota = GFS2_QUOTA_ACCOUNT;
162 else if (!strcmp(v, "on"))
163 args->ar_quota = GFS2_QUOTA_ON;
164 else {
165 fs_info(sdp, "invalid value for quota\n");
166 error = -EINVAL;
167 break;
168 }
169 }
170
171 else if (!strcmp(o, "suiddir"))
172 args->ar_suiddir = 1;
173
174 else if (!strcmp(o, "nosuiddir"))
175 args->ar_suiddir = 0;
176
177 else if (!strcmp(o, "data")) {
178 if (!v)
179 goto need_value;
180 if (!strcmp(v, "writeback"))
181 args->ar_data = GFS2_DATA_WRITEBACK;
182 else if (!strcmp(v, "ordered"))
183 args->ar_data = GFS2_DATA_ORDERED;
184 else {
185 fs_info(sdp, "invalid value for data\n");
186 error = -EINVAL;
187 break;
188 }
189 }
190
191 else {
192 fs_info(sdp, "unknown option: %s\n", o);
193 error = -EINVAL;
194 break;
195 }
196 }
197
198 if (error)
199 fs_info(sdp, "invalid mount option(s)\n");
200
201 if (data != data_arg)
202 kfree(data);
203
204 return error;
205
206 need_value:
207 fs_info(sdp, "need value for option %s\n", o);
208 return -EINVAL;
209
210 cant_remount:
211 fs_info(sdp, "can't remount with option %s\n", o);
212 return -EINVAL;
213}
214
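The tokenizing above is plain strsep(): split the option string on ',' and each token on its first '='. A standalone userspace sketch of the same loop (the option values here are illustrative):

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "lockproto=lock_dlm,debug,quota=on";
	char *options = buf, *o, *v;

	while ((o = strsep(&options, ","))) {
		if (!*o)
			continue;	/* skip empty tokens, e.g. ",," */
		v = strchr(o, '=');
		if (v)
			*v++ = '\0';	/* split "name=value" in place */
		printf("option %s value %s\n", o, v ? v : "(none)");
	}
	return 0;
}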
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..2eb14722144f
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
14
15#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..39c7f0345fc6
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15
16#include "gfs2.h"
17#include <linux/gfs2_ondisk.h>
18
19#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
20 struct->member);
21
22/*
23 * gfs2_xxx_in - read in an xxx struct
24 * first arg: the cpu-order structure
25 * buf: the disk-order buffer
26 *
27 * gfs2_xxx_out - write out an xxx struct
28 * first arg: the cpu-order structure
29 * buf: the disk-order buffer
30 *
31 * gfs2_xxx_print - print out an xxx struct
32 * first arg: the cpu-order structure
33 */
34
35void gfs2_inum_in(struct gfs2_inum *no, char *buf)
36{
37 struct gfs2_inum *str = (struct gfs2_inum *)buf;
38
39 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
40 no->no_addr = be64_to_cpu(str->no_addr);
41}
42
43void gfs2_inum_out(const struct gfs2_inum *no, char *buf)
44{
45 struct gfs2_inum *str = (struct gfs2_inum *)buf;
46
47 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
48 str->no_addr = cpu_to_be64(no->no_addr);
49}
50
51static void gfs2_inum_print(struct gfs2_inum *no)
52{
53 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
54 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
55}
56
57static void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf)
58{
59 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
60
61 mh->mh_magic = be32_to_cpu(str->mh_magic);
62 mh->mh_type = be32_to_cpu(str->mh_type);
63 mh->mh_format = be32_to_cpu(str->mh_format);
64}
65
66static void gfs2_meta_header_out(struct gfs2_meta_header *mh, char *buf)
67{
68 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
69
70 str->mh_magic = cpu_to_be32(mh->mh_magic);
71 str->mh_type = cpu_to_be32(mh->mh_type);
72 str->mh_format = cpu_to_be32(mh->mh_format);
73}
74
75static void gfs2_meta_header_print(struct gfs2_meta_header *mh)
76{
77 pv(mh, mh_magic, "0x%.8X");
78 pv(mh, mh_type, "%u");
79 pv(mh, mh_format, "%u");
80}
81
82void gfs2_sb_in(struct gfs2_sb *sb, char *buf)
83{
84 struct gfs2_sb *str = (struct gfs2_sb *)buf;
85
86 gfs2_meta_header_in(&sb->sb_header, buf);
87
88 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
89 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
90 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
91 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
92
93 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
94 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
95
96 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
97 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
98}
99
100void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf)
101{
102 struct gfs2_rindex *str = (struct gfs2_rindex *)buf;
103
104 ri->ri_addr = be64_to_cpu(str->ri_addr);
105 ri->ri_length = be32_to_cpu(str->ri_length);
106 ri->ri_data0 = be64_to_cpu(str->ri_data0);
107 ri->ri_data = be32_to_cpu(str->ri_data);
108 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
109
110}
111
112void gfs2_rindex_print(struct gfs2_rindex *ri)
113{
114 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
115 pv(ri, ri_length, "%u");
116
117 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
118 pv(ri, ri_data, "%u");
119
120 pv(ri, ri_bitbytes, "%u");
121}
122
123void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf)
124{
125 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
126
127 gfs2_meta_header_in(&rg->rg_header, buf);
128 rg->rg_flags = be32_to_cpu(str->rg_flags);
129 rg->rg_free = be32_to_cpu(str->rg_free);
130 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
131 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
132}
133
134void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf)
135{
136 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
137
138 gfs2_meta_header_out(&rg->rg_header, buf);
139 str->rg_flags = cpu_to_be32(rg->rg_flags);
140 str->rg_free = cpu_to_be32(rg->rg_free);
141 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
142 str->__pad = cpu_to_be32(0);
143 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
144 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
145}
146
147void gfs2_quota_in(struct gfs2_quota *qu, char *buf)
148{
149 struct gfs2_quota *str = (struct gfs2_quota *)buf;
150
151 qu->qu_limit = be64_to_cpu(str->qu_limit);
152 qu->qu_warn = be64_to_cpu(str->qu_warn);
153 qu->qu_value = be64_to_cpu(str->qu_value);
154}
155
156void gfs2_dinode_in(struct gfs2_dinode *di, char *buf)
157{
158 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
159
160 gfs2_meta_header_in(&di->di_header, buf);
161 gfs2_inum_in(&di->di_num, (char *)&str->di_num);
162
163 di->di_mode = be32_to_cpu(str->di_mode);
164 di->di_uid = be32_to_cpu(str->di_uid);
165 di->di_gid = be32_to_cpu(str->di_gid);
166 di->di_nlink = be32_to_cpu(str->di_nlink);
167 di->di_size = be64_to_cpu(str->di_size);
168 di->di_blocks = be64_to_cpu(str->di_blocks);
169 di->di_atime = be64_to_cpu(str->di_atime);
170 di->di_mtime = be64_to_cpu(str->di_mtime);
171 di->di_ctime = be64_to_cpu(str->di_ctime);
172 di->di_major = be32_to_cpu(str->di_major);
173 di->di_minor = be32_to_cpu(str->di_minor);
174
175 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
176 di->di_goal_data = be64_to_cpu(str->di_goal_data);
177 di->di_generation = be64_to_cpu(str->di_generation);
178
179 di->di_flags = be32_to_cpu(str->di_flags);
180 di->di_payload_format = be32_to_cpu(str->di_payload_format);
181 di->di_height = be16_to_cpu(str->di_height);
182
183 di->di_depth = be16_to_cpu(str->di_depth);
184 di->di_entries = be32_to_cpu(str->di_entries);
185
186 di->di_eattr = be64_to_cpu(str->di_eattr);
187
188}
189
190void gfs2_dinode_out(struct gfs2_dinode *di, char *buf)
191{
192 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
193
194 gfs2_meta_header_out(&di->di_header, buf);
195 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
196
197 str->di_mode = cpu_to_be32(di->di_mode);
198 str->di_uid = cpu_to_be32(di->di_uid);
199 str->di_gid = cpu_to_be32(di->di_gid);
200 str->di_nlink = cpu_to_be32(di->di_nlink);
201 str->di_size = cpu_to_be64(di->di_size);
202 str->di_blocks = cpu_to_be64(di->di_blocks);
203 str->di_atime = cpu_to_be64(di->di_atime);
204 str->di_mtime = cpu_to_be64(di->di_mtime);
205 str->di_ctime = cpu_to_be64(di->di_ctime);
206 str->di_major = cpu_to_be32(di->di_major);
207 str->di_minor = cpu_to_be32(di->di_minor);
208
209 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
210 str->di_goal_data = cpu_to_be64(di->di_goal_data);
211 str->di_generation = cpu_to_be64(di->di_generation);
212
213 str->di_flags = cpu_to_be32(di->di_flags);
214 str->di_payload_format = cpu_to_be32(di->di_payload_format);
215 str->di_height = cpu_to_be16(di->di_height);
216
217 str->di_depth = cpu_to_be16(di->di_depth);
218 str->di_entries = cpu_to_be32(di->di_entries);
219
220 str->di_eattr = cpu_to_be64(di->di_eattr);
221
222}
223
224void gfs2_dinode_print(struct gfs2_dinode *di)
225{
226 gfs2_meta_header_print(&di->di_header);
227 gfs2_inum_print(&di->di_num);
228
229 pv(di, di_mode, "0%o");
230 pv(di, di_uid, "%u");
231 pv(di, di_gid, "%u");
232 pv(di, di_nlink, "%u");
233 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
234 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
235 printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
236 printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
237 printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
238 pv(di, di_major, "%u");
239 pv(di, di_minor, "%u");
240
241 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
242 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
243
244 pv(di, di_flags, "0x%.8X");
245 pv(di, di_payload_format, "%u");
246 pv(di, di_height, "%u");
247
248 pv(di, di_depth, "%u");
249 pv(di, di_entries, "%u");
250
251 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
252}
253
254void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf)
255{
256 struct gfs2_log_header *str = (struct gfs2_log_header *)buf;
257
258 gfs2_meta_header_in(&lh->lh_header, buf);
259 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
260 lh->lh_flags = be32_to_cpu(str->lh_flags);
261 lh->lh_tail = be32_to_cpu(str->lh_tail);
262 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
263 lh->lh_hash = be32_to_cpu(str->lh_hash);
264}
265
266void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf)
267{
268 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
269
270 ir->ir_start = be64_to_cpu(str->ir_start);
271 ir->ir_length = be64_to_cpu(str->ir_length);
272}
273
274void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf)
275{
276 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
277
278 str->ir_start = cpu_to_be64(ir->ir_start);
279 str->ir_length = cpu_to_be64(ir->ir_length);
280}
281
282void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf)
283{
284 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
285
286 sc->sc_total = be64_to_cpu(str->sc_total);
287 sc->sc_free = be64_to_cpu(str->sc_free);
288 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
289}
290
291void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf)
292{
293 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
294
295 str->sc_total = cpu_to_be64(sc->sc_total);
296 str->sc_free = cpu_to_be64(sc->sc_free);
297 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
298}
299
300void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf)
301{
302 struct gfs2_quota_change *str = (struct gfs2_quota_change *)buf;
303
304 qc->qc_change = be64_to_cpu(str->qc_change);
305 qc->qc_flags = be32_to_cpu(str->qc_flags);
306 qc->qc_id = be32_to_cpu(str->qc_id);
307}
308
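Every on-disk structure gets the same treatment: the _out converter always produces the fixed big-endian layout and _in always consumes it, so metadata written on one host is readable on any other. A small sketch of the round trip for the simplest structure (the values are illustrative):

static void inum_round_trip(void)
{
	char buf[sizeof(struct gfs2_inum)];	/* stands in for block data */
	struct gfs2_inum n = {
		.no_formal_ino = 1,
		.no_addr = 4096,
	};
	struct gfs2_inum m;

	gfs2_inum_out(&n, buf);	/* cpu order -> big-endian on-disk form */
	gfs2_inum_in(&m, buf);	/* big-endian -> cpu order */
	/* m.no_addr == 4096 on both little- and big-endian hosts */
}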
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..48720421c796
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,794 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/pagevec.h>
17#include <linux/mpage.h>
18#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "quota.h"
31#include "trans.h"
32#include "rgrp.h"
33#include "ops_file.h"
34#include "util.h"
35#include "glops.h"
36
37
38static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
39 unsigned int from, unsigned int to)
40{
41 struct buffer_head *head = page_buffers(page);
42 unsigned int bsize = head->b_size;
43 struct buffer_head *bh;
44 unsigned int start, end;
45
46 for (bh = head, start = 0; bh != head || !start;
47 bh = bh->b_this_page, start = end) {
48 end = start + bsize;
49 if (end <= from || start >= to)
50 continue;
51 gfs2_trans_add_bh(ip->i_gl, bh, 0);
52 }
53}
54
55/**
56 * gfs2_get_block - Fills in a buffer head with details about a block
57 * @inode: The inode
58 * @lblock: The block number to look up
59 * @bh_result: The buffer head to return the result in
60 * @create: Non-zero if we may add block to the file
61 *
62 * Returns: errno
63 */
64
65int gfs2_get_block(struct inode *inode, sector_t lblock,
66 struct buffer_head *bh_result, int create)
67{
68 int new = create;
69 uint64_t dblock;
70 int error;
71 int boundary;
72
73 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
74 if (error)
75 return error;
76
77 if (!dblock)
78 return 0;
79
80 map_bh(bh_result, inode->i_sb, dblock);
81 if (new)
82 set_buffer_new(bh_result);
83 if (boundary)
84 set_buffer_boundary(bh_result);
85
86 return 0;
87}
88
89/**
90 * get_block_noalloc - Fills in a buffer head with details about a block
91 * @inode: The inode
92 * @lblock: The block number to look up
93 * @bh_result: The buffer head to return the result in
94 * @create: Non-zero if we may add block to the file
95 *
96 * Returns: errno
97 */
98
99static int get_block_noalloc(struct inode *inode, sector_t lblock,
100 struct buffer_head *bh_result, int create)
101{
102 int new = 0;
103 uint64_t dblock;
104 int error;
105 int boundary;
106
107 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
108 if (error)
109 return error;
110
111 if (dblock)
112 map_bh(bh_result, inode->i_sb, dblock);
113 else if (gfs2_assert_withdraw(GFS2_SB(inode), !create))
114 error = -EIO;
115 if (boundary)
116 set_buffer_boundary(bh_result);
117
118 return error;
119}
120
121/**
122 * gfs2_writepage - Write complete page
123 * @page: Page to write
124 *
125 * Returns: errno
126 *
127 * Some of this is copied from block_write_full_page() although we still
128 * call it to do most of the work.
129 */
130
131static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
132{
133 struct inode *inode = page->mapping->host;
134 struct gfs2_inode *ip = GFS2_I(inode);
135 struct gfs2_sbd *sdp = GFS2_SB(inode);
136 loff_t i_size = i_size_read(inode);
137 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
138 unsigned offset;
139 int error;
140 int done_trans = 0;
141
142 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
143 unlock_page(page);
144 return -EIO;
145 }
146 if (current->journal_info)
147 goto out_ignore;
148
149 /* Is the page fully outside i_size? (truncate in progress) */
150 offset = i_size & (PAGE_CACHE_SIZE-1);
151 if (page->index > end_index || (page->index == end_index && !offset)) {
152 page->mapping->a_ops->invalidatepage(page, 0);
153 unlock_page(page);
154 return 0; /* don't care */
155 }
156
157 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
158 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
159 if (error)
160 goto out_ignore;
161 if (!page_has_buffers(page)) {
162 create_empty_buffers(page, inode->i_sb->s_blocksize,
163 (1 << BH_Dirty)|(1 << BH_Uptodate));
164 }
165 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
166 done_trans = 1;
167 }
168 error = block_write_full_page(page, get_block_noalloc, wbc);
169 if (done_trans)
170 gfs2_trans_end(sdp);
171 gfs2_meta_cache_flush(ip);
172 return error;
173
174out_ignore:
175 redirty_page_for_writepage(wbc, page);
176 unlock_page(page);
177 return 0;
178}
179
180static int zero_readpage(struct page *page)
181{
182 void *kaddr;
183
184 kaddr = kmap_atomic(page, KM_USER0);
185 memset(kaddr, 0, PAGE_CACHE_SIZE);
186	kunmap_atomic(kaddr, KM_USER0);
187
188 SetPageUptodate(page);
189
190 return 0;
191}
192
193/**
194 * stuffed_readpage - Fill in a Linux page with stuffed file data
195 * @ip: the inode
196 * @page: the page
197 *
198 * Returns: errno
199 */
200
201static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
202{
203 struct buffer_head *dibh;
204 void *kaddr;
205 int error;
206
207 /* Only the first page of a stuffed file might contain data */
208 if (unlikely(page->index))
209 return zero_readpage(page);
210
211 error = gfs2_meta_inode_buffer(ip, &dibh);
212 if (error)
213 return error;
214
215 kaddr = kmap_atomic(page, KM_USER0);
216 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
217 ip->i_di.di_size);
218 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
219	kunmap_atomic(kaddr, KM_USER0);
220
221 brelse(dibh);
222
223 SetPageUptodate(page);
224
225 return 0;
226}
227
228
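/*
 * [Editor's note: hypothetical userspace sketch of the copy-and-zero
 * step in stuffed_readpage() above.  PAGE_SZ and HDR_SZ stand in for
 * PAGE_CACHE_SIZE and sizeof(struct gfs2_dinode); the exact header
 * size is an assumption here.]
 */
#include <string.h>

#define PAGE_SZ 4096
#define HDR_SZ  232	/* placeholder for the on-disk dinode header */

static void fill_stuffed_page(char *page, const char *dinode_block,
			      size_t di_size)
{
	/* inline file data lives immediately after the dinode header */
	memcpy(page, dinode_block + HDR_SZ, di_size);
	/* everything past EOF in the page must read back as zeroes */
	memset(page + di_size, 0, PAGE_SZ - di_size);
}

int main(void)
{
	static char page[PAGE_SZ], dinode_block[PAGE_SZ];

	fill_stuffed_page(page, dinode_block, 100);	/* 100-byte file */
	return 0;
}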
229/**
230 * gfs2_readpage - readpage with locking
231 * @file: The file to read a page for. N.B. This may be NULL if we are
232 * reading an internal file.
233 * @page: The page to read
234 *
235 * Returns: errno
236 */
237
238static int gfs2_readpage(struct file *file, struct page *page)
239{
240 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
241 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
242 struct gfs2_holder gh;
243 int error;
244 int do_unlock = 0;
245
246 if (likely(file != &gfs2_internal_file_sentinal)) {
247 if (file) {
248 struct gfs2_file *gf = file->private_data;
249 if (test_bit(GFF_EXLOCK, &gf->f_flags))
250 goto skip_lock;
251 }
252 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
253 do_unlock = 1;
254 error = gfs2_glock_nq_m_atime(1, &gh);
255 if (unlikely(error))
256 goto out_unlock;
257 }
258
259skip_lock:
260 if (gfs2_is_stuffed(ip)) {
261 error = stuffed_readpage(ip, page);
262 unlock_page(page);
263 } else
264 error = mpage_readpage(page, gfs2_get_block);
265
266 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
267 error = -EIO;
268
269	if (do_unlock) {
270 gfs2_glock_dq_m(1, &gh);
271 gfs2_holder_uninit(&gh);
272 }
273out:
274 return error;
275out_unlock:
276 unlock_page(page);
277 if (do_unlock)
278 gfs2_holder_uninit(&gh);
279 goto out;
280}
281
282/**
283 * gfs2_readpages - Read a bunch of pages at once
284 *
285 * Some notes:
286 * 1. This is only for readahead, so we can simply ignore anything
287 * which is slightly inconvenient (such as locking conflicts between
288 * the page lock and the glock) and return having done no I/O. It's
289 * obviously not something we'd want to do on too regular a basis.
290 * Any I/O we ignore at this time will be done via readpage later.
291 * 2. We have to handle stuffed files here too.
292 * 3. mpage_readpages() does most of the heavy lifting in the common case.
293 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
294 * 5. We use LM_FLAG_TRY_1CB here, so that we effectively have lock-ahead
295 * as well as read-ahead.
296 */
297static int gfs2_readpages(struct file *file, struct address_space *mapping,
298 struct list_head *pages, unsigned nr_pages)
299{
300 struct inode *inode = mapping->host;
301 struct gfs2_inode *ip = GFS2_I(inode);
302 struct gfs2_sbd *sdp = GFS2_SB(inode);
303 struct gfs2_holder gh;
304 unsigned page_idx;
305 int ret;
306 int do_unlock = 0;
307
308 if (likely(file != &gfs2_internal_file_sentinal)) {
309 if (file) {
310 struct gfs2_file *gf = file->private_data;
311 if (test_bit(GFF_EXLOCK, &gf->f_flags))
312 goto skip_lock;
313 }
314 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
315 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
316 do_unlock = 1;
317 ret = gfs2_glock_nq_m_atime(1, &gh);
318 if (ret == GLR_TRYFAILED)
319 goto out_noerror;
320 if (unlikely(ret))
321 goto out_unlock;
322 }
323skip_lock:
324 if (gfs2_is_stuffed(ip)) {
325 struct pagevec lru_pvec;
326 pagevec_init(&lru_pvec, 0);
327 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
328 struct page *page = list_entry(pages->prev, struct page, lru);
329 prefetchw(&page->flags);
330 list_del(&page->lru);
331 if (!add_to_page_cache(page, mapping,
332 page->index, GFP_KERNEL)) {
333 ret = stuffed_readpage(ip, page);
334 unlock_page(page);
335 if (!pagevec_add(&lru_pvec, page))
336 __pagevec_lru_add(&lru_pvec);
337 } else {
338 page_cache_release(page);
339 }
340 }
341 pagevec_lru_add(&lru_pvec);
342 ret = 0;
343 } else {
344 /* What we really want to do .... */
345 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
346 }
347
348 if (do_unlock) {
349 gfs2_glock_dq_m(1, &gh);
350 gfs2_holder_uninit(&gh);
351 }
352out:
353 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
354 ret = -EIO;
355 return ret;
356out_noerror:
357 ret = 0;
358out_unlock:
359 /* unlock all pages, we can't do any I/O right now */
360 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
361 struct page *page = list_entry(pages->prev, struct page, lru);
362 list_del(&page->lru);
363 unlock_page(page);
364 page_cache_release(page);
365 }
366 if (do_unlock)
367 gfs2_holder_uninit(&gh);
368 goto out;
369}
370
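/*
 * [Editor's note: illustrative sketch of points 1 and 5 in the comment
 * above -- readahead is best-effort, so a failed try-lock means "skip
 * the I/O now; readpage will do it later".  pthreads is used here only
 * as a stand-in for the glock TRY semantics.]
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_glock = PTHREAD_MUTEX_INITIALIZER;

static void demo_readahead(void)
{
	if (pthread_mutex_trylock(&demo_glock) != 0)
		return;	/* contended: no readahead, and no error either */
	puts("lock-ahead succeeded, doing readahead");
	pthread_mutex_unlock(&demo_glock);
}

int main(void)
{
	demo_readahead();	/* uncontended here, so it prints */
	return 0;
}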
371/**
372 * gfs2_prepare_write - Prepare to write a page to a file
373 * @file: The file to write to
374 * @page: The page which is to be prepared for writing
375 * @from: From (byte range within page)
376 * @to: To (byte range within page)
377 *
378 * Returns: errno
379 */
380
381static int gfs2_prepare_write(struct file *file, struct page *page,
382 unsigned from, unsigned to)
383{
384 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
385 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
386 unsigned int data_blocks, ind_blocks, rblocks;
387 int alloc_required;
388 int error = 0;
389 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
390 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
391 struct gfs2_alloc *al;
392
393 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
394 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
395 if (error)
396 goto out_uninit;
397
398 gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
399
400	error = gfs2_write_alloc_required(ip, pos, to - from, &alloc_required);
401 if (error)
402 goto out_unlock;
403
404
405 if (alloc_required) {
406 al = gfs2_alloc_get(ip);
407
408 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
409 if (error)
410 goto out_alloc_put;
411
412 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
413 if (error)
414 goto out_qunlock;
415
416 al->al_requested = data_blocks + ind_blocks;
417 error = gfs2_inplace_reserve(ip);
418 if (error)
419 goto out_qunlock;
420 }
421
422 rblocks = RES_DINODE + ind_blocks;
423 if (gfs2_is_jdata(ip))
424 rblocks += data_blocks ? data_blocks : 1;
425 if (ind_blocks || data_blocks)
426 rblocks += RES_STATFS + RES_QUOTA;
427
428 error = gfs2_trans_begin(sdp, rblocks, 0);
429 if (error)
430 goto out;
431
432 if (gfs2_is_stuffed(ip)) {
433 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
434 error = gfs2_unstuff_dinode(ip, page);
435 if (error == 0)
436 goto prepare_write;
437 } else if (!PageUptodate(page))
438 error = stuffed_readpage(ip, page);
439 goto out;
440 }
441
442prepare_write:
443 error = block_prepare_write(page, from, to, gfs2_get_block);
444
445out:
446 if (error) {
447 gfs2_trans_end(sdp);
448 if (alloc_required) {
449 gfs2_inplace_release(ip);
450out_qunlock:
451 gfs2_quota_unlock(ip);
452out_alloc_put:
453 gfs2_alloc_put(ip);
454 }
455out_unlock:
456 gfs2_glock_dq_m(1, &ip->i_gh);
457out_uninit:
458 gfs2_holder_uninit(&ip->i_gh);
459 }
460
461 return error;
462}
463
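/*
 * [Editor's note: hypothetical sketch of the journal reservation
 * arithmetic in gfs2_prepare_write() above.  The RES_* values are
 * placeholders, not the real GFS2 constants.]
 */
#include <stdio.h>

#define RES_DINODE 1
#define RES_STATFS 1
#define RES_QUOTA  2

static unsigned int calc_rblocks(unsigned int data_blocks,
				 unsigned int ind_blocks, int jdata)
{
	unsigned int rblocks = RES_DINODE + ind_blocks;

	if (jdata)			/* journalled data costs log blocks */
		rblocks += data_blocks ? data_blocks : 1;
	if (ind_blocks || data_blocks)	/* allocation touches statfs/quota */
		rblocks += RES_STATFS + RES_QUOTA;
	return rblocks;
}

int main(void)
{
	/* 1 dinode + 2 indirect + 4 data + statfs + quota = 10 */
	printf("%u\n", calc_rblocks(4, 2, 1));
	return 0;
}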
464/**
465 * gfs2_commit_write - Commit write to a file
466 * @file: The file to write to
467 * @page: The page containing the data
468 * @from: From (byte range within page)
469 * @to: To (byte range within page)
470 *
471 * Returns: errno
472 */
473
474static int gfs2_commit_write(struct file *file, struct page *page,
475 unsigned from, unsigned to)
476{
477 struct inode *inode = page->mapping->host;
478 struct gfs2_inode *ip = GFS2_I(inode);
479 struct gfs2_sbd *sdp = GFS2_SB(inode);
480 int error = -EOPNOTSUPP;
481 struct buffer_head *dibh;
482	struct gfs2_alloc *al = &ip->i_alloc;
483
484 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
485 goto fail_nounlock;
486
487 error = gfs2_meta_inode_buffer(ip, &dibh);
488 if (error)
489 goto fail_endtrans;
490
491 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
492
493 if (gfs2_is_stuffed(ip)) {
494 uint64_t file_size;
495 void *kaddr;
496
497 file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
498
499 kaddr = kmap_atomic(page, KM_USER0);
500 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
501 (char *)kaddr + from, to - from);
502	kunmap_atomic(kaddr, KM_USER0);
503
504 SetPageUptodate(page);
505
506 if (inode->i_size < file_size)
507 i_size_write(inode, file_size);
508 } else {
509 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
510 gfs2_is_jdata(ip))
511 gfs2_page_add_databufs(ip, page, from, to);
512 error = generic_commit_write(file, page, from, to);
513 if (error)
514 goto fail;
515 }
516
517 if (ip->i_di.di_size < inode->i_size)
518 ip->i_di.di_size = inode->i_size;
519
520 gfs2_dinode_out(&ip->i_di, dibh->b_data);
521 brelse(dibh);
522 gfs2_trans_end(sdp);
523 if (al->al_requested) {
524 gfs2_inplace_release(ip);
525 gfs2_quota_unlock(ip);
526 gfs2_alloc_put(ip);
527 }
528 gfs2_glock_dq_m(1, &ip->i_gh);
529 gfs2_holder_uninit(&ip->i_gh);
530 return 0;
531
532fail:
533 brelse(dibh);
534fail_endtrans:
535 gfs2_trans_end(sdp);
536 if (al->al_requested) {
537 gfs2_inplace_release(ip);
538 gfs2_quota_unlock(ip);
539 gfs2_alloc_put(ip);
540 }
541 gfs2_glock_dq_m(1, &ip->i_gh);
542 gfs2_holder_uninit(&ip->i_gh);
543fail_nounlock:
544 ClearPageUptodate(page);
545 return error;
546}
547
548/**
549 * gfs2_bmap - Block map function
550 * @mapping: Address space info
551 * @lblock: The block to map
552 *
553 * Returns: The disk address for the block or 0 on hole or error
554 */
555
556static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
557{
558 struct gfs2_inode *ip = GFS2_I(mapping->host);
559 struct gfs2_holder i_gh;
560 sector_t dblock = 0;
561 int error;
562
563 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
564 if (error)
565 return 0;
566
567 if (!gfs2_is_stuffed(ip))
568 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
569
570 gfs2_glock_dq_uninit(&i_gh);
571
572 return dblock;
573}
574
575static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
576{
577 struct gfs2_bufdata *bd;
578
579 gfs2_log_lock(sdp);
580 bd = bh->b_private;
581 if (bd) {
582 bd->bd_bh = NULL;
583 bh->b_private = NULL;
584 }
585 gfs2_log_unlock(sdp);
586
587 lock_buffer(bh);
588 clear_buffer_dirty(bh);
589 bh->b_bdev = NULL;
590 clear_buffer_mapped(bh);
591 clear_buffer_req(bh);
592 clear_buffer_new(bh);
593 clear_buffer_delay(bh);
594 unlock_buffer(bh);
595}
596
597static void gfs2_invalidatepage(struct page *page, unsigned long offset)
598{
599 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
600 struct buffer_head *head, *bh, *next;
601 unsigned int curr_off = 0;
602
603 BUG_ON(!PageLocked(page));
604 if (!page_has_buffers(page))
605 return;
606
607 bh = head = page_buffers(page);
608 do {
609 unsigned int next_off = curr_off + bh->b_size;
610 next = bh->b_this_page;
611
612 if (offset <= curr_off)
613 discard_buffer(sdp, bh);
614
615 curr_off = next_off;
616 bh = next;
617 } while (bh != head);
618
619 if (!offset)
620 try_to_release_page(page, 0);
621
622 return;
623}
624
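/*
 * [Editor's note: illustrative sketch of the offset test in
 * gfs2_invalidatepage() above: a buffer is discarded only when its
 * start offset lies at or beyond the truncation point.  1k buffers in
 * a 4k page are assumed for the demo.]
 */
#include <stdio.h>

int main(void)
{
	unsigned int bsize = 1024, offset = 2048, curr_off = 0, i;

	for (i = 0; i < 4; i++, curr_off += bsize)
		printf("buffer %u [%u..%u]: %s\n", i, curr_off,
		       curr_off + bsize - 1,
		       offset <= curr_off ? "discard" : "keep");
	/* buffers 0 and 1 are kept, buffers 2 and 3 are discarded */
	return 0;
}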
625static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
626 const struct iovec *iov, loff_t offset,
627 unsigned long nr_segs)
628{
629 struct file *file = iocb->ki_filp;
630 struct inode *inode = file->f_mapping->host;
631 struct gfs2_inode *ip = GFS2_I(inode);
632 struct gfs2_holder gh;
633 int rv;
634
635 if (rw == READ)
636 mutex_lock(&inode->i_mutex);
637 /*
638 * Shared lock, even if it's a write, since we do no allocation
639 * on this path. All we need to change is atime.
640 */
641 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
642 rv = gfs2_glock_nq_m_atime(1, &gh);
643 if (rv)
644 goto out;
645
646 if (offset > i_size_read(inode))
647 goto out;
648
649 /*
650 * Should we return an error here? I can't see that O_DIRECT for
651 * a journaled file makes any sense. For now we'll silently fall
652 * back to buffered I/O; likewise for stuffed files, since they
653 * are (a) small and (b) unaligned.
654 */
655 if (gfs2_is_jdata(ip))
656 goto out;
657
658 if (gfs2_is_stuffed(ip))
659 goto out;
660
661 rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
662 inode->i_sb->s_bdev,
663 iov, offset, nr_segs,
664 gfs2_get_block, NULL);
665out:
666 gfs2_glock_dq_m(1, &gh);
667 gfs2_holder_uninit(&gh);
668 if (rw == READ)
669 mutex_unlock(&inode->i_mutex);
670
671 return rv;
672}
673
674/**
675 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
676 * @bh: the buffer we're stuck on
677 *
678 */
679
680static void stuck_releasepage(struct buffer_head *bh)
681{
682 struct inode *inode = bh->b_page->mapping->host;
683 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
684 struct gfs2_bufdata *bd = bh->b_private;
685 struct gfs2_glock *gl;
686	static unsigned limit = 0;
687
688 if (limit++ > 3)
689 return;
690
691 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
692 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
693 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
694 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
695 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
696
697 if (!bd)
698 return;
699
700 gl = bd->bd_gl;
701
702 fs_warn(sdp, "gl = (%u, %llu)\n",
703 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
704
705 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
706 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
707 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
708
709 if (gl->gl_ops == &gfs2_inode_glops) {
710 struct gfs2_inode *ip = gl->gl_object;
711 unsigned int x;
712
713 if (!ip)
714 return;
715
716 fs_warn(sdp, "ip = %llu %llu\n",
717 (unsigned long long)ip->i_num.no_formal_ino,
718 (unsigned long long)ip->i_num.no_addr);
719
720 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
721 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
722 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
723 }
724}
725
726/**
727 * gfs2_releasepage - free the metadata associated with a page
728 * @page: the page that's being released
729 * @gfp_mask: passed from Linux VFS, ignored by us
730 *
731 * Call try_to_free_buffers() if the buffers in this page can be
732 * released.
733 *
734 * Returns: 0 if the buffers are still in use, else the result of try_to_free_buffers()
735 */
736
737int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
738{
739 struct inode *aspace = page->mapping->host;
740 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
741 struct buffer_head *bh, *head;
742 struct gfs2_bufdata *bd;
743 unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
744
745 if (!page_has_buffers(page))
746 goto out;
747
748 head = bh = page_buffers(page);
749 do {
750 while (atomic_read(&bh->b_count)) {
751 if (!atomic_read(&aspace->i_writecount))
752 return 0;
753
754 if (time_after_eq(jiffies, t)) {
755 stuck_releasepage(bh);
756 /* should we withdraw here? */
757 return 0;
758 }
759
760 yield();
761 }
762
763 gfs2_assert_warn(sdp, !buffer_pinned(bh));
764
765 bd = bh->b_private;
766 if (bd) {
767 gfs2_assert_warn(sdp, bd->bd_bh == bh);
768 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
769 gfs2_assert_warn(sdp, list_empty(&bd->bd_le.le_list));
770 gfs2_assert_warn(sdp, !bd->bd_ail);
771 kmem_cache_free(gfs2_bufdata_cachep, bd);
772 bh->b_private = NULL;
773 }
774
775 bh = bh->b_this_page;
776 } while (bh != head);
777
778out:
779 return try_to_free_buffers(page);
780}
781
782const struct address_space_operations gfs2_file_aops = {
783 .writepage = gfs2_writepage,
784 .readpage = gfs2_readpage,
785 .readpages = gfs2_readpages,
786 .sync_page = block_sync_page,
787 .prepare_write = gfs2_prepare_write,
788 .commit_write = gfs2_commit_write,
789 .bmap = gfs2_bmap,
790 .invalidatepage = gfs2_invalidatepage,
791 .releasepage = gfs2_releasepage,
792 .direct_IO = gfs2_direct_IO,
793};
794
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..dfc3dda6de11
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13extern const struct address_space_operations gfs2_file_aops;
14extern int gfs2_get_block(struct inode *inode, sector_t lblock,
15 struct buffer_head *bh_result, int create);
16extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
17
18#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..fd55979ec428
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "ops_dentry.h"
25#include "util.h"
26
27/**
28 * gfs2_drevalidate - Check directory lookup consistency
29 * @dentry: the mapping to check
30 * @nd:
31 *
32 * Check to make sure the lookup necessary to arrive at this inode from its
33 * parent is still good.
34 *
35 * Returns: 1 if the dentry is ok, 0 if it isn't
36 */
37
38static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
39{
40 struct dentry *parent = dget_parent(dentry);
41 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
42 struct gfs2_inode *dip = GFS2_I(parent->d_inode);
43 struct inode *inode = dentry->d_inode;
44 struct gfs2_holder d_gh;
45 struct gfs2_inode *ip;
46 struct gfs2_inum inum;
47 unsigned int type;
48 int error;
49
50 if (inode && is_bad_inode(inode))
51 goto invalid;
52
53 if (sdp->sd_args.ar_localcaching)
54 goto valid;
55
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
57 if (error)
58 goto fail;
59
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) {
62 case 0:
63 if (!inode)
64 goto invalid_gunlock;
65 break;
66 case -ENOENT:
67 if (!inode)
68 goto valid_gunlock;
69 goto invalid_gunlock;
70 default:
71 goto fail_gunlock;
72 }
73
74 ip = GFS2_I(inode);
75
76 if (!gfs2_inum_equal(&ip->i_num, &inum))
77 goto invalid_gunlock;
78
79 if (IF2DT(ip->i_di.di_mode) != type) {
80 gfs2_consist_inode(dip);
81 goto fail_gunlock;
82 }
83
84 valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh);
86
87 valid:
88 dput(parent);
89 return 1;
90
91 invalid_gunlock:
92 gfs2_glock_dq_uninit(&d_gh);
93
94 invalid:
95 if (inode && S_ISDIR(inode->i_mode)) {
96 if (have_submounts(dentry))
97 goto valid;
98 shrink_dcache_parent(dentry);
99 }
100 d_drop(dentry);
101
102 dput(parent);
103 return 0;
104
105 fail_gunlock:
106 gfs2_glock_dq_uninit(&d_gh);
107
108 fail:
109 dput(parent);
110 return 0;
111}
112
113static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
114{
115 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0;
117}
118
119struct dentry_operations gfs2_dops = {
120 .d_revalidate = gfs2_drevalidate,
121 .d_hash = gfs2_dhash,
122};
123
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..1b6e75c0a4a7
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13extern struct dentry_operations gfs2_dops;
14
15#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..6354f4799e68
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,293 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "dir.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "ops_export.h"
26#include "rgrp.h"
27#include "util.h"
28
29static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *fh,
31 int fh_len,
32 int fh_type,
33 int (*acceptable)(void *context,
34 struct dentry *dentry),
35 void *context)
36{
37 struct gfs2_fh_obj fh_obj;
38 struct gfs2_inum *this, parent;
39
40 if (fh_type != fh_len)
41 return NULL;
42
43 this = &fh_obj.this;
44 fh_obj.imode = DT_UNKNOWN;
45 memset(&parent, 0, sizeof(struct gfs2_inum));
46
47 switch (fh_type) {
48 case 10:
49 parent.no_formal_ino = ((uint64_t)be32_to_cpu(fh[4])) << 32;
50 parent.no_formal_ino |= be32_to_cpu(fh[5]);
51 parent.no_addr = ((uint64_t)be32_to_cpu(fh[6])) << 32;
52 parent.no_addr |= be32_to_cpu(fh[7]);
53		fh_obj.imode = be32_to_cpu(fh[8]); /* fall through */
54 case 4:
55 this->no_formal_ino = ((uint64_t)be32_to_cpu(fh[0])) << 32;
56 this->no_formal_ino |= be32_to_cpu(fh[1]);
57 this->no_addr = ((uint64_t)be32_to_cpu(fh[2])) << 32;
58 this->no_addr |= be32_to_cpu(fh[3]);
59 break;
60 default:
61 return NULL;
62 }
63
64 return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
65 acceptable, context);
66}
67
68static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
69 int connectable)
70{
71 struct inode *inode = dentry->d_inode;
72 struct super_block *sb = inode->i_sb;
73 struct gfs2_inode *ip = GFS2_I(inode);
74
75 if (*len < 4 || (connectable && *len < 10))
76 return 255;
77
78 fh[0] = ip->i_num.no_formal_ino >> 32;
79 fh[0] = cpu_to_be32(fh[0]);
80 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
81 fh[1] = cpu_to_be32(fh[1]);
82 fh[2] = ip->i_num.no_addr >> 32;
83 fh[2] = cpu_to_be32(fh[2]);
84 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
85 fh[3] = cpu_to_be32(fh[3]);
86 *len = 4;
87
88 if (!connectable || inode == sb->s_root->d_inode)
89 return *len;
90
91 spin_lock(&dentry->d_lock);
92 inode = dentry->d_parent->d_inode;
93 ip = GFS2_I(inode);
94 igrab(inode);
95 spin_unlock(&dentry->d_lock);
96
97 fh[4] = ip->i_num.no_formal_ino >> 32;
98 fh[4] = cpu_to_be32(fh[4]);
99 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
100 fh[5] = cpu_to_be32(fh[5]);
101 fh[6] = ip->i_num.no_addr >> 32;
102 fh[6] = cpu_to_be32(fh[6]);
103 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
104 fh[7] = cpu_to_be32(fh[7]);
105
106 fh[8] = cpu_to_be32(inode->i_mode);
107 fh[9] = 0; /* pad to double word */
108 *len = 10;
109
110 iput(inode);
111
112 return *len;
113}
114
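/*
 * [Editor's note: minimal round-trip of the big-endian split used by
 * gfs2_encode_fh()/gfs2_decode_fh() above.  htonl()/ntohl() from
 * <arpa/inet.h> stand in for the kernel's cpu_to_be32()/be32_to_cpu().]
 */
#include <stdint.h>
#include <assert.h>
#include <arpa/inet.h>

static void pack64(uint32_t fh[2], uint64_t v)
{
	fh[0] = htonl((uint32_t)(v >> 32));
	fh[1] = htonl((uint32_t)(v & 0xFFFFFFFF));
}

static uint64_t unpack64(const uint32_t fh[2])
{
	return ((uint64_t)ntohl(fh[0]) << 32) | ntohl(fh[1]);
}

int main(void)
{
	uint32_t fh[2];

	pack64(fh, 0x1122334455667788ULL);
	assert(unpack64(fh) == 0x1122334455667788ULL);
	return 0;
}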
115struct get_name_filldir {
116 struct gfs2_inum inum;
117 char *name;
118};
119
120static int get_name_filldir(void *opaque, const char *name, unsigned int length,
121 uint64_t offset, struct gfs2_inum *inum,
122 unsigned int type)
123{
124 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
125
126 if (!gfs2_inum_equal(inum, &gnfd->inum))
127 return 0;
128
129 memcpy(gnfd->name, name, length);
130 gnfd->name[length] = 0;
131
132 return 1;
133}
134
135static int gfs2_get_name(struct dentry *parent, char *name,
136 struct dentry *child)
137{
138 struct inode *dir = parent->d_inode;
139 struct inode *inode = child->d_inode;
140 struct gfs2_inode *dip, *ip;
141 struct get_name_filldir gnfd;
142 struct gfs2_holder gh;
143 uint64_t offset = 0;
144 int error;
145
146 if (!dir)
147 return -EINVAL;
148
149 if (!S_ISDIR(dir->i_mode) || !inode)
150 return -EINVAL;
151
152 dip = GFS2_I(dir);
153 ip = GFS2_I(inode);
154
155 *name = 0;
156 gnfd.inum = ip->i_num;
157 gnfd.name = name;
158
159 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
160 if (error)
161 return error;
162
163 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
164
165 gfs2_glock_dq_uninit(&gh);
166
167 if (!error && !*name)
168 error = -ENOENT;
169
170 return error;
171}
172
173static struct dentry *gfs2_get_parent(struct dentry *child)
174{
175 struct qstr dotdot;
176 struct inode *inode;
177 struct dentry *dentry;
178
179 gfs2_str2qstr(&dotdot, "..");
180 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
181
182 if (!inode)
183 return ERR_PTR(-ENOENT);
184 if (IS_ERR(inode))
185 return ERR_PTR(PTR_ERR(inode));
186
187 dentry = d_alloc_anon(inode);
188 if (!dentry) {
189 iput(inode);
190 return ERR_PTR(-ENOMEM);
191 }
192
193 return dentry;
194}
195
196static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
197{
198 struct gfs2_sbd *sdp = sb->s_fs_info;
199 struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
200 struct gfs2_inum *inum = &fh_obj->this;
201 struct gfs2_holder i_gh, ri_gh, rgd_gh;
202 struct gfs2_rgrpd *rgd;
203 struct inode *inode;
204 struct dentry *dentry;
205 int error;
206
207 /* System files? */
208
209 inode = gfs2_ilookup(sb, inum);
210 if (inode) {
211 if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
212 iput(inode);
213 return ERR_PTR(-ESTALE);
214 }
215 goto out_inode;
216 }
217
218 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
219 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
220 &i_gh);
221 if (error)
222 return ERR_PTR(error);
223
224 error = gfs2_rindex_hold(sdp, &ri_gh);
225 if (error)
226 goto fail;
227
228 error = -EINVAL;
229 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
230 if (!rgd)
231 goto fail_rindex;
232
233 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
234 if (error)
235 goto fail_rindex;
236
237 error = -ESTALE;
238 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
239 goto fail_rgd;
240
241 gfs2_glock_dq_uninit(&rgd_gh);
242 gfs2_glock_dq_uninit(&ri_gh);
243
244 inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
245 if (!inode)
246 goto fail;
247 if (IS_ERR(inode)) {
248 error = PTR_ERR(inode);
249 goto fail;
250 }
251
252 error = gfs2_inode_refresh(GFS2_I(inode));
253 if (error) {
254 iput(inode);
255 goto fail;
256 }
257
258 error = -EIO;
259 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
260 iput(inode);
261 goto fail;
262 }
263
264 gfs2_glock_dq_uninit(&i_gh);
265
266out_inode:
267 dentry = d_alloc_anon(inode);
268 if (!dentry) {
269 iput(inode);
270 return ERR_PTR(-ENOMEM);
271 }
272
273 return dentry;
274
275fail_rgd:
276 gfs2_glock_dq_uninit(&rgd_gh);
277
278fail_rindex:
279 gfs2_glock_dq_uninit(&ri_gh);
280
281fail:
282 gfs2_glock_dq_uninit(&i_gh);
283 return ERR_PTR(error);
284}
285
286struct export_operations gfs2_export_ops = {
287 .decode_fh = gfs2_decode_fh,
288 .encode_fh = gfs2_encode_fh,
289 .get_name = gfs2_get_name,
290 .get_parent = gfs2_get_parent,
291 .get_dentry = gfs2_get_dentry,
292};
293
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09fc077657d1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13extern struct export_operations gfs2_export_ops;
14struct gfs2_fh_obj {
15 struct gfs2_inum this;
16 __u32 imode;
17};
18
19#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..145a29fa4ea4
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,812 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/iflags.h>
25#include <asm/uaccess.h>
26
27#include "gfs2.h"
28#include "lm_interface.h"
29#include "incore.h"
30#include "bmap.h"
31#include "dir.h"
32#include "glock.h"
33#include "glops.h"
34#include "inode.h"
35#include "lm.h"
36#include "log.h"
37#include "meta_io.h"
38#include "ops_file.h"
39#include "ops_vm.h"
40#include "quota.h"
41#include "rgrp.h"
42#include "trans.h"
43#include "util.h"
44#include "eaops.h"
45
46/* "bad" is for NFS support */
47struct filldir_bad_entry {
48 char *fbe_name;
49 unsigned int fbe_length;
50 uint64_t fbe_offset;
51 struct gfs2_inum fbe_inum;
52 unsigned int fbe_type;
53};
54
55struct filldir_bad {
56 struct gfs2_sbd *fdb_sbd;
57
58 struct filldir_bad_entry *fdb_entry;
59 unsigned int fdb_entry_num;
60 unsigned int fdb_entry_off;
61
62 char *fdb_name;
63 unsigned int fdb_name_size;
64 unsigned int fdb_name_off;
65};
66
67/* For regular, non-NFS */
68struct filldir_reg {
69 struct gfs2_sbd *fdr_sbd;
70 int fdr_prefetch;
71
72 filldir_t fdr_filldir;
73 void *fdr_opaque;
74};
75
76/*
77 * Most fields left uninitialised to catch anybody who tries to
78 * use them. f_flags set to prevent file_accessed() from touching
79 * any other part of this. Its use is purely as a flag so that we
80 * know (in readpage()) whether or not to do locking.
81 */
82struct file gfs2_internal_file_sentinal = {
83 .f_flags = O_NOATIME|O_RDONLY,
84};
85
86static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
87 unsigned long offset, unsigned long size)
88{
89 char *kaddr;
90 unsigned long count = desc->count;
91
92 if (size > count)
93 size = count;
94
95 kaddr = kmap(page);
96 memcpy(desc->arg.buf, kaddr + offset, size);
97 kunmap(page);
98
99 desc->count = count - size;
100 desc->written += size;
101 desc->arg.buf += size;
102 return size;
103}
104
105int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
106 char *buf, loff_t *pos, unsigned size)
107{
108 struct inode *inode = &ip->i_inode;
109 read_descriptor_t desc;
110 desc.written = 0;
111 desc.arg.buf = buf;
112 desc.count = size;
113 desc.error = 0;
114 do_generic_mapping_read(inode->i_mapping, ra_state,
115 &gfs2_internal_file_sentinal, pos, &desc,
116 gfs2_read_actor);
117 return desc.written ? desc.written : desc.error;
118}
119
120/**
121 * gfs2_llseek - seek to a location in a file
122 * @file: the file
123 * @offset: the offset
124 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
125 *
126 * SEEK_END requires the glock for the file because it references the
127 * file's size.
128 *
129 * Returns: The new offset, or errno
130 */
131
132static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
133{
134 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
135 struct gfs2_holder i_gh;
136 loff_t error;
137
138	if (origin == 2) { /* SEEK_END */
139 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
140 &i_gh);
141 if (!error) {
142 error = remote_llseek(file, offset, origin);
143 gfs2_glock_dq_uninit(&i_gh);
144 }
145 } else
146 error = remote_llseek(file, offset, origin);
147
148 return error;
149}
150
151/**
152 * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read()
153 * @opaque: opaque data used by the function
154 * @name: the name of the directory entry
155 * @length: the length of the name
156 * @offset: the entry's offset in the directory
157 * @inum: the inode number the entry points to
158 * @type: the type of inode the entry points to
159 *
160 * Returns: 0 on success, 1 if buffer full
161 */
162
163static int filldir_reg_func(void *opaque, const char *name, unsigned int length,
164 uint64_t offset, struct gfs2_inum *inum,
165 unsigned int type)
166{
167 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
168 struct gfs2_sbd *sdp = fdr->fdr_sbd;
169 int error;
170
171 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
172 inum->no_addr, type);
173 if (error)
174 return 1;
175
176 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
177 gfs2_glock_prefetch_num(sdp,
178 inum->no_addr, &gfs2_inode_glops,
179 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
180 gfs2_glock_prefetch_num(sdp,
181 inum->no_addr, &gfs2_iopen_glops,
182 LM_ST_SHARED, LM_FLAG_TRY);
183 }
184
185 return 0;
186}
187
188/**
189 * readdir_reg - Read directory entries from a directory
190 * @file: The directory to read from
191 * @dirent: Buffer for dirents
192 * @filldir: Function used to do the copying
193 *
194 * Returns: errno
195 */
196
197static int readdir_reg(struct file *file, void *dirent, filldir_t filldir)
198{
199 struct inode *dir = file->f_mapping->host;
200 struct gfs2_inode *dip = GFS2_I(dir);
201 struct filldir_reg fdr;
202 struct gfs2_holder d_gh;
203 uint64_t offset = file->f_pos;
204 int error;
205
206 fdr.fdr_sbd = GFS2_SB(dir);
207 fdr.fdr_prefetch = 1;
208 fdr.fdr_filldir = filldir;
209 fdr.fdr_opaque = dirent;
210
211 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
212 error = gfs2_glock_nq_atime(&d_gh);
213 if (error) {
214 gfs2_holder_uninit(&d_gh);
215 return error;
216 }
217
218 error = gfs2_dir_read(dir, &offset, &fdr, filldir_reg_func);
219
220 gfs2_glock_dq_uninit(&d_gh);
221
222 file->f_pos = offset;
223
224 return error;
225}
226
227/**
228 * filldir_bad_func - Report a directory entry to the caller of gfs2_dir_read()
229 * @opaque: opaque data used by the function
230 * @name: the name of the directory entry
231 * @length: the length of the name
232 * @offset: the entry's offset in the directory
233 * @inum: the inode number the entry points to
234 * @type: the type of inode the entry points to
235 *
236 * For supporting NFS.
237 *
238 * Returns: 0 on success, 1 if buffer full
239 */
240
241static int filldir_bad_func(void *opaque, const char *name, unsigned int length,
242 uint64_t offset, struct gfs2_inum *inum,
243 unsigned int type)
244{
245 struct filldir_bad *fdb = (struct filldir_bad *)opaque;
246 struct gfs2_sbd *sdp = fdb->fdb_sbd;
247 struct filldir_bad_entry *fbe;
248
249 if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
250 fdb->fdb_name_off + length > fdb->fdb_name_size)
251 return 1;
252
253 fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
254 fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
255 memcpy(fbe->fbe_name, name, length);
256 fbe->fbe_length = length;
257 fbe->fbe_offset = offset;
258 fbe->fbe_inum = *inum;
259 fbe->fbe_type = type;
260
261 fdb->fdb_entry_off++;
262 fdb->fdb_name_off += length;
263
264 if (!(length == 1 && *name == '.')) {
265 gfs2_glock_prefetch_num(sdp,
266 inum->no_addr, &gfs2_inode_glops,
267 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
268 gfs2_glock_prefetch_num(sdp,
269 inum->no_addr, &gfs2_iopen_glops,
270 LM_ST_SHARED, LM_FLAG_TRY);
271 }
272
273 return 0;
274}
275
276/**
277 * readdir_bad - Read directory entries from a directory
278 * @file: The directory to read from
279 * @dirent: Buffer for dirents
280 * @filldir: Function used to do the copying
281 *
282 * For supporting NFS.
283 *
284 * Returns: errno
285 */
286
287static int readdir_bad(struct file *file, void *dirent, filldir_t filldir)
288{
289 struct inode *dir = file->f_mapping->host;
290 struct gfs2_inode *dip = GFS2_I(dir);
291 struct gfs2_sbd *sdp = GFS2_SB(dir);
292 struct filldir_reg fdr;
293 unsigned int entries, size;
294 struct filldir_bad *fdb;
295 struct gfs2_holder d_gh;
296 uint64_t offset = file->f_pos;
297 unsigned int x;
298 struct filldir_bad_entry *fbe;
299 int error;
300
301 entries = gfs2_tune_get(sdp, gt_entries_per_readdir);
302 size = sizeof(struct filldir_bad) +
303 entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE);
304
305 fdb = kzalloc(size, GFP_KERNEL);
306 if (!fdb)
307 return -ENOMEM;
308
309 fdb->fdb_sbd = sdp;
310 fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
311 fdb->fdb_entry_num = entries;
312 fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
313 entries * sizeof(struct filldir_bad_entry);
314 fdb->fdb_name_size = entries * GFS2_FAST_NAME_SIZE;
315
316 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
317 error = gfs2_glock_nq_atime(&d_gh);
318 if (error) {
319 gfs2_holder_uninit(&d_gh);
320 goto out;
321 }
322
323 error = gfs2_dir_read(dir, &offset, fdb, filldir_bad_func);
324
325 gfs2_glock_dq_uninit(&d_gh);
326
327 fdr.fdr_sbd = sdp;
328 fdr.fdr_prefetch = 0;
329 fdr.fdr_filldir = filldir;
330 fdr.fdr_opaque = dirent;
331
332 for (x = 0; x < fdb->fdb_entry_off; x++) {
333 fbe = &fdb->fdb_entry[x];
334
335 error = filldir_reg_func(&fdr,
336 fbe->fbe_name, fbe->fbe_length,
337 fbe->fbe_offset,
338 &fbe->fbe_inum, fbe->fbe_type);
339 if (error) {
340 file->f_pos = fbe->fbe_offset;
341 error = 0;
342 goto out;
343 }
344 }
345
346 file->f_pos = offset;
347
348 out:
349 kfree(fdb);
350
351 return error;
352}
353
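/*
 * [Editor's note: hypothetical sketch of the single-allocation layout
 * readdir_bad() builds above -- one buffer carved into header, entry
 * array and name pool.  NAMELEN stands in for GFS2_FAST_NAME_SIZE.]
 */
#include <stdlib.h>

#define NAMELEN 32

struct bad_entry { char *name; unsigned int len; };

struct bad {
	unsigned int num;
	struct bad_entry *entry;
	char *names;
};

static struct bad *bad_alloc(unsigned int entries)
{
	size_t size = sizeof(struct bad) +
		      entries * (sizeof(struct bad_entry) + NAMELEN);
	struct bad *b = calloc(1, size);

	if (!b)
		return NULL;
	b->num = entries;
	b->entry = (struct bad_entry *)(b + 1);		/* entry array */
	b->names = (char *)(b->entry + entries);	/* name pool */
	return b;
}

int main(void)
{
	struct bad *b = bad_alloc(8);	/* one kzalloc-style allocation */

	free(b);
	return 0;
}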
354/**
355 * gfs2_readdir - Read directory entries from a directory
356 * @file: The directory to read from
357 * @dirent: Buffer for dirents
358 * @filldir: Function used to do the copying
359 *
360 * Returns: errno
361 */
362
363static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
364{
365 int error;
366
367 if (strcmp(current->comm, "nfsd") != 0)
368 error = readdir_reg(file, dirent, filldir);
369 else
370 error = readdir_bad(file, dirent, filldir);
371
372 return error;
373}
374
375static const u32 iflags_to_gfs2[32] = {
376 [iflag_Sync] = GFS2_DIF_SYNC,
377 [iflag_Immutable] = GFS2_DIF_IMMUTABLE,
378 [iflag_Append] = GFS2_DIF_APPENDONLY,
379 [iflag_NoAtime] = GFS2_DIF_NOATIME,
380 [iflag_Index] = GFS2_DIF_EXHASH,
381 [iflag_JournalData] = GFS2_DIF_JDATA,
382 [iflag_DirectIO] = GFS2_DIF_DIRECTIO,
383};
384
385static const u32 gfs2_to_iflags[32] = {
386 [gfs2fl_Sync] = IFLAG_SYNC,
387 [gfs2fl_Immutable] = IFLAG_IMMUTABLE,
388 [gfs2fl_AppendOnly] = IFLAG_APPEND,
389 [gfs2fl_NoAtime] = IFLAG_NOATIME,
390 [gfs2fl_ExHash] = IFLAG_INDEX,
391 [gfs2fl_Jdata] = IFLAG_JOURNAL_DATA,
392 [gfs2fl_Directio] = IFLAG_DIRECTIO,
393 [gfs2fl_InheritDirectio] = IFLAG_DIRECTIO,
394 [gfs2fl_InheritJdata] = IFLAG_JOURNAL_DATA,
395};
396
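/*
 * [Editor's note: iflags_cvt(), used by gfs2_get_flags() below, is not
 * defined in this diff; a table-driven converter of the following
 * shape would match the two 32-entry tables above -- for every set bit
 * i in the input, OR in table[i].]
 */
#include <stdint.h>

static uint32_t demo_iflags_cvt(const uint32_t table[32], uint32_t val)
{
	uint32_t res = 0;
	int i;

	for (i = 0; i < 32; i++)
		if (val & (1u << i))
			res |= table[i];
	return res;
}

int main(void)
{
	const uint32_t table[32] = { [0] = 0x100, [3] = 0x200 };

	/* bits 0 and 3 set -> 0x100 | 0x200 */
	return demo_iflags_cvt(table, 0x9) == 0x300 ? 0 : 1;
}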
397static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
398{
399 struct inode *inode = filp->f_dentry->d_inode;
400 struct gfs2_inode *ip = GFS2_I(inode);
401 struct gfs2_holder gh;
402 int error;
403 u32 iflags;
404
405 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
406 error = gfs2_glock_nq_m_atime(1, &gh);
407 if (error)
408 return error;
409
410 iflags = iflags_cvt(gfs2_to_iflags, ip->i_di.di_flags);
411 if (put_user(iflags, ptr))
412 error = -EFAULT;
413
414 gfs2_glock_dq_m(1, &gh);
415 gfs2_holder_uninit(&gh);
416 return error;
417}
418
419/* Flags that can be set by user space */
420#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
421 GFS2_DIF_DIRECTIO| \
422 GFS2_DIF_IMMUTABLE| \
423 GFS2_DIF_APPENDONLY| \
424 GFS2_DIF_NOATIME| \
425 GFS2_DIF_SYNC| \
426 GFS2_DIF_SYSTEM| \
427 GFS2_DIF_INHERIT_DIRECTIO| \
428 GFS2_DIF_INHERIT_JDATA)
429
430/**
431 * do_gfs2_set_flags - set flags on an inode
432 * @filp: file pointer
433 * @reqflags: The flags to set
434 * @mask: Indicates which flags are valid
435 *
436 */
437static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
438{
439 struct inode *inode = filp->f_dentry->d_inode;
440 struct gfs2_inode *ip = GFS2_I(inode);
441 struct gfs2_sbd *sdp = GFS2_SB(inode);
442 struct buffer_head *bh;
443 struct gfs2_holder gh;
444 int error;
445 u32 new_flags, flags;
446
447 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
448 if (error)
449 return error;
450
451 flags = ip->i_di.di_flags;
452 new_flags = (flags & ~mask) | (reqflags & mask);
453 if ((new_flags ^ flags) == 0)
454 goto out;
455
456 if (S_ISDIR(inode->i_mode)) {
457 if ((new_flags ^ flags) & GFS2_DIF_JDATA)
458 new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
459 if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
460 new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
461 }
462
463 error = -EINVAL;
464 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
465 goto out;
466
467 error = -EPERM;
468 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
469 goto out;
470 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
471 goto out;
472 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
473 !capable(CAP_LINUX_IMMUTABLE))
474 goto out;
475 if (!IS_IMMUTABLE(inode)) {
476 error = permission(inode, MAY_WRITE, NULL);
477 if (error)
478 goto out;
479 }
480
481 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
482 if (error)
483 goto out;
484 error = gfs2_meta_inode_buffer(ip, &bh);
485 if (error)
486 goto out_trans_end;
487 gfs2_trans_add_bh(ip->i_gl, bh, 1);
488 ip->i_di.di_flags = new_flags;
489 gfs2_dinode_out(&ip->i_di, bh->b_data);
490 brelse(bh);
491out_trans_end:
492 gfs2_trans_end(sdp);
493out:
494 gfs2_glock_dq_uninit(&gh);
495 return error;
496}
497
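/*
 * [Editor's note: one-line demonstration of the masked read-modify-
 * write used in do_gfs2_set_flags() above: bits outside the mask are
 * preserved, bits inside it are taken from the request.]
 */
#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint32_t flags = 0x0f, reqflags = 0xf0, mask = 0x30;
	uint32_t new_flags = (flags & ~mask) | (reqflags & mask);

	assert(new_flags == 0x3f);	/* only bits 4-5 changed */
	return 0;
}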
498static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
499{
500 u32 iflags, gfsflags;
501 if (get_user(iflags, ptr))
502 return -EFAULT;
503 gfsflags = iflags_cvt(iflags_to_gfs2, iflags);
504 return do_gfs2_set_flags(filp, gfsflags, ~0);
505}
506
507static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
508{
509	switch (cmd) {
510 case IFLAGS_GET_IOC:
511 return gfs2_get_flags(filp, (u32 __user *)arg);
512 case IFLAGS_SET_IOC:
513 return gfs2_set_flags(filp, (u32 __user *)arg);
514 }
515 return -ENOTTY;
516}
517
518
519/**
520 * gfs2_mmap -
521 * @file: The file to map
522 * @vma: The VMA which describes the mapping
523 *
524 * Returns: 0 or error code
525 */
526
527static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
528{
529 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
530 struct gfs2_holder i_gh;
531 int error;
532
533 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
534 error = gfs2_glock_nq_atime(&i_gh);
535 if (error) {
536 gfs2_holder_uninit(&i_gh);
537 return error;
538 }
539
540 /* This is VM_MAYWRITE instead of VM_WRITE because a call
541 to mprotect() can turn on VM_WRITE later. */
542
543 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
544 (VM_MAYSHARE | VM_MAYWRITE))
545 vma->vm_ops = &gfs2_vm_ops_sharewrite;
546 else
547 vma->vm_ops = &gfs2_vm_ops_private;
548
549 gfs2_glock_dq_uninit(&i_gh);
550
551 return error;
552}
553
554/**
555 * gfs2_open - open a file
556 * @inode: the inode to open
557 * @file: the struct file for this opening
558 *
559 * Returns: errno
560 */
561
562static int gfs2_open(struct inode *inode, struct file *file)
563{
564 struct gfs2_inode *ip = GFS2_I(inode);
565 struct gfs2_holder i_gh;
566 struct gfs2_file *fp;
567 int error;
568
569 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
570 if (!fp)
571 return -ENOMEM;
572
573 mutex_init(&fp->f_fl_mutex);
574
575 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
576 file->private_data = fp;
577
578 if (S_ISREG(ip->i_di.di_mode)) {
579 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
580 &i_gh);
581 if (error)
582 goto fail;
583
584 if (!(file->f_flags & O_LARGEFILE) &&
585 ip->i_di.di_size > MAX_NON_LFS) {
586 error = -EFBIG;
587 goto fail_gunlock;
588 }
589
590 /* Listen to the Direct I/O flag */
591
592 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
593 file->f_flags |= O_DIRECT;
594
595 gfs2_glock_dq_uninit(&i_gh);
596 }
597
598 return 0;
599
600fail_gunlock:
601 gfs2_glock_dq_uninit(&i_gh);
602fail:
603 file->private_data = NULL;
604 kfree(fp);
605 return error;
606}
607
608/**
609 * gfs2_close - called to close a struct file
610 * @inode: the inode the struct file belongs to
611 * @file: the struct file being closed
612 *
613 * Returns: errno
614 */
615
616static int gfs2_close(struct inode *inode, struct file *file)
617{
618 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
619 struct gfs2_file *fp;
620
621 fp = file->private_data;
622 file->private_data = NULL;
623
624 if (gfs2_assert_warn(sdp, fp))
625 return -EIO;
626
627 kfree(fp);
628
629 return 0;
630}
631
632/**
633 * gfs2_fsync - sync the dirty data for a file (across the cluster)
634 * @file: the file that points to the dentry (we ignore this)
635 * @dentry: the dentry that points to the inode to sync
636 *
637 * Returns: errno
638 */
639
640static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
641{
642 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
643
644 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
645
646 return 0;
647}
648
649/**
650 * gfs2_lock - acquire/release a posix lock on a file
651 * @file: the file pointer
652 * @cmd: either modify or retrieve lock state, possibly wait
653 * @fl: type and range of lock
654 *
655 * Returns: errno
656 */
657
658static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
659{
660 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
661 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
662 struct lm_lockname name =
663 { .ln_number = ip->i_num.no_addr,
664 .ln_type = LM_TYPE_PLOCK };
665
666 if (!(fl->fl_flags & FL_POSIX))
667 return -ENOLCK;
668 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
669 return -ENOLCK;
670
671 if (sdp->sd_args.ar_localflocks) {
672 if (IS_GETLK(cmd)) {
673 struct file_lock tmp;
674 int ret;
675 ret = posix_test_lock(file, fl, &tmp);
676 fl->fl_type = F_UNLCK;
677 if (ret)
678 memcpy(fl, &tmp, sizeof(struct file_lock));
679 return 0;
680 } else {
681 return posix_lock_file_wait(file, fl);
682 }
683 }
684
685 if (IS_GETLK(cmd))
686 return gfs2_lm_plock_get(sdp, &name, file, fl);
687 else if (fl->fl_type == F_UNLCK)
688 return gfs2_lm_punlock(sdp, &name, file, fl);
689 else
690 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
691}
692
693static int do_flock(struct file *file, int cmd, struct file_lock *fl)
694{
695 struct gfs2_file *fp = file->private_data;
696 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
697 struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
698 struct gfs2_glock *gl;
699 unsigned int state;
700 int flags;
701 int error = 0;
702
703 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
704 flags = ((IS_SETLKW(cmd)) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
705
706 mutex_lock(&fp->f_fl_mutex);
707
708 gl = fl_gh->gh_gl;
709 if (gl) {
710 if (fl_gh->gh_state == state)
711 goto out;
712 gfs2_glock_hold(gl);
713 flock_lock_file_wait(file,
714 &(struct file_lock){.fl_type = F_UNLCK});
715 gfs2_glock_dq_uninit(fl_gh);
716 } else {
717 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
718 ip->i_num.no_addr, &gfs2_flock_glops,
719 CREATE, &gl);
720 if (error)
721 goto out;
722 }
723
724 gfs2_holder_init(gl, state, flags, fl_gh);
725 gfs2_glock_put(gl);
726
727 error = gfs2_glock_nq(fl_gh);
728 if (error) {
729 gfs2_holder_uninit(fl_gh);
730 if (error == GLR_TRYFAILED)
731 error = -EAGAIN;
732 } else {
733 error = flock_lock_file_wait(file, fl);
734 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
735 }
736
737out:
738 mutex_unlock(&fp->f_fl_mutex);
739 return error;
740}
741
742static void do_unflock(struct file *file, struct file_lock *fl)
743{
744 struct gfs2_file *fp = file->private_data;
745 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
746
747 mutex_lock(&fp->f_fl_mutex);
748 flock_lock_file_wait(file, fl);
749 if (fl_gh->gh_gl)
750 gfs2_glock_dq_uninit(fl_gh);
751 mutex_unlock(&fp->f_fl_mutex);
752}
753
754/**
755 * gfs2_flock - acquire/release a flock lock on a file
756 * @file: the file pointer
757 * @cmd: either modify or retrieve lock state, possibly wait
758 * @fl: type and range of lock
759 *
760 * Returns: errno
761 */
762
763static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
764{
765 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
766 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
767
768 if (!(fl->fl_flags & FL_FLOCK))
769 return -ENOLCK;
770 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
771 return -ENOLCK;
772
773 if (sdp->sd_args.ar_localflocks)
774 return flock_lock_file_wait(file, fl);
775
776 if (fl->fl_type == F_UNLCK) {
777 do_unflock(file, fl);
778 return 0;
779 } else
780 return do_flock(file, cmd, fl);
781}
782
783const struct file_operations gfs2_file_fops = {
784 .llseek = gfs2_llseek,
785 .read = generic_file_read,
786 .readv = generic_file_readv,
787 .aio_read = generic_file_aio_read,
788 .write = generic_file_write,
789 .writev = generic_file_writev,
790 .aio_write = generic_file_aio_write,
791 .unlocked_ioctl = gfs2_ioctl,
792 .mmap = gfs2_mmap,
793 .open = gfs2_open,
794 .release = gfs2_close,
795 .fsync = gfs2_fsync,
796 .lock = gfs2_lock,
797 .sendfile = generic_file_sendfile,
798 .flock = gfs2_flock,
799 .splice_read = generic_file_splice_read,
800 .splice_write = generic_file_splice_write,
801};
802
803const struct file_operations gfs2_dir_fops = {
804 .readdir = gfs2_readdir,
805 .unlocked_ioctl = gfs2_ioctl,
806 .open = gfs2_open,
807 .release = gfs2_close,
808 .fsync = gfs2_fsync,
809 .lock = gfs2_lock,
810 .flock = gfs2_flock,
811};
812
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..46302b513937
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12extern struct file gfs2_internal_file_sentinal;
13extern int gfs2_internal_read(struct gfs2_inode *ip,
14 struct file_ra_state *ra_state,
15 char *buf, loff_t *pos, unsigned size);
16
17extern const struct file_operations gfs2_file_fops;
18extern const struct file_operations gfs2_dir_fops;
19
20#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..e5a91ead250c
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,980 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/vmalloc.h>
16#include <linux/blkdev.h>
17#include <linux/kthread.h>
18#include <linux/namei.h>
19#include <linux/mount.h>
20#include <linux/gfs2_ondisk.h>
21
22#include "gfs2.h"
23#include "lm_interface.h"
24#include "incore.h"
25#include "daemon.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "mount.h"
31#include "ops_export.h"
32#include "ops_fstype.h"
33#include "ops_super.h"
34#include "recovery.h"
35#include "rgrp.h"
36#include "super.h"
37#include "sys.h"
38#include "util.h"
39
40#define DO 0
41#define UNDO 1
42
43extern struct dentry_operations gfs2_dops;
44
45static struct gfs2_sbd *init_sbd(struct super_block *sb)
46{
47 struct gfs2_sbd *sdp;
48 unsigned int x;
49
50 sdp = vmalloc(sizeof(struct gfs2_sbd));
51 if (!sdp)
52 return NULL;
53
54 memset(sdp, 0, sizeof(struct gfs2_sbd));
55
56 sb->s_fs_info = sdp;
57 sdp->sd_vfs = sb;
58
59 gfs2_tune_init(&sdp->sd_tune);
60
61 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
62 rwlock_init(&sdp->sd_gl_hash[x].hb_lock);
63 INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
64 }
65 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
66 spin_lock_init(&sdp->sd_reclaim_lock);
67 init_waitqueue_head(&sdp->sd_reclaim_wq);
68
69 mutex_init(&sdp->sd_inum_mutex);
70 spin_lock_init(&sdp->sd_statfs_spin);
71 mutex_init(&sdp->sd_statfs_mutex);
72
73 spin_lock_init(&sdp->sd_rindex_spin);
74 mutex_init(&sdp->sd_rindex_mutex);
75 INIT_LIST_HEAD(&sdp->sd_rindex_list);
76 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
77 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
78
79 INIT_LIST_HEAD(&sdp->sd_jindex_list);
80 spin_lock_init(&sdp->sd_jindex_spin);
81 mutex_init(&sdp->sd_jindex_mutex);
82
83 INIT_LIST_HEAD(&sdp->sd_quota_list);
84 spin_lock_init(&sdp->sd_quota_spin);
85 mutex_init(&sdp->sd_quota_mutex);
86
87 spin_lock_init(&sdp->sd_log_lock);
88
89 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
90 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
91 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
92 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
93 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
94
95 mutex_init(&sdp->sd_log_reserve_mutex);
96 INIT_LIST_HEAD(&sdp->sd_ail1_list);
97 INIT_LIST_HEAD(&sdp->sd_ail2_list);
98
99 init_rwsem(&sdp->sd_log_flush_lock);
100 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
101
102 INIT_LIST_HEAD(&sdp->sd_revoke_list);
103
104 mutex_init(&sdp->sd_freeze_lock);
105
106 return sdp;
107}
108
109static void init_vfs(struct super_block *sb, unsigned noatime)
110{
111 struct gfs2_sbd *sdp = sb->s_fs_info;
112
113 sb->s_magic = GFS2_MAGIC;
114 sb->s_op = &gfs2_super_ops;
115 sb->s_export_op = &gfs2_export_ops;
116 sb->s_maxbytes = MAX_LFS_FILESIZE;
117
118 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
119 set_bit(noatime, &sdp->sd_flags);
120
121 /* Don't let the VFS update atimes. GFS2 handles this itself. */
122 sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
123}
124
125static int init_names(struct gfs2_sbd *sdp, int silent)
126{
127 struct gfs2_sb *sb = NULL;
128 char *proto, *table;
129 int error = 0;
130
131 proto = sdp->sd_args.ar_lockproto;
132 table = sdp->sd_args.ar_locktable;
133
134 /* Try to autodetect */
135
136 if (!proto[0] || !table[0]) {
137 struct buffer_head *bh;
138 bh = sb_getblk(sdp->sd_vfs,
139 GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
140 lock_buffer(bh);
141 clear_buffer_uptodate(bh);
142 clear_buffer_dirty(bh);
143 unlock_buffer(bh);
144 ll_rw_block(READ, 1, &bh);
145 wait_on_buffer(bh);
146
147 if (!buffer_uptodate(bh)) {
148 brelse(bh);
149 return -EIO;
150 }
151
152 sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL);
153 if (!sb) {
154 brelse(bh);
155 return -ENOMEM;
156 }
157 gfs2_sb_in(sb, bh->b_data);
158 brelse(bh);
159
160 error = gfs2_check_sb(sdp, sb, silent);
161 if (error)
162 goto out;
163
164 if (!proto[0])
165 proto = sb->sb_lockproto;
166 if (!table[0])
167 table = sb->sb_locktable;
168 }
169
170 if (!table[0])
171 table = sdp->sd_vfs->s_id;
172
173 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
174 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
175
176 out:
177 kfree(sb);
178
179 return error;
180}
181
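/*
 * [Editor's note: illustrative sketch of the precedence init_names()
 * applies above -- a mount option wins, then the on-disk superblock
 * value, then (for the table name only) the device id.]
 */
#include <stdio.h>

static const char *pick(const char *option, const char *ondisk,
			const char *fallback)
{
	if (option && option[0])
		return option;
	if (ondisk && ondisk[0])
		return ondisk;
	return fallback;
}

int main(void)
{
	printf("%s\n", pick("", "lock_dlm", "sda1"));	/* lock_dlm */
	return 0;
}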
182static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
183 int undo)
184{
185 struct task_struct *p;
186 int error = 0;
187
188 if (undo)
189 goto fail_trans;
190
191 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
192	if (IS_ERR(p)) {
193		error = PTR_ERR(p);
194		fs_err(sdp, "can't start scand thread: %d\n", error);
195 return error;
196 }
197 sdp->sd_scand_process = p;
198
199 for (sdp->sd_glockd_num = 0;
200 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
201 sdp->sd_glockd_num++) {
202 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
203		if (IS_ERR(p)) {
204			error = PTR_ERR(p);
205			fs_err(sdp, "can't start glockd thread: %d\n", error);
206 goto fail;
207 }
208 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
209 }
210
211 error = gfs2_glock_nq_num(sdp,
212 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
213 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
214 mount_gh);
215 if (error) {
216 fs_err(sdp, "can't acquire mount glock: %d\n", error);
217 goto fail;
218 }
219
220 error = gfs2_glock_nq_num(sdp,
221 GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
222 LM_ST_SHARED,
223 LM_FLAG_NOEXP | GL_EXACT,
224 &sdp->sd_live_gh);
225 if (error) {
226 fs_err(sdp, "can't acquire live glock: %d\n", error);
227 goto fail_mount;
228 }
229
230 error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
231 CREATE, &sdp->sd_rename_gl);
232 if (error) {
233 fs_err(sdp, "can't create rename glock: %d\n", error);
234 goto fail_live;
235 }
236
237 error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
238 CREATE, &sdp->sd_trans_gl);
239 if (error) {
240 fs_err(sdp, "can't create transaction glock: %d\n", error);
241 goto fail_rename;
242 }
243 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
244
245 return 0;
246
247fail_trans:
248 gfs2_glock_put(sdp->sd_trans_gl);
249
250fail_rename:
251 gfs2_glock_put(sdp->sd_rename_gl);
252
253fail_live:
254 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
255
256fail_mount:
257 gfs2_glock_dq_uninit(mount_gh);
258
259fail:
260 while (sdp->sd_glockd_num--)
261 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
262
263 kthread_stop(sdp->sd_scand_process);
264
265 return error;
266}
267
268static struct inode *gfs2_lookup_root(struct super_block *sb,
269 struct gfs2_inum *inum)
270{
271 return gfs2_inode_lookup(sb, inum, DT_DIR);
272}
273
274static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
275{
276 struct super_block *sb = sdp->sd_vfs;
277 struct gfs2_holder sb_gh;
278 struct gfs2_inum *inum;
279 struct inode *inode;
280 int error = 0;
281
282 if (undo) {
283 if (sb->s_root) {
284 dput(sb->s_root);
285 sb->s_root = NULL;
286 }
287 return 0;
288 }
289
290 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
291 LM_ST_SHARED, 0, &sb_gh);
292 if (error) {
293 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
294 return error;
295 }
296
297 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
298 if (error) {
299 fs_err(sdp, "can't read superblock: %d\n", error);
300 goto out;
301 }
302
303 /* Set up the buffer cache and SB for real */
304 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
305 error = -EINVAL;
306 fs_err(sdp, "FS block size (%u) is too small for device "
307 "block size (%u)\n",
308 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
309 goto out;
310 }
311 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
312 error = -EINVAL;
313 fs_err(sdp, "FS block size (%u) is too big for machine "
314 "page size (%u)\n",
315 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
316 goto out;
317 }
318 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
319
320 /* Get the root inode */
321 inum = &sdp->sd_sb.sb_root_dir;
322 if (sb->s_type == &gfs2meta_fs_type)
323 inum = &sdp->sd_sb.sb_master_dir;
324 inode = gfs2_lookup_root(sb, inum);
325 if (IS_ERR(inode)) {
326 error = PTR_ERR(inode);
327 fs_err(sdp, "can't read in root inode: %d\n", error);
328 goto out;
329 }
330
331 sb->s_root = d_alloc_root(inode);
332 if (!sb->s_root) {
333 fs_err(sdp, "can't get root dentry\n");
334 error = -ENOMEM;
335 iput(inode);
		goto out;
 336	}
 337	sb->s_root->d_op = &gfs2_dops;
338out:
339 gfs2_glock_dq_uninit(&sb_gh);
340 return error;
341}
342
343static int init_journal(struct gfs2_sbd *sdp, int undo)
344{
345 struct gfs2_holder ji_gh;
346 struct task_struct *p;
347 struct gfs2_inode *ip;
348 int jindex = 1;
349 int error = 0;
350
351 if (undo) {
352 jindex = 0;
353 goto fail_recoverd;
354 }
355
356 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
357 if (IS_ERR(sdp->sd_jindex)) {
 358		error = PTR_ERR(sdp->sd_jindex);
		fs_err(sdp, "can't lookup journal index: %d\n", error);
 359		return error;
360 }
361 ip = GFS2_I(sdp->sd_jindex);
362 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
363
364 /* Load in the journal index special file */
365
366 error = gfs2_jindex_hold(sdp, &ji_gh);
367 if (error) {
368 fs_err(sdp, "can't read journal index: %d\n", error);
369 goto fail;
370 }
371
372 error = -EINVAL;
373 if (!gfs2_jindex_size(sdp)) {
374 fs_err(sdp, "no journals!\n");
375 goto fail_jindex;
376 }
377
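	/* A spectator mount never writes, so it owns no journal; it only
	   borrows journal 0's size for log accounting. All other mounts
	   hold their own journal's glock exclusively to keep the other
	   nodes out of it. */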
378 if (sdp->sd_args.ar_spectator) {
379 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
380 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
381 } else {
382 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
383 fs_err(sdp, "can't mount journal #%u\n",
384 sdp->sd_lockstruct.ls_jid);
385 fs_err(sdp, "there are only %u journals (0 - %u)\n",
386 gfs2_jindex_size(sdp),
387 gfs2_jindex_size(sdp) - 1);
388 goto fail_jindex;
389 }
390 sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
391
392 error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
393 &gfs2_journal_glops,
394 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
395 &sdp->sd_journal_gh);
396 if (error) {
397 fs_err(sdp, "can't acquire journal glock: %d\n", error);
398 goto fail_jindex;
399 }
400
401 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
402 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
403 LM_FLAG_NOEXP | GL_EXACT,
404 &sdp->sd_jinode_gh);
405 if (error) {
406 fs_err(sdp, "can't acquire journal inode glock: %d\n",
407 error);
408 goto fail_journal_gh;
409 }
410
411 error = gfs2_jdesc_check(sdp->sd_jdesc);
412 if (error) {
413 fs_err(sdp, "my journal (%u) is bad: %d\n",
414 sdp->sd_jdesc->jd_jid, error);
415 goto fail_jinode_gh;
416 }
417 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
418 }
419
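	/* The first node into the lockspace replays every journal before
	   letting other nodes mount; later mounters replay only their
	   own journal. */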
420 if (sdp->sd_lockstruct.ls_first) {
421 unsigned int x;
422 for (x = 0; x < sdp->sd_journals; x++) {
423 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
424 if (error) {
425 fs_err(sdp, "error recovering journal %u: %d\n",
426 x, error);
427 goto fail_jinode_gh;
428 }
429 }
430
431 gfs2_lm_others_may_mount(sdp);
432 } else if (!sdp->sd_args.ar_spectator) {
433 error = gfs2_recover_journal(sdp->sd_jdesc);
434 if (error) {
435 fs_err(sdp, "error recovering my journal: %d\n", error);
436 goto fail_jinode_gh;
437 }
438 }
439
440 set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
441 gfs2_glock_dq_uninit(&ji_gh);
442 jindex = 0;
443
444 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
 445	if (IS_ERR(p)) {
 446		error = PTR_ERR(p);
447 fs_err(sdp, "can't start recoverd thread: %d\n", error);
448 goto fail_jinode_gh;
449 }
450 sdp->sd_recoverd_process = p;
451
452 return 0;
453
454 fail_recoverd:
455 kthread_stop(sdp->sd_recoverd_process);
456
457 fail_jinode_gh:
458 if (!sdp->sd_args.ar_spectator)
459 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
460
461 fail_journal_gh:
462 if (!sdp->sd_args.ar_spectator)
463 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
464
465 fail_jindex:
466 gfs2_jindex_free(sdp);
467 if (jindex)
468 gfs2_glock_dq_uninit(&ji_gh);
469
470 fail:
471 iput(sdp->sd_jindex);
472
473 return error;
474}
475
477static int init_inodes(struct gfs2_sbd *sdp, int undo)
478{
479 int error = 0;
480 struct gfs2_inode *ip;
481 struct inode *inode;
482
483 if (undo)
484 goto fail_qinode;
485
486 inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
487 if (IS_ERR(inode)) {
488 error = PTR_ERR(inode);
489 fs_err(sdp, "can't read in master directory: %d\n", error);
490 goto fail;
491 }
492 sdp->sd_master_dir = inode;
493
494 error = init_journal(sdp, undo);
495 if (error)
496 goto fail_master;
497
498 /* Read in the master inode number inode */
499 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
500 if (IS_ERR(sdp->sd_inum_inode)) {
501 error = PTR_ERR(sdp->sd_inum_inode);
502 fs_err(sdp, "can't read in inum inode: %d\n", error);
503 goto fail_journal;
504 }
505
507 /* Read in the master statfs inode */
508 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
509 if (IS_ERR(sdp->sd_statfs_inode)) {
510 error = PTR_ERR(sdp->sd_statfs_inode);
511 fs_err(sdp, "can't read in statfs inode: %d\n", error);
512 goto fail_inum;
513 }
514
515 /* Read in the resource index inode */
516 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
517 if (IS_ERR(sdp->sd_rindex)) {
518 error = PTR_ERR(sdp->sd_rindex);
519 fs_err(sdp, "can't get resource index inode: %d\n", error);
520 goto fail_statfs;
521 }
522 ip = GFS2_I(sdp->sd_rindex);
523 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
524 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
525
526 /* Read in the quota inode */
527 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
528 if (IS_ERR(sdp->sd_quota_inode)) {
529 error = PTR_ERR(sdp->sd_quota_inode);
530 fs_err(sdp, "can't get quota file inode: %d\n", error);
531 goto fail_rindex;
532 }
533 return 0;
534
535fail_qinode:
536 iput(sdp->sd_quota_inode);
537
538fail_rindex:
539 gfs2_clear_rgrpd(sdp);
540 iput(sdp->sd_rindex);
541
542fail_statfs:
543 iput(sdp->sd_statfs_inode);
544
545fail_inum:
546 iput(sdp->sd_inum_inode);
547fail_journal:
548 init_journal(sdp, UNDO);
549fail_master:
550 iput(sdp->sd_master_dir);
551fail:
552 return error;
553}
554
555static int init_per_node(struct gfs2_sbd *sdp, int undo)
556{
557 struct inode *pn = NULL;
558 char buf[30];
559 int error = 0;
560 struct gfs2_inode *ip;
561
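	/* Each node has its own inum_range, statfs_change and
	   quota_change file under per_node/, indexed by journal id, so
	   it can record local changes without touching the other nodes'
	   files. Spectator mounts never write, so they skip this. */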
562 if (sdp->sd_args.ar_spectator)
563 return 0;
564
565 if (undo)
566 goto fail_qc_gh;
567
568 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
569 if (IS_ERR(pn)) {
570 error = PTR_ERR(pn);
571 fs_err(sdp, "can't find per_node directory: %d\n", error);
572 return error;
573 }
574
575 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
576 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
577 if (IS_ERR(sdp->sd_ir_inode)) {
578 error = PTR_ERR(sdp->sd_ir_inode);
579 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
580 goto fail;
581 }
582
583 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
584 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
585 if (IS_ERR(sdp->sd_sc_inode)) {
586 error = PTR_ERR(sdp->sd_sc_inode);
587 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
588 goto fail_ir_i;
589 }
590
591 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
592 sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
593 if (IS_ERR(sdp->sd_qc_inode)) {
594 error = PTR_ERR(sdp->sd_qc_inode);
595 fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
596 goto fail_ut_i;
597 }
598
599 iput(pn);
600 pn = NULL;
601
602 ip = GFS2_I(sdp->sd_ir_inode);
603 error = gfs2_glock_nq_init(ip->i_gl,
604 LM_ST_EXCLUSIVE, 0,
605 &sdp->sd_ir_gh);
606 if (error) {
607 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
608 goto fail_qc_i;
609 }
610
611 ip = GFS2_I(sdp->sd_sc_inode);
612 error = gfs2_glock_nq_init(ip->i_gl,
613 LM_ST_EXCLUSIVE, 0,
614 &sdp->sd_sc_gh);
615 if (error) {
616 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
617 goto fail_ir_gh;
618 }
619
620 ip = GFS2_I(sdp->sd_qc_inode);
621 error = gfs2_glock_nq_init(ip->i_gl,
622 LM_ST_EXCLUSIVE, 0,
623 &sdp->sd_qc_gh);
624 if (error) {
625 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
626 goto fail_ut_gh;
627 }
628
629 return 0;
630
631 fail_qc_gh:
632 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
633
634 fail_ut_gh:
636 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
637
638 fail_ir_gh:
639 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
640
641 fail_qc_i:
642 iput(sdp->sd_qc_inode);
643
644 fail_ut_i:
646 iput(sdp->sd_sc_inode);
647
648 fail_ir_i:
649 iput(sdp->sd_ir_inode);
650
651 fail:
652 if (pn)
653 iput(pn);
654 return error;
655}
656
657static int init_threads(struct gfs2_sbd *sdp, int undo)
658{
659 struct task_struct *p;
660 int error = 0;
661
662 if (undo)
663 goto fail_quotad;
664
665 sdp->sd_log_flush_time = jiffies;
666 sdp->sd_jindex_refresh_time = jiffies;
667
668 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 669	if (IS_ERR(p)) {
 670		error = PTR_ERR(p);
671 fs_err(sdp, "can't start logd thread: %d\n", error);
672 return error;
673 }
674 sdp->sd_logd_process = p;
675
676 sdp->sd_statfs_sync_time = jiffies;
677 sdp->sd_quota_sync_time = jiffies;
678
679 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
 680	if (IS_ERR(p)) {
 681		error = PTR_ERR(p);
682 fs_err(sdp, "can't start quotad thread: %d\n", error);
683 goto fail;
684 }
685 sdp->sd_quotad_process = p;
686
687 return 0;
688
690fail_quotad:
691 kthread_stop(sdp->sd_quotad_process);
692fail:
693 kthread_stop(sdp->sd_logd_process);
694 return error;
695}
696
697/**
698 * fill_super - Read in superblock
699 * @sb: The VFS superblock
700 * @data: Mount options
701 * @silent: Don't complain if it's not a GFS2 filesystem
702 *
703 * Returns: errno
704 */
705
706static int fill_super(struct super_block *sb, void *data, int silent)
707{
708 struct gfs2_sbd *sdp;
709 struct gfs2_holder mount_gh;
710 int error;
711
712 sdp = init_sbd(sb);
713 if (!sdp) {
714 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
715 return -ENOMEM;
716 }
717
718 error = gfs2_mount_args(sdp, (char *)data, 0);
719 if (error) {
720 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
721 goto fail;
722 }
723
724 init_vfs(sb, SDF_NOATIME);
725
726 /* Set up the buffer cache and fill in some fake block size values
727 to allow us to read-in the on-disk superblock. */
728 sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
729 sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
730 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
731 GFS2_BASIC_BLOCK_SHIFT;
732 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
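	/* i.e. the number of 512-byte basic blocks per fs block; a
	   4096-byte fs block gives sd_fsb2bb_shift = 3, sd_fsb2bb = 8 */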
733
734 error = init_names(sdp, silent);
735 if (error)
736 goto fail;
737
738 error = gfs2_sys_fs_add(sdp);
739 if (error)
740 goto fail;
741
742 error = gfs2_lm_mount(sdp, silent);
743 if (error)
744 goto fail_sys;
745
746 error = init_locking(sdp, &mount_gh, DO);
747 if (error)
748 goto fail_lm;
749
750 error = init_sb(sdp, silent, DO);
751 if (error)
752 goto fail_locking;
753
754 error = init_inodes(sdp, DO);
755 if (error)
756 goto fail_sb;
757
758 error = init_per_node(sdp, DO);
759 if (error)
760 goto fail_inodes;
761
762 error = gfs2_statfs_init(sdp);
763 if (error) {
764 fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
765 goto fail_per_node;
766 }
767
768 error = init_threads(sdp, DO);
769 if (error)
770 goto fail_per_node;
771
772 if (!(sb->s_flags & MS_RDONLY)) {
773 error = gfs2_make_fs_rw(sdp);
774 if (error) {
775 fs_err(sdp, "can't make FS RW: %d\n", error);
776 goto fail_threads;
777 }
778 }
779
780 gfs2_glock_dq_uninit(&mount_gh);
781
782 return 0;
783
784 fail_threads:
785 init_threads(sdp, UNDO);
786
787 fail_per_node:
788 init_per_node(sdp, UNDO);
789
790 fail_inodes:
791 init_inodes(sdp, UNDO);
792
793 fail_sb:
794 init_sb(sdp, 0, UNDO);
795
796 fail_locking:
797 init_locking(sdp, &mount_gh, UNDO);
798
799 fail_lm:
800 gfs2_gl_hash_clear(sdp, WAIT);
801 gfs2_lm_unmount(sdp);
802 while (invalidate_inodes(sb))
803 yield();
804
805 fail_sys:
806 gfs2_sys_fs_del(sdp);
807
808 fail:
809 vfree(sdp);
810 sb->s_fs_info = NULL;
811
812 return error;
813}
814
815static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
816 const char *dev_name, void *data, struct vfsmount *mnt)
817{
818 struct super_block *sb;
819 struct gfs2_sbd *sdp;
820 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
821 if (error)
822 goto out;
823 sb = mnt->mnt_sb;
824 sdp = (struct gfs2_sbd*)sb->s_fs_info;
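	/* Remember our vfsmount so that a later gfs2meta mount of the
	   same device can pin it */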
825 sdp->sd_gfs2mnt = mnt;
826out:
827 return error;
828}
829
830static int fill_super_meta(struct super_block *sb, struct super_block *new,
831 void *data, int silent)
832{
833 struct gfs2_sbd *sdp = sb->s_fs_info;
834 struct inode *inode;
835 int error = 0;
836
837 new->s_fs_info = sdp;
838 sdp->sd_vfs_meta = sb;
839
840 init_vfs(new, SDF_NOATIME);
841
842 /* Get the master inode */
843 inode = igrab(sdp->sd_master_dir);
844
845 new->s_root = d_alloc_root(inode);
846 if (!new->s_root) {
847 fs_err(sdp, "can't get root dentry\n");
 848		error = -ENOMEM;
 849		iput(inode);
		return error;
 850	}
 851	new->s_root->d_op = &gfs2_dops;
852
853 return error;
854}
855static int set_bdev_super(struct super_block *s, void *data)
856{
857 s->s_bdev = data;
858 s->s_dev = s->s_bdev->bd_dev;
859 return 0;
860}
861
862static int test_bdev_super(struct super_block *s, void *data)
863{
864 return (void *)s->s_bdev == data;
865}
866
867static struct super_block* get_gfs2_sb(const char *dev_name)
868{
869 struct kstat stat;
870 struct nameidata nd;
871 struct file_system_type *fstype;
872 struct super_block *sb = NULL, *s;
873 struct list_head *l;
874 int error;
875
876 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
877 if (error) {
 878		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 879		       dev_name, error);
880 goto out;
881 }
 882	error = vfs_getattr(nd.mnt, nd.dentry, &stat);
	if (error)
		goto free_nd;
883
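	/* Scan the mounted gfs2 superblocks for one matching either the
	   given block device or the given mount point */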
884 fstype = get_fs_type("gfs2");
885 list_for_each(l, &fstype->fs_supers) {
886 s = list_entry(l, struct super_block, s_instances);
887 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
888 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
889 sb = s;
890 goto free_nd;
891 }
892 }
893
 894	printk(KERN_WARNING "GFS2: Unrecognized block device or "
 895	       "mount point %s\n", dev_name);
896
897free_nd:
898 path_release(&nd);
899out:
900 return sb;
901}
902
903static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
904 const char *dev_name, void *data, struct vfsmount *mnt)
905{
906 int error = 0;
907 struct super_block *sb = NULL, *new;
908 struct gfs2_sbd *sdp;
909 char *gfs2mnt = NULL;
910
911 sb = get_gfs2_sb(dev_name);
912 if (!sb) {
913 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
914 error = -ENOENT;
915 goto error;
916 }
917 sdp = (struct gfs2_sbd*) sb->s_fs_info;
918 if (sdp->sd_vfs_meta) {
919 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
920 error = -EBUSY;
921 goto error;
922 }
923 mutex_lock(&sb->s_bdev->bd_mount_mutex);
924 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
925 mutex_unlock(&sb->s_bdev->bd_mount_mutex);
926 if (IS_ERR(new)) {
927 error = PTR_ERR(new);
928 goto error;
929 }
930 module_put(fs_type->owner);
931 new->s_flags = flags;
932 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
933 sb_set_blocksize(new, sb->s_blocksize);
934 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
935 if (error) {
936 up_write(&new->s_umount);
937 deactivate_super(new);
938 goto error;
939 }
940
941 new->s_flags |= MS_ACTIVE;
942
943 /* Grab a reference to the gfs2 mount point */
944 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
945 return simple_set_mnt(mnt, new);
946error:
947 if (gfs2mnt)
948 kfree(gfs2mnt);
949 return error;
950}
951
952static void gfs2_kill_sb(struct super_block *sb)
953{
954 kill_block_super(sb);
955}
956
957static void gfs2_kill_sb_meta(struct super_block *sb)
958{
959 struct gfs2_sbd *sdp = sb->s_fs_info;
960 generic_shutdown_super(sb);
961 sdp->sd_vfs_meta = NULL;
962 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
963}
964
965struct file_system_type gfs2_fs_type = {
966 .name = "gfs2",
967 .fs_flags = FS_REQUIRES_DEV,
968 .get_sb = gfs2_get_sb,
969 .kill_sb = gfs2_kill_sb,
970 .owner = THIS_MODULE,
971};
972
973struct file_system_type gfs2meta_fs_type = {
974 .name = "gfs2meta",
975 .fs_flags = FS_REQUIRES_DEV,
976 .get_sb = gfs2_get_sb_meta,
977 .kill_sb = gfs2_kill_sb_meta,
978 .owner = THIS_MODULE,
979};
980
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..622f5760d6b2
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13extern struct file_system_type gfs2_fs_type;
14extern struct file_system_type gfs2meta_fs_type;
15
16#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..8fb7c5c9a7c3
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1165 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <asm/uaccess.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "acl.h"
28#include "bmap.h"
29#include "dir.h"
30#include "eaops.h"
31#include "eattr.h"
32#include "glock.h"
33#include "inode.h"
34#include "meta_io.h"
35#include "ops_dentry.h"
36#include "ops_inode.h"
37#include "quota.h"
38#include "rgrp.h"
39#include "trans.h"
40#include "util.h"
41
42/**
43 * gfs2_create - Create a file
44 * @dir: The directory in which to create the file
45 * @dentry: The dentry of the new file
46 * @mode: The mode of the new file
47 *
48 * Returns: errno
49 */
50
51static int gfs2_create(struct inode *dir, struct dentry *dentry,
52 int mode, struct nameidata *nd)
53{
54 struct gfs2_inode *dip = GFS2_I(dir);
55 struct gfs2_sbd *sdp = GFS2_SB(dir);
56 struct gfs2_holder ghs[2];
57 struct inode *inode;
58
59 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
60
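	/* gfs2_createi() can lose a race with another node creating the
	   same name; on -EEXIST (without O_EXCL) fall back to a lookup,
	   and start over if that in turn races with a remote unlink. */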
61 for (;;) {
62 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
63 if (!IS_ERR(inode)) {
64 gfs2_trans_end(sdp);
65 if (dip->i_alloc.al_rgd)
66 gfs2_inplace_release(dip);
67 gfs2_quota_unlock(dip);
68 gfs2_alloc_put(dip);
69 gfs2_glock_dq_uninit_m(2, ghs);
70 mark_inode_dirty(inode);
71 break;
72 } else if (PTR_ERR(inode) != -EEXIST ||
73 (nd->intent.open.flags & O_EXCL)) {
74 gfs2_holder_uninit(ghs);
75 return PTR_ERR(inode);
76 }
77
78 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
79 if (inode) {
80 if (!IS_ERR(inode)) {
81 gfs2_holder_uninit(ghs);
82 break;
83 } else {
84 gfs2_holder_uninit(ghs);
85 return PTR_ERR(inode);
86 }
87 }
88 }
89
90 d_instantiate(dentry, inode);
91
92 return 0;
93}
94
95/**
96 * gfs2_lookup - Look up a filename in a directory and return its inode
97 * @dir: The directory inode
98 * @dentry: The dentry of the new inode
99 * @nd: passed from Linux VFS, ignored by us
100 *
101 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
102 *
103 * Returns: errno
104 */
105
106static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
107 struct nameidata *nd)
108{
109 struct inode *inode = NULL;
110
111 dentry->d_op = &gfs2_dops;
112
113 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
114 if (inode && IS_ERR(inode))
115 return ERR_PTR(PTR_ERR(inode));
116
117 if (inode)
118 return d_splice_alias(inode, dentry);
119 d_add(dentry, inode);
120
121 return NULL;
122}
123
124/**
125 * gfs2_link - Link to a file
126 * @old_dentry: The inode to link
127 * @dir: Add link to this directory
128 * @dentry: The name of the link
129 *
130 * Link the inode in "old_dentry" into the directory "dir" with the
131 * name in "dentry".
132 *
133 * Returns: errno
134 */
135
136static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
137 struct dentry *dentry)
138{
139 struct gfs2_inode *dip = GFS2_I(dir);
140 struct gfs2_sbd *sdp = GFS2_SB(dir);
141 struct inode *inode = old_dentry->d_inode;
142 struct gfs2_inode *ip = GFS2_I(inode);
143 struct gfs2_holder ghs[2];
144 int alloc_required;
145 int error;
146
147 if (S_ISDIR(ip->i_di.di_mode))
148 return -EPERM;
149
150 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
151 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
152
153 error = gfs2_glock_nq_m(2, ghs);
154 if (error)
155 goto out;
156
157 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
158 if (error)
159 goto out_gunlock;
160
161 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
162 switch (error) {
163 case -ENOENT:
164 break;
165 case 0:
166 error = -EEXIST;
167 default:
168 goto out_gunlock;
169 }
170
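	/* Sanity limits: both inodes must still be linked, and neither
	   the directory's entry count nor the inode's link count may
	   overflow its 32-bit on-disk field */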
171 error = -EINVAL;
172 if (!dip->i_di.di_nlink)
173 goto out_gunlock;
174 error = -EFBIG;
175 if (dip->i_di.di_entries == (uint32_t)-1)
176 goto out_gunlock;
177 error = -EPERM;
178 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
179 goto out_gunlock;
180 error = -EINVAL;
181 if (!ip->i_di.di_nlink)
182 goto out_gunlock;
183 error = -EMLINK;
184 if (ip->i_di.di_nlink == (uint32_t)-1)
185 goto out_gunlock;
186
187 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
188 if (error < 0)
189 goto out_gunlock;
190 error = 0;
191
192 if (alloc_required) {
193 struct gfs2_alloc *al = gfs2_alloc_get(dip);
194
195 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
196 if (error)
197 goto out_alloc;
198
199 error = gfs2_quota_check(dip, dip->i_di.di_uid,
200 dip->i_di.di_gid);
201 if (error)
202 goto out_gunlock_q;
203
204 al->al_requested = sdp->sd_max_dirres;
205
206 error = gfs2_inplace_reserve(dip);
207 if (error)
208 goto out_gunlock_q;
209
210 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
211 al->al_rgd->rd_ri.ri_length +
212 2 * RES_DINODE + RES_STATFS +
213 RES_QUOTA, 0);
214 if (error)
215 goto out_ipres;
216 } else {
217 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
218 if (error)
219 goto out_ipres;
220 }
221
222 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
223 IF2DT(ip->i_di.di_mode));
224 if (error)
225 goto out_end_trans;
226
227 error = gfs2_change_nlink(ip, +1);
228
229out_end_trans:
230 gfs2_trans_end(sdp);
231
232out_ipres:
233 if (alloc_required)
234 gfs2_inplace_release(dip);
235
236out_gunlock_q:
237 if (alloc_required)
238 gfs2_quota_unlock(dip);
239
240out_alloc:
241 if (alloc_required)
242 gfs2_alloc_put(dip);
243
244out_gunlock:
245 gfs2_glock_dq_m(2, ghs);
246
247out:
248 gfs2_holder_uninit(ghs);
249 gfs2_holder_uninit(ghs + 1);
250
251 if (!error) {
252 atomic_inc(&inode->i_count);
253 d_instantiate(dentry, inode);
254 mark_inode_dirty(inode);
255 }
256
257 return error;
258}
259
260/**
261 * gfs2_unlink - Unlink a file
262 * @dir: The inode of the directory containing the file to unlink
263 * @dentry: The file itself
264 *
265 * Unlink a file. Call gfs2_unlinki()
266 *
267 * Returns: errno
268 */
269
270static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
271{
272 struct gfs2_inode *dip = GFS2_I(dir);
273 struct gfs2_sbd *sdp = GFS2_SB(dir);
274 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
275 struct gfs2_holder ghs[2];
276 int error;
277
278 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
279 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
280
281 error = gfs2_glock_nq_m(2, ghs);
282 if (error)
283 goto out;
284
285 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
286 if (error)
287 goto out_gunlock;
288
289 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
290 if (error)
291 goto out_gunlock;
292
293 error = gfs2_dir_del(dip, &dentry->d_name);
294 if (error)
295 goto out_end_trans;
296
297 error = gfs2_change_nlink(ip, -1);
298
299out_end_trans:
300 gfs2_trans_end(sdp);
301out_gunlock:
302 gfs2_glock_dq_m(2, ghs);
303out:
304 gfs2_holder_uninit(ghs);
305 gfs2_holder_uninit(ghs + 1);
306 return error;
307}
308
309/**
310 * gfs2_symlink - Create a symlink
311 * @dir: The directory to create the symlink in
312 * @dentry: The dentry to put the symlink in
313 * @symname: The thing which the link points to
314 *
315 * Returns: errno
316 */
317
318static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
319 const char *symname)
320{
321 struct gfs2_inode *dip = GFS2_I(dir), *ip;
322 struct gfs2_sbd *sdp = GFS2_SB(dir);
323 struct gfs2_holder ghs[2];
324 struct inode *inode;
325 struct buffer_head *dibh;
326 int size;
327 int error;
328
329 /* Must be stuffed with a null terminator for gfs2_follow_link() */
330 size = strlen(symname);
331 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
332 return -ENAMETOOLONG;
333
334 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
335
336 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
337 if (IS_ERR(inode)) {
338 gfs2_holder_uninit(ghs);
339 return PTR_ERR(inode);
340 }
341
342 ip = ghs[1].gh_gl->gl_object;
343
344 ip->i_di.di_size = size;
345
346 error = gfs2_meta_inode_buffer(ip, &dibh);
347
348 if (!gfs2_assert_withdraw(sdp, !error)) {
349 gfs2_dinode_out(&ip->i_di, dibh->b_data);
350 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
351 size);
352 brelse(dibh);
353 }
354
355 gfs2_trans_end(sdp);
356 if (dip->i_alloc.al_rgd)
357 gfs2_inplace_release(dip);
358 gfs2_quota_unlock(dip);
359 gfs2_alloc_put(dip);
360
361 gfs2_glock_dq_uninit_m(2, ghs);
362
363 d_instantiate(dentry, inode);
364 mark_inode_dirty(inode);
365
366 return 0;
367}
368
369/**
370 * gfs2_mkdir - Make a directory
371 * @dir: The parent directory of the new one
372 * @dentry: The dentry of the new directory
373 * @mode: The mode of the new directory
374 *
375 * Returns: errno
376 */
377
378static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
379{
380 struct gfs2_inode *dip = GFS2_I(dir), *ip;
381 struct gfs2_sbd *sdp = GFS2_SB(dir);
382 struct gfs2_holder ghs[2];
383 struct inode *inode;
384 struct buffer_head *dibh;
385 int error;
386
387 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
388
389 inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
390 if (IS_ERR(inode)) {
391 gfs2_holder_uninit(ghs);
392 return PTR_ERR(inode);
393 }
394
395 ip = ghs[1].gh_gl->gl_object;
396
397 ip->i_di.di_nlink = 2;
398 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
399 ip->i_di.di_flags |= GFS2_DIF_JDATA;
400 ip->i_di.di_payload_format = GFS2_FORMAT_DE;
401 ip->i_di.di_entries = 2;
402
403 error = gfs2_meta_inode_buffer(ip, &dibh);
404
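	/* A new directory starts out "stuffed": the "." and ".." entries
	   are written into the dinode block itself, with ".." sized to
	   take up the rest of the block */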
405 if (!gfs2_assert_withdraw(sdp, !error)) {
406 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
407 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
408 struct qstr str;
409
410 gfs2_str2qstr(&str, ".");
411 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
412 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
413 dent->de_inum = di->di_num; /* already GFS2 endian */
414 dent->de_type = DT_DIR;
415 di->di_entries = cpu_to_be32(1);
416
417 gfs2_str2qstr(&str, "..");
418 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
419 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
420
421 gfs2_inum_out(&dip->i_num, (char *) &dent->de_inum);
422 dent->de_type = DT_DIR;
423
424 gfs2_dinode_out(&ip->i_di, (char *)di);
425
426 brelse(dibh);
427 }
428
429 error = gfs2_change_nlink(dip, +1);
430 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
431
432 gfs2_trans_end(sdp);
433 if (dip->i_alloc.al_rgd)
434 gfs2_inplace_release(dip);
435 gfs2_quota_unlock(dip);
436 gfs2_alloc_put(dip);
437
438 gfs2_glock_dq_uninit_m(2, ghs);
439
440 d_instantiate(dentry, inode);
441 mark_inode_dirty(inode);
442
443 return 0;
444}
445
446/**
447 * gfs2_rmdir - Remove a directory
448 * @dir: The parent directory of the directory to be removed
449 * @dentry: The dentry of the directory to remove
450 *
451 * Remove a directory. Call gfs2_rmdiri()
452 *
453 * Returns: errno
454 */
455
456static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
457{
458 struct gfs2_inode *dip = GFS2_I(dir);
459 struct gfs2_sbd *sdp = GFS2_SB(dir);
460 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
461 struct gfs2_holder ghs[2];
462 int error;
463
464 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
465 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
466
467 error = gfs2_glock_nq_m(2, ghs);
468 if (error)
469 goto out;
470
471 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
472 if (error)
473 goto out_gunlock;
474
475 if (ip->i_di.di_entries < 2) {
476 if (gfs2_consist_inode(ip))
477 gfs2_dinode_print(&ip->i_di);
478 error = -EIO;
479 goto out_gunlock;
480 }
481 if (ip->i_di.di_entries > 2) {
482 error = -ENOTEMPTY;
483 goto out_gunlock;
484 }
485
486 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
487 if (error)
488 goto out_gunlock;
489
490 error = gfs2_rmdiri(dip, &dentry->d_name, ip);
491
492 gfs2_trans_end(sdp);
493
494 out_gunlock:
495 gfs2_glock_dq_m(2, ghs);
496
497 out:
498 gfs2_holder_uninit(ghs);
499 gfs2_holder_uninit(ghs + 1);
500
501 return error;
502}
503
504/**
505 * gfs2_mknod - Make a special file
506 * @dir: The directory in which the special file will reside
507 * @dentry: The dentry of the special file
508 * @mode: The mode of the special file
509 * @dev: The device specification of the special file
510 *
 * Returns: errno
511 */
512
513static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
514 dev_t dev)
515{
516 struct gfs2_inode *dip = GFS2_I(dir), *ip;
517 struct gfs2_sbd *sdp = GFS2_SB(dir);
518 struct gfs2_holder ghs[2];
519 struct inode *inode;
520 struct buffer_head *dibh;
521 uint32_t major = 0, minor = 0;
522 int error;
523
524 switch (mode & S_IFMT) {
525 case S_IFBLK:
526 case S_IFCHR:
527 major = MAJOR(dev);
528 minor = MINOR(dev);
529 break;
530 case S_IFIFO:
531 case S_IFSOCK:
532 break;
533 default:
534 return -EOPNOTSUPP;
535	}
536
537 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
538
539 inode = gfs2_createi(ghs, &dentry->d_name, mode);
540 if (IS_ERR(inode)) {
541 gfs2_holder_uninit(ghs);
542 return PTR_ERR(inode);
543 }
544
545 ip = ghs[1].gh_gl->gl_object;
546
547 ip->i_di.di_major = major;
548 ip->i_di.di_minor = minor;
549
550 error = gfs2_meta_inode_buffer(ip, &dibh);
551
552 if (!gfs2_assert_withdraw(sdp, !error)) {
553 gfs2_dinode_out(&ip->i_di, dibh->b_data);
554 brelse(dibh);
555 }
556
557 gfs2_trans_end(sdp);
558 if (dip->i_alloc.al_rgd)
559 gfs2_inplace_release(dip);
560 gfs2_quota_unlock(dip);
561 gfs2_alloc_put(dip);
562
563 gfs2_glock_dq_uninit_m(2, ghs);
564
565 d_instantiate(dentry, inode);
566 mark_inode_dirty(inode);
567
568 return 0;
569}
570
571/**
572 * gfs2_rename - Rename a file
573 * @odir: Parent directory of old file name
574 * @odentry: The old dentry of the file
575 * @ndir: Parent directory of new file name
576 * @ndentry: The new dentry of the file
577 *
578 * Returns: errno
579 */
580
581static int gfs2_rename(struct inode *odir, struct dentry *odentry,
582 struct inode *ndir, struct dentry *ndentry)
583{
584 struct gfs2_inode *odip = GFS2_I(odir);
585 struct gfs2_inode *ndip = GFS2_I(ndir);
586 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
587 struct gfs2_inode *nip = NULL;
588 struct gfs2_sbd *sdp = GFS2_SB(odir);
589 struct gfs2_holder ghs[4], r_gh;
590 unsigned int num_gh;
591 int dir_rename = 0;
592 int alloc_required;
593 unsigned int x;
594 int error;
595
596 if (ndentry->d_inode) {
597 nip = GFS2_I(ndentry->d_inode);
598 if (ip == nip)
599 return 0;
600 }
601
602	/* Make sure we aren't trying to move a directory into its own subdirectory */
603
604 if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
605 dir_rename = 1;
606
607 error = gfs2_glock_nq_init(sdp->sd_rename_gl,
608 LM_ST_EXCLUSIVE, 0,
609 &r_gh);
610 if (error)
611 goto out;
612
613 error = gfs2_ok_to_move(ip, ndip);
614 if (error)
615 goto out_gunlock_r;
616 }
617
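	/* Gather holders for every inode involved (old dir, new dir if
	   different, the inode being moved, and any existing target) and
	   acquire them in a single gfs2_glock_nq_m() call, which can
	   order the requests consistently to avoid deadlock. */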
618 num_gh = 1;
619 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
620 if (odip != ndip) {
621 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
622 num_gh++;
623 }
624 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
625 num_gh++;
626
627 if (nip) {
628 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
629 num_gh++;
630 }
631
632 error = gfs2_glock_nq_m(num_gh, ghs);
633 if (error)
634 goto out_uninit;
635
636 /* Check out the old directory */
637
638 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
639 if (error)
640 goto out_gunlock;
641
642 /* Check out the new directory */
643
644 if (nip) {
645 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
646 if (error)
647 goto out_gunlock;
648
649 if (S_ISDIR(nip->i_di.di_mode)) {
650 if (nip->i_di.di_entries < 2) {
651 if (gfs2_consist_inode(nip))
652 gfs2_dinode_print(&nip->i_di);
653 error = -EIO;
654 goto out_gunlock;
655 }
656 if (nip->i_di.di_entries > 2) {
657 error = -ENOTEMPTY;
658 goto out_gunlock;
659 }
660 }
661 } else {
662 error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
663 if (error)
664 goto out_gunlock;
665
666 error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
667 switch (error) {
668 case -ENOENT:
669 error = 0;
670 break;
671 case 0:
672 error = -EEXIST;
673 default:
674 goto out_gunlock;
675		}
676
677 if (odip != ndip) {
678 if (!ndip->i_di.di_nlink) {
679 error = -EINVAL;
680 goto out_gunlock;
681 }
682 if (ndip->i_di.di_entries == (uint32_t)-1) {
683 error = -EFBIG;
684 goto out_gunlock;
685 }
686 if (S_ISDIR(ip->i_di.di_mode) &&
687 ndip->i_di.di_nlink == (uint32_t)-1) {
688 error = -EMLINK;
689 goto out_gunlock;
690 }
691 }
692 }
693
694 /* Check out the dir to be renamed */
695
696 if (dir_rename) {
697 error = permission(odentry->d_inode, MAY_WRITE, NULL);
698 if (error)
699 goto out_gunlock;
700 }
701
702 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
703 if (error < 0)
704 goto out_gunlock;
705 error = 0;
706
707 if (alloc_required) {
708 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
709
710 error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
711 if (error)
712 goto out_alloc;
713
714 error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
715 ndip->i_di.di_gid);
716 if (error)
717 goto out_gunlock_q;
718
719 al->al_requested = sdp->sd_max_dirres;
720
721 error = gfs2_inplace_reserve(ndip);
722 if (error)
723 goto out_gunlock_q;
724
725 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
726 al->al_rgd->rd_ri.ri_length +
727 4 * RES_DINODE + 4 * RES_LEAF +
728 RES_STATFS + RES_QUOTA, 0);
729 if (error)
730 goto out_ipreserv;
731 } else {
732 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
733 5 * RES_LEAF, 0);
734 if (error)
735 goto out_gunlock;
736 }
737
738 /* Remove the target file, if it exists */
739
740 if (nip) {
741 if (S_ISDIR(nip->i_di.di_mode))
742 error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
743 else {
744 error = gfs2_dir_del(ndip, &ndentry->d_name);
745 if (error)
746 goto out_end_trans;
747 error = gfs2_change_nlink(nip, -1);
748 }
749 if (error)
750 goto out_end_trans;
751 }
752
753 if (dir_rename) {
754 struct qstr name;
755 gfs2_str2qstr(&name, "..");
756
757 error = gfs2_change_nlink(ndip, +1);
758 if (error)
759 goto out_end_trans;
760 error = gfs2_change_nlink(odip, -1);
761 if (error)
762 goto out_end_trans;
763
764 error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
765 if (error)
766 goto out_end_trans;
767 } else {
768 struct buffer_head *dibh;
769 error = gfs2_meta_inode_buffer(ip, &dibh);
770 if (error)
771 goto out_end_trans;
772 ip->i_di.di_ctime = get_seconds();
773 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
774 gfs2_dinode_out(&ip->i_di, dibh->b_data);
775 brelse(dibh);
776 }
777
778 error = gfs2_dir_del(odip, &odentry->d_name);
779 if (error)
780 goto out_end_trans;
781
782 error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
783 IF2DT(ip->i_di.di_mode));
784 if (error)
785 goto out_end_trans;
786
787out_end_trans:
788 gfs2_trans_end(sdp);
789out_ipreserv:
790 if (alloc_required)
791 gfs2_inplace_release(ndip);
792out_gunlock_q:
793 if (alloc_required)
794 gfs2_quota_unlock(ndip);
795out_alloc:
796 if (alloc_required)
797 gfs2_alloc_put(ndip);
798out_gunlock:
799 gfs2_glock_dq_m(num_gh, ghs);
800out_uninit:
801 for (x = 0; x < num_gh; x++)
802 gfs2_holder_uninit(ghs + x);
803out_gunlock_r:
804 if (dir_rename)
805 gfs2_glock_dq_uninit(&r_gh);
806out:
807 return error;
808}
809
810/**
811 * gfs2_readlink - Read the value of a symlink
812 * @dentry: the symlink
813 * @buf: the buffer to read the symlink data into
814 * @size: the size of the buffer
815 *
816 * Returns: errno
817 */
818
819static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
820 int user_size)
821{
822 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
823 char array[GFS2_FAST_NAME_SIZE], *buf = array;
824 unsigned int len = GFS2_FAST_NAME_SIZE;
825 int error;
826
827 error = gfs2_readlinki(ip, &buf, &len);
828 if (error)
829 return error;
830
831 if (user_size > len - 1)
832 user_size = len - 1;
833
834 if (copy_to_user(user_buf, buf, user_size))
835 error = -EFAULT;
836 else
837 error = user_size;
838
839 if (buf != array)
840 kfree(buf);
841
842 return error;
843}
844
845/**
846 * gfs2_follow_link - Follow a symbolic link
847 * @dentry: The dentry of the link
848 * @nd: Data that we pass to vfs_follow_link()
849 *
850 * This can handle symlinks of any size. It is optimised for symlinks
851 * under GFS2_FAST_NAME_SIZE.
852 *
853 * Returns: NULL on success or ERR_PTR(errno) on failure
854 */
855
856static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
857{
858 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
859 char array[GFS2_FAST_NAME_SIZE], *buf = array;
860 unsigned int len = GFS2_FAST_NAME_SIZE;
861 int error;
862
863 error = gfs2_readlinki(ip, &buf, &len);
864 if (!error) {
865 error = vfs_follow_link(nd, buf);
866 if (buf != array)
867 kfree(buf);
868 }
869
870 return ERR_PTR(error);
871}
872
873/**
874 * gfs2_permission - Check whether a user may access an inode
875 * @inode: The inode to check
876 * @mask: The MAY_* access mask requested
877 * @nd: passed from Linux VFS, ignored by us
878 *
879 * Returns: errno
880 */
881
882static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
883{
884 struct gfs2_inode *ip = GFS2_I(inode);
885 struct gfs2_holder i_gh;
886 int error;
887
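	/* If the inode's version number still matches its glock's, the
	   cached attributes are current and no locking is needed;
	   otherwise take a shared hold to refresh them first. */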
888 if (ip->i_vn == ip->i_gl->gl_vn)
889 return generic_permission(inode, mask, gfs2_check_acl);
890
891 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
892 if (!error) {
893 error = generic_permission(inode, mask, gfs2_check_acl_locked);
894 gfs2_glock_dq_uninit(&i_gh);
895 }
896
897 return error;
898}
899
900static int setattr_size(struct inode *inode, struct iattr *attr)
901{
902 struct gfs2_inode *ip = GFS2_I(inode);
903 int error;
904
905 if (attr->ia_size != ip->i_di.di_size) {
906 error = vmtruncate(inode, attr->ia_size);
907 if (error)
908 return error;
909 }
910
 911	return gfs2_truncatei(ip, attr->ia_size);
916}
917
918static int setattr_chown(struct inode *inode, struct iattr *attr)
919{
920 struct gfs2_inode *ip = GFS2_I(inode);
921 struct gfs2_sbd *sdp = GFS2_SB(inode);
922 struct buffer_head *dibh;
923 uint32_t ouid, ogid, nuid, ngid;
924 int error;
925
926 ouid = ip->i_di.di_uid;
927 ogid = ip->i_di.di_gid;
928 nuid = attr->ia_uid;
929 ngid = attr->ia_gid;
930
931 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
932 ouid = nuid = NO_QUOTA_CHANGE;
933 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
934 ogid = ngid = NO_QUOTA_CHANGE;
935
936 gfs2_alloc_get(ip);
937
938 error = gfs2_quota_lock(ip, nuid, ngid);
939 if (error)
940 goto out_alloc;
941
942 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
943 error = gfs2_quota_check(ip, nuid, ngid);
944 if (error)
945 goto out_gunlock_q;
946 }
947
948 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
949 if (error)
950 goto out_gunlock_q;
951
952 error = gfs2_meta_inode_buffer(ip, &dibh);
953 if (error)
954 goto out_end_trans;
955
956 error = inode_setattr(inode, attr);
957 gfs2_assert_warn(sdp, !error);
958 gfs2_inode_attr_out(ip);
959
960 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
961 gfs2_dinode_out(&ip->i_di, dibh->b_data);
962 brelse(dibh);
963
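	/* Transfer the inode's block usage between the quota accounts:
	   debit the old uid/gid, credit the new */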
964 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
965 gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
966 gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
967 }
968
969 out_end_trans:
970 gfs2_trans_end(sdp);
971
972 out_gunlock_q:
973 gfs2_quota_unlock(ip);
974
975 out_alloc:
976 gfs2_alloc_put(ip);
977
978 return error;
979}
980
981/**
982 * gfs2_setattr - Change attributes on an inode
983 * @dentry: The dentry which is changing
984 * @attr: The structure describing the change
985 *
986 * The VFS layer wants to change one or more of an inode's attributes. Write
987 * that change out to disk.
988 *
989 * Returns: errno
990 */
991
992static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
993{
994 struct inode *inode = dentry->d_inode;
995 struct gfs2_inode *ip = GFS2_I(inode);
996 struct gfs2_holder i_gh;
997 int error;
998
999 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1000 if (error)
1001 return error;
1002
1003 error = -EPERM;
1004 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1005 goto out;
1006
1007 error = inode_change_ok(inode, attr);
1008 if (error)
1009 goto out;
1010
1011 if (attr->ia_valid & ATTR_SIZE)
1012 error = setattr_size(inode, attr);
1013 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1014 error = setattr_chown(inode, attr);
1015 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1016 error = gfs2_acl_chmod(ip, attr);
1017 else
1018 error = gfs2_setattr_simple(ip, attr);
1019
1020 out:
1021 gfs2_glock_dq_uninit(&i_gh);
1022
1023 if (!error)
1024 mark_inode_dirty(inode);
1025
1026 return error;
1027}
1028
1029/**
1030 * gfs2_getattr - Read out an inode's attributes
1031 * @mnt: The vfsmount the inode is being accessed from
1032 * @dentry: The dentry to stat
1033 * @stat: The inode's stats
1034 *
1035 * Returns: errno
1036 */
1037
1038static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1039 struct kstat *stat)
1040{
1041 struct inode *inode = dentry->d_inode;
1042 struct gfs2_inode *ip = GFS2_I(inode);
1043 struct gfs2_holder gh;
1044 int error;
1045
1046 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1047 if (!error) {
1048 generic_fillattr(inode, stat);
1049 gfs2_glock_dq_uninit(&gh);
1050 }
1051
1052 return error;
1053}
1054
1055static int gfs2_setxattr(struct dentry *dentry, const char *name,
1056 const void *data, size_t size, int flags)
1057{
1058 struct inode *inode = dentry->d_inode;
1059 struct gfs2_ea_request er;
1060
1061 memset(&er, 0, sizeof(struct gfs2_ea_request));
1062 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1063 if (er.er_type == GFS2_EATYPE_UNUSED)
1064 return -EOPNOTSUPP;
1065 er.er_data = (char *)data;
1066 er.er_name_len = strlen(er.er_name);
1067 er.er_data_len = size;
1068 er.er_flags = flags;
1069
1070 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1071
1072 return gfs2_ea_set(GFS2_I(inode), &er);
1073}
1074
1075static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1076 void *data, size_t size)
1077{
1078 struct gfs2_ea_request er;
1079
1080 memset(&er, 0, sizeof(struct gfs2_ea_request));
1081 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1082 if (er.er_type == GFS2_EATYPE_UNUSED)
1083 return -EOPNOTSUPP;
1084 er.er_data = data;
1085 er.er_name_len = strlen(er.er_name);
1086 er.er_data_len = size;
1087
1088 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1089}
1090
1091static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1092{
1093 struct gfs2_ea_request er;
1094
1095 memset(&er, 0, sizeof(struct gfs2_ea_request));
1096 er.er_data = (size) ? buffer : NULL;
1097 er.er_data_len = size;
1098
1099 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
1100}
1101
1102static int gfs2_removexattr(struct dentry *dentry, const char *name)
1103{
1104 struct gfs2_ea_request er;
1105
1106 memset(&er, 0, sizeof(struct gfs2_ea_request));
1107 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1108 if (er.er_type == GFS2_EATYPE_UNUSED)
1109 return -EOPNOTSUPP;
1110 er.er_name_len = strlen(er.er_name);
1111
1112 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1113}
1114
1115struct inode_operations gfs2_file_iops = {
1116 .permission = gfs2_permission,
1117 .setattr = gfs2_setattr,
1118 .getattr = gfs2_getattr,
1119 .setxattr = gfs2_setxattr,
1120 .getxattr = gfs2_getxattr,
1121 .listxattr = gfs2_listxattr,
1122 .removexattr = gfs2_removexattr,
1123};
1124
1125struct inode_operations gfs2_dev_iops = {
1126 .permission = gfs2_permission,
1127 .setattr = gfs2_setattr,
1128 .getattr = gfs2_getattr,
1129 .setxattr = gfs2_setxattr,
1130 .getxattr = gfs2_getxattr,
1131 .listxattr = gfs2_listxattr,
1132 .removexattr = gfs2_removexattr,
1133};
1134
1135struct inode_operations gfs2_dir_iops = {
1136 .create = gfs2_create,
1137 .lookup = gfs2_lookup,
1138 .link = gfs2_link,
1139 .unlink = gfs2_unlink,
1140 .symlink = gfs2_symlink,
1141 .mkdir = gfs2_mkdir,
1142 .rmdir = gfs2_rmdir,
1143 .mknod = gfs2_mknod,
1144 .rename = gfs2_rename,
1145 .permission = gfs2_permission,
1146 .setattr = gfs2_setattr,
1147 .getattr = gfs2_getattr,
1148 .setxattr = gfs2_setxattr,
1149 .getxattr = gfs2_getxattr,
1150 .listxattr = gfs2_listxattr,
1151 .removexattr = gfs2_removexattr,
1152};
1153
1154struct inode_operations gfs2_symlink_iops = {
1155 .readlink = gfs2_readlink,
1156 .follow_link = gfs2_follow_link,
1157 .permission = gfs2_permission,
1158 .setattr = gfs2_setattr,
1159 .getattr = gfs2_getattr,
1160 .setxattr = gfs2_setxattr,
1161 .getxattr = gfs2_getxattr,
1162 .listxattr = gfs2_listxattr,
1163 .removexattr = gfs2_removexattr,
1164};
1165
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..930aaae91377
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13extern struct inode_operations gfs2_file_iops;
14extern struct inode_operations gfs2_dir_iops;
15extern struct inode_operations gfs2_symlink_iops;
16extern struct inode_operations gfs2_dev_iops;
17
18#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..18ed18c729e8
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,472 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/vmalloc.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/crc32.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "glock.h"
28#include "inode.h"
29#include "lm.h"
30#include "log.h"
31#include "mount.h"
32#include "ops_super.h"
33#include "quota.h"
34#include "recovery.h"
35#include "rgrp.h"
36#include "super.h"
37#include "sys.h"
38#include "util.h"
39#include "trans.h"
40#include "dir.h"
41#include "eattr.h"
42#include "bmap.h"
43
44/**
45 * gfs2_write_inode - Make sure the inode is stable on the disk
46 * @inode: The inode
47 * @sync: synchronous write flag
48 *
49 * Returns: errno
50 */
51
52static int gfs2_write_inode(struct inode *inode, int sync)
53{
54 struct gfs2_inode *ip = GFS2_I(inode);
55
56 /* Check this is a "normal" inode */
57 if (inode->u.generic_ip) {
58 if (current->flags & PF_MEMALLOC)
59 return 0;
60 if (sync)
61 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
62 }
63
64 return 0;
65}
66
67/**
68 * gfs2_put_super - Unmount the filesystem
69 * @sb: The VFS superblock
70 *
71 */
72
73static void gfs2_put_super(struct super_block *sb)
74{
75 struct gfs2_sbd *sdp = sb->s_fs_info;
76 int error;
77
78 if (!sdp)
79 return;
80
81 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
82		return; /* nothing to do for the meta fs */
83
84 /* Unfreeze the filesystem, if we need to */
85
86 mutex_lock(&sdp->sd_freeze_lock);
87 if (sdp->sd_freeze_count)
88 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
89 mutex_unlock(&sdp->sd_freeze_lock);
90
91 kthread_stop(sdp->sd_quotad_process);
92 kthread_stop(sdp->sd_logd_process);
93 kthread_stop(sdp->sd_recoverd_process);
94 while (sdp->sd_glockd_num--)
95 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
96 kthread_stop(sdp->sd_scand_process);
97
98 if (!(sb->s_flags & MS_RDONLY)) {
99 error = gfs2_make_fs_ro(sdp);
100 if (error)
101 gfs2_io_error(sdp);
102 }
103 /* At this point, we're through modifying the disk */
104
105 /* Release stuff */
106
107 iput(sdp->sd_master_dir);
108 iput(sdp->sd_jindex);
109 iput(sdp->sd_inum_inode);
110 iput(sdp->sd_statfs_inode);
111 iput(sdp->sd_rindex);
112 iput(sdp->sd_quota_inode);
113
114 gfs2_glock_put(sdp->sd_rename_gl);
115 gfs2_glock_put(sdp->sd_trans_gl);
116
117 if (!sdp->sd_args.ar_spectator) {
118 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
119 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
120 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
121 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
122 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
123 iput(sdp->sd_ir_inode);
124 iput(sdp->sd_sc_inode);
125 iput(sdp->sd_qc_inode);
126 }
127
128 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
129 gfs2_clear_rgrpd(sdp);
130 gfs2_jindex_free(sdp);
131 /* Take apart glock structures and buffer lists */
132 gfs2_gl_hash_clear(sdp, WAIT);
133 /* Unmount the locking protocol */
134 gfs2_lm_unmount(sdp);
135
136 /* At this point, we're through participating in the lockspace */
137 gfs2_sys_fs_del(sdp);
138 vfree(sdp);
139 sb->s_fs_info = NULL;
140}
141
142/**
143 * gfs2_write_super - commit all incore transactions to disk
144 * @sb: the filesystem
145 *
146 * This function is called every time sync(2) is called.
147 * After this exits, all dirty buffers are synced.
148 */
149
150static void gfs2_write_super(struct super_block *sb)
151{
152 struct gfs2_sbd *sdp = sb->s_fs_info;
153 gfs2_log_flush(sdp, NULL);
154}
155
156/**
157 * gfs2_write_super_lockfs - prevent further writes to the filesystem
158 * @sb: the VFS structure for the filesystem
159 *
160 */
161
162static void gfs2_write_super_lockfs(struct super_block *sb)
163{
164 struct gfs2_sbd *sdp = sb->s_fs_info;
165 int error;
166
167 for (;;) {
168 error = gfs2_freeze_fs(sdp);
169 if (!error)
170 break;
171
172 switch (error) {
173 case -EBUSY:
174 fs_err(sdp, "waiting for recovery before freeze\n");
175 break;
176
177 default:
178 fs_err(sdp, "error freezing FS: %d\n", error);
179 break;
180 }
181
182 fs_err(sdp, "retrying...\n");
183 msleep(1000);
184 }
185}
186
187/**
188 * gfs2_unlockfs - reallow writes to the filesystem
189 * @sb: the VFS structure for the filesystem
190 *
191 */
192
193static void gfs2_unlockfs(struct super_block *sb)
194{
195 struct gfs2_sbd *sdp = sb->s_fs_info;
196 gfs2_unfreeze_fs(sdp);
197}
198
199/**
200 * gfs2_statfs - Gather and return stats about the filesystem
201 * @dentry: A dentry on the filesystem being queried
202 * @buf: The kstatfs buffer to fill in
203 *
204 * Returns: 0 on success or error code
205 */
206
207static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
208{
209 struct super_block *sb = dentry->d_inode->i_sb;
210 struct gfs2_sbd *sdp = sb->s_fs_info;
211 struct gfs2_statfs_change sc;
212 int error;
213
214 if (gfs2_tune_get(sdp, gt_statfs_slow))
215 error = gfs2_statfs_slow(sdp, &sc);
216 else
217 error = gfs2_statfs_i(sdp, &sc);
218
219 if (error)
220 return error;
221
222 buf->f_type = GFS2_MAGIC;
223 buf->f_bsize = sdp->sd_sb.sb_bsize;
224 buf->f_blocks = sc.sc_total;
225 buf->f_bfree = sc.sc_free;
226 buf->f_bavail = sc.sc_free;
227 buf->f_files = sc.sc_dinodes + sc.sc_free;
228 buf->f_ffree = sc.sc_free;
229 buf->f_namelen = GFS2_FNAMESIZE;
230
231 return 0;
232}
233
234/**
235 * gfs2_remount_fs - called when the FS is remounted
236 * @sb: the filesystem
237 * @flags: the remount flags
238 * @data: extra data passed in (not used right now)
239 *
240 * Returns: errno
241 */
242
243static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
244{
245 struct gfs2_sbd *sdp = sb->s_fs_info;
246 int error;
247
248 error = gfs2_mount_args(sdp, data, 1);
249 if (error)
250 return error;
251
252 if (sdp->sd_args.ar_spectator)
253 *flags |= MS_RDONLY;
254 else {
255 if (*flags & MS_RDONLY) {
256 if (!(sb->s_flags & MS_RDONLY))
257 error = gfs2_make_fs_ro(sdp);
258 } else if (!(*flags & MS_RDONLY) &&
259 (sb->s_flags & MS_RDONLY)) {
260 error = gfs2_make_fs_rw(sdp);
261 }
262 }
263
264 if (*flags & (MS_NOATIME | MS_NODIRATIME))
265 set_bit(SDF_NOATIME, &sdp->sd_flags);
266 else
267 clear_bit(SDF_NOATIME, &sdp->sd_flags);
268
269 /* Don't let the VFS update atimes. GFS2 handles this itself. */
270 *flags |= MS_NOATIME | MS_NODIRATIME;
271
272 return error;
273}
274
275/**
276 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
277 * @inode: The VFS inode
278 *
279 */
280
281static void gfs2_clear_inode(struct inode *inode)
282{
283	/* This tells us it's a "real" inode and not one which only
284 * serves to contain an address space (see rgrp.c, meta_io.c)
285 * which therefore doesn't have its own glocks.
286 */
287 if (inode->u.generic_ip) {
288 struct gfs2_inode *ip = GFS2_I(inode);
289 gfs2_glock_inode_squish(inode);
290 gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
291 ip->i_gl->gl_object = NULL;
292 gfs2_glock_schedule_for_reclaim(ip->i_gl);
293 gfs2_glock_put(ip->i_gl);
294 ip->i_gl = NULL;
295 if (ip->i_iopen_gh.gh_gl)
296 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
297 }
298}
299
300/**
301 * gfs2_show_options - Show mount options for /proc/mounts
302 * @s: seq_file structure
303 * @mnt: vfsmount
304 *
305 * Returns: 0 on success or error code
306 */
307
308static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
309{
310 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
311 struct gfs2_args *args = &sdp->sd_args;
312
313 if (args->ar_lockproto[0])
314 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
315 if (args->ar_locktable[0])
316 seq_printf(s, ",locktable=%s", args->ar_locktable);
317 if (args->ar_hostdata[0])
318 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
319 if (args->ar_spectator)
320 seq_printf(s, ",spectator");
321 if (args->ar_ignore_local_fs)
322 seq_printf(s, ",ignore_local_fs");
323 if (args->ar_localflocks)
324 seq_printf(s, ",localflocks");
325 if (args->ar_localcaching)
326 seq_printf(s, ",localcaching");
327 if (args->ar_debug)
328 seq_printf(s, ",debug");
329 if (args->ar_upgrade)
330 seq_printf(s, ",upgrade");
331 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
332 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
333 if (args->ar_posix_acl)
334 seq_printf(s, ",acl");
335 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
336 char *state;
337 switch (args->ar_quota) {
338 case GFS2_QUOTA_OFF:
339 state = "off";
340 break;
341 case GFS2_QUOTA_ACCOUNT:
342 state = "account";
343 break;
344 case GFS2_QUOTA_ON:
345 state = "on";
346 break;
347 default:
348 state = "unknown";
349 break;
350 }
351 seq_printf(s, ",quota=%s", state);
352 }
353 if (args->ar_suiddir)
354 seq_printf(s, ",suiddir");
355 if (args->ar_data != GFS2_DATA_DEFAULT) {
356 char *state;
357 switch (args->ar_data) {
358 case GFS2_DATA_WRITEBACK:
359 state = "writeback";
360 break;
361 case GFS2_DATA_ORDERED:
362 state = "ordered";
363 break;
364 default:
365 state = "unknown";
366 break;
367 }
368 seq_printf(s, ",data=%s", state);
369 }
370
371 return 0;
372}
373
374/*
375 * We have to (at the moment) hold the inode's main lock to cover
376 * the gap between unlocking the shared lock on the iopen lock and
377 * taking the exclusive lock. I'd rather do a shared -> exclusive
378 * conversion on the iopen lock, but we can change that later. This
379 * is safe, just less efficient.
380 */
381static void gfs2_delete_inode(struct inode *inode)
382{
383 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
384 struct gfs2_inode *ip = GFS2_I(inode);
385 struct gfs2_holder gh;
386 int error;
387
388 if (!inode->u.generic_ip)
389 goto out;
390
391 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
392 if (unlikely(error)) {
393 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
394 goto out;
395 }
396
397 gfs2_glock_dq(&ip->i_iopen_gh);
398 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
399 error = gfs2_glock_nq(&ip->i_iopen_gh);
400 if (error)
401 goto out_uninit;
402
403 if (S_ISDIR(ip->i_di.di_mode) &&
404 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
405 error = gfs2_dir_exhash_dealloc(ip);
406 if (error)
407 goto out_unlock;
408 }
409
410 if (ip->i_di.di_eattr) {
411 error = gfs2_ea_dealloc(ip);
412 if (error)
413 goto out_unlock;
414 }
415
416 if (!gfs2_is_stuffed(ip)) {
417 error = gfs2_file_dealloc(ip);
418 if (error)
419 goto out_unlock;
420 }
421
422 error = gfs2_dinode_dealloc(ip);
423
424out_unlock:
425 gfs2_glock_dq(&ip->i_iopen_gh);
426out_uninit:
427 gfs2_holder_uninit(&ip->i_iopen_gh);
428 gfs2_glock_dq_uninit(&gh);
429 if (error)
430 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
431out:
432 truncate_inode_pages(&inode->i_data, 0);
433 clear_inode(inode);
434}
435
436
437
438static struct inode *gfs2_alloc_inode(struct super_block *sb)
439{
440 struct gfs2_sbd *sdp = sb->s_fs_info;
441 struct gfs2_inode *ip;
442
443 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
444 if (ip) {
445 ip->i_flags = 0;
446 ip->i_gl = NULL;
447 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
448 ip->i_last_pfault = jiffies;
449 }
450	return ip ? &ip->i_inode : NULL;
451}
452
453static void gfs2_destroy_inode(struct inode *inode)
454{
455 kmem_cache_free(gfs2_inode_cachep, inode);
456}
457
458struct super_operations gfs2_super_ops = {
459 .alloc_inode = gfs2_alloc_inode,
460 .destroy_inode = gfs2_destroy_inode,
461 .write_inode = gfs2_write_inode,
462 .delete_inode = gfs2_delete_inode,
463 .put_super = gfs2_put_super,
464 .write_super = gfs2_write_super,
465 .write_super_lockfs = gfs2_write_super_lockfs,
466 .unlockfs = gfs2_unlockfs,
467 .statfs = gfs2_statfs,
468 .remount_fs = gfs2_remount_fs,
469 .clear_inode = gfs2_clear_inode,
470 .show_options = gfs2_show_options,
471};
472
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..a15ccc276113
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13extern struct super_operations gfs2_super_ops;
14
15#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..875a769444a1
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,188 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "ops_vm.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29#include "util.h"
30
31static void pfault_be_greedy(struct gfs2_inode *ip)
32{
33 unsigned int time;
34
35 spin_lock(&ip->i_spin);
36 time = ip->i_greedy;
37 ip->i_last_pfault = jiffies;
38 spin_unlock(&ip->i_spin);
39
40 igrab(&ip->i_inode);
41 if (gfs2_glock_be_greedy(ip->i_gl, time))
42 iput(&ip->i_inode);
43}
44
45static struct page *gfs2_private_nopage(struct vm_area_struct *area,
46 unsigned long address, int *type)
47{
48 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
49 struct page *result;
50
51 set_bit(GIF_PAGED, &ip->i_flags);
52
53 result = filemap_nopage(area, address, type);
54
55 if (result && result != NOPAGE_OOM)
56 pfault_be_greedy(ip);
57
58 return result;
59}
60
61static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 unsigned long index = page->index;
65 uint64_t lblock = index << (PAGE_CACHE_SHIFT -
66 sdp->sd_sb.sb_bsize_shift);
67 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
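	/*
	 * E.g. with 4K pages and a 1K filesystem block size, sb_bsize_shift
	 * is 10, so each page is backed by PAGE_CACHE_SIZE >> 10 = 4
	 * filesystem blocks and page index n maps to logical block n << 2.
	 */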
68 struct gfs2_alloc *al;
69 unsigned int data_blocks, ind_blocks;
70 unsigned int x;
71 int error;
72
73 al = gfs2_alloc_get(ip);
74
75 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
76 if (error)
77 goto out;
78
79 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
80 if (error)
81 goto out_gunlock_q;
82
83 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
84
85 al->al_requested = data_blocks + ind_blocks;
86
87 error = gfs2_inplace_reserve(ip);
88 if (error)
89 goto out_gunlock_q;
90
91 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
92 ind_blocks + RES_DINODE +
93 RES_STATFS + RES_QUOTA, 0);
94 if (error)
95 goto out_ipres;
96
97 if (gfs2_is_stuffed(ip)) {
98 error = gfs2_unstuff_dinode(ip, NULL);
99 if (error)
100 goto out_trans;
101 }
102
103 for (x = 0; x < blocks; ) {
104 uint64_t dblock;
105 unsigned int extlen;
106 int new = 1;
107
108 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
109 if (error)
110 goto out_trans;
111
112 lblock += extlen;
113 x += extlen;
114 }
115
116 gfs2_assert_warn(sdp, al->al_alloced);
117
118 out_trans:
119 gfs2_trans_end(sdp);
120
121 out_ipres:
122 gfs2_inplace_release(ip);
123
124 out_gunlock_q:
125 gfs2_quota_unlock(ip);
126
127 out:
128 gfs2_alloc_put(ip);
129
130 return error;
131}
132
133static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
134 unsigned long address, int *type)
135{
136 struct file *file = area->vm_file;
137 struct gfs2_file *gf = file->private_data;
138 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
139 struct gfs2_holder i_gh;
140 struct page *result = NULL;
141 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
142 area->vm_pgoff;
143 int alloc_required;
144 int error;
145
146 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
147 if (error)
148 return NULL;
149
150 set_bit(GIF_PAGED, &ip->i_flags);
151 set_bit(GIF_SW_PAGED, &ip->i_flags);
152
153 error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
154 PAGE_CACHE_SIZE, &alloc_required);
155 if (error)
156 goto out;
157
158 set_bit(GFF_EXLOCK, &gf->f_flags);
159 result = filemap_nopage(area, address, type);
160 clear_bit(GFF_EXLOCK, &gf->f_flags);
161 if (!result || result == NOPAGE_OOM)
162 goto out;
163
164 if (alloc_required) {
165 error = alloc_page_backing(ip, result);
166 if (error) {
167 page_cache_release(result);
168 result = NULL;
169 goto out;
170 }
171 set_page_dirty(result);
172 }
173
174 pfault_be_greedy(ip);
175out:
176 gfs2_glock_dq_uninit(&i_gh);
177
178 return result;
179}
180
181struct vm_operations_struct gfs2_vm_ops_private = {
182 .nopage = gfs2_private_nopage,
183};
184
185struct vm_operations_struct gfs2_vm_ops_sharewrite = {
186 .nopage = gfs2_sharewrite_nopage,
187};
188
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..077cffcd4085
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13extern struct vm_operations_struct gfs2_vm_ops_private;
14extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
15
16#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..3ca65c37c354
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1286 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota-check
19 * program to be run after node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale", controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with infinitely many nodes of infinite bandwidth) to twice the user's limit. (In
30 * practice, the maximum overrun you see should be much less.) A "quota_scale"
31 * number greater than one makes quota syncs more frequent and reduces the
32 * maximum overrun. Numbers less than one (but greater than zero) make quota
33 * syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
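
/*
 * A worked example of the knobs above, using the formula implemented in
 * need_sync() below: with quota_scale = 1/1, a hard limit of 1000 blocks,
 * a cached cluster-wide value of 900 and four journals, a node whose
 * unsynced local change reaches +25 will sync, since
 * 900 + 25 * 4 * 1/1 = 1000 hits the limit; at +24 it will not.
 */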
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/gfs2_ondisk.h>
47
48#include "gfs2.h"
49#include "lm_interface.h"
50#include "incore.h"
51#include "bmap.h"
52#include "glock.h"
53#include "glops.h"
54#include "log.h"
55#include "lvb.h"
56#include "meta_io.h"
57#include "quota.h"
58#include "rgrp.h"
59#include "super.h"
60#include "trans.h"
61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h"
64#include "util.h"
65
66#define QUOTA_USER 1
67#define QUOTA_GROUP 0
68
69static uint64_t qd2offset(struct gfs2_quota_data *qd)
70{
71 uint64_t offset;
72
73 offset = 2 * (uint64_t)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
74 offset *= sizeof(struct gfs2_quota);
75
76 return offset;
77}
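
/*
 * The quota file therefore interleaves user and group records: user ID n
 * lives at offset 2n * sizeof(struct gfs2_quota) and group ID n at
 * (2n + 1) * sizeof(struct gfs2_quota), e.g. user 5 is record 10 and
 * group 5 is record 11.
 */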
78
79static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id,
80 struct gfs2_quota_data **qdp)
81{
82 struct gfs2_quota_data *qd;
83 int error;
84
85 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
86 if (!qd)
87 return -ENOMEM;
88
89 qd->qd_count = 1;
90 qd->qd_id = id;
91 if (user)
92 set_bit(QDF_USER, &qd->qd_flags);
93 qd->qd_slot = -1;
94
95 error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user,
96 &gfs2_quota_glops, CREATE, &qd->qd_gl);
97 if (error)
98 goto fail;
99
100 error = gfs2_lvb_hold(qd->qd_gl);
101 gfs2_glock_put(qd->qd_gl);
102 if (error)
103 goto fail;
104
105 *qdp = qd;
106
107 return 0;
108
109 fail:
110 kfree(qd);
111 return error;
112}
113
114static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
115 struct gfs2_quota_data **qdp)
116{
117 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
118 int error, found;
119
120 *qdp = NULL;
121
122 for (;;) {
123 found = 0;
124 spin_lock(&sdp->sd_quota_spin);
125 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
126 if (qd->qd_id == id &&
127 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
128 qd->qd_count++;
129 found = 1;
130 break;
131 }
132 }
133
134 if (!found)
135 qd = NULL;
136
137 if (!qd && new_qd) {
138 qd = new_qd;
139 list_add(&qd->qd_list, &sdp->sd_quota_list);
140 atomic_inc(&sdp->sd_quota_count);
141 new_qd = NULL;
142 }
143
144 spin_unlock(&sdp->sd_quota_spin);
145
146 if (qd || !create) {
147 if (new_qd) {
148 gfs2_lvb_unhold(new_qd->qd_gl);
149 kfree(new_qd);
150 }
151 *qdp = qd;
152 return 0;
153 }
154
155 error = qd_alloc(sdp, user, id, &new_qd);
156 if (error)
157 return error;
158 }
159}
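
/*
 * qd_get() is the classic optimistic-allocation loop: search the list
 * under the spinlock; if the ID is missing and nothing has been
 * preallocated yet, drop the lock (qd_alloc() may sleep under
 * GFP_KERNEL) and retry, since another thread may have inserted the
 * same ID in the meantime.  A preallocated entry that loses the race
 * has its LVB reference dropped and is freed.
 */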
160
161static void qd_hold(struct gfs2_quota_data *qd)
162{
163 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
164
165 spin_lock(&sdp->sd_quota_spin);
166 gfs2_assert(sdp, qd->qd_count);
167 qd->qd_count++;
168 spin_unlock(&sdp->sd_quota_spin);
169}
170
171static void qd_put(struct gfs2_quota_data *qd)
172{
173 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
174 spin_lock(&sdp->sd_quota_spin);
175 gfs2_assert(sdp, qd->qd_count);
176 if (!--qd->qd_count)
177 qd->qd_last_touched = jiffies;
178 spin_unlock(&sdp->sd_quota_spin);
179}
180
181static int slot_get(struct gfs2_quota_data *qd)
182{
183 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
184 unsigned int c, o = 0, b;
185 unsigned char byte = 0;
186
187 spin_lock(&sdp->sd_quota_spin);
188
189 if (qd->qd_slot_count++) {
190 spin_unlock(&sdp->sd_quota_spin);
191 return 0;
192 }
193
194 for (c = 0; c < sdp->sd_quota_chunks; c++)
195 for (o = 0; o < PAGE_SIZE; o++) {
196 byte = sdp->sd_quota_bitmap[c][o];
197 if (byte != 0xFF)
198 goto found;
199 }
200
201 goto fail;
202
203 found:
204 for (b = 0; b < 8; b++)
205 if (!(byte & (1 << b)))
206 break;
207 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
208
209 if (qd->qd_slot >= sdp->sd_quota_slots)
210 goto fail;
211
212 sdp->sd_quota_bitmap[c][o] |= 1 << b;
213
214 spin_unlock(&sdp->sd_quota_spin);
215
216 return 0;
217
218 fail:
219 qd->qd_slot_count--;
220 spin_unlock(&sdp->sd_quota_spin);
221 return -ENOSPC;
222}
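
/*
 * Slot numbers decompose onto the bitmap as
 * slot = chunk * (8 * PAGE_SIZE) + byte * 8 + bit, so with 4K pages each
 * chunk covers 32768 slots and e.g. slot 32770 is chunk 1, byte 0, bit 2.
 */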
223
224static void slot_hold(struct gfs2_quota_data *qd)
225{
226 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
227
228 spin_lock(&sdp->sd_quota_spin);
229 gfs2_assert(sdp, qd->qd_slot_count);
230 qd->qd_slot_count++;
231 spin_unlock(&sdp->sd_quota_spin);
232}
233
234static void slot_put(struct gfs2_quota_data *qd)
235{
236 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
237
238 spin_lock(&sdp->sd_quota_spin);
239 gfs2_assert(sdp, qd->qd_slot_count);
240 if (!--qd->qd_slot_count) {
241 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
242 qd->qd_slot = -1;
243 }
244 spin_unlock(&sdp->sd_quota_spin);
245}
246
247static int bh_get(struct gfs2_quota_data *qd)
248{
249 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
250 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
251 unsigned int block, offset;
252 uint64_t dblock;
253 int new = 0;
254 struct buffer_head *bh;
255 int error;
256 int boundary;
257
258 mutex_lock(&sdp->sd_quota_mutex);
259
260 if (qd->qd_bh_count++) {
261 mutex_unlock(&sdp->sd_quota_mutex);
262 return 0;
263 }
264
265 block = qd->qd_slot / sdp->sd_qc_per_block;
266	offset = qd->qd_slot % sdp->sd_qc_per_block;
267
268 error = gfs2_block_map(&ip->i_inode, block, &new, &dblock, &boundary);
269 if (error)
270 goto fail;
271 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
272 if (error)
273 goto fail;
274 error = -EIO;
275 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
276 goto fail_brelse;
277
278 qd->qd_bh = bh;
279 qd->qd_bh_qc = (struct gfs2_quota_change *)
280 (bh->b_data + sizeof(struct gfs2_meta_header) +
281 offset * sizeof(struct gfs2_quota_change));
282
283	mutex_unlock(&sdp->sd_quota_mutex);
284
285 return 0;
286
287 fail_brelse:
288 brelse(bh);
289
290 fail:
291 qd->qd_bh_count--;
292 mutex_unlock(&sdp->sd_quota_mutex);
293 return error;
294}
295
296static void bh_put(struct gfs2_quota_data *qd)
297{
298 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
299
300 mutex_lock(&sdp->sd_quota_mutex);
301 gfs2_assert(sdp, qd->qd_bh_count);
302 if (!--qd->qd_bh_count) {
303 brelse(qd->qd_bh);
304 qd->qd_bh = NULL;
305 qd->qd_bh_qc = NULL;
306 }
307 mutex_unlock(&sdp->sd_quota_mutex);
308}
309
310static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
311{
312 struct gfs2_quota_data *qd = NULL;
313 int error;
314 int found = 0;
315
316 *qdp = NULL;
317
318 if (sdp->sd_vfs->s_flags & MS_RDONLY)
319 return 0;
320
321 spin_lock(&sdp->sd_quota_spin);
322
323 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
324 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
325 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
326 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
327 continue;
328
329 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
330
331 set_bit(QDF_LOCKED, &qd->qd_flags);
332 gfs2_assert_warn(sdp, qd->qd_count);
333 qd->qd_count++;
334 qd->qd_change_sync = qd->qd_change;
335 gfs2_assert_warn(sdp, qd->qd_slot_count);
336 qd->qd_slot_count++;
337 found = 1;
338
339 break;
340 }
341
342 if (!found)
343 qd = NULL;
344
345 spin_unlock(&sdp->sd_quota_spin);
346
347 if (qd) {
348 gfs2_assert_warn(sdp, qd->qd_change_sync);
349 error = bh_get(qd);
350 if (error) {
351 clear_bit(QDF_LOCKED, &qd->qd_flags);
352 slot_put(qd);
353 qd_put(qd);
354 return error;
355 }
356 }
357
358 *qdp = qd;
359
360 return 0;
361}
362
363static int qd_trylock(struct gfs2_quota_data *qd)
364{
365 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
366
367 if (sdp->sd_vfs->s_flags & MS_RDONLY)
368 return 0;
369
370 spin_lock(&sdp->sd_quota_spin);
371
372 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
373 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
374 spin_unlock(&sdp->sd_quota_spin);
375 return 0;
376 }
377
378 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
379
380 set_bit(QDF_LOCKED, &qd->qd_flags);
381 gfs2_assert_warn(sdp, qd->qd_count);
382 qd->qd_count++;
383 qd->qd_change_sync = qd->qd_change;
384 gfs2_assert_warn(sdp, qd->qd_slot_count);
385 qd->qd_slot_count++;
386
387 spin_unlock(&sdp->sd_quota_spin);
388
389 gfs2_assert_warn(sdp, qd->qd_change_sync);
390 if (bh_get(qd)) {
391 clear_bit(QDF_LOCKED, &qd->qd_flags);
392 slot_put(qd);
393 qd_put(qd);
394 return 0;
395 }
396
397 return 1;
398}
399
400static void qd_unlock(struct gfs2_quota_data *qd)
401{
402 gfs2_assert_warn(qd->qd_gl->gl_sbd,
403 test_bit(QDF_LOCKED, &qd->qd_flags));
404 clear_bit(QDF_LOCKED, &qd->qd_flags);
405 bh_put(qd);
406 slot_put(qd);
407 qd_put(qd);
408}
409
410static int qdsb_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
411 struct gfs2_quota_data **qdp)
412{
413 int error;
414
415 error = qd_get(sdp, user, id, create, qdp);
416 if (error)
417 return error;
418
419 error = slot_get(*qdp);
420 if (error)
421 goto fail;
422
423 error = bh_get(*qdp);
424 if (error)
425 goto fail_slot;
426
427 return 0;
428
429 fail_slot:
430 slot_put(*qdp);
431
432 fail:
433 qd_put(*qdp);
434 return error;
435}
436
437static void qdsb_put(struct gfs2_quota_data *qd)
438{
439 bh_put(qd);
440 slot_put(qd);
441 qd_put(qd);
442}
443
444int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
445{
446 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
447 struct gfs2_alloc *al = &ip->i_alloc;
448 struct gfs2_quota_data **qd = al->al_qd;
449 int error;
450
451 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
452 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
453 return -EIO;
454
455 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
456 return 0;
457
458 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
459 if (error)
460 goto out;
461 al->al_qd_num++;
462 qd++;
463
464 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
465 if (error)
466 goto out;
467 al->al_qd_num++;
468 qd++;
469
470 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
471 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
472 if (error)
473 goto out;
474 al->al_qd_num++;
475 qd++;
476 }
477
478 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
479 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
480 if (error)
481 goto out;
482 al->al_qd_num++;
483 qd++;
484 }
485
486 out:
487 if (error)
488 gfs2_quota_unhold(ip);
489
490 return error;
491}
492
493void gfs2_quota_unhold(struct gfs2_inode *ip)
494{
495 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
496 struct gfs2_alloc *al = &ip->i_alloc;
497 unsigned int x;
498
499 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
500
501 for (x = 0; x < al->al_qd_num; x++) {
502 qdsb_put(al->al_qd[x]);
503 al->al_qd[x] = NULL;
504 }
505 al->al_qd_num = 0;
506}
507
508static int sort_qd(const void *a, const void *b)
509{
510 struct gfs2_quota_data *qd_a = *(struct gfs2_quota_data **)a;
511 struct gfs2_quota_data *qd_b = *(struct gfs2_quota_data **)b;
512 int ret = 0;
513
514 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
515 !test_bit(QDF_USER, &qd_b->qd_flags)) {
516 if (test_bit(QDF_USER, &qd_a->qd_flags))
517 ret = -1;
518 else
519 ret = 1;
520 } else {
521 if (qd_a->qd_id < qd_b->qd_id)
522 ret = -1;
523 else if (qd_a->qd_id > qd_b->qd_id)
524 ret = 1;
525 }
526
527 return ret;
528}
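
/*
 * The total order defined by sort_qd() (all user IDs before all group
 * IDs, ascending within each class) gives every holder the same glock
 * acquisition order in do_sync() and gfs2_quota_lock() below, which is
 * what prevents AB-BA deadlocks between two holders locking overlapping
 * sets of quota IDs.
 */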
529
530static void do_qc(struct gfs2_quota_data *qd, int64_t change)
531{
532 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
533 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
534 struct gfs2_quota_change *qc = qd->qd_bh_qc;
535 int64_t x;
536
537 mutex_lock(&sdp->sd_quota_mutex);
538 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
539
540 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
541 qc->qc_change = 0;
542 qc->qc_flags = 0;
543 if (test_bit(QDF_USER, &qd->qd_flags))
544 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
545 qc->qc_id = cpu_to_be32(qd->qd_id);
546 }
547
548 x = qc->qc_change;
549 x = be64_to_cpu(x) + change;
550 qc->qc_change = cpu_to_be64(x);
551
552 spin_lock(&sdp->sd_quota_spin);
553 qd->qd_change = x;
554 spin_unlock(&sdp->sd_quota_spin);
555
556 if (!x) {
557 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
558 clear_bit(QDF_CHANGE, &qd->qd_flags);
559 qc->qc_flags = 0;
560 qc->qc_id = 0;
561 slot_put(qd);
562 qd_put(qd);
563 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
564 qd_hold(qd);
565 slot_hold(qd);
566 }
567
568 mutex_unlock(&sdp->sd_quota_mutex);
569}
570
571/**
572 * gfs2_adjust_quota
573 *
574 * This function was mostly borrowed from gfs2_block_truncate_page which was
575 * in turn mostly borrowed from ext3.
576 */
577static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
578 int64_t change, struct gfs2_quota_data *qd)
579{
580 struct inode *inode = &ip->i_inode;
581 struct address_space *mapping = inode->i_mapping;
582 unsigned long index = loc >> PAGE_CACHE_SHIFT;
583	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
584 unsigned blocksize, iblock, pos;
585 struct buffer_head *bh;
586 struct page *page;
587 void *kaddr;
588 __be64 *ptr;
589 u64 value;
590 int err = -EIO;
591
592 page = grab_cache_page(mapping, index);
593 if (!page)
594 return -ENOMEM;
595
596 blocksize = inode->i_sb->s_blocksize;
597 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
598
599 if (!page_has_buffers(page))
600 create_empty_buffers(page, blocksize, 0);
601
602 bh = page_buffers(page);
603 pos = blocksize;
604 while (offset >= pos) {
605 bh = bh->b_this_page;
606 iblock++;
607 pos += blocksize;
608 }
609
610 if (!buffer_mapped(bh)) {
611 gfs2_get_block(inode, iblock, bh, 1);
612 if (!buffer_mapped(bh))
613 goto unlock;
614 }
615
616 if (PageUptodate(page))
617 set_buffer_uptodate(bh);
618
619 if (!buffer_uptodate(bh)) {
620 ll_rw_block(READ, 1, &bh);
621 wait_on_buffer(bh);
622 if (!buffer_uptodate(bh))
623 goto unlock;
624 }
625
626 gfs2_trans_add_bh(ip->i_gl, bh, 0);
627
628 kaddr = kmap_atomic(page, KM_USER0);
629 ptr = (__be64 *)(kaddr + offset);
630 value = *ptr = cpu_to_be64(be64_to_cpu(*ptr) + change);
631 flush_dcache_page(page);
632 kunmap_atomic(kaddr, KM_USER0);
633 err = 0;
634 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
635#if 0
636 qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit);
637 qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn);
638#endif
639 qd->qd_qb.qb_value = cpu_to_be64(value);
640unlock:
641 unlock_page(page);
642 page_cache_release(page);
643 return err;
644}
645
646static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
647{
648 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
649 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
650 unsigned int data_blocks, ind_blocks;
651 struct gfs2_holder *ghs, i_gh;
652 unsigned int qx, x;
653 struct gfs2_quota_data *qd;
654 loff_t offset;
655 unsigned int nalloc = 0;
656 struct gfs2_alloc *al = NULL;
657 int error;
658
659 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
660 &data_blocks, &ind_blocks);
661
662 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
663 if (!ghs)
664 return -ENOMEM;
665
666 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
667 for (qx = 0; qx < num_qd; qx++) {
668 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
669 LM_ST_EXCLUSIVE,
670 GL_NOCACHE, &ghs[qx]);
671 if (error)
672 goto out;
673 }
674
675 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
676 if (error)
677 goto out;
678
679 for (x = 0; x < num_qd; x++) {
680 int alloc_required;
681
682 offset = qd2offset(qda[x]);
683 error = gfs2_write_alloc_required(ip, offset,
684 sizeof(struct gfs2_quota),
685 &alloc_required);
686 if (error)
687 goto out_gunlock;
688 if (alloc_required)
689 nalloc++;
690 }
691
692 if (nalloc) {
693 al = gfs2_alloc_get(ip);
694
695 al->al_requested = nalloc * (data_blocks + ind_blocks);
696
697 error = gfs2_inplace_reserve(ip);
698 if (error)
699 goto out_alloc;
700
701 error = gfs2_trans_begin(sdp,
702 al->al_rgd->rd_ri.ri_length +
703 num_qd * data_blocks +
704 nalloc * ind_blocks +
705 RES_DINODE + num_qd +
706 RES_STATFS, 0);
707 if (error)
708 goto out_ipres;
709 } else {
710 error = gfs2_trans_begin(sdp,
711 num_qd * data_blocks +
712 RES_DINODE + num_qd, 0);
713 if (error)
714 goto out_gunlock;
715 }
716
717 for (x = 0; x < num_qd; x++) {
718 qd = qda[x];
719 offset = qd2offset(qd);
720 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
721 (struct gfs2_quota_data *)
722 qd->qd_gl->gl_lvb);
723 if (error)
724 goto out_end_trans;
725
726 do_qc(qd, -qd->qd_change_sync);
727 }
728
729 error = 0;
730
731 out_end_trans:
732 gfs2_trans_end(sdp);
733
734 out_ipres:
735 if (nalloc)
736 gfs2_inplace_release(ip);
737
738 out_alloc:
739 if (nalloc)
740 gfs2_alloc_put(ip);
741
742 out_gunlock:
743 gfs2_glock_dq_uninit(&i_gh);
744
745 out:
746 while (qx--)
747 gfs2_glock_dq_uninit(&ghs[qx]);
748 kfree(ghs);
749 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
750
751 return error;
752}
753
754static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
755 struct gfs2_holder *q_gh)
756{
757 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
758 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
759 struct gfs2_holder i_gh;
760 struct gfs2_quota q;
761 char buf[sizeof(struct gfs2_quota)];
762 struct file_ra_state ra_state;
763 int error;
764
765 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
766 restart:
767 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
768 if (error)
769 return error;
770
771 gfs2_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);
772
773 if (force_refresh || qd->qd_qb.qb_magic != GFS2_MAGIC) {
774 loff_t pos;
775 gfs2_glock_dq_uninit(q_gh);
776 error = gfs2_glock_nq_init(qd->qd_gl,
777 LM_ST_EXCLUSIVE, GL_NOCACHE,
778 q_gh);
779 if (error)
780 return error;
781
782 error = gfs2_glock_nq_init(ip->i_gl,
783 LM_ST_SHARED, 0,
784 &i_gh);
785 if (error)
786 goto fail;
787
788 memset(buf, 0, sizeof(struct gfs2_quota));
789 pos = qd2offset(qd);
790 error = gfs2_internal_read(ip, &ra_state, buf,
791 &pos, sizeof(struct gfs2_quota));
792 if (error < 0)
793 goto fail_gunlock;
794
795 gfs2_glock_dq_uninit(&i_gh);
796
797 gfs2_quota_in(&q, buf);
798
799 memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb));
800 qd->qd_qb.qb_magic = GFS2_MAGIC;
801 qd->qd_qb.qb_limit = q.qu_limit;
802 qd->qd_qb.qb_warn = q.qu_warn;
803 qd->qd_qb.qb_value = q.qu_value;
804
805 gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);
806
807 if (gfs2_glock_is_blocking(qd->qd_gl)) {
808 gfs2_glock_dq_uninit(q_gh);
809 force_refresh = 0;
810 goto restart;
811 }
812 }
813
814 return 0;
815
816 fail_gunlock:
817 gfs2_glock_dq_uninit(&i_gh);
818
819 fail:
820 gfs2_glock_dq_uninit(q_gh);
821
822 return error;
823}
824
825int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
826{
827 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
828 struct gfs2_alloc *al = &ip->i_alloc;
829 unsigned int x;
830 int error = 0;
831
832	error = gfs2_quota_hold(ip, uid, gid);
	if (error)
		return error;
833
834 if (capable(CAP_SYS_RESOURCE) ||
835 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
836 return 0;
837
838 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
839 sort_qd, NULL);
840
841 for (x = 0; x < al->al_qd_num; x++) {
842 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
843 if (error)
844 break;
845 }
846
847 if (!error)
848 set_bit(GIF_QD_LOCKED, &ip->i_flags);
849 else {
850 while (x--)
851 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
852 gfs2_quota_unhold(ip);
853 }
854
855 return error;
856}
857
858static int need_sync(struct gfs2_quota_data *qd)
859{
860 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
861 struct gfs2_tune *gt = &sdp->sd_tune;
862 int64_t value;
863 unsigned int num, den;
864 int do_sync = 1;
865
866 if (!qd->qd_qb.qb_limit)
867 return 0;
868
869 spin_lock(&sdp->sd_quota_spin);
870 value = qd->qd_change;
871 spin_unlock(&sdp->sd_quota_spin);
872
873 spin_lock(&gt->gt_spin);
874 num = gt->gt_quota_scale_num;
875 den = gt->gt_quota_scale_den;
876 spin_unlock(&gt->gt_spin);
877
878 if (value < 0)
879 do_sync = 0;
880 else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
881 do_sync = 0;
882 else {
883 value *= gfs2_jindex_size(sdp) * num;
884 do_div(value, den);
885 value += qd->qd_qb.qb_value;
886 if (value < (int64_t)qd->qd_qb.qb_limit)
887 do_sync = 0;
888 }
889
890 return do_sync;
891}
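
/*
 * A minimal sketch of the arithmetic above (the helper name and flat
 * integer parameters are illustrative, not part of this file).  The
 * local delta is extrapolated to the whole cluster before being tested
 * against the hard limit:
 *
 *	static int would_sync(s64 change, s64 cached, s64 limit,
 *			      unsigned int journals,
 *			      unsigned int num, unsigned int den)
 *	{
 *		if (!limit || change < 0 || cached >= limit)
 *			return 0;
 *		return cached + change * journals * num / den >= limit;
 *	}
 *
 * The closer an ID sits to its limit, the sooner this trips and the
 * more often its changes get pushed to the quota file.
 */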
892
893void gfs2_quota_unlock(struct gfs2_inode *ip)
894{
895 struct gfs2_alloc *al = &ip->i_alloc;
896 struct gfs2_quota_data *qda[4];
897 unsigned int count = 0;
898 unsigned int x;
899
900 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
901 goto out;
902
903 for (x = 0; x < al->al_qd_num; x++) {
904 struct gfs2_quota_data *qd;
905 int sync;
906
907 qd = al->al_qd[x];
908 sync = need_sync(qd);
909
910 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
911
912 if (sync && qd_trylock(qd))
913 qda[count++] = qd;
914 }
915
916 if (count) {
917 do_sync(count, qda);
918 for (x = 0; x < count; x++)
919 qd_unlock(qda[x]);
920 }
921
922 out:
923 gfs2_quota_unhold(ip);
924}
925
926#define MAX_LINE 256
927
928static int print_message(struct gfs2_quota_data *qd, char *type)
929{
930 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
931
932	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
933 sdp->sd_fsname, type,
934 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
935 qd->qd_id);
936
937 return 0;
938}
939
940int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
941{
942 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
943 struct gfs2_alloc *al = &ip->i_alloc;
944 struct gfs2_quota_data *qd;
945 int64_t value;
946 unsigned int x;
947 int error = 0;
948
949 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
950 return 0;
951
952 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
953 return 0;
954
955 for (x = 0; x < al->al_qd_num; x++) {
956 qd = al->al_qd[x];
957
958 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
959 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
960 continue;
961
962 value = qd->qd_qb.qb_value;
963 spin_lock(&sdp->sd_quota_spin);
964 value += qd->qd_change;
965 spin_unlock(&sdp->sd_quota_spin);
966
967 if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
968 print_message(qd, "exceeded");
969 error = -EDQUOT;
970 break;
971 } else if (qd->qd_qb.qb_warn &&
972 (int64_t)qd->qd_qb.qb_warn < value &&
973 time_after_eq(jiffies, qd->qd_last_warn +
974 gfs2_tune_get(sdp,
975 gt_quota_warn_period) * HZ)) {
976 error = print_message(qd, "warning");
977 qd->qd_last_warn = jiffies;
978 }
979 }
980
981 return error;
982}
983
984void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
985 uint32_t uid, uint32_t gid)
986{
987 struct gfs2_alloc *al = &ip->i_alloc;
988 struct gfs2_quota_data *qd;
989 unsigned int x;
990 unsigned int found = 0;
991
992 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
993 return;
994 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
995 return;
996
997 for (x = 0; x < al->al_qd_num; x++) {
998 qd = al->al_qd[x];
999
1000 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1001 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1002 do_qc(qd, change);
1003 found++;
1004 }
1005 }
1006}
1007
1008int gfs2_quota_sync(struct gfs2_sbd *sdp)
1009{
1010 struct gfs2_quota_data **qda;
1011 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
1012 unsigned int num_qd;
1013 unsigned int x;
1014 int error = 0;
1015
1016 sdp->sd_quota_sync_gen++;
1017
1018 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1019 if (!qda)
1020 return -ENOMEM;
1021
1022 do {
1023 num_qd = 0;
1024
1025 for (;;) {
1026 error = qd_fish(sdp, qda + num_qd);
1027 if (error || !qda[num_qd])
1028 break;
1029 if (++num_qd == max_qd)
1030 break;
1031 }
1032
1033 if (num_qd) {
1034 if (!error)
1035 error = do_sync(num_qd, qda);
1036 if (!error)
1037 for (x = 0; x < num_qd; x++)
1038 qda[x]->qd_sync_gen =
1039 sdp->sd_quota_sync_gen;
1040
1041 for (x = 0; x < num_qd; x++)
1042 qd_unlock(qda[x]);
1043 }
1044 } while (!error && num_qd == max_qd);
1045
1046 kfree(qda);
1047
1048 return error;
1049}
1050
1051int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id)
1052{
1053 struct gfs2_quota_data *qd;
1054 struct gfs2_holder q_gh;
1055 int error;
1056
1057 error = qd_get(sdp, user, id, CREATE, &qd);
1058 if (error)
1059 return error;
1060
1061 error = do_glock(qd, FORCE, &q_gh);
1062 if (!error)
1063 gfs2_glock_dq_uninit(&q_gh);
1064
1065 qd_put(qd);
1066
1067 return error;
1068}
1069
1070#if 0
1071int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
1072 struct gfs2_quota *q)
1073{
1074 struct gfs2_quota_data *qd;
1075 struct gfs2_holder q_gh;
1076 int error;
1077
1078 if (((user) ? (id != current->fsuid) : (!in_group_p(id))) &&
1079 !capable(CAP_SYS_ADMIN))
1080 return -EACCES;
1081
1082 error = qd_get(sdp, user, id, CREATE, &qd);
1083 if (error)
1084 return error;
1085
1086 error = do_glock(qd, NO_FORCE, &q_gh);
1087 if (error)
1088 goto out;
1089
1090 memset(q, 0, sizeof(struct gfs2_quota));
1091 q->qu_limit = qd->qd_qb.qb_limit;
1092 q->qu_warn = qd->qd_qb.qb_warn;
1093 q->qu_value = qd->qd_qb.qb_value;
1094
1095 spin_lock(&sdp->sd_quota_spin);
1096 q->qu_value += qd->qd_change;
1097 spin_unlock(&sdp->sd_quota_spin);
1098
1099 gfs2_glock_dq_uninit(&q_gh);
1100
1101 out:
1102 qd_put(qd);
1103
1104 return error;
1105}
1106#endif /* 0 */
1107
1108int gfs2_quota_init(struct gfs2_sbd *sdp)
1109{
1110 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1111 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1112 unsigned int x, slot = 0;
1113 unsigned int found = 0;
1114 uint64_t dblock;
1115 uint32_t extlen = 0;
1116 int error;
1117
1118 if (!ip->i_di.di_size ||
1119 ip->i_di.di_size > (64 << 20) ||
1120 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1121 gfs2_consist_inode(ip);
1122 return -EIO;
1123 }
1124 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1125 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1126
1127 error = -ENOMEM;
1128
1129 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1130 sizeof(unsigned char *), GFP_KERNEL);
1131 if (!sdp->sd_quota_bitmap)
1132 return error;
1133
1134 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1135 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1136 if (!sdp->sd_quota_bitmap[x])
1137 goto fail;
1138 }
1139
1140 for (x = 0; x < blocks; x++) {
1141 struct buffer_head *bh;
1142 unsigned int y;
1143
1144 if (!extlen) {
1145 int new = 0;
1146 error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
1147 if (error)
1148 goto fail;
1149 }
1150 gfs2_meta_ra(ip->i_gl, dblock, extlen);
1151 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
1152 &bh);
1153 if (error)
1154 goto fail;
1155 error = -EIO;
1156 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1157 brelse(bh);
1158 goto fail;
1159 }
1160
1161 for (y = 0;
1162 y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1163 y++, slot++) {
1164 struct gfs2_quota_change qc;
1165 struct gfs2_quota_data *qd;
1166
1167 gfs2_quota_change_in(&qc, bh->b_data +
1168 sizeof(struct gfs2_meta_header) +
1169 y * sizeof(struct gfs2_quota_change));
1170 if (!qc.qc_change)
1171 continue;
1172
1173 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1174 qc.qc_id, &qd);
1175 if (error) {
1176 brelse(bh);
1177 goto fail;
1178 }
1179
1180 set_bit(QDF_CHANGE, &qd->qd_flags);
1181 qd->qd_change = qc.qc_change;
1182 qd->qd_slot = slot;
1183 qd->qd_slot_count = 1;
1184 qd->qd_last_touched = jiffies;
1185
1186 spin_lock(&sdp->sd_quota_spin);
1187 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1188 list_add(&qd->qd_list, &sdp->sd_quota_list);
1189 atomic_inc(&sdp->sd_quota_count);
1190 spin_unlock(&sdp->sd_quota_spin);
1191
1192 found++;
1193 }
1194
1195 brelse(bh);
1196 dblock++;
1197 extlen--;
1198 }
1199
1200 if (found)
1201 fs_info(sdp, "found %u quota changes\n", found);
1202
1203 return 0;
1204
1205 fail:
1206 gfs2_quota_cleanup(sdp);
1207 return error;
1208}
1209
1210void gfs2_quota_scan(struct gfs2_sbd *sdp)
1211{
1212 struct gfs2_quota_data *qd, *safe;
1213 LIST_HEAD(dead);
1214
1215 spin_lock(&sdp->sd_quota_spin);
1216 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1217 if (!qd->qd_count &&
1218 time_after_eq(jiffies, qd->qd_last_touched +
1219 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1220 list_move(&qd->qd_list, &dead);
1221 gfs2_assert_warn(sdp,
1222 atomic_read(&sdp->sd_quota_count) > 0);
1223 atomic_dec(&sdp->sd_quota_count);
1224 }
1225 }
1226 spin_unlock(&sdp->sd_quota_spin);
1227
1228 while (!list_empty(&dead)) {
1229 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1230 list_del(&qd->qd_list);
1231
1232 gfs2_assert_warn(sdp, !qd->qd_change);
1233 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1234 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1235
1236 gfs2_lvb_unhold(qd->qd_gl);
1237 kfree(qd);
1238 }
1239}
1240
1241void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1242{
1243 struct list_head *head = &sdp->sd_quota_list;
1244 struct gfs2_quota_data *qd;
1245 unsigned int x;
1246
1247 spin_lock(&sdp->sd_quota_spin);
1248 while (!list_empty(head)) {
1249 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1250
1251 if (qd->qd_count > 1 ||
1252 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1253 list_move(&qd->qd_list, head);
1254 spin_unlock(&sdp->sd_quota_spin);
1255 schedule();
1256 spin_lock(&sdp->sd_quota_spin);
1257 continue;
1258 }
1259
1260 list_del(&qd->qd_list);
1261 atomic_dec(&sdp->sd_quota_count);
1262 spin_unlock(&sdp->sd_quota_spin);
1263
1264 if (!qd->qd_count) {
1265 gfs2_assert_warn(sdp, !qd->qd_change);
1266 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1267 } else
1268 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1269 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1270
1271 gfs2_lvb_unhold(qd->qd_gl);
1272 kfree(qd);
1273
1274 spin_lock(&sdp->sd_quota_spin);
1275 }
1276 spin_unlock(&sdp->sd_quota_spin);
1277
1278 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1279
1280 if (sdp->sd_quota_bitmap) {
1281 for (x = 0; x < sdp->sd_quota_chunks; x++)
1282 kfree(sdp->sd_quota_bitmap[x]);
1283 kfree(sdp->sd_quota_bitmap);
1284 }
1285}
1286
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..af05492f9644
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13#define NO_QUOTA_CHANGE ((uint32_t)-1)
14
15int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
16void gfs2_quota_unhold(struct gfs2_inode *ip);
17
18int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
19void gfs2_quota_unlock(struct gfs2_inode *ip);
20
21int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
22void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
23 uint32_t uid, uint32_t gid);
24
25int gfs2_quota_sync(struct gfs2_sbd *sdp);
26int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id);
27
28int gfs2_quota_init(struct gfs2_sbd *sdp);
29void gfs2_quota_scan(struct gfs2_sbd *sdp);
30void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
31
32#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..8fe518cfb3de
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,573 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "lm.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30#include "dir.h"
31
32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
33 struct buffer_head **bh)
34{
35 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
36 struct gfs2_glock *gl = ip->i_gl;
37 int new = 0;
38 uint64_t dblock;
39 uint32_t extlen;
40 int error;
41
42 error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
43 if (error)
44 return error;
45 if (!dblock) {
46 gfs2_consist_inode(ip);
47 return -EIO;
48 }
49
50 gfs2_meta_ra(gl, dblock, extlen);
51 error = gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bh);
52
53 return error;
54}
55
56int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
57{
58 struct list_head *head = &sdp->sd_revoke_list;
59 struct gfs2_revoke_replay *rr;
60 int found = 0;
61
62 list_for_each_entry(rr, head, rr_list) {
63 if (rr->rr_blkno == blkno) {
64 found = 1;
65 break;
66 }
67 }
68
69 if (found) {
70 rr->rr_where = where;
71 return 0;
72 }
73
74 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
75 if (!rr)
76 return -ENOMEM;
77
78 rr->rr_blkno = blkno;
79 rr->rr_where = where;
80 list_add(&rr->rr_list, head);
81
82 return 1;
83}
84
85int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
86{
87 struct gfs2_revoke_replay *rr;
88 int wrap, a, b, revoke;
89 int found = 0;
90
91 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
92 if (rr->rr_blkno == blkno) {
93 found = 1;
94 break;
95 }
96 }
97
98 if (!found)
99 return 0;
100
101 wrap = (rr->rr_where < sdp->sd_replay_tail);
102 a = (sdp->sd_replay_tail < where);
103 b = (where < rr->rr_where);
104 revoke = (wrap) ? (a || b) : (a && b);
105
106 return revoke;
107}
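
/*
 * The wrap/a/b test treats the journal as a circle: a block must be
 * revoked (i.e. not replayed) iff its revoke entry lies later in the
 * active region of the log than the descriptor being replayed.  E.g.
 * with sd_replay_tail = 100, a descriptor at where = 120 and a revoke
 * recorded at rr_where = 130: wrap = 0, a = (100 < 120) = 1,
 * b = (120 < 130) = 1, so revoke = a && b = 1 and the block is skipped.
 */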
108
109void gfs2_revoke_clean(struct gfs2_sbd *sdp)
110{
111 struct list_head *head = &sdp->sd_revoke_list;
112 struct gfs2_revoke_replay *rr;
113
114 while (!list_empty(head)) {
115 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
116 list_del(&rr->rr_list);
117 kfree(rr);
118 }
119}
120
121/**
122 * get_log_header - read the log header for a given segment
123 * @jd: the journal
124 * @blk: the block to look at
125 * @head: the log header to return
126 *
127 * Read the log header for a given segment in a given journal. Do a few
128 * sanity checks on it.
129 *
130 * Returns: 0 on success,
131 * 1 if the header was invalid or incomplete,
132 * errno on error
133 */
134
135static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
136 struct gfs2_log_header *head)
137{
138 struct buffer_head *bh;
139 struct gfs2_log_header lh;
140 uint32_t hash;
141 int error;
142
143 error = gfs2_replay_read_block(jd, blk, &bh);
144 if (error)
145 return error;
146
147 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
148 lh.lh_hash = 0;
149 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
150 gfs2_log_header_in(&lh, bh->b_data);
151
152 brelse(bh);
153
154 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
155 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
156 lh.lh_blkno != blk || lh.lh_hash != hash)
157 return 1;
158
159 *head = lh;
160
161 return 0;
162}
163
164/**
165 * find_good_lh - find a good log header
166 * @jd: the journal
167 * @blk: the segment to start searching from
168 * @head: the log header to fill in
170 *
171 * Call get_log_header() to get a log header for a segment, but if the
172 * segment is bad, either scan forward or backward until we find a good one.
173 *
174 * Returns: errno
175 */
176
177static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
178 struct gfs2_log_header *head)
179{
180 unsigned int orig_blk = *blk;
181 int error;
182
183 for (;;) {
184 error = get_log_header(jd, *blk, head);
185 if (error <= 0)
186 return error;
187
188 if (++*blk == jd->jd_blocks)
189 *blk = 0;
190
191 if (*blk == orig_blk) {
192 gfs2_consist_inode(GFS2_I(jd->jd_inode));
193 return -EIO;
194 }
195 }
196}
197
198/**
199 * jhead_scan - make sure we've found the head of the log
200 * @jd: the journal
201 * @head: this is filled in with the log descriptor of the head
202 *
203 * At this point, head should point to either the head of the log or just
204 * before it. Scan forward until we find the true head.
205 *
206 * Returns: errno
207 */
208
209static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
210{
211 unsigned int blk = head->lh_blkno;
212 struct gfs2_log_header lh;
213 int error;
214
215 for (;;) {
216 if (++blk == jd->jd_blocks)
217 blk = 0;
218
219 error = get_log_header(jd, blk, &lh);
220 if (error < 0)
221 return error;
222 if (error == 1)
223 continue;
224
225 if (lh.lh_sequence == head->lh_sequence) {
226 gfs2_consist_inode(GFS2_I(jd->jd_inode));
227 return -EIO;
228 }
229 if (lh.lh_sequence < head->lh_sequence)
230 break;
231
232 *head = lh;
233 }
234
235 return 0;
236}
237
238/**
239 * gfs2_find_jhead - find the head of a log
240 * @jd: the journal
241 * @head: the log descriptor for the head of the log is returned here
242 *
243 * Do a binary search of a journal and find the valid log entry with the
244 * highest sequence number (i.e. the log head).
245 *
246 * Returns: errno
247 */
248
249int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
250{
251 struct gfs2_log_header lh_1, lh_m;
252 uint32_t blk_1, blk_2, blk_m;
253 int error;
254
255 blk_1 = 0;
256 blk_2 = jd->jd_blocks - 1;
257
258 for (;;) {
259 blk_m = (blk_1 + blk_2) / 2;
260
261 error = find_good_lh(jd, &blk_1, &lh_1);
262 if (error)
263 return error;
264
265 error = find_good_lh(jd, &blk_m, &lh_m);
266 if (error)
267 return error;
268
269 if (blk_1 == blk_m || blk_m == blk_2)
270 break;
271
272 if (lh_1.lh_sequence <= lh_m.lh_sequence)
273 blk_1 = blk_m;
274 else
275 blk_2 = blk_m;
276 }
277
278 error = jhead_scan(jd, &lh_1);
279 if (error)
280 return error;
281
282 *head = lh_1;
283
284 return error;
285}
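
/*
 * The binary search relies on the journal being a circular buffer whose
 * sequence numbers increase from the tail to the head and then drop at
 * the wrap point.  blk_1 is kept inside the ascending run that contains
 * the head: while the midpoint's sequence is still >= blk_1's, the wrap
 * (and hence the head) lies at or beyond blk_m, so the search moves to
 * [blk_m, blk_2]; otherwise it moves to [blk_1, blk_m].  jhead_scan()
 * then walks forward over any remaining entries.
 */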
286
287/**
288 * foreach_descriptor - go through the active part of the log
289 * @jd: the journal
290 * @start: the first log header in the active region
291 * @end: the last log header (don't process the contents of this entry)
292 *
293 * Call a given function once for every log descriptor in the active
294 * portion of the log.
295 *
296 * Returns: errno
297 */
298
299static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
300 unsigned int end, int pass)
301{
302 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
303 struct buffer_head *bh;
304 struct gfs2_log_descriptor *ld;
305 int error = 0;
306 u32 length;
307 __be64 *ptr;
308 unsigned int offset = sizeof(struct gfs2_log_descriptor);
309 offset += (sizeof(__be64)-1);
310 offset &= ~(sizeof(__be64)-1);
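	/*
	 * The descriptor header is rounded up to the next __be64 boundary
	 * because the array of 64-bit block numbers that follows it in the
	 * log block is aligned, not packed.
	 */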
311
312 while (start != end) {
313 error = gfs2_replay_read_block(jd, start, &bh);
314 if (error)
315 return error;
316 if (gfs2_meta_check(sdp, bh)) {
317 brelse(bh);
318 return -EIO;
319 }
320 ld = (struct gfs2_log_descriptor *)bh->b_data;
321 length = be32_to_cpu(ld->ld_length);
322
323 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
324 struct gfs2_log_header lh;
325 error = get_log_header(jd, start, &lh);
326 if (!error) {
327 gfs2_replay_incr_blk(sdp, &start);
328 brelse(bh);
329 continue;
330 }
331 if (error == 1) {
332 gfs2_consist_inode(GFS2_I(jd->jd_inode));
333 error = -EIO;
334 }
335 brelse(bh);
336 return error;
337 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
338 brelse(bh);
339 return -EIO;
340 }
341 ptr = (__be64 *)(bh->b_data + offset);
342 error = lops_scan_elements(jd, start, ld, ptr, pass);
343 if (error) {
344 brelse(bh);
345 return error;
346 }
347
348 while (length--)
349 gfs2_replay_incr_blk(sdp, &start);
350
351 brelse(bh);
352 }
353
354 return 0;
355}
356
357/**
358 * clean_journal - mark a dirty journal as being clean
359 * @jd: the journal
360 * @head: the log head to start from
363 *
364 * Returns: errno
365 */
366
367static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
368{
369 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
370 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
371 unsigned int lblock;
372 int new = 0;
373 uint64_t dblock;
374 struct gfs2_log_header *lh;
375 uint32_t hash;
376 struct buffer_head *bh;
377 int error;
378 int boundary;
379
380 lblock = head->lh_blkno;
381 gfs2_replay_incr_blk(sdp, &lblock);
382 error = gfs2_block_map(&ip->i_inode, lblock, &new, &dblock, &boundary);
383 if (error)
384 return error;
385 if (!dblock) {
386 gfs2_consist_inode(ip);
387 return -EIO;
388 }
389
390 bh = sb_getblk(sdp->sd_vfs, dblock);
391 lock_buffer(bh);
392 memset(bh->b_data, 0, bh->b_size);
393 set_buffer_uptodate(bh);
394 clear_buffer_dirty(bh);
395 unlock_buffer(bh);
396
397 lh = (struct gfs2_log_header *)bh->b_data;
398 memset(lh, 0, sizeof(struct gfs2_log_header));
399 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
400 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
401 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
402 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
403 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
404 lh->lh_blkno = cpu_to_be32(lblock);
405 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
406 lh->lh_hash = cpu_to_be32(hash);
407
408 set_buffer_dirty(bh);
409 if (sync_dirty_buffer(bh))
410 gfs2_io_error_bh(sdp, bh);
411 brelse(bh);
412
413 return error;
414}
415
416/**
417 * gfs2_recover_journal - recover a given journal
418 * @jd: the struct gfs2_jdesc describing the journal
419 *
420 * Acquire the journal's lock, check to see if the journal is clean, and
421 * do recovery if necessary.
422 *
423 * Returns: errno
424 */
425
426int gfs2_recover_journal(struct gfs2_jdesc *jd)
427{
428 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
429 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
430 struct gfs2_log_header head;
431 struct gfs2_holder j_gh, ji_gh, t_gh;
432 unsigned long t;
433 int ro = 0;
434 unsigned int pass;
435 int error;
436
437 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
438 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
439 jd->jd_jid);
440
441		/* Acquire the journal lock so we can do recovery */
442
443 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
444 LM_ST_EXCLUSIVE,
445 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
446 &j_gh);
447 switch (error) {
448 case 0:
449 break;
450
451 case GLR_TRYFAILED:
452 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
453 error = 0;
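			/* fall through - give up and report LM_RD_GAVEUP */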
454
455 default:
456 goto fail;
457		}
458
459 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
460 LM_FLAG_NOEXP, &ji_gh);
461 if (error)
462 goto fail_gunlock_j;
463 } else {
464		fs_info(sdp, "jid=%u: already locked for use\n", jd->jd_jid);
465 }
466
467 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
468
469 error = gfs2_jdesc_check(jd);
470 if (error)
471 goto fail_gunlock_ji;
472
473 error = gfs2_find_jhead(jd, &head);
474 if (error)
475 goto fail_gunlock_ji;
476
477 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
478 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
479 jd->jd_jid);
480
481 t = jiffies;
482
483 /* Acquire a shared hold on the transaction lock */
484
485 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
486 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
487 GL_NOCANCEL | GL_NOCACHE, &t_gh);
488 if (error)
489 goto fail_gunlock_ji;
490
491 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
492 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
493 ro = 1;
494 } else {
495 if (sdp->sd_vfs->s_flags & MS_RDONLY)
496 ro = 1;
497 }
498
499 if (ro) {
500 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
501 jd->jd_jid);
502 error = -EROFS;
503 goto fail_gunlock_tr;
504 }
505
506 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
507
508 for (pass = 0; pass < 2; pass++) {
509 lops_before_scan(jd, &head, pass);
510 error = foreach_descriptor(jd, head.lh_tail,
511 head.lh_blkno, pass);
512 lops_after_scan(jd, error, pass);
513 if (error)
514 goto fail_gunlock_tr;
515 }
516
517 error = clean_journal(jd, &head);
518 if (error)
519 goto fail_gunlock_tr;
520
521 gfs2_glock_dq_uninit(&t_gh);
522 t = DIV_ROUND_UP(jiffies - t, HZ);
523 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
524 jd->jd_jid, t);
525 }
526
527 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
528 gfs2_glock_dq_uninit(&ji_gh);
529
530 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
531
532 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
533 gfs2_glock_dq_uninit(&j_gh);
534
535 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
536 return 0;
537
538fail_gunlock_tr:
539 gfs2_glock_dq_uninit(&t_gh);
540fail_gunlock_ji:
541 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
542 gfs2_glock_dq_uninit(&ji_gh);
543fail_gunlock_j:
544 gfs2_glock_dq_uninit(&j_gh);
545 }
546
547 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
548
549fail:
550 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
551 return error;
552}
553
554/**
555 * gfs2_check_journals - Recover any dirty journals
556 * @sdp: the filesystem
557 *
558 */
559
560void gfs2_check_journals(struct gfs2_sbd *sdp)
561{
562 struct gfs2_jdesc *jd;
563
564 for (;;) {
565 jd = gfs2_jdesc_find_dirty(sdp);
566 if (!jd)
567 break;
568
569 if (jd != sdp->sd_jdesc)
570 gfs2_recover_journal(jd);
571 }
572}
573
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..ac0f1d6ce456
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
14{
15 if (++*blk == sdp->sd_jdesc->jd_blocks)
16 *blk = 0;
17}
18
19int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
20 struct buffer_head **bh);
21
22int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
23int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
24void gfs2_revoke_clean(struct gfs2_sbd *sdp);
25
26int gfs2_find_jhead(struct gfs2_jdesc *jd,
27 struct gfs2_log_header *head);
28int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
29void gfs2_check_journals(struct gfs2_sbd *sdp);
30
31#endif /* __RECOVERY_DOT_H__ */
32
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..84fcc1bfaf1b
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1525 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "glops.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "super.h"
28#include "trans.h"
29#include "ops_file.h"
30#include "util.h"
31
32#define BFITNOENT 0xFFFFFFFF
33
34/*
35 * These routines are used by the resource group routines (rgrp.c)
36 * to keep track of block allocation. Each block is represented by two
37 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
38 *
39 * 0 = Free
40 * 1 = Used (not metadata)
 41 * 2 = Unlinked inode (still in use)
42 * 3 = Used (metadata)
43 */
44
45static const char valid_change[16] = {
46 /* current */
47 /* n */ 0, 1, 1, 1,
48 /* e */ 1, 0, 0, 0,
49 /* w */ 0, 0, 0, 1,
50 1, 0, 0, 0
51};
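
A minimal user-space sketch of the two-bit packing described above (using the
on-disk constants GFS2_NBBY = 4, GFS2_BIT_SIZE = 2 and GFS2_BIT_MASK = 3 from
gfs2_ondisk.h), showing how a block's state is located within a bitmap byte:

#include <stdint.h>

#define GFS2_NBBY     4	/* blocks represented per bitmap byte */
#define GFS2_BIT_SIZE 2	/* bits per block */
#define GFS2_BIT_MASK 3	/* mask for a single block's state */

/* Read the allocation state of @block from @bitmap (no bounds check). */
static unsigned char bitmap_state(const unsigned char *bitmap, uint32_t block)
{
	const unsigned char *byte = bitmap + block / GFS2_NBBY;
	unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (*byte >> bit) & GFS2_BIT_MASK;
}

Block 5, for example, lives in byte 1 at bit offset 2, so its state is
(bitmap[1] >> 2) & 3.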
52
53/**
54 * gfs2_setbit - Set a bit in the bitmaps
55 * @buffer: the buffer that holds the bitmaps
56 * @buflen: the length (in bytes) of the buffer
57 * @block: the block to set
58 * @new_state: the new state of the block
59 *
60 */
61
62static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
63 unsigned int buflen, uint32_t block,
64 unsigned char new_state)
65{
66 unsigned char *byte, *end, cur_state;
67 unsigned int bit;
68
69 byte = buffer + (block / GFS2_NBBY);
70 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
71 end = buffer + buflen;
72
73 gfs2_assert(rgd->rd_sbd, byte < end);
74
75 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
76
77 if (valid_change[new_state * 4 + cur_state]) {
78 *byte ^= cur_state << bit;
79 *byte |= new_state << bit;
80 } else
81 gfs2_consist_rgrpd(rgd);
82}
83
84/**
85 * gfs2_testbit - test a bit in the bitmaps
86 * @buffer: the buffer that holds the bitmaps
87 * @buflen: the length (in bytes) of the buffer
88 * @block: the block to read
89 *
90 */
91
92static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
93 unsigned int buflen, uint32_t block)
94{
95 unsigned char *byte, *end, cur_state;
96 unsigned int bit;
97
98 byte = buffer + (block / GFS2_NBBY);
99 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
100 end = buffer + buflen;
101
102 gfs2_assert(rgd->rd_sbd, byte < end);
103
104 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
105
106 return cur_state;
107}
108
109/**
110 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
111 * a block in a given allocation state.
112 * @buffer: the buffer that holds the bitmaps
113 * @buflen: the length (in bytes) of the buffer
114 * @goal: start search at this block's bit-pair (within @buffer)
115 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
116 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
117 *
118 * Scope of @goal and returned block number is only within this bitmap buffer,
119 * not entire rgrp or filesystem. @buffer will be offset from the actual
120 * beginning of a bitmap block buffer, skipping any header structures.
121 *
 122 * Returns: the block number (bitmap buffer scope) found, or BFITNOENT
123 */
124
125static uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 unsigned int buflen, uint32_t goal,
127 unsigned char old_state)
128{
129 unsigned char *byte, *end, alloc;
130 uint32_t blk = goal;
131 unsigned int bit;
132
133 byte = buffer + (goal / GFS2_NBBY);
134 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
135 end = buffer + buflen;
136 alloc = (old_state & 1) ? 0 : 0x55;
137
138 while (byte < end) {
139 if ((*byte & 0x55) == alloc) {
140 blk += (8 - bit) >> 1;
141
142 bit = 0;
143 byte++;
144
145 continue;
146 }
147
148 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
149 return blk;
150
151 bit += GFS2_BIT_SIZE;
152 if (bit >= 8) {
153 bit = 0;
154 byte++;
155 }
156
157 blk++;
158 }
159
160 return BFITNOENT;
161}
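
The (*byte & 0x55) == alloc test above is a fast-skip: bit 0 of each two-bit
pair is the "allocated" bit, so the mask 0x55 selects exactly those four bits
in a byte. A small self-contained sketch of the same test:

#include <assert.h>

/* Could @byte contain a block in state @old_state? Mirrors the skip
 * logic in gfs2_bitfit(): when searching for a free block, a byte whose
 * four allocated bits are all set can be stepped over whole. */
static int byte_may_match(unsigned char byte, unsigned char old_state)
{
	unsigned char alloc = (old_state & 1) ? 0 : 0x55;

	return (byte & 0x55) != alloc;
}

int main(void)
{
	assert(!byte_may_match(0xff, 0));	/* all four allocated: skip */
	assert(byte_may_match(0xfc, 0));	/* block 0 is free: scan it */
	return 0;
}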
162
163/**
164 * gfs2_bitcount - count the number of bits in a certain state
165 * @buffer: the buffer that holds the bitmaps
166 * @buflen: the length (in bytes) of the buffer
167 * @state: the state of the block we're looking for
168 *
169 * Returns: The number of bits
170 */
171
172static uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
173 unsigned int buflen, unsigned char state)
174{
175 unsigned char *byte = buffer;
176 unsigned char *end = buffer + buflen;
177 unsigned char state1 = state << 2;
178 unsigned char state2 = state << 4;
179 unsigned char state3 = state << 6;
180 uint32_t count = 0;
181
182 for (; byte < end; byte++) {
183 if (((*byte) & 0x03) == state)
184 count++;
185 if (((*byte) & 0x0C) == state1)
186 count++;
187 if (((*byte) & 0x30) == state2)
188 count++;
189 if (((*byte) & 0xC0) == state3)
190 count++;
191 }
192
193 return count;
194}
195
196/**
197 * gfs2_rgrp_verify - Verify that a resource group is consistent
 198 * @rgd: the rgrp
 199 *
 200 * Checks the bitmap counts against the counters in the rgrp header.
201 */
202
203void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
204{
205 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi = NULL;
207 uint32_t length = rgd->rd_ri.ri_length;
208 uint32_t count[4], tmp;
209 int buf, x;
210
211 memset(count, 0, 4 * sizeof(uint32_t));
212
213 /* Count # blocks in each of 4 possible allocation states */
214 for (buf = 0; buf < length; buf++) {
215 bi = rgd->rd_bits + buf;
216 for (x = 0; x < 4; x++)
217 count[x] += gfs2_bitcount(rgd,
218 bi->bi_bh->b_data +
219 bi->bi_offset,
220 bi->bi_len, x);
221 }
222
223 if (count[0] != rgd->rd_rg.rg_free) {
224 if (gfs2_consist_rgrpd(rgd))
225 fs_err(sdp, "free data mismatch: %u != %u\n",
226 count[0], rgd->rd_rg.rg_free);
227 return;
228 }
229
230 tmp = rgd->rd_ri.ri_data -
231 rgd->rd_rg.rg_free -
232 rgd->rd_rg.rg_dinodes;
233 if (count[1] + count[2] != tmp) {
234 if (gfs2_consist_rgrpd(rgd))
235 fs_err(sdp, "used data mismatch: %u != %u\n",
 236 count[1] + count[2], tmp);
237 return;
238 }
239
240 if (count[3] != rgd->rd_rg.rg_dinodes) {
241 if (gfs2_consist_rgrpd(rgd))
242 fs_err(sdp, "used metadata mismatch: %u != %u\n",
243 count[3], rgd->rd_rg.rg_dinodes);
244 return;
245 }
246
247 if (count[2] > count[3]) {
248 if (gfs2_consist_rgrpd(rgd))
249 fs_err(sdp, "unlinked inodes > inodes: %u\n",
250 count[2]);
251 return;
252 }
253
254}
255
256static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block)
257{
258 uint64_t first = ri->ri_data0;
259 uint64_t last = first + ri->ri_data;
260 return !!(first <= block && block < last);
261}
262
263/**
264 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
265 * @sdp: The GFS2 superblock
 266 * @blk: The data block number
267 *
268 * Returns: The resource group, or NULL if not found
269 */
270
271struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk)
272{
273 struct gfs2_rgrpd *rgd;
274
275 spin_lock(&sdp->sd_rindex_spin);
276
277 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
278 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
279 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
280 spin_unlock(&sdp->sd_rindex_spin);
281 return rgd;
282 }
283 }
284
285 spin_unlock(&sdp->sd_rindex_spin);
286
287 return NULL;
288}
289
290/**
291 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
292 * @sdp: The GFS2 superblock
293 *
294 * Returns: The first rgrp in the filesystem
295 */
296
297struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
298{
299 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
300 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
301}
302
303/**
304 * gfs2_rgrpd_get_next - get the next RG
305 * @rgd: A RG
306 *
307 * Returns: The next rgrp
308 */
309
310struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
311{
312 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
313 return NULL;
314 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
315}
316
317static void clear_rgrpdi(struct gfs2_sbd *sdp)
318{
319 struct list_head *head;
320 struct gfs2_rgrpd *rgd;
321 struct gfs2_glock *gl;
322
323 spin_lock(&sdp->sd_rindex_spin);
324 sdp->sd_rindex_forward = NULL;
325 head = &sdp->sd_rindex_recent_list;
326 while (!list_empty(head)) {
327 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
328 list_del(&rgd->rd_recent);
329 }
330 spin_unlock(&sdp->sd_rindex_spin);
331
332 head = &sdp->sd_rindex_list;
333 while (!list_empty(head)) {
334 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
335 gl = rgd->rd_gl;
336
337 list_del(&rgd->rd_list);
338 list_del(&rgd->rd_list_mru);
339
340 if (gl) {
341 gl->gl_object = NULL;
342 gfs2_glock_put(gl);
343 }
344
345 kfree(rgd->rd_bits);
346 kfree(rgd);
347 }
348}
349
350void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
351{
352 mutex_lock(&sdp->sd_rindex_mutex);
353 clear_rgrpdi(sdp);
354 mutex_unlock(&sdp->sd_rindex_mutex);
355}
356
357/**
 358 * compute_bitstructs - Compute the bitmap sizes
359 * @rgd: The resource group descriptor
360 *
361 * Calculates bitmap descriptors, one for each block that contains bitmap data
362 *
363 * Returns: errno
364 */
365
366static int compute_bitstructs(struct gfs2_rgrpd *rgd)
367{
368 struct gfs2_sbd *sdp = rgd->rd_sbd;
369 struct gfs2_bitmap *bi;
370 uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
371 uint32_t bytes_left, bytes;
372 int x;
373
374 if (!length)
375 return -EINVAL;
376
377 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
378 if (!rgd->rd_bits)
379 return -ENOMEM;
380
381 bytes_left = rgd->rd_ri.ri_bitbytes;
382
383 for (x = 0; x < length; x++) {
384 bi = rgd->rd_bits + x;
385
386 /* small rgrp; bitmap stored completely in header block */
387 if (length == 1) {
388 bytes = bytes_left;
389 bi->bi_offset = sizeof(struct gfs2_rgrp);
390 bi->bi_start = 0;
391 bi->bi_len = bytes;
392 /* header block */
393 } else if (x == 0) {
394 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
395 bi->bi_offset = sizeof(struct gfs2_rgrp);
396 bi->bi_start = 0;
397 bi->bi_len = bytes;
398 /* last block */
399 } else if (x + 1 == length) {
400 bytes = bytes_left;
401 bi->bi_offset = sizeof(struct gfs2_meta_header);
402 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
403 bi->bi_len = bytes;
404 /* other blocks */
405 } else {
406 bytes = sdp->sd_sb.sb_bsize -
407 sizeof(struct gfs2_meta_header);
408 bi->bi_offset = sizeof(struct gfs2_meta_header);
409 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
410 bi->bi_len = bytes;
411 }
412
413 bytes_left -= bytes;
414 }
415
416 if (bytes_left) {
417 gfs2_consist_rgrpd(rgd);
418 return -EIO;
419 }
420 bi = rgd->rd_bits + (length - 1);
421 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
422 if (gfs2_consist_rgrpd(rgd)) {
423 gfs2_rindex_print(&rgd->rd_ri);
424 fs_err(sdp, "start=%u len=%u offset=%u\n",
425 bi->bi_start, bi->bi_len, bi->bi_offset);
426 }
427 return -EIO;
428 }
429
430 return 0;
431}
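
As a concrete illustration of the layout computed above, here is a sketch
with hypothetical geometry: a 4096-byte block, a 128-byte struct gfs2_rgrp
and a 24-byte struct gfs2_meta_header (the header sizes are assumptions for
illustration; the real values come from gfs2_ondisk.h), and ri_bitbytes =
9000 spread over ri_length = 3 blocks:

#include <stdint.h>
#include <stdio.h>

#define BSIZE    4096
#define RGRP_HDR  128	/* assumed sizeof(struct gfs2_rgrp) */
#define META_HDR   24	/* assumed sizeof(struct gfs2_meta_header) */

int main(void)
{
	uint32_t left = 9000, length = 3;	/* ri_bitbytes, ri_length */
	uint32_t start = 0, x;

	for (x = 0; x < length; x++) {
		uint32_t bytes, offset;

		if (length == 1) {		/* all bits in the header */
			bytes = left; offset = RGRP_HDR;
		} else if (x == 0) {		/* rgrp header block */
			bytes = BSIZE - RGRP_HDR; offset = RGRP_HDR;
		} else if (x + 1 == length) {	/* last bitmap block */
			bytes = left; offset = META_HDR;
		} else {			/* middle blocks */
			bytes = BSIZE - META_HDR; offset = META_HDR;
		}
		printf("block %u: bi_offset=%u bi_start=%u bi_len=%u\n",
		       x, offset, start, bytes);
		start += bytes;
		left -= bytes;
	}
	return 0;
}

This prints bi_len values of 3968, 4072 and 960 bytes, which sum back to
ri_bitbytes = 9000 -- exactly the consistency check made at the end of
compute_bitstructs().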
432
433/**
434 * gfs2_ri_update - Pull in a new resource index from the disk
 435 * @ip: the rindex inode
436 *
437 * Returns: 0 on successful update, error code otherwise
438 */
439
440static int gfs2_ri_update(struct gfs2_inode *ip)
441{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct inode *inode = &ip->i_inode;
444 struct gfs2_rgrpd *rgd;
445 char buf[sizeof(struct gfs2_rindex)];
446 struct file_ra_state ra_state;
447 uint64_t junk = ip->i_di.di_size;
448 int error;
449
450 if (do_div(junk, sizeof(struct gfs2_rindex))) {
451 gfs2_consist_inode(ip);
452 return -EIO;
453 }
454
455 clear_rgrpdi(sdp);
456
457 file_ra_state_init(&ra_state, inode->i_mapping);
458 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
459 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
460 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
461 sizeof(struct gfs2_rindex));
462 if (!error)
463 break;
464 if (error != sizeof(struct gfs2_rindex)) {
465 if (error > 0)
466 error = -EIO;
467 goto fail;
468 }
469
470 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
471 error = -ENOMEM;
472 if (!rgd)
473 goto fail;
474
475 mutex_init(&rgd->rd_mutex);
476 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
477 rgd->rd_sbd = sdp;
478
479 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
480 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
481
482 gfs2_rindex_in(&rgd->rd_ri, buf);
483 error = compute_bitstructs(rgd);
484 if (error)
485 goto fail;
486
487 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
488 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
489 if (error)
490 goto fail;
491
492 rgd->rd_gl->gl_object = rgd;
493 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
494 }
495
496 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
497 return 0;
498
499fail:
500 clear_rgrpdi(sdp);
501 return error;
502}
503
504/**
505 * gfs2_rindex_hold - Grab a lock on the rindex
506 * @sdp: The GFS2 superblock
507 * @ri_gh: the glock holder
508 *
509 * We grab a lock on the rindex inode to make sure that it doesn't
510 * change whilst we are performing an operation. We keep this lock
511 * for quite long periods of time compared to other locks. This
512 * doesn't matter, since it is shared and it is very, very rarely
513 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
514 *
515 * This makes sure that we're using the latest copy of the resource index
516 * special file, which might have been updated if someone expanded the
 517 * filesystem (via the gfs2_grow utility), adding new resource groups.
518 *
519 * Returns: 0 on success, error code otherwise
520 */
521
522int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
523{
524 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
525 struct gfs2_glock *gl = ip->i_gl;
526 int error;
527
528 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
529 if (error)
530 return error;
531
532 /* Read new copy from disk if we don't have the latest */
533 if (sdp->sd_rindex_vn != gl->gl_vn) {
534 mutex_lock(&sdp->sd_rindex_mutex);
535 if (sdp->sd_rindex_vn != gl->gl_vn) {
536 error = gfs2_ri_update(ip);
537 if (error)
538 gfs2_glock_dq_uninit(ri_gh);
539 }
540 mutex_unlock(&sdp->sd_rindex_mutex);
541 }
542
543 return error;
544}
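
The version test in gfs2_rindex_hold() is a double-checked update: the cheap
unlocked comparison covers the common case, and the re-check under the mutex
stops two racing callers from both running the expensive refresh. A generic
user-space sketch of the same pattern (hypothetical names, not GFS2 API):

#include <pthread.h>

struct cache {
	pthread_mutex_t lock;
	unsigned long vn;	/* version we last refreshed at */
};

static int cache_revalidate(struct cache *c, unsigned long cur_vn,
			    int (*refresh)(struct cache *))
{
	int error = 0;

	if (c->vn != cur_vn) {			/* cheap unlocked check */
		pthread_mutex_lock(&c->lock);
		if (c->vn != cur_vn) {		/* re-check under the mutex */
			error = refresh(c);
			if (!error)
				c->vn = cur_vn;
		}
		pthread_mutex_unlock(&c->lock);
	}
	return error;
}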
545
546/**
547 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
548 * @rgd: the struct gfs2_rgrpd describing the RG to read in
549 *
550 * Read in all of a Resource Group's header and bitmap blocks.
 551 * Caller must eventually call gfs2_rgrp_bh_put() to release the bitmaps.
552 *
553 * Returns: errno
554 */
555
556int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
557{
558 struct gfs2_sbd *sdp = rgd->rd_sbd;
559 struct gfs2_glock *gl = rgd->rd_gl;
560 unsigned int length = rgd->rd_ri.ri_length;
561 struct gfs2_bitmap *bi;
562 unsigned int x, y;
563 int error;
564
565 mutex_lock(&rgd->rd_mutex);
566
567 spin_lock(&sdp->sd_rindex_spin);
568 if (rgd->rd_bh_count) {
569 rgd->rd_bh_count++;
570 spin_unlock(&sdp->sd_rindex_spin);
571 mutex_unlock(&rgd->rd_mutex);
572 return 0;
573 }
574 spin_unlock(&sdp->sd_rindex_spin);
575
576 for (x = 0; x < length; x++) {
577 bi = rgd->rd_bits + x;
578 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, DIO_START,
579 &bi->bi_bh);
580 if (error)
581 goto fail;
582 }
583
584 for (y = length; y--;) {
585 bi = rgd->rd_bits + y;
586 error = gfs2_meta_reread(sdp, bi->bi_bh, DIO_WAIT);
587 if (error)
588 goto fail;
589 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
590 GFS2_METATYPE_RG)) {
591 error = -EIO;
592 goto fail;
593 }
594 }
595
596 if (rgd->rd_rg_vn != gl->gl_vn) {
597 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
598 rgd->rd_rg_vn = gl->gl_vn;
599 }
600
601 spin_lock(&sdp->sd_rindex_spin);
602 rgd->rd_free_clone = rgd->rd_rg.rg_free;
603 rgd->rd_bh_count++;
604 spin_unlock(&sdp->sd_rindex_spin);
605
606 mutex_unlock(&rgd->rd_mutex);
607
608 return 0;
609
610fail:
611 while (x--) {
612 bi = rgd->rd_bits + x;
613 brelse(bi->bi_bh);
614 bi->bi_bh = NULL;
615 gfs2_assert_warn(sdp, !bi->bi_clone);
616 }
617 mutex_unlock(&rgd->rd_mutex);
618
619 return error;
620}
621
622void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
623{
624 struct gfs2_sbd *sdp = rgd->rd_sbd;
625
626 spin_lock(&sdp->sd_rindex_spin);
627 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
628 rgd->rd_bh_count++;
629 spin_unlock(&sdp->sd_rindex_spin);
630}
631
632/**
633 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
 634 * @rgd: the struct gfs2_rgrpd describing the RG to release
635 *
636 */
637
638void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
639{
640 struct gfs2_sbd *sdp = rgd->rd_sbd;
641 int x, length = rgd->rd_ri.ri_length;
642
643 spin_lock(&sdp->sd_rindex_spin);
644 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
645 if (--rgd->rd_bh_count) {
646 spin_unlock(&sdp->sd_rindex_spin);
647 return;
648 }
649
650 for (x = 0; x < length; x++) {
651 struct gfs2_bitmap *bi = rgd->rd_bits + x;
652 kfree(bi->bi_clone);
653 bi->bi_clone = NULL;
654 brelse(bi->bi_bh);
655 bi->bi_bh = NULL;
656 }
657
658 spin_unlock(&sdp->sd_rindex_spin);
659}
660
661void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
662{
663 struct gfs2_sbd *sdp = rgd->rd_sbd;
664 unsigned int length = rgd->rd_ri.ri_length;
665 unsigned int x;
666
667 for (x = 0; x < length; x++) {
668 struct gfs2_bitmap *bi = rgd->rd_bits + x;
669 if (!bi->bi_clone)
670 continue;
671 memcpy(bi->bi_clone + bi->bi_offset,
672 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
673 }
674
675 spin_lock(&sdp->sd_rindex_spin);
676 rgd->rd_free_clone = rgd->rd_rg.rg_free;
677 spin_unlock(&sdp->sd_rindex_spin);
678}
679
680/**
681 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
682 * @ip: the incore GFS2 inode structure
683 *
684 * Returns: the struct gfs2_alloc
685 */
686
687struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
688{
689 struct gfs2_alloc *al = &ip->i_alloc;
690
691 /* FIXME: Should assert that the correct locks are held here... */
692 memset(al, 0, sizeof(*al));
693 return al;
694}
695
696/**
697 * gfs2_alloc_put - throw away the struct gfs2_alloc for an inode
698 * @ip: the inode
699 *
700 */
701
702void gfs2_alloc_put(struct gfs2_inode *ip)
703{
704 return;
705}
706
707/**
708 * try_rgrp_fit - See if a given reservation will fit in a given RG
709 * @rgd: the RG data
710 * @al: the struct gfs2_alloc structure describing the reservation
711 *
 712 * If there's room for the requested blocks to be allocated from the RG,
 713 * this sets the @al_rgd field in @al so the caller knows which rgrp the
 714 * reservation came from. The test is made against the cloned free-block
 715 * count (rd_free_clone) under the rindex spinlock.
716 *
717 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
718 */
719
720static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
721{
722 struct gfs2_sbd *sdp = rgd->rd_sbd;
723 int ret = 0;
724
725 spin_lock(&sdp->sd_rindex_spin);
726 if (rgd->rd_free_clone >= al->al_requested) {
727 al->al_rgd = rgd;
728 ret = 1;
729 }
730 spin_unlock(&sdp->sd_rindex_spin);
731
732 return ret;
733}
734
735/**
736 * recent_rgrp_first - get first RG from "recent" list
737 * @sdp: The GFS2 superblock
738 * @rglast: address of the rgrp used last
739 *
740 * Returns: The first rgrp in the recent list
741 */
742
743static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
744 uint64_t rglast)
745{
746 struct gfs2_rgrpd *rgd = NULL;
747
748 spin_lock(&sdp->sd_rindex_spin);
749
750 if (list_empty(&sdp->sd_rindex_recent_list))
751 goto out;
752
753 if (!rglast)
754 goto first;
755
756 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
757 if (rgd->rd_ri.ri_addr == rglast)
758 goto out;
759 }
760
761first:
762 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
763 rd_recent);
764out:
765 spin_unlock(&sdp->sd_rindex_spin);
766 return rgd;
767}
768
769/**
770 * recent_rgrp_next - get next RG from "recent" list
771 * @cur_rgd: current rgrp
 772 * @remove: delete @cur_rgd from the "recent" list as we pass it
773 *
774 * Returns: The next rgrp in the recent list
775 */
776
777static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
778 int remove)
779{
780 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
781 struct list_head *head;
782 struct gfs2_rgrpd *rgd;
783
784 spin_lock(&sdp->sd_rindex_spin);
785
786 head = &sdp->sd_rindex_recent_list;
787
788 list_for_each_entry(rgd, head, rd_recent) {
789 if (rgd == cur_rgd) {
790 if (cur_rgd->rd_recent.next != head)
791 rgd = list_entry(cur_rgd->rd_recent.next,
792 struct gfs2_rgrpd, rd_recent);
793 else
794 rgd = NULL;
795
796 if (remove)
797 list_del(&cur_rgd->rd_recent);
798
799 goto out;
800 }
801 }
802
803 rgd = NULL;
804 if (!list_empty(head))
805 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
806
807out:
808 spin_unlock(&sdp->sd_rindex_spin);
809 return rgd;
810}
811
812/**
813 * recent_rgrp_add - add an RG to tail of "recent" list
814 * @new_rgd: The rgrp to add
815 *
816 */
817
818static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
819{
820 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
821 struct gfs2_rgrpd *rgd;
822 unsigned int count = 0;
823 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
824
825 spin_lock(&sdp->sd_rindex_spin);
826
827 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
828 if (rgd == new_rgd)
829 goto out;
830
831 if (++count >= max)
832 goto out;
833 }
834 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
835
836out:
837 spin_unlock(&sdp->sd_rindex_spin);
838}
839
840/**
841 * forward_rgrp_get - get an rgrp to try next from full list
842 * @sdp: The GFS2 superblock
843 *
844 * Returns: The rgrp to try next
845 */
846
847static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
848{
849 struct gfs2_rgrpd *rgd;
850 unsigned int journals = gfs2_jindex_size(sdp);
851 unsigned int rg = 0, x;
852
853 spin_lock(&sdp->sd_rindex_spin);
854
855 rgd = sdp->sd_rindex_forward;
856 if (!rgd) {
857 if (sdp->sd_rgrps >= journals)
858 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
859
860 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
861 x++, rgd = gfs2_rgrpd_get_next(rgd))
862 /* Do Nothing */;
863
864 sdp->sd_rindex_forward = rgd;
865 }
866
867 spin_unlock(&sdp->sd_rindex_spin);
868
869 return rgd;
870}
871
872/**
873 * forward_rgrp_set - set the forward rgrp pointer
874 * @sdp: the filesystem
875 * @rgd: The new forward rgrp
876 *
877 */
878
879static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
880{
881 spin_lock(&sdp->sd_rindex_spin);
882 sdp->sd_rindex_forward = rgd;
883 spin_unlock(&sdp->sd_rindex_spin);
884}
885
886/**
887 * get_local_rgrp - Choose and lock a rgrp for allocation
888 * @ip: the inode to reserve space for
 889 *
 890 * Try to acquire an rgrp in a way which avoids contending with others;
 891 * the chosen and locked rgrp is returned in ip->i_alloc.al_rgd.
892 *
893 * Returns: errno
894 */
895
896static int get_local_rgrp(struct gfs2_inode *ip)
897{
898 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
899 struct gfs2_rgrpd *rgd, *begin = NULL;
900 struct gfs2_alloc *al = &ip->i_alloc;
901 int flags = LM_FLAG_TRY;
902 int skipped = 0;
903 int loops = 0;
904 int error;
905
906 /* Try recently successful rgrps */
907
908 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
909
910 while (rgd) {
911 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
912 LM_FLAG_TRY, &al->al_rgd_gh);
913 switch (error) {
914 case 0:
915 if (try_rgrp_fit(rgd, al))
916 goto out;
917 gfs2_glock_dq_uninit(&al->al_rgd_gh);
918 rgd = recent_rgrp_next(rgd, 1);
919 break;
920
921 case GLR_TRYFAILED:
922 rgd = recent_rgrp_next(rgd, 0);
923 break;
924
925 default:
926 return error;
927 }
928 }
929
930 /* Go through full list of rgrps */
931
932 begin = rgd = forward_rgrp_get(sdp);
933
934 for (;;) {
935 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
936 &al->al_rgd_gh);
937 switch (error) {
938 case 0:
939 if (try_rgrp_fit(rgd, al))
940 goto out;
941 gfs2_glock_dq_uninit(&al->al_rgd_gh);
942 break;
943
944 case GLR_TRYFAILED:
945 skipped++;
946 break;
947
948 default:
949 return error;
950 }
951
952 rgd = gfs2_rgrpd_get_next(rgd);
953 if (!rgd)
954 rgd = gfs2_rgrpd_get_first(sdp);
955
956 if (rgd == begin) {
957 if (++loops >= 2 || !skipped)
958 return -ENOSPC;
959 flags = 0;
960 }
961 }
962
963out:
964 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
965
966 if (begin) {
967 recent_rgrp_add(rgd);
968 rgd = gfs2_rgrpd_get_next(rgd);
969 if (!rgd)
970 rgd = gfs2_rgrpd_get_first(sdp);
971 forward_rgrp_set(sdp, rgd);
972 }
973
974 return 0;
975}
976
977/**
978 * gfs2_inplace_reserve_i - Reserve space in the filesystem
 979 * @ip: the inode to reserve space for
 980 * @file: caller's source file; @line: caller's line, saved for debugging
981 * Returns: errno
982 */
983
984int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
985{
986 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
987 struct gfs2_alloc *al = &ip->i_alloc;
988 int error;
989
990 if (gfs2_assert_warn(sdp, al->al_requested))
991 return -EINVAL;
992
993 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
994 if (error)
995 return error;
996
997 error = get_local_rgrp(ip);
998 if (error) {
999 gfs2_glock_dq_uninit(&al->al_ri_gh);
1000 return error;
1001 }
1002
1003 al->al_file = file;
1004 al->al_line = line;
1005
1006 return 0;
1007}
1008
1009/**
1010 * gfs2_inplace_release - release an inplace reservation
1011 * @ip: the inode the reservation was taken out on
1012 *
1013 * Release a reservation made by gfs2_inplace_reserve().
1014 */
1015
1016void gfs2_inplace_release(struct gfs2_inode *ip)
1017{
1018 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1019 struct gfs2_alloc *al = &ip->i_alloc;
1020
1021 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1022 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1023 "al_file = %s, al_line = %u\n",
1024 al->al_alloced, al->al_requested, al->al_file,
1025 al->al_line);
1026
1027 al->al_rgd = NULL;
1028 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1029 gfs2_glock_dq_uninit(&al->al_ri_gh);
1030}
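
Taken together, gfs2_alloc_get(), gfs2_inplace_reserve() and
gfs2_inplace_release() bracket every allocation. A hedged sketch of a
typical caller (error handling trimmed; the transaction step is elided):

	struct gfs2_alloc *al = gfs2_alloc_get(ip);
	u64 block;
	int error;

	al->al_requested = 1;			/* blocks we intend to use */

	error = gfs2_inplace_reserve(ip);	/* picks and locks an rgrp */
	if (error)
		goto out_put;

	/* ... build a transaction sized from al->al_rgd ... */
	block = gfs2_alloc_data(ip);		/* bumps al->al_alloced */
	/* ... end the transaction ... */

	gfs2_inplace_release(ip);		/* drops rgrp + rindex glocks */
out_put:
	gfs2_alloc_put(ip);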
1031
1032/**
 1033 * gfs2_get_block_type - Determine the allocation state of a block in an RG
1034 * @rgd: the resource group holding the block
1035 * @block: the block number
1036 *
1037 * Returns: The block type (GFS2_BLKST_*)
1038 */
1039
1040unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block)
1041{
1042 struct gfs2_bitmap *bi = NULL;
1043 uint32_t length, rgrp_block, buf_block;
1044 unsigned int buf;
1045 unsigned char type;
1046
1047 length = rgd->rd_ri.ri_length;
1048 rgrp_block = block - rgd->rd_ri.ri_data0;
1049
1050 for (buf = 0; buf < length; buf++) {
1051 bi = rgd->rd_bits + buf;
1052 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1053 break;
1054 }
1055
1056 gfs2_assert(rgd->rd_sbd, buf < length);
1057 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
1058
1059 type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1060 bi->bi_len, buf_block);
1061
1062 return type;
1063}
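
The buffer-lookup loop above, which also appears in rgblk_search() and
rgblk_free(), turns an rgrp-relative block number into a (bitmap buffer,
buffer-relative block) pair. A worked sketch, reusing the hypothetical
3968/4072/960-byte geometry from the compute_bitstructs() example:

#include <stdint.h>
#include <stdio.h>

#define GFS2_NBBY 4

struct bmap { uint32_t start, len; };	/* bi_start, bi_len, in bytes */

int main(void)
{
	struct bmap bits[] = { {0, 3968}, {3968, 4072}, {8040, 960} };
	uint32_t rgrp_block = 20000, buf;

	for (buf = 0; buf < 3; buf++)
		if (rgrp_block < (bits[buf].start + bits[buf].len) * GFS2_NBBY)
			break;

	/* buffer 0 covers blocks 0..15871, so block 20000 lands in
	 * buffer 1 at buffer-relative block 20000 - 15872 = 4128 */
	printf("buffer %u, block %u\n", buf,
	       rgrp_block - bits[buf].start * GFS2_NBBY);
	return 0;
}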
1064
1065/**
1066 * rgblk_search - find a block in @old_state, change allocation
1067 * state to @new_state
1068 * @rgd: the resource group descriptor
1069 * @goal: the goal block within the RG (start here to search for avail block)
1070 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1071 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1072 *
1073 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1074 * Add the found bitmap buffer to the transaction.
1075 * Set the found bits to @new_state to change block's allocation state.
1076 *
1077 * This function never fails, because we wouldn't call it unless we
1078 * know (from reservation results, etc.) that a block is available.
1079 *
1080 * Scope of @goal and returned block is just within rgrp, not the whole
1081 * filesystem.
1082 *
1083 * Returns: the block number allocated
1084 */
1085
1086static uint32_t rgblk_search(struct gfs2_rgrpd *rgd, uint32_t goal,
1087 unsigned char old_state, unsigned char new_state)
1088{
1089 struct gfs2_bitmap *bi = NULL;
1090 uint32_t length = rgd->rd_ri.ri_length;
1091 uint32_t blk = 0;
1092 unsigned int buf, x;
1093
1094 /* Find bitmap block that contains bits for goal block */
1095 for (buf = 0; buf < length; buf++) {
1096 bi = rgd->rd_bits + buf;
1097 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1098 break;
1099 }
1100
1101 gfs2_assert(rgd->rd_sbd, buf < length);
1102
1103 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1104 goal -= bi->bi_start * GFS2_NBBY;
1105
1106 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1107 "x <= length", instead of "x < length", because we typically start
1108 the search in the middle of a bit block, but if we can't find an
 1109 allocatable block anywhere else, we want to be able to wrap around and
1110 search in the first part of our first-searched bit block. */
1111 for (x = 0; x <= length; x++) {
1112 if (bi->bi_clone)
1113 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1114 bi->bi_len, goal, old_state);
1115 else
1116 blk = gfs2_bitfit(rgd,
1117 bi->bi_bh->b_data + bi->bi_offset,
1118 bi->bi_len, goal, old_state);
1119 if (blk != BFITNOENT)
1120 break;
1121
1122 /* Try next bitmap block (wrap back to rgrp header if at end) */
1123 buf = (buf + 1) % length;
1124 bi = rgd->rd_bits + buf;
1125 goal = 0;
1126 }
1127
1128 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
1129 blk = 0;
1130
1131 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1132 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1133 bi->bi_len, blk, new_state);
1134 if (bi->bi_clone)
1135 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1136 bi->bi_len, blk, new_state);
1137
1138 return bi->bi_start * GFS2_NBBY + blk;
1139}
1140
1141/**
1142 * rgblk_free - Change alloc state of given block(s)
1143 * @sdp: the filesystem
1144 * @bstart: the start of a run of blocks to free
1145 * @blen: the length of the block run (all must lie within ONE RG!)
1146 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1147 *
1148 * Returns: Resource group containing the block(s)
1149 */
1150
1151static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, uint64_t bstart,
1152 uint32_t blen, unsigned char new_state)
1153{
1154 struct gfs2_rgrpd *rgd;
1155 struct gfs2_bitmap *bi = NULL;
1156 uint32_t length, rgrp_blk, buf_blk;
1157 unsigned int buf;
1158
1159 rgd = gfs2_blk2rgrpd(sdp, bstart);
1160 if (!rgd) {
1161 if (gfs2_consist(sdp))
1162 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1163 return NULL;
1164 }
1165
1166 length = rgd->rd_ri.ri_length;
1167
1168 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1169
1170 while (blen--) {
1171 for (buf = 0; buf < length; buf++) {
1172 bi = rgd->rd_bits + buf;
1173 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1174 break;
1175 }
1176
1177 gfs2_assert(rgd->rd_sbd, buf < length);
1178
1179 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1180 rgrp_blk++;
1181
1182 if (!bi->bi_clone) {
1183 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1184 GFP_NOFS | __GFP_NOFAIL);
1185 memcpy(bi->bi_clone + bi->bi_offset,
1186 bi->bi_bh->b_data + bi->bi_offset,
1187 bi->bi_len);
1188 }
1189 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1190 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1191 bi->bi_len, buf_blk, new_state);
1192 }
1193
1194 return rgd;
1195}
1196
1197/**
1198 * gfs2_alloc_data - Allocate a data block
1199 * @ip: the inode to allocate the data block for
1200 *
1201 * Returns: the allocated block
1202 */
1203
1204u64 gfs2_alloc_data(struct gfs2_inode *ip)
1205{
1206 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1207 struct gfs2_alloc *al = &ip->i_alloc;
1208 struct gfs2_rgrpd *rgd = al->al_rgd;
1209 uint32_t goal, blk;
1210 uint64_t block;
1211
1212 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1213 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1214 else
1215 goal = rgd->rd_last_alloc_data;
1216
1217 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1218 rgd->rd_last_alloc_data = blk;
1219
1220 block = rgd->rd_ri.ri_data0 + blk;
1221 ip->i_di.di_goal_data = block;
1222
1223 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1224 rgd->rd_rg.rg_free--;
1225
1226 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1227 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1228
1229 al->al_alloced++;
1230
1231 gfs2_statfs_change(sdp, 0, -1, 0);
1232 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1233
1234 spin_lock(&sdp->sd_rindex_spin);
1235 rgd->rd_free_clone--;
1236 spin_unlock(&sdp->sd_rindex_spin);
1237
1238 return block;
1239}
1240
1241/**
1242 * gfs2_alloc_meta - Allocate a metadata block
1243 * @ip: the inode to allocate the metadata block for
1244 *
1245 * Returns: the allocated block
1246 */
1247
1248u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1249{
1250 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1251 struct gfs2_alloc *al = &ip->i_alloc;
1252 struct gfs2_rgrpd *rgd = al->al_rgd;
1253 uint32_t goal, blk;
1254 uint64_t block;
1255
1256 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1257 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1258 else
1259 goal = rgd->rd_last_alloc_meta;
1260
1261 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1262 rgd->rd_last_alloc_meta = blk;
1263
1264 block = rgd->rd_ri.ri_data0 + blk;
1265 ip->i_di.di_goal_meta = block;
1266
1267 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1268 rgd->rd_rg.rg_free--;
1269
1270 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1271 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1272
1273 al->al_alloced++;
1274
1275 gfs2_statfs_change(sdp, 0, -1, 0);
1276 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1277 gfs2_trans_add_unrevoke(sdp, block);
1278
1279 spin_lock(&sdp->sd_rindex_spin);
1280 rgd->rd_free_clone--;
1281 spin_unlock(&sdp->sd_rindex_spin);
1282
1283 return block;
1284}
1285
1286/**
1287 * gfs2_alloc_di - Allocate a dinode
1288 * @dip: the directory that the inode is going in
 1289 * @generation: the new dinode's generation number is returned here
1290 * Returns: the block allocated
1291 */
1292
1293u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1294{
1295 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1296 struct gfs2_alloc *al = &dip->i_alloc;
1297 struct gfs2_rgrpd *rgd = al->al_rgd;
1298 u32 blk;
1299 u64 block;
1300
1301 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1302 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1303
1304 rgd->rd_last_alloc_meta = blk;
1305
1306 block = rgd->rd_ri.ri_data0 + blk;
1307
1308 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1309 rgd->rd_rg.rg_free--;
1310 rgd->rd_rg.rg_dinodes++;
1311 *generation = rgd->rd_rg.rg_igeneration++;
1312 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1313 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1314
1315 al->al_alloced++;
1316
1317 gfs2_statfs_change(sdp, 0, -1, +1);
1318 gfs2_trans_add_unrevoke(sdp, block);
1319
1320 spin_lock(&sdp->sd_rindex_spin);
1321 rgd->rd_free_clone--;
1322 spin_unlock(&sdp->sd_rindex_spin);
1323
1324 return block;
1325}
1326
1327/**
1328 * gfs2_free_data - free a contiguous run of data block(s)
1329 * @ip: the inode these blocks are being freed from
1330 * @bstart: first block of a run of contiguous blocks
1331 * @blen: the length of the block run
1332 *
1333 */
1334
1335void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1336{
1337 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1338 struct gfs2_rgrpd *rgd;
1339
1340 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1341 if (!rgd)
1342 return;
1343
1344 rgd->rd_rg.rg_free += blen;
1345
1346 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1347 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1348
1349 gfs2_trans_add_rg(rgd);
1350
1351 gfs2_statfs_change(sdp, 0, +blen, 0);
1352 gfs2_quota_change(ip, -(int64_t)blen,
1353 ip->i_di.di_uid, ip->i_di.di_gid);
1354}
1355
1356/**
 1357 * gfs2_free_meta - free a contiguous run of metadata block(s)
1358 * @ip: the inode these blocks are being freed from
1359 * @bstart: first block of a run of contiguous blocks
1360 * @blen: the length of the block run
1361 *
1362 */
1363
1364void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1365{
1366 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1367 struct gfs2_rgrpd *rgd;
1368
1369 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1370 if (!rgd)
1371 return;
1372
1373 rgd->rd_rg.rg_free += blen;
1374
1375 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1376 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1377
1378 gfs2_trans_add_rg(rgd);
1379
1380 gfs2_statfs_change(sdp, 0, +blen, 0);
1381 gfs2_quota_change(ip, -(int64_t)blen, ip->i_di.di_uid, ip->i_di.di_gid);
1382 gfs2_meta_wipe(ip, bstart, blen);
1383}
1384
1385void gfs2_unlink_di(struct inode *inode)
1386{
1387 struct gfs2_inode *ip = GFS2_I(inode);
1388 struct gfs2_sbd *sdp = GFS2_SB(inode);
1389 struct gfs2_rgrpd *rgd;
1390 u64 blkno = ip->i_num.no_addr;
1391
1392 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1393 if (!rgd)
1394 return;
1395 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1396 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1397 gfs2_trans_add_rg(rgd);
1398}
1399
1400static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno)
1401{
1402 struct gfs2_sbd *sdp = rgd->rd_sbd;
1403 struct gfs2_rgrpd *tmp_rgd;
1404
1405 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1406 if (!tmp_rgd)
1407 return;
1408 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1409
1410 if (!rgd->rd_rg.rg_dinodes)
1411 gfs2_consist_rgrpd(rgd);
1412 rgd->rd_rg.rg_dinodes--;
1413 rgd->rd_rg.rg_free++;
1414
1415 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1416 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1417
1418 gfs2_statfs_change(sdp, 0, +1, -1);
1419 gfs2_trans_add_rg(rgd);
1420}
1421
1422
1423void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1424{
1425 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1426 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1427 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1428}
1429
1430/**
1431 * gfs2_rlist_add - add a RG to a list of RGs
1432 * @sdp: the filesystem
1433 * @rlist: the list of resource groups
1434 * @block: the block
1435 *
1436 * Figure out what RG a block belongs to and add that RG to the list
1437 *
1438 * FIXME: Don't use NOFAIL
1439 *
1440 */
1441
1442void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1443 uint64_t block)
1444{
1445 struct gfs2_rgrpd *rgd;
1446 struct gfs2_rgrpd **tmp;
1447 unsigned int new_space;
1448 unsigned int x;
1449
1450 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1451 return;
1452
1453 rgd = gfs2_blk2rgrpd(sdp, block);
1454 if (!rgd) {
1455 if (gfs2_consist(sdp))
1456 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1457 return;
1458 }
1459
1460 for (x = 0; x < rlist->rl_rgrps; x++)
1461 if (rlist->rl_rgd[x] == rgd)
1462 return;
1463
1464 if (rlist->rl_rgrps == rlist->rl_space) {
1465 new_space = rlist->rl_space + 10;
1466
1467 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1468 GFP_NOFS | __GFP_NOFAIL);
1469
1470 if (rlist->rl_rgd) {
1471 memcpy(tmp, rlist->rl_rgd,
1472 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1473 kfree(rlist->rl_rgd);
1474 }
1475
1476 rlist->rl_space = new_space;
1477 rlist->rl_rgd = tmp;
1478 }
1479
1480 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1481}
1482
1483/**
 1484 * gfs2_rlist_alloc - allocate and initialize glock holders for an rlist
 1485 * (called once all RGs have been added via gfs2_rlist_add())
1486 * @rlist: the list of resource groups
1487 * @state: the lock state to acquire the RG lock in
1488 * @flags: the modifier flags for the holder structures
1489 *
1490 * FIXME: Don't use NOFAIL
1491 *
1492 */
1493
1494void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1495 int flags)
1496{
1497 unsigned int x;
1498
1499 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1500 GFP_NOFS | __GFP_NOFAIL);
1501 for (x = 0; x < rlist->rl_rgrps; x++)
1502 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1503 state, flags,
1504 &rlist->rl_ghs[x]);
1505}
1506
1507/**
1508 * gfs2_rlist_free - free a resource group list
 1509 * @rlist: the list of resource groups
1510 *
1511 */
1512
1513void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1514{
1515 unsigned int x;
1516
1517 kfree(rlist->rl_rgd);
1518
1519 if (rlist->rl_ghs) {
1520 for (x = 0; x < rlist->rl_rgrps; x++)
1521 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1522 kfree(rlist->rl_ghs);
1523 }
1524}
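
The rlist helpers implement a gather-then-lock protocol: collect every rgrp
a multi-rgrp operation touches, then take all the rgrp glocks in one pass.
A hedged sketch of a caller (gfs2_glock_nq_m()/gfs2_glock_dq_m() are the
multiple-holder enqueue/dequeue helpers declared in glock.h; the actual
block freeing is elided):

static int dealloc_example(struct gfs2_sbd *sdp, const u64 *blocks,
			   unsigned int nr_blocks)
{
	struct gfs2_rgrp_list rlist;
	unsigned int x;
	int error;

	memset(&rlist, 0, sizeof(rlist));

	for (x = 0; x < nr_blocks; x++)		/* gather phase */
		gfs2_rlist_add(sdp, &rlist, blocks[x]);

	/* one holder per rgrp, all exclusive for the deallocation */
	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
	if (error)
		goto out;

	/* ... free the blocks here, under the held rgrp glocks ... */

	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
out:
	gfs2_rlist_free(&rlist);
	return error;
}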
1525
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..14600944d184
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
14
15struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk);
16struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
17struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
18
19void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
20int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
21
22int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
23void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
24void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
25
26void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
27
28struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
29void gfs2_alloc_put(struct gfs2_inode *ip);
30
31int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
32 char *file, unsigned int line);
33#define gfs2_inplace_reserve(ip) \
34gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
35
36void gfs2_inplace_release(struct gfs2_inode *ip);
37
38unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block);
39
40u64 gfs2_alloc_data(struct gfs2_inode *ip);
41u64 gfs2_alloc_meta(struct gfs2_inode *ip);
42u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
43
44void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
45void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
46void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
47void gfs2_unlink_di(struct inode *inode);
48
49struct gfs2_rgrp_list {
50 unsigned int rl_rgrps;
51 unsigned int rl_space;
52 struct gfs2_rgrpd **rl_rgd;
53 struct gfs2_holder *rl_ghs;
54};
55
56void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
57 uint64_t block);
58void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
59 int flags);
60void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
61
62#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..3c318a9e8a8c
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,979 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "dir.h"
24#include "format.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "log.h"
29#include "meta_io.h"
30#include "quota.h"
31#include "recovery.h"
32#include "rgrp.h"
33#include "super.h"
34#include "trans.h"
35#include "util.h"
36
37/**
38 * gfs2_tune_init - Fill a gfs2_tune structure with default values
39 * @gt: tune
40 *
41 */
42
43void gfs2_tune_init(struct gfs2_tune *gt)
44{
45 spin_lock_init(&gt->gt_spin);
46
47 gt->gt_ilimit = 100;
48 gt->gt_ilimit_tries = 3;
49 gt->gt_ilimit_min = 1;
50 gt->gt_demote_secs = 300;
51 gt->gt_incore_log_blocks = 1024;
52 gt->gt_log_flush_secs = 60;
53 gt->gt_jindex_refresh_secs = 60;
54 gt->gt_scand_secs = 15;
55 gt->gt_recoverd_secs = 60;
56 gt->gt_logd_secs = 1;
57 gt->gt_quotad_secs = 5;
58 gt->gt_quota_simul_sync = 64;
59 gt->gt_quota_warn_period = 10;
60 gt->gt_quota_scale_num = 1;
61 gt->gt_quota_scale_den = 1;
62 gt->gt_quota_cache_secs = 300;
63 gt->gt_quota_quantum = 60;
64 gt->gt_atime_quantum = 3600;
65 gt->gt_new_files_jdata = 0;
66 gt->gt_new_files_directio = 0;
67 gt->gt_max_atomic_write = 4 << 20;
68 gt->gt_max_readahead = 1 << 18;
69 gt->gt_lockdump_size = 131072;
70 gt->gt_stall_secs = 600;
71 gt->gt_complain_secs = 10;
72 gt->gt_reclaim_limit = 5000;
73 gt->gt_entries_per_readdir = 32;
74 gt->gt_prefetch_secs = 10;
75 gt->gt_greedy_default = HZ / 10;
76 gt->gt_greedy_quantum = HZ / 40;
77 gt->gt_greedy_max = HZ / 4;
78 gt->gt_statfs_quantum = 30;
79 gt->gt_statfs_slow = 0;
80}
81
82/**
83 * gfs2_check_sb - Check superblock
84 * @sdp: the filesystem
85 * @sb: The superblock
86 * @silent: Don't print a message if the check fails
87 *
 88 * Checks that the version code of the FS is one we understand how to
89 * read and that the sizes of the various on-disk structures have not
90 * changed.
91 */
92
93int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
94{
95 unsigned int x;
96
97 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
98 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
99 if (!silent)
100 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
101 return -EINVAL;
102 }
103
104 /* If format numbers match exactly, we're done. */
105
106 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
107 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
108 return 0;
109
110 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
111 for (x = 0; gfs2_old_fs_formats[x]; x++)
112 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
113 break;
114
115 if (!gfs2_old_fs_formats[x]) {
116 printk(KERN_WARNING
117 "GFS2: code version (%u, %u) is incompatible "
118 "with ondisk format (%u, %u)\n",
119 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
120 sb->sb_fs_format, sb->sb_multihost_format);
121 printk(KERN_WARNING
122 "GFS2: I don't know how to upgrade this FS\n");
123 return -EINVAL;
124 }
125 }
126
127 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
128 for (x = 0; gfs2_old_multihost_formats[x]; x++)
129 if (gfs2_old_multihost_formats[x] ==
130 sb->sb_multihost_format)
131 break;
132
133 if (!gfs2_old_multihost_formats[x]) {
134 printk(KERN_WARNING
135 "GFS2: code version (%u, %u) is incompatible "
136 "with ondisk format (%u, %u)\n",
137 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
138 sb->sb_fs_format, sb->sb_multihost_format);
139 printk(KERN_WARNING
140 "GFS2: I don't know how to upgrade this FS\n");
141 return -EINVAL;
142 }
143 }
144
145 if (!sdp->sd_args.ar_upgrade) {
146 printk(KERN_WARNING
147 "GFS2: code version (%u, %u) is incompatible "
148 "with ondisk format (%u, %u)\n",
149 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
150 sb->sb_fs_format, sb->sb_multihost_format);
151 printk(KERN_INFO
152 "GFS2: Use the \"upgrade\" mount option to upgrade "
153 "the FS\n");
154 printk(KERN_INFO "GFS2: See the manual for more details\n");
155 return -EINVAL;
156 }
157
158 return 0;
159}
160
161
162static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
163{
164 struct page *page = bio->bi_private;
165 if (bio->bi_size)
166 return 1;
167
168 if (!error)
169 SetPageUptodate(page);
170 else
171 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
172 unlock_page(page);
173 return 0;
174}
175
176static struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
177{
178 struct page *page;
179 struct bio *bio;
180
181 page = alloc_page(GFP_KERNEL);
182 if (unlikely(!page))
183 return NULL;
184
185 ClearPageUptodate(page);
186 ClearPageDirty(page);
187 lock_page(page);
188
189 bio = bio_alloc(GFP_KERNEL, 1);
190 if (unlikely(!bio)) {
191 __free_page(page);
192 return NULL;
193 }
194
195 bio->bi_sector = sector;
196 bio->bi_bdev = sb->s_bdev;
197 bio_add_page(bio, page, PAGE_SIZE, 0);
198
199 bio->bi_end_io = end_bio_io_page;
200 bio->bi_private = page;
201 submit_bio(READ_SYNC, bio);
202 wait_on_page_locked(page);
203 bio_put(bio);
204 if (!PageUptodate(page)) {
205 __free_page(page);
206 return NULL;
207 }
208 return page;
209}
210
211/**
212 * gfs2_read_sb - Read super block
213 * @sdp: The GFS2 superblock
214 * @gl: the glock for the superblock (assumed to be held)
215 * @silent: Don't print message if mount fails
216 *
217 */
218
219int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
220{
221 uint32_t hash_blocks, ind_blocks, leaf_blocks;
222 uint32_t tmp_blocks;
223 unsigned int x;
224 int error;
225 struct page *page;
226 char *sb;
227
228 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
229 if (!page) {
230 if (!silent)
231 fs_err(sdp, "can't read superblock\n");
232 return -EIO;
233 }
234 sb = kmap(page);
235 gfs2_sb_in(&sdp->sd_sb, sb);
236 kunmap(page);
237 __free_page(page);
238
239 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
240 if (error)
241 return error;
242
243 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
244 GFS2_BASIC_BLOCK_SHIFT;
245 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
246 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
247 sizeof(struct gfs2_dinode)) / sizeof(uint64_t);
248 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
249 sizeof(struct gfs2_meta_header)) / sizeof(uint64_t);
250 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
251 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
252 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
253 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
254 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
255 sizeof(struct gfs2_meta_header)) /
256 sizeof(struct gfs2_quota_change);
257
 258 /* Compute maximum reservation required to add an entry to a directory */
259
260 hash_blocks = DIV_ROUND_UP(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH),
261 sdp->sd_jbsize);
262
263 ind_blocks = 0;
264 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
265 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
266 ind_blocks += tmp_blocks;
267 }
268
269 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
270
271 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
272
273 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
274 sizeof(struct gfs2_dinode);
275 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
276 for (x = 2;; x++) {
277 uint64_t space, d;
278 uint32_t m;
279
280 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
281 d = space;
282 m = do_div(d, sdp->sd_inptrs);
283
284 if (d != sdp->sd_heightsize[x - 1] || m)
285 break;
286 sdp->sd_heightsize[x] = space;
287 }
288 sdp->sd_max_height = x;
289 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
290
291 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
292 sizeof(struct gfs2_dinode);
293 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
294 for (x = 2;; x++) {
295 uint64_t space, d;
296 uint32_t m;
297
298 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
299 d = space;
300 m = do_div(d, sdp->sd_inptrs);
301
302 if (d != sdp->sd_jheightsize[x - 1] || m)
303 break;
304 sdp->sd_jheightsize[x] = space;
305 }
306 sdp->sd_max_jheight = x;
307 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
308
309 return 0;
310}
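
The two sizing loops above compute how large a file each metadata tree
height can address: level 1 is one block of pointers out of the dinode, and
every further level multiplies by the number of indirect pointers per block,
stopping when the 64-bit product would overflow (detected by dividing back).
A worked sketch for a 4096-byte block, assuming sizeof(struct gfs2_dinode) =
232 and sizeof(struct gfs2_meta_header) = 24, which would give diptrs = 483
and inptrs = 509:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical geometry for a 4096-byte block */
	uint64_t diptrs = 483, inptrs = 509;
	uint64_t size = 4096 * diptrs;	/* height 1 */
	unsigned int height;

	printf("height 1: %llu bytes\n", (unsigned long long)size);
	for (height = 2;; height++) {
		uint64_t next = size * inptrs;

		if (next / inptrs != size)	/* same overflow test */
			break;
		size = next;
		printf("height %u: %llu bytes\n", height,
		       (unsigned long long)size);
	}
	printf("sd_max_height = %u\n", height);
	return 0;
}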
311
312/**
313 * gfs2_jindex_hold - Grab a lock on the jindex
314 * @sdp: The GFS2 superblock
315 * @ji_gh: the holder for the jindex glock
316 *
317 * This is very similar to the gfs2_rindex_hold() function, except that
 318 * in general we hold the jindex lock for longer periods of time and
 319 * grab it far less frequently than the rgrp lock.
320 *
321 * Returns: errno
322 */
323
324int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
325{
326 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
327 struct qstr name;
328 char buf[20];
329 struct gfs2_jdesc *jd;
330 int error;
331
332 name.name = buf;
333
334 mutex_lock(&sdp->sd_jindex_mutex);
335
336 for (;;) {
337 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
338 GL_LOCAL_EXCL, ji_gh);
339 if (error)
340 break;
341
342 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
343 name.hash = gfs2_disk_hash(name.name, name.len);
344
345 error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
346 if (error == -ENOENT) {
347 error = 0;
348 break;
349 }
350
351 gfs2_glock_dq_uninit(ji_gh);
352
353 if (error)
354 break;
355
356 error = -ENOMEM;
357 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
358 if (!jd)
359 break;
360
361 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
362 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
363 if (!jd->jd_inode)
364 error = -ENOENT;
365 else
366 error = PTR_ERR(jd->jd_inode);
367 kfree(jd);
368 break;
369 }
370
371 spin_lock(&sdp->sd_jindex_spin);
372 jd->jd_jid = sdp->sd_journals++;
373 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
374 spin_unlock(&sdp->sd_jindex_spin);
375 }
376
377 mutex_unlock(&sdp->sd_jindex_mutex);
378
379 return error;
380}
381
382/**
383 * gfs2_jindex_free - Clear all the journal index information
384 * @sdp: The GFS2 superblock
385 *
386 */
387
388void gfs2_jindex_free(struct gfs2_sbd *sdp)
389{
390 struct list_head list;
391 struct gfs2_jdesc *jd;
392
393 spin_lock(&sdp->sd_jindex_spin);
394 list_add(&list, &sdp->sd_jindex_list);
395 list_del_init(&sdp->sd_jindex_list);
396 sdp->sd_journals = 0;
397 spin_unlock(&sdp->sd_jindex_spin);
398
399 while (!list_empty(&list)) {
400 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
401 list_del(&jd->jd_list);
402 iput(jd->jd_inode);
403 kfree(jd);
404 }
405}
406
407static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
408{
409 struct gfs2_jdesc *jd;
410 int found = 0;
411
412 list_for_each_entry(jd, head, jd_list) {
413 if (jd->jd_jid == jid) {
414 found = 1;
415 break;
416 }
417 }
418
419 if (!found)
420 jd = NULL;
421
422 return jd;
423}
424
425struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
426{
427 struct gfs2_jdesc *jd;
428
429 spin_lock(&sdp->sd_jindex_spin);
430 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
431 spin_unlock(&sdp->sd_jindex_spin);
432
433 return jd;
434}
435
436void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
437{
438 struct gfs2_jdesc *jd;
439
440 spin_lock(&sdp->sd_jindex_spin);
441 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
442 if (jd)
443 jd->jd_dirty = 1;
444 spin_unlock(&sdp->sd_jindex_spin);
445}
446
447struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
448{
449 struct gfs2_jdesc *jd;
450 int found = 0;
451
452 spin_lock(&sdp->sd_jindex_spin);
453
454 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
455 if (jd->jd_dirty) {
456 jd->jd_dirty = 0;
457 found = 1;
458 break;
459 }
460 }
461 spin_unlock(&sdp->sd_jindex_spin);
462
463 if (!found)
464 jd = NULL;
465
466 return jd;
467}
468
469int gfs2_jdesc_check(struct gfs2_jdesc *jd)
470{
471 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
472 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
473 int ar;
474 int error;
475
476 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
477 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
478 gfs2_consist_inode(ip);
479 return -EIO;
480 }
481 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
482
483 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
484 if (!error && ar) {
485 gfs2_consist_inode(ip);
486 error = -EIO;
487 }
488
489 return error;
490}
491
492/**
493 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
494 * @sdp: the filesystem
495 *
496 * Returns: errno
497 */
498
499int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
500{
501 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
502 struct gfs2_glock *j_gl = ip->i_gl;
503 struct gfs2_holder t_gh;
504 struct gfs2_log_header head;
505 int error;
506
507 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
508 GL_LOCAL_EXCL, &t_gh);
509 if (error)
510 return error;
511
512 gfs2_meta_cache_flush(ip);
513 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
514
515 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
516 if (error)
517 goto fail;
518
519 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
520 gfs2_consist(sdp);
521 error = -EIO;
522 goto fail;
523 }
524
525	/* Initialize the head-of-log state from the last journal header */
526 sdp->sd_log_sequence = head.lh_sequence + 1;
527 gfs2_log_pointers_init(sdp, head.lh_blkno);
528
529 error = gfs2_quota_init(sdp);
530 if (error)
531 goto fail_unlinked;
532
533 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
534
535 gfs2_glock_dq_uninit(&t_gh);
536
537 return 0;
538
539 fail_unlinked:
540
541 fail:
542 t_gh.gh_flags |= GL_NOCACHE;
543 gfs2_glock_dq_uninit(&t_gh);
544
545 return error;
546}
547
548/**
549 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
550 * @sdp: the filesystem
551 *
552 * Returns: errno
553 */
554
555int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
556{
557 struct gfs2_holder t_gh;
558 int error;
559
560 gfs2_quota_sync(sdp);
561 gfs2_statfs_sync(sdp);
562
563 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
564 GL_LOCAL_EXCL | GL_NOCACHE,
565 &t_gh);
566 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
567 return error;
568
569 gfs2_meta_syncfs(sdp);
570 gfs2_log_shutdown(sdp);
571
572 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
573
574 if (t_gh.gh_gl)
575 gfs2_glock_dq_uninit(&t_gh);
576
577 gfs2_quota_cleanup(sdp);
578
579 return error;
580}
581
582int gfs2_statfs_init(struct gfs2_sbd *sdp)
583{
584 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
585 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
586 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
587 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
588 struct buffer_head *m_bh, *l_bh;
589 struct gfs2_holder gh;
590 int error;
591
592 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
593 &gh);
594 if (error)
595 return error;
596
597 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
598 if (error)
599 goto out;
600
601 if (sdp->sd_args.ar_spectator) {
602 spin_lock(&sdp->sd_statfs_spin);
603 gfs2_statfs_change_in(m_sc, m_bh->b_data +
604 sizeof(struct gfs2_dinode));
605 spin_unlock(&sdp->sd_statfs_spin);
606 } else {
607 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
608 if (error)
609 goto out_m_bh;
610
611 spin_lock(&sdp->sd_statfs_spin);
612 gfs2_statfs_change_in(m_sc, m_bh->b_data +
613 sizeof(struct gfs2_dinode));
614 gfs2_statfs_change_in(l_sc, l_bh->b_data +
615 sizeof(struct gfs2_dinode));
616 spin_unlock(&sdp->sd_statfs_spin);
617
618 brelse(l_bh);
619 }
620
621 out_m_bh:
622 brelse(m_bh);
623
624 out:
625 gfs2_glock_dq_uninit(&gh);
626
627	return error;
628}
629
630void gfs2_statfs_change(struct gfs2_sbd *sdp, int64_t total, int64_t free,
631 int64_t dinodes)
632{
633 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
634 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
635 struct buffer_head *l_bh;
636 int error;
637
638 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
639 if (error)
640 return;
641
642 mutex_lock(&sdp->sd_statfs_mutex);
643 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
644 mutex_unlock(&sdp->sd_statfs_mutex);
645
646 spin_lock(&sdp->sd_statfs_spin);
647 l_sc->sc_total += total;
648 l_sc->sc_free += free;
649 l_sc->sc_dinodes += dinodes;
650 gfs2_statfs_change_out(l_sc, l_bh->b_data +
651 sizeof(struct gfs2_dinode));
652 spin_unlock(&sdp->sd_statfs_spin);
653
654 brelse(l_bh);
655}
656
657int gfs2_statfs_sync(struct gfs2_sbd *sdp)
658{
659 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
660 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
661 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
662 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
663 struct gfs2_holder gh;
664 struct buffer_head *m_bh, *l_bh;
665 int error;
666
667 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
668 &gh);
669 if (error)
670 return error;
671
672 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
673 if (error)
674 goto out;
675
676 spin_lock(&sdp->sd_statfs_spin);
677 gfs2_statfs_change_in(m_sc, m_bh->b_data +
678 sizeof(struct gfs2_dinode));
679 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
680 spin_unlock(&sdp->sd_statfs_spin);
681 goto out_bh;
682 }
683 spin_unlock(&sdp->sd_statfs_spin);
684
685 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
686 if (error)
687 goto out_bh;
688
689 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
690 if (error)
691 goto out_bh2;
692
693 mutex_lock(&sdp->sd_statfs_mutex);
694 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
695 mutex_unlock(&sdp->sd_statfs_mutex);
696
697 spin_lock(&sdp->sd_statfs_spin);
698 m_sc->sc_total += l_sc->sc_total;
699 m_sc->sc_free += l_sc->sc_free;
700 m_sc->sc_dinodes += l_sc->sc_dinodes;
701 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
702 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
703 0, sizeof(struct gfs2_statfs_change));
704 spin_unlock(&sdp->sd_statfs_spin);
705
706 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
707 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
708
709 gfs2_trans_end(sdp);
710
711 out_bh2:
712 brelse(l_bh);
713
714 out_bh:
715 brelse(m_bh);
716
717 out:
718 gfs2_glock_dq_uninit(&gh);
719
720 return error;
721}
722
723/**
724 * gfs2_statfs_i - Do a statfs
725 * @sdp: the filesystem
726 * @sc: the sc structure to fill
727 *
728 * Returns: errno
729 */
730
731int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
732{
733 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
734 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
735
736 spin_lock(&sdp->sd_statfs_spin);
737
738 *sc = *m_sc;
739 sc->sc_total += l_sc->sc_total;
740 sc->sc_free += l_sc->sc_free;
741 sc->sc_dinodes += l_sc->sc_dinodes;
742
743 spin_unlock(&sdp->sd_statfs_spin);
744
745 if (sc->sc_free < 0)
746 sc->sc_free = 0;
747 if (sc->sc_free > sc->sc_total)
748 sc->sc_free = sc->sc_total;
749 if (sc->sc_dinodes < 0)
750 sc->sc_dinodes = 0;
751
752 return 0;
753}
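A consumer such as the statfs(2) handler combines these counters into VFS fields. The mapping below is a hedged sketch; the real conversion lives elsewhere in GFS2, and the treatment of f_files here is an assumption based on free blocks being convertible into dinodes.

/* Sketch: map a combined gfs2_statfs_change onto statfs(2)-style fields. */
static void sc_to_statfs(const struct gfs2_statfs_change *sc,
			 unsigned int bsize, struct kstatfs *buf)
{
	buf->f_bsize = bsize;
	buf->f_blocks = sc->sc_total;		/* all data blocks in the fs */
	buf->f_bfree = sc->sc_free;
	buf->f_bavail = sc->sc_free;
	buf->f_files = sc->sc_dinodes + sc->sc_free;	/* assumption, see above */
	buf->f_ffree = sc->sc_free;
}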
754
755/**
756 * statfs_slow_fill - fill in the sc for a given RG
757 * @rgd: the RG
758 * @sc: the sc structure
759 *
760 * Returns: 0 on success
761 */
762
763static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
764 struct gfs2_statfs_change *sc)
765{
766 gfs2_rgrp_verify(rgd);
767 sc->sc_total += rgd->rd_ri.ri_data;
768 sc->sc_free += rgd->rd_rg.rg_free;
769 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
770 return 0;
771}
772
773/**
774 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
775 * @sdp: the filesystem
776 * @sc: the sc info that will be returned
777 *
778 * Any error (other than a signal) will cause this routine to fall back
779 * to the synchronous version.
780 *
781 * FIXME: This really shouldn't busy wait like this.
782 *
783 * Returns: errno
784 */
785
786int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
787{
788 struct gfs2_holder ri_gh;
789 struct gfs2_rgrpd *rgd_next;
790 struct gfs2_holder *gha, *gh;
791 unsigned int slots = 64;
792 unsigned int x;
793 int done;
794 int error = 0, err;
795
796 memset(sc, 0, sizeof(struct gfs2_statfs_change));
797 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
798 if (!gha)
799 return -ENOMEM;
800
801 error = gfs2_rindex_hold(sdp, &ri_gh);
802 if (error)
803 goto out;
804
805 rgd_next = gfs2_rgrpd_get_first(sdp);
806
807 for (;;) {
808 done = 1;
809
810 for (x = 0; x < slots; x++) {
811 gh = gha + x;
812
813 if (gh->gh_gl && gfs2_glock_poll(gh)) {
814 err = gfs2_glock_wait(gh);
815 if (err) {
816 gfs2_holder_uninit(gh);
817 error = err;
818 } else {
819 if (!error)
820 error = statfs_slow_fill(
821 gh->gh_gl->gl_object, sc);
822 gfs2_glock_dq_uninit(gh);
823 }
824 }
825
826 if (gh->gh_gl)
827 done = 0;
828 else if (rgd_next && !error) {
829 error = gfs2_glock_nq_init(rgd_next->rd_gl,
830 LM_ST_SHARED,
831 GL_ASYNC,
832 gh);
833 rgd_next = gfs2_rgrpd_get_next(rgd_next);
834 done = 0;
835 }
836
837 if (signal_pending(current))
838 error = -ERESTARTSYS;
839 }
840
841 if (done)
842 break;
843
844 yield();
845 }
846
847 gfs2_glock_dq_uninit(&ri_gh);
848
849 out:
850 kfree(gha);
851
852 return error;
853}
854
855struct lfcc {
856 struct list_head list;
857 struct gfs2_holder gh;
858};
859
860/**
861 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
862 * journals are clean
863 * @sdp: the file system
864 * @t_gh: the hold on the transaction lock
866 *
867 * Returns: errno
868 */
869
870static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
871 struct gfs2_holder *t_gh)
872{
873 struct gfs2_inode *ip;
874 struct gfs2_holder ji_gh;
875 struct gfs2_jdesc *jd;
876 struct lfcc *lfcc;
877 LIST_HEAD(list);
878 struct gfs2_log_header lh;
879 int error;
880
881 error = gfs2_jindex_hold(sdp, &ji_gh);
882 if (error)
883 return error;
884
885 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
886 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
887 if (!lfcc) {
888 error = -ENOMEM;
889 goto out;
890 }
891 ip = GFS2_I(jd->jd_inode);
892 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
893 if (error) {
894 kfree(lfcc);
895 goto out;
896 }
897 list_add(&lfcc->list, &list);
898 }
899
900 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
901 LM_FLAG_PRIORITY | GL_NOCACHE,
902 t_gh);
903
904 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
905 error = gfs2_jdesc_check(jd);
906 if (error)
907 break;
908 error = gfs2_find_jhead(jd, &lh);
909 if (error)
910 break;
911 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
912 error = -EBUSY;
913 break;
914 }
915 }
916
917 if (error)
918 gfs2_glock_dq_uninit(t_gh);
919
920 out:
921 while (!list_empty(&list)) {
922 lfcc = list_entry(list.next, struct lfcc, list);
923 list_del(&lfcc->list);
924 gfs2_glock_dq_uninit(&lfcc->gh);
925 kfree(lfcc);
926 }
927 gfs2_glock_dq_uninit(&ji_gh);
928
929 return error;
930}
931
932/**
933 * gfs2_freeze_fs - freezes the file system
934 * @sdp: the file system
935 *
936 * This function flushes data and metadata for all machines by
937 * acquiring the transaction lock exclusively. It also verifies
938 * that all journals are in a clean state.
939 *
940 * Returns: errno
941 */
942
943int gfs2_freeze_fs(struct gfs2_sbd *sdp)
944{
945 int error = 0;
946
947 mutex_lock(&sdp->sd_freeze_lock);
948
949 if (!sdp->sd_freeze_count++) {
950 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
951 if (error)
952 sdp->sd_freeze_count--;
953 }
954
955 mutex_unlock(&sdp->sd_freeze_lock);
956
957 return error;
958}
959
960/**
961 * gfs2_unfreeze_fs - unfreezes the file system
962 * @sdp: the file system
963 *
964 * This function allows the file system to proceed by unlocking
965 * the exclusively held transaction lock. Other GFS2 nodes are
966 * now free to acquire the lock shared and go on with their lives.
967 *
968 */
969
970void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
971{
972 mutex_lock(&sdp->sd_freeze_lock);
973
974 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
975 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
976
977 mutex_unlock(&sdp->sd_freeze_lock);
978}
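Taken together, the two functions implement counted freezing: only the first gfs2_freeze_fs() call actually takes the transaction lock, and only the last gfs2_unfreeze_fs() drops it. A hypothetical caller (sketch only; the helper name is invented):

/* Sketch: quiesce the filesystem around some maintenance work. */
static int with_fs_frozen(struct gfs2_sbd *sdp)
{
	int error = gfs2_freeze_fs(sdp);	/* first caller locks the fs */
	if (error)
		return error;
	/* ... all writes are quiesced here; nested freezes just count ... */
	gfs2_unfreeze_fs(sdp);			/* last caller unlocks */
	return 0;
}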
979
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..df2495230402
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13void gfs2_tune_init(struct gfs2_tune *gt);
14
15int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
16int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
17
18static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
19{
20 unsigned int x;
21 spin_lock(&sdp->sd_jindex_spin);
22 x = sdp->sd_journals;
23 spin_unlock(&sdp->sd_jindex_spin);
24 return x;
25}
26
27int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
28void gfs2_jindex_free(struct gfs2_sbd *sdp);
29
30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
31void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
32struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
33int gfs2_jdesc_check(struct gfs2_jdesc *jd);
34
35int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp);
37
38int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
40
41int gfs2_statfs_init(struct gfs2_sbd *sdp);
42void gfs2_statfs_change(struct gfs2_sbd *sdp,
43 int64_t total, int64_t free, int64_t dinodes);
44int gfs2_statfs_sync(struct gfs2_sbd *sdp);
45int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
46int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
47
48int gfs2_freeze_fs(struct gfs2_sbd *sdp);
49void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
50
51#endif /* __SUPER_DOT_H__ */
52
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..3c4cb4558905
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,579 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "lm.h"
24#include "sys.h"
25#include "super.h"
26#include "glock.h"
27#include "quota.h"
28#include "util.h"
29
30char *gfs2_sys_margs;
31spinlock_t gfs2_sys_margs_lock;
32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{
35 return sprintf(buf, "%s\n", sdp->sd_vfs->s_id);
36}
37
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
39{
40 return sprintf(buf, "%s\n", sdp->sd_fsname);
41}
42
43static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
44{
45 unsigned int count;
46
47 mutex_lock(&sdp->sd_freeze_lock);
48 count = sdp->sd_freeze_count;
49 mutex_unlock(&sdp->sd_freeze_lock);
50
51 return sprintf(buf, "%u\n", count);
52}
53
54static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
55{
56 ssize_t ret = len;
57 int error = 0;
58 int n = simple_strtol(buf, NULL, 0);
59
60 if (!capable(CAP_SYS_ADMIN))
61 return -EACCES;
62
63 switch (n) {
64 case 0:
65 gfs2_unfreeze_fs(sdp);
66 break;
67 case 1:
68 error = gfs2_freeze_fs(sdp);
69 break;
70 default:
71 ret = -EINVAL;
72 }
73
74 if (error)
75 fs_warn(sdp, "freeze %d error %d", n, error);
76
77 return ret;
78}
79
80static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
81{
82 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
83 return sprintf(buf, "%u\n", b);
84}
85
86static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
87{
88 if (!capable(CAP_SYS_ADMIN))
89 return -EACCES;
90
91 if (simple_strtol(buf, NULL, 0) != 1)
92 return -EINVAL;
93
94 gfs2_lm_withdraw(sdp,
95 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
96 sdp->sd_fsname);
97 return len;
98}
99
100static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
101 size_t len)
102{
103 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES;
105
106 if (simple_strtol(buf, NULL, 0) != 1)
107 return -EINVAL;
108
109 gfs2_statfs_sync(sdp);
110 return len;
111}
112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len)
127{
128 if (!capable(CAP_SYS_ADMIN))
129 return -EACCES;
130
131 if (simple_strtol(buf, NULL, 0) != 1)
132 return -EINVAL;
133
134 gfs2_quota_sync(sdp);
135 return len;
136}
137
138static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
139 size_t len)
140{
141 uint32_t id;
142
143 if (!capable(CAP_SYS_ADMIN))
144 return -EACCES;
145
146 id = simple_strtoul(buf, NULL, 0);
147
148 gfs2_quota_refresh(sdp, 1, id);
149 return len;
150}
151
152static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
153 size_t len)
154{
155 uint32_t id;
156
157 if (!capable(CAP_SYS_ADMIN))
158 return -EACCES;
159
160 id = simple_strtoul(buf, NULL, 0);
161
162 gfs2_quota_refresh(sdp, 0, id);
163 return len;
164}
165
166struct gfs2_attr {
167 struct attribute attr;
168 ssize_t (*show)(struct gfs2_sbd *, char *);
169 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
170};
171
172#define GFS2_ATTR(name, mode, show, store) \
173static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
174
175GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
182GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
183GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
184
185static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr,
193 &gfs2_attr_quota_refresh_user.attr,
194 &gfs2_attr_quota_refresh_group.attr,
195 NULL,
196};
197
198static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
199 char *buf)
200{
201 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
202 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
203 return a->show ? a->show(sdp, buf) : 0;
204}
205
206static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
207 const char *buf, size_t len)
208{
209 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
210 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
211 return a->store ? a->store(sdp, buf, len) : len;
212}
213
214static struct sysfs_ops gfs2_attr_ops = {
215 .show = gfs2_attr_show,
216 .store = gfs2_attr_store,
217};
218
219static struct kobj_type gfs2_ktype = {
220 .default_attrs = gfs2_attrs,
221 .sysfs_ops = &gfs2_attr_ops,
222};
223
224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2",},
227 .ktype = &gfs2_ktype,
228};
229
230/*
231 * display struct lm_lockstruct fields
232 */
233
234struct lockstruct_attr {
235 struct attribute attr;
236 ssize_t (*show)(struct gfs2_sbd *, char *);
237};
238
239#define LOCKSTRUCT_ATTR(name, fmt) \
240static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
241{ \
242 return sprintf(buf, fmt, sdp->sd_lockstruct.ls_##name); \
243} \
244static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
245
246LOCKSTRUCT_ATTR(jid, "%u\n");
247LOCKSTRUCT_ATTR(first, "%u\n");
248LOCKSTRUCT_ATTR(lvb_size, "%u\n");
249LOCKSTRUCT_ATTR(flags, "%d\n");
250
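For readers unfamiliar with this sysfs macro idiom, the LOCKSTRUCT_ATTR(jid, "%u\n") invocation above expands to roughly the following:

static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
{
	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
}
static struct lockstruct_attr lockstruct_attr_jid = __ATTR_RO(jid);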
251static struct attribute *lockstruct_attrs[] = {
252 &lockstruct_attr_jid.attr,
253 &lockstruct_attr_first.attr,
254 &lockstruct_attr_lvb_size.attr,
255 &lockstruct_attr_flags.attr,
256 NULL
257};
258
259/*
260 * display struct gfs2_args fields
261 */
262
263struct args_attr {
264 struct attribute attr;
265 ssize_t (*show)(struct gfs2_sbd *, char *);
266};
267
268#define ARGS_ATTR(name, fmt) \
269static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
270{ \
271 return sprintf(buf, fmt, sdp->sd_args.ar_##name); \
272} \
273static struct args_attr args_attr_##name = __ATTR_RO(name)
274
275ARGS_ATTR(lockproto, "%s\n");
276ARGS_ATTR(locktable, "%s\n");
277ARGS_ATTR(hostdata, "%s\n");
278ARGS_ATTR(spectator, "%d\n");
279ARGS_ATTR(ignore_local_fs, "%d\n");
280ARGS_ATTR(localcaching, "%d\n");
281ARGS_ATTR(localflocks, "%d\n");
282ARGS_ATTR(debug, "%d\n");
283ARGS_ATTR(upgrade, "%d\n");
284ARGS_ATTR(num_glockd, "%u\n");
285ARGS_ATTR(posix_acl, "%d\n");
286ARGS_ATTR(quota, "%u\n");
287ARGS_ATTR(suiddir, "%d\n");
288ARGS_ATTR(data, "%d\n");
289
290/* one oddball doesn't fit the macro mold */
291static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
292{
293 return sprintf(buf, "%d\n", !!test_bit(SDF_NOATIME, &sdp->sd_flags));
294}
295static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
296
297static struct attribute *args_attrs[] = {
298 &args_attr_lockproto.attr,
299 &args_attr_locktable.attr,
300 &args_attr_hostdata.attr,
301 &args_attr_spectator.attr,
302 &args_attr_ignore_local_fs.attr,
303 &args_attr_localcaching.attr,
304 &args_attr_localflocks.attr,
305 &args_attr_debug.attr,
306 &args_attr_upgrade.attr,
307 &args_attr_num_glockd.attr,
308 &args_attr_posix_acl.attr,
309 &args_attr_quota.attr,
310 &args_attr_suiddir.attr,
311 &args_attr_data.attr,
312 &args_attr_noatime.attr,
313 NULL
314};
315
316/*
317 * display counters from superblock
318 */
319
320struct counters_attr {
321 struct attribute attr;
322 ssize_t (*show)(struct gfs2_sbd *, char *);
323};
324
325#define COUNTERS_ATTR(name, fmt) \
326static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
327{ \
328 return sprintf(buf, fmt, (unsigned int)atomic_read(&sdp->sd_##name)); \
329} \
330static struct counters_attr counters_attr_##name = __ATTR_RO(name)
331
332COUNTERS_ATTR(glock_count, "%u\n");
333COUNTERS_ATTR(glock_held_count, "%u\n");
334COUNTERS_ATTR(inode_count, "%u\n");
335COUNTERS_ATTR(reclaimed, "%u\n");
336
337static struct attribute *counters_attrs[] = {
338 &counters_attr_glock_count.attr,
339 &counters_attr_glock_held_count.attr,
340 &counters_attr_inode_count.attr,
341 &counters_attr_reclaimed.attr,
342 NULL
343};
344
345/*
346 * get and set struct gfs2_tune fields
347 */
348
349static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
350{
351 return sprintf(buf, "%u %u\n", sdp->sd_tune.gt_quota_scale_num,
352 sdp->sd_tune.gt_quota_scale_den);
353}
354
355static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
356 size_t len)
357{
358 struct gfs2_tune *gt = &sdp->sd_tune;
359 unsigned int x, y;
360
361 if (!capable(CAP_SYS_ADMIN))
362 return -EACCES;
363
364 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
365 return -EINVAL;
366
367 spin_lock(&gt->gt_spin);
368 gt->gt_quota_scale_num = x;
369 gt->gt_quota_scale_den = y;
370 spin_unlock(&gt->gt_spin);
371 return len;
372}
373
374static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
375 int check_zero, const char *buf, size_t len)
376{
377 struct gfs2_tune *gt = &sdp->sd_tune;
378 unsigned int x;
379
380 if (!capable(CAP_SYS_ADMIN))
381 return -EACCES;
382
383 x = simple_strtoul(buf, NULL, 0);
384
385 if (check_zero && !x)
386 return -EINVAL;
387
388 spin_lock(&gt->gt_spin);
389 *field = x;
390 spin_unlock(&gt->gt_spin);
391 return len;
392}
393
394struct tune_attr {
395 struct attribute attr;
396 ssize_t (*show)(struct gfs2_sbd *, char *);
397 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
398};
399
400#define TUNE_ATTR_3(name, show, store) \
401static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
402
403#define TUNE_ATTR_2(name, store) \
404static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
405{ \
406 return sprintf(buf, "%u\n", sdp->sd_tune.gt_##name); \
407} \
408TUNE_ATTR_3(name, name##_show, store)
409
410#define TUNE_ATTR(name, check_zero) \
411static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
412{ \
413 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
414} \
415TUNE_ATTR_2(name, name##_store)
416
417#define TUNE_ATTR_DAEMON(name, process) \
418static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
419{ \
420 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
421 wake_up_process(sdp->sd_##process); \
422 return r; \
423} \
424TUNE_ATTR_2(name, name##_store)
425
426TUNE_ATTR(ilimit, 0);
427TUNE_ATTR(ilimit_tries, 0);
428TUNE_ATTR(ilimit_min, 0);
429TUNE_ATTR(demote_secs, 0);
430TUNE_ATTR(incore_log_blocks, 0);
431TUNE_ATTR(log_flush_secs, 0);
432TUNE_ATTR(jindex_refresh_secs, 0);
433TUNE_ATTR(quota_warn_period, 0);
434TUNE_ATTR(quota_quantum, 0);
435TUNE_ATTR(atime_quantum, 0);
436TUNE_ATTR(max_readahead, 0);
437TUNE_ATTR(complain_secs, 0);
438TUNE_ATTR(reclaim_limit, 0);
439TUNE_ATTR(prefetch_secs, 0);
440TUNE_ATTR(statfs_slow, 0);
441TUNE_ATTR(new_files_jdata, 0);
442TUNE_ATTR(new_files_directio, 0);
443TUNE_ATTR(quota_simul_sync, 1);
444TUNE_ATTR(quota_cache_secs, 1);
445TUNE_ATTR(max_atomic_write, 1);
446TUNE_ATTR(stall_secs, 1);
447TUNE_ATTR(entries_per_readdir, 1);
448TUNE_ATTR(greedy_default, 1);
449TUNE_ATTR(greedy_quantum, 1);
450TUNE_ATTR(greedy_max, 1);
451TUNE_ATTR(statfs_quantum, 1);
452TUNE_ATTR_DAEMON(scand_secs, scand_process);
453TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
454TUNE_ATTR_DAEMON(logd_secs, logd_process);
455TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
456TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
457
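Chasing the three macro layers by hand, TUNE_ATTR(stall_secs, 1) above expands to approximately:

static ssize_t stall_secs_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
	return tune_set(sdp, &sdp->sd_tune.gt_stall_secs, 1, buf, len);
}
static ssize_t stall_secs_show(struct gfs2_sbd *sdp, char *buf)
{
	return sprintf(buf, "%u\n", sdp->sd_tune.gt_stall_secs);
}
static struct tune_attr tune_attr_stall_secs =
	__ATTR(stall_secs, 0644, stall_secs_show, stall_secs_store);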
458static struct attribute *tune_attrs[] = {
459 &tune_attr_ilimit.attr,
460 &tune_attr_ilimit_tries.attr,
461 &tune_attr_ilimit_min.attr,
462 &tune_attr_demote_secs.attr,
463 &tune_attr_incore_log_blocks.attr,
464 &tune_attr_log_flush_secs.attr,
465 &tune_attr_jindex_refresh_secs.attr,
466 &tune_attr_quota_warn_period.attr,
467 &tune_attr_quota_quantum.attr,
468 &tune_attr_atime_quantum.attr,
469 &tune_attr_max_readahead.attr,
470 &tune_attr_complain_secs.attr,
471 &tune_attr_reclaim_limit.attr,
472 &tune_attr_prefetch_secs.attr,
473 &tune_attr_statfs_slow.attr,
474 &tune_attr_quota_simul_sync.attr,
475 &tune_attr_quota_cache_secs.attr,
476 &tune_attr_max_atomic_write.attr,
477 &tune_attr_stall_secs.attr,
478 &tune_attr_entries_per_readdir.attr,
479 &tune_attr_greedy_default.attr,
480 &tune_attr_greedy_quantum.attr,
481 &tune_attr_greedy_max.attr,
482 &tune_attr_statfs_quantum.attr,
483 &tune_attr_scand_secs.attr,
484 &tune_attr_recoverd_secs.attr,
485 &tune_attr_logd_secs.attr,
486 &tune_attr_quotad_secs.attr,
487 &tune_attr_quota_scale.attr,
488 &tune_attr_new_files_jdata.attr,
489 &tune_attr_new_files_directio.attr,
490 NULL
491};
492
493static struct attribute_group lockstruct_group = {
494 .name = "lockstruct",
495 .attrs = lockstruct_attrs
496};
497
498static struct attribute_group counters_group = {
499 .name = "counters",
500 .attrs = counters_attrs
501};
502
503static struct attribute_group args_group = {
504 .name = "args",
505 .attrs = args_attrs
506};
507
508static struct attribute_group tune_group = {
509 .name = "tune",
510 .attrs = tune_attrs
511};
512
513int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
514{
515 int error;
516
517 sdp->sd_kobj.kset = &gfs2_kset;
518 sdp->sd_kobj.ktype = &gfs2_ktype;
519
520 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
521 if (error)
522 goto fail;
523
524 error = kobject_register(&sdp->sd_kobj);
525 if (error)
526 goto fail;
527
528 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
529 if (error)
530 goto fail_reg;
531
532 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
533 if (error)
534 goto fail_lockstruct;
535
536 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
537 if (error)
538 goto fail_counters;
539
540 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
541 if (error)
542 goto fail_args;
543
544 return 0;
545
546 fail_args:
547 sysfs_remove_group(&sdp->sd_kobj, &args_group);
548 fail_counters:
549 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
550 fail_lockstruct:
551 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
552 fail_reg:
553 kobject_unregister(&sdp->sd_kobj);
554 fail:
555 return error;
556}
557
558void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
559{
560 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
561 sysfs_remove_group(&sdp->sd_kobj, &args_group);
562 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
563 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
564 kobject_unregister(&sdp->sd_kobj);
565}
566
567int gfs2_sys_init(void)
568{
569 gfs2_sys_margs = NULL;
570 spin_lock_init(&gfs2_sys_margs_lock);
571 return kset_register(&gfs2_kset);
572}
573
574void gfs2_sys_uninit(void)
575{
576 kfree(gfs2_sys_margs);
577 kset_unregister(&gfs2_kset);
578}
579
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..c46a700e801e
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13/* Allow args to be passed to GFS2 when using an initial ram disk */
14extern char *gfs2_sys_margs;
15extern spinlock_t gfs2_sys_margs_lock;
16
17int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
18void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
19
20int gfs2_sys_init(void);
21void gfs2_sys_uninit(void);
22
23#endif /* __SYS_DOT_H__ */
24
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..05e0b72d56ff
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "util.h"
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes)
30{
31 struct gfs2_trans *tr;
32 int error;
33
34 BUG_ON(current->journal_info);
35 BUG_ON(blocks == 0 && revokes == 0);
36
37 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
38 if (!tr)
39 return -ENOMEM;
40
41 tr->tr_ip = (unsigned long)__builtin_return_address(0);
42 tr->tr_blocks = blocks;
43 tr->tr_revokes = revokes;
44 tr->tr_reserved = 1;
45 if (blocks)
46 tr->tr_reserved += 6 + blocks;
47 if (revokes)
48 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
49 sizeof(uint64_t));
50 INIT_LIST_HEAD(&tr->tr_list_buf);
51
52 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
53
54 error = gfs2_glock_nq(&tr->tr_t_gh);
55 if (error)
56 goto fail_holder_uninit;
57
58 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
59 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
60 error = -EROFS;
61 goto fail_gunlock;
62 }
63
64 error = gfs2_log_reserve(sdp, tr->tr_reserved);
65 if (error)
66 goto fail_gunlock;
67
68 current->journal_info = tr;
69
70 return 0;
71
72fail_gunlock:
73 gfs2_glock_dq(&tr->tr_t_gh);
74
75fail_holder_uninit:
76 gfs2_holder_uninit(&tr->tr_t_gh);
77 kfree(tr);
78
79 return error;
80}
81
82void gfs2_trans_end(struct gfs2_sbd *sdp)
83{
84 struct gfs2_trans *tr = current->journal_info;
85
86 BUG_ON(!tr);
87 current->journal_info = NULL;
88
89 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh);
92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 return;
95 }
96
97 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
98		fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u\n",
99 tr->tr_num_buf, tr->tr_blocks);
100 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
101 }
102 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
103		fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u\n",
104 tr->tr_num_revoke, tr->tr_revokes);
105 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
106 }
107
108 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh);
110 gfs2_holder_uninit(&tr->tr_t_gh);
111 kfree(tr);
112
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL);
115}
116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to
125 * @bh: The buffer to add
126 * @meta: True in the case of adding metadata
127 *
128 */
129
130void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
131{
132 struct gfs2_sbd *sdp = gl->gl_sbd;
133 struct gfs2_bufdata *bd;
134
135 bd = bh->b_private;
136 if (bd)
137 gfs2_assert(sdp, bd->bd_gl == gl);
138 else {
139 gfs2_attach_bufdata(gl, bh, meta);
140 bd = bh->b_private;
141 }
142 lops_add(sdp, &bd->bd_le);
143}
144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno)
146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
148 GFP_NOFS | __GFP_NOFAIL);
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
150 rv->rv_blkno = blkno;
151 lops_add(sdp, &rv->rv_le);
152}
153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno)
155{
156 struct gfs2_revoke *rv;
157 int found = 0;
158
159 gfs2_log_lock(sdp);
160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
162 if (rv->rv_blkno == blkno) {
163 list_del(&rv->rv_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--;
166 found = 1;
167 break;
168 }
169 }
170
171 gfs2_log_unlock(sdp);
172
173 if (found) {
174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv);
176 tr->tr_num_revoke_rm++;
177 }
178}
179
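The revoke/unrevoke pair above follows the usual journaling pattern: freeing a journaled metadata block queues a revoke so log replay cannot resurrect its stale contents, while reallocating that block for metadata before the log wraps cancels the pending revoke. A hypothetical sequence:

/* Block blkno held journaled metadata and has just been freed: */
gfs2_trans_add_revoke(sdp, blkno);
/* ... later, the same block is reused for new metadata: */
gfs2_trans_add_unrevoke(sdp, blkno);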
180void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
181{
182 lops_add(rgd->rd_sbd, &rgd->rd_le);
183}
184
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..fbef3f5a99e3
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#define RES_DINODE 1
14#define RES_INDIRECT 1
15#define RES_JDATA 1
16#define RES_DATA 1
17#define RES_LEAF 1
18#define RES_RG_BIT 2
19#define RES_EATTR 1
20#define RES_STATFS 1
21#define RES_QUOTA 2
22
23int gfs2_trans_begin(struct gfs2_sbd *sdp,
24 unsigned int blocks, unsigned int revokes);
25
26void gfs2_trans_end(struct gfs2_sbd *sdp);
27
28void gfs2_trans_add_gl(struct gfs2_glock *gl);
29void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
30void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno);
31void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno);
32void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
33
34#endif /* __TRANS_DOT_H__ */
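Putting the reservation constants and entry points together, a typical metadata update follows the begin/add/end pattern already visible in gfs2_statfs_sync() above. A condensed sketch (error paths abbreviated; the helper name is hypothetical):

static int touch_dinode(struct gfs2_sbd *sdp, struct gfs2_inode *ip,
			struct buffer_head *bh)
{
	int error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;
	gfs2_trans_add_bh(ip->i_gl, bh, 1);	/* journal bh as metadata */
	/* ... modify bh->b_data under the transaction ... */
	gfs2_trans_end(sdp);
	return 0;
}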
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..39e67b1ec70a
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "util.h"
25
26kmem_cache_t *gfs2_glock_cachep __read_mostly;
27kmem_cache_t *gfs2_inode_cachep __read_mostly;
28kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
29
30void gfs2_assert_i(struct gfs2_sbd *sdp)
31{
32 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
33 sdp->sd_fsname);
34}
35
36/**
37 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
38 * Returns: -1 if this call withdrew the machine,
39 * -2 if it was already withdrawn
40 */
41
42int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
43 const char *function, char *file, unsigned int line)
44{
45 int me;
46 me = gfs2_lm_withdraw(sdp,
47 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
48 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
49 sdp->sd_fsname, assertion,
50 sdp->sd_fsname, function, file, line);
51 dump_stack();
52 return (me) ? -1 : -2;
53}
54
55/**
56 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
57 * Returns: -1 if we printed something
58 * -2 if we didn't
59 */
60
61int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
62 const char *function, char *file, unsigned int line)
63{
64 if (time_before(jiffies,
65 sdp->sd_last_warning +
66 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
67 return -2;
68
69 printk(KERN_WARNING
70 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
71 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
72 sdp->sd_fsname, assertion,
73 sdp->sd_fsname, function, file, line);
74
75 if (sdp->sd_args.ar_debug)
76 BUG();
77 else
78 dump_stack();
79
80 sdp->sd_last_warning = jiffies;
81
82 return -1;
83}
84
85/**
86 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
87 * Returns: -1 if this call withdrew the machine,
88 * 0 if it was already withdrawn
89 */
90
91int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
92 char *file, unsigned int line)
93{
94 int rv;
95 rv = gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
97 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
98 sdp->sd_fsname,
99 sdp->sd_fsname, function, file, line);
100 return rv;
101}
102
103/**
104 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
105 * Returns: -1 if this call withdrew the machine,
106 * 0 if it was already withdrawn
107 */
108
109int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
110 const char *function, char *file, unsigned int line)
111{
112 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
113 int rv;
114 rv = gfs2_lm_withdraw(sdp,
115 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
116 "GFS2: fsid=%s: inode = %llu %llu\n"
117 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
118 sdp->sd_fsname,
119 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
120 (unsigned long long)ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 uint16_t type, uint16_t t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
245
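As a worked example of the index arithmetic in gfs2_icbit_munge() (assuming 4096-byte pages, i.e. 32768 bits per page): bit 70000 yields c = 70000 / 32768 = 2, then b = 70000 % 32768 = 4464, o = 4464 / 8 = 558, and b % 8 = 0, so the function flips bit 0 of bitmap[2][558].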
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..8216d28bd816
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,169 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13
14#define fs_printk(level, fs, fmt, arg...) \
15 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
16
17#define fs_info(fs, fmt, arg...) \
18 fs_printk(KERN_INFO , fs , fmt , ## arg)
19
20#define fs_warn(fs, fmt, arg...) \
21 fs_printk(KERN_WARNING , fs , fmt , ## arg)
22
23#define fs_err(fs, fmt, arg...) \
24 fs_printk(KERN_ERR, fs , fmt , ## arg)
25
26
27void gfs2_assert_i(struct gfs2_sbd *sdp);
28
29#define gfs2_assert(sdp, assertion) \
30do { \
31 if (unlikely(!(assertion))) { \
32 gfs2_assert_i(sdp); \
33 BUG(); \
34 } \
35} while (0)
36
37
38int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
39 const char *function, char *file, unsigned int line);
40
41#define gfs2_assert_withdraw(sdp, assertion) \
42((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
43 __FUNCTION__, __FILE__, __LINE__))
44
45
46int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
47 const char *function, char *file, unsigned int line);
48
49#define gfs2_assert_warn(sdp, assertion) \
50((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
51 __FUNCTION__, __FILE__, __LINE__))
52
53
54int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
55 const char *function, char *file, unsigned int line);
56
57#define gfs2_consist(sdp) \
58gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
59
60
61int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
62 const char *function, char *file, unsigned int line);
63
64#define gfs2_consist_inode(ip) \
65gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
66
67
68int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
69 const char *function, char *file, unsigned int line);
70
71#define gfs2_consist_rgrpd(rgd) \
72gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
73
74
75int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
76 const char *type, const char *function,
77 char *file, unsigned int line);
78
79static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
80 struct buffer_head *bh,
81 const char *function,
82 char *file, unsigned int line)
83{
84 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
85 uint32_t magic = mh->mh_magic;
86 magic = be32_to_cpu(magic);
87 if (unlikely(magic != GFS2_MAGIC))
88 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
89 file, line);
90 return 0;
91}
92
93#define gfs2_meta_check(sdp, bh) \
94gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
95
96
97int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
98 uint16_t type, uint16_t t,
99 const char *function,
100 char *file, unsigned int line);
101
102static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
103 struct buffer_head *bh,
104 uint16_t type,
105 const char *function,
106 char *file, unsigned int line)
107{
108 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
109 uint32_t magic = mh->mh_magic;
110 uint16_t t = be32_to_cpu(mh->mh_type);
111 magic = be32_to_cpu(magic);
112 if (unlikely(magic != GFS2_MAGIC))
113 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
114 file, line);
115 if (unlikely(t != type))
116 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
117 file, line);
118 return 0;
119}
120
121#define gfs2_metatype_check(sdp, bh, type) \
122gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
123
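A typical call site checks a just-read buffer before trusting its contents; GFS2_METATYPE_DI (the dinode type code from gfs2_ondisk.h) is used here purely for illustration:

	error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_DI);
	if (error)
		return error;	/* buffer is not a valid dinode block */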
124static inline void gfs2_metatype_set(struct buffer_head *bh, uint16_t type,
125 uint16_t format)
126{
127 struct gfs2_meta_header *mh;
128 mh = (struct gfs2_meta_header *)bh->b_data;
129 mh->mh_type = cpu_to_be32(type);
130 mh->mh_format = cpu_to_be32(format);
131}
132
133
134int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
135 char *file, unsigned int line);
136
137#define gfs2_io_error(sdp) \
138gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)
139
140
141int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
142 const char *function, char *file, unsigned int line);
143
144#define gfs2_io_error_bh(sdp, bh) \
145gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
146
147
148extern kmem_cache_t *gfs2_glock_cachep;
149extern kmem_cache_t *gfs2_inode_cachep;
150extern kmem_cache_t *gfs2_bufdata_cachep;
151
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p)
154{
155 unsigned int x;
156 spin_lock(&gt->gt_spin);
157 x = *p;
158 spin_unlock(&gt->gt_spin);
159 return x;
160}
161
162#define gfs2_tune_get(sdp, field) \
163gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
164
165void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
166 unsigned int bit, int new_value);
167
168#endif /* __UTIL_DOT_H__ */
169
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 2b8a7d68fae3..2121cde187d8 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -8,7 +8,7 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
 	atmppp.h atmsap.h atmsvc.h atm_zatm.h auto_fs4.h auxvec.h \
 	awe_voice.h ax25.h b1lli.h baycom.h bfs_fs.h blkpg.h \
 	bpqether.h cdk.h chio.h coda_psdev.h coff.h comstats.h \
-	consolemap.h cycx_cfm.h dm-ioctl.h dn.h dqblk_v1.h \
+	consolemap.h cycx_cfm.h dlm_device.h dm-ioctl.h dn.h dqblk_v1.h \
 	dqblk_v2.h dqblk_xfs.h efs_fs_sb.h elf-fdpic.h elf.h elf-em.h \
 	fadvise.h fd.h fdreg.h ftape-header-segment.h ftape-vendors.h \
 	fuse.h futex.h genetlink.h gen_stats.h gigaset_dev.h hdsmart.h \
@@ -18,28 +18,29 @@ header-y += affs_fs.h affs_hardblocks.h aio_abi.h a.out.h arcfb.h \
 	if_plip.h if_ppp.h if_slip.h if_strip.h if_tunnel.h in6.h \
 	in_route.h ioctl.h ip.h ipmi_msgdefs.h ip_mp_alg.h ipsec.h \
 	ipx.h irda.h isdn_divertif.h iso_fs.h ite_gpio.h ixjuser.h \
-	jffs2.h keyctl.h limits.h major.h matroxfb.h meye.h minix_fs.h \
-	mmtimer.h mqueue.h mtio.h ncp_no.h netfilter_arp.h netrom.h \
-	nfs2.h nfs4_mount.h nfs_mount.h openprom_fs.h param.h \
-	pci_ids.h pci_regs.h personality.h pfkeyv2.h pg.h pkt_cls.h \
-	pkt_sched.h posix_types.h ppdev.h prctl.h ps2esdi.h qic117.h \
-	qnxtypes.h quotaio_v1.h quotaio_v2.h radeonfb.h raw.h \
-	resource.h rose.h sctp.h smbno.h snmp.h sockios.h som.h \
-	sound.h stddef.h synclink.h telephony.h termios.h ticable.h \
-	times.h tiocl.h tipc.h toshiba.h ultrasound.h un.h utime.h \
-	utsname.h video_decoder.h video_encoder.h videotext.h vt.h \
-	wavefront.h wireless.h xattr.h x25.h zorro_ids.h
+	jffs2.h keyctl.h limits.h lock_dlm_plock.h major.h matroxfb.h \
+	meye.h minix_fs.h mmtimer.h mqueue.h mtio.h ncp_no.h \
+	netfilter_arp.h netrom.h nfs2.h nfs4_mount.h nfs_mount.h \
+	openprom_fs.h param.h pci_ids.h pci_regs.h personality.h \
+	pfkeyv2.h pg.h pkt_cls.h pkt_sched.h posix_types.h ppdev.h \
+	prctl.h ps2esdi.h qic117.h qnxtypes.h quotaio_v1.h quotaio_v2.h \
+	radeonfb.h raw.h resource.h rose.h sctp.h smbno.h snmp.h \
+	sockios.h som.h sound.h stddef.h synclink.h telephony.h \
+	termios.h ticable.h times.h tiocl.h tipc.h toshiba.h \
+	ultrasound.h un.h utime.h utsname.h video_decoder.h \
+	video_encoder.h videotext.h vt.h wavefront.h wireless.h xattr.h \
+	x25.h zorro_ids.h
 
 unifdef-y += acct.h adb.h adfs_fs.h agpgart.h apm_bios.h atalk.h \
 	atmarp.h atmdev.h atm.h atm_tcp.h audit.h auto_fs.h binfmts.h \
 	capability.h capi.h cciss_ioctl.h cdrom.h cm4000_cs.h \
 	cn_proc.h coda.h connector.h cramfs_fs.h cuda.h cyclades.h \
-	dccp.h dirent.h divert.h elfcore.h errno.h errqueue.h \
+	dccp.h dirent.h divert.h dlm.h elfcore.h errno.h errqueue.h \
 	ethtool.h eventpoll.h ext2_fs.h ext3_fs.h fb.h fcntl.h \
 	filter.h flat.h fs.h ftape.h gameport.h generic_serial.h \
-	genhd.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h hiddev.h hpet.h \
-	i2c.h i2o-dev.h icmpv6.h if_bridge.h if_ec.h \
-	if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \
+	genhd.h gfs2_ondisk.h hayesesp.h hdlcdrv.h hdlc.h hdreg.h \
+	hiddev.h hpet.h i2c.h i2o-dev.h icmpv6.h iflags.h if_bridge.h \
+	if_ec.h if_eql.h if_ether.h if_frad.h if_ltalk.h if_pppox.h \
 	if_shaper.h if_tr.h if_tun.h if_vlan.h if_wanpipe.h igmp.h \
 	inet_diag.h in.h inotify.h input.h ipc.h ipmi.h ipv6.h \
 	ipv6_route.h isdn.h isdnif.h isdn_ppp.h isicom.h jbd.h \
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644
index 000000000000..1b1dcb9a40bb
--- /dev/null
+++ b/include/linux/dlm.h
@@ -0,0 +1,302 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_DOT_H__
15#define __DLM_DOT_H__
16
17/*
18 * Interface to Distributed Lock Manager (DLM)
19 * routines and structures to use DLM lockspaces
20 */
21
22/*
23 * Lock Modes
24 */
25
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37
38#define DLM_RESNAME_MAXLEN 64
39
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is an lvblen-byte array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been effected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
88 * DLM_LKF_NODLCKWT
89 * DLM_LKF_NODLCKBLK
90 *
91 * not yet implemented
92 *
93 * DLM_LKF_EXPEDITE
94 *
95 * Used only with new requests for NL mode locks. Tells the lock manager
96 * to grant the lock, ignoring other locks in convert and wait queues.
97 *
98 * DLM_LKF_NOQUEUEBAST
99 *
100 * Send blocking ASTs before returning -EAGAIN to the caller. It is only
101 * used along with the NOQUEUE flag. Blocking ASTs are not otherwise sent
102 * for failed NOQUEUE requests.
103 *
104 * DLM_LKF_HEADQUE
105 *
106 * Add a lock to the head of the convert or wait queue rather than the tail.
107 *
108 * DLM_LKF_NOORDER
109 *
110 * Disregard the standard grant order rules and grant a lock as soon as it
111 * is compatible with other granted locks.
112 *
113 * DLM_LKF_ORPHAN
114 *
115 * not yet implemented
116 *
117 * DLM_LKF_ALTPR
118 *
119 * If the requested mode cannot be granted immediately, try to grant the lock
120 * in PR mode instead. If this alternate mode is granted instead of the
121 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
122 *
123 * DLM_LKF_ALTCW
124 *
125 * The same as ALTPR, but the alternate mode is CW.
126 *
127 * DLM_LKF_FORCEUNLOCK
128 *
129 * Unlock the lock even if it is converting or waiting or has sublocks.
130 * Only really for use by the userland device.c code.
131 *
132 */
133
134#define DLM_LKF_NOQUEUE 0x00000001
135#define DLM_LKF_CANCEL 0x00000002
136#define DLM_LKF_CONVERT 0x00000004
137#define DLM_LKF_VALBLK 0x00000008
138#define DLM_LKF_QUECVT 0x00000010
139#define DLM_LKF_IVVALBLK 0x00000020
140#define DLM_LKF_CONVDEADLK 0x00000040
141#define DLM_LKF_PERSISTENT 0x00000080
142#define DLM_LKF_NODLCKWT 0x00000100
143#define DLM_LKF_NODLCKBLK 0x00000200
144#define DLM_LKF_EXPEDITE 0x00000400
145#define DLM_LKF_NOQUEUEBAST 0x00000800
146#define DLM_LKF_HEADQUE 0x00001000
147#define DLM_LKF_NOORDER 0x00002000
148#define DLM_LKF_ORPHAN 0x00004000
149#define DLM_LKF_ALTPR 0x00008000
150#define DLM_LKF_ALTCW 0x00010000
151#define DLM_LKF_FORCEUNLOCK 0x00020000
152
153/*
154 * Some return codes that are not in errno.h
155 */
156
157#define DLM_ECANCEL 0x10001
158#define DLM_EUNLOCK 0x10002
159
160typedef void dlm_lockspace_t;
161
162/*
163 * Lock status block
164 *
165 * Use this structure to specify the contents of the lock value block. For a
166 * conversion request, this structure is used to specify the lock ID of the
167 * lock. DLM writes the status of the lock request and the lock ID assigned
168 * to the request in the lock status block.
169 *
170 * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
171 * It is available when dlm_lock returns.
172 *
173 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
174 * shown for the DLM_LKF_VALBLK flag.
175 *
176 * sb_flags: DLM_SBF_DEMOTED is returned if, while promoting a lock, it
177 * was first demoted to NL to avoid conversion deadlock.
178 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
179 *
180 * sb_status: the returned status of the lock request set prior to AST
181 * execution. Possible return values:
182 *
183 * 0 if lock request was successful
184 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
185 * -ENOMEM if there is no memory to process request
186 * -EINVAL if there are invalid parameters
187 * -DLM_EUNLOCK if unlock request was successful
188 * -DLM_ECANCEL if a cancel completed successfully
189 */
190
191#define DLM_SBF_DEMOTED 0x01
192#define DLM_SBF_VALNOTVALID 0x02
193#define DLM_SBF_ALTMODE 0x04
194
195struct dlm_lksb {
196 int sb_status;
197 uint32_t sb_lkid;
198 char sb_flags;
199 char * sb_lvbptr;
200};
201
202
203#ifdef __KERNEL__
204
205#define DLM_LSFL_NODIR 0x00000001
206
207/*
208 * dlm_new_lockspace
209 *
210 * Starts a lockspace with the given name. If the named lockspace exists in
211 * the cluster, the calling node joins it.
212 */
213
214int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
215 uint32_t flags, int lvblen);
216
217/*
218 * dlm_release_lockspace
219 *
220 * Stop a lockspace.
221 */
222
223int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
224
225/*
226 * dlm_lock
227 *
228 * Make an asynchronous request to acquire or convert a lock on a named
229 * resource.
230 *
231 * lockspace: context for the request
232 * mode: the requested mode of the lock (DLM_LOCK_)
233 * lksb: lock status block for input and async return values
234 * flags: input flags (DLM_LKF_)
235 * name: name of the resource to lock, can be binary
236 * namelen: the length in bytes of the resource name (DLM_RESNAME_MAXLEN)
237 * parent: the lock ID of a parent lock or 0 if none
238 * lockast: function DLM executes when it completes processing the request
239 * astarg: argument passed to lockast and bast functions
240 * bast: function DLM executes when this lock later blocks another request
241 *
242 * Returns:
243 * 0 if request is successfully queued for processing
244 * -EINVAL if any input parameters are invalid
245 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
246 * -ENOMEM if there is no memory to process request
247 * -ENOTCONN if there is a communication error
248 *
249 * If the call to dlm_lock returns an error then the operation has failed and
250 * the AST routine will not be called. If dlm_lock returns 0 it is still
251 * possible that the lock operation will fail. The AST routine will be called
252 * when the locking is complete and the status is returned in the lksb.
253 *
254 * If AST routines or the astarg parameter are passed to a conversion
255 * operation then they will overwrite the values that were passed to a
256 * previous dlm_lock call.
257 *
258 * AST routines should not block (at least not for long), but may make
259 * any locking calls they please.
260 */
261
262int dlm_lock(dlm_lockspace_t *lockspace,
263 int mode,
264 struct dlm_lksb *lksb,
265 uint32_t flags,
266 void *name,
267 unsigned int namelen,
268 uint32_t parent_lkid,
269 void (*lockast) (void *astarg),
270 void *astarg,
271 void (*bast) (void *astarg, int mode));
272
273/*
274 * dlm_unlock
275 *
276 * Asynchronously release a lock on a resource. The AST routine is called
277 * when the resource is successfully unlocked.
278 *
279 * lockspace: context for the request
280 * lkid: the lock ID as returned in the lksb
281 * flags: input flags (DLM_LKF_)
282 * lksb: if NULL the lksb parameter passed to last lock request is used
283 * astarg: the arg used with the completion ast for the unlock
284 *
285 * Returns:
286 * 0 if request is successfully queued for processing
287 * -EINVAL if any input parameters are invalid
288 * -ENOTEMPTY if the lock still has sublocks
289 * -EBUSY if the lock is waiting for a remote lock operation
290 * -ENOTCONN if there is a communication error
291 */
292
293int dlm_unlock(dlm_lockspace_t *lockspace,
294 uint32_t lkid,
295 uint32_t flags,
296 struct dlm_lksb *lksb,
297 void *astarg);
298
299#endif /* __KERNEL__ */
300
301#endif /* __DLM_DOT_H__ */
302
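[Editor's note: taken together, dlm_new_lockspace, dlm_lock and dlm_unlock above form the whole kernel-side API of this header. A minimal sketch of a caller follows; the lockspace name, the 32-byte LVB length, the resource name and the error handling are illustrative assumptions, not anything this patch prescribes.]

	#include <linux/completion.h>
	#include <linux/string.h>
	#include <linux/dlm.h>

	static struct completion ast_done;

	/* Completion AST: dlm calls this when the request has been processed. */
	static void my_ast(void *astarg)
	{
		complete(&ast_done);
	}

	static int grab_lock_example(void)
	{
		dlm_lockspace_t *ls;
		struct dlm_lksb lksb;
		int error;

		/* Join or create a lockspace; 32 is an arbitrary LVB length. */
		error = dlm_new_lockspace("example", 7, &ls, 0, 32);
		if (error)
			return error;

		memset(&lksb, 0, sizeof(lksb));
		init_completion(&ast_done);

		/* Request an EX lock; with NOQUEUE a contended request fails
		 * with -EAGAIN instead of waiting. */
		error = dlm_lock(ls, DLM_LOCK_EX, &lksb, DLM_LKF_NOQUEUE,
				 "my-resource", 11, 0, my_ast, NULL, NULL);
		if (error)
			goto out;

		wait_for_completion(&ast_done);
		if (lksb.sb_status) {		/* e.g. -EAGAIN under NOQUEUE */
			error = lksb.sb_status;
			goto out;
		}

		/* ... resource is held exclusively here ... */

		init_completion(&ast_done);
		error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, NULL);
		if (!error)
			wait_for_completion(&ast_done);	/* sb_status: -DLM_EUNLOCK */
	out:
		dlm_release_lockspace(ls, 0);
		return error;
	}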
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644
index 000000000000..2a2dd189b9fd
--- /dev/null
+++ b/include/linux/dlm_device.h
@@ -0,0 +1,86 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/* This is the device interface for dlm, most users will use a library
15 * interface.
16 */
17
18#define DLM_USER_LVB_LEN 32
19
20/* Version of the device interface */
21#define DLM_DEVICE_VERSION_MAJOR 5
22#define DLM_DEVICE_VERSION_MINOR 0
23#define DLM_DEVICE_VERSION_PATCH 0
24
25/* struct passed to the lock write */
26struct dlm_lock_params {
27 __u8 mode;
28 __u8 namelen;
29 __u16 flags;
30 __u32 lkid;
31 __u32 parent;
32 void __user *castparam;
33 void __user *castaddr;
34 void __user *bastparam;
35 void __user *bastaddr;
36 struct dlm_lksb __user *lksb;
37 char lvb[DLM_USER_LVB_LEN];
38 char name[0];
39};
40
41struct dlm_lspace_params {
42 __u32 flags;
43 __u32 minor;
44 char name[0];
45};
46
47struct dlm_write_request {
48 __u32 version[3];
49 __u8 cmd;
50 __u8 is64bit;
51 __u8 unused[2];
52
53 union {
54 struct dlm_lock_params lock;
55 struct dlm_lspace_params lspace;
56 } i;
57};
58
59/* struct read from the "device" fd,
60 consists mainly of userspace pointers for the library to use */
61struct dlm_lock_result {
62 __u32 length;
63 void __user * user_astaddr;
64 void __user * user_astparam;
65 struct dlm_lksb __user * user_lksb;
66 struct dlm_lksb lksb;
67 __u8 bast_mode;
68 __u8 unused[3];
69 /* Offsets may be zero if no data is present */
70 __u32 lvb_offset;
71};
72
73/* Commands passed to the device */
74#define DLM_USER_LOCK 1
75#define DLM_USER_UNLOCK 2
76#define DLM_USER_QUERY 3
77#define DLM_USER_CREATE_LOCKSPACE 4
78#define DLM_USER_REMOVE_LOCKSPACE 5
79
80/* Arbitrary length restriction */
81#define MAX_LS_NAME_LEN 64
82
83/* Lockspace flags */
84#define DLM_USER_LSFLG_AUTOFREE 1
85#define DLM_USER_LSFLG_FORCEFREE 2
86
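[Editor's note: for illustration, a userspace sketch of issuing DLM_USER_CREATE_LOCKSPACE through this interface: build a dlm_write_request with the version triple filled in and the lockspace name appended, then write it to the dlm control device. The device path and the meaning of the write return value are assumptions here; the library normally hides these details.]

	#include <fcntl.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/dlm_device.h>

	static int create_lockspace(const char *name)
	{
		struct dlm_write_request *req;
		size_t len;
		int fd, rv;

		if (strlen(name) >= MAX_LS_NAME_LEN)
			return -1;

		len = sizeof(*req) + strlen(name) + 1;
		req = calloc(1, len);
		if (!req)
			return -1;

		req->version[0] = DLM_DEVICE_VERSION_MAJOR;
		req->version[1] = DLM_DEVICE_VERSION_MINOR;
		req->version[2] = DLM_DEVICE_VERSION_PATCH;
		req->cmd = DLM_USER_CREATE_LOCKSPACE;
		req->i.lspace.flags = DLM_USER_LSFLG_AUTOFREE;
		strcpy(req->i.lspace.name, name);	/* name[0] flexible member */

		fd = open("/dev/misc/dlm-control", O_RDWR);	/* assumed path */
		rv = fd < 0 ? -1 : (int)write(fd, req, len);
		if (fd >= 0)
			close(fd);
		free(req);
		return rv;	/* assumed: minor number of the new device */
	}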
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 555bc195c420..4e93bf925086 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1376,6 +1376,9 @@ extern struct subsystem fs_subsys;
1376#define FLOCK_VERIFY_READ 1 1376#define FLOCK_VERIFY_READ 1
1377#define FLOCK_VERIFY_WRITE 2 1377#define FLOCK_VERIFY_WRITE 2
1378 1378
1379/* /sys/fs */
1380extern struct subsystem fs_subsys;
1381
1379extern int locks_mandatory_locked(struct inode *); 1382extern int locks_mandatory_locked(struct inode *);
1380extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); 1383extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1381 1384
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
new file mode 100644
index 000000000000..3ebd8743ce8c
--- /dev/null
+++ b/include/linux/gfs2_ondisk.h
@@ -0,0 +1,443 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_ONDISK_DOT_H__
11#define __GFS2_ONDISK_DOT_H__
12
13#define GFS2_MAGIC 0x01161970
14#define GFS2_BASIC_BLOCK 512
15#define GFS2_BASIC_BLOCK_SHIFT 9
16
17/* Lock numbers of the LM_TYPE_NONDISK type */
18
19#define GFS2_MOUNT_LOCK 0
20#define GFS2_LIVE_LOCK 1
21#define GFS2_TRANS_LOCK 2
22#define GFS2_RENAME_LOCK 3
23
24/* Format numbers for various metadata types */
25
26#define GFS2_FORMAT_NONE 0
27#define GFS2_FORMAT_SB 100
28#define GFS2_FORMAT_RG 200
29#define GFS2_FORMAT_RB 300
30#define GFS2_FORMAT_DI 400
31#define GFS2_FORMAT_IN 500
32#define GFS2_FORMAT_LF 600
33#define GFS2_FORMAT_JD 700
34#define GFS2_FORMAT_LH 800
35#define GFS2_FORMAT_LD 900
36#define GFS2_FORMAT_LB 1000
37#define GFS2_FORMAT_EA 1600
38#define GFS2_FORMAT_ED 1700
39#define GFS2_FORMAT_QC 1400
40/* These are format numbers for entities contained in files */
41#define GFS2_FORMAT_RI 1100
42#define GFS2_FORMAT_DE 1200
43#define GFS2_FORMAT_QU 1500
44/* These are part of the superblock */
45#define GFS2_FORMAT_FS 1801
46#define GFS2_FORMAT_MULTI 1900
47
48/*
49 * An on-disk inode number
50 */
51
52struct gfs2_inum {
53 __be64 no_formal_ino;
54 __be64 no_addr;
55};
56
57static inline int gfs2_inum_equal(const struct gfs2_inum *ino1,
58 const struct gfs2_inum *ino2)
59{
60 return ino1->no_formal_ino == ino2->no_formal_ino &&
61 ino1->no_addr == ino2->no_addr;
62}
63
64/*
65 * Generic metadata head structure
66 * Every inplace buffer logged in the journal must start with this.
67 */
68
69#define GFS2_METATYPE_NONE 0
70#define GFS2_METATYPE_SB 1
71#define GFS2_METATYPE_RG 2
72#define GFS2_METATYPE_RB 3
73#define GFS2_METATYPE_DI 4
74#define GFS2_METATYPE_IN 5
75#define GFS2_METATYPE_LF 6
76#define GFS2_METATYPE_JD 7
77#define GFS2_METATYPE_LH 8
78#define GFS2_METATYPE_LD 9
79#define GFS2_METATYPE_LB 12
80#define GFS2_METATYPE_EA 10
81#define GFS2_METATYPE_ED 11
82#define GFS2_METATYPE_QC 14
83
84struct gfs2_meta_header {
85 __be32 mh_magic;
86 __be32 mh_type;
87 __be64 __pad0; /* Was generation number in gfs1 */
88 __be32 mh_format;
89 __be32 __pad1; /* Was incarnation number in gfs1 */
90};
91
92/*
93 * super-block structure
94 *
95 * It's probably good if SIZEOF_SB <= GFS2_BASIC_BLOCK (512 bytes)
96 *
97 * Order is important, need to be able to read old superblocks to do on-disk
98 * version upgrades.
99 */
100
101/* Address of superblock in GFS2 basic blocks */
102#define GFS2_SB_ADDR 128
103
104/* The lock number for the superblock (must be zero) */
105#define GFS2_SB_LOCK 0
106
107/* Requirement: GFS2_LOCKNAME_LEN % 8 == 0
108 Includes: the terminating zero at the end */
109#define GFS2_LOCKNAME_LEN 64
110
111struct gfs2_sb {
112 struct gfs2_meta_header sb_header;
113
114 __be32 sb_fs_format;
115 __be32 sb_multihost_format;
116 __u32 __pad0; /* Was superblock flags in gfs1 */
117
118 __be32 sb_bsize;
119 __be32 sb_bsize_shift;
120 __u32 __pad1; /* Was journal segment size in gfs1 */
121
122 struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
123 struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
124 struct gfs2_inum sb_root_dir;
125
126 char sb_lockproto[GFS2_LOCKNAME_LEN];
127 char sb_locktable[GFS2_LOCKNAME_LEN];
128 /* In gfs1, quota and license dinodes followed */
129};
130
131/*
132 * resource index structure
133 */
134
135struct gfs2_rindex {
136 __be64 ri_addr; /* grp block disk address */
137 __be32 ri_length; /* length of rgrp header in fs blocks */
138 __u32 __pad;
139
140 __be64 ri_data0; /* first data location */
141 __be32 ri_data; /* num of data blocks in rgrp */
142
143 __be32 ri_bitbytes; /* number of bytes in data bitmaps */
144
145 __u8 ri_reserved[64];
146};
147
148/*
149 * resource group header structure
150 */
151
152/* Number of blocks per byte in rgrp */
153#define GFS2_NBBY 4
154#define GFS2_BIT_SIZE 2
155#define GFS2_BIT_MASK 0x00000003
156
157#define GFS2_BLKST_FREE 0
158#define GFS2_BLKST_USED 1
159#define GFS2_BLKST_UNLINKED 2
160#define GFS2_BLKST_DINODE 3
161
162#define GFS2_RGF_JOURNAL 0x00000001
163#define GFS2_RGF_METAONLY 0x00000002
164#define GFS2_RGF_DATAONLY 0x00000004
165#define GFS2_RGF_NOALLOC 0x00000008
166
167struct gfs2_rgrp {
168 struct gfs2_meta_header rg_header;
169
170 __be32 rg_flags;
171 __be32 rg_free;
172 __be32 rg_dinodes;
173 __be32 __pad;
174 __be64 rg_igeneration;
175
176 __u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
177};
178
179/*
180 * quota structure
181 */
182
183struct gfs2_quota {
184 __be64 qu_limit;
185 __be64 qu_warn;
186 __be64 qu_value;
187 __u8 qu_reserved[64];
188};
189
190/*
191 * dinode structure
192 */
193
194#define GFS2_MAX_META_HEIGHT 10
195#define GFS2_DIR_MAX_DEPTH 17
196
197#define DT2IF(dt) (((dt) << 12) & S_IFMT)
198#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
199
200enum {
201 gfs2fl_Jdata = 0,
202 gfs2fl_ExHash = 1,
203 gfs2fl_Unused = 2,
204 gfs2fl_EaIndirect = 3,
205 gfs2fl_Directio = 4,
206 gfs2fl_Immutable = 5,
207 gfs2fl_AppendOnly = 6,
208 gfs2fl_NoAtime = 7,
209 gfs2fl_Sync = 8,
210 gfs2fl_System = 9,
211 gfs2fl_TruncInProg = 29,
212 gfs2fl_InheritDirectio = 30,
213 gfs2fl_InheritJdata = 31,
214};
215
216/* Dinode flags */
217#define GFS2_DIF_JDATA 0x00000001
218#define GFS2_DIF_EXHASH 0x00000002
219#define GFS2_DIF_UNUSED 0x00000004 /* only in gfs1 */
220#define GFS2_DIF_EA_INDIRECT 0x00000008
221#define GFS2_DIF_DIRECTIO 0x00000010
222#define GFS2_DIF_IMMUTABLE 0x00000020
223#define GFS2_DIF_APPENDONLY 0x00000040
224#define GFS2_DIF_NOATIME 0x00000080
225#define GFS2_DIF_SYNC 0x00000100
226#define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */
227#define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */
228#define GFS2_DIF_INHERIT_DIRECTIO 0x40000000
229#define GFS2_DIF_INHERIT_JDATA 0x80000000
230
231struct gfs2_dinode {
232 struct gfs2_meta_header di_header;
233
234 struct gfs2_inum di_num;
235
236 __be32 di_mode; /* mode of file */
237 __be32 di_uid; /* owner's user id */
238 __be32 di_gid; /* owner's group id */
239 __be32 di_nlink; /* number of links to this file */
240 __be64 di_size; /* number of bytes in file */
241 __be64 di_blocks; /* number of blocks in file */
242 __be64 di_atime; /* time last accessed */
243 __be64 di_mtime; /* time last modified */
244 __be64 di_ctime; /* time last changed */
245 __be32 di_major; /* device major number */
246 __be32 di_minor; /* device minor number */
247
248 /* This section varies from gfs1. Padding added to align with
249 * remainder of dinode
250 */
251 __be64 di_goal_meta; /* rgrp to alloc from next */
252 __be64 di_goal_data; /* data block goal */
253 __be64 di_generation; /* generation number for NFS */
254
255 __be32 di_flags; /* GFS2_DIF_... */
256 __be32 di_payload_format; /* GFS2_FORMAT_... */
257 __u16 __pad1; /* Was ditype in gfs1 */
258 __be16 di_height; /* height of metadata */
259 __u32 __pad2; /* Unused incarnation number from gfs1 */
260
261 /* These only apply to directories */
262 __u16 __pad3; /* Padding */
263 __be16 di_depth; /* Number of bits in the table */
264 __be32 di_entries; /* The number of entries in the directory */
265
266 struct gfs2_inum __pad4; /* Unused even in current gfs1 */
267
268 __be64 di_eattr; /* extended attribute block number */
269
270 __u8 di_reserved[56];
271};
272
273/*
274 * directory structure - many of these per directory file
275 */
276
277#define GFS2_FNAMESIZE 255
278#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
279
280struct gfs2_dirent {
281 struct gfs2_inum de_inum;
282 __be32 de_hash;
283 __be16 de_rec_len;
284 __be16 de_name_len;
285 __be16 de_type;
286 __u8 __pad[14];
287};
288
289/*
290 * Header of leaf directory nodes
291 */
292
293struct gfs2_leaf {
294 struct gfs2_meta_header lf_header;
295
296 __be16 lf_depth; /* Depth of leaf */
297 __be16 lf_entries; /* Number of dirents in leaf */
298 __be32 lf_dirent_format; /* Format of the dirents */
299 __be64 lf_next; /* Next leaf, if overflow */
300
301 __u8 lf_reserved[64];
302};
303
304/*
305 * Extended attribute header format
306 */
307
308#define GFS2_EA_MAX_NAME_LEN 255
309#define GFS2_EA_MAX_DATA_LEN 65536
310
311#define GFS2_EATYPE_UNUSED 0
312#define GFS2_EATYPE_USR 1
313#define GFS2_EATYPE_SYS 2
314#define GFS2_EATYPE_SECURITY 3
315
316#define GFS2_EATYPE_LAST 3
317#define GFS2_EATYPE_VALID(x) ((x) <= GFS2_EATYPE_LAST)
318
319#define GFS2_EAFLAG_LAST 0x01 /* last ea in block */
320
321struct gfs2_ea_header {
322 __be32 ea_rec_len;
323 __be32 ea_data_len;
324 __u8 ea_name_len; /* no NUL terminator after the string */
325 __u8 ea_type; /* GFS2_EATYPE_... */
326 __u8 ea_flags; /* GFS2_EAFLAG_... */
327 __u8 ea_num_ptrs;
328 __u32 __pad;
329};
330
331/*
332 * Log header structure
333 */
334
335#define GFS2_LOG_HEAD_UNMOUNT 0x00000001 /* log is clean */
336
337struct gfs2_log_header {
338 struct gfs2_meta_header lh_header;
339
340 __be64 lh_sequence; /* Sequence number of this transaction */
341 __be32 lh_flags; /* GFS2_LOG_HEAD_... */
342 __be32 lh_tail; /* Block number of log tail */
343 __be32 lh_blkno;
344 __be32 lh_hash;
345};
346
347/*
348 * Log type descriptor
349 */
350
351#define GFS2_LOG_DESC_METADATA 300
352/* ld_data1 is the number of metadata blocks in the descriptor.
353 ld_data2 is unused. */
354
355#define GFS2_LOG_DESC_REVOKE 301
356/* ld_data1 is the number of revoke blocks in the descriptor.
357 ld_data2 is unused. */
358
359#define GFS2_LOG_DESC_JDATA 302
360/* ld_data1 is the number of data blocks in the descriptor.
361 ld_data2 is unused. */
362
363struct gfs2_log_descriptor {
364 struct gfs2_meta_header ld_header;
365
366 __be32 ld_type; /* GFS2_LOG_DESC_... */
367 __be32 ld_length; /* Number of buffers in this chunk */
368 __be32 ld_data1; /* descriptor-specific field */
369 __be32 ld_data2; /* descriptor-specific field */
370
371 __u8 ld_reserved[32];
372};
373
374/*
375 * Inum Range
376 * Describes a range of formal inode numbers allocated to
377 * one machine to assign to inodes.
378 */
379
380#define GFS2_INUM_QUANTUM 1048576
381
382struct gfs2_inum_range {
383 __be64 ir_start;
384 __be64 ir_length;
385};
386
387/*
388 * Statfs change
389 * Describes a change to the pool of free and allocated
390 * blocks.
391 */
392
393struct gfs2_statfs_change {
394 __be64 sc_total;
395 __be64 sc_free;
396 __be64 sc_dinodes;
397};
398
399/*
400 * Quota change
401 * Describes an allocation change for a particular
402 * user or group.
403 */
404
405#define GFS2_QCF_USER 0x00000001
406
407struct gfs2_quota_change {
408 __be64 qc_change;
409 __be32 qc_flags; /* GFS2_QCF_... */
410 __be32 qc_id;
411};
412
413#ifdef __KERNEL__
414/* Translation functions */
415
416extern void gfs2_inum_in(struct gfs2_inum *no, char *buf);
417extern void gfs2_inum_out(const struct gfs2_inum *no, char *buf);
418extern void gfs2_sb_in(struct gfs2_sb *sb, char *buf);
419extern void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf);
420extern void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf);
421extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf);
422extern void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf);
423extern void gfs2_quota_in(struct gfs2_quota *qu, char *buf);
424extern void gfs2_quota_out(struct gfs2_quota *qu, char *buf);
425extern void gfs2_dinode_in(struct gfs2_dinode *di, char *buf);
426extern void gfs2_dinode_out(struct gfs2_dinode *di, char *buf);
427extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf);
428extern void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf);
429extern void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf);
430extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf);
431extern void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf);
432extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf);
433extern void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf);
434extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf);
435
436/* Printing functions */
437
438extern void gfs2_rindex_print(struct gfs2_rindex *ri);
439extern void gfs2_dinode_print(struct gfs2_dinode *di);
440
441#endif /* __KERNEL__ */
442
443#endif /* __GFS2_ONDISK_DOT_H__ */
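[Editor's note: each block in a resource group is tracked by a two-bit state in the rgrp bitmap, GFS2_NBBY blocks per byte. A sketch of how a reader could pull the state of block n out of a bitmap buffer, using only the constants in this header; this is an illustration, not the fs/gfs2/rgrp.c implementation.]

	#include <linux/gfs2_ondisk.h>

	static inline unsigned char gfs2_block_state(const unsigned char *bitmap,
						     unsigned int block)
	{
		const unsigned char byte = bitmap[block / GFS2_NBBY];
		const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;

		/* One of GFS2_BLKST_FREE, _USED, _UNLINKED or _DINODE. */
		return (byte >> bit) & GFS2_BIT_MASK;
	}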
diff --git a/include/linux/iflags.h b/include/linux/iflags.h
new file mode 100644
index 000000000000..5b27102dfeaf
--- /dev/null
+++ b/include/linux/iflags.h
@@ -0,0 +1,102 @@
1#ifndef _LINUX_IFLAGS_H
2#define _LINUX_IFLAGS_H
3
4/*
5 * A universal set of inode flags.
6 *
7 * Originally taken from ext2/3 with additions for other filesystems.
8 * Filesystems supporting this interface should interoperate with
9 * the lsattr and chattr command line tools.
10 *
11 * This interface is supported in whole or in part by:
12 * ext2
13 * ext3
14 * xfs
15 * jfs
16 * gfs2
17 *
18 */
19
20#define IFLAGS_GET_IOC _IOR('f', 1, long)
21#define IFLAGS_SET_IOC _IOW('f', 2, long)
22
23/*
24 * These values are provided for use as indices of an array
25 * for use with the iflags_cvt function below
26 */
27enum {
28 iflag_SecureRm = 0, /* Secure deletion */
29 iflag_Unrm = 1, /* Undelete */
30 iflag_Compress = 2, /* Compress file */
31 iflag_Sync = 3, /* Synchronous updates */
32 iflag_Immutable = 4, /* Immutable */
33 iflag_Append = 5, /* Append */
34 iflag_NoDump = 6, /* Don't dump file */
35 iflag_NoAtime = 7, /* No atime updates */
36 /* Reserved for compression usage */
37 iflag_Dirty = 8,
38 iflag_ComprBlk = 9, /* One or more compressed clusters */
39 iflag_NoComp = 10, /* Don't compress */
40 iflag_Ecompr = 11, /* Compression error */
41 /* End of compression flags */
42 iflag_Btree = 12, /* btree format dir */
43 iflag_Index = 12, /* hash-indexed directory */
44 iflag_Imagic = 13, /* AFS directory */
45 iflag_JournalData = 14, /* file data should be journaled */
46 iflag_NoTail = 15, /* file tail should not be merged */
47 iflag_DirSync = 16, /* dirsync behaviour */
48 iflag_TopDir = 17, /* Top of directory hierarchies */
49 iflag_Extent = 19, /* Extents */
50 iflag_DirectIO = 20, /* Always use direct I/O on this file */
51 iflag_Reserved = 31 /* reserved for ext2/3 lib */
52};
53
54#define __IFL(x) (1<<(iflag_##x))
55#define IFLAG_SECRM __IFL(SecureRm) /* 0x00000001 */
56#define IFLAG_UNRM __IFL(Unrm) /* 0x00000002 */
57#define IFLAG_COMPR __IFL(Compress) /* 0x00000004 */
58#define IFLAG_SYNC __IFL(Sync) /* 0x00000008 */
59#define IFLAG_IMMUTABLE __IFL(Immutable) /* 0x00000010 */
60#define IFLAG_APPEND __IFL(Append) /* 0x00000020 */
61#define IFLAG_NODUMP __IFL(NoDump) /* 0x00000040 */
62#define IFLAG_NOATIME __IFL(NoAtime) /* 0x00000080 */
63#define IFLAG_DIRTY __IFL(Dirty) /* 0x00000100 */
64#define IFLAG_COMPRBLK __IFL(ComprBlk) /* 0x00000200 */
65#define IFLAG_NOCOMP __IFL(NoComp) /* 0x00000400 */
66#define IFLAG_ECOMPR __IFL(Ecompr) /* 0x00000800 */
67#define IFLAG_BTREE __IFL(Btree) /* 0x00001000 */
68#define IFLAG_INDEX __IFL(Index) /* 0x00001000 */
69#define IFLAG_IMAGIC __IFL(Imagic) /* 0x00002000 */
70#define IFLAG_JOURNAL_DATA __IFL(JournalData) /* 0x00004000 */
71#define IFLAG_NOTAIL __IFL(NoTail) /* 0x00008000 */
72#define IFLAG_DIRSYNC __IFL(DirSync) /* 0x00010000 */
73#define IFLAG_TOPDIR __IFL(TopDir) /* 0x00020000 */
74#define IFLAG_EXTENT __IFL(Extent) /* 0x00080000 */
75#define IFLAG_DIRECTIO __IFL(DirectIO) /* 0x00100000 */
76#define IFLAG_RESERVED __IFL(Reserved) /* 0x80000000 */
77
78#ifdef __KERNEL__
79/**
80 * iflags_cvt
81 * @table: A table of 32 u32 flags
82 * @val: a 32 bit value to convert
83 *
84 * This function can be used to convert between IFLAGS values and
85 * the filesystem's own flags values.
86 *
87 * Returns: the converted flags
88 */
89static inline u32 iflags_cvt(const u32 *table, u32 val)
90{
91 u32 res = 0;
92 while(val) {
93 if (val & 1)
94 res |= *table;
95 table++;
96 val >>= 1;
97 }
98 return res;
99}
100#endif /* __KERNEL__ */
101
102#endif /* _LINUX_IFLAGS_H */
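[Editor's note: as an example of how iflags_cvt is meant to be used, a filesystem supplies a 32-entry table indexed by its own flag bit numbers, each entry holding the corresponding IFLAG_* value (zero where unsupported). The sketch below uses the gfs2fl_* bit numbers from gfs2_ondisk.h above; the mapping shown is illustrative, not the table gfs2 actually ships.]

	#include <linux/types.h>
	#include <linux/iflags.h>
	#include <linux/gfs2_ondisk.h>

	/* Indexed by gfs2 dinode flag bit; unlisted entries convert to 0. */
	static const u32 gfs2_to_iflags[32] = {
		[gfs2fl_Sync]       = IFLAG_SYNC,
		[gfs2fl_Immutable]  = IFLAG_IMMUTABLE,
		[gfs2fl_AppendOnly] = IFLAG_APPEND,
		[gfs2fl_NoAtime]    = IFLAG_NOATIME,
		[gfs2fl_Jdata]      = IFLAG_JOURNAL_DATA,
		[gfs2fl_Directio]   = IFLAG_DIRECTIO,
	};

	/* Convert a dinode's di_flags to the generic representation. */
	static u32 gfs2_di_flags_to_iflags(u32 di_flags)
	{
		return iflags_cvt(gfs2_to_iflags, di_flags);
	}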
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 851aa1bcfc1a..6fce604d2502 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -32,6 +32,7 @@ extern const char linux_banner[];
32 32
33#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 33#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
34#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) 34#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
35#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
35#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) 36#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
36#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) 37#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
37 38
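[Editor's note: the new DIV_ROUND_UP macro is integer ceiling division, presumably added for use by the gfs2/dlm code merged here. A short illustration:]

	/* Number of fs blocks needed to hold len bytes of block size bsize. */
	unsigned int blocks = DIV_ROUND_UP(len, bsize);
	/* e.g. DIV_ROUND_UP(1000, 512) == 2, DIV_ROUND_UP(1024, 512) == 2 */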
diff --git a/include/linux/lock_dlm_plock.h b/include/linux/lock_dlm_plock.h
new file mode 100644
index 000000000000..fc3415113973
--- /dev/null
+++ b/include/linux/lock_dlm_plock.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __LOCK_DLM_PLOCK_DOT_H__
10#define __LOCK_DLM_PLOCK_DOT_H__
11
12#define GDLM_PLOCK_MISC_NAME "lock_dlm_plock"
13
14#define GDLM_PLOCK_VERSION_MAJOR 1
15#define GDLM_PLOCK_VERSION_MINOR 1
16#define GDLM_PLOCK_VERSION_PATCH 0
17
18enum {
19 GDLM_PLOCK_OP_LOCK = 1,
20 GDLM_PLOCK_OP_UNLOCK,
21 GDLM_PLOCK_OP_GET,
22};
23
24struct gdlm_plock_info {
25 __u32 version[3];
26 __u8 optype;
27 __u8 ex;
28 __u8 wait;
29 __u8 pad;
30 __u32 pid;
31 __s32 nodeid;
32 __s32 rv;
33 __u32 fsid;
34 __u64 number;
35 __u64 start;
36 __u64 end;
37 __u64 owner;
38};
39
40#endif
41
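[Editor's note: gdlm_plock_info is the request/reply record passed between the kernel and a userspace plock daemon via the GDLM_PLOCK_MISC_NAME misc device. A sketch of how a whole-file exclusive lock request might be filled in; every field value below is illustrative.]

	#include <linux/lock_dlm_plock.h>

	struct gdlm_plock_info info = {
		.version = { GDLM_PLOCK_VERSION_MAJOR,
			     GDLM_PLOCK_VERSION_MINOR,
			     GDLM_PLOCK_VERSION_PATCH },
		.optype  = GDLM_PLOCK_OP_LOCK,
		.ex      = 1,		/* exclusive (write) lock */
		.wait    = 1,		/* block until granted */
		.pid     = 1234,	/* lock owner's pid (illustrative) */
		.fsid    = 0,		/* filesystem instance id */
		.number  = 42,		/* inode number being locked */
		.start   = 0,
		.end     = ~0ULL,	/* whole-file range */
	};
	/* The reply fills in .rv, and .nodeid/.pid for GDLM_PLOCK_OP_GET. */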
diff --git a/mm/filemap.c b/mm/filemap.c
index b9a60c43b61a..d9bbea1e87d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,7 +1181,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1181 *ppos = pos + retval; 1181 *ppos = pos + retval;
1182 } 1182 }
1183 file_accessed(filp); 1183 file_accessed(filp);
1184 goto out; 1184 if (retval != 0)
1185 goto out;
1185 } 1186 }
1186 1187
1187 retval = 0; 1188 retval = 0;
diff --git a/mm/readahead.c b/mm/readahead.c
index aa7ec424656a..1ba736ac0367 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
38 ra->ra_pages = mapping->backing_dev_info->ra_pages; 38 ra->ra_pages = mapping->backing_dev_info->ra_pages;
39 ra->prev_page = -1; 39 ra->prev_page = -1;
40} 40}
41EXPORT_SYMBOL_GPL(file_ra_state_init);
41 42
42/* 43/*
43 * Return max readahead size for this inode in number-of-pages. 44 * Return max readahead size for this inode in number-of-pages.