author	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-04 12:06:16 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-10-04 12:06:16 -0400
commit	4a61f17378c2cdd9bd8f34ef8bd7422861d0c1f1 (patch)
tree	a2054556900af8c16fd9f5419f012dcf1ee2995a
parent	d002ec481c24f325ed6cfcb7810d317c015dd1b5 (diff)
parent	7ecdb70a0ea436c06540140242bfac6ac3babfc0 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6: (292 commits)
  [GFS2] Fix endian bug for de_type
  [GFS2] Initialize SELinux extended attributes at inode creation time.
  [GFS2] Move logging code into log.c (mostly)
  [GFS2] Mark nlink cleared so VFS sees it happen
  [GFS2] Two redundant casts removed
  [GFS2] Remove uneeded endian conversion
  [GFS2] Remove duplicate sb reading code
  [GFS2] Mark metadata reads for blktrace
  [GFS2] Remove iflags.h, use FS_
  [GFS2] Fix code style/indent in ops_file.c
  [GFS2] streamline-generic_file_-interfaces-and-filemap gfs fix
  [GFS2] Remove readv/writev methods and use aio_read/aio_write instead (gfs bits)
  [GFS2] inode-diet: Eliminate i_blksize from the inode structure
  [GFS2] inode_diet: Replace inode.u.generic_ip with inode.i_private (gfs)
  [GFS2] Fix typo in last patch
  [GFS2] Fix direct i/o logic in filemap.c
  [GFS2] Fix bug in Makefiles for lock modules
  [GFS2] Remove (extra) fs_subsys declaration
  [GFS2/DLM] Fix trailing whitespace
  [GFS2] Tidy up meta_io code
  ...
-rw-r--r-- CREDITS | 6
-rw-r--r-- Documentation/filesystems/gfs2.txt | 43
-rw-r--r-- MAINTAINERS | 18
-rw-r--r-- fs/Kconfig | 2
-rw-r--r-- fs/Makefile | 2
-rw-r--r-- fs/configfs/item.c | 2
-rw-r--r-- fs/dlm/Kconfig | 21
-rw-r--r-- fs/dlm/Makefile | 19
-rw-r--r-- fs/dlm/ast.c | 173
-rw-r--r-- fs/dlm/ast.h | 26
-rw-r--r-- fs/dlm/config.c | 789
-rw-r--r-- fs/dlm/config.h | 42
-rw-r--r-- fs/dlm/debug_fs.c | 387
-rw-r--r-- fs/dlm/dir.c | 423
-rw-r--r-- fs/dlm/dir.h | 30
-rw-r--r-- fs/dlm/dlm_internal.h | 543
-rw-r--r-- fs/dlm/lock.c | 3871
-rw-r--r-- fs/dlm/lock.h | 62
-rw-r--r-- fs/dlm/lockspace.c | 717
-rw-r--r-- fs/dlm/lockspace.h | 25
-rw-r--r-- fs/dlm/lowcomms.c | 1238
-rw-r--r-- fs/dlm/lowcomms.h | 26
-rw-r--r-- fs/dlm/lvb_table.h | 18
-rw-r--r-- fs/dlm/main.c | 97
-rw-r--r-- fs/dlm/member.c | 327
-rw-r--r-- fs/dlm/member.h | 24
-rw-r--r-- fs/dlm/memory.c | 116
-rw-r--r-- fs/dlm/memory.h | 29
-rw-r--r-- fs/dlm/midcomms.c | 140
-rw-r--r-- fs/dlm/midcomms.h | 21
-rw-r--r-- fs/dlm/rcom.c | 472
-rw-r--r-- fs/dlm/rcom.h | 24
-rw-r--r-- fs/dlm/recover.c | 765
-rw-r--r-- fs/dlm/recover.h | 34
-rw-r--r-- fs/dlm/recoverd.c | 290
-rw-r--r-- fs/dlm/recoverd.h | 24
-rw-r--r-- fs/dlm/requestqueue.c | 184
-rw-r--r-- fs/dlm/requestqueue.h | 22
-rw-r--r-- fs/dlm/user.c | 788
-rw-r--r-- fs/dlm/user.h | 16
-rw-r--r-- fs/dlm/util.c | 161
-rw-r--r-- fs/dlm/util.h | 22
-rw-r--r-- fs/gfs2/Kconfig | 44
-rw-r--r-- fs/gfs2/Makefile | 10
-rw-r--r-- fs/gfs2/acl.c | 309
-rw-r--r-- fs/gfs2/acl.h | 39
-rw-r--r-- fs/gfs2/bmap.c | 1221
-rw-r--r-- fs/gfs2/bmap.h | 31
-rw-r--r-- fs/gfs2/daemon.c | 196
-rw-r--r-- fs/gfs2/daemon.h | 19
-rw-r--r-- fs/gfs2/dir.c | 1961
-rw-r--r-- fs/gfs2/dir.h | 79
-rw-r--r-- fs/gfs2/eaops.c | 230
-rw-r--r-- fs/gfs2/eaops.h | 30
-rw-r--r-- fs/gfs2/eattr.c | 1501
-rw-r--r-- fs/gfs2/eattr.h | 100
-rw-r--r-- fs/gfs2/gfs2.h | 31
-rw-r--r-- fs/gfs2/glock.c | 2231
-rw-r--r-- fs/gfs2/glock.h | 153
-rw-r--r-- fs/gfs2/glops.c | 615
-rw-r--r-- fs/gfs2/glops.h | 25
-rw-r--r-- fs/gfs2/incore.h | 634
-rw-r--r-- fs/gfs2/inode.c | 1379
-rw-r--r-- fs/gfs2/inode.h | 56
-rw-r--r-- fs/gfs2/lm.c | 217
-rw-r--r-- fs/gfs2/lm.h | 42
-rw-r--r-- fs/gfs2/locking.c | 184
-rw-r--r-- fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r-- fs/gfs2/locking/dlm/lock.c | 524
-rw-r--r-- fs/gfs2/locking/dlm/lock_dlm.h | 187
-rw-r--r-- fs/gfs2/locking/dlm/main.c | 64
-rw-r--r-- fs/gfs2/locking/dlm/mount.c | 255
-rw-r--r-- fs/gfs2/locking/dlm/plock.c | 301
-rw-r--r-- fs/gfs2/locking/dlm/sysfs.c | 226
-rw-r--r-- fs/gfs2/locking/dlm/thread.c | 359
-rw-r--r-- fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r-- fs/gfs2/locking/nolock/main.c | 246
-rw-r--r-- fs/gfs2/log.c | 687
-rw-r--r-- fs/gfs2/log.h | 65
-rw-r--r-- fs/gfs2/lops.c | 809
-rw-r--r-- fs/gfs2/lops.h | 99
-rw-r--r-- fs/gfs2/main.c | 150
-rw-r--r-- fs/gfs2/meta_io.c | 590
-rw-r--r-- fs/gfs2/meta_io.h | 78
-rw-r--r-- fs/gfs2/mount.c | 214
-rw-r--r-- fs/gfs2/mount.h | 17
-rw-r--r-- fs/gfs2/ondisk.c | 308
-rw-r--r-- fs/gfs2/ops_address.c | 790
-rw-r--r-- fs/gfs2/ops_address.h | 22
-rw-r--r-- fs/gfs2/ops_dentry.c | 119
-rw-r--r-- fs/gfs2/ops_dentry.h | 17
-rw-r--r-- fs/gfs2/ops_export.c | 298
-rw-r--r-- fs/gfs2/ops_export.h | 22
-rw-r--r-- fs/gfs2/ops_file.c | 661
-rw-r--r-- fs/gfs2/ops_file.h | 24
-rw-r--r-- fs/gfs2/ops_fstype.c | 928
-rw-r--r-- fs/gfs2/ops_fstype.h | 18
-rw-r--r-- fs/gfs2/ops_inode.c | 1151
-rw-r--r-- fs/gfs2/ops_inode.h | 20
-rw-r--r-- fs/gfs2/ops_super.c | 468
-rw-r--r-- fs/gfs2/ops_super.h | 17
-rw-r--r-- fs/gfs2/ops_vm.c | 184
-rw-r--r-- fs/gfs2/ops_vm.h | 18
-rw-r--r-- fs/gfs2/quota.c | 1227
-rw-r--r-- fs/gfs2/quota.h | 35
-rw-r--r-- fs/gfs2/recovery.c | 570
-rw-r--r-- fs/gfs2/recovery.h | 34
-rw-r--r-- fs/gfs2/rgrp.c | 1513
-rw-r--r-- fs/gfs2/rgrp.h | 69
-rw-r--r-- fs/gfs2/super.c | 976
-rw-r--r-- fs/gfs2/super.h | 55
-rw-r--r-- fs/gfs2/sys.c | 583
-rw-r--r-- fs/gfs2/sys.h | 27
-rw-r--r-- fs/gfs2/trans.c | 184
-rw-r--r-- fs/gfs2/trans.h | 39
-rw-r--r-- fs/gfs2/util.c | 245
-rw-r--r-- fs/gfs2/util.h | 170
-rw-r--r-- include/linux/Kbuild | 4
-rw-r--r-- include/linux/dlm.h | 302
-rw-r--r-- include/linux/dlm_device.h | 86
-rw-r--r-- include/linux/fs.h | 2
-rw-r--r-- include/linux/gfs2_ondisk.h | 443
-rw-r--r-- include/linux/lm_interface.h | 273
-rw-r--r-- include/linux/lock_dlm_plock.h | 41
-rw-r--r-- mm/filemap.c | 6
-rw-r--r-- mm/readahead.c | 1
126 files changed, 40197 insertions, 6 deletions
diff --git a/CREDITS b/CREDITS
index 5d75254bcb81..5329ead9c672 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3578,11 +3578,11 @@ S: Fargo, North Dakota 58122
 S: USA
 
 N: Steven Whitehouse
-E: SteveW@ACM.org
+E: steve@chygwyn.com
 W: http://www.chygwyn.com/~steve
-D: Linux DECnet project: http://www.sucs.swan.ac.uk/~rohan/DECnet/index.html
+D: Linux DECnet project
 D: Minor debugging of other networking protocols.
-D: Misc bug fixes and filesystem development
+D: Misc bug fixes and GFS2 filesystem development
 
 N: Hans-Joachim Widmaier
 E: hjw@zvw.de
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 000000000000..593004b6bbab
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,43 @@
+Global File System
+------------------
+
+http://sources.redhat.com/cluster/
+
+GFS is a cluster file system.  It allows a cluster of computers to
+simultaneously use a block device that is shared between them (with FC,
+iSCSI, NBD, etc).  GFS reads and writes to the block device like a local
+file system, but also uses a lock module to allow the computers to coordinate
+their I/O so file system consistency is maintained.  One of the nifty
+features of GFS is perfect consistency -- changes made to the file system
+on one machine show up immediately on all other machines in the cluster.
+
+GFS uses interchangeable inter-node locking mechanisms.  Different lock
+modules can plug into GFS and each file system selects the appropriate
+lock module at mount time.  Lock modules include:
+
+  lock_nolock -- allows gfs to be used as a local file system
+
+  lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
+  The dlm is found at linux/fs/dlm/
+
+In addition to interfacing with an external locking manager, a gfs lock
+module is responsible for interacting with external cluster management
+systems.  Lock_dlm depends on user space cluster management systems found
+at the URL above.
+
+To use gfs as a local file system, no external clustering systems are
+needed, simply:
+
+  $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
+  $ mount -t gfs2 /dev/block_device /dir
+
+GFS2 is not on-disk compatible with previous versions of GFS.
+
+The following man pages can be found at the URL above:
+  gfs2_fsck	to repair a filesystem
+  gfs2_grow	to expand a filesystem online
+  gfs2_jadd	to add journals to a filesystem online
+  gfs2_tool	to manipulate, examine and tune a filesystem
+  gfs2_quota	to examine and change quota values in a filesystem
+  mount.gfs2	to help mount(8) mount a filesystem
+  mkfs.gfs2	to make a filesystem
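
[Editor's note: the document above shows the lock_nolock mkfs/mount commands from a shell. As a minimal sketch of the same local-filesystem case driven from C via mount(2) -- where "/dev/block_device" and "/dir" are the placeholders used in gfs2.txt and "lockproto=lock_nolock" is an assumed mount option mirroring the lock module selection described above:

	/* Sketch only, not part of the patch. */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		if (mount("/dev/block_device", "/dir", "gfs2", 0,
			  "lockproto=lock_nolock") != 0) {
			perror("mount gfs2");
			return 1;
		}
		return 0;
	}
]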
diff --git a/MAINTAINERS b/MAINTAINERS
index 8c35b3c503aa..17becb9b1a96 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -898,6 +898,16 @@ M: jack@suse.cz
 L: linux-kernel@vger.kernel.org
 S: Maintained
 
+DISTRIBUTED LOCK MANAGER
+P: Patrick Caulfield
+M: pcaulfie@redhat.com
+P: David Teigland
+M: teigland@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
 DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
 P: Tobias Ringstrom
 M: tori@unhappy.mine.nu
@@ -1173,6 +1183,14 @@ M: khc@pm.waw.pl
 W: http://www.kernel.org/pub/linux/utils/net/hdlc/
 S: Maintained
 
+GFS2 FILE SYSTEM
+P: Steven Whitehouse
+M: swhiteho@redhat.com
+L: cluster-devel@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
 GIGASET ISDN DRIVERS
 P: Hansjoerg Lipp
 M: hjlipp@web.de
diff --git a/fs/Kconfig b/fs/Kconfig
index 674cfbb83a95..599de54451af 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -325,6 +325,7 @@ config FS_POSIX_ACL
 	default n
 
 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
 
 config OCFS2_FS
 	tristate "OCFS2 file system support"
@@ -1995,6 +1996,7 @@ endmenu
 endif
 
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 
 endmenu
 
diff --git a/fs/Makefile b/fs/Makefile
index fd24d67a7cdb..df614eacee86 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
@@ -110,3 +111,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..490f85b3fa59
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,21 @@
+menu "Distributed Lock Manager"
+	depends on INET && EXPERIMENTAL
+
+config DLM
+	tristate "Distributed Lock Manager (DLM)"
+	depends on IPV6 || IPV6=n
+	depends on IP_SCTP
+	select CONFIGFS_FS
+	help
+	  A general purpose distributed lock manager for kernel or userspace
+	  applications.
+
+config DLM_DEBUG
+	bool "DLM debugging"
+	depends on DLM
+	help
+	  Under the debugfs mount point, the name of each lockspace will
+	  appear as a file in the "dlm" directory.  The output is the
+	  list of resources and locks the local node knows about.
+
+endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_DLM) +=		dlm.o
+dlm-y :=			ast.o \
+				config.o \
+				dir.o \
+				lock.o \
+				lockspace.o \
+				lowcomms.o \
+				main.o \
+				member.o \
+				memory.o \
+				midcomms.o \
+				rcom.o \
+				recover.o \
+				recoverd.o \
+				requestqueue.o \
+				user.o \
+				util.o
+dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
+
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lock.h"
+#include "user.h"
+
+#define WAKE_ASTS  0
+
+static struct list_head ast_queue;
+static spinlock_t ast_queue_lock;
+static struct task_struct *astd_task;
+static unsigned long astd_wakeflags;
+static struct mutex astd_running;
+
+
+void dlm_del_ast(struct dlm_lkb *lkb)
+{
+	spin_lock(&ast_queue_lock);
+	if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
+		list_del(&lkb->lkb_astqueue);
+	spin_unlock(&ast_queue_lock);
+}
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type)
+{
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		dlm_user_add_ast(lkb, type);
+		return;
+	}
+	DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
+
+	spin_lock(&ast_queue_lock);
+	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+		kref_get(&lkb->lkb_ref);
+		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+	}
+	lkb->lkb_ast_type |= type;
+	spin_unlock(&ast_queue_lock);
+
+	set_bit(WAKE_ASTS, &astd_wakeflags);
+	wake_up_process(astd_task);
+}
+
+static void process_asts(void)
+{
+	struct dlm_ls *ls = NULL;
+	struct dlm_rsb *r = NULL;
+	struct dlm_lkb *lkb;
+	void (*cast) (long param);
+	void (*bast) (long param, int mode);
+	int type = 0, found, bmode;
+
+	for (;;) {
+		found = 0;
+		spin_lock(&ast_queue_lock);
+		list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+			r = lkb->lkb_resource;
+			ls = r->res_ls;
+
+			if (dlm_locking_stopped(ls))
+				continue;
+
+			list_del(&lkb->lkb_astqueue);
+			type = lkb->lkb_ast_type;
+			lkb->lkb_ast_type = 0;
+			found = 1;
+			break;
+		}
+		spin_unlock(&ast_queue_lock);
+
+		if (!found)
+			break;
+
+		cast = lkb->lkb_astaddr;
+		bast = lkb->lkb_bastaddr;
+		bmode = lkb->lkb_bastmode;
+
+		if ((type & AST_COMP) && cast)
+			cast(lkb->lkb_astparam);
+
+		/* FIXME: Is it safe to look at lkb_grmode here
+		   without doing a lock_rsb() ?
+		   Look at other checks in v1 to avoid basts. */
+
+		if ((type & AST_BAST) && bast)
+			if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
+				bast(lkb->lkb_astparam, bmode);
+
+		/* this removes the reference added by dlm_add_ast
+		   and may result in the lkb being freed */
+		dlm_put_lkb(lkb);
+
+		schedule();
+	}
+}
+
+static inline int no_asts(void)
+{
+	int ret;
+
+	spin_lock(&ast_queue_lock);
+	ret = list_empty(&ast_queue);
+	spin_unlock(&ast_queue_lock);
+	return ret;
+}
+
+static int dlm_astd(void *data)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!test_bit(WAKE_ASTS, &astd_wakeflags))
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		mutex_lock(&astd_running);
+		if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+			process_asts();
+		mutex_unlock(&astd_running);
+	}
+	return 0;
+}
+
+void dlm_astd_wake(void)
+{
+	if (!no_asts()) {
+		set_bit(WAKE_ASTS, &astd_wakeflags);
+		wake_up_process(astd_task);
+	}
+}
+
+int dlm_astd_start(void)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	INIT_LIST_HEAD(&ast_queue);
+	spin_lock_init(&ast_queue_lock);
+	mutex_init(&astd_running);
+
+	p = kthread_run(dlm_astd, NULL, "dlm_astd");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+		astd_task = p;
+	return error;
+}
+
+void dlm_astd_stop(void)
+{
+	kthread_stop(astd_task);
+}
+
+void dlm_astd_suspend(void)
+{
+	mutex_lock(&astd_running);
+}
+
+void dlm_astd_resume(void)
+{
+	mutex_unlock(&astd_running);
+}
+
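
[Editor's note: process_asts() above dispatches two caller-supplied callbacks per lkb: a completion AST ("cast") and a blocking AST ("bast"). A hedged sketch of what a matching callback pair looks like, using the exact signatures dispatched above; the names and the request-context scheme are illustrative, not from the patch:

	/* Completion AST: the lock request identified by param finished;
	 * param is whatever the caller stored in lkb_astparam. */
	static void example_cast(long param)
	{
		struct my_request *req = (struct my_request *)param; /* hypothetical context type */
		complete(&req->done);
	}

	/* Blocking AST: another node wants the lock in `mode`; the owner
	 * should release or demote its lock so the cluster can progress. */
	static void example_bast(long param, int mode)
	{
		struct my_request *req = (struct my_request *)param;
		queue_demote(req, mode); /* hypothetical helper */
	}
]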
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_del_ast(struct dlm_lkb *lkb);
+
+void dlm_astd_wake(void);
+int dlm_astd_start(void);
+void dlm_astd_stop(void);
+void dlm_astd_suspend(void);
+void dlm_astd_resume(void);
+
+#endif
+
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <net/sock.h>
+
+#include "config.h"
+#include "lowcomms.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct comm *local_comm;
+
+struct clusters;
+struct cluster;
+struct spaces;
+struct space;
+struct comms;
+struct comm;
+struct nodes;
+struct node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct comm *cm, char *buf);
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct node *nd, char *buf);
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_weight_read(struct node *nd, char *buf);
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+
+enum {
+	COMM_ATTR_NODEID = 0,
+	COMM_ATTR_LOCAL,
+	COMM_ATTR_ADDR,
+};
+
+struct comm_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct comm *, char *);
+	ssize_t (*store)(struct comm *, const char *, size_t);
+};
+
+static struct comm_attribute comm_attr_nodeid = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		    .ca_name = "nodeid",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = comm_nodeid_read,
+	.store  = comm_nodeid_write,
+};
+
+static struct comm_attribute comm_attr_local = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		    .ca_name = "local",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = comm_local_read,
+	.store  = comm_local_write,
+};
+
+static struct comm_attribute comm_attr_addr = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		    .ca_name = "addr",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.store  = comm_addr_write,
+};
+
+static struct configfs_attribute *comm_attrs[] = {
+	[COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+	[COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+	[COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+	NULL,
+};
+
+enum {
+	NODE_ATTR_NODEID = 0,
+	NODE_ATTR_WEIGHT,
+};
+
+struct node_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct node *, char *);
+	ssize_t (*store)(struct node *, const char *, size_t);
+};
+
+static struct node_attribute node_attr_nodeid = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		    .ca_name = "nodeid",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = node_nodeid_read,
+	.store  = node_nodeid_write,
+};
+
+static struct node_attribute node_attr_weight = {
+	.attr   = { .ca_owner = THIS_MODULE,
+		    .ca_name = "weight",
+		    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = node_weight_read,
+	.store  = node_weight_write,
+};
+
+static struct configfs_attribute *node_attrs[] = {
+	[NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+	[NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+	NULL,
+};
+
+struct clusters {
+	struct configfs_subsystem subsys;
+};
+
+struct cluster {
+	struct config_group group;
+};
+
+struct spaces {
+	struct config_group ss_group;
+};
+
+struct space {
+	struct config_group group;
+	struct list_head members;
+	struct mutex members_lock;
+	int members_count;
+};
+
+struct comms {
+	struct config_group cs_group;
+};
+
+struct comm {
+	struct config_item item;
+	int nodeid;
+	int local;
+	int addr_count;
+	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+struct nodes {
+	struct config_group ns_group;
+};
+
+struct node {
+	struct config_item item;
+	struct list_head list; /* space->members */
+	int nodeid;
+	int weight;
+};
+
+static struct configfs_group_operations clusters_ops = {
+	.make_group = make_cluster,
+	.drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+	.release = release_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+	.make_group = make_space,
+	.drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+	.release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+	.make_item = make_comm,
+	.drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+	.release = release_comm,
+	.show_attribute = show_comm,
+	.store_attribute = store_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+	.make_item = make_node,
+	.drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+	.release = release_node,
+	.show_attribute = show_node,
+	.store_attribute = store_node,
+};
+
+static struct config_item_type clusters_type = {
+	.ct_group_ops = &clusters_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type cluster_type = {
+	.ct_item_ops = &cluster_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type spaces_type = {
+	.ct_group_ops = &spaces_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type space_type = {
+	.ct_item_ops = &space_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comms_type = {
+	.ct_group_ops = &comms_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comm_type = {
+	.ct_item_ops = &comm_ops,
+	.ct_attrs = comm_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type nodes_type = {
+	.ct_group_ops = &nodes_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type node_type = {
+	.ct_item_ops = &node_ops,
+	.ct_attrs = node_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct cluster *to_cluster(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct cluster, group) : NULL;
+}
+
+static struct space *to_space(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct space, group) : NULL;
+}
+
+static struct comm *to_comm(struct config_item *i)
+{
+	return i ? container_of(i, struct comm, item) : NULL;
+}
+
+static struct node *to_node(struct config_item *i)
+{
+	return i ? container_of(i, struct node, item) : NULL;
+}
+
+static struct config_group *make_cluster(struct config_group *g,
+					 const char *name)
+{
+	struct cluster *cl = NULL;
+	struct spaces *sps = NULL;
+	struct comms *cms = NULL;
+	void *gps = NULL;
+
+	cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+	cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+
+	if (!cl || !gps || !sps || !cms)
+		goto fail;
+
+	config_group_init_type_name(&cl->group, name, &cluster_type);
+	config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
+	config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
+
+	cl->group.default_groups = gps;
+	cl->group.default_groups[0] = &sps->ss_group;
+	cl->group.default_groups[1] = &cms->cs_group;
+	cl->group.default_groups[2] = NULL;
+
+	space_list = &sps->ss_group;
+	comm_list = &cms->cs_group;
+	return &cl->group;
+
+ fail:
+	kfree(cl);
+	kfree(gps);
+	kfree(sps);
+	kfree(cms);
+	return NULL;
+}
+
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+	struct cluster *cl = to_cluster(i);
+	struct config_item *tmp;
+	int j;
+
+	for (j = 0; cl->group.default_groups[j]; j++) {
+		tmp = &cl->group.default_groups[j]->cg_item;
+		cl->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	space_list = NULL;
+	comm_list = NULL;
+
+	config_item_put(i);
+}
+
+static void release_cluster(struct config_item *i)
+{
+	struct cluster *cl = to_cluster(i);
+	kfree(cl->group.default_groups);
+	kfree(cl);
+}
+
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+	struct space *sp = NULL;
+	struct nodes *nds = NULL;
+	void *gps = NULL;
+
+	sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+	nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+
+	if (!sp || !gps || !nds)
+		goto fail;
+
+	config_group_init_type_name(&sp->group, name, &space_type);
+	config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+
+	sp->group.default_groups = gps;
+	sp->group.default_groups[0] = &nds->ns_group;
+	sp->group.default_groups[1] = NULL;
+
+	INIT_LIST_HEAD(&sp->members);
+	mutex_init(&sp->members_lock);
+	sp->members_count = 0;
+	return &sp->group;
+
+ fail:
+	kfree(sp);
+	kfree(gps);
+	kfree(nds);
+	return NULL;
+}
+
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+	struct space *sp = to_space(i);
+	struct config_item *tmp;
+	int j;
+
+	/* assert list_empty(&sp->members) */
+
+	for (j = 0; sp->group.default_groups[j]; j++) {
+		tmp = &sp->group.default_groups[j]->cg_item;
+		sp->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	config_item_put(i);
+}
+
+static void release_space(struct config_item *i)
+{
+	struct space *sp = to_space(i);
+	kfree(sp->group.default_groups);
+	kfree(sp);
+}
+
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+	struct comm *cm;
+
+	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+	if (!cm)
+		return NULL;
+
+	config_item_init_type_name(&cm->item, name, &comm_type);
+	cm->nodeid = -1;
+	cm->local = 0;
+	cm->addr_count = 0;
+	return &cm->item;
+}
+
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+	struct comm *cm = to_comm(i);
+	if (local_comm == cm)
+		local_comm = NULL;
+	dlm_lowcomms_close(cm->nodeid);
+	while (cm->addr_count--)
+		kfree(cm->addr[cm->addr_count]);
+	config_item_put(i);
+}
+
+static void release_comm(struct config_item *i)
+{
+	struct comm *cm = to_comm(i);
+	kfree(cm);
+}
+
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+	struct space *sp = to_space(g->cg_item.ci_parent);
+	struct node *nd;
+
+	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+	if (!nd)
+		return NULL;
+
+	config_item_init_type_name(&nd->item, name, &node_type);
+	nd->nodeid = -1;
+	nd->weight = 1; /* default weight of 1 if none is set */
+
+	mutex_lock(&sp->members_lock);
+	list_add(&nd->list, &sp->members);
+	sp->members_count++;
+	mutex_unlock(&sp->members_lock);
+
+	return &nd->item;
+}
+
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+	struct space *sp = to_space(g->cg_item.ci_parent);
+	struct node *nd = to_node(i);
+
+	mutex_lock(&sp->members_lock);
+	list_del(&nd->list);
+	sp->members_count--;
+	mutex_unlock(&sp->members_lock);
+
+	config_item_put(i);
+}
+
+static void release_node(struct config_item *i)
+{
+	struct node *nd = to_node(i);
+	kfree(nd);
+}
+
+static struct clusters clusters_root = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "dlm",
+				.ci_type = &clusters_type,
+			},
+		},
+	},
+};
+
+int dlm_config_init(void)
+{
+	config_group_init(&clusters_root.subsys.su_group);
+	init_MUTEX(&clusters_root.subsys.su_sem);
+	return configfs_register_subsystem(&clusters_root.subsys);
+}
+
+void dlm_config_exit(void)
+{
+	configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct comm *cm = to_comm(i);
+	struct comm_attribute *cma =
+			container_of(a, struct comm_attribute, attr);
+	return cma->show ? cma->show(cm, buf) : 0;
+}
+
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct comm *cm = to_comm(i);
+	struct comm_attribute *cma =
+		container_of(a, struct comm_attribute, attr);
+	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+}
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->nodeid);
+}
+
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+{
+	cm->nodeid = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+static ssize_t comm_local_read(struct comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->local);
+}
+
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+{
+	cm->local = simple_strtol(buf, NULL, 0);
+	if (cm->local && !local_comm)
+		local_comm = cm;
+	return len;
+}
+
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+{
+	struct sockaddr_storage *addr;
+
+	if (len != sizeof(struct sockaddr_storage))
+		return -EINVAL;
+
+	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+		return -ENOSPC;
+
+	addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	memcpy(addr, buf, len);
+	cm->addr[cm->addr_count++] = addr;
+	return len;
+}
+
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct node *nd = to_node(i);
+	struct node_attribute *nda =
+			container_of(a, struct node_attribute, attr);
+	return nda->show ? nda->show(nd, buf) : 0;
+}
+
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct node *nd = to_node(i);
+	struct node_attribute *nda =
+		container_of(a, struct node_attribute, attr);
+	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
+}
+
+static ssize_t node_nodeid_read(struct node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->nodeid);
+}
+
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+{
+	nd->nodeid = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+static ssize_t node_weight_read(struct node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->weight);
+}
+
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+{
+	nd->weight = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+static struct space *get_space(char *name)
+{
+	if (!space_list)
+		return NULL;
+	return to_space(config_group_find_obj(space_list, name));
+}
+
+static void put_space(struct space *sp)
+{
+	config_item_put(&sp->group.cg_item);
+}
+
+static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+{
+	struct config_item *i;
+	struct comm *cm = NULL;
+	int found = 0;
+
+	if (!comm_list)
+		return NULL;
+
+	down(&clusters_root.subsys.su_sem);
+
+	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+		cm = to_comm(i);
+
+		if (nodeid) {
+			if (cm->nodeid != nodeid)
+				continue;
+			found = 1;
+			break;
+		} else {
+			if (!cm->addr_count ||
+			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+				continue;
+			found = 1;
+			break;
+		}
+	}
+	up(&clusters_root.subsys.su_sem);
+
+	if (found)
+		config_item_get(i);
+	else
+		cm = NULL;
+	return cm;
+}
+
+static void put_comm(struct comm *cm)
+{
+	config_item_put(&cm->item);
+}
+
+/* caller must free mem */
+int dlm_nodeid_list(char *lsname, int **ids_out)
+{
+	struct space *sp;
+	struct node *nd;
+	int i = 0, rv = 0;
+	int *ids;
+
+	sp = get_space(lsname);
+	if (!sp)
+		return -EEXIST;
+
+	mutex_lock(&sp->members_lock);
+	if (!sp->members_count) {
+		rv = 0;
+		goto out;
+	}
+
+	ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+	if (!ids) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	rv = sp->members_count;
+	list_for_each_entry(nd, &sp->members, list)
+		ids[i++] = nd->nodeid;
+
+	if (rv != i)
+		printk("bad nodeid count %d %d\n", rv, i);
+
+	*ids_out = ids;
+ out:
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+	return rv;
+}
+
+int dlm_node_weight(char *lsname, int nodeid)
+{
+	struct space *sp;
+	struct node *nd;
+	int w = -EEXIST;
+
+	sp = get_space(lsname);
+	if (!sp)
+		goto out;
+
+	mutex_lock(&sp->members_lock);
+	list_for_each_entry(nd, &sp->members, list) {
+		if (nd->nodeid != nodeid)
+			continue;
+		w = nd->weight;
+		break;
+	}
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+ out:
+	return w;
+}
+
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
+{
+	struct comm *cm = get_comm(nodeid, NULL);
+	if (!cm)
+		return -EEXIST;
+	if (!cm->addr_count)
+		return -ENOENT;
+	memcpy(addr, cm->addr[0], sizeof(*addr));
+	put_comm(cm);
+	return 0;
+}
+
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+	struct comm *cm = get_comm(0, addr);
+	if (!cm)
+		return -EEXIST;
+	*nodeid = cm->nodeid;
+	put_comm(cm);
+	return 0;
+}
+
+int dlm_our_nodeid(void)
+{
+	return local_comm ? local_comm->nodeid : 0;
+}
+
+/* num 0 is first addr, num 1 is second addr */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+	if (!local_comm)
+		return -1;
+	if (num + 1 > local_comm->addr_count)
+		return -1;
+	memcpy(addr, local_comm->addr[num], sizeof(*addr));
+	return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT       21064
+#define DEFAULT_BUFFER_SIZE     4096
+#define DEFAULT_RSBTBL_SIZE      256
+#define DEFAULT_LKBTBL_SIZE     1024
+#define DEFAULT_DIRTBL_SIZE      512
+#define DEFAULT_RECOVER_TIMER      5
+#define DEFAULT_TOSS_SECS         10
+#define DEFAULT_SCAN_SECS          5
+
+struct dlm_config_info dlm_config = {
+	.tcp_port = DEFAULT_TCP_PORT,
+	.buffer_size = DEFAULT_BUFFER_SIZE,
+	.rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+	.lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+	.dirtbl_size = DEFAULT_DIRTBL_SIZE,
+	.recover_timer = DEFAULT_RECOVER_TIMER,
+	.toss_secs = DEFAULT_TOSS_SECS,
+	.scan_secs = DEFAULT_SCAN_SECS
+};
+
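
[Editor's note: the comment block at the top of config.c describes the configfs tree this file exposes. As a hedged userspace sketch of populating that tree -- the /config mount point, cluster/space names, nodeid, and address are all illustrative, and error handling is omitted for brevity. Note that the "addr" attribute takes a raw struct sockaddr_storage and rejects any other length (see comm_addr_write above), so a plain `echo` cannot set it:

	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/stat.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>

	int main(void)
	{
		struct sockaddr_storage ss;
		struct sockaddr_in *sin = (struct sockaddr_in *)&ss;
		int fd;

		/* mkdir of the cluster dir auto-creates its default groups,
		 * "spaces" and "comms" (see make_cluster above); likewise a
		 * new space gets its "nodes" group (see make_space). */
		mkdir("/config/dlm/mycluster", 0755);
		mkdir("/config/dlm/mycluster/spaces/myfs", 0755);
		mkdir("/config/dlm/mycluster/spaces/myfs/nodes/1", 0755);
		mkdir("/config/dlm/mycluster/comms/1", 0755);

		fd = open("/config/dlm/mycluster/comms/1/nodeid", O_WRONLY);
		write(fd, "1", 1);
		close(fd);

		memset(&ss, 0, sizeof(ss));
		sin->sin_family = AF_INET;
		sin->sin_port = htons(21064);             /* DEFAULT_TCP_PORT above */
		sin->sin_addr.s_addr = htonl(0xc0a80101); /* 192.168.1.1, illustrative */

		fd = open("/config/dlm/mycluster/comms/1/addr", O_WRONLY);
		write(fd, &ss, sizeof(ss));               /* must be the full struct or -EINVAL */
		close(fd);
		return 0;
	}
]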
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+#define DLM_MAX_ADDR_COUNT 3
+
+struct dlm_config_info {
+	int tcp_port;
+	int buffer_size;
+	int rsbtbl_size;
+	int lkbtbl_size;
+	int dirtbl_size;
+	int recover_timer;
+	int toss_secs;
+	int scan_secs;
+};
+
+extern struct dlm_config_info dlm_config;
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+int dlm_node_weight(char *lsname, int nodeid);
+int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
+int dlm_our_nodeid(void);
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif				/* __CONFIG_DOT_H__ */
+
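
[Editor's note: a hedged kernel-side sketch of the query API declared above. dlm_nodeid_list() returns the member count (or a negative errno) and, per the "caller must free mem" note in config.c, hands back a kcalloc'd array; initializing ids to NULL keeps the kfree safe for the zero-member case, where the array is never allocated:

	static void print_members(char *lsname)	/* illustrative helper */
	{
		int *ids = NULL;
		int count, i;

		count = dlm_nodeid_list(lsname, &ids);
		if (count < 0)
			return;
		for (i = 0; i < count; i++)
			printk("nodeid %d weight %d\n", ids[i],
			       dlm_node_weight(lsname, ids[i]));
		kfree(ids);	/* kfree(NULL) is a no-op when count == 0 */
	}
]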
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+
+#include "dlm_internal.h"
+
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN];
+static struct mutex debug_buf_lock;
+
+static struct dentry *dlm_root;
+
+struct rsb_iter {
+	int entry;
+	struct dlm_ls *ls;
+	struct list_head *next;
+	struct dlm_rsb *rsb;
+};
+
+/*
+ * dump all rsb's in the lockspace hash table
+ */
+
+static char *print_lockmode(int mode)
+{
+	switch (mode) {
+	case DLM_LOCK_IV:
+		return "--";
+	case DLM_LOCK_NL:
+		return "NL";
+	case DLM_LOCK_CR:
+		return "CR";
+	case DLM_LOCK_CW:
+		return "CW";
+	case DLM_LOCK_PR:
+		return "PR";
+	case DLM_LOCK_PW:
+		return "PW";
+	case DLM_LOCK_EX:
+		return "EX";
+	default:
+		return "??";
+	}
+}
+
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+		       struct dlm_rsb *res)
+{
+	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+	if (lkb->lkb_status == DLM_LKSTS_CONVERT
+	    || lkb->lkb_status == DLM_LKSTS_WAITING)
+		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+	if (lkb->lkb_nodeid) {
+		if (lkb->lkb_nodeid != res->res_nodeid)
+			seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+				   lkb->lkb_remid);
+		else
+			seq_printf(s, " Master:     %08x", lkb->lkb_remid);
+	}
+
+	if (lkb->lkb_wait_type)
+		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+	seq_printf(s, "\n");
+}
+
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+
+	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+	for (i = 0; i < res->res_length; i++) {
+		if (isprint(res->res_name[i]))
+			seq_printf(s, "%c", res->res_name[i]);
+		else
+			seq_printf(s, "%c", '.');
+	}
+	if (res->res_nodeid > 0)
+		seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
+			   res->res_nodeid);
+	else if (res->res_nodeid == 0)
+		seq_printf(s, "\" \nMaster Copy\n");
+	else if (res->res_nodeid == -1)
+		seq_printf(s, "\" \nLooking up master (lkid %x)\n",
+			   res->res_first_lkid);
+	else
+		seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
+
+	/* Print the LVB: */
+	if (res->res_lvbptr) {
+		seq_printf(s, "LVB: ");
+		for (i = 0; i < lvblen; i++) {
+			if (i == lvblen / 2)
+				seq_printf(s, "\n     ");
+			seq_printf(s, "%02x ",
+				   (unsigned char) res->res_lvbptr[i]);
+		}
+		if (rsb_flag(res, RSB_VALNOTVALID))
+			seq_printf(s, " (INVALID)");
+		seq_printf(s, "\n");
+	}
+
+	root_list = !list_empty(&res->res_root_list);
+	recover_list = !list_empty(&res->res_recover_list);
+
+	if (root_list || recover_list) {
+		seq_printf(s, "Recovery: root %d recover %d flags %lx "
+			   "count %d\n", root_list, recover_list,
+			   res->res_flags, res->res_recover_locks_count);
+	}
+
+	/* Print the locks attached to this resource */
+	seq_printf(s, "Granted Queue\n");
+	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Conversion Queue\n");
+	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Waiting Queue\n");
+	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	if (list_empty(&res->res_lookup))
+		goto out;
+
+	seq_printf(s, "Lookup Queue\n");
+	list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+		seq_printf(s, "%08x %s", lkb->lkb_id,
+			   print_lockmode(lkb->lkb_rqmode));
+		if (lkb->lkb_wait_type)
+			seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+		seq_printf(s, "\n");
+	}
+ out:
+	return 0;
+}
+
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+	struct dlm_ls *ls = ri->ls;
+	int i;
+
+	if (!ri->next) {
+ top:
+		/* Find the next non-empty hash bucket */
+		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+			read_lock(&ls->ls_rsbtbl[i].lock);
+			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+				ri->next = ls->ls_rsbtbl[i].list.next;
+				read_unlock(&ls->ls_rsbtbl[i].lock);
+				break;
+			}
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+		}
+		ri->entry = i;
+
+		if (ri->entry >= ls->ls_rsbtbl_size)
+			return 1;
+	} else {
+		i = ri->entry;
+		read_lock(&ls->ls_rsbtbl[i].lock);
+		ri->next = ri->next->next;
+		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+			/* End of list - move to next bucket */
+			ri->next = NULL;
+			ri->entry++;
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+			goto top;
+		}
+		read_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+	ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+
+	return 0;
+}
+
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+	kfree(ri);
+}
+
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+	struct rsb_iter *ri;
+
+	ri = kmalloc(sizeof *ri, GFP_KERNEL);
+	if (!ri)
+		return NULL;
+
+	ri->ls = ls;
+	ri->entry = 0;
+	ri->next = NULL;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct rsb_iter *ri;
+	loff_t n = *pos;
+
+	ri = rsb_iter_init(file->private);
+	if (!ri)
+		return NULL;
+
+	while (n--) {
+		if (rsb_iter_next(ri)) {
+			rsb_iter_free(ri);
+			return NULL;
+		}
+	}
+
+	return ri;
+}
+
+static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	(*pos)++;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	print_resource(ri->rsb, file);
+
+	return 0;
+}
+
+static struct seq_operations rsb_seq_ops = {
+	.start = rsb_seq_start,
+	.next  = rsb_seq_next,
+	.stop  = rsb_seq_stop,
+	.show  = rsb_seq_show,
+};
+
+static int rsb_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &rsb_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static struct file_operations rsb_fops = {
+	.owner   = THIS_MODULE,
+	.open    = rsb_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * dump lkb's on the ls_waiters list
+ */
+
+static int waiters_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_ls *ls = file->private_data;
+	struct dlm_lkb *lkb;
+	size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
+
+	mutex_lock(&debug_buf_lock);
+	mutex_lock(&ls->ls_waiters_mutex);
+	memset(debug_buf, 0, sizeof(debug_buf));
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
+			       lkb->lkb_id, lkb->lkb_wait_type,
+			       lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+		if (ret >= len - pos)
+			break;
+		pos += ret;
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
+	mutex_unlock(&debug_buf_lock);
+	return rv;
+}
+
+static struct file_operations waiters_fops = {
+	.owner   = THIS_MODULE,
+	.open    = waiters_open,
+	.read    = waiters_read
+};
+
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+	char name[DLM_LOCKSPACE_LEN+8];
+
+	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &rsb_fops);
+	if (!ls->ls_debug_rsb_dentry)
+		return -ENOMEM;
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  S_IFREG | S_IRUGO,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+	if (!ls->ls_debug_waiters_dentry) {
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	if (ls->ls_debug_rsb_dentry)
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+	if (ls->ls_debug_waiters_dentry)
+		debugfs_remove(ls->ls_debug_waiters_dentry);
+}
+
+int dlm_register_debugfs(void)
+{
+	mutex_init(&debug_buf_lock);
+	dlm_root = debugfs_create_dir("dlm", NULL);
+	return dlm_root ? 0 : -ENOMEM;
+}
+
+void dlm_unregister_debugfs(void)
+{
+	debugfs_remove(dlm_root);
+}
+
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+
+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+	spin_lock(&ls->ls_recover_list_lock);
+	list_add(&de->list, &ls->ls_recover_list);
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
+{
+	int found = 0;
+	struct dlm_direntry *de;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry(de, &ls->ls_recover_list, list) {
+		if (de->length == len) {
+			list_del(&de->list);
+			de->master_nodeid = 0;
+			memset(de->name, 0, len);
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	if (!found)
+		de = allocate_direntry(ls, len);
+	return de;
+}
+
+void dlm_clear_free_entries(struct dlm_ls *ls)
+{
+	struct dlm_direntry *de;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	while (!list_empty(&ls->ls_recover_list)) {
+		de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
+				list);
+		list_del(&de->list);
+		free_direntry(de);
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
+ * num_nodes to the hash value.  This value in the desired range is used as an
+ * offset into the sorted list of nodeid's to give the particular nodeid.
+ */
+
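/* [Editor's worked example, not part of the patch: with 4 nodes and
   hash 0x12345678, (hash >> 16) = 0x1234 = 4660 and 4660 % 4 = 0, so the
   first entry in the sorted nodeid array is the directory node.  When
   ls_node_array exists, the modulus below is ls_total_weight instead, so
   nodes with higher weight own proportionally more of the directory.] */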
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
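/* The received buffer parsed above is a length-prefixed stream: each
 * entry is a big-endian uint16_t namelen followed by namelen name bytes;
 * namelen 0 ends one buffer (another request follows) and 0xFFFF ends
 * the stream for a node.  A minimal encoding sketch (pack_name() is a
 * hypothetical helper, not part of this file):
 *
 *	static int pack_name(char *buf, int offset, int buflen,
 *			     const char *name, uint16_t namelen)
 *	{
 *		uint16_t be_namelen = cpu_to_be16(namelen);
 *
 *		if (offset + sizeof(uint16_t) + namelen > buflen)
 *			return -ENOSPC;
 *		memcpy(buf + offset, &be_namelen, sizeof(uint16_t));
 *		offset += sizeof(uint16_t);
 *		memcpy(buf + offset, name, namelen);
 *		return offset + namelen;
 *	}
 *
 * dlm_copy_master_names() below implements the real producer side,
 * including the end-of-block and end-of-stream records. */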
284
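/* get_entry() below follows the usual lock/alloc/recheck pattern: the
 * bucket lock is dropped around the allocation (which may sleep), so
 * the bucket is searched again after relocking in case another caller
 * added the same name in the window. */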
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
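/* The inbuf/inlen pair is presumably the last name from the previous
   DLM_RCOM_NAMES reply (dlm_recover_directory() above passes back
   last_name/last_len), so the lookup below acts as a resume cursor:
   each reply continues walking ls_root_list where the last block ended. */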
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/miscdevice.h>
39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h>
42
43#include <linux/dlm.h>
44
45#define DLM_LOCKSPACE_LEN 64
46
47/* Size of the temp buffer midcomms allocates on the stack.
48 We try to make this large enough so most messages fit.
49 FIXME: should sctp make this unnecessary? */
50
51#define DLM_INBUF_LEN 148
52
53struct dlm_ls;
54struct dlm_lkb;
55struct dlm_rsb;
56struct dlm_member;
57struct dlm_lkbtable;
58struct dlm_rsbtable;
59struct dlm_dirtable;
60struct dlm_direntry;
61struct dlm_recover;
62struct dlm_header;
63struct dlm_message;
64struct dlm_rcom;
65struct dlm_mhandle;
66
67#define log_print(fmt, args...) \
68 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
69#define log_error(ls, fmt, args...) \
70 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
71
72#define DLM_LOG_DEBUG
73#ifdef DLM_LOG_DEBUG
74#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
75#else
76#define log_debug(ls, fmt, args...)
77#endif
78
79#define DLM_ASSERT(x, do) \
80{ \
81 if (!(x)) \
82 { \
83 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
84 "DLM: assertion: \"%s\"\n" \
85 "DLM: time = %lu\n", \
86 __LINE__, __FILE__, #x, jiffies); \
87 {do} \
88 printk("\n"); \
89 BUG(); \
90 panic("DLM: Record message above and reboot.\n"); \
91 } \
92}
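/* The second argument is a statement block run before BUG()/panic(),
 * typically used to dump state, e.g.
 *
 *	DLM_ASSERT(!rv, dlm_print_lkb(lkb););
 *
 * and it may be left empty: DLM_ASSERT(list_empty(&head), ); */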
93
94#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
95
96
97struct dlm_direntry {
98 struct list_head list;
99 uint32_t master_nodeid;
100 uint16_t length;
101 char name[1];
102};
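/* name[1] is the pre-C99 trailing-array idiom: a direntry is allocated
 * with room for the full resource name appended (the allocation helpers
 * take namelen), so de->name actually holds de->length bytes. */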
103
104struct dlm_dirtable {
105 struct list_head list;
106 rwlock_t lock;
107};
108
109struct dlm_rsbtable {
110 struct list_head list;
111 struct list_head toss;
112 rwlock_t lock;
113};
114
115struct dlm_lkbtable {
116 struct list_head list;
117 rwlock_t lock;
118 uint16_t counter;
119};
120
121/*
122 * Lockspace member (per node in a ls)
123 */
124
125struct dlm_member {
126 struct list_head list;
127 int nodeid;
128 int weight;
129};
130
131/*
132 * Save and manage recovery state for a lockspace.
133 */
134
135struct dlm_recover {
136 struct list_head list;
137 int *nodeids;
138 int node_count;
139 uint64_t seq;
140};
141
142/*
143 * Pass input args to second stage locking function.
144 */
145
146struct dlm_args {
147 uint32_t flags;
148 void *astaddr;
149 long astparam;
150 void *bastaddr;
151 int mode;
152 struct dlm_lksb *lksb;
153};
154
155
156/*
157 * Lock block
158 *
159 * A lock can be one of three types:
160 *
161 * local copy lock is mastered locally
162 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
163 * process copy lock is mastered on a remote node
164 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
165 * master copy master node's copy of a lock owned by remote node
166 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
167 *
168 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
169 * dlm_unlock. The dlm does not modify these or use any private flags in
170 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
171 * are sent as-is to the remote master when the lock is remote.
172 *
173 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
174 * Some internal flags are shared between the master and process nodes;
175 * these shared flags are kept in the lower two bytes. One of these
176 * flags set on the master copy will be propagated to the process copy
 177 * and vice versa.  Other internal flags are private to the master or process
178 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
179 *
180 * lkb_sbflags: status block flags. These flags are copied directly into
181 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
182 * ast. All defined in dlm.h with DLM_SBF_ prefix.
183 *
 184 * lkb_status: the lock status indicates which rsb queue the lock is
 185 * on: grant, convert, or wait (DLM_LKSTS_ WAITING/GRANTED/CONVERT).
186 *
187 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
188 * reply is needed. Only set when the lkb is on the lockspace waiters
189 * list awaiting a reply from a remote node.
190 *
191 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
 192 * is a master copy, nodeid specifies the remote lock holder; when the
193 * lkb is a process copy, the nodeid specifies the lock master.
194 */
195
196/* lkb_ast_type */
197
198#define AST_COMP 1
199#define AST_BAST 2
200
201/* lkb_status */
202
203#define DLM_LKSTS_WAITING 1
204#define DLM_LKSTS_GRANTED 2
205#define DLM_LKSTS_CONVERT 3
206
207/* lkb_flags */
208
209#define DLM_IFL_MSTCPY 0x00010000
210#define DLM_IFL_RESEND 0x00020000
211#define DLM_IFL_DEAD 0x00040000
212#define DLM_IFL_USER 0x00000001
213#define DLM_IFL_ORPHAN 0x00000002
214
215struct dlm_lkb {
216 struct dlm_rsb *lkb_resource; /* the rsb */
217 struct kref lkb_ref;
218 int lkb_nodeid; /* copied from rsb */
219 int lkb_ownpid; /* pid of lock owner */
220 uint32_t lkb_id; /* our lock ID */
221 uint32_t lkb_remid; /* lock ID on remote partner */
222 uint32_t lkb_exflags; /* external flags from caller */
223 uint32_t lkb_sbflags; /* lksb flags */
224 uint32_t lkb_flags; /* internal flags */
225 uint32_t lkb_lvbseq; /* lvb sequence number */
226
227 int8_t lkb_status; /* granted, waiting, convert */
228 int8_t lkb_rqmode; /* requested lock mode */
229 int8_t lkb_grmode; /* granted lock mode */
230 int8_t lkb_bastmode; /* requested mode */
231 int8_t lkb_highbast; /* highest mode bast sent for */
232
233 int8_t lkb_wait_type; /* type of reply waiting for */
234 int8_t lkb_ast_type; /* type of ast queued for */
235
236 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
237 struct list_head lkb_statequeue; /* rsb g/c/w list */
238 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
239 struct list_head lkb_wait_reply; /* waiting for remote reply */
240 struct list_head lkb_astqueue; /* need ast to be sent */
241 struct list_head lkb_ownqueue; /* list of locks for a process */
242
243 char *lkb_lvbptr;
244 struct dlm_lksb *lkb_lksb; /* caller's status block */
245 void *lkb_astaddr; /* caller's ast function */
246 void *lkb_bastaddr; /* caller's bast function */
247 long lkb_astparam; /* caller's ast arg */
248};
249
250
251struct dlm_rsb {
252 struct dlm_ls *res_ls; /* the lockspace */
253 struct kref res_ref;
254 struct mutex res_mutex;
255 unsigned long res_flags;
256 int res_length; /* length of rsb name */
257 int res_nodeid;
258 uint32_t res_lvbseq;
259 uint32_t res_hash;
260 uint32_t res_bucket; /* rsbtbl */
261 unsigned long res_toss_time;
262 uint32_t res_first_lkid;
263 struct list_head res_lookup; /* lkbs waiting on first */
264 struct list_head res_hashchain; /* rsbtbl */
265 struct list_head res_grantqueue;
266 struct list_head res_convertqueue;
267 struct list_head res_waitqueue;
268
269 struct list_head res_root_list; /* used for recovery */
270 struct list_head res_recover_list; /* used for recovery */
271 int res_recover_locks_count;
272
273 char *res_lvbptr;
274 char res_name[1];
275};
276
277/* find_rsb() flags */
278
279#define R_MASTER 1 /* only return rsb if it's a master */
280#define R_CREATE 2 /* create/add rsb if not found */
281
282/* rsb_flags */
283
284enum rsb_flags {
285 RSB_MASTER_UNCERTAIN,
286 RSB_VALNOTVALID,
287 RSB_VALNOTVALID_PREV,
288 RSB_NEW_MASTER,
289 RSB_NEW_MASTER2,
290 RSB_RECOVER_CONVERT,
291 RSB_LOCKS_PURGED,
292};
293
294static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
295{
296 __set_bit(flag, &r->res_flags);
297}
298
299static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
300{
301 __clear_bit(flag, &r->res_flags);
302}
303
304static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
305{
306 return test_bit(flag, &r->res_flags);
307}
308
309
310/* dlm_header is first element of all structs sent between nodes */
311
312#define DLM_HEADER_MAJOR 0x00020000
313#define DLM_HEADER_MINOR 0x00000001
314
315#define DLM_MSG 1
316#define DLM_RCOM 2
317
318struct dlm_header {
319 uint32_t h_version;
320 uint32_t h_lockspace;
321 uint32_t h_nodeid; /* nodeid of sender */
322 uint16_t h_length;
323 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
324 uint8_t h_pad;
325};
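/* h_version presumably holds DLM_HEADER_MAJOR | DLM_HEADER_MINOR so a
 * receiver can reject messages from a node speaking an incompatible
 * (different major) revision of the protocol. */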
326
327
328#define DLM_MSG_REQUEST 1
329#define DLM_MSG_CONVERT 2
330#define DLM_MSG_UNLOCK 3
331#define DLM_MSG_CANCEL 4
332#define DLM_MSG_REQUEST_REPLY 5
333#define DLM_MSG_CONVERT_REPLY 6
334#define DLM_MSG_UNLOCK_REPLY 7
335#define DLM_MSG_CANCEL_REPLY 8
336#define DLM_MSG_GRANT 9
337#define DLM_MSG_BAST 10
338#define DLM_MSG_LOOKUP 11
339#define DLM_MSG_REMOVE 12
340#define DLM_MSG_LOOKUP_REPLY 13
341
342struct dlm_message {
343 struct dlm_header m_header;
344 uint32_t m_type; /* DLM_MSG_ */
345 uint32_t m_nodeid;
346 uint32_t m_pid;
347 uint32_t m_lkid; /* lkid on sender */
348 uint32_t m_remid; /* lkid on receiver */
349 uint32_t m_parent_lkid;
350 uint32_t m_parent_remid;
351 uint32_t m_exflags;
352 uint32_t m_sbflags;
353 uint32_t m_flags;
354 uint32_t m_lvbseq;
355 uint32_t m_hash;
356 int m_status;
357 int m_grmode;
358 int m_rqmode;
359 int m_bastmode;
360 int m_asts;
361 int m_result; /* 0 or -EXXX */
362 char m_extra[0]; /* name or lvb */
363};
364
365
366#define DLM_RS_NODES 0x00000001
367#define DLM_RS_NODES_ALL 0x00000002
368#define DLM_RS_DIR 0x00000004
369#define DLM_RS_DIR_ALL 0x00000008
370#define DLM_RS_LOCKS 0x00000010
371#define DLM_RS_LOCKS_ALL 0x00000020
372#define DLM_RS_DONE 0x00000040
373#define DLM_RS_DONE_ALL 0x00000080
374
375#define DLM_RCOM_STATUS 1
376#define DLM_RCOM_NAMES 2
377#define DLM_RCOM_LOOKUP 3
378#define DLM_RCOM_LOCK 4
379#define DLM_RCOM_STATUS_REPLY 5
380#define DLM_RCOM_NAMES_REPLY 6
381#define DLM_RCOM_LOOKUP_REPLY 7
382#define DLM_RCOM_LOCK_REPLY 8
383
384struct dlm_rcom {
385 struct dlm_header rc_header;
386 uint32_t rc_type; /* DLM_RCOM_ */
387 int rc_result; /* multi-purpose */
388 uint64_t rc_id; /* match reply with request */
389 char rc_buf[0];
390};
391
392struct rcom_config {
393 uint32_t rf_lvblen;
394 uint32_t rf_lsflags;
395 uint64_t rf_unused;
396};
397
398struct rcom_lock {
399 uint32_t rl_ownpid;
400 uint32_t rl_lkid;
401 uint32_t rl_remid;
402 uint32_t rl_parent_lkid;
403 uint32_t rl_parent_remid;
404 uint32_t rl_exflags;
405 uint32_t rl_flags;
406 uint32_t rl_lvbseq;
407 int rl_result;
408 int8_t rl_rqmode;
409 int8_t rl_grmode;
410 int8_t rl_status;
411 int8_t rl_asts;
412 uint16_t rl_wait_type;
413 uint16_t rl_namelen;
414 char rl_name[DLM_RESNAME_MAXLEN];
415 char rl_lvb[0];
416};
417
418struct dlm_ls {
419 struct list_head ls_list; /* list of lockspaces */
420 dlm_lockspace_t *ls_local_handle;
421 uint32_t ls_global_id; /* global unique lockspace ID */
422 uint32_t ls_exflags;
423 int ls_lvblen;
424 int ls_count; /* reference count */
425 unsigned long ls_flags; /* LSFL_ */
426 struct kobject ls_kobj;
427
428 struct dlm_rsbtable *ls_rsbtbl;
429 uint32_t ls_rsbtbl_size;
430
431 struct dlm_lkbtable *ls_lkbtbl;
432 uint32_t ls_lkbtbl_size;
433
434 struct dlm_dirtable *ls_dirtbl;
435 uint32_t ls_dirtbl_size;
436
437 struct mutex ls_waiters_mutex;
438 struct list_head ls_waiters; /* lkbs needing a reply */
439
440 struct list_head ls_nodes; /* current nodes in ls */
441 struct list_head ls_nodes_gone; /* dead node list, recovery */
442 int ls_num_nodes; /* number of nodes in ls */
443 int ls_low_nodeid;
444 int ls_total_weight;
445 int *ls_node_array;
446
447 struct dlm_rsb ls_stub_rsb; /* for returning errors */
448 struct dlm_lkb ls_stub_lkb; /* for returning errors */
449 struct dlm_message ls_stub_ms; /* for faking a reply */
450
451 struct dentry *ls_debug_rsb_dentry; /* debugfs */
452 struct dentry *ls_debug_waiters_dentry; /* debugfs */
453
454 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
455 int ls_uevent_result;
456
457 struct miscdevice ls_device;
458
459 /* recovery related */
460
461 struct timer_list ls_timer;
462 struct task_struct *ls_recoverd_task;
463 struct mutex ls_recoverd_active;
464 spinlock_t ls_recover_lock;
465 uint32_t ls_recover_status; /* DLM_RS_ */
466 uint64_t ls_recover_seq;
467 struct dlm_recover *ls_recover_args;
468 struct rw_semaphore ls_in_recovery; /* block local requests */
469 struct list_head ls_requestqueue;/* queue remote requests */
470 struct mutex ls_requestqueue_mutex;
471 char *ls_recover_buf;
472 int ls_recover_nodeid; /* for debugging */
473 uint64_t ls_rcom_seq;
474 struct list_head ls_recover_list;
475 spinlock_t ls_recover_list_lock;
476 int ls_recover_list_count;
477 wait_queue_head_t ls_wait_general;
478 struct mutex ls_clear_proc_locks;
479
480 struct list_head ls_root_list; /* root resources */
481 struct rw_semaphore ls_root_sem; /* protect root_list */
482
483 int ls_namelen;
484 char ls_name[1];
485};
486
487#define LSFL_WORK 0
488#define LSFL_RUNNING 1
489#define LSFL_RECOVERY_STOP 2
490#define LSFL_RCOM_READY 3
491#define LSFL_UEVENT_WAIT 4
492
493/* much of this is just saving user space pointers associated with the
494 lock that we pass back to the user lib with an ast */
495
496struct dlm_user_args {
497 struct dlm_user_proc *proc; /* each process that opens the lockspace
498 device has private data
499 (dlm_user_proc) on the struct file,
500 the process's locks point back to it*/
501 struct dlm_lksb lksb;
502 int old_mode;
503 int update_user_lvb;
504 struct dlm_lksb __user *user_lksb;
505 void __user *castparam;
506 void __user *castaddr;
507 void __user *bastparam;
508 void __user *bastaddr;
509};
510
511#define DLM_PROC_FLAGS_CLOSING 1
512#define DLM_PROC_FLAGS_COMPAT 2
513
514/* locks list is kept so we can remove all a process's locks when it
515 exits (or orphan those that are persistent) */
516
517struct dlm_user_proc {
518 dlm_lockspace_t *lockspace;
519 unsigned long flags; /* DLM_PROC_FLAGS */
520 struct list_head asts;
521 spinlock_t asts_spin;
522 struct list_head locks;
523 spinlock_t locks_spin;
524 wait_queue_head_t wait;
525};
526
527static inline int dlm_locking_stopped(struct dlm_ls *ls)
528{
529 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
530}
531
532static inline int dlm_recovery_stopped(struct dlm_ls *ls)
533{
534 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
535}
536
537static inline int dlm_no_directory(struct dlm_ls *ls)
538{
539 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
540}
541
542#endif /* __DLM_INTERNAL_DOT_H__ */
543
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
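/* For example, an initial dlm_lock() request on a locally mastered rsb
 * runs dlm_lock() -> request_lock() -> _request_lock() -> do_request(),
 * while on a remotely mastered rsb stage 3 calls send_request() and
 * do_request() runs on the master via receive_request(). */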
58#include <linux/types.h>
59#include "dlm_internal.h"
60#include <linux/dlm_device.h>
61#include "memory.h"
62#include "lowcomms.h"
63#include "requestqueue.h"
64#include "util.h"
65#include "dir.h"
66#include "member.h"
67#include "lockspace.h"
68#include "ast.h"
69#include "lock.h"
70#include "rcom.h"
71#include "recover.h"
72#include "lvb_table.h"
73#include "user.h"
74#include "config.h"
75
76static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
80static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
82static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static int send_remove(struct dlm_rsb *r);
84static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
85static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
86 struct dlm_message *ms);
87static int receive_extralen(struct dlm_message *ms);
88
89/*
 90 * Lock compatibility matrix - thanks Steve
91 * UN = Unlocked state. Not really a state, used as a flag
92 * PD = Padding. Used to make the matrix a nice power of two in size
93 * Other states are the same as the VMS DLM.
94 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
95 */
96
97static const int __dlm_compat_matrix[8][8] = {
98 /* UN NL CR CW PR PW EX PD */
99 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
100 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
101 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
102 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
103 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
104 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
105 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
106 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
107};
108
109/*
110 * This defines the direction of transfer of LVB data.
111 * Granted mode is the row; requested mode is the column.
112 * Usage: matrix[grmode+1][rqmode+1]
113 * 1 = LVB is returned to the caller
114 * 0 = LVB is written to the resource
115 * -1 = nothing happens to the LVB
116 */
117
118const int dlm_lvb_operations[8][8] = {
119 /* UN NL CR CW PR PW EX PD*/
120 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
121 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
122 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
123 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
124 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
125 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
126 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
127 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
128};
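/* Examples: a new EX request (grmode is IV) gives
 * dlm_lvb_operations[DLM_LOCK_IV+1][DLM_LOCK_EX+1] == 1, so the rsb's
 * LVB is copied back to the caller; converting EX down to NL gives
 * dlm_lvb_operations[DLM_LOCK_EX+1][DLM_LOCK_NL+1] == 0, so the
 * caller's LVB is written to the rsb (see set_lvb_lock() below). */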
129
130#define modes_compat(gr, rq) \
131 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
132
133int dlm_modes_compat(int mode1, int mode2)
134{
135 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
136}
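/* Example: with a PR lock granted, a new CR request is compatible but
 * an EX request is not:
 *
 *	dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_CR) == 1
 *	dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_EX) == 0
 */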
137
138/*
139 * Compatibility matrix for conversions with QUECVT set.
140 * Granted mode is the row; requested mode is the column.
141 * Usage: matrix[grmode+1][rqmode+1]
142 */
143
144static const int __quecvt_compat_matrix[8][8] = {
145 /* UN NL CR CW PR PW EX PD */
146 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
147 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
148 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
149 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
150 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
151 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
152 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
153 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
154};
155
156void dlm_print_lkb(struct dlm_lkb *lkb)
157{
158 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
159 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
160 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
161 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
162 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
163}
164
165void dlm_print_rsb(struct dlm_rsb *r)
166{
167 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
168 r->res_nodeid, r->res_flags, r->res_first_lkid,
169 r->res_recover_locks_count, r->res_name);
170}
171
172void dlm_dump_rsb(struct dlm_rsb *r)
173{
174 struct dlm_lkb *lkb;
175
176 dlm_print_rsb(r);
177
178 printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
179 list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
180 printk(KERN_ERR "rsb lookup list\n");
181 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
182 dlm_print_lkb(lkb);
183 printk(KERN_ERR "rsb grant queue:\n");
184 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
185 dlm_print_lkb(lkb);
186 printk(KERN_ERR "rsb convert queue:\n");
187 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
188 dlm_print_lkb(lkb);
189 printk(KERN_ERR "rsb wait queue:\n");
190 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
191 dlm_print_lkb(lkb);
192}
193
194/* Threads cannot use the lockspace while it's being recovered */
195
196static inline void lock_recovery(struct dlm_ls *ls)
197{
198 down_read(&ls->ls_in_recovery);
199}
200
201static inline void unlock_recovery(struct dlm_ls *ls)
202{
203 up_read(&ls->ls_in_recovery);
204}
205
206static inline int lock_recovery_try(struct dlm_ls *ls)
207{
208 return down_read_trylock(&ls->ls_in_recovery);
209}
210
211static inline int can_be_queued(struct dlm_lkb *lkb)
212{
213 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
214}
215
216static inline int force_blocking_asts(struct dlm_lkb *lkb)
217{
218 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
219}
220
221static inline int is_demoted(struct dlm_lkb *lkb)
222{
223 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
224}
225
226static inline int is_remote(struct dlm_rsb *r)
227{
228 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
229 return !!r->res_nodeid;
230}
231
232static inline int is_process_copy(struct dlm_lkb *lkb)
233{
234 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
235}
236
237static inline int is_master_copy(struct dlm_lkb *lkb)
238{
239 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
240 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
241 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
242}
243
244static inline int middle_conversion(struct dlm_lkb *lkb)
245{
246 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
247 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
248 return 1;
249 return 0;
250}
251
252static inline int down_conversion(struct dlm_lkb *lkb)
253{
254 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
255}
256
257static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
258{
259 if (is_master_copy(lkb))
260 return;
261
262 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
263
264 lkb->lkb_lksb->sb_status = rv;
265 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
266
267 dlm_add_ast(lkb, AST_COMP);
268}
269
270static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
271{
272 if (is_master_copy(lkb))
273 send_bast(r, lkb, rqmode);
274 else {
275 lkb->lkb_bastmode = rqmode;
276 dlm_add_ast(lkb, AST_BAST);
277 }
278}
279
280/*
281 * Basic operations on rsb's and lkb's
282 */
283
284static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
285{
286 struct dlm_rsb *r;
287
288 r = allocate_rsb(ls, len);
289 if (!r)
290 return NULL;
291
292 r->res_ls = ls;
293 r->res_length = len;
294 memcpy(r->res_name, name, len);
295 mutex_init(&r->res_mutex);
296
297 INIT_LIST_HEAD(&r->res_lookup);
298 INIT_LIST_HEAD(&r->res_grantqueue);
299 INIT_LIST_HEAD(&r->res_convertqueue);
300 INIT_LIST_HEAD(&r->res_waitqueue);
301 INIT_LIST_HEAD(&r->res_root_list);
302 INIT_LIST_HEAD(&r->res_recover_list);
303
304 return r;
305}
306
307static int search_rsb_list(struct list_head *head, char *name, int len,
308 unsigned int flags, struct dlm_rsb **r_ret)
309{
310 struct dlm_rsb *r;
311 int error = 0;
312
313 list_for_each_entry(r, head, res_hashchain) {
314 if (len == r->res_length && !memcmp(name, r->res_name, len))
315 goto found;
316 }
317 return -EBADR;
318
319 found:
320 if (r->res_nodeid && (flags & R_MASTER))
321 error = -ENOTBLK;
322 *r_ret = r;
323 return error;
324}
325
326static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
327 unsigned int flags, struct dlm_rsb **r_ret)
328{
329 struct dlm_rsb *r;
330 int error;
331
332 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
333 if (!error) {
334 kref_get(&r->res_ref);
335 goto out;
336 }
337 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
338 if (error)
339 goto out;
340
341 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
342
343 if (dlm_no_directory(ls))
344 goto out;
345
346 if (r->res_nodeid == -1) {
347 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
348 r->res_first_lkid = 0;
349 } else if (r->res_nodeid > 0) {
350 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
351 r->res_first_lkid = 0;
352 } else {
353 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
354 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
355 }
356 out:
357 *r_ret = r;
358 return error;
359}
360
361static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
362 unsigned int flags, struct dlm_rsb **r_ret)
363{
364 int error;
365 write_lock(&ls->ls_rsbtbl[b].lock);
366 error = _search_rsb(ls, name, len, b, flags, r_ret);
367 write_unlock(&ls->ls_rsbtbl[b].lock);
368 return error;
369}
370
371/*
372 * Find rsb in rsbtbl and potentially create/add one
373 *
374 * Delaying the release of rsb's has a similar benefit to applications keeping
375 * NL locks on an rsb, but without the guarantee that the cached master value
376 * will still be valid when the rsb is reused. Apps aren't always smart enough
377 * to keep NL locks on an rsb that they may lock again shortly; this can lead
378 * to excessive master lookups and removals if we don't delay the release.
379 *
380 * Searching for an rsb means looking through both the normal list and toss
381 * list. When found on the toss list the rsb is moved to the normal list with
382 * ref count of 1; when found on normal list the ref count is incremented.
383 */
384
385static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
386 unsigned int flags, struct dlm_rsb **r_ret)
387{
388 struct dlm_rsb *r, *tmp;
389 uint32_t hash, bucket;
390 int error = 0;
391
392 if (dlm_no_directory(ls))
393 flags |= R_CREATE;
394
395 hash = jhash(name, namelen, 0);
396 bucket = hash & (ls->ls_rsbtbl_size - 1);
397
398 error = search_rsb(ls, name, namelen, bucket, flags, &r);
399 if (!error)
400 goto out;
401
402 if (error == -EBADR && !(flags & R_CREATE))
403 goto out;
404
405 /* the rsb was found but wasn't a master copy */
406 if (error == -ENOTBLK)
407 goto out;
408
409 error = -ENOMEM;
410 r = create_rsb(ls, name, namelen);
411 if (!r)
412 goto out;
413
414 r->res_hash = hash;
415 r->res_bucket = bucket;
416 r->res_nodeid = -1;
417 kref_init(&r->res_ref);
418
419 /* With no directory, the master can be set immediately */
420 if (dlm_no_directory(ls)) {
421 int nodeid = dlm_dir_nodeid(r);
422 if (nodeid == dlm_our_nodeid())
423 nodeid = 0;
424 r->res_nodeid = nodeid;
425 }
426
427 write_lock(&ls->ls_rsbtbl[bucket].lock);
428 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
429 if (!error) {
430 write_unlock(&ls->ls_rsbtbl[bucket].lock);
431 free_rsb(r);
432 r = tmp;
433 goto out;
434 }
435 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
436 write_unlock(&ls->ls_rsbtbl[bucket].lock);
437 error = 0;
438 out:
439 *r_ret = r;
440 return error;
441}
442
443int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
444 unsigned int flags, struct dlm_rsb **r_ret)
445{
446 return find_rsb(ls, name, namelen, flags, r_ret);
447}
448
449/* This is only called to add a reference when the code already holds
450 a valid reference to the rsb, so there's no need for locking. */
451
452static inline void hold_rsb(struct dlm_rsb *r)
453{
454 kref_get(&r->res_ref);
455}
456
457void dlm_hold_rsb(struct dlm_rsb *r)
458{
459 hold_rsb(r);
460}
461
462static void toss_rsb(struct kref *kref)
463{
464 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
465 struct dlm_ls *ls = r->res_ls;
466
467 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
468 kref_init(&r->res_ref);
469 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
470 r->res_toss_time = jiffies;
471 if (r->res_lvbptr) {
472 free_lvb(r->res_lvbptr);
473 r->res_lvbptr = NULL;
474 }
475}
476
477/* When all references to the rsb are gone it's transferred to
478 the tossed list for later disposal. */
479
480static void put_rsb(struct dlm_rsb *r)
481{
482 struct dlm_ls *ls = r->res_ls;
483 uint32_t bucket = r->res_bucket;
484
485 write_lock(&ls->ls_rsbtbl[bucket].lock);
486 kref_put(&r->res_ref, toss_rsb);
487 write_unlock(&ls->ls_rsbtbl[bucket].lock);
488}
489
490void dlm_put_rsb(struct dlm_rsb *r)
491{
492 put_rsb(r);
493}
494
495/* See comment for unhold_lkb */
496
497static void unhold_rsb(struct dlm_rsb *r)
498{
499 int rv;
500 rv = kref_put(&r->res_ref, toss_rsb);
501 DLM_ASSERT(!rv, dlm_dump_rsb(r););
502}
503
504static void kill_rsb(struct kref *kref)
505{
506 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
507
508 /* All work is done after the return from kref_put() so we
509 can release the write_lock before the remove and free. */
510
511 DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
512 DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
513 DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
514 DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
515 DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
516 DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
517}
518
519/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
520 The rsb must exist as long as any lkb's for it do. */
521
522static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
523{
524 hold_rsb(r);
525 lkb->lkb_resource = r;
526}
527
528static void detach_lkb(struct dlm_lkb *lkb)
529{
530 if (lkb->lkb_resource) {
531 put_rsb(lkb->lkb_resource);
532 lkb->lkb_resource = NULL;
533 }
534}
535
536static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
537{
538 struct dlm_lkb *lkb, *tmp;
539 uint32_t lkid = 0;
540 uint16_t bucket;
541
542 lkb = allocate_lkb(ls);
543 if (!lkb)
544 return -ENOMEM;
545
546 lkb->lkb_nodeid = -1;
547 lkb->lkb_grmode = DLM_LOCK_IV;
548 kref_init(&lkb->lkb_ref);
549 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
550
551 get_random_bytes(&bucket, sizeof(bucket));
552 bucket &= (ls->ls_lkbtbl_size - 1);
553
554 write_lock(&ls->ls_lkbtbl[bucket].lock);
555
556 /* counter can roll over so we must verify lkid is not in use */
557
558 while (lkid == 0) {
559 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
560
561 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
562 lkb_idtbl_list) {
563 if (tmp->lkb_id != lkid)
564 continue;
565 lkid = 0;
566 break;
567 }
568 }
569
570 lkb->lkb_id = lkid;
571 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
572 write_unlock(&ls->ls_lkbtbl[bucket].lock);
573
574 *lkb_ret = lkb;
575 return 0;
576}
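/* The lkid packs the bucket into the low 16 bits and the per-bucket
 * counter into the high 16 bits, which is why find_lkb()/__find_lkb()
 * below can recover the bucket with (lkid & 0xFFFF). */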
577
578static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
579{
580 uint16_t bucket = lkid & 0xFFFF;
581 struct dlm_lkb *lkb;
582
583 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
584 if (lkb->lkb_id == lkid)
585 return lkb;
586 }
587 return NULL;
588}
589
590static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
591{
592 struct dlm_lkb *lkb;
593 uint16_t bucket = lkid & 0xFFFF;
594
595 if (bucket >= ls->ls_lkbtbl_size)
596 return -EBADSLT;
597
598 read_lock(&ls->ls_lkbtbl[bucket].lock);
599 lkb = __find_lkb(ls, lkid);
600 if (lkb)
601 kref_get(&lkb->lkb_ref);
602 read_unlock(&ls->ls_lkbtbl[bucket].lock);
603
604 *lkb_ret = lkb;
605 return lkb ? 0 : -ENOENT;
606}
607
608static void kill_lkb(struct kref *kref)
609{
610 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
611
612 /* All work is done after the return from kref_put() so we
613 can release the write_lock before the detach_lkb */
614
615 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
616}
617
618/* __put_lkb() is used when an lkb may not have an rsb attached to
619 it so we need to provide the lockspace explicitly */
620
621static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
622{
623 uint16_t bucket = lkb->lkb_id & 0xFFFF;
624
625 write_lock(&ls->ls_lkbtbl[bucket].lock);
626 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
627 list_del(&lkb->lkb_idtbl_list);
628 write_unlock(&ls->ls_lkbtbl[bucket].lock);
629
630 detach_lkb(lkb);
631
632 /* for local/process lkbs, lvbptr points to caller's lksb */
633 if (lkb->lkb_lvbptr && is_master_copy(lkb))
634 free_lvb(lkb->lkb_lvbptr);
635 free_lkb(lkb);
636 return 1;
637 } else {
638 write_unlock(&ls->ls_lkbtbl[bucket].lock);
639 return 0;
640 }
641}
642
643int dlm_put_lkb(struct dlm_lkb *lkb)
644{
645 struct dlm_ls *ls;
646
647 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
648 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
649
650 ls = lkb->lkb_resource->res_ls;
651 return __put_lkb(ls, lkb);
652}
653
654/* This is only called to add a reference when the code already holds
655 a valid reference to the lkb, so there's no need for locking. */
656
657static inline void hold_lkb(struct dlm_lkb *lkb)
658{
659 kref_get(&lkb->lkb_ref);
660}
661
662/* This is called when we need to remove a reference and are certain
663 it's not the last ref. e.g. del_lkb is always called between a
664 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
665 put_lkb would work fine, but would involve unnecessary locking */
666
667static inline void unhold_lkb(struct dlm_lkb *lkb)
668{
669 int rv;
670 rv = kref_put(&lkb->lkb_ref, kill_lkb);
671 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
672}
673
674static void lkb_add_ordered(struct list_head *new, struct list_head *head,
675 int mode)
676{
677 struct dlm_lkb *lkb = NULL;
678
679 list_for_each_entry(lkb, head, lkb_statequeue)
680 if (lkb->lkb_rqmode < mode)
681 break;
682
683 if (!lkb)
684 list_add_tail(new, head);
685 else
686 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
687}
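/* Note on lkb_add_ordered(): if its loop runs to completion, lkb is not
 * NULL but container_of(head, ...), i.e. &lkb->lkb_statequeue == head,
 * so the __list_add() also degenerates to a tail insert; the !lkb test
 * is effectively never true. */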
688
689/* add/remove lkb to rsb's grant/convert/wait queue */
690
691static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
692{
693 kref_get(&lkb->lkb_ref);
694
695 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
696
697 lkb->lkb_status = status;
698
699 switch (status) {
700 case DLM_LKSTS_WAITING:
701 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
702 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
703 else
704 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
705 break;
706 case DLM_LKSTS_GRANTED:
707 /* convention says granted locks kept in order of grmode */
708 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
709 lkb->lkb_grmode);
710 break;
711 case DLM_LKSTS_CONVERT:
712 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
713 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
714 else
715 list_add_tail(&lkb->lkb_statequeue,
716 &r->res_convertqueue);
717 break;
718 default:
719 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
720 }
721}
722
723static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
724{
725 lkb->lkb_status = 0;
726 list_del(&lkb->lkb_statequeue);
727 unhold_lkb(lkb);
728}
729
730static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
731{
732 hold_lkb(lkb);
733 del_lkb(r, lkb);
734 add_lkb(r, lkb, sts);
735 unhold_lkb(lkb);
736}
737
738/* add/remove lkb from global waiters list of lkb's waiting for
739 a reply from a remote node */
740
741static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
742{
743 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
744
745 mutex_lock(&ls->ls_waiters_mutex);
746 if (lkb->lkb_wait_type) {
747 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
748 goto out;
749 }
750 lkb->lkb_wait_type = mstype;
751 kref_get(&lkb->lkb_ref);
752 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
753 out:
754 mutex_unlock(&ls->ls_waiters_mutex);
755}
756
757static int _remove_from_waiters(struct dlm_lkb *lkb)
758{
759 int error = 0;
760
761 if (!lkb->lkb_wait_type) {
762 log_print("remove_from_waiters error");
763 error = -EINVAL;
764 goto out;
765 }
766 lkb->lkb_wait_type = 0;
767 list_del(&lkb->lkb_wait_reply);
768 unhold_lkb(lkb);
769 out:
770 return error;
771}
772
773static int remove_from_waiters(struct dlm_lkb *lkb)
774{
775 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
776 int error;
777
778 mutex_lock(&ls->ls_waiters_mutex);
779 error = _remove_from_waiters(lkb);
780 mutex_unlock(&ls->ls_waiters_mutex);
781 return error;
782}
783
784static void dir_remove(struct dlm_rsb *r)
785{
786 int to_nodeid;
787
788 if (dlm_no_directory(r->res_ls))
789 return;
790
791 to_nodeid = dlm_dir_nodeid(r);
792 if (to_nodeid != dlm_our_nodeid())
793 send_remove(r);
794 else
795 dlm_dir_remove_entry(r->res_ls, to_nodeid,
796 r->res_name, r->res_length);
797}
798
799/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
800 found since they are in order of newest to oldest? */
801
802static int shrink_bucket(struct dlm_ls *ls, int b)
803{
804 struct dlm_rsb *r;
805 int count = 0, found;
806
807 for (;;) {
808 found = 0;
809 write_lock(&ls->ls_rsbtbl[b].lock);
810 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
811 res_hashchain) {
812 if (!time_after_eq(jiffies, r->res_toss_time +
813 dlm_config.toss_secs * HZ))
814 continue;
815 found = 1;
816 break;
817 }
818
819 if (!found) {
820 write_unlock(&ls->ls_rsbtbl[b].lock);
821 break;
822 }
823
824 if (kref_put(&r->res_ref, kill_rsb)) {
825 list_del(&r->res_hashchain);
826 write_unlock(&ls->ls_rsbtbl[b].lock);
827
828 if (is_master(r))
829 dir_remove(r);
830 free_rsb(r);
831 count++;
832 } else {
833 write_unlock(&ls->ls_rsbtbl[b].lock);
834 log_error(ls, "tossed rsb in use %s", r->res_name);
835 }
836 }
837
838 return count;
839}
840
841void dlm_scan_rsbs(struct dlm_ls *ls)
842{
843 int i;
844
845 if (dlm_locking_stopped(ls))
846 return;
847
848 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
849 shrink_bucket(ls, i);
850 cond_resched();
851 }
852}
853
854/* lkb is master or local copy */
855
856static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
857{
858 int b, len = r->res_ls->ls_lvblen;
859
860 /* b=1 lvb returned to caller
861 b=0 lvb written to rsb or invalidated
862 b=-1 do nothing */
863
864 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
865
866 if (b == 1) {
867 if (!lkb->lkb_lvbptr)
868 return;
869
870 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
871 return;
872
873 if (!r->res_lvbptr)
874 return;
875
876 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
877 lkb->lkb_lvbseq = r->res_lvbseq;
878
879 } else if (b == 0) {
880 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
881 rsb_set_flag(r, RSB_VALNOTVALID);
882 return;
883 }
884
885 if (!lkb->lkb_lvbptr)
886 return;
887
888 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
889 return;
890
891 if (!r->res_lvbptr)
892 r->res_lvbptr = allocate_lvb(r->res_ls);
893
894 if (!r->res_lvbptr)
895 return;
896
897 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
898 r->res_lvbseq++;
899 lkb->lkb_lvbseq = r->res_lvbseq;
900 rsb_clear_flag(r, RSB_VALNOTVALID);
901 }
902
903 if (rsb_flag(r, RSB_VALNOTVALID))
904 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
905}
906
907static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
908{
909 if (lkb->lkb_grmode < DLM_LOCK_PW)
910 return;
911
912 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
913 rsb_set_flag(r, RSB_VALNOTVALID);
914 return;
915 }
916
917 if (!lkb->lkb_lvbptr)
918 return;
919
920 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
921 return;
922
923 if (!r->res_lvbptr)
924 r->res_lvbptr = allocate_lvb(r->res_ls);
925
926 if (!r->res_lvbptr)
927 return;
928
929 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
930 r->res_lvbseq++;
931 rsb_clear_flag(r, RSB_VALNOTVALID);
932}
933
934/* lkb is process copy (pc) */
935
936static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
937 struct dlm_message *ms)
938{
939 int b;
940
941 if (!lkb->lkb_lvbptr)
942 return;
943
944 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
945 return;
946
947 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
948 if (b == 1) {
949 int len = receive_extralen(ms);
950 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
951 lkb->lkb_lvbseq = ms->m_lvbseq;
952 }
953}
954
955/* Manipulate lkb's on rsb's convert/granted/waiting queues
956 remove_lock -- used for unlock, removes lkb from granted
957 revert_lock -- used for cancel, moves lkb from convert to granted
958 grant_lock -- used for request and convert, adds lkb to granted or
959 moves lkb from convert or waiting to granted
960
961 Each of these is used for master or local copy lkb's. There is
962 also a _pc() variation used to make the corresponding change on
963 a process copy (pc) lkb. */
964
965static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
966{
967 del_lkb(r, lkb);
968 lkb->lkb_grmode = DLM_LOCK_IV;
969 /* this unhold undoes the original ref from create_lkb()
970 so this leads to the lkb being freed */
971 unhold_lkb(lkb);
972}
973
974static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
975{
976 set_lvb_unlock(r, lkb);
977 _remove_lock(r, lkb);
978}
979
980static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
981{
982 _remove_lock(r, lkb);
983}
984
985static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
986{
987 lkb->lkb_rqmode = DLM_LOCK_IV;
988
989 switch (lkb->lkb_status) {
990 case DLM_LKSTS_GRANTED:
991 break;
992 case DLM_LKSTS_CONVERT:
993 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
994 break;
995 case DLM_LKSTS_WAITING:
996 del_lkb(r, lkb);
997 lkb->lkb_grmode = DLM_LOCK_IV;
998 /* this unhold undoes the original ref from create_lkb()
999 so this leads to the lkb being freed */
1000 unhold_lkb(lkb);
1001 break;
1002 default:
1003 log_print("invalid status for revert %d", lkb->lkb_status);
1004 }
1005}
1006
1007static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
1008{
1009 revert_lock(r, lkb);
1010}
1011
1012static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1013{
1014 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
1015 lkb->lkb_grmode = lkb->lkb_rqmode;
1016 if (lkb->lkb_status)
1017 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
1018 else
1019 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
1020 }
1021
1022 lkb->lkb_rqmode = DLM_LOCK_IV;
1023}
1024
1025static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1026{
1027 set_lvb_lock(r, lkb);
1028 _grant_lock(r, lkb);
1029 lkb->lkb_highbast = 0;
1030}
1031
1032static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1033 struct dlm_message *ms)
1034{
1035 set_lvb_lock_pc(r, lkb, ms);
1036 _grant_lock(r, lkb);
1037}
1038
1039/* called by grant_pending_locks() which means an async grant message must
1040 be sent to the requesting node in addition to granting the lock if the
1041 lkb belongs to a remote node. */
1042
1043static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1044{
1045 grant_lock(r, lkb);
1046 if (is_master_copy(lkb))
1047 send_grant(r, lkb);
1048 else
1049 queue_cast(r, lkb, 0);
1050}
1051
1052static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1053{
1054 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1055 lkb_statequeue);
1056 if (lkb->lkb_id == first->lkb_id)
1057 return 1;
1058
1059 return 0;
1060}
1061
1062/* Check if the given lkb conflicts with another lkb on the queue. */
1063
1064static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1065{
1066 struct dlm_lkb *this;
1067
1068 list_for_each_entry(this, head, lkb_statequeue) {
1069 if (this == lkb)
1070 continue;
1071 if (!modes_compat(this, lkb))
1072 return 1;
1073 }
1074 return 0;
1075}
1076
1077/*
1078 * "A conversion deadlock arises with a pair of lock requests in the converting
1079 * queue for one resource. The granted mode of each lock blocks the requested
1080 * mode of the other lock."
1081 *
1082 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1083 * convert queue from being granted, then demote lkb (set grmode to NL).
1084 * This second form requires that we check for conv-deadlk even when
1085 * now == 0 in _can_be_granted().
1086 *
1087 * Example:
1088 * Granted Queue: empty
1089 * Convert Queue: NL->EX (first lock)
1090 * PR->EX (second lock)
1091 *
1092 * The first lock can't be granted because of the granted mode of the second
1093 * lock and the second lock can't be granted because it's not first in the
1094 * list. We demote the granted mode of the second lock (the lkb passed to this
1095 * function).
1096 *
1097 * After the resolution, the "grant pending" function needs to go back and try
1098 * to grant locks on the convert queue again since the first lock can now be
1099 * granted.
1100 */
1101
1102static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1103{
1104 struct dlm_lkb *this, *first = NULL, *self = NULL;
1105
1106 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1107 if (!first)
1108 first = this;
1109 if (this == lkb) {
1110 self = lkb;
1111 continue;
1112 }
1113
1114 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1115 return 1;
1116 }
1117
1118 /* if lkb is on the convert queue and is preventing the first
1119 from being granted, then there's deadlock and we demote lkb.
1120 multiple converting locks may need to do this before the first
1121 converting lock can be granted. */
1122
1123 if (self && self != first) {
1124 if (!modes_compat(lkb, first) &&
1125 !queue_conflict(&rsb->res_grantqueue, first))
1126 return 1;
1127 }
1128
1129 return 0;
1130}
1131
1132/*
1133 * Return 1 if the lock can be granted, 0 otherwise.
1134 * Also detect and resolve conversion deadlocks.
1135 *
1136 * lkb is the lock to be granted
1137 *
1138 * now is 1 if the function is being called in the context of the
1139 * immediate request; it is 0 if called later, after the lock has been
1140 * queued.
1141 *
1142 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1143 */
1144
1145static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1146{
1147 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1148
1149 /*
1150 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1151 * a new request for a NL mode lock being blocked.
1152 *
1153 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1154 * request, then it would be granted. In essence, the use of this flag
1155 * tells the Lock Manager to expedite this request by not considering
1156 * what may be in the CONVERTING or WAITING queues... As of this
1157 * writing, the EXPEDITE flag can be used only with new requests for NL
1158 * mode locks. This flag is not valid for conversion requests.
1159 *
1160 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1161 * conversion or used with a non-NL requested mode. We also know an
1162 * EXPEDITE request is always granted immediately, so now must always
1163 * be 1. The full condition to grant an expedite request: (now &&
1164 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1165 * therefore be shortened to just checking the flag.
1166 */
1167
1168 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1169 return 1;
1170
1171 /*
1172 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1173 * added to the remaining conditions.
1174 */
1175
1176 if (queue_conflict(&r->res_grantqueue, lkb))
1177 goto out;
1178
1179 /*
1180 * 6-3: By default, a conversion request is immediately granted if the
1181 * requested mode is compatible with the modes of all other granted
1182 * locks
1183 */
1184
1185 if (queue_conflict(&r->res_convertqueue, lkb))
1186 goto out;
1187
1188 /*
1189 * 6-5: But the default algorithm for deciding whether to grant or
1190 * queue conversion requests does not by itself guarantee that such
1191 * requests are serviced on a "first come first serve" basis. This, in
1192 * turn, can lead to a phenomenon known as "indefinite postponement".
1193 *
1194 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1195 * the system service employed to request a lock conversion. This flag
1196 * forces certain conversion requests to be queued, even if they are
1197 * compatible with the granted modes of other locks on the same
1198 * resource. Thus, the use of this flag results in conversion requests
1199 * being ordered on a "first come first serve" basis.
1200 *
1201 * DCT: This condition is all about new conversions being able to occur
1202 * "in place" while the lock remains on the granted queue (assuming
1203 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1204 * doesn't _have_ to go onto the convert queue where it's processed in
1205 * order. The "now" variable is necessary to distinguish converts
1206 * being received and processed for the first time now, because once a
1207 * convert is moved to the conversion queue the condition below applies
1208 * requiring fifo granting.
1209 */
1210
1211 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1212 return 1;
1213
1214 /*
1215 * The NOORDER flag is set to avoid the standard vms rules on grant
1216 * order.
1217 */
1218
1219 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1220 return 1;
1221
1222 /*
1223 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1224 * granted until all other conversion requests ahead of it are granted
1225 * and/or canceled.
1226 */
1227
1228 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1229 return 1;
1230
1231 /*
1232 * 6-4: By default, a new request is immediately granted only if all
1233 * three of the following conditions are satisfied when the request is
1234 * issued:
1235 * - The queue of ungranted conversion requests for the resource is
1236 * empty.
1237 * - The queue of ungranted new requests for the resource is empty.
1238 * - The mode of the new request is compatible with the most
1239 * restrictive mode of all granted locks on the resource.
1240 */
1241
1242 if (now && !conv && list_empty(&r->res_convertqueue) &&
1243 list_empty(&r->res_waitqueue))
1244 return 1;
1245
1246 /*
1247 * 6-4: Once a lock request is in the queue of ungranted new requests,
1248 * it cannot be granted until the queue of ungranted conversion
1249 * requests is empty, all ungranted new requests ahead of it are
1250 * granted and/or canceled, and it is compatible with the granted mode
1251 * of the most restrictive lock granted on the resource.
1252 */
1253
1254 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1255 first_in_list(lkb, &r->res_waitqueue))
1256 return 1;
1257
1258 out:
1259 /*
1260 * The following, enabled by CONVDEADLK, departs from VMS.
1261 */
1262
1263 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1264 conversion_deadlock_detect(r, lkb)) {
1265 lkb->lkb_grmode = DLM_LOCK_NL;
1266 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1267 }
1268
1269 return 0;
1270}
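
/*
 * Editor's sketch (not part of the original source): the conflict checks
 * above reduce to the classic VMS lock mode compatibility matrix that
 * queue_conflict()/modes_compat() consult.  A standalone version, indexed
 * directly by mode NL(0)..EX(5) -- the in-kernel __dlm_compat_matrix is
 * offset by one to make room for DLM_LOCK_IV -- might look like:
 */

static const int demo_compat[6][6] = {
	       /* NL  CR  CW  PR  PW  EX */
	/* NL */ { 1,  1,  1,  1,  1,  1 },
	/* CR */ { 1,  1,  1,  1,  1,  0 },
	/* CW */ { 1,  1,  1,  0,  0,  0 },
	/* PR */ { 1,  1,  0,  1,  0,  0 },
	/* PW */ { 1,  1,  0,  0,  0,  0 },
	/* EX */ { 1,  0,  0,  0,  0,  0 },
};

/* nonzero if a lock held in grmode is compatible with a request for rqmode */
static int demo_modes_compat(int grmode, int rqmode)
{
	return demo_compat[grmode][rqmode];
}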
1271
1272/*
1273 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1274 * simple way to provide a big optimization to applications that can use them.
1275 */
1276
1277static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1278{
1279 uint32_t flags = lkb->lkb_exflags;
1280 int rv;
1281 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1282
1283 rv = _can_be_granted(r, lkb, now);
1284 if (rv)
1285 goto out;
1286
1287 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1288 goto out;
1289
1290 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1291 alt = DLM_LOCK_PR;
1292 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1293 alt = DLM_LOCK_CW;
1294
1295 if (alt) {
1296 lkb->lkb_rqmode = alt;
1297 rv = _can_be_granted(r, lkb, now);
1298 if (rv)
1299 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1300 else
1301 lkb->lkb_rqmode = rqmode;
1302 }
1303 out:
1304 return rv;
1305}
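
/*
 * Editor's hedged usage sketch for the ALT flags handled above; the
 * demo_* names and the resource name are hypothetical.  A caller that
 * wants CW but can make do with PR passes DLM_LKF_ALTPR, then checks
 * DLM_SBF_ALTMODE in sb_flags from its completion ast to learn whether
 * the alternate (PR) mode was granted instead of the requested one:
 */

static void demo_ast(void *arg)
{
	struct dlm_lksb *lksb = arg;

	if (lksb->sb_flags & DLM_SBF_ALTMODE)
		printk(KERN_INFO "granted in alternate mode PR\n");
}

static int demo_request_cw_or_pr(dlm_lockspace_t *ls, struct dlm_lksb *lksb)
{
	return dlm_lock(ls, DLM_LOCK_CW, lksb, DLM_LKF_ALTPR,
			"demo-resource", 13, 0, demo_ast, lksb, NULL);
}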
1306
1307static int grant_pending_convert(struct dlm_rsb *r, int high)
1308{
1309 struct dlm_lkb *lkb, *s;
1310 int hi, demoted, quit, grant_restart, demote_restart;
1311
1312 quit = 0;
1313 restart:
1314 grant_restart = 0;
1315 demote_restart = 0;
1316 hi = DLM_LOCK_IV;
1317
1318 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1319 demoted = is_demoted(lkb);
1320 if (can_be_granted(r, lkb, 0)) {
1321 grant_lock_pending(r, lkb);
1322 grant_restart = 1;
1323 } else {
1324 hi = max_t(int, lkb->lkb_rqmode, hi);
1325 if (!demoted && is_demoted(lkb))
1326 demote_restart = 1;
1327 }
1328 }
1329
1330 if (grant_restart)
1331 goto restart;
1332 if (demote_restart && !quit) {
1333 quit = 1;
1334 goto restart;
1335 }
1336
1337 return max_t(int, high, hi);
1338}
1339
1340static int grant_pending_wait(struct dlm_rsb *r, int high)
1341{
1342 struct dlm_lkb *lkb, *s;
1343
1344 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1345 if (can_be_granted(r, lkb, 0))
1346 grant_lock_pending(r, lkb);
1347 else
1348 high = max_t(int, lkb->lkb_rqmode, high);
1349 }
1350
1351 return high;
1352}
1353
1354static void grant_pending_locks(struct dlm_rsb *r)
1355{
1356 struct dlm_lkb *lkb, *s;
1357 int high = DLM_LOCK_IV;
1358
1359 DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
1360
1361 high = grant_pending_convert(r, high);
1362 high = grant_pending_wait(r, high);
1363
1364 if (high == DLM_LOCK_IV)
1365 return;
1366
1367 /*
1368 * If there are locks left on the wait/convert queue then send blocking
1369 * ASTs to granted locks based on the largest requested mode (high)
1370 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1371 */
1372
1373 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1374 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1375 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1376 queue_bast(r, lkb, high);
1377 lkb->lkb_highbast = high;
1378 }
1379 }
1380}
1381
1382static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1383 struct dlm_lkb *lkb)
1384{
1385 struct dlm_lkb *gr;
1386
1387 list_for_each_entry(gr, head, lkb_statequeue) {
1388 if (gr->lkb_bastaddr &&
1389 gr->lkb_highbast < lkb->lkb_rqmode &&
1390 !modes_compat(gr, lkb)) {
1391 queue_bast(r, gr, lkb->lkb_rqmode);
1392 gr->lkb_highbast = lkb->lkb_rqmode;
1393 }
1394 }
1395}
1396
1397static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1398{
1399 send_bast_queue(r, &r->res_grantqueue, lkb);
1400}
1401
1402static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1403{
1404 send_bast_queue(r, &r->res_grantqueue, lkb);
1405 send_bast_queue(r, &r->res_convertqueue, lkb);
1406}
1407
1408/* set_master(r, lkb) -- set the master nodeid of a resource
1409
1410 The purpose of this function is to set the nodeid field in the given
1411 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1412 known, it can just be copied to the lkb and the function will return
1413 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1414 before it can be copied to the lkb.
1415
1416 When the rsb nodeid is being looked up remotely, the initial lkb
1417 causing the lookup is kept on the ls_waiters list waiting for the
1418 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1419 on the rsb's res_lookup list until the master is verified.
1420
1421 Return values:
1422 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1423 1: the rsb master is not available and the lkb has been placed on
1424 a wait queue
1425*/
1426
1427static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1428{
1429 struct dlm_ls *ls = r->res_ls;
1430 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1431
1432 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1433 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1434 r->res_first_lkid = lkb->lkb_id;
1435 lkb->lkb_nodeid = r->res_nodeid;
1436 return 0;
1437 }
1438
1439 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1440 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1441 return 1;
1442 }
1443
1444 if (r->res_nodeid == 0) {
1445 lkb->lkb_nodeid = 0;
1446 return 0;
1447 }
1448
1449 if (r->res_nodeid > 0) {
1450 lkb->lkb_nodeid = r->res_nodeid;
1451 return 0;
1452 }
1453
1454 DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
1455
1456 dir_nodeid = dlm_dir_nodeid(r);
1457
1458 if (dir_nodeid != our_nodeid) {
1459 r->res_first_lkid = lkb->lkb_id;
1460 send_lookup(r, lkb);
1461 return 1;
1462 }
1463
1464 for (;;) {
1465 /* It's possible for dlm_scand to remove an old rsb for
1466 this same resource from the toss list, us to create
1467 a new one, look up the master locally, and find it
1468 already exists just before dlm_scand does the
1469 dir_remove() on the previous rsb. */
1470
1471 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1472 r->res_length, &ret_nodeid);
1473 if (!error)
1474 break;
1475 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1476 schedule();
1477 }
1478
1479 if (ret_nodeid == our_nodeid) {
1480 r->res_first_lkid = 0;
1481 r->res_nodeid = 0;
1482 lkb->lkb_nodeid = 0;
1483 } else {
1484 r->res_first_lkid = lkb->lkb_id;
1485 r->res_nodeid = ret_nodeid;
1486 lkb->lkb_nodeid = ret_nodeid;
1487 }
1488 return 0;
1489}
1490
1491static void process_lookup_list(struct dlm_rsb *r)
1492{
1493 struct dlm_lkb *lkb, *safe;
1494
1495 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1496 list_del(&lkb->lkb_rsb_lookup);
1497 _request_lock(r, lkb);
1498 schedule();
1499 }
1500}
1501
1502/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1503
1504static void confirm_master(struct dlm_rsb *r, int error)
1505{
1506 struct dlm_lkb *lkb;
1507
1508 if (!r->res_first_lkid)
1509 return;
1510
1511 switch (error) {
1512 case 0:
1513 case -EINPROGRESS:
1514 r->res_first_lkid = 0;
1515 process_lookup_list(r);
1516 break;
1517
1518 case -EAGAIN:
1519 /* the remote master didn't queue our NOQUEUE request;
1520 make a waiting lkb the first_lkid */
1521
1522 r->res_first_lkid = 0;
1523
1524 if (!list_empty(&r->res_lookup)) {
1525 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1526 lkb_rsb_lookup);
1527 list_del(&lkb->lkb_rsb_lookup);
1528 r->res_first_lkid = lkb->lkb_id;
1529 _request_lock(r, lkb);
1530 } else
1531 r->res_nodeid = -1;
1532 break;
1533
1534 default:
1535 log_error(r->res_ls, "confirm_master unknown error %d", error);
1536 }
1537}
1538
1539static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1540 int namelen, uint32_t parent_lkid, void *ast,
1541 void *astarg, void *bast, struct dlm_args *args)
1542{
1543 int rv = -EINVAL;
1544
1545 /* check for invalid arg usage */
1546
1547 if (mode < 0 || mode > DLM_LOCK_EX)
1548 goto out;
1549
1550 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1551 goto out;
1552
1553 if (flags & DLM_LKF_CANCEL)
1554 goto out;
1555
1556 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1557 goto out;
1558
1559 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1560 goto out;
1561
1562 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1563 goto out;
1564
1565 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1566 goto out;
1567
1568 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1569 goto out;
1570
1571 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1572 goto out;
1573
1574 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1575 goto out;
1576
1577 if (!ast || !lksb)
1578 goto out;
1579
1580 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1581 goto out;
1582
1583 /* parent/child locks not yet supported */
1584 if (parent_lkid)
1585 goto out;
1586
1587 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1588 goto out;
1589
1590 /* these args will be copied to the lkb in validate_lock_args,
1591 it cannot be done now because when converting locks, fields in
1592 an active lkb cannot be modified before locking the rsb */
1593
1594 args->flags = flags;
1595 args->astaddr = ast;
1596 args->astparam = (long) astarg;
1597 args->bastaddr = bast;
1598 args->mode = mode;
1599 args->lksb = lksb;
1600 rv = 0;
1601 out:
1602 return rv;
1603}
1604
1605static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1606{
1607 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1608 DLM_LKF_FORCEUNLOCK))
1609 return -EINVAL;
1610
1611 args->flags = flags;
1612 args->astparam = (long) astarg;
1613 return 0;
1614}
1615
1616static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1617 struct dlm_args *args)
1618{
1619 int rv = -EINVAL;
1620
1621 if (args->flags & DLM_LKF_CONVERT) {
1622 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1623 goto out;
1624
1625 if (args->flags & DLM_LKF_QUECVT &&
1626 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1627 goto out;
1628
1629 rv = -EBUSY;
1630 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1631 goto out;
1632
1633 if (lkb->lkb_wait_type)
1634 goto out;
1635 }
1636
1637 lkb->lkb_exflags = args->flags;
1638 lkb->lkb_sbflags = 0;
1639 lkb->lkb_astaddr = args->astaddr;
1640 lkb->lkb_astparam = args->astparam;
1641 lkb->lkb_bastaddr = args->bastaddr;
1642 lkb->lkb_rqmode = args->mode;
1643 lkb->lkb_lksb = args->lksb;
1644 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1645 lkb->lkb_ownpid = (int) current->pid;
1646 rv = 0;
1647 out:
1648 return rv;
1649}
1650
1651static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1652{
1653 int rv = -EINVAL;
1654
1655 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1656 goto out;
1657
1658 if (args->flags & DLM_LKF_FORCEUNLOCK)
1659 goto out_ok;
1660
1661 if (args->flags & DLM_LKF_CANCEL &&
1662 lkb->lkb_status == DLM_LKSTS_GRANTED)
1663 goto out;
1664
1665 if (!(args->flags & DLM_LKF_CANCEL) &&
1666 lkb->lkb_status != DLM_LKSTS_GRANTED)
1667 goto out;
1668
1669 rv = -EBUSY;
1670 if (lkb->lkb_wait_type)
1671 goto out;
1672
1673 out_ok:
1674 lkb->lkb_exflags = args->flags;
1675 lkb->lkb_sbflags = 0;
1676 lkb->lkb_astparam = args->astparam;
1677
1678 rv = 0;
1679 out:
1680 return rv;
1681}
1682
1683/*
1684 * Four stage 4 varieties:
1685 * do_request(), do_convert(), do_unlock(), do_cancel()
1686 * These are called on the master node for the given lock and
1687 * from the central locking logic.
1688 */
1689
1690static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1691{
1692 int error = 0;
1693
1694 if (can_be_granted(r, lkb, 1)) {
1695 grant_lock(r, lkb);
1696 queue_cast(r, lkb, 0);
1697 goto out;
1698 }
1699
1700 if (can_be_queued(lkb)) {
1701 error = -EINPROGRESS;
1702 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1703 send_blocking_asts(r, lkb);
1704 goto out;
1705 }
1706
1707 error = -EAGAIN;
1708 if (force_blocking_asts(lkb))
1709 send_blocking_asts_all(r, lkb);
1710 queue_cast(r, lkb, -EAGAIN);
1711
1712 out:
1713 return error;
1714}
1715
1716static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1717{
1718 int error = 0;
1719
1720 /* changing an existing lock may allow others to be granted */
1721
1722 if (can_be_granted(r, lkb, 1)) {
1723 grant_lock(r, lkb);
1724 queue_cast(r, lkb, 0);
1725 grant_pending_locks(r);
1726 goto out;
1727 }
1728
1729 if (can_be_queued(lkb)) {
1730 if (is_demoted(lkb))
1731 grant_pending_locks(r);
1732 error = -EINPROGRESS;
1733 del_lkb(r, lkb);
1734 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1735 send_blocking_asts(r, lkb);
1736 goto out;
1737 }
1738
1739 error = -EAGAIN;
1740 if (force_blocking_asts(lkb))
1741 send_blocking_asts_all(r, lkb);
1742 queue_cast(r, lkb, -EAGAIN);
1743
1744 out:
1745 return error;
1746}
1747
1748static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1749{
1750 remove_lock(r, lkb);
1751 queue_cast(r, lkb, -DLM_EUNLOCK);
1752 grant_pending_locks(r);
1753 return -DLM_EUNLOCK;
1754}
1755
1756/* FIXME: if revert_lock() finds that the lkb is granted, we should
1757 skip the queue_cast(ECANCEL). It indicates that the request/convert
1758 completed (and queued a normal ast) just before the cancel; we don't
1759 want to clobber the sb_result for the normal ast with ECANCEL. */
1760
1761static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1762{
1763 revert_lock(r, lkb);
1764 queue_cast(r, lkb, -DLM_ECANCEL);
1765 grant_pending_locks(r);
1766 return -DLM_ECANCEL;
1767}
1768
1769/*
1770 * Four stage 3 varieties:
1771 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1772 */
1773
1774/* add a new lkb to a possibly new rsb, called by requesting process */
1775
1776static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1777{
1778 int error;
1779
1780 /* set_master: sets lkb nodeid from r */
1781
1782 error = set_master(r, lkb);
1783 if (error < 0)
1784 goto out;
1785 if (error) {
1786 error = 0;
1787 goto out;
1788 }
1789
1790 if (is_remote(r))
1791 /* receive_request() calls do_request() on remote node */
1792 error = send_request(r, lkb);
1793 else
1794 error = do_request(r, lkb);
1795 out:
1796 return error;
1797}
1798
1799/* change some property of an existing lkb, e.g. mode */
1800
1801static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1802{
1803 int error;
1804
1805 if (is_remote(r))
1806 /* receive_convert() calls do_convert() on remote node */
1807 error = send_convert(r, lkb);
1808 else
1809 error = do_convert(r, lkb);
1810
1811 return error;
1812}
1813
1814/* remove an existing lkb from the granted queue */
1815
1816static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1817{
1818 int error;
1819
1820 if (is_remote(r))
1821 /* receive_unlock() calls do_unlock() on remote node */
1822 error = send_unlock(r, lkb);
1823 else
1824 error = do_unlock(r, lkb);
1825
1826 return error;
1827}
1828
1829/* remove an existing lkb from the convert or wait queue */
1830
1831static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1832{
1833 int error;
1834
1835 if (is_remote(r))
1836 /* receive_cancel() calls do_cancel() on remote node */
1837 error = send_cancel(r, lkb);
1838 else
1839 error = do_cancel(r, lkb);
1840
1841 return error;
1842}
1843
1844/*
1845 * Four stage 2 varieties:
1846 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1847 */
1848
1849static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1850 int len, struct dlm_args *args)
1851{
1852 struct dlm_rsb *r;
1853 int error;
1854
1855 error = validate_lock_args(ls, lkb, args);
1856 if (error)
1857 goto out;
1858
1859 error = find_rsb(ls, name, len, R_CREATE, &r);
1860 if (error)
1861 goto out;
1862
1863 lock_rsb(r);
1864
1865 attach_lkb(r, lkb);
1866 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1867
1868 error = _request_lock(r, lkb);
1869
1870 unlock_rsb(r);
1871 put_rsb(r);
1872
1873 out:
1874 return error;
1875}
1876
1877static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1878 struct dlm_args *args)
1879{
1880 struct dlm_rsb *r;
1881 int error;
1882
1883 r = lkb->lkb_resource;
1884
1885 hold_rsb(r);
1886 lock_rsb(r);
1887
1888 error = validate_lock_args(ls, lkb, args);
1889 if (error)
1890 goto out;
1891
1892 error = _convert_lock(r, lkb);
1893 out:
1894 unlock_rsb(r);
1895 put_rsb(r);
1896 return error;
1897}
1898
1899static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1900 struct dlm_args *args)
1901{
1902 struct dlm_rsb *r;
1903 int error;
1904
1905 r = lkb->lkb_resource;
1906
1907 hold_rsb(r);
1908 lock_rsb(r);
1909
1910 error = validate_unlock_args(lkb, args);
1911 if (error)
1912 goto out;
1913
1914 error = _unlock_lock(r, lkb);
1915 out:
1916 unlock_rsb(r);
1917 put_rsb(r);
1918 return error;
1919}
1920
1921static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1922 struct dlm_args *args)
1923{
1924 struct dlm_rsb *r;
1925 int error;
1926
1927 r = lkb->lkb_resource;
1928
1929 hold_rsb(r);
1930 lock_rsb(r);
1931
1932 error = validate_unlock_args(lkb, args);
1933 if (error)
1934 goto out;
1935
1936 error = _cancel_lock(r, lkb);
1937 out:
1938 unlock_rsb(r);
1939 put_rsb(r);
1940 return error;
1941}
1942
1943/*
1944 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1945 */
1946
1947int dlm_lock(dlm_lockspace_t *lockspace,
1948 int mode,
1949 struct dlm_lksb *lksb,
1950 uint32_t flags,
1951 void *name,
1952 unsigned int namelen,
1953 uint32_t parent_lkid,
1954 void (*ast) (void *astarg),
1955 void *astarg,
1956 void (*bast) (void *astarg, int mode))
1957{
1958 struct dlm_ls *ls;
1959 struct dlm_lkb *lkb;
1960 struct dlm_args args;
1961 int error, convert = flags & DLM_LKF_CONVERT;
1962
1963 ls = dlm_find_lockspace_local(lockspace);
1964 if (!ls)
1965 return -EINVAL;
1966
1967 lock_recovery(ls);
1968
1969 if (convert)
1970 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1971 else
1972 error = create_lkb(ls, &lkb);
1973
1974 if (error)
1975 goto out;
1976
1977 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1978 astarg, bast, &args);
1979 if (error)
1980 goto out_put;
1981
1982 if (convert)
1983 error = convert_lock(ls, lkb, &args);
1984 else
1985 error = request_lock(ls, lkb, name, namelen, &args);
1986
1987 if (error == -EINPROGRESS)
1988 error = 0;
1989 out_put:
1990 if (convert || error)
1991 __put_lkb(ls, lkb);
1992 if (error == -EAGAIN)
1993 error = 0;
1994 out:
1995 unlock_recovery(ls);
1996 dlm_put_lockspace(ls);
1997 return error;
1998}
1999
2000int dlm_unlock(dlm_lockspace_t *lockspace,
2001 uint32_t lkid,
2002 uint32_t flags,
2003 struct dlm_lksb *lksb,
2004 void *astarg)
2005{
2006 struct dlm_ls *ls;
2007 struct dlm_lkb *lkb;
2008 struct dlm_args args;
2009 int error;
2010
2011 ls = dlm_find_lockspace_local(lockspace);
2012 if (!ls)
2013 return -EINVAL;
2014
2015 lock_recovery(ls);
2016
2017 error = find_lkb(ls, lkid, &lkb);
2018 if (error)
2019 goto out;
2020
2021 error = set_unlock_args(flags, astarg, &args);
2022 if (error)
2023 goto out_put;
2024
2025 if (flags & DLM_LKF_CANCEL)
2026 error = cancel_lock(ls, lkb, &args);
2027 else
2028 error = unlock_lock(ls, lkb, &args);
2029
2030 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2031 error = 0;
2032 out_put:
2033 dlm_put_lkb(lkb);
2034 out:
2035 unlock_recovery(ls);
2036 dlm_put_lockspace(ls);
2037 return error;
2038}
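
/*
 * Editor's hedged end-to-end sketch of the two stage-1 entry points
 * above.  Everything named demo_* is hypothetical; waiting on a
 * completion is one common caller pattern, since the final sb_status
 * is delivered asynchronously through the completion ast:
 */

static struct dlm_lksb demo_lksb;
static struct completion demo_done;

static void demo_sync_ast(void *arg)
{
	/* demo_lksb.sb_status now holds the result of the last operation */
	complete(&demo_done);
}

static int demo_lock_then_unlock(dlm_lockspace_t *ls)
{
	int error;

	init_completion(&demo_done);
	error = dlm_lock(ls, DLM_LOCK_EX, &demo_lksb, DLM_LKF_NOQUEUE,
			 "demo", 4, 0, demo_sync_ast, &demo_lksb, NULL);
	if (error)
		return error;
	wait_for_completion(&demo_done);
	if (demo_lksb.sb_status)	/* -EAGAIN if it would have blocked */
		return demo_lksb.sb_status;

	init_completion(&demo_done);
	error = dlm_unlock(ls, demo_lksb.sb_lkid, 0, &demo_lksb, &demo_lksb);
	if (error)
		return error;
	wait_for_completion(&demo_done);
	return 0;	/* sb_status is -DLM_EUNLOCK on a normal unlock */
}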
2039
2040/*
2041 * send/receive routines for remote operations and replies
2042 *
2043 * send_args
2044 * send_common
2045 * send_request receive_request
2046 * send_convert receive_convert
2047 * send_unlock receive_unlock
2048 * send_cancel receive_cancel
2049 * send_grant receive_grant
2050 * send_bast receive_bast
2051 * send_lookup receive_lookup
2052 * send_remove receive_remove
2053 *
2054 * send_common_reply
2055 * receive_request_reply send_request_reply
2056 * receive_convert_reply send_convert_reply
2057 * receive_unlock_reply send_unlock_reply
2058 * receive_cancel_reply send_cancel_reply
2059 * receive_lookup_reply send_lookup_reply
2060 */
2061
2062static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2063 int to_nodeid, int mstype,
2064 struct dlm_message **ms_ret,
2065 struct dlm_mhandle **mh_ret)
2066{
2067 struct dlm_message *ms;
2068 struct dlm_mhandle *mh;
2069 char *mb;
2070 int mb_len = sizeof(struct dlm_message);
2071
2072 switch (mstype) {
2073 case DLM_MSG_REQUEST:
2074 case DLM_MSG_LOOKUP:
2075 case DLM_MSG_REMOVE:
2076 mb_len += r->res_length;
2077 break;
2078 case DLM_MSG_CONVERT:
2079 case DLM_MSG_UNLOCK:
2080 case DLM_MSG_REQUEST_REPLY:
2081 case DLM_MSG_CONVERT_REPLY:
2082 case DLM_MSG_GRANT:
2083 if (lkb && lkb->lkb_lvbptr)
2084 mb_len += r->res_ls->ls_lvblen;
2085 break;
2086 }
2087
2088 /* get_buffer gives us a message handle (mh) that we need to
2089 pass into lowcomms_commit and a message buffer (mb) that we
2090 write our data into */
2091
2092 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2093 if (!mh)
2094 return -ENOBUFS;
2095
2096 memset(mb, 0, mb_len);
2097
2098 ms = (struct dlm_message *) mb;
2099
2100 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2101 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2102 ms->m_header.h_nodeid = dlm_our_nodeid();
2103 ms->m_header.h_length = mb_len;
2104 ms->m_header.h_cmd = DLM_MSG;
2105
2106 ms->m_type = mstype;
2107
2108 *mh_ret = mh;
2109 *ms_ret = ms;
2110 return 0;
2111}
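
/*
 * Editor's note on the buffer built above (a sketch, not authoritative):
 * each message is the fixed struct dlm_message optionally followed by a
 * variable tail -- the resource name for REQUEST/LOOKUP/REMOVE, or an
 * LVB for the lvb-carrying types.  h_length covers the whole buffer,
 * which is what lets receive_extralen() below recover the tail size:
 *
 *   |<-- sizeof(struct dlm_message) -->|<-- res name or LVB -->|
 *   +----------------------------------+------------------------+
 *   | m_header ... fixed fields        | m_extra[]              |
 *   +----------------------------------+------------------------+
 *                h_length = total length of both parts
 */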
2112
2113/* further lowcomms enhancements or alternate implementations may make
2114 the return value from this function useful at some point */
2115
2116static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2117{
2118 dlm_message_out(ms);
2119 dlm_lowcomms_commit_buffer(mh);
2120 return 0;
2121}
2122
2123static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2124 struct dlm_message *ms)
2125{
2126 ms->m_nodeid = lkb->lkb_nodeid;
2127 ms->m_pid = lkb->lkb_ownpid;
2128 ms->m_lkid = lkb->lkb_id;
2129 ms->m_remid = lkb->lkb_remid;
2130 ms->m_exflags = lkb->lkb_exflags;
2131 ms->m_sbflags = lkb->lkb_sbflags;
2132 ms->m_flags = lkb->lkb_flags;
2133 ms->m_lvbseq = lkb->lkb_lvbseq;
2134 ms->m_status = lkb->lkb_status;
2135 ms->m_grmode = lkb->lkb_grmode;
2136 ms->m_rqmode = lkb->lkb_rqmode;
2137 ms->m_hash = r->res_hash;
2138
2139 /* m_result and m_bastmode are set from function args,
2140 not from lkb fields */
2141
2142 if (lkb->lkb_bastaddr)
2143 ms->m_asts |= AST_BAST;
2144 if (lkb->lkb_astaddr)
2145 ms->m_asts |= AST_COMP;
2146
2147 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2148 memcpy(ms->m_extra, r->res_name, r->res_length);
2149
2150 else if (lkb->lkb_lvbptr)
2151 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2152
2153}
2154
2155static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2156{
2157 struct dlm_message *ms;
2158 struct dlm_mhandle *mh;
2159 int to_nodeid, error;
2160
2161 add_to_waiters(lkb, mstype);
2162
2163 to_nodeid = r->res_nodeid;
2164
2165 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2166 if (error)
2167 goto fail;
2168
2169 send_args(r, lkb, ms);
2170
2171 error = send_message(mh, ms);
2172 if (error)
2173 goto fail;
2174 return 0;
2175
2176 fail:
2177 remove_from_waiters(lkb);
2178 return error;
2179}
2180
2181static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2182{
2183 return send_common(r, lkb, DLM_MSG_REQUEST);
2184}
2185
2186static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2187{
2188 int error;
2189
2190 error = send_common(r, lkb, DLM_MSG_CONVERT);
2191
2192 /* down conversions go without a reply from the master */
2193 if (!error && down_conversion(lkb)) {
2194 remove_from_waiters(lkb);
2195 r->res_ls->ls_stub_ms.m_result = 0;
2196 r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
2197 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2198 }
2199
2200 return error;
2201}
2202
2203/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2204 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2205 that the master is still correct. */
2206
2207static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2208{
2209 return send_common(r, lkb, DLM_MSG_UNLOCK);
2210}
2211
2212static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2213{
2214 return send_common(r, lkb, DLM_MSG_CANCEL);
2215}
2216
2217static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2218{
2219 struct dlm_message *ms;
2220 struct dlm_mhandle *mh;
2221 int to_nodeid, error;
2222
2223 to_nodeid = lkb->lkb_nodeid;
2224
2225 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2226 if (error)
2227 goto out;
2228
2229 send_args(r, lkb, ms);
2230
2231 ms->m_result = 0;
2232
2233 error = send_message(mh, ms);
2234 out:
2235 return error;
2236}
2237
2238static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2239{
2240 struct dlm_message *ms;
2241 struct dlm_mhandle *mh;
2242 int to_nodeid, error;
2243
2244 to_nodeid = lkb->lkb_nodeid;
2245
2246 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2247 if (error)
2248 goto out;
2249
2250 send_args(r, lkb, ms);
2251
2252 ms->m_bastmode = mode;
2253
2254 error = send_message(mh, ms);
2255 out:
2256 return error;
2257}
2258
2259static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2260{
2261 struct dlm_message *ms;
2262 struct dlm_mhandle *mh;
2263 int to_nodeid, error;
2264
2265 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2266
2267 to_nodeid = dlm_dir_nodeid(r);
2268
2269 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2270 if (error)
2271 goto fail;
2272
2273 send_args(r, lkb, ms);
2274
2275 error = send_message(mh, ms);
2276 if (error)
2277 goto fail;
2278 return 0;
2279
2280 fail:
2281 remove_from_waiters(lkb);
2282 return error;
2283}
2284
2285static int send_remove(struct dlm_rsb *r)
2286{
2287 struct dlm_message *ms;
2288 struct dlm_mhandle *mh;
2289 int to_nodeid, error;
2290
2291 to_nodeid = dlm_dir_nodeid(r);
2292
2293 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2294 if (error)
2295 goto out;
2296
2297 memcpy(ms->m_extra, r->res_name, r->res_length);
2298 ms->m_hash = r->res_hash;
2299
2300 error = send_message(mh, ms);
2301 out:
2302 return error;
2303}
2304
2305static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2306 int mstype, int rv)
2307{
2308 struct dlm_message *ms;
2309 struct dlm_mhandle *mh;
2310 int to_nodeid, error;
2311
2312 to_nodeid = lkb->lkb_nodeid;
2313
2314 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2315 if (error)
2316 goto out;
2317
2318 send_args(r, lkb, ms);
2319
2320 ms->m_result = rv;
2321
2322 error = send_message(mh, ms);
2323 out:
2324 return error;
2325}
2326
2327static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2328{
2329 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2330}
2331
2332static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2333{
2334 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2335}
2336
2337static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2338{
2339 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2340}
2341
2342static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2343{
2344 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2345}
2346
2347static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2348 int ret_nodeid, int rv)
2349{
2350 struct dlm_rsb *r = &ls->ls_stub_rsb;
2351 struct dlm_message *ms;
2352 struct dlm_mhandle *mh;
2353 int error, nodeid = ms_in->m_header.h_nodeid;
2354
2355 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2356 if (error)
2357 goto out;
2358
2359 ms->m_lkid = ms_in->m_lkid;
2360 ms->m_result = rv;
2361 ms->m_nodeid = ret_nodeid;
2362
2363 error = send_message(mh, ms);
2364 out:
2365 return error;
2366}
2367
2368/* which args we save from a received message depends heavily on the type
2369 of message, unlike the send side where we can safely send everything about
2370 the lkb for any type of message */
2371
2372static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2373{
2374 lkb->lkb_exflags = ms->m_exflags;
2375 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2376 (ms->m_flags & 0x0000FFFF);
2377}
2378
2379static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2380{
2381 lkb->lkb_sbflags = ms->m_sbflags;
2382 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2383 (ms->m_flags & 0x0000FFFF);
2384}
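
/*
 * Editor's hedged illustration of the merge both helpers above perform:
 * only the low 16 bits of lkb_flags are wire flags carried in m_flags;
 * the high 16 bits (DLM_IFL_MSTCPY, DLM_IFL_RESEND, ...) are node-local
 * state that must survive a receive.
 */
static uint32_t demo_merge_flags(uint32_t local_flags, uint32_t wire_flags)
{
	return (local_flags & 0xFFFF0000) | (wire_flags & 0x0000FFFF);
}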
2385
2386static int receive_extralen(struct dlm_message *ms)
2387{
2388 return (ms->m_header.h_length - sizeof(struct dlm_message));
2389}
2390
2391static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2392 struct dlm_message *ms)
2393{
2394 int len;
2395
2396 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2397 if (!lkb->lkb_lvbptr)
2398 lkb->lkb_lvbptr = allocate_lvb(ls);
2399 if (!lkb->lkb_lvbptr)
2400 return -ENOMEM;
2401 len = receive_extralen(ms);
2402 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2403 }
2404 return 0;
2405}
2406
2407static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2408 struct dlm_message *ms)
2409{
2410 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2411 lkb->lkb_ownpid = ms->m_pid;
2412 lkb->lkb_remid = ms->m_lkid;
2413 lkb->lkb_grmode = DLM_LOCK_IV;
2414 lkb->lkb_rqmode = ms->m_rqmode;
2415 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2416 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2417
2418 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2419
2420 if (receive_lvb(ls, lkb, ms))
2421 return -ENOMEM;
2422
2423 return 0;
2424}
2425
2426static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2427 struct dlm_message *ms)
2428{
2429 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2430 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2431 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2432 lkb->lkb_id, lkb->lkb_remid);
2433 return -EINVAL;
2434 }
2435
2436 if (!is_master_copy(lkb))
2437 return -EINVAL;
2438
2439 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2440 return -EBUSY;
2441
2442 if (receive_lvb(ls, lkb, ms))
2443 return -ENOMEM;
2444
2445 lkb->lkb_rqmode = ms->m_rqmode;
2446 lkb->lkb_lvbseq = ms->m_lvbseq;
2447
2448 return 0;
2449}
2450
2451static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2452 struct dlm_message *ms)
2453{
2454 if (!is_master_copy(lkb))
2455 return -EINVAL;
2456 if (receive_lvb(ls, lkb, ms))
2457 return -ENOMEM;
2458 return 0;
2459}
2460
2461/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2462 uses to send a reply and that the remote end uses to process the reply. */
2463
2464static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2465{
2466 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2467 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2468 lkb->lkb_remid = ms->m_lkid;
2469}
2470
2471static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2472{
2473 struct dlm_lkb *lkb;
2474 struct dlm_rsb *r;
2475 int error, namelen;
2476
2477 error = create_lkb(ls, &lkb);
2478 if (error)
2479 goto fail;
2480
2481 receive_flags(lkb, ms);
2482 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2483 error = receive_request_args(ls, lkb, ms);
2484 if (error) {
2485 __put_lkb(ls, lkb);
2486 goto fail;
2487 }
2488
2489 namelen = receive_extralen(ms);
2490
2491 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2492 if (error) {
2493 __put_lkb(ls, lkb);
2494 goto fail;
2495 }
2496
2497 lock_rsb(r);
2498
2499 attach_lkb(r, lkb);
2500 error = do_request(r, lkb);
2501 send_request_reply(r, lkb, error);
2502
2503 unlock_rsb(r);
2504 put_rsb(r);
2505
2506 if (error == -EINPROGRESS)
2507 error = 0;
2508 if (error)
2509 dlm_put_lkb(lkb);
2510 return;
2511
2512 fail:
2513 setup_stub_lkb(ls, ms);
2514 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2515}
2516
2517static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2518{
2519 struct dlm_lkb *lkb;
2520 struct dlm_rsb *r;
2521 int error, reply = 1;
2522
2523 error = find_lkb(ls, ms->m_remid, &lkb);
2524 if (error)
2525 goto fail;
2526
2527 r = lkb->lkb_resource;
2528
2529 hold_rsb(r);
2530 lock_rsb(r);
2531
2532 receive_flags(lkb, ms);
2533 error = receive_convert_args(ls, lkb, ms);
2534 if (error)
2535 goto out;
2536 reply = !down_conversion(lkb);
2537
2538 error = do_convert(r, lkb);
2539 out:
2540 if (reply)
2541 send_convert_reply(r, lkb, error);
2542
2543 unlock_rsb(r);
2544 put_rsb(r);
2545 dlm_put_lkb(lkb);
2546 return;
2547
2548 fail:
2549 setup_stub_lkb(ls, ms);
2550 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2551}
2552
2553static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2554{
2555 struct dlm_lkb *lkb;
2556 struct dlm_rsb *r;
2557 int error;
2558
2559 error = find_lkb(ls, ms->m_remid, &lkb);
2560 if (error)
2561 goto fail;
2562
2563 r = lkb->lkb_resource;
2564
2565 hold_rsb(r);
2566 lock_rsb(r);
2567
2568 receive_flags(lkb, ms);
2569 error = receive_unlock_args(ls, lkb, ms);
2570 if (error)
2571 goto out;
2572
2573 error = do_unlock(r, lkb);
2574 out:
2575 send_unlock_reply(r, lkb, error);
2576
2577 unlock_rsb(r);
2578 put_rsb(r);
2579 dlm_put_lkb(lkb);
2580 return;
2581
2582 fail:
2583 setup_stub_lkb(ls, ms);
2584 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2585}
2586
2587static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2588{
2589 struct dlm_lkb *lkb;
2590 struct dlm_rsb *r;
2591 int error;
2592
2593 error = find_lkb(ls, ms->m_remid, &lkb);
2594 if (error)
2595 goto fail;
2596
2597 receive_flags(lkb, ms);
2598
2599 r = lkb->lkb_resource;
2600
2601 hold_rsb(r);
2602 lock_rsb(r);
2603
2604 error = do_cancel(r, lkb);
2605 send_cancel_reply(r, lkb, error);
2606
2607 unlock_rsb(r);
2608 put_rsb(r);
2609 dlm_put_lkb(lkb);
2610 return;
2611
2612 fail:
2613 setup_stub_lkb(ls, ms);
2614 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2615}
2616
2617static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2618{
2619 struct dlm_lkb *lkb;
2620 struct dlm_rsb *r;
2621 int error;
2622
2623 error = find_lkb(ls, ms->m_remid, &lkb);
2624 if (error) {
2625 log_error(ls, "receive_grant no lkb");
2626 return;
2627 }
2628 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2629
2630 r = lkb->lkb_resource;
2631
2632 hold_rsb(r);
2633 lock_rsb(r);
2634
2635 receive_flags_reply(lkb, ms);
2636 grant_lock_pc(r, lkb, ms);
2637 queue_cast(r, lkb, 0);
2638
2639 unlock_rsb(r);
2640 put_rsb(r);
2641 dlm_put_lkb(lkb);
2642}
2643
2644static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2645{
2646 struct dlm_lkb *lkb;
2647 struct dlm_rsb *r;
2648 int error;
2649
2650 error = find_lkb(ls, ms->m_remid, &lkb);
2651 if (error) {
2652 log_error(ls, "receive_bast no lkb");
2653 return;
2654 }
2655 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2656
2657 r = lkb->lkb_resource;
2658
2659 hold_rsb(r);
2660 lock_rsb(r);
2661
2662 queue_bast(r, lkb, ms->m_bastmode);
2663
2664 unlock_rsb(r);
2665 put_rsb(r);
2666 dlm_put_lkb(lkb);
2667}
2668
2669static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2670{
2671 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2672
2673 from_nodeid = ms->m_header.h_nodeid;
2674 our_nodeid = dlm_our_nodeid();
2675
2676 len = receive_extralen(ms);
2677
2678 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2679 if (dir_nodeid != our_nodeid) {
2680 log_error(ls, "lookup dir_nodeid %d from %d",
2681 dir_nodeid, from_nodeid);
2682 error = -EINVAL;
2683 ret_nodeid = -1;
2684 goto out;
2685 }
2686
2687 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2688
2689 /* Optimization: we're master so treat lookup as a request */
2690 if (!error && ret_nodeid == our_nodeid) {
2691 receive_request(ls, ms);
2692 return;
2693 }
2694 out:
2695 send_lookup_reply(ls, ms, ret_nodeid, error);
2696}
2697
2698static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2699{
2700 int len, dir_nodeid, from_nodeid;
2701
2702 from_nodeid = ms->m_header.h_nodeid;
2703
2704 len = receive_extralen(ms);
2705
2706 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2707 if (dir_nodeid != dlm_our_nodeid()) {
2708 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2709 dir_nodeid, from_nodeid);
2710 return;
2711 }
2712
2713 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2714}
2715
2716static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2717{
2718 struct dlm_lkb *lkb;
2719 struct dlm_rsb *r;
2720 int error, mstype;
2721
2722 error = find_lkb(ls, ms->m_remid, &lkb);
2723 if (error) {
2724 log_error(ls, "receive_request_reply no lkb");
2725 return;
2726 }
2727 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2728
2729 mstype = lkb->lkb_wait_type;
2730 error = remove_from_waiters(lkb);
2731 if (error) {
2732 log_error(ls, "receive_request_reply not on waiters");
2733 goto out;
2734 }
2735
2736 /* this is the value returned from do_request() on the master */
2737 error = ms->m_result;
2738
2739 r = lkb->lkb_resource;
2740 hold_rsb(r);
2741 lock_rsb(r);
2742
2743 /* Optimization: the dir node was also the master, so it took our
2744 lookup as a request and sent request reply instead of lookup reply */
2745 if (mstype == DLM_MSG_LOOKUP) {
2746 r->res_nodeid = ms->m_header.h_nodeid;
2747 lkb->lkb_nodeid = r->res_nodeid;
2748 }
2749
2750 switch (error) {
2751 case -EAGAIN:
2752 /* request would block (be queued) on remote master;
2753 the unhold undoes the original ref from create_lkb()
2754 so it leads to the lkb being freed */
2755 queue_cast(r, lkb, -EAGAIN);
2756 confirm_master(r, -EAGAIN);
2757 unhold_lkb(lkb);
2758 break;
2759
2760 case -EINPROGRESS:
2761 case 0:
2762 /* request was queued or granted on remote master */
2763 receive_flags_reply(lkb, ms);
2764 lkb->lkb_remid = ms->m_lkid;
2765 if (error)
2766 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2767 else {
2768 grant_lock_pc(r, lkb, ms);
2769 queue_cast(r, lkb, 0);
2770 }
2771 confirm_master(r, error);
2772 break;
2773
2774 case -EBADR:
2775 case -ENOTBLK:
2776 /* find_rsb failed to find rsb or rsb wasn't master */
2777 r->res_nodeid = -1;
2778 lkb->lkb_nodeid = -1;
2779 _request_lock(r, lkb);
2780 break;
2781
2782 default:
2783 log_error(ls, "receive_request_reply error %d", error);
2784 }
2785
2786 unlock_rsb(r);
2787 put_rsb(r);
2788 out:
2789 dlm_put_lkb(lkb);
2790}
2791
2792static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2793 struct dlm_message *ms)
2794{
2795 int error = ms->m_result;
2796
2797 /* this is the value returned from do_convert() on the master */
2798
2799 switch (error) {
2800 case -EAGAIN:
2801 /* convert would block (be queued) on remote master */
2802 queue_cast(r, lkb, -EAGAIN);
2803 break;
2804
2805 case -EINPROGRESS:
2806 /* convert was queued on remote master */
2807 del_lkb(r, lkb);
2808 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2809 break;
2810
2811 case 0:
2812 /* convert was granted on remote master */
2813 receive_flags_reply(lkb, ms);
2814 grant_lock_pc(r, lkb, ms);
2815 queue_cast(r, lkb, 0);
2816 break;
2817
2818 default:
2819 log_error(r->res_ls, "receive_convert_reply error %d", error);
2820 }
2821}
2822
2823static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2824{
2825 struct dlm_rsb *r = lkb->lkb_resource;
2826
2827 hold_rsb(r);
2828 lock_rsb(r);
2829
2830 __receive_convert_reply(r, lkb, ms);
2831
2832 unlock_rsb(r);
2833 put_rsb(r);
2834}
2835
2836static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2837{
2838 struct dlm_lkb *lkb;
2839 int error;
2840
2841 error = find_lkb(ls, ms->m_remid, &lkb);
2842 if (error) {
2843 log_error(ls, "receive_convert_reply no lkb");
2844 return;
2845 }
2846 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2847
2848 error = remove_from_waiters(lkb);
2849 if (error) {
2850 log_error(ls, "receive_convert_reply not on waiters");
2851 goto out;
2852 }
2853
2854 _receive_convert_reply(lkb, ms);
2855 out:
2856 dlm_put_lkb(lkb);
2857}
2858
2859static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2860{
2861 struct dlm_rsb *r = lkb->lkb_resource;
2862 int error = ms->m_result;
2863
2864 hold_rsb(r);
2865 lock_rsb(r);
2866
2867 /* this is the value returned from do_unlock() on the master */
2868
2869 switch (error) {
2870 case -DLM_EUNLOCK:
2871 receive_flags_reply(lkb, ms);
2872 remove_lock_pc(r, lkb);
2873 queue_cast(r, lkb, -DLM_EUNLOCK);
2874 break;
2875 default:
2876 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2877 }
2878
2879 unlock_rsb(r);
2880 put_rsb(r);
2881}
2882
2883static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2884{
2885 struct dlm_lkb *lkb;
2886 int error;
2887
2888 error = find_lkb(ls, ms->m_remid, &lkb);
2889 if (error) {
2890 log_error(ls, "receive_unlock_reply no lkb");
2891 return;
2892 }
2893 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2894
2895 error = remove_from_waiters(lkb);
2896 if (error) {
2897 log_error(ls, "receive_unlock_reply not on waiters");
2898 goto out;
2899 }
2900
2901 _receive_unlock_reply(lkb, ms);
2902 out:
2903 dlm_put_lkb(lkb);
2904}
2905
2906static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2907{
2908 struct dlm_rsb *r = lkb->lkb_resource;
2909 int error = ms->m_result;
2910
2911 hold_rsb(r);
2912 lock_rsb(r);
2913
2914 /* this is the value returned from do_cancel() on the master */
2915
2916 switch (error) {
2917 case -DLM_ECANCEL:
2918 receive_flags_reply(lkb, ms);
2919 revert_lock_pc(r, lkb);
2920 queue_cast(r, lkb, -DLM_ECANCEL);
2921 break;
2922 default:
2923 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2924 }
2925
2926 unlock_rsb(r);
2927 put_rsb(r);
2928}
2929
2930static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2931{
2932 struct dlm_lkb *lkb;
2933 int error;
2934
2935 error = find_lkb(ls, ms->m_remid, &lkb);
2936 if (error) {
2937 log_error(ls, "receive_cancel_reply no lkb");
2938 return;
2939 }
2940 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2941
2942 error = remove_from_waiters(lkb);
2943 if (error) {
2944 log_error(ls, "receive_cancel_reply not on waiters");
2945 goto out;
2946 }
2947
2948 _receive_cancel_reply(lkb, ms);
2949 out:
2950 dlm_put_lkb(lkb);
2951}
2952
2953static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2954{
2955 struct dlm_lkb *lkb;
2956 struct dlm_rsb *r;
2957 int error, ret_nodeid;
2958
2959 error = find_lkb(ls, ms->m_lkid, &lkb);
2960 if (error) {
2961 log_error(ls, "receive_lookup_reply no lkb");
2962 return;
2963 }
2964
2965 error = remove_from_waiters(lkb);
2966 if (error) {
2967 log_error(ls, "receive_lookup_reply not on waiters");
2968 goto out;
2969 }
2970
2971 /* this is the value returned by dlm_dir_lookup on dir node
2972 FIXME: will a non-zero error ever be returned? */
2973 error = ms->m_result;
2974
2975 r = lkb->lkb_resource;
2976 hold_rsb(r);
2977 lock_rsb(r);
2978
2979 ret_nodeid = ms->m_nodeid;
2980 if (ret_nodeid == dlm_our_nodeid()) {
2981 r->res_nodeid = 0;
2982 ret_nodeid = 0;
2983 r->res_first_lkid = 0;
2984 } else {
2985 /* set_master() will copy res_nodeid to lkb_nodeid */
2986 r->res_nodeid = ret_nodeid;
2987 }
2988
2989 _request_lock(r, lkb);
2990
2991 if (!ret_nodeid)
2992 process_lookup_list(r);
2993
2994 unlock_rsb(r);
2995 put_rsb(r);
2996 out:
2997 dlm_put_lkb(lkb);
2998}
2999
3000int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
3001{
3002 struct dlm_message *ms = (struct dlm_message *) hd;
3003 struct dlm_ls *ls;
3004 int error;
3005
3006 if (!recovery)
3007 dlm_message_in(ms);
3008
3009 ls = dlm_find_lockspace_global(hd->h_lockspace);
3010 if (!ls) {
3011 log_print("drop message %d from %d for unknown lockspace %d",
3012 ms->m_type, nodeid, hd->h_lockspace);
3013 return -EINVAL;
3014 }
3015
3016 /* recovery may have just ended leaving a bunch of backed-up requests
3017 in the requestqueue; wait while dlm_recoverd clears them */
3018
3019 if (!recovery)
3020 dlm_wait_requestqueue(ls);
3021
3022 /* recovery may have just started while there were a bunch of
3023 in-flight requests -- save them in requestqueue to be processed
3024 after recovery. we can't let dlm_recvd block on the recovery
3025 lock. if dlm_recoverd is calling this function to clear the
3026 requestqueue, it needs to be interrupted (-EINTR) if another
3027 recovery operation is starting. */
3028
3029 while (1) {
3030 if (dlm_locking_stopped(ls)) {
3031 if (!recovery)
3032 dlm_add_requestqueue(ls, nodeid, hd);
3033 error = -EINTR;
3034 goto out;
3035 }
3036
3037 if (lock_recovery_try(ls))
3038 break;
3039 schedule();
3040 }
3041
3042 switch (ms->m_type) {
3043
3044 /* messages sent to a master node */
3045
3046 case DLM_MSG_REQUEST:
3047 receive_request(ls, ms);
3048 break;
3049
3050 case DLM_MSG_CONVERT:
3051 receive_convert(ls, ms);
3052 break;
3053
3054 case DLM_MSG_UNLOCK:
3055 receive_unlock(ls, ms);
3056 break;
3057
3058 case DLM_MSG_CANCEL:
3059 receive_cancel(ls, ms);
3060 break;
3061
3062 /* messages sent from a master node (replies to above) */
3063
3064 case DLM_MSG_REQUEST_REPLY:
3065 receive_request_reply(ls, ms);
3066 break;
3067
3068 case DLM_MSG_CONVERT_REPLY:
3069 receive_convert_reply(ls, ms);
3070 break;
3071
3072 case DLM_MSG_UNLOCK_REPLY:
3073 receive_unlock_reply(ls, ms);
3074 break;
3075
3076 case DLM_MSG_CANCEL_REPLY:
3077 receive_cancel_reply(ls, ms);
3078 break;
3079
3080 /* messages sent from a master node (only two types of async msg) */
3081
3082 case DLM_MSG_GRANT:
3083 receive_grant(ls, ms);
3084 break;
3085
3086 case DLM_MSG_BAST:
3087 receive_bast(ls, ms);
3088 break;
3089
3090 /* messages sent to a dir node */
3091
3092 case DLM_MSG_LOOKUP:
3093 receive_lookup(ls, ms);
3094 break;
3095
3096 case DLM_MSG_REMOVE:
3097 receive_remove(ls, ms);
3098 break;
3099
3100 /* messages sent from a dir node (remove has no reply) */
3101
3102 case DLM_MSG_LOOKUP_REPLY:
3103 receive_lookup_reply(ls, ms);
3104 break;
3105
3106 default:
3107 log_error(ls, "unknown message type %d", ms->m_type);
3108 }
3109
3110 unlock_recovery(ls);
3111 out:
3112 dlm_put_lockspace(ls);
3113 dlm_astd_wake();
3114 return 0;
3115}
3116
3117
3118/*
3119 * Recovery related
3120 */
3121
3122static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3123{
3124 if (middle_conversion(lkb)) {
3125 hold_lkb(lkb);
3126 ls->ls_stub_ms.m_result = -EINPROGRESS;
3127 _remove_from_waiters(lkb);
3128 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3129
3130 /* Same special case as in receive_rcom_lock_args() */
3131 lkb->lkb_grmode = DLM_LOCK_IV;
3132 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3133 unhold_lkb(lkb);
3134
3135 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3136 lkb->lkb_flags |= DLM_IFL_RESEND;
3137 }
3138
3139 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3140 conversions are async; there's no reply from the remote master */
3141}
3142
3143/* A waiting lkb needs recovery if the master node has failed, or
3144 the master node is changing (only when no directory is used) */
3145
3146static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3147{
3148 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3149 return 1;
3150
3151 if (!dlm_no_directory(ls))
3152 return 0;
3153
3154 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3155 return 1;
3156
3157 return 0;
3158}
3159
3160/* Recovery for locks that are waiting for replies from nodes that are now
3161 gone. We can just complete unlocks and cancels by faking a reply from the
3162 dead node. Requests and up-conversions we flag to be resent after
3163 recovery. Down-conversions can just be completed with a fake reply like
3164 unlocks. Conversions between PR and CW need special attention. */
3165
3166void dlm_recover_waiters_pre(struct dlm_ls *ls)
3167{
3168 struct dlm_lkb *lkb, *safe;
3169
3170 mutex_lock(&ls->ls_waiters_mutex);
3171
3172 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3173 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3174 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3175
3176 /* all outstanding lookups, regardless of destination will be
3177 resent after recovery is done */
3178
3179 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3180 lkb->lkb_flags |= DLM_IFL_RESEND;
3181 continue;
3182 }
3183
3184 if (!waiter_needs_recovery(ls, lkb))
3185 continue;
3186
3187 switch (lkb->lkb_wait_type) {
3188
3189 case DLM_MSG_REQUEST:
3190 lkb->lkb_flags |= DLM_IFL_RESEND;
3191 break;
3192
3193 case DLM_MSG_CONVERT:
3194 recover_convert_waiter(ls, lkb);
3195 break;
3196
3197 case DLM_MSG_UNLOCK:
3198 hold_lkb(lkb);
3199 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3200 _remove_from_waiters(lkb);
3201 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3202 dlm_put_lkb(lkb);
3203 break;
3204
3205 case DLM_MSG_CANCEL:
3206 hold_lkb(lkb);
3207 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3208 _remove_from_waiters(lkb);
3209 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3210 dlm_put_lkb(lkb);
3211 break;
3212
3213 default:
3214 log_error(ls, "invalid lkb wait_type %d",
3215 lkb->lkb_wait_type);
3216 }
3217 schedule();
3218 }
3219 mutex_unlock(&ls->ls_waiters_mutex);
3220}
3221
3222static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3223{
3224 struct dlm_lkb *lkb;
3225 int rv = 0;
3226
3227 mutex_lock(&ls->ls_waiters_mutex);
3228 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3229 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3230 rv = lkb->lkb_wait_type;
3231 _remove_from_waiters(lkb);
3232 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3233 break;
3234 }
3235 }
3236 mutex_unlock(&ls->ls_waiters_mutex);
3237
3238 if (!rv)
3239 lkb = NULL;
3240 *lkb_ret = lkb;
3241 return rv;
3242}
3243
3244/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3245 master or dir-node for r. Processing the lkb may result in it being placed
3246 back on waiters. */
3247
3248int dlm_recover_waiters_post(struct dlm_ls *ls)
3249{
3250 struct dlm_lkb *lkb;
3251 struct dlm_rsb *r;
3252 int error = 0, mstype;
3253
3254 while (1) {
3255 if (dlm_locking_stopped(ls)) {
3256 log_debug(ls, "recover_waiters_post aborted");
3257 error = -EINTR;
3258 break;
3259 }
3260
3261 mstype = remove_resend_waiter(ls, &lkb);
3262 if (!mstype)
3263 break;
3264
3265 r = lkb->lkb_resource;
3266
3267 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3268 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3269
3270 switch (mstype) {
3271
3272 case DLM_MSG_LOOKUP:
3273 hold_rsb(r);
3274 lock_rsb(r);
3275 _request_lock(r, lkb);
3276 if (is_master(r))
3277 confirm_master(r, 0);
3278 unlock_rsb(r);
3279 put_rsb(r);
3280 break;
3281
3282 case DLM_MSG_REQUEST:
3283 hold_rsb(r);
3284 lock_rsb(r);
3285 _request_lock(r, lkb);
3286 if (is_master(r))
3287 confirm_master(r, 0);
3288 unlock_rsb(r);
3289 put_rsb(r);
3290 break;
3291
3292 case DLM_MSG_CONVERT:
3293 hold_rsb(r);
3294 lock_rsb(r);
3295 _convert_lock(r, lkb);
3296 unlock_rsb(r);
3297 put_rsb(r);
3298 break;
3299
3300 default:
3301 log_error(ls, "recover_waiters_post type %d", mstype);
3302 }
3303 }
3304
3305 return error;
3306}
3307
3308static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3309 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3310{
3311 struct dlm_ls *ls = r->res_ls;
3312 struct dlm_lkb *lkb, *safe;
3313
3314 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3315 if (test(ls, lkb)) {
3316 rsb_set_flag(r, RSB_LOCKS_PURGED);
3317 del_lkb(r, lkb);
3318 /* this put should free the lkb */
3319 if (!dlm_put_lkb(lkb))
3320 log_error(ls, "purged lkb not released");
3321 }
3322 }
3323}
3324
3325static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3326{
3327 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3328}
3329
3330static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3331{
3332 return is_master_copy(lkb);
3333}
3334
3335static void purge_dead_locks(struct dlm_rsb *r)
3336{
3337 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3338 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3339 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3340}
3341
3342void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3343{
3344 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3345 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3346 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3347}
3348
3349/* Get rid of locks held by nodes that are gone. */
3350
3351int dlm_purge_locks(struct dlm_ls *ls)
3352{
3353 struct dlm_rsb *r;
3354
3355 log_debug(ls, "dlm_purge_locks");
3356
3357 down_write(&ls->ls_root_sem);
3358 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3359 hold_rsb(r);
3360 lock_rsb(r);
3361 if (is_master(r))
3362 purge_dead_locks(r);
3363 unlock_rsb(r);
3364 unhold_rsb(r);
3365
3366 schedule();
3367 }
3368 up_write(&ls->ls_root_sem);
3369
3370 return 0;
3371}
3372
3373static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3374{
3375 struct dlm_rsb *r, *r_ret = NULL;
3376
3377 read_lock(&ls->ls_rsbtbl[bucket].lock);
3378 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3379 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3380 continue;
3381 hold_rsb(r);
3382 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3383 r_ret = r;
3384 break;
3385 }
3386 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3387 return r_ret;
3388}
3389
3390void dlm_grant_after_purge(struct dlm_ls *ls)
3391{
3392 struct dlm_rsb *r;
3393 int bucket = 0;
3394
3395 while (1) {
3396 r = find_purged_rsb(ls, bucket);
3397 if (!r) {
3398 if (bucket == ls->ls_rsbtbl_size - 1)
3399 break;
3400 bucket++;
3401 continue;
3402 }
3403 lock_rsb(r);
3404 if (is_master(r)) {
3405 grant_pending_locks(r);
3406 confirm_master(r, 0);
3407 }
3408 unlock_rsb(r);
3409 put_rsb(r);
3410 schedule();
3411 }
3412}
3413
3414static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3415 uint32_t remid)
3416{
3417 struct dlm_lkb *lkb;
3418
3419 list_for_each_entry(lkb, head, lkb_statequeue) {
3420 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3421 return lkb;
3422 }
3423 return NULL;
3424}
3425
3426static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3427 uint32_t remid)
3428{
3429 struct dlm_lkb *lkb;
3430
3431 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3432 if (lkb)
3433 return lkb;
3434 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3435 if (lkb)
3436 return lkb;
3437 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3438 if (lkb)
3439 return lkb;
3440 return NULL;
3441}
3442
3443static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3444 struct dlm_rsb *r, struct dlm_rcom *rc)
3445{
3446 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3447 int lvblen;
3448
3449 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3450 lkb->lkb_ownpid = rl->rl_ownpid;
3451 lkb->lkb_remid = rl->rl_lkid;
3452 lkb->lkb_exflags = rl->rl_exflags;
3453 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3454 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3455 lkb->lkb_lvbseq = rl->rl_lvbseq;
3456 lkb->lkb_rqmode = rl->rl_rqmode;
3457 lkb->lkb_grmode = rl->rl_grmode;
3458 /* don't set lkb_status because add_lkb wants to set it itself */
3459
3460 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3461 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3462
3463 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3464 lkb->lkb_lvbptr = allocate_lvb(ls);
3465 if (!lkb->lkb_lvbptr)
3466 return -ENOMEM;
3467 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3468 sizeof(struct rcom_lock);
3469 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3470 }
3471
3472 /* Conversions between PR and CW (middle modes) need special handling.
3473 The real granted mode of these converting locks cannot be determined
3474 until all locks have been rebuilt on the rsb (recover_conversion) */
3475
3476 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3477 rl->rl_status = DLM_LKSTS_CONVERT;
3478 lkb->lkb_grmode = DLM_LOCK_IV;
3479 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3480 }
3481
3482 return 0;
3483}
3484
3485/* This lkb may have been recovered in a previous aborted recovery so we need
3486 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3487 If so we just send back a standard reply. If not, we create a new lkb with
3488 the given values and send back our lkid. We send back our lkid by sending
3489 back the rcom_lock struct we got but with the remid field filled in. */
3490
3491int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3492{
3493 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3494 struct dlm_rsb *r;
3495 struct dlm_lkb *lkb;
3496 int error;
3497
3498 if (rl->rl_parent_lkid) {
3499 error = -EOPNOTSUPP;
3500 goto out;
3501 }
3502
3503 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3504 if (error)
3505 goto out;
3506
3507 lock_rsb(r);
3508
3509 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3510 if (lkb) {
3511 error = -EEXIST;
3512 goto out_remid;
3513 }
3514
3515 error = create_lkb(ls, &lkb);
3516 if (error)
3517 goto out_unlock;
3518
3519 error = receive_rcom_lock_args(ls, lkb, r, rc);
3520 if (error) {
3521 __put_lkb(ls, lkb);
3522 goto out_unlock;
3523 }
3524
3525 attach_lkb(r, lkb);
3526 add_lkb(r, lkb, rl->rl_status);
3527 error = 0;
3528
3529 out_remid:
3530 /* this is the new value returned to the lock holder for
3531 saving in its process-copy lkb */
3532 rl->rl_remid = lkb->lkb_id;
3533
3534 out_unlock:
3535 unlock_rsb(r);
3536 put_rsb(r);
3537 out:
3538 if (error)
3539 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3540 rl->rl_result = error;
3541 return error;
3542}
3543
3544int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3545{
3546 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3547 struct dlm_rsb *r;
3548 struct dlm_lkb *lkb;
3549 int error;
3550
3551 error = find_lkb(ls, rl->rl_lkid, &lkb);
3552 if (error) {
3553 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3554 return error;
3555 }
3556
3557 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3558
3559 error = rl->rl_result;
3560
3561 r = lkb->lkb_resource;
3562 hold_rsb(r);
3563 lock_rsb(r);
3564
3565 switch (error) {
3566 case -EEXIST:
3567 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3568 /* fall through */
3569 case 0:
3570 lkb->lkb_remid = rl->rl_remid;
3571 break;
3572 default:
3573 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3574 error, lkb->lkb_id);
3575 }
3576
3577 /* an ack for dlm_recover_locks() which waits for replies from
3578 all the locks it sends to new masters */
3579 dlm_recovered_lock(r);
3580
3581 unlock_rsb(r);
3582 put_rsb(r);
3583 dlm_put_lkb(lkb);
3584
3585 return 0;
3586}
3587
3588int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
3589 int mode, uint32_t flags, void *name, unsigned int namelen,
3590 uint32_t parent_lkid)
3591{
3592 struct dlm_lkb *lkb;
3593 struct dlm_args args;
3594 int error;
3595
3596 lock_recovery(ls);
3597
3598 error = create_lkb(ls, &lkb);
3599 if (error) {
3600 kfree(ua);
3601 goto out;
3602 }
3603
3604 if (flags & DLM_LKF_VALBLK) {
3605 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3606 if (!ua->lksb.sb_lvbptr) {
3607 kfree(ua);
3608 __put_lkb(ls, lkb);
3609 error = -ENOMEM;
3610 goto out;
3611 }
3612 }
3613
3614 /* After ua is attached to lkb it will be freed by free_lkb().
3615 When DLM_IFL_USER is set, the dlm knows that this is a userspace
3616 lock and that lkb_astparam is the dlm_user_args structure. */
3617
3618 error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
3619 DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
3620 lkb->lkb_flags |= DLM_IFL_USER;
3621 ua->old_mode = DLM_LOCK_IV;
3622
3623 if (error) {
3624 __put_lkb(ls, lkb);
3625 goto out;
3626 }
3627
3628 error = request_lock(ls, lkb, name, namelen, &args);
3629
3630 switch (error) {
3631 case 0:
3632 break;
3633 case -EINPROGRESS:
3634 error = 0;
3635 break;
3636 case -EAGAIN:
3637 error = 0;
3638 /* fall through */
3639 default:
3640 __put_lkb(ls, lkb);
3641 goto out;
3642 }
3643
3644 /* add this new lkb to the per-process list of locks */
3645 spin_lock(&ua->proc->locks_spin);
3646 kref_get(&lkb->lkb_ref);
3647 list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
3648 spin_unlock(&ua->proc->locks_spin);
3649 out:
3650 unlock_recovery(ls);
3651 return error;
3652}
3653
3654int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3655 int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
3656{
3657 struct dlm_lkb *lkb;
3658 struct dlm_args args;
3659 struct dlm_user_args *ua;
3660 int error;
3661
3662 lock_recovery(ls);
3663
3664 error = find_lkb(ls, lkid, &lkb);
3665 if (error)
3666 goto out;
3667
3668 /* user can change the params on its lock when it converts it, or
3669 add an lvb that didn't exist before */
3670
3671 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3672
3673 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
3674 ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
3675 if (!ua->lksb.sb_lvbptr) {
3676 error = -ENOMEM;
3677 goto out_put;
3678 }
3679 }
3680 if (lvb_in && ua->lksb.sb_lvbptr)
3681 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3682
3683 ua->castparam = ua_tmp->castparam;
3684 ua->castaddr = ua_tmp->castaddr;
3685 ua->bastparam = ua_tmp->bastparam;
3686 ua->bastaddr = ua_tmp->bastaddr;
3687 ua->user_lksb = ua_tmp->user_lksb;
3688 ua->old_mode = lkb->lkb_grmode;
3689
3690 error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
3691 ua, DLM_FAKE_USER_AST, &args);
3692 if (error)
3693 goto out_put;
3694
3695 error = convert_lock(ls, lkb, &args);
3696
3697 if (error == -EINPROGRESS || error == -EAGAIN)
3698 error = 0;
3699 out_put:
3700 dlm_put_lkb(lkb);
3701 out:
3702 unlock_recovery(ls);
3703 kfree(ua_tmp);
3704 return error;
3705}
3706
3707int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3708 uint32_t flags, uint32_t lkid, char *lvb_in)
3709{
3710 struct dlm_lkb *lkb;
3711 struct dlm_args args;
3712 struct dlm_user_args *ua;
3713 int error;
3714
3715 lock_recovery(ls);
3716
3717 error = find_lkb(ls, lkid, &lkb);
3718 if (error)
3719 goto out;
3720
3721 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3722
3723 if (lvb_in && ua->lksb.sb_lvbptr)
3724 memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
3725 ua->castparam = ua_tmp->castparam;
3726 ua->user_lksb = ua_tmp->user_lksb;
3727
3728 error = set_unlock_args(flags, ua, &args);
3729 if (error)
3730 goto out_put;
3731
3732 error = unlock_lock(ls, lkb, &args);
3733
3734 if (error == -DLM_EUNLOCK)
3735 error = 0;
3736 if (error)
3737 goto out_put;
3738
3739 spin_lock(&ua->proc->locks_spin);
3740 list_del_init(&lkb->lkb_ownqueue);
3741 spin_unlock(&ua->proc->locks_spin);
3742
3743 /* this removes the reference for the proc->locks list added by
3744 dlm_user_request */
3745 unhold_lkb(lkb);
3746 out_put:
3747 dlm_put_lkb(lkb);
3748 out:
3749 unlock_recovery(ls);
3750 return error;
3751}
3752
3753int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
3754 uint32_t flags, uint32_t lkid)
3755{
3756 struct dlm_lkb *lkb;
3757 struct dlm_args args;
3758 struct dlm_user_args *ua;
3759 int error;
3760
3761 lock_recovery(ls);
3762
3763 error = find_lkb(ls, lkid, &lkb);
3764 if (error)
3765 goto out;
3766
3767 ua = (struct dlm_user_args *)lkb->lkb_astparam;
3768 ua->castparam = ua_tmp->castparam;
3769 ua->user_lksb = ua_tmp->user_lksb;
3770
3771 error = set_unlock_args(flags, ua, &args);
3772 if (error)
3773 goto out_put;
3774
3775 error = cancel_lock(ls, lkb, &args);
3776
3777 if (error == -DLM_ECANCEL)
3778 error = 0;
3779 if (error)
3780 goto out_put;
3781
3782 /* this lkb was removed from the WAITING queue */
3783 if (lkb->lkb_grmode == DLM_LOCK_IV) {
3784 spin_lock(&ua->proc->locks_spin);
3785 list_del_init(&lkb->lkb_ownqueue);
3786 spin_unlock(&ua->proc->locks_spin);
3787 unhold_lkb(lkb);
3788 }
3789 out_put:
3790 dlm_put_lkb(lkb);
3791 out:
3792 unlock_recovery(ls);
3793 return error;
3794}
3795
3796static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3797{
3798 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3799
3800 if (ua->lksb.sb_lvbptr)
3801 kfree(ua->lksb.sb_lvbptr);
3802 kfree(ua);
3803 lkb->lkb_astparam = (long)NULL;
3804
3805	/* TODO: propagate to master if needed */
3806 return 0;
3807}
3808
3809/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
3810 Regardless of what rsb queue the lock is on, it's removed and freed. */
3811
3812static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
3813{
3814 struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
3815 struct dlm_args args;
3816 int error;
3817
3818 /* FIXME: we need to handle the case where the lkb is in limbo
3819 while the rsb is being looked up, currently we assert in
3820 _unlock_lock/is_remote because rsb nodeid is -1. */
3821
3822 set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
3823
3824 error = unlock_lock(ls, lkb, &args);
3825 if (error == -DLM_EUNLOCK)
3826 error = 0;
3827 return error;
3828}
3829
3830/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
3831 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
3832 which we clear here. */
3833
3834/* proc CLOSING flag is set so no more device_reads should look at proc->asts
3835 list, and no more device_writes should add lkb's to proc->locks list; so we
3836 shouldn't need to take asts_spin or locks_spin here. this assumes that
3837 device reads/writes/closes are serialized -- FIXME: we may need to serialize
3838   them ourselves. */
3839
3840void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
3841{
3842 struct dlm_lkb *lkb, *safe;
3843
3844 lock_recovery(ls);
3845 mutex_lock(&ls->ls_clear_proc_locks);
3846
3847 list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
3848 if (lkb->lkb_ast_type) {
3849 list_del(&lkb->lkb_astqueue);
3850 unhold_lkb(lkb);
3851 }
3852
3853 list_del_init(&lkb->lkb_ownqueue);
3854
3855 if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
3856 lkb->lkb_flags |= DLM_IFL_ORPHAN;
3857 orphan_proc_lock(ls, lkb);
3858 } else {
3859 lkb->lkb_flags |= DLM_IFL_DEAD;
3860 unlock_proc_lock(ls, lkb);
3861 }
3862
3863 /* this removes the reference for the proc->locks list
3864 added by dlm_user_request, it may result in the lkb
3865 being freed */
3866
3867 dlm_put_lkb(lkb);
3868 }
3869 mutex_unlock(&ls->ls_clear_proc_locks);
3870 unlock_recovery(ls);
3871}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb(struct dlm_rsb *r);
18void dlm_print_lkb(struct dlm_lkb *lkb);
19int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
20int dlm_modes_compat(int mode1, int mode2);
21int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
22 unsigned int flags, struct dlm_rsb **r_ret);
23void dlm_put_rsb(struct dlm_rsb *r);
24void dlm_hold_rsb(struct dlm_rsb *r);
25int dlm_put_lkb(struct dlm_lkb *lkb);
26void dlm_scan_rsbs(struct dlm_ls *ls);
27
28int dlm_purge_locks(struct dlm_ls *ls);
29void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
30void dlm_grant_after_purge(struct dlm_ls *ls);
31int dlm_recover_waiters_post(struct dlm_ls *ls);
32void dlm_recover_waiters_pre(struct dlm_ls *ls);
33int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
34int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
35
36int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
37 uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
38int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
39 int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
40int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
41 uint32_t flags, uint32_t lkid, char *lvb_in);
42int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
43 uint32_t flags, uint32_t lkid);
44void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
45
46static inline int is_master(struct dlm_rsb *r)
47{
48 return !r->res_nodeid;
49}
50
51static inline void lock_rsb(struct dlm_rsb *r)
52{
53 mutex_lock(&r->res_mutex);
54}
55
56static inline void unlock_rsb(struct dlm_rsb *r)
57{
58 mutex_unlock(&r->res_mutex);
59}
60
61#endif
62
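The inline helpers above pair with dlm_hold_rsb()/dlm_put_rsb() to give the
access pattern used throughout lock.c (dlm_purge_locks above follows exactly
this shape): pin the rsb, take its mutex, check mastership, then release
both. A minimal sketch; with_rsb() and the fn() callback are hypothetical
names for illustration, not part of this patch:

/* Hypothetical illustration of the hold/lock/unlock/put pattern;
   fn() stands in for whatever work is done on the rsb. */
static void with_rsb(struct dlm_rsb *r, void (*fn)(struct dlm_rsb *))
{
	dlm_hold_rsb(r);	/* reference: rsb can't be freed under us */
	lock_rsb(r);		/* res_mutex: serialize lock operations */

	if (is_master(r))	/* res_nodeid == 0: this node is master */
		fn(r);

	unlock_rsb(r);
	dlm_put_rsb(r);		/* drop the reference taken above */
}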
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct * scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return snprintf(buf, PAGE_SIZE, "%x\n", status);
82}
83
84static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
85{
86 return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
87}
88
89struct dlm_attr {
90 struct attribute attr;
91 ssize_t (*show)(struct dlm_ls *, char *);
92 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
93};
94
95static struct dlm_attr dlm_attr_control = {
96 .attr = {.name = "control", .mode = S_IWUSR},
97 .store = dlm_control_store
98};
99
100static struct dlm_attr dlm_attr_event = {
101 .attr = {.name = "event_done", .mode = S_IWUSR},
102 .store = dlm_event_store
103};
104
105static struct dlm_attr dlm_attr_id = {
106 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
107 .show = dlm_id_show,
108 .store = dlm_id_store
109};
110
111static struct dlm_attr dlm_attr_recover_status = {
112 .attr = {.name = "recover_status", .mode = S_IRUGO},
113 .show = dlm_recover_status_show
114};
115
116static struct dlm_attr dlm_attr_recover_nodeid = {
117 .attr = {.name = "recover_nodeid", .mode = S_IRUGO},
118 .show = dlm_recover_nodeid_show
119};
120
121static struct attribute *dlm_attrs[] = {
122 &dlm_attr_control.attr,
123 &dlm_attr_event.attr,
124 &dlm_attr_id.attr,
125 &dlm_attr_recover_status.attr,
126 &dlm_attr_recover_nodeid.attr,
127 NULL,
128};
129
130static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
131 char *buf)
132{
133 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
134 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
135 return a->show ? a->show(ls, buf) : 0;
136}
137
138static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
139 const char *buf, size_t len)
140{
141 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
142 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
143 return a->store ? a->store(ls, buf, len) : len;
144}
145
146static struct sysfs_ops dlm_attr_ops = {
147 .show = dlm_attr_show,
148 .store = dlm_attr_store,
149};
150
151static struct kobj_type dlm_ktype = {
152 .default_attrs = dlm_attrs,
153 .sysfs_ops = &dlm_attr_ops,
154};
155
156static struct kset dlm_kset = {
157 .subsys = &kernel_subsys,
158 .kobj = {.name = "dlm",},
159 .ktype = &dlm_ktype,
160};
161
162static int kobject_setup(struct dlm_ls *ls)
163{
164 char lsname[DLM_LOCKSPACE_LEN];
165 int error;
166
167 memset(lsname, 0, DLM_LOCKSPACE_LEN);
168 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
169
170 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
171 if (error)
172 return error;
173
174 ls->ls_kobj.kset = &dlm_kset;
175 ls->ls_kobj.ktype = &dlm_ktype;
176 return 0;
177}
178
179static int do_uevent(struct dlm_ls *ls, int in)
180{
181 int error;
182
183 if (in)
184 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
185 else
186 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
187
188 error = wait_event_interruptible(ls->ls_uevent_wait,
189 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
190 if (error)
191 goto out;
192
193 error = ls->ls_uevent_result;
194 out:
195 return error;
196}
197
198
199int dlm_lockspace_init(void)
200{
201 int error;
202
203 ls_count = 0;
204 mutex_init(&ls_lock);
205 INIT_LIST_HEAD(&lslist);
206 spin_lock_init(&lslist_lock);
207
208 error = kset_register(&dlm_kset);
209 if (error)
210 printk("dlm_lockspace_init: cannot register kset %d\n", error);
211 return error;
212}
213
214void dlm_lockspace_exit(void)
215{
216 kset_unregister(&dlm_kset);
217}
218
219static int dlm_scand(void *data)
220{
221 struct dlm_ls *ls;
222
223 while (!kthread_should_stop()) {
224 list_for_each_entry(ls, &lslist, ls_list)
225 dlm_scan_rsbs(ls);
226 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
227 }
228 return 0;
229}
230
231static int dlm_scand_start(void)
232{
233 struct task_struct *p;
234 int error = 0;
235
236 p = kthread_run(dlm_scand, NULL, "dlm_scand");
237 if (IS_ERR(p))
238 error = PTR_ERR(p);
239 else
240 scand_task = p;
241 return error;
242}
243
244static void dlm_scand_stop(void)
245{
246 kthread_stop(scand_task);
247}
248
249static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
250{
251 struct dlm_ls *ls;
252
253 spin_lock(&lslist_lock);
254
255 list_for_each_entry(ls, &lslist, ls_list) {
256 if (ls->ls_namelen == namelen &&
257 memcmp(ls->ls_name, name, namelen) == 0)
258 goto out;
259 }
260 ls = NULL;
261 out:
262 spin_unlock(&lslist_lock);
263 return ls;
264}
265
266struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
267{
268 struct dlm_ls *ls;
269
270 spin_lock(&lslist_lock);
271
272 list_for_each_entry(ls, &lslist, ls_list) {
273 if (ls->ls_global_id == id) {
274 ls->ls_count++;
275 goto out;
276 }
277 }
278 ls = NULL;
279 out:
280 spin_unlock(&lslist_lock);
281 return ls;
282}
283
284struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
285{
286 struct dlm_ls *ls;
287
288 spin_lock(&lslist_lock);
289 list_for_each_entry(ls, &lslist, ls_list) {
290 if (ls->ls_local_handle == lockspace) {
291 ls->ls_count++;
292 goto out;
293 }
294 }
295 ls = NULL;
296 out:
297 spin_unlock(&lslist_lock);
298 return ls;
299}
300
301struct dlm_ls *dlm_find_lockspace_device(int minor)
302{
303 struct dlm_ls *ls;
304
305 spin_lock(&lslist_lock);
306 list_for_each_entry(ls, &lslist, ls_list) {
307 if (ls->ls_device.minor == minor) {
308 ls->ls_count++;
309 goto out;
310 }
311 }
312 ls = NULL;
313 out:
314 spin_unlock(&lslist_lock);
315 return ls;
316}
317
318void dlm_put_lockspace(struct dlm_ls *ls)
319{
320 spin_lock(&lslist_lock);
321 ls->ls_count--;
322 spin_unlock(&lslist_lock);
323}
324
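Each dlm_find_lockspace_*() lookup above bumps ls_count under lslist_lock,
and dlm_put_lockspace() drops it; remove_lockspace() below spins until the
count reaches zero, so every successful lookup must be balanced by a put.
A minimal sketch of the pairing; use_lockspace() and do_work() are
hypothetical:

/* Hypothetical caller showing the find/put reference pairing. */
static int use_lockspace(uint32_t global_id)
{
	struct dlm_ls *ls;

	ls = dlm_find_lockspace_global(global_id);
	if (!ls)
		return -EINVAL;		/* no lockspace with that id */

	do_work(ls);			/* placeholder: ls is pinned here */

	dlm_put_lockspace(ls);		/* lets remove_lockspace proceed */
	return 0;
}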
325static void remove_lockspace(struct dlm_ls *ls)
326{
327 for (;;) {
328 spin_lock(&lslist_lock);
329 if (ls->ls_count == 0) {
330 list_del(&ls->ls_list);
331 spin_unlock(&lslist_lock);
332 return;
333 }
334 spin_unlock(&lslist_lock);
335 ssleep(1);
336 }
337}
338
339static int threads_start(void)
340{
341 int error;
342
343	/* Thread which processes lock requests for all lockspaces */
344 error = dlm_astd_start();
345 if (error) {
346 log_print("cannot start dlm_astd thread %d", error);
347 goto fail;
348 }
349
350 error = dlm_scand_start();
351 if (error) {
352 log_print("cannot start dlm_scand thread %d", error);
353 goto astd_fail;
354 }
355
356	/* Thread for sending/receiving messages for all lockspaces */
357 error = dlm_lowcomms_start();
358 if (error) {
359 log_print("cannot start dlm lowcomms %d", error);
360 goto scand_fail;
361 }
362
363 return 0;
364
365 scand_fail:
366 dlm_scand_stop();
367 astd_fail:
368 dlm_astd_stop();
369 fail:
370 return error;
371}
372
373static void threads_stop(void)
374{
375 dlm_scand_stop();
376 dlm_lowcomms_stop();
377 dlm_astd_stop();
378}
379
380static int new_lockspace(char *name, int namelen, void **lockspace,
381 uint32_t flags, int lvblen)
382{
383 struct dlm_ls *ls;
384 int i, size, error = -ENOMEM;
385
386 if (namelen > DLM_LOCKSPACE_LEN)
387 return -EINVAL;
388
389 if (!lvblen || (lvblen % 8))
390 return -EINVAL;
391
392 if (!try_module_get(THIS_MODULE))
393 return -EINVAL;
394
395 ls = dlm_find_lockspace_name(name, namelen);
396 if (ls) {
397 *lockspace = ls;
398 module_put(THIS_MODULE);
399 return -EEXIST;
400 }
401
402 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
403 if (!ls)
404 goto out;
405 memcpy(ls->ls_name, name, namelen);
406 ls->ls_namelen = namelen;
407 ls->ls_exflags = flags;
408 ls->ls_lvblen = lvblen;
409 ls->ls_count = 0;
410 ls->ls_flags = 0;
411
412 size = dlm_config.rsbtbl_size;
413 ls->ls_rsbtbl_size = size;
414
415 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
416 if (!ls->ls_rsbtbl)
417 goto out_lsfree;
418 for (i = 0; i < size; i++) {
419 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
420 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
421 rwlock_init(&ls->ls_rsbtbl[i].lock);
422 }
423
424 size = dlm_config.lkbtbl_size;
425 ls->ls_lkbtbl_size = size;
426
427 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
428 if (!ls->ls_lkbtbl)
429 goto out_rsbfree;
430 for (i = 0; i < size; i++) {
431 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
432 rwlock_init(&ls->ls_lkbtbl[i].lock);
433 ls->ls_lkbtbl[i].counter = 1;
434 }
435
436 size = dlm_config.dirtbl_size;
437 ls->ls_dirtbl_size = size;
438
439 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
440 if (!ls->ls_dirtbl)
441 goto out_lkbfree;
442 for (i = 0; i < size; i++) {
443 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
444 rwlock_init(&ls->ls_dirtbl[i].lock);
445 }
446
447 INIT_LIST_HEAD(&ls->ls_waiters);
448 mutex_init(&ls->ls_waiters_mutex);
449
450 INIT_LIST_HEAD(&ls->ls_nodes);
451 INIT_LIST_HEAD(&ls->ls_nodes_gone);
452 ls->ls_num_nodes = 0;
453 ls->ls_low_nodeid = 0;
454 ls->ls_total_weight = 0;
455 ls->ls_node_array = NULL;
456
457 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
458 ls->ls_stub_rsb.res_ls = ls;
459
460 ls->ls_debug_rsb_dentry = NULL;
461 ls->ls_debug_waiters_dentry = NULL;
462
463 init_waitqueue_head(&ls->ls_uevent_wait);
464 ls->ls_uevent_result = 0;
465
466 ls->ls_recoverd_task = NULL;
467 mutex_init(&ls->ls_recoverd_active);
468 spin_lock_init(&ls->ls_recover_lock);
469 ls->ls_recover_status = 0;
470 ls->ls_recover_seq = 0;
471 ls->ls_recover_args = NULL;
472 init_rwsem(&ls->ls_in_recovery);
473 INIT_LIST_HEAD(&ls->ls_requestqueue);
474 mutex_init(&ls->ls_requestqueue_mutex);
475 mutex_init(&ls->ls_clear_proc_locks);
476
477 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
478 if (!ls->ls_recover_buf)
479 goto out_dirfree;
480
481 INIT_LIST_HEAD(&ls->ls_recover_list);
482 spin_lock_init(&ls->ls_recover_list_lock);
483 ls->ls_recover_list_count = 0;
484 ls->ls_local_handle = ls;
485 init_waitqueue_head(&ls->ls_wait_general);
486 INIT_LIST_HEAD(&ls->ls_root_list);
487 init_rwsem(&ls->ls_root_sem);
488
489 down_write(&ls->ls_in_recovery);
490
491 spin_lock(&lslist_lock);
492 list_add(&ls->ls_list, &lslist);
493 spin_unlock(&lslist_lock);
494
495 /* needs to find ls in lslist */
496 error = dlm_recoverd_start(ls);
497 if (error) {
498 log_error(ls, "can't start dlm_recoverd %d", error);
499 goto out_rcomfree;
500 }
501
502 dlm_create_debug_file(ls);
503
504 error = kobject_setup(ls);
505 if (error)
506 goto out_del;
507
508 error = kobject_register(&ls->ls_kobj);
509 if (error)
510 goto out_del;
511
512 error = do_uevent(ls, 1);
513 if (error)
514 goto out_unreg;
515
516 *lockspace = ls;
517 return 0;
518
519 out_unreg:
520 kobject_unregister(&ls->ls_kobj);
521 out_del:
522 dlm_delete_debug_file(ls);
523 dlm_recoverd_stop(ls);
524 out_rcomfree:
525 spin_lock(&lslist_lock);
526 list_del(&ls->ls_list);
527 spin_unlock(&lslist_lock);
528 kfree(ls->ls_recover_buf);
529 out_dirfree:
530 kfree(ls->ls_dirtbl);
531 out_lkbfree:
532 kfree(ls->ls_lkbtbl);
533 out_rsbfree:
534 kfree(ls->ls_rsbtbl);
535 out_lsfree:
536 kfree(ls);
537 out:
538 module_put(THIS_MODULE);
539 return error;
540}
541
542int dlm_new_lockspace(char *name, int namelen, void **lockspace,
543 uint32_t flags, int lvblen)
544{
545 int error = 0;
546
547 mutex_lock(&ls_lock);
548 if (!ls_count)
549 error = threads_start();
550 if (error)
551 goto out;
552
553 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
554 if (!error)
555 ls_count++;
556 out:
557 mutex_unlock(&ls_lock);
558 return error;
559}
560
561/* Return 1 if the lockspace still has active remote locks,
562 * 2 if the lockspace still has active local locks.
563 */
564static int lockspace_busy(struct dlm_ls *ls)
565{
566 int i, lkb_found = 0;
567 struct dlm_lkb *lkb;
568
569 /* NOTE: We check the lockidtbl here rather than the resource table.
570 This is because there may be LKBs queued as ASTs that have been
571 unlinked from their RSBs and are pending deletion once the AST has
572 been delivered */
573
574 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
575 read_lock(&ls->ls_lkbtbl[i].lock);
576 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
577 lkb_found = 1;
578 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
579 lkb_idtbl_list) {
580 if (!lkb->lkb_nodeid) {
581 read_unlock(&ls->ls_lkbtbl[i].lock);
582 return 2;
583 }
584 }
585 }
586 read_unlock(&ls->ls_lkbtbl[i].lock);
587 }
588 return lkb_found;
589}
590
591static int release_lockspace(struct dlm_ls *ls, int force)
592{
593 struct dlm_lkb *lkb;
594 struct dlm_rsb *rsb;
595 struct list_head *head;
596 int i;
597 int busy = lockspace_busy(ls);
598
599 if (busy > force)
600 return -EBUSY;
601
602 if (force < 3)
603 do_uevent(ls, 0);
604
605 dlm_recoverd_stop(ls);
606
607 remove_lockspace(ls);
608
609 dlm_delete_debug_file(ls);
610
611 dlm_astd_suspend();
612
613 kfree(ls->ls_recover_buf);
614
615 /*
616 * Free direntry structs.
617 */
618
619 dlm_dir_clear(ls);
620 kfree(ls->ls_dirtbl);
621
622 /*
623 * Free all lkb's on lkbtbl[] lists.
624 */
625
626 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
627 head = &ls->ls_lkbtbl[i].list;
628 while (!list_empty(head)) {
629 lkb = list_entry(head->next, struct dlm_lkb,
630 lkb_idtbl_list);
631
632 list_del(&lkb->lkb_idtbl_list);
633
634 dlm_del_ast(lkb);
635
636 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
637 free_lvb(lkb->lkb_lvbptr);
638
639 free_lkb(lkb);
640 }
641 }
642 dlm_astd_resume();
643
644 kfree(ls->ls_lkbtbl);
645
646 /*
647 * Free all rsb's on rsbtbl[] lists
648 */
649
650 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
651 head = &ls->ls_rsbtbl[i].list;
652 while (!list_empty(head)) {
653 rsb = list_entry(head->next, struct dlm_rsb,
654 res_hashchain);
655
656 list_del(&rsb->res_hashchain);
657 free_rsb(rsb);
658 }
659
660 head = &ls->ls_rsbtbl[i].toss;
661 while (!list_empty(head)) {
662 rsb = list_entry(head->next, struct dlm_rsb,
663 res_hashchain);
664 list_del(&rsb->res_hashchain);
665 free_rsb(rsb);
666 }
667 }
668
669 kfree(ls->ls_rsbtbl);
670
671 /*
672 * Free structures on any other lists
673 */
674
675 kfree(ls->ls_recover_args);
676 dlm_clear_free_entries(ls);
677 dlm_clear_members(ls);
678 dlm_clear_members_gone(ls);
679 kfree(ls->ls_node_array);
680 kobject_unregister(&ls->ls_kobj);
681 kfree(ls);
682
683 mutex_lock(&ls_lock);
684 ls_count--;
685 if (!ls_count)
686 threads_stop();
687 mutex_unlock(&ls_lock);
688
689 module_put(THIS_MODULE);
690 return 0;
691}
692
693/*
694 * Called when a system has released all its locks and is not going to use the
695 * lockspace any longer. We free everything we're managing for this lockspace.
696 * Remaining nodes will go through the recovery process as if we'd died. The
697 * lockspace must continue to function as usual, participating in recoveries,
698 * until this returns.
699 *
700 * Force has 4 possible values:
701 * 0 - don't destroy lockspace if it has any LKBs
702 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
703 * 2 - destroy lockspace regardless of LKBs
704 * 3 - destroy lockspace as part of a forced shutdown
705 */
706
707int dlm_release_lockspace(void *lockspace, int force)
708{
709 struct dlm_ls *ls;
710
711 ls = dlm_find_lockspace_local(lockspace);
712 if (!ls)
713 return -EINVAL;
714 dlm_put_lockspace(ls);
715 return release_lockspace(ls, force);
716}
717
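Putting the pieces together, a kernel user creates a lockspace with
dlm_new_lockspace() (new_lockspace() above requires lvblen to be a non-zero
multiple of 8) and destroys it with dlm_release_lockspace(), picking a force
value from the table above. A minimal sketch; lockspace_demo() is
hypothetical, and the name "example" and lvblen of 32 are arbitrary choices:

/* Sketch of the lockspace lifecycle exported by this file. */
static int lockspace_demo(void)
{
	void *ls;
	int error;

	error = dlm_new_lockspace("example", strlen("example"), &ls, 0, 32);
	if (error)
		return error;	/* -EEXIST if the name already exists */

	/* ... take and release locks against ls ... */

	/* force=0: fail with -EBUSY if any LKBs remain */
	return dlm_release_lockspace(ls, 0);
}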
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21struct dlm_ls *dlm_find_lockspace_device(int minor);
22void dlm_put_lockspace(struct dlm_ls *ls);
23
24#endif /* __LOCKSPACE_DOT_H__ */
25
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..23f5ce12080b
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1238 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's
26 * responsibility to resolve these into IP addresses or
27 * whatever else it needs for inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
59static int dlm_local_count;
60static int dlm_local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += (n); } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
130
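The macros above implement a power-of-two ring buffer over the receive page:
base is the read offset, len the number of buffered bytes, and CBUF_DATA()
the next free write offset, with mask providing the wraparound. A short
worked sketch, assuming the PAGE_CACHE_SIZE of 4096 used below:

struct cbuf cb;

CBUF_INIT(&cb, 4096);	/* base=0, len=0, mask=4095 */
			/* CBUF_DATA(&cb) == 0: write at page start */
CBUF_ADD(&cb, 100);	/* 100 bytes received: len=100 */
			/* CBUF_DATA(&cb) == 100: next write offset */
CBUF_EAT(&cb, 60);	/* 60 bytes consumed: base=60, len=40 */
			/* unread data now occupies [60, 100) */

/* once base+len exceeds mask, CBUF_DATA() wraps back to the start
   of the page, which is why receive_from_sock() below may need two
   iovecs to describe the free space */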
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!dlm_local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (dlm_local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IP6 or 4 sockaddr and return the address length.
255   Also pad out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!dlm_local_count)
264 return;
265
266 if (!port) {
267 if (dlm_local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = dlm_local_addr[0]->ss_family;
277 if (dlm_local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
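The test_and_set_bit(NI_WRITE_PENDING) / list_add_tail / wake_up_process
sequence above recurs in process_sctp_notification(), receive_from_sock()
and dlm_lowcomms_commit_buffer() below; a helper along the following lines
could factor it (a suggestion only, not in this patch; the open-coded sites
differ slightly in whether the wake-up is unconditional):

/* Hypothetical helper: queue a node for the send thread once. */
static void queue_for_send(struct nodeinfo *ni)
{
	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
		spin_lock_bh(&write_nodes_lock);
		list_add_tail(&ni->write_list, &write_nodes);
		spin_unlock_bh(&write_nodes_lock);
		wake_up_process(send_task);
	}
}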
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383			/* This seems to happen when we receive a connection
384 * too early... or something... anyway, it happens but
385 * we always seem to get a real message too, see
386 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461	/* We don't know which INIT failed, so clear the PENDING flags
462	 * on them all. If an assoc_id is zero the INIT will be
463	 * retried */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This doesn't need to be atomic, but I think it should
505 * improve performance if it is.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516	msg.msg_iovlen = 1;	/* raised to 2 below if the buffer wraps */
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen,
552				 len, MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586	ret = dlm_process_incoming_buffer(le32_to_cpu(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
610
611/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 dlm_local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 dlm_local_addr[dlm_local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!dlm_local_count) {
663 init_local();
664 if (!dlm_local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
703 for (i = 0; i < dlm_local_count; i++) {
704 memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
828
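dlm_lowcomms_get_buffer() and dlm_lowcomms_commit_buffer() above form the
send-side API used by the upper layers: reserve space in the target node's
writequeue, format the message in place, then commit, which queues the node
for the send thread. A minimal sketch; send_demo() and build_message() are
hypothetical names:

/* Hypothetical sender showing the get/commit pattern; every handle
   returned by get_buffer must be committed exactly once. */
static int send_demo(int nodeid, int len)
{
	void *handle;
	char *p;

	handle = dlm_lowcomms_get_buffer(nodeid, len, GFP_KERNEL, &p);
	if (!handle)
		return -ENOMEM;	/* also NULL when not accepting */

	build_message(p, len);	/* placeholder: fill len bytes at p */

	dlm_lowcomms_commit_buffer(handle);
	return 0;
}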
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 len = e->len;
938 offset = e->offset;
939 BUG_ON(len == 0 && e->users == 0);
940 spin_unlock(&ni->writequeue_lock);
941 kmap(e->page);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066int dlm_lowcomms_close(int nodeid)
1067{
1068 struct nodeinfo *ni;
1069
1070 ni = nodeid2nodeinfo(nodeid, 0);
1071 if (!ni)
1072 return -1;
1073
1074 spin_lock(&ni->lock);
1075 if (ni->assoc_id) {
1076 ni->assoc_id = 0;
1077 /* Don't send shutdown here, sctp will just queue it
1078 till the node comes back up! */
1079 }
1080 spin_unlock(&ni->lock);
1081
1082 clean_one_writequeue(ni);
1083 clear_bit(NI_INIT_PENDING, &ni->flags);
1084 return 0;
1085}
1086
1087static int write_list_empty(void)
1088{
1089 int status;
1090
1091 spin_lock_bh(&write_nodes_lock);
1092 status = list_empty(&write_nodes);
1093 spin_unlock_bh(&write_nodes_lock);
1094
1095 return status;
1096}
1097
1098static int dlm_recvd(void *data)
1099{
1100 DECLARE_WAITQUEUE(wait, current);
1101
1102 while (!kthread_should_stop()) {
1103 int count = 0;
1104
1105 set_current_state(TASK_INTERRUPTIBLE);
1106 add_wait_queue(&lowcomms_recv_wait, &wait);
1107 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1108 schedule();
1109 remove_wait_queue(&lowcomms_recv_wait, &wait);
1110 set_current_state(TASK_RUNNING);
1111
1112 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1113 int ret;
1114
1115 do {
1116 ret = receive_from_sock();
1117
1118 /* Don't starve out everyone else */
1119 if (++count >= MAX_RX_MSG_COUNT) {
1120 schedule();
1121 count = 0;
1122 }
1123		} while (!kthread_should_stop() && ret >= 0);
1124 }
1125 schedule();
1126 }
1127
1128 return 0;
1129}
1130
1131static int dlm_sendd(void *data)
1132{
1133 DECLARE_WAITQUEUE(wait, current);
1134
1135 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1136
1137 while (!kthread_should_stop()) {
1138 set_current_state(TASK_INTERRUPTIBLE);
1139 if (write_list_empty())
1140 schedule();
1141 set_current_state(TASK_RUNNING);
1142
1143 if (sctp_con.eagain_flag) {
1144 sctp_con.eagain_flag = 0;
1145 refill_write_queue();
1146 }
1147 process_output_queue();
1148 }
1149
1150 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1151
1152 return 0;
1153}
1154
1155static void daemons_stop(void)
1156{
1157 kthread_stop(recv_task);
1158 kthread_stop(send_task);
1159}
1160
1161static int daemons_start(void)
1162{
1163 struct task_struct *p;
1164 int error;
1165
1166 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1167	if (IS_ERR(p)) {
1168		error = PTR_ERR(p);	/* report the real errno, not IS_ERR()'s boolean */
1169		log_print("can't start dlm_recvd %d", error);
1170		return error;
1171	}
1172 recv_task = p;
1173
1174 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1175	if (IS_ERR(p)) {
1176		error = PTR_ERR(p);
1177		log_print("can't start dlm_sendd %d", error);
1178		kthread_stop(recv_task);
1179		return error;
1180	}
1181 send_task = p;
1182
1183 return 0;
1184}
1185
1186/*
1187 * This is quite likely to sleep...
1188 */
1189int dlm_lowcomms_start(void)
1190{
1191 int error;
1192
1193 error = init_sock();
1194 if (error)
1195 goto fail_sock;
1196 error = daemons_start();
1197 if (error)
1198 goto fail_sock;
1199 atomic_set(&accepting, 1);
1200 return 0;
1201
1202 fail_sock:
1203 close_connection();
1204 return error;
1205}
1206
1207/* Set all the activity flags to prevent any socket activity. */
1208
1209void dlm_lowcomms_stop(void)
1210{
1211 atomic_set(&accepting, 0);
1212	sctp_con.flags = 0x7;	/* set every CF_* activity bit */
1213 daemons_stop();
1214 clean_writequeues();
1215 close_connection();
1216 dealloc_nodeinfo();
1217 max_nodeid = 0;
1218}
1219
1220int dlm_lowcomms_init(void)
1221{
1222 init_waitqueue_head(&lowcomms_recv_wait);
1223 spin_lock_init(&write_nodes_lock);
1224 INIT_LIST_HEAD(&write_nodes);
1225 init_rwsem(&nodeinfo_lock);
1226 return 0;
1227}
1228
1229void dlm_lowcomms_exit(void)
1230{
1231 int i;
1232
1233 for (i = 0; i < dlm_local_count; i++)
1234 kfree(dlm_local_addr[i]);
1235 dlm_local_count = 0;
1236 dlm_local_nodeid = 0;
1237}
1238
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
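
Taken together, the interface implies a simple lifecycle; a sketch of the expected call order, inferred from the declarations above (callers and error handling elided):

	dlm_lowcomms_init();		/* module load: lists, locks, waitqueue */
	dlm_lowcomms_start();		/* bind the SCTP socket, start dlm_recvd/dlm_sendd */
	/* ... dlm_lowcomms_get_buffer()/commit_buffer() while running ... */
	dlm_lowcomms_close(nodeid);	/* forget one node's association and queue */
	dlm_lowcomms_stop();		/* quiesce: stop daemons, free write queues */
	dlm_lowcomms_exit();		/* module unload: free local address copies */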
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "user.h"
18#include "memory.h"
19#include "lowcomms.h"
20#include "config.h"
21
22#ifdef CONFIG_DLM_DEBUG
23int dlm_register_debugfs(void);
24void dlm_unregister_debugfs(void);
25#else
26static inline int dlm_register_debugfs(void) { return 0; }
27static inline void dlm_unregister_debugfs(void) { }
28#endif
29
30static int __init init_dlm(void)
31{
32 int error;
33
34 error = dlm_memory_init();
35 if (error)
36 goto out;
37
38 error = dlm_lockspace_init();
39 if (error)
40 goto out_mem;
41
42 error = dlm_config_init();
43 if (error)
44 goto out_lockspace;
45
46 error = dlm_register_debugfs();
47 if (error)
48 goto out_config;
49
50 error = dlm_lowcomms_init();
51 if (error)
52 goto out_debug;
53
54 error = dlm_user_init();
55 if (error)
56 goto out_lowcomms;
57
58 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
59
60 return 0;
61
62 out_lowcomms:
63 dlm_lowcomms_exit();
64 out_debug:
65 dlm_unregister_debugfs();
66 out_config:
67 dlm_config_exit();
68 out_lockspace:
69 dlm_lockspace_exit();
70 out_mem:
71 dlm_memory_exit();
72 out:
73 return error;
74}
75
76static void __exit exit_dlm(void)
77{
78 dlm_user_exit();
79 dlm_lowcomms_exit();
80 dlm_config_exit();
81 dlm_memory_exit();
82 dlm_lockspace_exit();
83 dlm_unregister_debugfs();
84}
85
86module_init(init_dlm);
87module_exit(exit_dlm);
88
89MODULE_DESCRIPTION("Distributed Lock Manager");
90MODULE_AUTHOR("Red Hat, Inc.");
91MODULE_LICENSE("GPL");
92
93EXPORT_SYMBOL_GPL(dlm_new_lockspace);
94EXPORT_SYMBOL_GPL(dlm_release_lockspace);
95EXPORT_SYMBOL_GPL(dlm_lock);
96EXPORT_SYMBOL_GPL(dlm_unlock);
97
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
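
The FIXME above can be addressed with the stock helper from <linux/list.h>: list_add_tail() links a new entry just before the given position, and when the loop runs off the end tmp equals the list head, so the same call also covers the append case. A sketch of the equivalent tail of the function:

	list_for_each(tmp, head) {
		memb = list_entry(tmp, struct dlm_member, list);
		if (new->nodeid < memb->nodeid)
			break;
	}
	/* inserts before tmp; if the loop completed, tmp == head and this appends */
	list_add_tail(newlist, tmp);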
48
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
54 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
55 if (!memb)
56 return -ENOMEM;
57
58 w = dlm_node_weight(ls->ls_name, nodeid);
 59	if (w < 0) {
 60		kfree(memb);	/* don't leak memb if the weight lookup fails */
 61		return w;
 62	}
 63
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
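
A worked example of the array this builds (illustrative values): nodes 1, 2 and 3 with weights 1, 0 and 2 give ls_total_weight = 3 and ls_node_array = { 1, 3, 3 }, so weighted selection picks node 3 twice as often as node 1 and node 2 never; if every weight is 0, each node reverts to weight 1 and the array becomes { 1, 2, 3 }.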
162
163/* send a status request to all members just to establish comms connections */
164
165static int ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 int error = 0;
169
170 list_for_each_entry(memb, &ls->ls_nodes, list) {
171 error = dlm_recovery_stopped(ls);
172 if (error)
173 break;
174 error = dlm_rcom_status(ls, memb->nodeid);
175 if (error)
176 break;
177 }
178 if (error)
179 log_debug(ls, "ping_members aborted %d last nodeid %d",
180 error, ls->ls_recover_nodeid);
181 return error;
182}
183
184int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
185{
186 struct dlm_member *memb, *safe;
187 int i, error, found, pos = 0, neg = 0, low = -1;
188
189 /* move departed members from ls_nodes to ls_nodes_gone */
190
191 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
192 found = 0;
193 for (i = 0; i < rv->node_count; i++) {
194 if (memb->nodeid == rv->nodeids[i]) {
195 found = 1;
196 break;
197 }
198 }
199
200 if (!found) {
201 neg++;
202 dlm_remove_member(ls, memb);
203 log_debug(ls, "remove member %d", memb->nodeid);
204 }
205 }
206
207 /* add new members to ls_nodes */
208
209 for (i = 0; i < rv->node_count; i++) {
210 if (dlm_is_member(ls, rv->nodeids[i]))
211 continue;
212 dlm_add_member(ls, rv->nodeids[i]);
213 pos++;
214 log_debug(ls, "add member %d", rv->nodeids[i]);
215 }
216
217 list_for_each_entry(memb, &ls->ls_nodes, list) {
218 if (low == -1 || memb->nodeid < low)
219 low = memb->nodeid;
220 }
221 ls->ls_low_nodeid = low;
222
223 make_member_array(ls);
224 dlm_set_recover_status(ls, DLM_RS_NODES);
225 *neg_out = neg;
226
227 error = ping_members(ls);
228 if (error)
229 goto out;
230
231 error = dlm_recover_members_wait(ls);
232 out:
233 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
234 return error;
235}
236
237/*
238 * Following called from lockspace.c
239 */
240
241int dlm_ls_stop(struct dlm_ls *ls)
242{
243 int new;
244
245 /*
246 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
247 * dlm_recovery_stopped()) and prevents any new locks from being
248 * processed (see RUNNING, dlm_locking_stopped()).
249 */
250
251 spin_lock(&ls->ls_recover_lock);
252 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
253 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
254 ls->ls_recover_seq++;
255 spin_unlock(&ls->ls_recover_lock);
256
257 /*
258 * This in_recovery lock does two things:
259 *
260 * 1) Keeps this function from returning until all threads are out
261	 *    of locking routines and locking is truly stopped.
262 * 2) Keeps any new requests from being processed until it's unlocked
263 * when recovery is complete.
264 */
265
266 if (new)
267 down_write(&ls->ls_in_recovery);
268
269 /*
270 * The recoverd suspend/resume makes sure that dlm_recoverd (if
271 * running) has noticed the clearing of RUNNING above and quit
272 * processing the previous recovery. This will be true for all nodes
273 * before any nodes start the new recovery.
274 */
275
276 dlm_recoverd_suspend(ls);
277 ls->ls_recover_status = 0;
278 dlm_recoverd_resume(ls);
279 return 0;
280}
281
282int dlm_ls_start(struct dlm_ls *ls)
283{
284 struct dlm_recover *rv = NULL, *rv_old;
285 int *ids = NULL;
286 int error, count;
287
288 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
289 if (!rv)
290 return -ENOMEM;
291
292 error = count = dlm_nodeid_list(ls->ls_name, &ids);
293 if (error <= 0)
294 goto fail;
295
296 spin_lock(&ls->ls_recover_lock);
297
298 /* the lockspace needs to be stopped before it can be started */
299
300 if (!dlm_locking_stopped(ls)) {
301 spin_unlock(&ls->ls_recover_lock);
302 log_error(ls, "start ignored: lockspace running");
303 error = -EINVAL;
304 goto fail;
305 }
306
307 rv->nodeids = ids;
308 rv->node_count = count;
309 rv->seq = ++ls->ls_recover_seq;
310 rv_old = ls->ls_recover_args;
311 ls->ls_recover_args = rv;
312 spin_unlock(&ls->ls_recover_lock);
313
314 if (rv_old) {
315 kfree(rv_old->nodeids);
316 kfree(rv_old);
317 }
318
319 dlm_recoverd_kick(ls);
320 return 0;
321
322 fail:
323 kfree(rv);
324 kfree(ids);
325 return error;
326}
327
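As the section comment above notes, dlm_ls_stop() and dlm_ls_start() are driven from lockspace.c as a pair around each membership change; a sketch of the assumed sequence (the settling step happens outside this file):

	dlm_ls_stop(ls);	/* fence locking, cancel recovery, bump ls_recover_seq */
	/* ... cluster manager settles on the new node list ... */
	dlm_ls_start(ls);	/* hand the nodeid list to dlm_recoverd via ls_recover_args */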
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
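
Each kmalloc()+memset() pair in this file is equivalent to a single kzalloc() call; for instance, allocate_lvb() could be reduced to the following (a possible cleanup, not what the code above does):

	return kzalloc(ls->ls_lvblen, GFP_KERNEL);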
47
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 if (lkb->lkb_flags & DLM_IFL_USER) {
88 struct dlm_user_args *ua;
89 ua = (struct dlm_user_args *)lkb->lkb_astparam;
90 if (ua) {
91 if (ua->lksb.sb_lvbptr)
92 kfree(ua->lksb.sb_lvbptr);
93 kfree(ua);
94 }
95 }
96 kmem_cache_free(lkb_cache, lkb);
97}
98
99struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
100{
101 struct dlm_direntry *de;
102
103 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
104 printk("namelen = %d\n", namelen););
105
106 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
107 if (de)
108 memset(de, 0, sizeof(*de) + namelen);
109 return de;
110}
111
112void free_direntry(struct dlm_direntry *de)
113{
114 kfree(de);
115}
116
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take buffers from the "real" comms layer,
20 * split them up into packets and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
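
copy_from_cb() reassembles a span that may wrap around the end of the circular receive buffer. For example (illustrative numbers), with limit = 8, offset = 6 and len = 4, the first memcpy() takes bytes 6-7 from the end of the ring and the second takes bytes 0-1 from the start, leaving the four bytes contiguous in dst.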
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
140
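The return value of dlm_process_incoming_buffer() is the number of bytes consumed (zero when only a partial message is buffered, negative on a malformed header), so the ring consumer in lowcomms can advance its read offset by exactly that amount. A hedged sketch of such a caller; the cb_* names are assumptions, with cb_limit a power of two as the offset masking above requires:

	ret = dlm_process_incoming_buffer(nodeid, cb_base, cb_off, cb_len, cb_limit);
	if (ret < 0)
		return ret;			/* -EINVAL / -E2BIG: protocol error */
	cb_off = (cb_off + ret) & (cb_limit - 1);
	cb_len -= ret;				/* leftover bytes wait for the next read */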
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
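
Every rcom sender below follows the same three steps: create_rcom() reserves a lowcomms buffer and fills in the common header, the caller fills the type-specific payload, and send_rcom() byte-swaps the message and commits the buffer. In sketch form (payload and payload_len are placeholders):

	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, payload_len, &rc, &mh);
	if (error)
		return error;
	memcpy(rc->rc_buf, payload, payload_len);	/* type-specific part */
	rc->rc_id = ++ls->ls_rcom_seq;			/* matches reply to request */
	send_rcom(ls, mh, rc);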
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100 ls->ls_recover_nodeid = nodeid;
101
102 if (nodeid == dlm_our_nodeid()) {
103 rc = (struct dlm_rcom *) ls->ls_recover_buf;
104 rc->rc_result = dlm_recover_status(ls);
105 goto out;
106 }
107
108 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
109 if (error)
110 goto out;
111 rc->rc_id = ++ls->ls_rcom_seq;
112
113 send_rcom(ls, mh, rc);
114
115 error = dlm_wait_function(ls, &rcom_response);
116 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
117 if (error)
118 goto out;
119
120 rc = (struct dlm_rcom *) ls->ls_recover_buf;
121
122 if (rc->rc_result == -ESRCH) {
123 /* we pretend the remote lockspace exists with 0 status */
124 log_debug(ls, "remote node %d not ready", nodeid);
125 rc->rc_result = 0;
126 } else
127 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
128 nodeid);
129 /* the caller looks at rc_result for the remote recovery status */
130 out:
131 return error;
132}
133
134static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
135{
136 struct dlm_rcom *rc;
137 struct dlm_mhandle *mh;
138 int error, nodeid = rc_in->rc_header.h_nodeid;
139
140 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
141 sizeof(struct rcom_config), &rc, &mh);
142 if (error)
143 return;
144 rc->rc_id = rc_in->rc_id;
145 rc->rc_result = dlm_recover_status(ls);
146 make_config(ls, (struct rcom_config *) rc->rc_buf);
147
148 send_rcom(ls, mh, rc);
149}
150
151static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
152{
153 if (rc_in->rc_id != ls->ls_rcom_seq) {
154 log_debug(ls, "reject old reply %d got %llx wanted %llx",
155 rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq);
156 return;
157 }
158 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
159 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
160 wake_up(&ls->ls_wait_general);
161}
162
163static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
164{
165 receive_sync_reply(ls, rc_in);
166}
167
168int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
169{
170 struct dlm_rcom *rc;
171 struct dlm_mhandle *mh;
172 int error = 0, len = sizeof(struct dlm_rcom);
173
174 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
175 ls->ls_recover_nodeid = nodeid;
176
177 if (nodeid == dlm_our_nodeid()) {
178 dlm_copy_master_names(ls, last_name, last_len,
179 ls->ls_recover_buf + len,
180 dlm_config.buffer_size - len, nodeid);
181 goto out;
182 }
183
184 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
185 if (error)
186 goto out;
187 memcpy(rc->rc_buf, last_name, last_len);
188 rc->rc_id = ++ls->ls_rcom_seq;
189
190 send_rcom(ls, mh, rc);
191
192 error = dlm_wait_function(ls, &rcom_response);
193 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
194 out:
195 return error;
196}
197
198static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
199{
200 struct dlm_rcom *rc;
201 struct dlm_mhandle *mh;
202 int error, inlen, outlen;
203 int nodeid = rc_in->rc_header.h_nodeid;
204 uint32_t status = dlm_recover_status(ls);
205
206 /*
207 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
208 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
209 * It could only happen in rare cases where we get a late NAMES
210 * message from a previous instance of recovery.
211 */
212
213 if (!(status & DLM_RS_NODES)) {
214 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
215 return;
216 }
217
218 nodeid = rc_in->rc_header.h_nodeid;
219 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
220 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
221
222 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
223 if (error)
224 return;
225 rc->rc_id = rc_in->rc_id;
226
227 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
228 nodeid);
229 send_rcom(ls, mh, rc);
230}
231
232static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
233{
234 receive_sync_reply(ls, rc_in);
235}
236
237int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
238{
239 struct dlm_rcom *rc;
240 struct dlm_mhandle *mh;
241 struct dlm_ls *ls = r->res_ls;
242 int error;
243
244 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
245 &rc, &mh);
246 if (error)
247 goto out;
248 memcpy(rc->rc_buf, r->res_name, r->res_length);
249 rc->rc_id = (unsigned long) r;
250
251 send_rcom(ls, mh, rc);
252 out:
253 return error;
254}
255
256static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
257{
258 struct dlm_rcom *rc;
259 struct dlm_mhandle *mh;
260 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
261 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
262
263 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
264 if (error)
265 return;
266
267 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
268 if (error)
269 ret_nodeid = error;
270 rc->rc_result = ret_nodeid;
271 rc->rc_id = rc_in->rc_id;
272
273 send_rcom(ls, mh, rc);
274}
275
276static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
277{
278 dlm_recover_master_reply(ls, rc_in);
279}
280
281static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
282 struct rcom_lock *rl)
283{
284 memset(rl, 0, sizeof(*rl));
285
286 rl->rl_ownpid = lkb->lkb_ownpid;
287 rl->rl_lkid = lkb->lkb_id;
288 rl->rl_exflags = lkb->lkb_exflags;
289 rl->rl_flags = lkb->lkb_flags;
290 rl->rl_lvbseq = lkb->lkb_lvbseq;
291 rl->rl_rqmode = lkb->lkb_rqmode;
292 rl->rl_grmode = lkb->lkb_grmode;
293 rl->rl_status = lkb->lkb_status;
294 rl->rl_wait_type = lkb->lkb_wait_type;
295
296 if (lkb->lkb_bastaddr)
297 rl->rl_asts |= AST_BAST;
298 if (lkb->lkb_astaddr)
299 rl->rl_asts |= AST_COMP;
300
301 rl->rl_namelen = r->res_length;
302 memcpy(rl->rl_name, r->res_name, r->res_length);
303
304 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
305 If so, receive_rcom_lock_args() won't take this copy. */
306
307 if (lkb->lkb_lvbptr)
308 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
309}
310
311int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
312{
313 struct dlm_ls *ls = r->res_ls;
314 struct dlm_rcom *rc;
315 struct dlm_mhandle *mh;
316 struct rcom_lock *rl;
317 int error, len = sizeof(struct rcom_lock);
318
319 if (lkb->lkb_lvbptr)
320 len += ls->ls_lvblen;
321
322 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
323 if (error)
324 goto out;
325
326 rl = (struct rcom_lock *) rc->rc_buf;
327 pack_rcom_lock(r, lkb, rl);
328 rc->rc_id = (unsigned long) r;
329
330 send_rcom(ls, mh, rc);
331 out:
332 return error;
333}
334
335static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
336{
337 struct dlm_rcom *rc;
338 struct dlm_mhandle *mh;
339 int error, nodeid = rc_in->rc_header.h_nodeid;
340
341 dlm_recover_master_copy(ls, rc_in);
342
343 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
344 sizeof(struct rcom_lock), &rc, &mh);
345 if (error)
346 return;
347
348 /* We send back the same rcom_lock struct we received, but
349 dlm_recover_master_copy() has filled in rl_remid and rl_result */
350
351 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
352 rc->rc_id = rc_in->rc_id;
353
354 send_rcom(ls, mh, rc);
355}
356
357static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
358{
359 uint32_t status = dlm_recover_status(ls);
360
361 if (!(status & DLM_RS_DIR)) {
362 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
363 rc_in->rc_header.h_nodeid);
364 return;
365 }
366
367 dlm_recover_process_copy(ls, rc_in);
368}
369
370static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
371{
372 struct dlm_rcom *rc;
373 struct dlm_mhandle *mh;
374 char *mb;
375 int mb_len = sizeof(struct dlm_rcom);
376
377 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
378 if (!mh)
379 return -ENOBUFS;
380 memset(mb, 0, mb_len);
381
382 rc = (struct dlm_rcom *) mb;
383
384 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
385 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
386 rc->rc_header.h_nodeid = dlm_our_nodeid();
387 rc->rc_header.h_length = mb_len;
388 rc->rc_header.h_cmd = DLM_RCOM;
389
390 rc->rc_type = DLM_RCOM_STATUS_REPLY;
391 rc->rc_id = rc_in->rc_id;
392 rc->rc_result = -ESRCH;
393
394 dlm_rcom_out(rc);
395 dlm_lowcomms_commit_buffer(mh);
396
397 return 0;
398}
399
400/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
401 recovery-only comms are sent through here. */
402
403void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
404{
405 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
406 struct dlm_ls *ls;
407
408 dlm_rcom_in(rc);
409
410 /* If the lockspace doesn't exist then still send a status message
411 back; it's possible that it just doesn't have its global_id yet. */
412
413 ls = dlm_find_lockspace_global(hd->h_lockspace);
414 if (!ls) {
415 log_print("lockspace %x from %d not found",
416 hd->h_lockspace, nodeid);
417 send_ls_not_ready(nodeid, rc);
418 return;
419 }
420
421 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
422 log_error(ls, "ignoring recovery message %x from %d",
423 rc->rc_type, nodeid);
424 goto out;
425 }
426
427 if (nodeid != rc->rc_header.h_nodeid) {
428 log_error(ls, "bad rcom nodeid %d from %d",
429 rc->rc_header.h_nodeid, nodeid);
430 goto out;
431 }
432
433 switch (rc->rc_type) {
434 case DLM_RCOM_STATUS:
435 receive_rcom_status(ls, rc);
436 break;
437
438 case DLM_RCOM_NAMES:
439 receive_rcom_names(ls, rc);
440 break;
441
442 case DLM_RCOM_LOOKUP:
443 receive_rcom_lookup(ls, rc);
444 break;
445
446 case DLM_RCOM_LOCK:
447 receive_rcom_lock(ls, rc);
448 break;
449
450 case DLM_RCOM_STATUS_REPLY:
451 receive_rcom_status_reply(ls, rc);
452 break;
453
454 case DLM_RCOM_NAMES_REPLY:
455 receive_rcom_names_reply(ls, rc);
456 break;
457
458 case DLM_RCOM_LOOKUP_REPLY:
459 receive_rcom_lookup_reply(ls, rc);
460 break;
461
462 case DLM_RCOM_LOCK_REPLY:
463 receive_rcom_lock_reply(ls, rc);
464 break;
465
466 default:
467 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
468 }
469 out:
470 dlm_put_lockspace(ls);
471}
472
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped, indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, it should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
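
The left shift works because the recovery status flags are assumed to be defined in adjacent bit pairs (e.g. DLM_RS_NODES = 0x01 with DLM_RS_NODES_ALL = 0x02 in dlm_internal.h), so the barrier for one stage reads, schematically:

	every node:	dlm_set_recover_status(ls, DLM_RS_NODES);
	low nodeid:	wait_status_all(ls, DLM_RS_NODES);	/* poll everyone */
			dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
	other nodes:	wait_status_low(ls, DLM_RS_NODES_ALL);	/* poll the low nodeid */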
163
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
306 * Propagate the new master nodeid to locks
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
309 * rsb's to consider.
310 */
311
312static void set_new_master(struct dlm_rsb *r, int nodeid)
313{
314 lock_rsb(r);
315 r->res_nodeid = nodeid;
316 set_master_lkbs(r);
317 rsb_set_flag(r, RSB_NEW_MASTER);
318 rsb_set_flag(r, RSB_NEW_MASTER2);
319 unlock_rsb(r);
320}
321
322/*
323 * We do async lookups on rsb's that need new masters. The rsb's
324 * waiting for a lookup reply are kept on the recover_list.
325 */
326
327static int recover_master(struct dlm_rsb *r)
328{
329 struct dlm_ls *ls = r->res_ls;
330 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
331
332 dir_nodeid = dlm_dir_nodeid(r);
333
334 if (dir_nodeid == our_nodeid) {
335 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
336 r->res_length, &ret_nodeid);
337 if (error)
338 log_error(ls, "recover dir lookup error %d", error);
339
340 if (ret_nodeid == our_nodeid)
341 ret_nodeid = 0;
342 set_new_master(r, ret_nodeid);
343 } else {
344 recover_list_add(r);
345 error = dlm_send_rcom_lookup(r, dir_nodeid);
346 }
347
348 return error;
349}
350
351/*
352 * When not using a directory, most resource names will hash to a new static
353 * master nodeid and the resource will need to be remastered.
354 */
355
356static int recover_master_static(struct dlm_rsb *r)
357{
358 int master = dlm_dir_nodeid(r);
359
360 if (master == dlm_our_nodeid())
361 master = 0;
362
363 if (r->res_nodeid != master) {
364 if (is_master(r))
365 dlm_purge_mstcpy_locks(r);
366 set_new_master(r, master);
367 return 1;
368 }
369 return 0;
370}
371
372/*
373 * Go through local root resources and for each rsb which has a master which
374 * has departed, get the new master nodeid from the directory. The dir will
375 * assign mastery to the first node to look up the new master. That means
376 * we'll discover in this lookup if we're the new master of any rsb's.
377 *
378 * We fire off all the dir lookup requests individually and asynchronously to
379 * the correct dir node.
380 */
381
382int dlm_recover_masters(struct dlm_ls *ls)
383{
384 struct dlm_rsb *r;
385 int error = 0, count = 0;
386
387 log_debug(ls, "dlm_recover_masters");
388
389 down_read(&ls->ls_root_sem);
390 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
391 if (dlm_recovery_stopped(ls)) {
392 up_read(&ls->ls_root_sem);
393 error = -EINTR;
394 goto out;
395 }
396
397 if (dlm_no_directory(ls))
398 count += recover_master_static(r);
399 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
400 recover_master(r);
401 count++;
402 }
403
404 schedule();
405 }
406 up_read(&ls->ls_root_sem);
407
408 log_debug(ls, "dlm_recover_masters %d resources", count);
409
410 error = dlm_wait_function(ls, &recover_list_empty);
411 out:
412 if (error)
413 recover_list_clear(ls);
414 return error;
415}
416
417int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
418{
419 struct dlm_rsb *r;
420 int nodeid;
421
422 r = recover_list_find(ls, rc->rc_id);
423 if (!r) {
424 log_error(ls, "dlm_recover_master_reply no id %llx",
425 (unsigned long long)rc->rc_id);
426 goto out;
427 }
428
429 nodeid = rc->rc_result;
430 if (nodeid == dlm_our_nodeid())
431 nodeid = 0;
432
433 set_new_master(r, nodeid);
434 recover_list_del(r);
435
436 if (recover_list_empty(ls))
437 wake_up(&ls->ls_wait_general);
438 out:
439 return 0;
440}
441
442
443/* Lock recovery: rebuild the process-copy locks we hold on a
444 remastered rsb on the new rsb master.
445
446 dlm_recover_locks
447 recover_locks
448 recover_locks_queue
449 dlm_send_rcom_lock -> receive_rcom_lock
450 dlm_recover_master_copy
451 receive_rcom_lock_reply <-
452 dlm_recover_process_copy
453*/
454
455
456/*
457 * keep a count of the number of lkb's we send to the new master; when we get
458 * an equal number of replies then recovery for the rsb is done
459 */
460
461static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
462{
463 struct dlm_lkb *lkb;
464 int error = 0;
465
466 list_for_each_entry(lkb, head, lkb_statequeue) {
467 error = dlm_send_rcom_lock(r, lkb);
468 if (error)
469 break;
470 r->res_recover_locks_count++;
471 }
472
473 return error;
474}
475
476static int recover_locks(struct dlm_rsb *r)
477{
478 int error = 0;
479
480 lock_rsb(r);
481
482 DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
483
484 error = recover_locks_queue(r, &r->res_grantqueue);
485 if (error)
486 goto out;
487 error = recover_locks_queue(r, &r->res_convertqueue);
488 if (error)
489 goto out;
490 error = recover_locks_queue(r, &r->res_waitqueue);
491 if (error)
492 goto out;
493
494 if (r->res_recover_locks_count)
495 recover_list_add(r);
496 else
497 rsb_clear_flag(r, RSB_NEW_MASTER);
498 out:
499 unlock_rsb(r);
500 return error;
501}
502
503int dlm_recover_locks(struct dlm_ls *ls)
504{
505 struct dlm_rsb *r;
506 int error, count = 0;
507
508 log_debug(ls, "dlm_recover_locks");
509
510 down_read(&ls->ls_root_sem);
511 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
512 if (is_master(r)) {
513 rsb_clear_flag(r, RSB_NEW_MASTER);
514 continue;
515 }
516
517 if (!rsb_flag(r, RSB_NEW_MASTER))
518 continue;
519
520 if (dlm_recovery_stopped(ls)) {
521 error = -EINTR;
522 up_read(&ls->ls_root_sem);
523 goto out;
524 }
525
526 error = recover_locks(r);
527 if (error) {
528 up_read(&ls->ls_root_sem);
529 goto out;
530 }
531
532 count += r->res_recover_locks_count;
533 }
534 up_read(&ls->ls_root_sem);
535
536 log_debug(ls, "dlm_recover_locks %d locks", count);
537
538 error = dlm_wait_function(ls, &recover_list_empty);
539 out:
540 if (error)
541 recover_list_clear(ls);
542 else
543 dlm_set_recover_status(ls, DLM_RS_LOCKS);
544 return error;
545}
546
547void dlm_recovered_lock(struct dlm_rsb *r)
548{
549 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
550
551 r->res_recover_locks_count--;
552 if (!r->res_recover_locks_count) {
553 rsb_clear_flag(r, RSB_NEW_MASTER);
554 recover_list_del(r);
555 }
556
557 if (recover_list_empty(r->res_ls))
558 wake_up(&r->res_ls->ls_wait_general);
559}
560
561/*
562 * The lvb needs to be recovered on all master rsb's. This includes setting
563 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
564 * based on the lvb's of the locks held on the rsb.
565 *
566 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
567 * was already set prior to recovery, it's not cleared, regardless of locks.
568 *
569 * The LVB contents are only considered for changing when this is a new master
570 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
571 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
572 * from the lkb with the largest lvb sequence number.
573 */
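
/* A worked example of the rules above (illustrative values only): if the
   grant queue holds NL with lvbseq 5 and CR with lvbseq 9, and no lock
   with mode above CR exists on any queue, then VALNOTVALID is set and,
   when this node is the new master (NEW_MASTER2), the lvb is copied from
   the CR lock with the larger lvbseq, 9.  A single PW or EX lock would
   instead supply the lvb outright, regardless of sequence numbers. */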
574
575static void recover_lvb(struct dlm_rsb *r)
576{
577 struct dlm_lkb *lkb, *high_lkb = NULL;
578 uint32_t high_seq = 0;
579 int lock_lvb_exists = 0;
580 int big_lock_exists = 0;
581 int lvblen = r->res_ls->ls_lvblen;
582
583 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
584 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
585 continue;
586
587 lock_lvb_exists = 1;
588
589 if (lkb->lkb_grmode > DLM_LOCK_CR) {
590 big_lock_exists = 1;
591 goto setflag;
592 }
593
594 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
595 high_lkb = lkb;
596 high_seq = lkb->lkb_lvbseq;
597 }
598 }
599
600 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
601 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
602 continue;
603
604 lock_lvb_exists = 1;
605
606 if (lkb->lkb_grmode > DLM_LOCK_CR) {
607 big_lock_exists = 1;
608 goto setflag;
609 }
610
611 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
612 high_lkb = lkb;
613 high_seq = lkb->lkb_lvbseq;
614 }
615 }
616
617 setflag:
618 if (!lock_lvb_exists)
619 goto out;
620
621 if (!big_lock_exists)
622 rsb_set_flag(r, RSB_VALNOTVALID);
623
624 /* don't mess with the lvb unless we're the new master */
625 if (!rsb_flag(r, RSB_NEW_MASTER2))
626 goto out;
627
628 if (!r->res_lvbptr) {
629 r->res_lvbptr = allocate_lvb(r->res_ls);
630 if (!r->res_lvbptr)
631 goto out;
632 }
633
634 if (big_lock_exists) {
635 r->res_lvbseq = lkb->lkb_lvbseq;
636 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
637 } else if (high_lkb) {
638 r->res_lvbseq = high_lkb->lkb_lvbseq;
639 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
640 } else {
641 r->res_lvbseq = 0;
642 memset(r->res_lvbptr, 0, lvblen);
643 }
644 out:
645 return;
646}
647
648/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
649 converting PR->CW or CW->PR need to have their lkb_grmode set. */
650
651static void recover_conversion(struct dlm_rsb *r)
652{
653 struct dlm_lkb *lkb;
654 int grmode = -1;
655
656 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
657 if (lkb->lkb_grmode == DLM_LOCK_PR ||
658 lkb->lkb_grmode == DLM_LOCK_CW) {
659 grmode = lkb->lkb_grmode;
660 break;
661 }
662 }
663
664 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
665 if (lkb->lkb_grmode != DLM_LOCK_IV)
666 continue;
667 if (grmode == -1)
668 lkb->lkb_grmode = lkb->lkb_rqmode;
669 else
670 lkb->lkb_grmode = grmode;
671 }
672}
673
674/* We've become the new master for this rsb and waiting/converting locks may
675 need to be granted in dlm_grant_after_purge() due to locks that may have
676 existed from a removed node. */
677
678static void set_locks_purged(struct dlm_rsb *r)
679{
680 if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
681 rsb_set_flag(r, RSB_LOCKS_PURGED);
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 if (rsb_flag(r, RSB_NEW_MASTER2))
698 set_locks_purged(r);
699 recover_lvb(r);
700 count++;
701 }
702 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
703 rsb_clear_flag(r, RSB_NEW_MASTER2);
704 unlock_rsb(r);
705 }
706 up_read(&ls->ls_root_sem);
707
708 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
709}
710
711/* Create a single list of all root rsb's to be used during recovery */
712
713int dlm_create_root_list(struct dlm_ls *ls)
714{
715 struct dlm_rsb *r;
716 int i, error = 0;
717
718 down_write(&ls->ls_root_sem);
719 if (!list_empty(&ls->ls_root_list)) {
720 log_error(ls, "root list not empty");
721 error = -EINVAL;
722 goto out;
723 }
724
725 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
726 read_lock(&ls->ls_rsbtbl[i].lock);
727 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
728 list_add(&r->res_root_list, &ls->ls_root_list);
729 dlm_hold_rsb(r);
730 }
731 read_unlock(&ls->ls_rsbtbl[i].lock);
732 }
733 out:
734 up_write(&ls->ls_root_sem);
735 return error;
736}
737
738void dlm_release_root_list(struct dlm_ls *ls)
739{
740 struct dlm_rsb *r, *safe;
741
742 down_write(&ls->ls_root_sem);
743 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
744 list_del_init(&r->res_root_list);
745 dlm_put_rsb(r);
746 }
747 up_write(&ls->ls_root_sem);
748}
749
750void dlm_clear_toss_list(struct dlm_ls *ls)
751{
752 struct dlm_rsb *r, *safe;
753 int i;
754
755 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
756 write_lock(&ls->ls_rsbtbl[i].lock);
757 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
758 res_hashchain) {
759 list_del(&r->res_hashchain);
760 free_rsb(r);
761 }
762 write_unlock(&ls->ls_rsbtbl[i].lock);
763 }
764}
765
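The lvbseq comparison in recover_lvb() above is wraparound-safe serial-number
arithmetic: the signed view of the unsigned difference stays correct even
after the 32-bit sequence counter wraps. A minimal standalone sketch of the
idiom (illustrative values; the kernel lines cast each operand to int, which
behaves like the well-defined form below in practice):

#include <stdio.h>
#include <stdint.h>

/* nonzero if sequence a is at or after sequence b, modulo 2^32 */
static int seq_after_eq(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	printf("%d\n", seq_after_eq(9, 5));            /* 1: 9 is newer */
	printf("%d\n", seq_after_eq(2, 0xfffffffeU));  /* 1: newer across the wrap */
	printf("%d\n", seq_after_eq(0xfffffffeU, 2));  /* 0: older across the wrap */
	return 0;
}
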
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48	log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202	log_debug(ls, "recover %llx done: %u ms", (unsigned long long)rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210	log_debug(ls, "recover %llx error %d", (unsigned long long)rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237 if (!ls) {
238 log_print("dlm_recoverd: no lockspace %p", arg);
239 return -1;
240 }
241
242 while (!kthread_should_stop()) {
243 set_current_state(TASK_INTERRUPTIBLE);
244 if (!test_bit(LSFL_WORK, &ls->ls_flags))
245 schedule();
246 set_current_state(TASK_RUNNING);
247
248 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
249 do_ls_recovery(ls);
250 }
251
252 dlm_put_lockspace(ls);
253 return 0;
254}
255
256void dlm_recoverd_kick(struct dlm_ls *ls)
257{
258 set_bit(LSFL_WORK, &ls->ls_flags);
259 wake_up_process(ls->ls_recoverd_task);
260}
261
262int dlm_recoverd_start(struct dlm_ls *ls)
263{
264 struct task_struct *p;
265 int error = 0;
266
267 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
268 if (IS_ERR(p))
269 error = PTR_ERR(p);
270 else
271 ls->ls_recoverd_task = p;
272 return error;
273}
274
275void dlm_recoverd_stop(struct dlm_ls *ls)
276{
277 kthread_stop(ls->ls_recoverd_task);
278}
279
280void dlm_recoverd_suspend(struct dlm_ls *ls)
281{
282 wake_up(&ls->ls_wait_general);
283 mutex_lock(&ls->ls_recoverd_active);
284}
285
286void dlm_recoverd_resume(struct dlm_ls *ls)
287{
288 mutex_unlock(&ls->ls_recoverd_active);
289}
290
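dlm_recoverd() above is an instance of a common kthread idiom: pending work is
flagged with a bit, and the thread sets TASK_INTERRUPTIBLE before testing the
bit so a wake-up from dlm_recoverd_kick() cannot slip in between the test and
schedule(). A minimal sketch of the same idiom with hypothetical names
(my_worker, my_kick, do_work are illustrations, not dlm code):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/bitops.h>

static unsigned long work_flags;	/* hypothetical work flag word */
#define MY_WORK 0

static void do_work(void *arg)
{
	/* hypothetical work handler */
}

static int my_worker(void *arg)
{
	while (!kthread_should_stop()) {
		/* set the state before testing the bit, so a kick that
		   lands between the test and schedule() still wakes us */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(MY_WORK, &work_flags))
			schedule();
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(MY_WORK, &work_flags))
			do_work(arg);
	}
	return 0;
}

/* the kicker sets the bit first, then wakes the thread */
static void my_kick(struct task_struct *t)
{
	set_bit(MY_WORK, &work_flags);
	wake_up_process(t);
}
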
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43		log_print("dlm_add_requestqueue: out of memory");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as they would have been by dlm_recvd. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
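struct rq_entry above uses the classic one-element trailing array so that the
entry header and the copied message share a single kmalloc() allocation. A
minimal sketch of the idiom with hypothetical names (entry, entry_alloc):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/string.h>

struct entry {
	struct list_head list;
	int nodeid;
	char payload[1];	/* storage really extends past the struct */
};

/* one allocation holds the header plus len bytes of payload */
static struct entry *entry_alloc(const void *data, int len, int nodeid)
{
	struct entry *e;

	e = kmalloc(sizeof(struct entry) + len, GFP_KERNEL);
	if (!e)
		return NULL;
	e->nodeid = nodeid;
	memcpy(e->payload, data, len);
	return e;
}
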
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/init.h>
11#include <linux/wait.h>
12#include <linux/module.h>
13#include <linux/file.h>
14#include <linux/fs.h>
15#include <linux/poll.h>
16#include <linux/signal.h>
17#include <linux/spinlock.h>
18#include <linux/dlm.h>
19#include <linux/dlm_device.h>
20
21#include "dlm_internal.h"
22#include "lockspace.h"
23#include "lock.h"
24#include "lvb_table.h"
25
26static const char *name_prefix = "dlm";
27static struct miscdevice ctl_device;
28static struct file_operations device_fops;
29
30#ifdef CONFIG_COMPAT
31
32struct dlm_lock_params32 {
33 __u8 mode;
34 __u8 namelen;
35 __u16 flags;
36 __u32 lkid;
37 __u32 parent;
38
39 __u32 castparam;
40 __u32 castaddr;
41 __u32 bastparam;
42 __u32 bastaddr;
43 __u32 lksb;
44
45 char lvb[DLM_USER_LVB_LEN];
46 char name[0];
47};
48
49struct dlm_write_request32 {
50 __u32 version[3];
51 __u8 cmd;
52 __u8 is64bit;
53 __u8 unused[2];
54
55 union {
56 struct dlm_lock_params32 lock;
57 struct dlm_lspace_params lspace;
58 } i;
59};
60
61struct dlm_lksb32 {
62 __u32 sb_status;
63 __u32 sb_lkid;
64 __u8 sb_flags;
65 __u32 sb_lvbptr;
66};
67
68struct dlm_lock_result32 {
69 __u32 length;
70 __u32 user_astaddr;
71 __u32 user_astparam;
72 __u32 user_lksb;
73 struct dlm_lksb32 lksb;
74 __u8 bast_mode;
75 __u8 unused[3];
76 /* Offsets may be zero if no data is present */
77 __u32 lvb_offset;
78};
79
80static void compat_input(struct dlm_write_request *kb,
81 struct dlm_write_request32 *kb32)
82{
83 kb->version[0] = kb32->version[0];
84 kb->version[1] = kb32->version[1];
85 kb->version[2] = kb32->version[2];
86
87 kb->cmd = kb32->cmd;
88 kb->is64bit = kb32->is64bit;
89 if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
90 kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
91 kb->i.lspace.flags = kb32->i.lspace.flags;
92 kb->i.lspace.minor = kb32->i.lspace.minor;
93 strcpy(kb->i.lspace.name, kb32->i.lspace.name);
94 } else {
95 kb->i.lock.mode = kb32->i.lock.mode;
96 kb->i.lock.namelen = kb32->i.lock.namelen;
97 kb->i.lock.flags = kb32->i.lock.flags;
98 kb->i.lock.lkid = kb32->i.lock.lkid;
99 kb->i.lock.parent = kb32->i.lock.parent;
100 kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
101 kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
102 kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
103 kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
104 kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
105 memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
106 memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
107 }
108}
109
110static void compat_output(struct dlm_lock_result *res,
111 struct dlm_lock_result32 *res32)
112{
113	/* res->length is already the compat-sized length at this point;
114	   it is copied below along with lvb_offset */
115 res32->user_astaddr = (__u32)(long)res->user_astaddr;
116 res32->user_astparam = (__u32)(long)res->user_astparam;
117 res32->user_lksb = (__u32)(long)res->user_lksb;
118 res32->bast_mode = res->bast_mode;
119
120 res32->lvb_offset = res->lvb_offset;
121 res32->length = res->length;
122
123 res32->lksb.sb_status = res->lksb.sb_status;
124 res32->lksb.sb_flags = res->lksb.sb_flags;
125 res32->lksb.sb_lkid = res->lksb.sb_lkid;
126 res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
127}
128#endif
129
130
131void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
132{
133 struct dlm_ls *ls;
134 struct dlm_user_args *ua;
135 struct dlm_user_proc *proc;
136 int remove_ownqueue = 0;
137
138 /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
139 lkb before dealing with it. We need to check this
140 flag before taking ls_clear_proc_locks mutex because if
141 it's set, dlm_clear_proc_locks() holds the mutex. */
142
143 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
144 /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
145 return;
146 }
147
148 ls = lkb->lkb_resource->res_ls;
149 mutex_lock(&ls->ls_clear_proc_locks);
150
151 /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
152 can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
153 lkb->ua so we can't try to use it. */
154
155 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
156 /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
157 goto out;
158 }
159
160 DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
161 ua = (struct dlm_user_args *)lkb->lkb_astparam;
162 proc = ua->proc;
163
164 if (type == AST_BAST && ua->bastaddr == NULL)
165 goto out;
166
167 spin_lock(&proc->asts_spin);
168 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
169 kref_get(&lkb->lkb_ref);
170 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
171 lkb->lkb_ast_type |= type;
172 wake_up_interruptible(&proc->wait);
173 }
174
175 /* noqueue requests that fail may need to be removed from the
176 proc's locks list, there should be a better way of detecting
177 this situation than checking all these things... */
178
179 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
180 ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
181 remove_ownqueue = 1;
182
183 /* We want to copy the lvb to userspace when the completion
184 ast is read if the status is 0, the lock has an lvb and
185 lvb_ops says we should. We could probably have set_lvb_lock()
186 set update_user_lvb instead and not need old_mode */
187
188 if ((lkb->lkb_ast_type & AST_COMP) &&
189 (lkb->lkb_lksb->sb_status == 0) &&
190 lkb->lkb_lksb->sb_lvbptr &&
191 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
192 ua->update_user_lvb = 1;
193 else
194 ua->update_user_lvb = 0;
195
196 spin_unlock(&proc->asts_spin);
197
198 if (remove_ownqueue) {
199 spin_lock(&ua->proc->locks_spin);
200 list_del_init(&lkb->lkb_ownqueue);
201 spin_unlock(&ua->proc->locks_spin);
202 dlm_put_lkb(lkb);
203 }
204 out:
205 mutex_unlock(&ls->ls_clear_proc_locks);
206}
207
208static int device_user_lock(struct dlm_user_proc *proc,
209 struct dlm_lock_params *params)
210{
211 struct dlm_ls *ls;
212 struct dlm_user_args *ua;
213 int error = -ENOMEM;
214
215 ls = dlm_find_lockspace_local(proc->lockspace);
216 if (!ls)
217 return -ENOENT;
218
219 if (!params->castaddr || !params->lksb) {
220 error = -EINVAL;
221 goto out;
222 }
223
224 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
225 if (!ua)
226 goto out;
227 ua->proc = proc;
228 ua->user_lksb = params->lksb;
229 ua->castparam = params->castparam;
230 ua->castaddr = params->castaddr;
231 ua->bastparam = params->bastparam;
232 ua->bastaddr = params->bastaddr;
233
234 if (params->flags & DLM_LKF_CONVERT)
235 error = dlm_user_convert(ls, ua,
236 params->mode, params->flags,
237 params->lkid, params->lvb);
238 else {
239 error = dlm_user_request(ls, ua,
240 params->mode, params->flags,
241 params->name, params->namelen,
242 params->parent);
243 if (!error)
244 error = ua->lksb.sb_lkid;
245 }
246 out:
247 dlm_put_lockspace(ls);
248 return error;
249}
250
251static int device_user_unlock(struct dlm_user_proc *proc,
252 struct dlm_lock_params *params)
253{
254 struct dlm_ls *ls;
255 struct dlm_user_args *ua;
256 int error = -ENOMEM;
257
258 ls = dlm_find_lockspace_local(proc->lockspace);
259 if (!ls)
260 return -ENOENT;
261
262 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
263 if (!ua)
264 goto out;
265 ua->proc = proc;
266 ua->user_lksb = params->lksb;
267 ua->castparam = params->castparam;
268 ua->castaddr = params->castaddr;
269
270 if (params->flags & DLM_LKF_CANCEL)
271 error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
272 else
273 error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
274 params->lvb);
275 out:
276 dlm_put_lockspace(ls);
277 return error;
278}
279
280static int device_create_lockspace(struct dlm_lspace_params *params)
281{
282 dlm_lockspace_t *lockspace;
283 struct dlm_ls *ls;
284 int error, len;
285
286 if (!capable(CAP_SYS_ADMIN))
287 return -EPERM;
288
289 error = dlm_new_lockspace(params->name, strlen(params->name),
290 &lockspace, 0, DLM_USER_LVB_LEN);
291 if (error)
292 return error;
293
294 ls = dlm_find_lockspace_local(lockspace);
295 if (!ls)
296 return -ENOENT;
297
298 error = -ENOMEM;
299 len = strlen(params->name) + strlen(name_prefix) + 2;
300 ls->ls_device.name = kzalloc(len, GFP_KERNEL);
301 if (!ls->ls_device.name)
302 goto fail;
303 snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
304 params->name);
305 ls->ls_device.fops = &device_fops;
306 ls->ls_device.minor = MISC_DYNAMIC_MINOR;
307
308 error = misc_register(&ls->ls_device);
309 if (error) {
310 kfree(ls->ls_device.name);
311 goto fail;
312 }
313
314 error = ls->ls_device.minor;
315 dlm_put_lockspace(ls);
316 return error;
317
318 fail:
319 dlm_put_lockspace(ls);
320 dlm_release_lockspace(lockspace, 0);
321 return error;
322}
323
324static int device_remove_lockspace(struct dlm_lspace_params *params)
325{
326 dlm_lockspace_t *lockspace;
327 struct dlm_ls *ls;
328 int error, force = 0;
329
330 if (!capable(CAP_SYS_ADMIN))
331 return -EPERM;
332
333 ls = dlm_find_lockspace_device(params->minor);
334 if (!ls)
335 return -ENOENT;
336
337 error = misc_deregister(&ls->ls_device);
338 if (error) {
339 dlm_put_lockspace(ls);
340 goto out;
341 }
342 kfree(ls->ls_device.name);
343
344 if (params->flags & DLM_USER_LSFLG_FORCEFREE)
345 force = 2;
346
347 lockspace = ls->ls_local_handle;
348
349 /* dlm_release_lockspace waits for references to go to zero,
350 so all processes will need to close their device for the ls
351	   before the release will proceed */
352
353 dlm_put_lockspace(ls);
354 error = dlm_release_lockspace(lockspace, force);
355 out:
356 return error;
357}
358
359/* Check that the user's version matches ours */
360static int check_version(struct dlm_write_request *req)
361{
362 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
363 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
364 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
365
366 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
367 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
368 current->comm,
369 current->pid,
370 req->version[0],
371 req->version[1],
372 req->version[2],
373 DLM_DEVICE_VERSION_MAJOR,
374 DLM_DEVICE_VERSION_MINOR,
375 DLM_DEVICE_VERSION_PATCH);
376 return -EINVAL;
377 }
378 return 0;
379}
380
381/*
382 * device_write
383 *
384 * device_user_lock
385 * dlm_user_request -> request_lock
386 * dlm_user_convert -> convert_lock
387 *
388 * device_user_unlock
389 * dlm_user_unlock -> unlock_lock
390 * dlm_user_cancel -> cancel_lock
391 *
392 * device_create_lockspace
393 * dlm_new_lockspace
394 *
395 * device_remove_lockspace
396 * dlm_release_lockspace
397 */
398
399/* a write to a lockspace device is a lock or unlock request; a write to
400   the control device creates/removes a lockspace (a userspace sketch of
      this write protocol follows this file) */
401
402static ssize_t device_write(struct file *file, const char __user *buf,
403 size_t count, loff_t *ppos)
404{
405 struct dlm_user_proc *proc = file->private_data;
406 struct dlm_write_request *kbuf;
407 sigset_t tmpsig, allsigs;
408 int error;
409
410#ifdef CONFIG_COMPAT
411 if (count < sizeof(struct dlm_write_request32))
412#else
413 if (count < sizeof(struct dlm_write_request))
414#endif
415 return -EINVAL;
416
417 kbuf = kmalloc(count, GFP_KERNEL);
418 if (!kbuf)
419 return -ENOMEM;
420
421 if (copy_from_user(kbuf, buf, count)) {
422 error = -EFAULT;
423 goto out_free;
424 }
425
426 if (check_version(kbuf)) {
427 error = -EBADE;
428 goto out_free;
429 }
430
431#ifdef CONFIG_COMPAT
432 if (!kbuf->is64bit) {
433 struct dlm_write_request32 *k32buf;
434 k32buf = (struct dlm_write_request32 *)kbuf;
435 kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
436 sizeof(struct dlm_write_request32)), GFP_KERNEL);
437		if (!kbuf) {
438			kfree(k32buf);
439			return -ENOMEM;
440		}
439
440 if (proc)
441 set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
442 compat_input(kbuf, k32buf);
443 kfree(k32buf);
444 }
445#endif
446
447 /* do we really need this? can a write happen after a close? */
448	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
449	    test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) {
450		error = -EINVAL;
451		goto out_free;
452	}
451
452 sigfillset(&allsigs);
453 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
454
455 error = -EINVAL;
456
457	switch (kbuf->cmd) {
459 case DLM_USER_LOCK:
460 if (!proc) {
461 log_print("no locking on control device");
462 goto out_sig;
463 }
464 error = device_user_lock(proc, &kbuf->i.lock);
465 break;
466
467 case DLM_USER_UNLOCK:
468 if (!proc) {
469 log_print("no locking on control device");
470 goto out_sig;
471 }
472 error = device_user_unlock(proc, &kbuf->i.lock);
473 break;
474
475 case DLM_USER_CREATE_LOCKSPACE:
476 if (proc) {
477 log_print("create/remove only on control device");
478 goto out_sig;
479 }
480 error = device_create_lockspace(&kbuf->i.lspace);
481 break;
482
483 case DLM_USER_REMOVE_LOCKSPACE:
484 if (proc) {
485 log_print("create/remove only on control device");
486 goto out_sig;
487 }
488 error = device_remove_lockspace(&kbuf->i.lspace);
489 break;
490
491 default:
492		log_print("Unknown command passed to DLM device: %d",
493 kbuf->cmd);
494 }
495
496 out_sig:
497 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
498 recalc_sigpending();
499 out_free:
500 kfree(kbuf);
501 return error;
502}
503
504/* Every process that opens the lockspace device has its own "proc" structure
505 hanging off the open file that's used to keep track of locks owned by the
506 process and asts that need to be delivered to the process. */
507
508static int device_open(struct inode *inode, struct file *file)
509{
510 struct dlm_user_proc *proc;
511 struct dlm_ls *ls;
512
513 ls = dlm_find_lockspace_device(iminor(inode));
514 if (!ls)
515 return -ENOENT;
516
517 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
518 if (!proc) {
519 dlm_put_lockspace(ls);
520 return -ENOMEM;
521 }
522
523 proc->lockspace = ls->ls_local_handle;
524 INIT_LIST_HEAD(&proc->asts);
525 INIT_LIST_HEAD(&proc->locks);
526 spin_lock_init(&proc->asts_spin);
527 spin_lock_init(&proc->locks_spin);
528 init_waitqueue_head(&proc->wait);
529 file->private_data = proc;
530
531 return 0;
532}
533
534static int device_close(struct inode *inode, struct file *file)
535{
536 struct dlm_user_proc *proc = file->private_data;
537 struct dlm_ls *ls;
538 sigset_t tmpsig, allsigs;
539
540 ls = dlm_find_lockspace_local(proc->lockspace);
541 if (!ls)
542 return -ENOENT;
543
544 sigfillset(&allsigs);
545 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
546
547 set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
548
549 dlm_clear_proc_locks(ls, proc);
550
551 /* at this point no more lkb's should exist for this lockspace,
552 so there's no chance of dlm_user_add_ast() being called and
553 looking for lkb->ua->proc */
554
555 kfree(proc);
556 file->private_data = NULL;
557
558 dlm_put_lockspace(ls);
559 dlm_put_lockspace(ls); /* for the find in device_open() */
560
561 /* FIXME: AUTOFREE: if this ls is no longer used do
562 device_remove_lockspace() */
563
564 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
565 recalc_sigpending();
566
567 return 0;
568}
569
570static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
571 int bmode, char __user *buf, size_t count)
572{
573#ifdef CONFIG_COMPAT
574 struct dlm_lock_result32 result32;
575#endif
576 struct dlm_lock_result result;
577 void *resultptr;
578	int error = 0;
579 int len;
580 int struct_len;
581
582 memset(&result, 0, sizeof(struct dlm_lock_result));
583 memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
584 result.user_lksb = ua->user_lksb;
585
586 /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
587 in a conversion unless the conversion is successful. See code
588 in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
589 notes that a new blocking AST address and parameter are set even if
590 the conversion fails, so maybe we should just do that. */
591
592 if (type == AST_BAST) {
593 result.user_astaddr = ua->bastaddr;
594 result.user_astparam = ua->bastparam;
595 result.bast_mode = bmode;
596 } else {
597 result.user_astaddr = ua->castaddr;
598 result.user_astparam = ua->castparam;
599 }
600
601#ifdef CONFIG_COMPAT
602 if (compat)
603 len = sizeof(struct dlm_lock_result32);
604 else
605#endif
606 len = sizeof(struct dlm_lock_result);
607 struct_len = len;
608
609 /* copy lvb to userspace if there is one, it's been updated, and
610 the user buffer has space for it */
611
612 if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
613 count >= len + DLM_USER_LVB_LEN) {
614 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
615 DLM_USER_LVB_LEN)) {
616 error = -EFAULT;
617 goto out;
618 }
619
620 result.lvb_offset = len;
621 len += DLM_USER_LVB_LEN;
622 }
623
624 result.length = len;
625 resultptr = &result;
626#ifdef CONFIG_COMPAT
627 if (compat) {
628 compat_output(&result, &result32);
629 resultptr = &result32;
630 }
631#endif
632
633 if (copy_to_user(buf, resultptr, struct_len))
634 error = -EFAULT;
635 else
636 error = len;
637 out:
638 return error;
639}
640
641/* a read returns a single ast described in a struct dlm_lock_result */
642
643static ssize_t device_read(struct file *file, char __user *buf, size_t count,
644 loff_t *ppos)
645{
646 struct dlm_user_proc *proc = file->private_data;
647 struct dlm_lkb *lkb;
648 struct dlm_user_args *ua;
649 DECLARE_WAITQUEUE(wait, current);
650	int error, type = 0, bmode = 0, removed = 0;
651
652#ifdef CONFIG_COMPAT
653 if (count < sizeof(struct dlm_lock_result32))
654#else
655 if (count < sizeof(struct dlm_lock_result))
656#endif
657 return -EINVAL;
658
659 /* do we really need this? can a read happen after a close? */
660 if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
661 return -EINVAL;
662
663 spin_lock(&proc->asts_spin);
664 if (list_empty(&proc->asts)) {
665 if (file->f_flags & O_NONBLOCK) {
666 spin_unlock(&proc->asts_spin);
667 return -EAGAIN;
668 }
669
670 add_wait_queue(&proc->wait, &wait);
671
672 repeat:
673 set_current_state(TASK_INTERRUPTIBLE);
674 if (list_empty(&proc->asts) && !signal_pending(current)) {
675 spin_unlock(&proc->asts_spin);
676 schedule();
677 spin_lock(&proc->asts_spin);
678 goto repeat;
679 }
680 set_current_state(TASK_RUNNING);
681 remove_wait_queue(&proc->wait, &wait);
682
683 if (signal_pending(current)) {
684 spin_unlock(&proc->asts_spin);
685 return -ERESTARTSYS;
686 }
687 }
688
689 if (list_empty(&proc->asts)) {
690 spin_unlock(&proc->asts_spin);
691 return -EAGAIN;
692 }
693
694 /* there may be both completion and blocking asts to return for
695 the lkb, don't remove lkb from asts list unless no asts remain */
696
697 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
698
699 if (lkb->lkb_ast_type & AST_COMP) {
700 lkb->lkb_ast_type &= ~AST_COMP;
701 type = AST_COMP;
702 } else if (lkb->lkb_ast_type & AST_BAST) {
703 lkb->lkb_ast_type &= ~AST_BAST;
704 type = AST_BAST;
705 bmode = lkb->lkb_bastmode;
706 }
707
708 if (!lkb->lkb_ast_type) {
709 list_del(&lkb->lkb_astqueue);
710 removed = 1;
711 }
712 spin_unlock(&proc->asts_spin);
713
714 ua = (struct dlm_user_args *)lkb->lkb_astparam;
715 error = copy_result_to_user(ua,
716 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
717 type, bmode, buf, count);
718
719 /* removes reference for the proc->asts lists added by
720 dlm_user_add_ast() and may result in the lkb being freed */
721 if (removed)
722 dlm_put_lkb(lkb);
723
724 return error;
725}
726
727static unsigned int device_poll(struct file *file, poll_table *wait)
728{
729 struct dlm_user_proc *proc = file->private_data;
730
731 poll_wait(file, &proc->wait, wait);
732
733 spin_lock(&proc->asts_spin);
734 if (!list_empty(&proc->asts)) {
735 spin_unlock(&proc->asts_spin);
736 return POLLIN | POLLRDNORM;
737 }
738 spin_unlock(&proc->asts_spin);
739 return 0;
740}
741
742static int ctl_device_open(struct inode *inode, struct file *file)
743{
744 file->private_data = NULL;
745 return 0;
746}
747
748static int ctl_device_close(struct inode *inode, struct file *file)
749{
750 return 0;
751}
752
753static struct file_operations device_fops = {
754 .open = device_open,
755 .release = device_close,
756 .read = device_read,
757 .write = device_write,
758 .poll = device_poll,
759 .owner = THIS_MODULE,
760};
761
762static struct file_operations ctl_device_fops = {
763 .open = ctl_device_open,
764 .release = ctl_device_close,
765 .write = device_write,
766 .owner = THIS_MODULE,
767};
768
769int dlm_user_init(void)
770{
771 int error;
772
773 ctl_device.name = "dlm-control";
774 ctl_device.fops = &ctl_device_fops;
775 ctl_device.minor = MISC_DYNAMIC_MINOR;
776
777 error = misc_register(&ctl_device);
778 if (error)
779 log_print("misc_register failed for control device");
780
781 return error;
782}
783
784void dlm_user_exit(void)
785{
786 misc_deregister(&ctl_device);
787}
788
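A minimal userspace sketch of the write protocol implemented above: a struct
dlm_write_request with cmd DLM_USER_CREATE_LOCKSPACE is written to the control
device, and on success the write returns the minor number of the new
per-lockspace device; completion and blocking ASTs are later read back from
that device one at a time as struct dlm_lock_result records. The device path
and error handling below are assumptions for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/dlm_device.h>

int main(void)
{
	const char *name = "example";	/* hypothetical lockspace name */
	size_t len = sizeof(struct dlm_write_request) + strlen(name) + 1;
	struct dlm_write_request *req = calloc(1, len);
	int fd, minor;

	if (!req)
		return 1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_CREATE_LOCKSPACE;
	req->is64bit = (sizeof(long) == 8);
	strcpy(req->i.lspace.name, name);

	fd = open("/dev/dlm-control", O_RDWR);	/* path is an assumption */
	if (fd < 0)
		return 1;

	minor = write(fd, req, len);	/* device_write() returns the minor */
	if (minor < 0)
		perror("write");
	else
		printf("lockspace device minor: %d\n", minor);

	close(fd);
	free(req);
	return 0;
}
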
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__
11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
13int dlm_user_init(void);
14void dlm_user_exit(void);
15
16#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL
4 select FS_POSIX_ACL
5 help
6 A cluster filesystem.
7
8 Allows a cluster of computers to simultaneously use a block device
9 that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
10 and writes to the block device like a local filesystem, but also uses
11	  a lock module to allow the computers to coordinate their I/O so
12 filesystem consistency is maintained. One of the nifty features of
13 GFS is perfect consistency -- changes made to the filesystem on one
14 machine show up immediately on all other machines in the cluster.
15
16 To use the GFS2 filesystem, you will need to enable one or more of
17 the below locking modules. Documentation and utilities for GFS2 can
18 be found here: http://sources.redhat.com/cluster
19
20config GFS2_FS_LOCKING_NOLOCK
21 tristate "GFS2 \"nolock\" locking module"
22 depends on GFS2_FS
23 help
24 Single node locking module for GFS2.
25
26 Use this module if you want to use GFS2 on a single node without
27 its clustering features. You can still take advantage of the
28 large file support, and upgrade to running a full cluster later on
29 if required.
30
31 If you will only be using GFS2 in cluster mode, you do not need this
32 module.
33
34config GFS2_FS_LOCKING_DLM
35 tristate "GFS2 DLM locking module"
36 depends on GFS2_FS
37 select DLM
38 help
39 Multiple node locking module for GFS2
40
41 Most users of GFS2 will require this module. It provides the locking
42 interface between GFS2 and the DLM, which is required to use GFS2
43 in a cluster environment.
44
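As a concrete example, a kernel configuration fragment that builds GFS2 and
both locking modules as modules might look like the following (a sketch; the
DLM and EXPERIMENTAL dependencies are taken from the Kconfig entries above and
from fs/dlm/Kconfig):

CONFIG_EXPERIMENTAL=y
CONFIG_DLM=m
CONFIG_GFS2_FS=m
CONFIG_GFS2_FS_LOCKING_NOLOCK=m
CONFIG_GFS2_FS_LOCKING_DLM=m
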
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..e3f1ada643ac
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o
7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..5f959b8ce406
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,309 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "trans.h"
29#include "util.h"
30
31#define ACL_ACCESS 1
32#define ACL_DEFAULT 0
33
34int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
35 struct gfs2_ea_request *er,
36 int *remove, mode_t *mode)
37{
38 struct posix_acl *acl;
39 int error;
40
41 error = gfs2_acl_validate_remove(ip, access);
42 if (error)
43 return error;
44
45 if (!er->er_data)
46 return -EINVAL;
47
48 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
49 if (IS_ERR(acl))
50 return PTR_ERR(acl);
51 if (!acl) {
52 *remove = 1;
53 return 0;
54 }
55
56 error = posix_acl_valid(acl);
57 if (error)
58 goto out;
59
60 if (access) {
61 error = posix_acl_equiv_mode(acl, mode);
62 if (!error)
63 *remove = 1;
64 else if (error > 0)
65 error = 0;
66 }
67
68out:
69 posix_acl_release(acl);
70 return error;
71}
72
73int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
74{
75 if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
76 return -EOPNOTSUPP;
77 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
78 return -EPERM;
79 if (S_ISLNK(ip->i_di.di_mode))
80 return -EOPNOTSUPP;
81 if (!access && !S_ISDIR(ip->i_di.di_mode))
82 return -EACCES;
83
84 return 0;
85}
86
87static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
88 struct gfs2_ea_location *el, char **data, unsigned int *len)
89{
90 struct gfs2_ea_request er;
91 struct gfs2_ea_location el_this;
92 int error;
93
94 if (!ip->i_di.di_eattr)
95 return 0;
96
97 memset(&er, 0, sizeof(struct gfs2_ea_request));
98 if (access) {
99 er.er_name = GFS2_POSIX_ACL_ACCESS;
100 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
101 } else {
102 er.er_name = GFS2_POSIX_ACL_DEFAULT;
103 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
104 }
105 er.er_type = GFS2_EATYPE_SYS;
106
107 if (!el)
108 el = &el_this;
109
110 error = gfs2_ea_find(ip, &er, el);
111 if (error)
112 return error;
113 if (!el->el_ea)
114 return 0;
115 if (!GFS2_EA_DATA_LEN(el->el_ea))
116 goto out;
117
118 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
119 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
120 error = -ENOMEM;
121 if (!er.er_data)
122 goto out;
123
124 error = gfs2_ea_get_copy(ip, el, er.er_data);
125 if (error)
126 goto out_kfree;
127
128 if (acl) {
129 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
130 if (IS_ERR(*acl))
131 error = PTR_ERR(*acl);
132 }
133
134out_kfree:
135 if (error || !data)
136 kfree(er.er_data);
137 else {
138 *data = er.er_data;
139 *len = er.er_data_len;
140 }
141out:
142 if (error || el == &el_this)
143 brelse(el->el_bh);
144 return error;
145}
146
147/**
148 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
149 * @inode: the file we want to do something to
150 * @mask: what we want to do
151 *
152 * Returns: errno
153 */
154
155int gfs2_check_acl_locked(struct inode *inode, int mask)
156{
157 struct posix_acl *acl = NULL;
158 int error;
159
160 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
161 if (error)
162 return error;
163
164 if (acl) {
165 error = posix_acl_permission(inode, acl, mask);
166 posix_acl_release(acl);
167 return error;
168 }
169
170 return -EAGAIN;
171}
172
173int gfs2_check_acl(struct inode *inode, int mask)
174{
175 struct gfs2_inode *ip = GFS2_I(inode);
176 struct gfs2_holder i_gh;
177 int error;
178
179 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
180 if (!error) {
181 error = gfs2_check_acl_locked(inode, mask);
182 gfs2_glock_dq_uninit(&i_gh);
183 }
184
185 return error;
186}
187
188static int munge_mode(struct gfs2_inode *ip, mode_t mode)
189{
190 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
191 struct buffer_head *dibh;
192 int error;
193
194 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
195 if (error)
196 return error;
197
198 error = gfs2_meta_inode_buffer(ip, &dibh);
199 if (!error) {
200 gfs2_assert_withdraw(sdp,
201 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
202 ip->i_di.di_mode = mode;
203 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
204 gfs2_dinode_out(&ip->i_di, dibh->b_data);
205 brelse(dibh);
206 }
207
208 gfs2_trans_end(sdp);
209
210	return error;
211}
212
213int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
214{
215 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
216 struct posix_acl *acl = NULL, *clone;
217 struct gfs2_ea_request er;
218 mode_t mode = ip->i_di.di_mode;
219 int error;
220
221 if (!sdp->sd_args.ar_posix_acl)
222 return 0;
223 if (S_ISLNK(ip->i_di.di_mode))
224 return 0;
225
226 memset(&er, 0, sizeof(struct gfs2_ea_request));
227 er.er_type = GFS2_EATYPE_SYS;
228
229 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
230 &er.er_data, &er.er_data_len);
231 if (error)
232 return error;
233 if (!acl) {
234 mode &= ~current->fs->umask;
235 if (mode != ip->i_di.di_mode)
236 error = munge_mode(ip, mode);
237 return error;
238 }
239
240 clone = posix_acl_clone(acl, GFP_KERNEL);
241 error = -ENOMEM;
242 if (!clone)
243 goto out;
244 posix_acl_release(acl);
245 acl = clone;
246
247 if (S_ISDIR(ip->i_di.di_mode)) {
248 er.er_name = GFS2_POSIX_ACL_DEFAULT;
249 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
250 error = gfs2_system_eaops.eo_set(ip, &er);
251 if (error)
252 goto out;
253 }
254
255 error = posix_acl_create_masq(acl, &mode);
256 if (error < 0)
257 goto out;
258 if (error > 0) {
259 er.er_name = GFS2_POSIX_ACL_ACCESS;
260 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
261 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
262 er.er_mode = mode;
263 er.er_flags = GFS2_ERF_MODE;
264 error = gfs2_system_eaops.eo_set(ip, &er);
265 if (error)
266 goto out;
267 } else
268 munge_mode(ip, mode);
269
270out:
271 posix_acl_release(acl);
272 kfree(er.er_data);
273 return error;
274}
275
276int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
277{
278 struct posix_acl *acl = NULL, *clone;
279 struct gfs2_ea_location el;
280 char *data;
281 unsigned int len;
282 int error;
283
284 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
285 if (error)
286 return error;
287 if (!acl)
288 return gfs2_setattr_simple(ip, attr);
289
290 clone = posix_acl_clone(acl, GFP_KERNEL);
291 error = -ENOMEM;
292 if (!clone)
293 goto out;
294 posix_acl_release(acl);
295 acl = clone;
296
297 error = posix_acl_chmod_masq(acl, attr->ia_mode);
298 if (!error) {
299 posix_acl_to_xattr(acl, data, len);
300 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
301 }
302
303out:
304 posix_acl_release(acl);
305 brelse(el.el_bh);
306 kfree(data);
307 return error;
308}
309
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..05c294fe0d78
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#include "incore.h"
14
15#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
16#define GFS2_POSIX_ACL_ACCESS_LEN 16
17#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
18#define GFS2_POSIX_ACL_DEFAULT_LEN 17
19
20#define GFS2_ACL_IS_ACCESS(name, len) \
21 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
22 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
23
24#define GFS2_ACL_IS_DEFAULT(name, len) \
25 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
26 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
27
28struct gfs2_ea_request;
29
30int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
31 struct gfs2_ea_request *er,
32 int *remove, mode_t *mode);
33int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
34int gfs2_check_acl_locked(struct inode *inode, int mask);
35int gfs2_check_acl(struct inode *inode, int mask);
36int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
37int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
38
39#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..cc57f2ecd219
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1221 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "dir.h"
29#include "util.h"
30#include "ops_address.h"
31
32/* This doesn't need to be that large, since the maximum number of 64-bit
33 * pointers in a 4k block is 512, so a __u16 is fine for an index. It saves
34 * stack space to keep it small.
35 */
36struct metapath {
37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
38};
39
40typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
41 struct buffer_head *bh, u64 *top,
42 u64 *bottom, unsigned int height,
43 void *data);
44
45struct strip_mine {
46 int sm_first;
47 unsigned int sm_height;
48};
49
50/**
51 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
52 * @ip: the inode
53 * @dibh: the dinode buffer
54 * @block: the block number that was allocated
55 * @page: any locked page held by the caller process
56 *
57 * Returns: errno
58 */
59
60static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
61 u64 block, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 struct inode *inode = &ip->i_inode;
65 struct buffer_head *bh;
66 int release = 0;
67
68 if (!page || page->index) {
69 page = grab_cache_page(inode->i_mapping, 0);
70 if (!page)
71 return -ENOMEM;
72 release = 1;
73 }
74
75 if (!PageUptodate(page)) {
76 void *kaddr = kmap(page);
77
78 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
79 ip->i_di.di_size);
80 memset(kaddr + ip->i_di.di_size, 0,
81 PAGE_CACHE_SIZE - ip->i_di.di_size);
82 kunmap(page);
83
84 SetPageUptodate(page);
85 }
86
87 if (!page_has_buffers(page))
88 create_empty_buffers(page, 1 << inode->i_blkbits,
89 (1 << BH_Uptodate));
90
91 bh = page_buffers(page);
92
93 if (!buffer_mapped(bh))
94 map_bh(bh, inode->i_sb, block);
95
96 set_buffer_uptodate(bh);
97 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
98 gfs2_trans_add_bh(ip->i_gl, bh, 0);
99 mark_buffer_dirty(bh);
100
101 if (release) {
102 unlock_page(page);
103 page_cache_release(page);
104 }
105
106 return 0;
107}
108
109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff
112 * @page: any locked page held by the caller, or NULL, in which case
113 * one will be grabbed from the page cache
114 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way.
117 *
118 * Returns: errno
119 */
120
121int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
122{
123 struct buffer_head *bh, *dibh;
124 struct gfs2_dinode *di;
125 u64 block = 0;
126 int isdir = gfs2_is_dir(ip);
127 int error;
128
129 down_write(&ip->i_rw_mutex);
130
131 error = gfs2_meta_inode_buffer(ip, &dibh);
132 if (error)
133 goto out;
134
135 if (ip->i_di.di_size) {
136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */
138
139 if (isdir) {
140 block = gfs2_alloc_meta(ip);
141
142 error = gfs2_dir_get_new_buffer(ip, block, &bh);
143 if (error)
144 goto out_brelse;
145 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
146 dibh, sizeof(struct gfs2_dinode));
147 brelse(bh);
148 } else {
149 block = gfs2_alloc_data(ip);
150
151 error = gfs2_unstuffer_page(ip, dibh, block, page);
152 if (error)
153 goto out_brelse;
154 }
155 }
156
157 /* Set up the pointer to the new block */
158
159 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
160 di = (struct gfs2_dinode *)dibh->b_data;
161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
162
163 if (ip->i_di.di_size) {
164 *(__be64 *)(di + 1) = cpu_to_be64(block);
165 ip->i_di.di_blocks++;
166 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
167 }
168
169 ip->i_di.di_height = 1;
170 di->di_height = cpu_to_be16(1);
171
172out_brelse:
173 brelse(dibh);
174out:
175 up_write(&ip->i_rw_mutex);
176 return error;
177}
178
179/**
180 * calc_tree_height - Calculate the height of a metadata tree
181 * @ip: The GFS2 inode
182 * @size: The proposed size of the file
183 *
184 * Work out how tall a metadata tree needs to be in order to accommodate a
185 * file of a particular size. If size is less than the current size of
186 * the inode, then the current size of the inode is used instead of the
187 * supplied one.
188 *
189 * Returns: the height the tree should be
190 */
191
192static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
193{
194 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
195 u64 *arr;
196 unsigned int max, height;
197
198 if (ip->i_di.di_size > size)
199 size = ip->i_di.di_size;
200
201 if (gfs2_is_dir(ip)) {
202 arr = sdp->sd_jheightsize;
203 max = sdp->sd_max_jheight;
204 } else {
205 arr = sdp->sd_heightsize;
206 max = sdp->sd_max_height;
207 }
208
209 for (height = 0; height < max; height++)
210 if (arr[height] >= size)
211 break;
212
213 return height;
214}
215
216/**
217 * build_height - Build a metadata tree of the requested height
218 * @inode: The inode
219 * @height: The height to build to
220 *
221 *
222 * Returns: errno
223 */
224
225static int build_height(struct inode *inode, unsigned height)
226{
227 struct gfs2_inode *ip = GFS2_I(inode);
228 unsigned new_height = height - ip->i_di.di_height;
229 struct buffer_head *dibh;
230 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
231 struct gfs2_dinode *di;
232 int error;
233 u64 *bp;
234 u64 bn;
235 unsigned n;
236
237 if (height <= ip->i_di.di_height)
238 return 0;
239
240 error = gfs2_meta_inode_buffer(ip, &dibh);
241 if (error)
242 return error;
243
244 for(n = 0; n < new_height; n++) {
245 bn = gfs2_alloc_meta(ip);
246 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
247 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
248 }
249
250 n = 0;
251 bn = blocks[0]->b_blocknr;
252 if (new_height > 1) {
253 for(; n < new_height-1; n++) {
254 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
255 GFS2_FORMAT_IN);
256 gfs2_buffer_clear_tail(blocks[n],
257 sizeof(struct gfs2_meta_header));
258 bp = (u64 *)(blocks[n]->b_data +
259 sizeof(struct gfs2_meta_header));
260 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
261 brelse(blocks[n]);
262 blocks[n] = NULL;
263 }
264 }
265 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
266 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
267 dibh, sizeof(struct gfs2_dinode));
268 brelse(blocks[n]);
269 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
270 di = (struct gfs2_dinode *)dibh->b_data;
271 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
272 *(__be64 *)(di + 1) = cpu_to_be64(bn);
273 ip->i_di.di_height += new_height;
274 ip->i_di.di_blocks += new_height;
275 di->di_height = cpu_to_be16(ip->i_di.di_height);
276 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
277 brelse(dibh);
278 return error;
279}
280
281/**
282 * find_metapath - Find path through the metadata tree
283 * @ip: The inode pointer
284 * @mp: The metapath to return the result in
285 * @block: The disk block to look up
286 *
287 * This routine returns a struct metapath structure that defines a path
288 * through the metadata of inode "ip" to get to block "block".
289 *
290 * Example:
291 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
292 * filesystem with a blocksize of 4096.
293 *
294 * find_metapath() would return a struct metapath structure with
295 * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165 (the offset and
296 * height themselves are not stored in the metapath).
297 *
298 * That means that in order to get to the block containing the byte at
299 * offset 101342453, we would load the indirect block pointed to by pointer
300 * 0 in the dinode. We would then load the indirect block pointed to by
301 * pointer 48 in that indirect block. We would then load the data block
302 * pointed to by pointer 165 in that indirect block.
303 *
304 * ----------------------------------------
305 * | Dinode | |
306 * | | 4|
307 * | |0 1 2 3 4 5 9|
308 * | | 6|
309 * ----------------------------------------
310 * |
311 * |
312 * V
313 * ----------------------------------------
314 * | Indirect Block |
315 * | 5|
316 * | 4 4 4 4 4 5 5 1|
317 * |0 5 6 7 8 9 0 1 2|
318 * ----------------------------------------
319 * |
320 * |
321 * V
322 * ----------------------------------------
323 * | Indirect Block |
324 * | 1 1 1 1 1 5|
325 * | 6 6 6 6 6 1|
326 * |0 3 4 5 6 7 2|
327 * ----------------------------------------
328 * |
329 * |
330 * V
331 * ----------------------------------------
332 * | Data block containing offset |
333 * | 101342453 |
334 * | |
335 * | |
336 * ----------------------------------------
337 *
338 */
339
340static void find_metapath(struct gfs2_inode *ip, u64 block,
341 struct metapath *mp)
342{
343 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
344 u64 b = block;
345 unsigned int i;
346
347 for (i = ip->i_di.di_height; i--;)
348 mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
349
350}
351
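As a quick check of the worked example in the comment above, the following small user-space sketch reproduces the mp_list computation. It assumes a round 512 pointers per indirect block, which is what the example's numbers correspond to; the real sdp->sd_inptrs is slightly smaller because each indirect block begins with a struct gfs2_meta_header.

    #include <stdio.h>

    int main(void)
    {
        unsigned long long b = 101342453ULL / 4096; /* logical block 24741 */
        unsigned int mp_list[3];                    /* height 3 file */
        int i;

        /* Same shape as find_metapath(): peel off one index per level,
         * deepest level first. */
        for (i = 3; i--;) {
            mp_list[i] = b % 512;   /* stands in for do_div(b, sd_inptrs) */
            b /= 512;
        }
        printf("%u %u %u\n", mp_list[0], mp_list[1], mp_list[2]); /* 0 48 165 */
        return 0;
    }
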
352/**
353 * metapointer - Return pointer to start of metadata in a buffer
354 * @bh: The buffer
355 * @height: The metadata height (0 = dinode)
356 * @mp: The metapath
357 *
358 * Return a pointer to the block number of the next height of the metadata
359 * tree given a buffer containing the pointer to the current height of the
360 * metadata tree.
361 */
362
363static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
364 unsigned int height, const struct metapath *mp)
365{
366 unsigned int head_size = (height > 0) ?
367 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
368 u64 *ptr;
369 *boundary = 0;
370 ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
371 if (ptr + 1 == (u64 *)(bh->b_data + bh->b_size))
372 *boundary = 1;
373 return ptr;
374}
375
376/**
377 * lookup_block - Get the next metadata block in metadata tree
378 * @ip: The GFS2 inode
379 * @bh: Buffer containing the pointers to metadata blocks
380 * @height: The height of the tree (0 = dinode)
381 * @mp: The metapath
382 * @create: Non-zero if we may create a new metadata block
383 * @new: Used to indicate if we did create a new metadata block
384 * @block: the returned disk block number
385 *
386 * Given a metatree, complete to a particular height, this checks whether the
387 * next height of the tree exists. If not, the next height is created.
388 * The block number of the next height of the metadata tree is returned.
389 *
390 */
391
392static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
393 unsigned int height, struct metapath *mp, int create,
394 int *new, u64 *block)
395{
396 int boundary;
397 u64 *ptr = metapointer(bh, &boundary, height, mp);
398
399 if (*ptr) {
400 *block = be64_to_cpu(*ptr);
401 return boundary;
402 }
403
404 *block = 0;
405
406 if (!create)
407 return 0;
408
409 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
410 *block = gfs2_alloc_data(ip);
411 else
412 *block = gfs2_alloc_meta(ip);
413
414 gfs2_trans_add_bh(ip->i_gl, bh, 1);
415
416 *ptr = cpu_to_be64(*block);
417 ip->i_di.di_blocks++;
418
419 *new = 1;
420 return 0;
421}
422
423/**
424 * gfs2_block_pointers - Map a block from an inode to a disk block
425 * @inode: The inode
426 * @lblock: The logical block number
427 * @bh_map: The bh to be mapped
428 * @mp: metapath to use
429 *
430 * Find the block number on the current device which corresponds to an
431 * inode's block. If the block had to be created, "new" will be set.
432 *
433 * Returns: errno
434 */
435
436static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
437 struct buffer_head *bh_map, struct metapath *mp,
438 unsigned int maxlen)
439{
440 struct gfs2_inode *ip = GFS2_I(inode);
441 struct gfs2_sbd *sdp = GFS2_SB(inode);
442 struct buffer_head *bh;
443 unsigned int bsize;
444 unsigned int height;
445 unsigned int end_of_metadata;
446 unsigned int x;
447 int error = 0;
448 int new = 0;
449 u64 dblock = 0;
450 int boundary;
451
452 BUG_ON(maxlen == 0);
453
454 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
455 return 0;
456
457 bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
458
459 height = calc_tree_height(ip, (lblock + 1) * bsize);
460 if (ip->i_di.di_height < height) {
461 if (!create)
462 return 0;
463
464 error = build_height(inode, height);
465 if (error)
466 return error;
467 }
468
469 find_metapath(ip, lblock, mp);
470 end_of_metadata = ip->i_di.di_height - 1;
471
472 error = gfs2_meta_inode_buffer(ip, &bh);
473 if (error)
474 return error;
475
476 for (x = 0; x < end_of_metadata; x++) {
477 lookup_block(ip, bh, x, mp, create, &new, &dblock);
478 brelse(bh);
479 if (!dblock)
480 return 0;
481
482 error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
483 if (error)
484 return error;
485 }
486
487 boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
488 clear_buffer_mapped(bh_map);
489 clear_buffer_new(bh_map);
490 clear_buffer_boundary(bh_map);
491
492 if (dblock) {
493 map_bh(bh_map, inode->i_sb, dblock);
494 if (boundary)
495 set_buffer_boundary(bh);
496 if (new) {
497 struct buffer_head *dibh;
498 error = gfs2_meta_inode_buffer(ip, &dibh);
499 if (!error) {
500 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
501 gfs2_dinode_out(&ip->i_di, dibh->b_data);
502 brelse(dibh);
503 }
504 set_buffer_new(bh_map);
505 goto out_brelse;
506 }
507 while(--maxlen && !buffer_boundary(bh_map)) {
508 u64 eblock;
509
510 mp->mp_list[end_of_metadata]++;
511 boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
512 if (eblock != ++dblock)
513 break;
514 bh_map->b_size += (1 << inode->i_blkbits);
515 if (boundary)
516 set_buffer_boundary(bh_map);
517 }
518 }
519out_brelse:
520 brelse(bh);
521 return 0;
522}
523
524
525static inline void bmap_lock(struct inode *inode, int create)
526{
527 struct gfs2_inode *ip = GFS2_I(inode);
528 if (create)
529 down_write(&ip->i_rw_mutex);
530 else
531 down_read(&ip->i_rw_mutex);
532}
533
534static inline void bmap_unlock(struct inode *inode, int create)
535{
536 struct gfs2_inode *ip = GFS2_I(inode);
537 if (create)
538 up_write(&ip->i_rw_mutex);
539 else
540 up_read(&ip->i_rw_mutex);
541}
542
543int gfs2_block_map(struct inode *inode, u64 lblock, int create,
544 struct buffer_head *bh, unsigned int maxlen)
545{
546 struct metapath mp;
547 int ret;
548
549 bmap_lock(inode, create);
550 ret = gfs2_block_pointers(inode, lblock, create, bh, &mp, maxlen);
551 bmap_unlock(inode, create);
552 return ret;
553}
554
555int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
556{
557 struct metapath mp;
558 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0, .b_size = 0 };
559 int ret;
560 int create = *new;
561
562 BUG_ON(!extlen);
563 BUG_ON(!dblock);
564 BUG_ON(!new);
565
566 bmap_lock(inode, create);
567 ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp, 32);
568 bmap_unlock(inode, create);
569 *extlen = bh.b_size >> inode->i_blkbits;
570 *dblock = bh.b_blocknr;
571 if (buffer_new(&bh))
572 *new = 1;
573 else
574 *new = 0;
575 return ret;
576}
577
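gfs2_extent_map() has a slightly unusual calling convention: on entry *new acts as the create flag, and on return it reports whether a block was actually allocated. A hypothetical caller (the helper name is invented for illustration; the usual kernel headers are assumed) might look like this:

    /* Look up the on-disk extent backing logical block 0 of @inode,
     * without allocating anything. */
    static int peek_first_extent(struct inode *inode)
    {
        u64 dblock;
        unsigned int extlen;
        int new = 0;    /* on entry: create flag; 0 means read-only lookup */
        int error;

        error = gfs2_extent_map(inode, 0, &new, &dblock, &extlen);
        if (error)
            return error;
        if (!dblock)
            return -ENOENT; /* a hole: nothing mapped at this offset */
        /* Blocks dblock .. dblock + extlen - 1 are contiguous on disk;
         * new stays 0 here because no allocation was requested. */
        return 0;
    }
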
578/**
579 * recursive_scan - recursively scan through the end of a file
580 * @ip: the inode
581 * @dibh: the dinode buffer
582 * @mp: the path through the metadata to the point to start
583 * @height: the height the recursion is at
584 * @block: the indirect block to look at
585 * @first: 1 if this is the first block
586 * @bc: the call to make for each piece of metadata
587 * @data: data opaque to this function to pass to @bc
588 *
589 * When this is first called @height and @block should be zero and
590 * @first should be 1.
591 *
592 * Returns: errno
593 */
594
595static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
596 struct metapath *mp, unsigned int height,
597 u64 block, int first, block_call_t bc,
598 void *data)
599{
600 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
601 struct buffer_head *bh = NULL;
602 u64 *top, *bottom;
603 u64 bn;
604 int error;
605 int mh_size = sizeof(struct gfs2_meta_header);
606
607 if (!height) {
608 error = gfs2_meta_inode_buffer(ip, &bh);
609 if (error)
610 return error;
611 dibh = bh;
612
613 top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
614 bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
615 } else {
616 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
617 if (error)
618 return error;
619
620 top = (u64 *)(bh->b_data + mh_size) +
621 (first ? mp->mp_list[height] : 0);
622
623 bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
624 }
625
626 error = bc(ip, dibh, bh, top, bottom, height, data);
627 if (error)
628 goto out;
629
630 if (height < ip->i_di.di_height - 1)
631 for (; top < bottom; top++, first = 0) {
632 if (!*top)
633 continue;
634
635 bn = be64_to_cpu(*top);
636
637 error = recursive_scan(ip, dibh, mp, height + 1, bn,
638 first, bc, data);
639 if (error)
640 break;
641 }
642
643out:
644 brelse(bh);
645 return error;
646}
647
648/**
649 * do_strip - Look for a particular layer of the file and strip it off
650 * @ip: the inode
651 * @dibh: the dinode buffer
652 * @bh: A buffer of pointers
653 * @top: The first pointer in the buffer
654 * @bottom: One more than the last pointer
655 * @height: the height this buffer is at
656 * @data: a pointer to a struct strip_mine
657 *
658 * Returns: errno
659 */
660
661static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
662 struct buffer_head *bh, u64 *top, u64 *bottom,
663 unsigned int height, void *data)
664{
665 struct strip_mine *sm = data;
666 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
667 struct gfs2_rgrp_list rlist;
668 u64 bn, bstart;
669 u32 blen;
670 u64 *p;
671 unsigned int rg_blocks = 0;
672 int metadata;
673 unsigned int revokes = 0;
674 int x;
675 int error;
676
677 if (!*top)
678 sm->sm_first = 0;
679
680 if (height != sm->sm_height)
681 return 0;
682
683 if (sm->sm_first) {
684 top++;
685 sm->sm_first = 0;
686 }
687
688 metadata = (height != ip->i_di.di_height - 1);
689 if (metadata)
690 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
691
692 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
693 if (error)
694 return error;
695
696 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
697 bstart = 0;
698 blen = 0;
699
700 for (p = top; p < bottom; p++) {
701 if (!*p)
702 continue;
703
704 bn = be64_to_cpu(*p);
705
706 if (bstart + blen == bn)
707 blen++;
708 else {
709 if (bstart)
710 gfs2_rlist_add(sdp, &rlist, bstart);
711
712 bstart = bn;
713 blen = 1;
714 }
715 }
716
717 if (bstart)
718 gfs2_rlist_add(sdp, &rlist, bstart);
719 else
720 goto out; /* Nothing to do */
721
722 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
723
724 for (x = 0; x < rlist.rl_rgrps; x++) {
725 struct gfs2_rgrpd *rgd;
726 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
727 rg_blocks += rgd->rd_ri.ri_length;
728 }
729
730 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
731 if (error)
732 goto out_rlist;
733
734 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
735 RES_INDIRECT + RES_STATFS + RES_QUOTA,
736 revokes);
737 if (error)
738 goto out_rg_gunlock;
739
740 down_write(&ip->i_rw_mutex);
741
742 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
743 gfs2_trans_add_bh(ip->i_gl, bh, 1);
744
745 bstart = 0;
746 blen = 0;
747
748 for (p = top; p < bottom; p++) {
749 if (!*p)
750 continue;
751
752 bn = be64_to_cpu(*p);
753
754 if (bstart + blen == bn)
755 blen++;
756 else {
757 if (bstart) {
758 if (metadata)
759 gfs2_free_meta(ip, bstart, blen);
760 else
761 gfs2_free_data(ip, bstart, blen);
762 }
763
764 bstart = bn;
765 blen = 1;
766 }
767
768 *p = 0;
769 if (!ip->i_di.di_blocks)
770 gfs2_consist_inode(ip);
771 ip->i_di.di_blocks--;
772 }
773 if (bstart) {
774 if (metadata)
775 gfs2_free_meta(ip, bstart, blen);
776 else
777 gfs2_free_data(ip, bstart, blen);
778 }
779
780 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
781
782 gfs2_dinode_out(&ip->i_di, dibh->b_data);
783
784 up_write(&ip->i_rw_mutex);
785
786 gfs2_trans_end(sdp);
787
788out_rg_gunlock:
789 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
790out_rlist:
791 gfs2_rlist_free(&rlist);
792out:
793 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
794 return error;
795}
796
797/**
798 * do_grow - Make a file look bigger than it is
799 * @ip: the inode
800 * @size: the size to set the file to
801 *
802 * Called with an exclusive lock on @ip.
803 *
804 * Returns: errno
805 */
806
807static int do_grow(struct gfs2_inode *ip, u64 size)
808{
809 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
810 struct gfs2_alloc *al;
811 struct buffer_head *dibh;
812 unsigned int h;
813 int error;
814
815 al = gfs2_alloc_get(ip);
816
817 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
818 if (error)
819 goto out;
820
821 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
822 if (error)
823 goto out_gunlock_q;
824
825 al->al_requested = sdp->sd_max_height + RES_DATA;
826
827 error = gfs2_inplace_reserve(ip);
828 if (error)
829 goto out_gunlock_q;
830
831 error = gfs2_trans_begin(sdp,
832 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
833 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
834 if (error)
835 goto out_ipres;
836
837 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
838 if (gfs2_is_stuffed(ip)) {
839 error = gfs2_unstuff_dinode(ip, NULL);
840 if (error)
841 goto out_end_trans;
842 }
843
844 h = calc_tree_height(ip, size);
845 if (ip->i_di.di_height < h) {
846 down_write(&ip->i_rw_mutex);
847 error = build_height(&ip->i_inode, h);
848 up_write(&ip->i_rw_mutex);
849 if (error)
850 goto out_end_trans;
851 }
852 }
853
854 ip->i_di.di_size = size;
855 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
856
857 error = gfs2_meta_inode_buffer(ip, &dibh);
858 if (error)
859 goto out_end_trans;
860
861 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
862 gfs2_dinode_out(&ip->i_di, dibh->b_data);
863 brelse(dibh);
864
865out_end_trans:
866 gfs2_trans_end(sdp);
867out_ipres:
868 gfs2_inplace_release(ip);
869out_gunlock_q:
870 gfs2_quota_unlock(ip);
871out:
872 gfs2_alloc_put(ip);
873 return error;
874}
875
876
877/**
878 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
879 *
880 * This is partly borrowed from ext3.
881 */
882static int gfs2_block_truncate_page(struct address_space *mapping)
883{
884 struct inode *inode = mapping->host;
885 struct gfs2_inode *ip = GFS2_I(inode);
886 struct gfs2_sbd *sdp = GFS2_SB(inode);
887 loff_t from = inode->i_size;
888 unsigned long index = from >> PAGE_CACHE_SHIFT;
889 unsigned offset = from & (PAGE_CACHE_SIZE-1);
890 unsigned blocksize, iblock, length, pos;
891 struct buffer_head *bh;
892 struct page *page;
893 void *kaddr;
894 int err;
895
896 page = grab_cache_page(mapping, index);
897 if (!page)
898 return 0;
899
900 blocksize = inode->i_sb->s_blocksize;
901 length = blocksize - (offset & (blocksize - 1));
902 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
903
904 if (!page_has_buffers(page))
905 create_empty_buffers(page, blocksize, 0);
906
907 /* Find the buffer that contains "offset" */
908 bh = page_buffers(page);
909 pos = blocksize;
910 while (offset >= pos) {
911 bh = bh->b_this_page;
912 iblock++;
913 pos += blocksize;
914 }
915
916 err = 0;
917
918 if (!buffer_mapped(bh)) {
919 gfs2_get_block(inode, iblock, bh, 0);
920 /* unmapped? It's a hole - nothing to do */
921 if (!buffer_mapped(bh))
922 goto unlock;
923 }
924
925 /* Ok, it's mapped. Make sure it's up-to-date */
926 if (PageUptodate(page))
927 set_buffer_uptodate(bh);
928
929 if (!buffer_uptodate(bh)) {
930 err = -EIO;
931 ll_rw_block(READ, 1, &bh);
932 wait_on_buffer(bh);
933 /* Uhhuh. Read error. Complain and punt. */
934 if (!buffer_uptodate(bh))
935 goto unlock;
936 }
937
938 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
939 gfs2_trans_add_bh(ip->i_gl, bh, 0);
940
941 kaddr = kmap_atomic(page, KM_USER0);
942 memset(kaddr + offset, 0, length);
943 flush_dcache_page(page);
944 kunmap_atomic(kaddr, KM_USER0);
945
946unlock:
947 unlock_page(page);
948 page_cache_release(page);
949 return err;
950}
951
952static int trunc_start(struct gfs2_inode *ip, u64 size)
953{
954 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
955 struct buffer_head *dibh;
956 int journaled = gfs2_is_jdata(ip);
957 int error;
958
959 error = gfs2_trans_begin(sdp,
960 RES_DINODE + (journaled ? RES_JDATA : 0), 0);
961 if (error)
962 return error;
963
964 error = gfs2_meta_inode_buffer(ip, &dibh);
965 if (error)
966 goto out;
967
968 if (gfs2_is_stuffed(ip)) {
969 ip->i_di.di_size = size;
970 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
971 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
972 gfs2_dinode_out(&ip->i_di, dibh->b_data);
973 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
974 error = 1;
975
976 } else {
977 if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
978 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
979
980 if (!error) {
981 ip->i_di.di_size = size;
982 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
983 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
984 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
985 gfs2_dinode_out(&ip->i_di, dibh->b_data);
986 }
987 }
988
989 brelse(dibh);
990
991out:
992 gfs2_trans_end(sdp);
993 return error;
994}
995
996static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
997{
998 unsigned int height = ip->i_di.di_height;
999 u64 lblock;
1000 struct metapath mp;
1001 int error;
1002
1003 if (!size)
1004 lblock = 0;
1005 else
1006 lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
1007
1008 find_metapath(ip, lblock, &mp);
1009 gfs2_alloc_get(ip);
1010
1011 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1012 if (error)
1013 goto out;
1014
1015 while (height--) {
1016 struct strip_mine sm;
1017 sm.sm_first = !!size;
1018 sm.sm_height = height;
1019
1020 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
1021 if (error)
1022 break;
1023 }
1024
1025 gfs2_quota_unhold(ip);
1026
1027out:
1028 gfs2_alloc_put(ip);
1029 return error;
1030}
1031
1032static int trunc_end(struct gfs2_inode *ip)
1033{
1034 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1035 struct buffer_head *dibh;
1036 int error;
1037
1038 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1039 if (error)
1040 return error;
1041
1042 down_write(&ip->i_rw_mutex);
1043
1044 error = gfs2_meta_inode_buffer(ip, &dibh);
1045 if (error)
1046 goto out;
1047
1048 if (!ip->i_di.di_size) {
1049 ip->i_di.di_height = 0;
1050 ip->i_di.di_goal_meta =
1051 ip->i_di.di_goal_data =
1052 ip->i_num.no_addr;
1053 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1054 }
1055 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1056 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
1057
1058 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1059 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1060 brelse(dibh);
1061
1062out:
1063 up_write(&ip->i_rw_mutex);
1064 gfs2_trans_end(sdp);
1065 return error;
1066}
1067
1068/**
1069 * do_shrink - make a file smaller
1070 * @ip: the inode
1071 * @size: the size to make the file
1073 *
1074 * Called with an exclusive lock on @ip.
1075 *
1076 * Returns: errno
1077 */
1078
1079static int do_shrink(struct gfs2_inode *ip, u64 size)
1080{
1081 int error;
1082
1083 error = trunc_start(ip, size);
1084 if (error < 0)
1085 return error;
1086 if (error > 0)
1087 return 0;
1088
1089 error = trunc_dealloc(ip, size);
1090 if (!error)
1091 error = trunc_end(ip);
1092
1093 return error;
1094}
1095
1096/**
1097 * gfs2_truncatei - make a file a given size
1098 * @ip: the inode
1099 * @size: the size to make the file
1101 *
1102 * The file size can grow, shrink, or stay the same size.
1103 *
1104 * Returns: errno
1105 */
1106
1107int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1108{
1109 int error;
1110
1111 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
1112 return -EINVAL;
1113
1114 if (size > ip->i_di.di_size)
1115 error = do_grow(ip, size);
1116 else
1117 error = do_shrink(ip, size);
1118
1119 return error;
1120}
1121
1122int gfs2_truncatei_resume(struct gfs2_inode *ip)
1123{
1124 int error;
1125 error = trunc_dealloc(ip, ip->i_di.di_size);
1126 if (!error)
1127 error = trunc_end(ip);
1128 return error;
1129}
1130
1131int gfs2_file_dealloc(struct gfs2_inode *ip)
1132{
1133 return trunc_dealloc(ip, 0);
1134}
1135
1136/**
1137 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1138 * @ip: the file
1139 * @len: the number of bytes to be written to the file
1140 * @data_blocks: returns the number of data blocks required
1141 * @ind_blocks: returns the number of indirect blocks required
1142 *
1143 */
1144
1145void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1146 unsigned int *data_blocks, unsigned int *ind_blocks)
1147{
1148 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1149 unsigned int tmp;
1150
1151 if (gfs2_is_dir(ip)) {
1152 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1153 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1154 } else {
1155 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1156 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1157 }
1158
1159 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1160 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1161 *ind_blocks += tmp;
1162 }
1163}
1164
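To make the reservation arithmetic above concrete, here is a small user-space sketch for the regular-file case. The pointer counts are illustrative assumptions (483 direct pointers in a 4k dinode and 509 per indirect block, i.e. block size minus the header, divided by 8); the real values come from the superblock geometry, and a maximum metadata height of 10 (GFS2_MAX_META_HEIGHT) is assumed.

    #include <stdio.h>

    static unsigned int calc_reserve(unsigned int len, unsigned int max_height)
    {
        const unsigned int diptrs = 483, inptrs = 509; /* assumed geometry */
        unsigned int data = (len >> 12) + 3;           /* 4k blocks + slack */
        unsigned int ind = 3 * (max_height - 1);
        unsigned int tmp;

        for (tmp = data; tmp > diptrs;) {
            tmp = (tmp + inptrs - 1) / inptrs;         /* DIV_ROUND_UP */
            ind += tmp;
        }
        return data + ind;
    }

    int main(void)
    {
        /* A 16 MiB write: 4099 data blocks plus 27 + 9 indirect blocks. */
        printf("%u blocks\n", calc_reserve(16 << 20, 10));
        return 0;
    }
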
1165/**
1166 * gfs2_write_alloc_required - figure out if a write will require an allocation
1167 * @ip: the file being written to
1168 * @offset: the offset to write to
1169 * @len: the number of bytes being written
1170 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1171 *
1172 * Returns: errno
1173 */
1174
1175int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1176 unsigned int len, int *alloc_required)
1177{
1178 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1179 u64 lblock, lblock_stop, dblock;
1180 u32 extlen;
1181 int new = 0;
1182 int error = 0;
1183
1184 *alloc_required = 0;
1185
1186 if (!len)
1187 return 0;
1188
1189 if (gfs2_is_stuffed(ip)) {
1190 if (offset + len >
1191 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1192 *alloc_required = 1;
1193 return 0;
1194 }
1195
1196 if (gfs2_is_dir(ip)) {
1197 unsigned int bsize = sdp->sd_jbsize;
1198 lblock = offset;
1199 do_div(lblock, bsize);
1200 lblock_stop = offset + len + bsize - 1;
1201 do_div(lblock_stop, bsize);
1202 } else {
1203 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1204 lblock = offset >> shift;
1205 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1206 }
1207
1208 for (; lblock < lblock_stop; lblock += extlen) {
1209 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
1210 if (error)
1211 return error;
1212
1213 if (!dblock) {
1214 *alloc_required = 1;
1215 return 0;
1216 }
1217 }
1218
1219 return 0;
1220}
1221
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..0fd379b4cd9e
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13struct inode;
14struct gfs2_inode;
15struct page;
16
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh, unsigned int maxlen);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
20
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
22int gfs2_truncatei_resume(struct gfs2_inode *ip);
23int gfs2_file_dealloc(struct gfs2_inode *ip);
24
25void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
26 unsigned int *data_blocks,
27 unsigned int *ind_blocks);
28int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
29 unsigned int len, int *alloc_required);
30
31#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cab1f68d4685
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "daemon.h"
23#include "glock.h"
24#include "log.h"
25#include "quota.h"
26#include "recovery.h"
27#include "super.h"
28#include "util.h"
29
30/* This uses schedule_timeout() instead of msleep() because it's good for
31   the daemons to wake up more often than the timeout when unmounting, so
32   that the user's unmount doesn't sit there forever.
33
34 The kthread functions used to start these daemons block and flush signals. */
35
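For context, a daemon like these is started with the kthread API; a hypothetical call site (the real ones live elsewhere in this series) would be:

    /* kthread_run() spawns the thread; a later kthread_stop() makes
     * kthread_should_stop() return true so the loop below exits. */
    struct task_struct *p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
    if (IS_ERR(p))
        return PTR_ERR(p);
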
36/**
37 * gfs2_scand - Look for cached glocks and inodes to toss from memory
38 * @sdp: Pointer to GFS2 superblock
39 *
40 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
41 * See gfs2_glockd()
42 */
43
44int gfs2_scand(void *data)
45{
46 struct gfs2_sbd *sdp = data;
47 unsigned long t;
48
49 while (!kthread_should_stop()) {
50 gfs2_scand_internal(sdp);
51 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
52 schedule_timeout_interruptible(t);
53 }
54
55 return 0;
56}
57
58/**
59 * gfs2_glockd - Reclaim unused glock structures
60 * @sdp: Pointer to GFS2 superblock
61 *
62 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
63 * The number of daemons can be set by the user with the num_glockd mount option.
64 */
65
66int gfs2_glockd(void *data)
67{
68 struct gfs2_sbd *sdp = data;
69
70 while (!kthread_should_stop()) {
71 while (atomic_read(&sdp->sd_reclaim_count))
72 gfs2_reclaim_glock(sdp);
73
74 wait_event_interruptible(sdp->sd_reclaim_wq,
75 (atomic_read(&sdp->sd_reclaim_count) ||
76 kthread_should_stop()));
77 }
78
79 return 0;
80}
81
82/**
83 * gfs2_recoverd - Recover a dead machine's journals
84 * @sdp: Pointer to GFS2 superblock
85 *
86 */
87
88int gfs2_recoverd(void *data)
89{
90 struct gfs2_sbd *sdp = data;
91 unsigned long t;
92
93 while (!kthread_should_stop()) {
94 gfs2_check_journals(sdp);
95 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
96 schedule_timeout_interruptible(t);
97 }
98
99 return 0;
100}
101
102/**
103 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
104 * @sdp: Pointer to GFS2 superblock
105 *
106 * Also, periodically check to make sure that we're using the most recent
107 * journal index.
108 */
109
110int gfs2_logd(void *data)
111{
112 struct gfs2_sbd *sdp = data;
113 struct gfs2_holder ji_gh;
114 unsigned long t;
115
116 while (!kthread_should_stop()) {
117 /* Advance the log tail */
118
119 t = sdp->sd_log_flush_time +
120 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
121
122 gfs2_ail1_empty(sdp, DIO_ALL);
123
124 if (time_after_eq(jiffies, t)) {
125 gfs2_log_flush(sdp, NULL);
126 sdp->sd_log_flush_time = jiffies;
127 }
128
129 /* Check for latest journal index */
130
131 t = sdp->sd_jindex_refresh_time +
132 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
133
134 if (time_after_eq(jiffies, t)) {
135 if (!gfs2_jindex_hold(sdp, &ji_gh))
136 gfs2_glock_dq_uninit(&ji_gh);
137 sdp->sd_jindex_refresh_time = jiffies;
138 }
139
140 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
141 schedule_timeout_interruptible(t);
142 }
143
144 return 0;
145}
146
147/**
148 * gfs2_quotad - Write cached quota changes into the quota file
149 * @sdp: Pointer to GFS2 superblock
150 *
151 */
152
153int gfs2_quotad(void *data)
154{
155 struct gfs2_sbd *sdp = data;
156 unsigned long t;
157 int error;
158
159 while (!kthread_should_stop()) {
160 /* Update the master statfs file */
161
162 t = sdp->sd_statfs_sync_time +
163 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
164
165 if (time_after_eq(jiffies, t)) {
166 error = gfs2_statfs_sync(sdp);
167 if (error &&
168 error != -EROFS &&
169 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
170 fs_err(sdp, "quotad: (1) error=%d\n", error);
171 sdp->sd_statfs_sync_time = jiffies;
172 }
173
174 /* Update quota file */
175
176 t = sdp->sd_quota_sync_time +
177 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
178
179 if (time_after_eq(jiffies, t)) {
180 error = gfs2_quota_sync(sdp);
181 if (error &&
182 error != -EROFS &&
183 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
184 fs_err(sdp, "quotad: (2) error=%d\n", error);
185 sdp->sd_quota_sync_time = jiffies;
186 }
187
188 gfs2_quota_scan(sdp);
189
190 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
191 schedule_timeout_interruptible(t);
192 }
193
194 return 0;
195}
196
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..801007120fb2
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18
19#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..459498cac93b
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1961 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 * "Extendible Hashing" by Fagin, et al in
13 * __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, ie
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_inum.no_addr is set to zero to designate it as deleted. When
28 * allocating a dirent, gfs2_dirent_alloc iterates through the dirents in a
29 * block. If the first dirent has (de_inum.no_addr == 0) and de_rec_len is
30 * large enough, this first dirent is allocated. Otherwise it must go through
31 * all the 'used' dirents searching for one in which the amount of total space
32 * minus the amount of used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled unless it is already at the maximum size which is hard coded into
52 * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
53 * but never before the maximum hash table size has been reached.
54 */
55
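The dirent walk described above reduces to a simple loop over de_rec_len. The following stand-alone sketch (not the kernel's struct: the type is renamed and fields are shown in host byte order, whereas on disk they are big-endian and read via be16_to_cpu() and friends) illustrates how deleted space is skipped implicitly:

    #include <stdint.h>

    struct dirent_sketch {
        uint64_t de_ino;        /* 0 => deleted first dirent */
        uint16_t de_rec_len;    /* bytes allocated to this entry */
        uint16_t de_name_len;   /* bytes actually used by the name */
    };

    static void walk_dirents(const char *block, unsigned int blksize,
                             unsigned int first)
    {
        unsigned int offset = first; /* sizeof(dinode) or sizeof(leaf) */

        while (offset + sizeof(struct dirent_sketch) <= blksize) {
            const struct dirent_sketch *d =
                (const void *)(block + offset);
            if (d->de_ino) {
                /* live entry: the name is the de_name_len bytes at d+1 */
            }
            if (!d->de_rec_len)
                break;               /* corrupt block: bail out */
            /* Deleted space needs no special case: a deleted dirent's
             * bytes were merged into its predecessor's de_rec_len. */
            offset += d->de_rec_len;
        }
    }
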
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/buffer_head.h>
60#include <linux/sort.h>
61#include <linux/gfs2_ondisk.h>
62#include <linux/crc32.h>
63#include <linux/vmalloc.h>
64#include <linux/lm_interface.h>
65
66#include "gfs2.h"
67#include "incore.h"
68#include "dir.h"
69#include "glock.h"
70#include "inode.h"
71#include "meta_io.h"
72#include "quota.h"
73#include "rgrp.h"
74#include "trans.h"
75#include "bmap.h"
76#include "util.h"
77
78#define IS_LEAF 1 /* Hashed (leaf) directory */
79#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
80
81#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
82#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
83
84typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
85 u64 leaf_no, void *data);
86typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
87 const struct qstr *name, void *opaque);
88
89
90int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
91 struct buffer_head **bhp)
92{
93 struct buffer_head *bh;
94
95 bh = gfs2_meta_new(ip->i_gl, block);
96 gfs2_trans_add_bh(ip->i_gl, bh, 1);
97 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
98 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
99 *bhp = bh;
100 return 0;
101}
102
103static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
104 struct buffer_head **bhp)
105{
106 struct buffer_head *bh;
107 int error;
108
109 error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
110 if (error)
111 return error;
112 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
113 brelse(bh);
114 return -EIO;
115 }
116 *bhp = bh;
117 return 0;
118}
119
120static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
121 unsigned int offset, unsigned int size)
122{
123 struct buffer_head *dibh;
124 int error;
125
126 error = gfs2_meta_inode_buffer(ip, &dibh);
127 if (error)
128 return error;
129
130 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
131 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
132 if (ip->i_di.di_size < offset + size)
133 ip->i_di.di_size = offset + size;
134 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
135 gfs2_dinode_out(&ip->i_di, dibh->b_data);
136
137 brelse(dibh);
138
139 return size;
140}
141
142
143
144/**
145 * gfs2_dir_write_data - Write directory information to the inode
146 * @ip: The GFS2 inode
147 * @buf: The buffer containing information to be written
148 * @offset: The file offset to start writing at
149 * @size: The amount of data to write
150 *
151 * Returns: The number of bytes correctly written or an error code
152 */
153static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
154 u64 offset, unsigned int size)
155{
156 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
157 struct buffer_head *dibh;
158 u64 lblock, dblock;
159 u32 extlen = 0;
160 unsigned int o;
161 int copied = 0;
162 int error = 0;
163
164 if (!size)
165 return 0;
166
167 if (gfs2_is_stuffed(ip) &&
168 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
169 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
170 size);
171
172 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
173 return -EINVAL;
174
175 if (gfs2_is_stuffed(ip)) {
176 error = gfs2_unstuff_dinode(ip, NULL);
177 if (error)
178 return error;
179 }
180
181 lblock = offset;
182 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
183
184 while (copied < size) {
185 unsigned int amount;
186 struct buffer_head *bh;
187 int new;
188
189 amount = size - copied;
190 if (amount > sdp->sd_sb.sb_bsize - o)
191 amount = sdp->sd_sb.sb_bsize - o;
192
193 if (!extlen) {
194 new = 1;
195 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
196 &dblock, &extlen);
197 if (error)
198 goto fail;
199 error = -EIO;
200 if (gfs2_assert_withdraw(sdp, dblock))
201 goto fail;
202 }
203
204 if (amount == sdp->sd_jbsize || new)
205 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
206 else
207 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
208
209 if (error)
210 goto fail;
211
212 gfs2_trans_add_bh(ip->i_gl, bh, 1);
213 memcpy(bh->b_data + o, buf, amount);
214 brelse(bh);
215 if (error)
216 goto fail;
217
218 buf += amount;
219 copied += amount;
220 lblock++;
221 dblock++;
222 extlen--;
223
224 o = sizeof(struct gfs2_meta_header);
225 }
226
227out:
228 error = gfs2_meta_inode_buffer(ip, &dibh);
229 if (error)
230 return error;
231
232 if (ip->i_di.di_size < offset + copied)
233 ip->i_di.di_size = offset + copied;
234 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
235
236 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
237 gfs2_dinode_out(&ip->i_di, dibh->b_data);
238 brelse(dibh);
239
240 return copied;
241fail:
242 if (copied)
243 goto out;
244 return error;
245}
246
247static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
248 u64 offset, unsigned int size)
249{
250 struct buffer_head *dibh;
251 int error;
252
253 error = gfs2_meta_inode_buffer(ip, &dibh);
254 if (!error) {
255 offset += sizeof(struct gfs2_dinode);
256 memcpy(buf, dibh->b_data + offset, size);
257 brelse(dibh);
258 }
259
260 return (error) ? error : size;
261}
262
263
264/**
265 * gfs2_dir_read_data - Read data from a directory inode
266 * @ip: The GFS2 inode
267 * @buf: The buffer to place the result into
268 * @offset: File offset to begin reading from
269 * @size: Amount of data to transfer
270 *
271 * Returns: The amount of data actually copied or the error
272 */
273static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
274 unsigned int size, unsigned ra)
275{
276 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
277 u64 lblock, dblock;
278 u32 extlen = 0;
279 unsigned int o;
280 int copied = 0;
281 int error = 0;
282
283 if (offset >= ip->i_di.di_size)
284 return 0;
285
286 if (offset + size > ip->i_di.di_size)
287 size = ip->i_di.di_size - offset;
288
289 if (!size)
290 return 0;
291
292 if (gfs2_is_stuffed(ip))
293 return gfs2_dir_read_stuffed(ip, buf, offset, size);
294
295 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
296 return -EINVAL;
297
298 lblock = offset;
299 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
300
301 while (copied < size) {
302 unsigned int amount;
303 struct buffer_head *bh;
304 int new;
305
306 amount = size - copied;
307 if (amount > sdp->sd_sb.sb_bsize - o)
308 amount = sdp->sd_sb.sb_bsize - o;
309
310 if (!extlen) {
311 new = 0;
312 error = gfs2_extent_map(&ip->i_inode, lblock, &new,
313 &dblock, &extlen);
314 if (error || !dblock)
315 goto fail;
316 BUG_ON(extlen < 1);
317 if (!ra)
318 extlen = 1;
319 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
320 }
321 if (!bh) {
322 error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
323 if (error)
324 goto fail;
325 }
326 error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
327 if (error) {
328 brelse(bh);
329 goto fail;
330 }
331 dblock++;
332 extlen--;
333 memcpy(buf, bh->b_data + o, amount);
334 brelse(bh);
335 bh = NULL;
336 buf += amount;
337 copied += amount;
338 lblock++;
339 o = sizeof(struct gfs2_meta_header);
340 }
341
342 return copied;
343fail:
344 return (copied) ? copied : error;
345}
346
347static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
348 const struct qstr *name, int ret)
349{
350 if (dent->de_inum.no_addr != 0 &&
351 be32_to_cpu(dent->de_hash) == name->hash &&
352 be16_to_cpu(dent->de_name_len) == name->len &&
353 memcmp(dent+1, name->name, name->len) == 0)
354 return ret;
355 return 0;
356}
357
358static int gfs2_dirent_find(const struct gfs2_dirent *dent,
359 const struct qstr *name,
360 void *opaque)
361{
362 return __gfs2_dirent_find(dent, name, 1);
363}
364
365static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
366 const struct qstr *name,
367 void *opaque)
368{
369 return __gfs2_dirent_find(dent, name, 2);
370}
371
372/*
373 * name->name holds ptr to start of block.
374 * name->len holds size of block.
375 */
376static int gfs2_dirent_last(const struct gfs2_dirent *dent,
377 const struct qstr *name,
378 void *opaque)
379{
380 const char *start = name->name;
381 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
382 if (name->len == (end - start))
383 return 1;
384 return 0;
385}
386
387static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
388 const struct qstr *name,
389 void *opaque)
390{
391 unsigned required = GFS2_DIRENT_SIZE(name->len);
392 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
393 unsigned totlen = be16_to_cpu(dent->de_rec_len);
394
395 if (!dent->de_inum.no_addr)
396 actual = GFS2_DIRENT_SIZE(0);
397 if (totlen - actual >= required)
398 return 1;
399 return 0;
400}
401
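gfs2_dirent_find_space works purely on record lengths: a live entry's record is often longer than the space its name needs, and the slack can host a new dirent. A standalone sketch of the same test; the dirent header size and the 8-byte rounding of GFS2_DIRENT_SIZE are assumptions for illustration.

#include <stdio.h>

#define DIRENT_HDR 40u	/* assumed sizeof(struct gfs2_dirent) */
#define DIRENT_SIZE(name_len) (((DIRENT_HDR + (name_len)) + 7u) & ~7u)

int main(void)
{
	unsigned int rec_len  = 128;	/* record length of the entry scanned */
	unsigned int name_len = 11;	/* its live name length */
	unsigned int new_name = 9;	/* length of the name we want to add */

	unsigned int actual   = DIRENT_SIZE(name_len);	/* space really in use */
	unsigned int required = DIRENT_SIZE(new_name);	/* space the insert needs */

	printf("fits: %s\n", rec_len - actual >= required ? "yes" : "no");
	return 0;
}
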
402struct dirent_gather {
403 const struct gfs2_dirent **pdent;
404 unsigned offset;
405};
406
407static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
408 const struct qstr *name,
409 void *opaque)
410{
411 struct dirent_gather *g = opaque;
412 if (dent->de_inum.no_addr) {
413 g->pdent[g->offset++] = dent;
414 }
415 return 0;
416}
417
418/*
419 * Other possible things to check:
420 * - Inode located within filesystem size (and on valid block)
421 * - Valid directory entry type
422 * Not sure how heavy-weight we want to make this... could also check
423 * hash is correct for example, but that would take a lot of extra time.
424 * For now the most important thing is to check that the various sizes
425 * are correct.
426 */
427static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
428 unsigned int size, unsigned int len, int first)
429{
430 const char *msg = "gfs2_dirent too small";
431 if (unlikely(size < sizeof(struct gfs2_dirent)))
432 goto error;
433 msg = "gfs2_dirent misaligned";
434 if (unlikely(offset & 0x7))
435 goto error;
436 msg = "gfs2_dirent points beyond end of block";
437 if (unlikely(offset + size > len))
438 goto error;
439 msg = "zero inode number";
440 if (unlikely(!first && !dent->de_inum.no_addr))
441 goto error;
442 msg = "name length is greater than space in dirent";
443 if (dent->de_inum.no_addr &&
444 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
445 size))
446 goto error;
447 return 0;
448error:
449 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
450 first ? "first in block" : "not first in block");
451 return -EIO;
452}
453
454static int gfs2_dirent_offset(const void *buf)
455{
456 const struct gfs2_meta_header *h = buf;
457 int offset;
458
459 BUG_ON(buf == NULL);
460
461 switch(be32_to_cpu(h->mh_type)) {
462 case GFS2_METATYPE_LF:
463 offset = sizeof(struct gfs2_leaf);
464 break;
465 case GFS2_METATYPE_DI:
466 offset = sizeof(struct gfs2_dinode);
467 break;
468 default:
469 goto wrong_type;
470 }
471 return offset;
472wrong_type:
473 printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
474 be32_to_cpu(h->mh_type));
475 return -1;
476}
477
478static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
479 unsigned int len, gfs2_dscan_t scan,
480 const struct qstr *name,
481 void *opaque)
482{
483 struct gfs2_dirent *dent, *prev;
484 unsigned offset;
485 unsigned size;
486 int ret = 0;
487
488 ret = gfs2_dirent_offset(buf);
489 if (ret < 0)
490 goto consist_inode;
491
492 offset = ret;
493 prev = NULL;
494 dent = buf + offset;
495 size = be16_to_cpu(dent->de_rec_len);
496 if (gfs2_check_dirent(dent, offset, size, len, 1))
497 goto consist_inode;
498 do {
499 ret = scan(dent, name, opaque);
500 if (ret)
501 break;
502 offset += size;
503 if (offset == len)
504 break;
505 prev = dent;
506 dent = buf + offset;
507 size = be16_to_cpu(dent->de_rec_len);
508 if (gfs2_check_dirent(dent, offset, size, len, 0))
509 goto consist_inode;
510 } while(1);
511
512 switch(ret) {
513 case 0:
514 return NULL;
515 case 1:
516 return dent;
517 case 2:
518 return prev ? prev : dent;
519 default:
520 BUG_ON(ret > 0);
521 return ERR_PTR(ret);
522 }
523
524consist_inode:
525 gfs2_consist_inode(GFS2_I(inode));
526 return ERR_PTR(-EIO);
527}
528
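The callback protocol of gfs2_dirent_scan is: return 0 to keep walking, 1 to stop at the current dirent, 2 to stop at the previous one, or a negative errno. A hypothetical callback in the same style as the ones above (not part of the patch) that counts the live entries in a block:

static int count_live_dirents(const struct gfs2_dirent *dent,
			      const struct qstr *name, void *opaque)
{
	unsigned int *count = opaque;

	if (dent->de_inum.no_addr)	/* deleted entries have no_addr == 0 */
		(*count)++;
	return 0;			/* 0 => visit every entry in the block */
}

Called as gfs2_dirent_scan(inode, bh->b_data, bh->b_size, count_live_dirents, NULL, &count), it would return NULL once the whole block has been walked, leaving the tally in count.
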
529
530/**
531 * dirent_first - Return the first dirent
532 * @dip: the directory
533 * @bh: The buffer
534 * @dent: Pointer to list of dirents
535 *
536 * Returns the first dirent, whether bh points to a leaf or a stuffed dinode
537 *
538 * Returns: IS_LEAF, IS_DINODE, or -errno
539 */
540
541static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
542 struct gfs2_dirent **dent)
543{
544 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
545
546 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
547 if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
548 return -EIO;
549 *dent = (struct gfs2_dirent *)(bh->b_data +
550 sizeof(struct gfs2_leaf));
551 return IS_LEAF;
552 } else {
553 if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
554 return -EIO;
555 *dent = (struct gfs2_dirent *)(bh->b_data +
556 sizeof(struct gfs2_dinode));
557 return IS_DINODE;
558 }
559}
560
561static int dirent_check_reclen(struct gfs2_inode *dip,
562 const struct gfs2_dirent *d, const void *end_p)
563{
564 const void *ptr = d;
565 u16 rec_len = be16_to_cpu(d->de_rec_len);
566
567 if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
568 goto broken;
569 ptr += rec_len;
570 if (ptr < end_p)
571 return rec_len;
572 if (ptr == end_p)
573 return -ENOENT;
574broken:
575 gfs2_consist_inode(dip);
576 return -EIO;
577}
578
579/**
580 * dirent_next - Next dirent
581 * @dip: the directory
582 * @bh: The buffer
583 * @dent: Pointer to list of dirents
584 *
585 * Returns: 0 on success, error code otherwise
586 */
587
588static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
589 struct gfs2_dirent **dent)
590{
591 struct gfs2_dirent *cur = *dent, *tmp;
592 char *bh_end = bh->b_data + bh->b_size;
593 int ret;
594
595 ret = dirent_check_reclen(dip, cur, bh_end);
596 if (ret < 0)
597 return ret;
598
599 tmp = (void *)cur + ret;
600 ret = dirent_check_reclen(dip, tmp, bh_end);
601 if (ret == -EIO)
602 return ret;
603
604 /* Only the first dent could ever have de_inum.no_addr == 0 */
605 if (!tmp->de_inum.no_addr) {
606 gfs2_consist_inode(dip);
607 return -EIO;
608 }
609
610 *dent = tmp;
611 return 0;
612}
613
614/**
615 * dirent_del - Delete a dirent
616 * @dip: The GFS2 inode
617 * @bh: The buffer
618 * @prev: The previous dirent
619 * @cur: The current dirent
620 *
621 */
622
623static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
624 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
625{
626 u16 cur_rec_len, prev_rec_len;
627
628 if (!cur->de_inum.no_addr) {
629 gfs2_consist_inode(dip);
630 return;
631 }
632
633 gfs2_trans_add_bh(dip->i_gl, bh, 1);
634
635 /* If there is no prev entry, this is the first entry in the block.
636 The de_rec_len is already as big as it needs to be. Just zero
637 out the inode number and return. */
638
639 if (!prev) {
640		cur->de_inum.no_addr = 0;	/* No endianness worries */
641 return;
642 }
643
644 /* Combine this dentry with the previous one. */
645
646 prev_rec_len = be16_to_cpu(prev->de_rec_len);
647 cur_rec_len = be16_to_cpu(cur->de_rec_len);
648
649 if ((char *)prev + prev_rec_len != (char *)cur)
650 gfs2_consist_inode(dip);
651 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
652 gfs2_consist_inode(dip);
653
654 prev_rec_len += cur_rec_len;
655 prev->de_rec_len = cpu_to_be16(prev_rec_len);
656}
657
658/*
659 * Takes a dent from which to grab space as an argument. Returns the
660 * newly created dent.
661 */
662static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
663 struct gfs2_dirent *dent,
664 const struct qstr *name,
665 struct buffer_head *bh)
666{
667 struct gfs2_inode *ip = GFS2_I(inode);
668 struct gfs2_dirent *ndent;
669 unsigned offset = 0, totlen;
670
671 if (dent->de_inum.no_addr)
672 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
673 totlen = be16_to_cpu(dent->de_rec_len);
674 BUG_ON(offset + name->len > totlen);
675 gfs2_trans_add_bh(ip->i_gl, bh, 1);
676 ndent = (struct gfs2_dirent *)((char *)dent + offset);
677 dent->de_rec_len = cpu_to_be16(offset);
678 gfs2_qstr2dirent(name, totlen - offset, ndent);
679 return ndent;
680}
681
682static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
683 struct buffer_head *bh,
684 const struct qstr *name)
685{
686 struct gfs2_dirent *dent;
687 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
688 gfs2_dirent_find_space, name, NULL);
689 if (!dent || IS_ERR(dent))
690 return dent;
691 return gfs2_init_dirent(inode, dent, name, bh);
692}
693
694static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
695 struct buffer_head **bhp)
696{
697 int error;
698
699 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
700 if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
701 /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
702 error = -EIO;
703 }
704
705 return error;
706}
707
708/**
709 * get_leaf_nr - Get a leaf number associated with the index
710 * @dip: The GFS2 inode
711 * @index: hash table index of the leaf
712 * @leaf_out: filled in with the leaf block number
713 *
714 * Returns: 0 on success, error code otherwise
715 */
716
717static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
718 u64 *leaf_out)
719{
720 u64 leaf_no;
721 int error;
722
723 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
724 index * sizeof(u64),
725 sizeof(u64), 0);
726 if (error != sizeof(u64))
727 return (error < 0) ? error : -EIO;
728
729 *leaf_out = be64_to_cpu(leaf_no);
730
731 return 0;
732}
733
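get_leaf_nr makes the hash table's layout explicit: it is a flat array of big-endian u64 leaf block numbers, so slot i lives at byte offset i * sizeof(u64). A standalone sketch of decoding one slot; the slot bytes are made up for the example.

#include <stdint.h>
#include <stdio.h>

/* Decode a big-endian u64 the way be64_to_cpu() does in the kernel. */
static uint64_t load_be64(const unsigned char *p)
{
	uint64_t r = 0;
	for (int i = 0; i < 8; i++)
		r = (r << 8) | p[i];
	return r;
}

int main(void)
{
	/* one hash-table slot as it would arrive from disk (assumed value) */
	unsigned char slot[8] = { 0, 0, 0, 0, 0, 0, 0x30, 0x39 };

	printf("leaf block = %llu\n",		/* prints 12345 */
	       (unsigned long long)load_be64(slot));
	return 0;
}
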
734static int get_first_leaf(struct gfs2_inode *dip, u32 index,
735 struct buffer_head **bh_out)
736{
737 u64 leaf_no;
738 int error;
739
740 error = get_leaf_nr(dip, index, &leaf_no);
741 if (!error)
742 error = get_leaf(dip, leaf_no, bh_out);
743
744 return error;
745}
746
747static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
748 const struct qstr *name,
749 gfs2_dscan_t scan,
750 struct buffer_head **pbh)
751{
752 struct buffer_head *bh;
753 struct gfs2_dirent *dent;
754 struct gfs2_inode *ip = GFS2_I(inode);
755 int error;
756
757 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
758 struct gfs2_leaf *leaf;
759 unsigned hsize = 1 << ip->i_di.di_depth;
760 unsigned index;
761 u64 ln;
762 if (hsize * sizeof(u64) != ip->i_di.di_size) {
763 gfs2_consist_inode(ip);
764 return ERR_PTR(-EIO);
765 }
766
767 index = name->hash >> (32 - ip->i_di.di_depth);
768 error = get_first_leaf(ip, index, &bh);
769 if (error)
770 return ERR_PTR(error);
771 do {
772 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
773 scan, name, NULL);
774 if (dent)
775 goto got_dent;
776 leaf = (struct gfs2_leaf *)bh->b_data;
777 ln = be64_to_cpu(leaf->lf_next);
778 brelse(bh);
779 if (!ln)
780 break;
781
782 error = get_leaf(ip, ln, &bh);
783 } while(!error);
784
785 return error ? ERR_PTR(error) : NULL;
786 }
787
788
789 error = gfs2_meta_inode_buffer(ip, &bh);
790 if (error)
791 return ERR_PTR(error);
792 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
793got_dent:
794 if (unlikely(dent == NULL || IS_ERR(dent))) {
795 brelse(bh);
796 bh = NULL;
797 }
798 *pbh = bh;
799 return dent;
800}
801
802static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
803{
804 struct gfs2_inode *ip = GFS2_I(inode);
805 u64 bn = gfs2_alloc_meta(ip);
806 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
807 struct gfs2_leaf *leaf;
808 struct gfs2_dirent *dent;
809 struct qstr name = { .name = "", .len = 0, .hash = 0 };
810 if (!bh)
811 return NULL;
812
813 gfs2_trans_add_bh(ip->i_gl, bh, 1);
814 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
815 leaf = (struct gfs2_leaf *)bh->b_data;
816 leaf->lf_depth = cpu_to_be16(depth);
817 leaf->lf_entries = 0;
818 leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
819 leaf->lf_next = 0;
820 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
821 dent = (struct gfs2_dirent *)(leaf+1);
822 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
823 *pbh = bh;
824 return leaf;
825}
826
827/**
828 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
829 * @dip: The GFS2 inode
830 *
831 * Returns: 0 on success, error code otherwise
832 */
833
834static int dir_make_exhash(struct inode *inode)
835{
836 struct gfs2_inode *dip = GFS2_I(inode);
837 struct gfs2_sbd *sdp = GFS2_SB(inode);
838 struct gfs2_dirent *dent;
839 struct qstr args;
840 struct buffer_head *bh, *dibh;
841 struct gfs2_leaf *leaf;
842 int y;
843 u32 x;
844 u64 *lp, bn;
845 int error;
846
847 error = gfs2_meta_inode_buffer(dip, &dibh);
848 if (error)
849 return error;
850
851 /* Turn over a new leaf */
852
853 leaf = new_leaf(inode, &bh, 0);
854 if (!leaf)
855 return -ENOSPC;
856 bn = bh->b_blocknr;
857
858 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
859 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
860
861 /* Copy dirents */
862
863 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
864 sizeof(struct gfs2_dinode));
865
866 /* Find last entry */
867
868 x = 0;
869 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
870 sizeof(struct gfs2_leaf);
871 args.name = bh->b_data;
872 dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
873 gfs2_dirent_last, &args, NULL);
874 if (!dent) {
875 brelse(bh);
876 brelse(dibh);
877 return -EIO;
878 }
879 if (IS_ERR(dent)) {
880 brelse(bh);
881 brelse(dibh);
882 return PTR_ERR(dent);
883 }
884
885 /* Adjust the last dirent's record length
886 (Remember that dent still points to the last entry.) */
887
888 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
889 sizeof(struct gfs2_dinode) -
890 sizeof(struct gfs2_leaf));
891
892 brelse(bh);
893
894	/* We're done with the new leaf block, now set up the new
895 hash table. */
896
897 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
898 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
899
900 lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
901
902 for (x = sdp->sd_hash_ptrs; x--; lp++)
903 *lp = cpu_to_be64(bn);
904
905 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
906 dip->i_di.di_blocks++;
907 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
908 dip->i_di.di_payload_format = 0;
909
910 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
911 dip->i_di.di_depth = y;
912
913 gfs2_dinode_out(&dip->i_di, dibh->b_data);
914
915 brelse(dibh);
916
917 return 0;
918}
919
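The final loop of dir_make_exhash is a compact log2: it counts how many times sd_hash_ptrs can be halved, and that count becomes the initial di_depth. A standalone sketch with an assumed pointer count:

#include <stdio.h>

int main(void)
{
	unsigned int hash_ptrs = 512;	/* assumed sd_hash_ptrs, a power of two */
	unsigned int x;
	int depth;

	for (x = hash_ptrs, depth = -1; x; x >>= 1, depth++)
		;			/* same idiom as dir_make_exhash() */

	printf("initial di_depth = %d\n", depth);	/* prints 9 == log2(512) */
	return 0;
}
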
920/**
921 * dir_split_leaf - Split a leaf block into two
922 * @inode: The directory inode
923 * @name: The name whose hash selects the leaf to split
925 *
926 * Returns: 0 on success, error code on failure
927 */
928
929static int dir_split_leaf(struct inode *inode, const struct qstr *name)
930{
931 struct gfs2_inode *dip = GFS2_I(inode);
932 struct buffer_head *nbh, *obh, *dibh;
933 struct gfs2_leaf *nleaf, *oleaf;
934 struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
935 u32 start, len, half_len, divider;
936 u64 bn, *lp, leaf_no;
937 u32 index;
938 int x, moved = 0;
939 int error;
940
941 index = name->hash >> (32 - dip->i_di.di_depth);
942 error = get_leaf_nr(dip, index, &leaf_no);
943 if (error)
944 return error;
945
946 /* Get the old leaf block */
947 error = get_leaf(dip, leaf_no, &obh);
948 if (error)
949 return error;
950
951 oleaf = (struct gfs2_leaf *)obh->b_data;
952 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
953 brelse(obh);
954 return 1; /* can't split */
955 }
956
957 gfs2_trans_add_bh(dip->i_gl, obh, 1);
958
959 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
960 if (!nleaf) {
961 brelse(obh);
962 return -ENOSPC;
963 }
964 bn = nbh->b_blocknr;
965
966 /* Compute the start and len of leaf pointers in the hash table. */
967 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
968 half_len = len >> 1;
969 if (!half_len) {
970 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
971 gfs2_consist_inode(dip);
972 error = -EIO;
973 goto fail_brelse;
974 }
975
976 start = (index & ~(len - 1));
977
978 /* Change the pointers.
979 Don't bother distinguishing stuffed from non-stuffed.
980 This code is complicated enough already. */
981 lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
983 for (x = 0; x < half_len; x++)
984 lp[x] = cpu_to_be64(bn);
985
986 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
987 half_len * sizeof(u64));
988 if (error != half_len * sizeof(u64)) {
989 if (error >= 0)
990 error = -EIO;
991 goto fail_lpfree;
992 }
993
994 kfree(lp);
995
996 /* Compute the divider */
997 divider = (start + half_len) << (32 - dip->i_di.di_depth);
998
999 /* Copy the entries */
1000 dirent_first(dip, obh, &dent);
1001
1002 do {
1003 next = dent;
1004 if (dirent_next(dip, obh, &next))
1005 next = NULL;
1006
1007 if (dent->de_inum.no_addr &&
1008 be32_to_cpu(dent->de_hash) < divider) {
1009 struct qstr str;
1010 str.name = (char*)(dent+1);
1011 str.len = be16_to_cpu(dent->de_name_len);
1012 str.hash = be32_to_cpu(dent->de_hash);
1013 new = gfs2_dirent_alloc(inode, nbh, &str);
1014 if (IS_ERR(new)) {
1015 error = PTR_ERR(new);
1016 break;
1017 }
1018
1019 new->de_inum = dent->de_inum; /* No endian worries */
1020 new->de_type = dent->de_type; /* No endian worries */
1021 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1022
1023 dirent_del(dip, obh, prev, dent);
1024
1025 if (!oleaf->lf_entries)
1026 gfs2_consist_inode(dip);
1027 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1028
1029 if (!prev)
1030 prev = dent;
1031
1032 moved = 1;
1033 } else {
1034 prev = dent;
1035 }
1036 dent = next;
1037 } while (dent);
1038
1039 oleaf->lf_depth = nleaf->lf_depth;
1040
1041 error = gfs2_meta_inode_buffer(dip, &dibh);
1042 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1043 dip->i_di.di_blocks++;
1044 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1045 brelse(dibh);
1046 }
1047
1048 brelse(obh);
1049 brelse(nbh);
1050
1051 return error;
1052
1053fail_lpfree:
1054 kfree(lp);
1055
1056fail_brelse:
1057 brelse(obh);
1058 brelse(nbh);
1059 return error;
1060}
1061
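A few lines of arithmetic drive the whole split: the top di_depth bits of a hash index the table, 2^(di_depth - lf_depth) consecutive slots share one leaf, the lower half of those slots are re-pointed at the new leaf, and every entry whose hash falls below the divider migrates to it. A standalone sketch of that arithmetic; the depths and hash are assumed for illustration.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int depth = 10;	/* di_depth: table has 2^10 slots */
	unsigned int leaf_depth = 8;	/* lf_depth of the leaf being split */
	uint32_t hash = 0x12345678;	/* hash of the name being inserted */

	uint32_t index = hash >> (32 - depth);		/* slot for this name */
	uint32_t len = 1u << (depth - leaf_depth);	/* slots sharing the leaf */
	uint32_t half = len >> 1;			/* slots moving to new leaf */
	uint32_t start = index & ~(len - 1);		/* first shared slot */
	uint32_t divider = (start + half) << (32 - depth);

	printf("index=%u start=%u half=%u divider=0x%08x\n",
	       index, start, half, divider);
	return 0;
}
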
1062/**
1063 * dir_double_exhash - Double size of ExHash table
1064 * @dip: The GFS2 dinode
1065 *
1066 * Returns: 0 on success, error code on failure
1067 */
1068
1069static int dir_double_exhash(struct gfs2_inode *dip)
1070{
1071 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1072 struct buffer_head *dibh;
1073 u32 hsize;
1074 u64 *buf;
1075 u64 *from, *to;
1076 u64 block;
1077 int x;
1078 int error = 0;
1079
1080 hsize = 1 << dip->i_di.di_depth;
1081 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1082 gfs2_consist_inode(dip);
1083 return -EIO;
1084 }
1085
1086	/* Allocate both the "from" (one block) and "to" (two blocks) buffers in one big chunk */
1087
1088 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1089
1090 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1091 error = gfs2_dir_read_data(dip, (char *)buf,
1092 block * sdp->sd_hash_bsize,
1093 sdp->sd_hash_bsize, 1);
1094 if (error != sdp->sd_hash_bsize) {
1095 if (error >= 0)
1096 error = -EIO;
1097 goto fail;
1098 }
1099
1100 from = buf;
1101 to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
1102
1103 for (x = sdp->sd_hash_ptrs; x--; from++) {
1104			*to++ = *from;	/* No endianness worries */
1105 *to++ = *from;
1106 }
1107
1108 error = gfs2_dir_write_data(dip,
1109 (char *)buf + sdp->sd_hash_bsize,
1110 block * sdp->sd_sb.sb_bsize,
1111 sdp->sd_sb.sb_bsize);
1112 if (error != sdp->sd_sb.sb_bsize) {
1113 if (error >= 0)
1114 error = -EIO;
1115 goto fail;
1116 }
1117 }
1118
1119 kfree(buf);
1120
1121 error = gfs2_meta_inode_buffer(dip, &dibh);
1122 if (!gfs2_assert_withdraw(sdp, !error)) {
1123 dip->i_di.di_depth++;
1124 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1125 brelse(dibh);
1126 }
1127
1128 return error;
1129
1130fail:
1131 kfree(buf);
1132 return error;
1133}
1134
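dir_double_exhash never touches the leaves themselves: each old slot simply becomes two adjacent slots holding the same (still big-endian) leaf pointer, which is why the copy loop above needs no byte-swapping. A minimal userspace sketch of the duplication step with made-up pointer values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t from[4] = { 11, 22, 22, 33 };	/* old table (values assumed) */
	uint64_t to[8];
	unsigned int i, j = 0;

	for (i = 0; i < 4; i++) {
		to[j++] = from[i];	/* each pointer is written out twice, */
		to[j++] = from[i];	/* copied verbatim as in the code above */
	}
	for (i = 0; i < 8; i++)
		printf("%llu ", (unsigned long long)to[i]);
	printf("\n");			/* prints: 11 11 22 22 22 22 33 33 */
	return 0;
}
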
1135/**
1136 * compare_dents - compare directory entries by hash value
1137 * @a: first dent
1138 * @b: second dent
1139 *
1140 * When comparing the hash entries of @a to @b:
1141 * gt: returns 1
1142 * lt: returns -1
1143 * eq: returns 0
1144 */
1145
1146static int compare_dents(const void *a, const void *b)
1147{
1148 const struct gfs2_dirent *dent_a, *dent_b;
1149 u32 hash_a, hash_b;
1150 int ret = 0;
1151
1152 dent_a = *(const struct gfs2_dirent **)a;
1153 hash_a = be32_to_cpu(dent_a->de_hash);
1154
1155 dent_b = *(const struct gfs2_dirent **)b;
1156 hash_b = be32_to_cpu(dent_b->de_hash);
1157
1158 if (hash_a > hash_b)
1159 ret = 1;
1160 else if (hash_a < hash_b)
1161 ret = -1;
1162 else {
1163 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1164 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1165
1166 if (len_a > len_b)
1167 ret = 1;
1168 else if (len_a < len_b)
1169 ret = -1;
1170 else
1171 ret = memcmp(dent_a + 1, dent_b + 1, len_a);
1172 }
1173
1174 return ret;
1175}
1176
1177/**
1178 * do_filldir_main - read out directory entries
1179 * @dip: The GFS2 inode
1180 * @offset: The offset in the file to read from
1181 * @opaque: opaque data to pass to filldir
1182 * @filldir: The function to pass entries to
1183 * @darr: an array of struct gfs2_dirent pointers to read
1184 * @entries: the number of entries in darr
1185 * @copied: pointer to int that's non-zero if an entry has been copied out
1186 *
1187 * Jump through some hoops to make sure that if there are hash collisions,
1188 * they are read out at the beginning of a buffer. We want to minimize
1189 * the possibility that they will fall into different readdir buffers or
1190 * that someone will want to seek to that location.
1191 *
1192 * Returns: errno, >0 on exception from filldir
1193 */
1194
1195static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1196 void *opaque, gfs2_filldir_t filldir,
1197 const struct gfs2_dirent **darr, u32 entries,
1198 int *copied)
1199{
1200 const struct gfs2_dirent *dent, *dent_next;
1201 struct gfs2_inum inum;
1202 u64 off, off_next;
1203 unsigned int x, y;
1204 int run = 0;
1205 int error = 0;
1206
1207 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1208
1209 dent_next = darr[0];
1210 off_next = be32_to_cpu(dent_next->de_hash);
1211 off_next = gfs2_disk_hash2offset(off_next);
1212
1213 for (x = 0, y = 1; x < entries; x++, y++) {
1214 dent = dent_next;
1215 off = off_next;
1216
1217 if (y < entries) {
1218 dent_next = darr[y];
1219 off_next = be32_to_cpu(dent_next->de_hash);
1220 off_next = gfs2_disk_hash2offset(off_next);
1221
1222 if (off < *offset)
1223 continue;
1224 *offset = off;
1225
1226 if (off_next == off) {
1227 if (*copied && !run)
1228 return 1;
1229 run = 1;
1230 } else
1231 run = 0;
1232 } else {
1233 if (off < *offset)
1234 continue;
1235 *offset = off;
1236 }
1237
1238 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1239
1240 error = filldir(opaque, (const char *)(dent + 1),
1241 be16_to_cpu(dent->de_name_len),
1242 off, &inum,
1243 be16_to_cpu(dent->de_type));
1244 if (error)
1245 return 1;
1246
1247 *copied = 1;
1248 }
1249
1250	/* Increment *offset by one, so the next time we come into the
1251	   do_filldir function, we get the next entry instead of the last
1252	   one in the current leaf */
1253
1254 (*offset)++;
1255
1256 return 0;
1257}
1258
1259static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1260 gfs2_filldir_t filldir, int *copied,
1261 unsigned *depth, u64 leaf_no)
1262{
1263 struct gfs2_inode *ip = GFS2_I(inode);
1264 struct buffer_head *bh;
1265 struct gfs2_leaf *lf;
1266 unsigned entries = 0;
1267 unsigned leaves = 0;
1268 const struct gfs2_dirent **darr, *dent;
1269 struct dirent_gather g;
1270 struct buffer_head **larr;
1271 int leaf = 0;
1272 int error, i;
1273 u64 lfn = leaf_no;
1274
1275 do {
1276 error = get_leaf(ip, lfn, &bh);
1277 if (error)
1278 goto out;
1279 lf = (struct gfs2_leaf *)bh->b_data;
1280 if (leaves == 0)
1281 *depth = be16_to_cpu(lf->lf_depth);
1282 entries += be16_to_cpu(lf->lf_entries);
1283 leaves++;
1284 lfn = be64_to_cpu(lf->lf_next);
1285 brelse(bh);
1286 } while(lfn);
1287
1288 if (!entries)
1289 return 0;
1290
1291 error = -ENOMEM;
1292 larr = vmalloc((leaves + entries) * sizeof(void *));
1293 if (!larr)
1294 goto out;
1295 darr = (const struct gfs2_dirent **)(larr + leaves);
1296 g.pdent = darr;
1297 g.offset = 0;
1298 lfn = leaf_no;
1299
1300 do {
1301 error = get_leaf(ip, lfn, &bh);
1302 if (error)
1303 goto out_kfree;
1304 lf = (struct gfs2_leaf *)bh->b_data;
1305 lfn = be64_to_cpu(lf->lf_next);
1306 if (lf->lf_entries) {
1307 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1308 gfs2_dirent_gather, NULL, &g);
1309 error = PTR_ERR(dent);
1310 if (IS_ERR(dent)) {
1311 goto out_kfree;
1312 }
1313 error = 0;
1314 larr[leaf++] = bh;
1315 } else {
1316 brelse(bh);
1317 }
1318 } while(lfn);
1319
1320 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1321 entries, copied);
1322out_kfree:
1323 for(i = 0; i < leaf; i++)
1324 brelse(larr[i]);
1325 vfree(larr);
1326out:
1327 return error;
1328}
1329
1330/**
1331 * dir_e_read - Reads the entries from a directory into a filldir buffer
1332 * @dip: dinode pointer
1333 * @offset: the hash of the last entry read shifted to the right once
1334 * @opaque: buffer for the filldir function to fill
1335 * @filldir: points to the filldir function to use
1336 *
1337 * Returns: errno
1338 */
1339
1340static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1341 gfs2_filldir_t filldir)
1342{
1343 struct gfs2_inode *dip = GFS2_I(inode);
1344 struct gfs2_sbd *sdp = GFS2_SB(inode);
1345 u32 hsize, len = 0;
1346 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1347 u32 hash, index;
1348 u64 *lp;
1349 int copied = 0;
1350 int error = 0;
1351 unsigned depth = 0;
1352
1353 hsize = 1 << dip->i_di.di_depth;
1354 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1355 gfs2_consist_inode(dip);
1356 return -EIO;
1357 }
1358
1359 hash = gfs2_dir_offset2hash(*offset);
1360 index = hash >> (32 - dip->i_di.di_depth);
1361
1362 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1363 if (!lp)
1364 return -ENOMEM;
1365
1366 while (index < hsize) {
1367 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1368 ht_offset = index - lp_offset;
1369
1370 if (ht_offset_cur != ht_offset) {
1371 error = gfs2_dir_read_data(dip, (char *)lp,
1372 ht_offset * sizeof(u64),
1373 sdp->sd_hash_bsize, 1);
1374 if (error != sdp->sd_hash_bsize) {
1375 if (error >= 0)
1376 error = -EIO;
1377 goto out;
1378 }
1379 ht_offset_cur = ht_offset;
1380 }
1381
1382 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1383 &copied, &depth,
1384 be64_to_cpu(lp[lp_offset]));
1385 if (error)
1386 break;
1387
1388 len = 1 << (dip->i_di.di_depth - depth);
1389 index = (index & ~(len - 1)) + len;
1390 }
1391
1392out:
1393 kfree(lp);
1394 if (error > 0)
1395 error = 0;
1396 return error;
1397}
1398
1399int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1400 gfs2_filldir_t filldir)
1401{
1402 struct gfs2_inode *dip = GFS2_I(inode);
1403 struct dirent_gather g;
1404 const struct gfs2_dirent **darr, *dent;
1405 struct buffer_head *dibh;
1406 int copied = 0;
1407 int error;
1408
1409 if (!dip->i_di.di_entries)
1410 return 0;
1411
1412 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1413 return dir_e_read(inode, offset, opaque, filldir);
1414
1415 if (!gfs2_is_stuffed(dip)) {
1416 gfs2_consist_inode(dip);
1417 return -EIO;
1418 }
1419
1420 error = gfs2_meta_inode_buffer(dip, &dibh);
1421 if (error)
1422 return error;
1423
1424 error = -ENOMEM;
1425 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1426 GFP_KERNEL);
1427 if (darr) {
1428 g.pdent = darr;
1429 g.offset = 0;
1430 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1431 gfs2_dirent_gather, NULL, &g);
1432 if (IS_ERR(dent)) {
1433 error = PTR_ERR(dent);
1434 goto out;
1435 }
1436 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1437 dip->i_di.di_entries, &copied);
1438out:
1439 kfree(darr);
1440 }
1441
1442 if (error > 0)
1443 error = 0;
1444
1445 brelse(dibh);
1446
1447 return error;
1448}
1449
1450/**
1451 * gfs2_dir_search - Search a directory
1452 * @dir: The directory to search
1453 * @name: The name of the entry to look for
1454 * @inum, @type: Filled in with the entry's inode number and type, if found
1455 *
1456 * This routine searches a directory for a file or another directory.
1457 * Assumes a glock is held on dip.
1458 *
1459 * Returns: errno
1460 */
1461
1462int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1463 struct gfs2_inum *inum, unsigned int *type)
1464{
1465 struct buffer_head *bh;
1466 struct gfs2_dirent *dent;
1467
1468 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1469 if (dent) {
1470 if (IS_ERR(dent))
1471 return PTR_ERR(dent);
1472 if (inum)
1473 gfs2_inum_in(inum, (char *)&dent->de_inum);
1474 if (type)
1475 *type = be16_to_cpu(dent->de_type);
1476 brelse(bh);
1477 return 0;
1478 }
1479 return -ENOENT;
1480}
1481
1482static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1483{
1484 struct buffer_head *bh, *obh;
1485 struct gfs2_inode *ip = GFS2_I(inode);
1486 struct gfs2_leaf *leaf, *oleaf;
1487 int error;
1488 u32 index;
1489 u64 bn;
1490
1491 index = name->hash >> (32 - ip->i_di.di_depth);
1492 error = get_first_leaf(ip, index, &obh);
1493 if (error)
1494 return error;
1495 do {
1496 oleaf = (struct gfs2_leaf *)obh->b_data;
1497 bn = be64_to_cpu(oleaf->lf_next);
1498 if (!bn)
1499 break;
1500 brelse(obh);
1501 error = get_leaf(ip, bn, &obh);
1502 if (error)
1503 return error;
1504 } while(1);
1505
1506 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1507
1508 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1509 if (!leaf) {
1510 brelse(obh);
1511 return -ENOSPC;
1512 }
1513 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1514 brelse(bh);
1515 brelse(obh);
1516
1517 error = gfs2_meta_inode_buffer(ip, &bh);
1518 if (error)
1519 return error;
1520 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1521 ip->i_di.di_blocks++;
1522 gfs2_dinode_out(&ip->i_di, bh->b_data);
1523 brelse(bh);
1524 return 0;
1525}
1526
1527/**
1528 * gfs2_dir_add - Add new filename into directory
1529 * @inode: The directory inode
1530 * @name: The new name
1531 * @inum: The inode number of the entry
1532 * @type: The type of the entry
1533 *
1534 * Returns: 0 on success, error code on failure
1535 */
1536
1537int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1538 const struct gfs2_inum *inum, unsigned type)
1539{
1540 struct gfs2_inode *ip = GFS2_I(inode);
1541 struct buffer_head *bh;
1542 struct gfs2_dirent *dent;
1543 struct gfs2_leaf *leaf;
1544 int error;
1545
1546 while(1) {
1547 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1548 &bh);
1549 if (dent) {
1550 if (IS_ERR(dent))
1551 return PTR_ERR(dent);
1552 dent = gfs2_init_dirent(inode, dent, name, bh);
1553 gfs2_inum_out(inum, (char *)&dent->de_inum);
1554 dent->de_type = cpu_to_be16(type);
1555 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1556 leaf = (struct gfs2_leaf *)bh->b_data;
1557 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1558 }
1559 brelse(bh);
1560 error = gfs2_meta_inode_buffer(ip, &bh);
1561 if (error)
1562 break;
1563 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1564 ip->i_di.di_entries++;
1565 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1566 gfs2_dinode_out(&ip->i_di, bh->b_data);
1567 brelse(bh);
1568 error = 0;
1569 break;
1570 }
1571 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1572 error = dir_make_exhash(inode);
1573 if (error)
1574 break;
1575 continue;
1576 }
1577 error = dir_split_leaf(inode, name);
1578 if (error == 0)
1579 continue;
1580 if (error < 0)
1581 break;
1582 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1583 error = dir_double_exhash(ip);
1584 if (error)
1585 break;
1586 error = dir_split_leaf(inode, name);
1587 if (error < 0)
1588 break;
1589 if (error == 0)
1590 continue;
1591 }
1592 error = dir_new_leaf(inode, name);
1593 if (!error)
1594 continue;
1595 error = -ENOSPC;
1596 break;
1597 }
1598 return error;
1599}
1600
1601
1602/**
1603 * gfs2_dir_del - Delete a directory entry
1604 * @dip: The GFS2 inode
1605 * @filename: The filename
1606 *
1607 * Returns: 0 on success, error code on failure
1608 */
1609
1610int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1611{
1612 struct gfs2_dirent *dent, *prev = NULL;
1613 struct buffer_head *bh;
1614 int error;
1615
1616	/* Returns _either_ the entry (if it's first in the block) or the
1617	   previous entry otherwise */
1618 dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1619 if (!dent) {
1620 gfs2_consist_inode(dip);
1621 return -EIO;
1622 }
1623 if (IS_ERR(dent)) {
1624 gfs2_consist_inode(dip);
1625 return PTR_ERR(dent);
1626 }
1627 /* If not first in block, adjust pointers accordingly */
1628 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1629 prev = dent;
1630 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1631 }
1632
1633 dirent_del(dip, bh, prev, dent);
1634 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1635 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1636 u16 entries = be16_to_cpu(leaf->lf_entries);
1637 if (!entries)
1638 gfs2_consist_inode(dip);
1639 leaf->lf_entries = cpu_to_be16(--entries);
1640 }
1641 brelse(bh);
1642
1643 error = gfs2_meta_inode_buffer(dip, &bh);
1644 if (error)
1645 return error;
1646
1647 if (!dip->i_di.di_entries)
1648 gfs2_consist_inode(dip);
1649 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1650 dip->i_di.di_entries--;
1651 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1652 gfs2_dinode_out(&dip->i_di, bh->b_data);
1653 brelse(bh);
1654 mark_inode_dirty(&dip->i_inode);
1655
1656 return error;
1657}
1658
1659/**
1660 * gfs2_dir_mvino - Change inode number of directory entry
1661 * @dip: The GFS2 inode
1662 * @filename: The name of the entry to update
1663 * @inum, @new_type: The new inode number and entry type
1664 *
1665 * This routine changes the inode number of a directory entry. It's used
1666 * by rename to change ".." when a directory is moved.
1667 * Assumes a glock is held on dip.
1668 *
1669 * Returns: errno
1670 */
1671
1672int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1673 struct gfs2_inum *inum, unsigned int new_type)
1674{
1675 struct buffer_head *bh;
1676 struct gfs2_dirent *dent;
1677 int error;
1678
1679 dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1680 if (!dent) {
1681 gfs2_consist_inode(dip);
1682 return -EIO;
1683 }
1684 if (IS_ERR(dent))
1685 return PTR_ERR(dent);
1686
1687 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1688 gfs2_inum_out(inum, (char *)&dent->de_inum);
1689 dent->de_type = cpu_to_be16(new_type);
1690
1691 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1692 brelse(bh);
1693 error = gfs2_meta_inode_buffer(dip, &bh);
1694 if (error)
1695 return error;
1696 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1697 }
1698
1699 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1700 gfs2_dinode_out(&dip->i_di, bh->b_data);
1701 brelse(bh);
1702 return 0;
1703}
1704
1705/**
1706 * foreach_leaf - call a function for each leaf in a directory
1707 * @dip: the directory
1708 * @lc: the function to call for each leaf
1709 * @data: private data to pass to it
1710 *
1711 * Returns: errno
1712 */
1713
1714static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1715{
1716 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1717 struct buffer_head *bh;
1718 struct gfs2_leaf *leaf;
1719 u32 hsize, len;
1720 u32 ht_offset, lp_offset, ht_offset_cur = -1;
1721 u32 index = 0;
1722 u64 *lp;
1723 u64 leaf_no;
1724 int error = 0;
1725
1726 hsize = 1 << dip->i_di.di_depth;
1727 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1728 gfs2_consist_inode(dip);
1729 return -EIO;
1730 }
1731
1732 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1733 if (!lp)
1734 return -ENOMEM;
1735
1736 while (index < hsize) {
1737 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1738 ht_offset = index - lp_offset;
1739
1740 if (ht_offset_cur != ht_offset) {
1741 error = gfs2_dir_read_data(dip, (char *)lp,
1742 ht_offset * sizeof(u64),
1743 sdp->sd_hash_bsize, 1);
1744 if (error != sdp->sd_hash_bsize) {
1745 if (error >= 0)
1746 error = -EIO;
1747 goto out;
1748 }
1749 ht_offset_cur = ht_offset;
1750 }
1751
1752 leaf_no = be64_to_cpu(lp[lp_offset]);
1753 if (leaf_no) {
1754 error = get_leaf(dip, leaf_no, &bh);
1755 if (error)
1756 goto out;
1757 leaf = (struct gfs2_leaf *)bh->b_data;
1758 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1759 brelse(bh);
1760
1761 error = lc(dip, index, len, leaf_no, data);
1762 if (error)
1763 goto out;
1764
1765 index = (index & ~(len - 1)) + len;
1766 } else
1767 index++;
1768 }
1769
1770 if (index != hsize) {
1771 gfs2_consist_inode(dip);
1772 error = -EIO;
1773 }
1774
1775out:
1776 kfree(lp);
1777
1778 return error;
1779}
1780
1781/**
1782 * leaf_dealloc - Deallocate a directory leaf
1783 * @dip: the directory
1784 * @index: the hash table offset in the directory
1785 * @len: the number of pointers to this leaf
1786 * @leaf_no: the leaf number
1787 * @data: not used
1788 *
1789 * Returns: errno
1790 */
1791
1792static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1793 u64 leaf_no, void *data)
1794{
1795 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1796 struct gfs2_leaf *tmp_leaf;
1797 struct gfs2_rgrp_list rlist;
1798 struct buffer_head *bh, *dibh;
1799 u64 blk, nblk;
1800 unsigned int rg_blocks = 0, l_blocks = 0;
1801 char *ht;
1802 unsigned int x, size = len * sizeof(u64);
1803 int error;
1804
1805 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1806
1807 ht = kzalloc(size, GFP_KERNEL);
1808 if (!ht)
1809 return -ENOMEM;
1810
1811 gfs2_alloc_get(dip);
1812
1813 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1814 if (error)
1815 goto out;
1816
1817 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1818 if (error)
1819 goto out_qs;
1820
1821 /* Count the number of leaves */
1822
1823 for (blk = leaf_no; blk; blk = nblk) {
1824 error = get_leaf(dip, blk, &bh);
1825 if (error)
1826 goto out_rlist;
1827 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1828 nblk = be64_to_cpu(tmp_leaf->lf_next);
1829 brelse(bh);
1830
1831 gfs2_rlist_add(sdp, &rlist, blk);
1832 l_blocks++;
1833 }
1834
1835 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1836
1837 for (x = 0; x < rlist.rl_rgrps; x++) {
1838 struct gfs2_rgrpd *rgd;
1839 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1840 rg_blocks += rgd->rd_ri.ri_length;
1841 }
1842
1843 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1844 if (error)
1845 goto out_rlist;
1846
1847 error = gfs2_trans_begin(sdp,
1848 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1849 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1850 if (error)
1851 goto out_rg_gunlock;
1852
1853 for (blk = leaf_no; blk; blk = nblk) {
1854 error = get_leaf(dip, blk, &bh);
1855 if (error)
1856 goto out_end_trans;
1857 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1858 nblk = be64_to_cpu(tmp_leaf->lf_next);
1859 brelse(bh);
1860
1861 gfs2_free_meta(dip, blk, 1);
1862
1863 if (!dip->i_di.di_blocks)
1864 gfs2_consist_inode(dip);
1865 dip->i_di.di_blocks--;
1866 }
1867
1868 error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
1869 if (error != size) {
1870 if (error >= 0)
1871 error = -EIO;
1872 goto out_end_trans;
1873 }
1874
1875 error = gfs2_meta_inode_buffer(dip, &dibh);
1876 if (error)
1877 goto out_end_trans;
1878
1879 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1880 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1881 brelse(dibh);
1882
1883out_end_trans:
1884 gfs2_trans_end(sdp);
1885out_rg_gunlock:
1886 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1887out_rlist:
1888 gfs2_rlist_free(&rlist);
1889 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1890out_qs:
1891 gfs2_quota_unhold(dip);
1892out:
1893 gfs2_alloc_put(dip);
1894 kfree(ht);
1895 return error;
1896}
1897
1898/**
1899 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1900 * @dip: the directory
1901 *
1902 * Dealloc all on-disk directory leaves to FREEMETA state
1903 * Change on-disk inode type to "regular file"
1904 *
1905 * Returns: errno
1906 */
1907
1908int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1909{
1910 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1911 struct buffer_head *bh;
1912 int error;
1913
1914 /* Dealloc on-disk leaves to FREEMETA state */
1915 error = foreach_leaf(dip, leaf_dealloc, NULL);
1916 if (error)
1917 return error;
1918
1919 /* Make this a regular file in case we crash.
1920 (We don't want to free these blocks a second time.) */
1921
1922 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1923 if (error)
1924 return error;
1925
1926 error = gfs2_meta_inode_buffer(dip, &bh);
1927 if (!error) {
1928 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1929 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1930 cpu_to_be32(S_IFREG);
1931 brelse(bh);
1932 }
1933
1934 gfs2_trans_end(sdp);
1935
1936 return error;
1937}
1938
1939/**
1940 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
1941 * @inode: the directory the entry would be added to
1942 * @name: the filename that's going to be added
1943 *
1944 * Returns: 1 if alloc required, 0 if not, -ve on error
1945 */
1946
1947int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1948{
1949 struct gfs2_dirent *dent;
1950 struct buffer_head *bh;
1951
1952 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1953 if (!dent) {
1954 return 1;
1955 }
1956 if (IS_ERR(dent))
1957 return PTR_ERR(dent);
1958 brelse(bh);
1959 return 0;
1960}
1961
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..371233419b07
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,79 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13#include <linux/dcache.h>
14
15struct inode;
16struct gfs2_inode;
17struct gfs2_inum;
18
19/**
20 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
21 * @opaque: opaque data used by the function
22 * @name: the name of the directory entry
23 * @length: the length of the name
24 * @offset: the entry's offset in the directory
25 * @inum: the inode number the entry points to
26 * @type: the type of inode the entry points to
27 *
28 * Returns: 0 on success, 1 if buffer full
29 */
30
31typedef int (*gfs2_filldir_t) (void *opaque,
32 const char *name, unsigned int length,
33 u64 offset,
34 struct gfs2_inum *inum, unsigned int type);
35
36int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
37 struct gfs2_inum *inum, unsigned int *type);
38int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
39 const struct gfs2_inum *inum, unsigned int type);
40int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
41int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
42 gfs2_filldir_t filldir);
43int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
44 struct gfs2_inum *new_inum, unsigned int new_type);
45
46int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
47
48int gfs2_diradd_alloc_required(struct inode *dir,
49 const struct qstr *filename);
50int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
51 struct buffer_head **bhp);
52
53static inline u32 gfs2_disk_hash(const char *data, int len)
54{
55 return crc32_le((u32)~0, data, len) ^ (u32)~0;
56}
57
58
59static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
60{
61 name->name = fname;
62 name->len = strlen(fname);
63 name->hash = gfs2_disk_hash(name->name, name->len);
64}
65
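Putting the two helpers together: the on-disk name hash is an inverted CRC32 of the name, and an ExHash lookup keys the table with the hash's top di_depth bits, as dir.c does. A hypothetical caller (the helper itself is illustrative, not part of this header):

static inline u32 example_slot_for_name(const char *fname, unsigned int di_depth)
{
	struct qstr q;

	gfs2_str2qstr(&q, fname);	  /* fills in name, len and disk hash */
	return q.hash >> (32 - di_depth); /* top bits select a hash-table slot */
}
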
66/* N.B. This probably ought to take inum & type as args as well */
67static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
68{
69 dent->de_inum.no_addr = cpu_to_be64(0);
70 dent->de_inum.no_formal_ino = cpu_to_be64(0);
71 dent->de_hash = cpu_to_be32(name->hash);
72 dent->de_rec_len = cpu_to_be16(reclen);
73 dent->de_name_len = cpu_to_be16(name->len);
74 dent->de_type = cpu_to_be16(0);
75 memset(dent->__pad, 0, sizeof(dent->__pad));
76 memcpy(dent + 1, name->name, name->len);
77}
78
79#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..92c54e9b0dc3
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "util.h"
26
27/**
28 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
29 * @namep: ea name, possibly with type appended
30 *
31 * Returns: GFS2_EATYPE_XXX
32 */
33
34unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
35{
36 unsigned int type;
37
38 if (strncmp(name, "system.", 7) == 0) {
39 type = GFS2_EATYPE_SYS;
40 if (truncated_name)
41 *truncated_name = name + sizeof("system.") - 1;
42 } else if (strncmp(name, "user.", 5) == 0) {
43 type = GFS2_EATYPE_USR;
44 if (truncated_name)
45 *truncated_name = name + sizeof("user.") - 1;
46 } else if (strncmp(name, "security.", 9) == 0) {
47 type = GFS2_EATYPE_SECURITY;
48 if (truncated_name)
49 *truncated_name = name + sizeof("security.") - 1;
50 } else {
51 type = GFS2_EATYPE_UNUSED;
52 if (truncated_name)
53 *truncated_name = NULL;
54 }
55
56 return type;
57}
58
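A hypothetical usage sketch (not part of the patch) showing the contract: the function classifies the prefix and, when asked, points the caller past it:

static int example_classify(void)
{
	const char *rest;
	unsigned int type = gfs2_ea_name2type("user.comment", &rest);

	/* expected: type == GFS2_EATYPE_USR, rest points at "comment" */
	return type == GFS2_EATYPE_USR && strcmp(rest, "comment") == 0;
}
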
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = &ip->i_inode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = &ip->i_inode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = &ip->i_inode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
102 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
103 !capable(CAP_SYS_ADMIN))
104 return -EPERM;
105
106 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
107 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP;
110
113 return gfs2_ea_get_i(ip, er);
114}
115
116static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
117{
118 int remove = 0;
119 int error;
120
121 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
122 if (!(er->er_flags & GFS2_ERF_MODE)) {
123 er->er_mode = ip->i_di.di_mode;
124 er->er_flags |= GFS2_ERF_MODE;
125 }
126 error = gfs2_acl_validate_set(ip, 1, er,
127 &remove, &er->er_mode);
128 if (error)
129 return error;
130 error = gfs2_ea_set_i(ip, er);
131 if (error)
132 return error;
133 if (remove)
134 gfs2_ea_remove_i(ip, er);
135 return 0;
136
137 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
138 error = gfs2_acl_validate_set(ip, 0, er,
139 &remove, NULL);
140 if (error)
141 return error;
142 if (!remove)
143 error = gfs2_ea_set_i(ip, er);
144 else {
145 error = gfs2_ea_remove_i(ip, er);
146 if (error == -ENODATA)
147 error = 0;
148 }
149 return error;
150 }
151
152 return -EPERM;
153}
154
155static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
156{
157 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
158 int error = gfs2_acl_validate_remove(ip, 1);
159 if (error)
160 return error;
161
162 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
163 int error = gfs2_acl_validate_remove(ip, 0);
164 if (error)
165 return error;
166
167 } else
168 return -EPERM;
169
170 return gfs2_ea_remove_i(ip, er);
171}
172
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = &ip->i_inode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = &ip->i_inode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = &ip->i_inode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove,
207 .eo_name = "user",
208};
209
210struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove,
214 .eo_name = "system",
215};
216
217static struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove,
221 .eo_name = "security",
222};
223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL,
226 &gfs2_user_eaops,
227 &gfs2_system_eaops,
228 &gfs2_security_eaops,
229};
230
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..508b4f7a2449
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14struct gfs2_inode;
15
16struct gfs2_eattr_operations {
17 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
20 char *eo_name;
21};
22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24
25extern struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..a65a4ccfd4dd
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1501 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33/**
34 * ea_calc_size - returns the actual number of bytes the request will take up
35 * (not counting any unstuffed data blocks)
36 * @sdp: the filesystem
37 * @er: the EA request
38 * @size: filled in with the computed size in bytes
39 *
40 * Returns: 1 if the EA should be stuffed
41 */
42
43static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
44 unsigned int *size)
45{
46 *size = GFS2_EAREQ_SIZE_STUFFED(er);
47 if (*size <= sdp->sd_jbsize)
48 return 1;
49
50 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
51
52 return 0;
53}
54
55static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
56{
57 unsigned int size;
58
59 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
60 return -ERANGE;
61
62 ea_calc_size(sdp, er, &size);
63
64 /* This can only happen with 512 byte blocks */
65 if (size > sdp->sd_jbsize)
66 return -ERANGE;
67
68 return 0;
69}
70
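The stuffed/unstuffed decision mirrors the directory code: if header plus name plus data fits inside one journaled block, the EA is stored inline ("stuffed"). A standalone sketch of the size test; every size below is an assumption for illustration:

#include <stdio.h>

int main(void)
{
	unsigned int jbsize = 4096 - 24;	/* assumed journaled payload per block */
	unsigned int ea_hdr = 24, name_len = 12, data_len = 200;
	unsigned int stuffed_size = ea_hdr + name_len + data_len;

	printf("stuffed: %s\n", stuffed_size <= jbsize ? "yes" : "no");
	return 0;
}
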
71typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
72 struct gfs2_ea_header *ea,
73 struct gfs2_ea_header *prev, void *private);
74
75static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
76 ea_call_t ea_call, void *data)
77{
78 struct gfs2_ea_header *ea, *prev = NULL;
79 int error = 0;
80
81 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
82 return -EIO;
83
84 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
85 if (!GFS2_EA_REC_LEN(ea))
86 goto fail;
87 if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
88 bh->b_data + bh->b_size))
89 goto fail;
90 if (!GFS2_EATYPE_VALID(ea->ea_type))
91 goto fail;
92
93 error = ea_call(ip, bh, ea, prev, data);
94 if (error)
95 return error;
96
97 if (GFS2_EA_IS_LAST(ea)) {
98 if ((char *)GFS2_EA2NEXT(ea) !=
99 bh->b_data + bh->b_size)
100 goto fail;
101 break;
102 }
103 }
104
105 return error;
106
107fail:
108 gfs2_consist_inode(ip);
109 return -EIO;
110}
111
112static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
113{
114 struct buffer_head *bh, *eabh;
115 u64 *eablk, *end;
116 int error;
117
118 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
119 if (error)
120 return error;
121
122 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
123 error = ea_foreach_i(ip, bh, ea_call, data);
124 goto out;
125 }
126
127 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
128 error = -EIO;
129 goto out;
130 }
131
132 eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
133 end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
134
135 for (; eablk < end; eablk++) {
136 u64 bn;
137
138 if (!*eablk)
139 break;
140 bn = be64_to_cpu(*eablk);
141
142 error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
143 if (error)
144 break;
145 error = ea_foreach_i(ip, eabh, ea_call, data);
146 brelse(eabh);
147 if (error)
148 break;
149 }
150out:
151 brelse(bh);
152 return error;
153}
154
155struct ea_find {
156 struct gfs2_ea_request *ef_er;
157 struct gfs2_ea_location *ef_el;
158};
159
160static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
161 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
162 void *private)
163{
164 struct ea_find *ef = private;
165 struct gfs2_ea_request *er = ef->ef_er;
166
167 if (ea->ea_type == GFS2_EATYPE_UNUSED)
168 return 0;
169
170 if (ea->ea_type == er->er_type) {
171 if (ea->ea_name_len == er->er_name_len &&
172 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
173 struct gfs2_ea_location *el = ef->ef_el;
174 get_bh(bh);
175 el->el_bh = bh;
176 el->el_ea = ea;
177 el->el_prev = prev;
178 return 1;
179 }
180 }
181
182 return 0;
183}
184
185int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
186 struct gfs2_ea_location *el)
187{
188 struct ea_find ef;
189 int error;
190
191 ef.ef_er = er;
192 ef.ef_el = el;
193
194 memset(el, 0, sizeof(struct gfs2_ea_location));
195
196 error = ea_foreach(ip, ea_find_i, &ef);
197 if (error > 0)
198 return 0;
199
200 return error;
201}
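/*
 * Note: ea_find_i() returns 1 to stop the walk once a match has been
 * recorded in *el, so a positive return from ea_foreach() means
 * success here; callers test el->el_ea to see whether a match was
 * actually found.
 */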
202
203/**
204 * ea_dealloc_unstuffed - free the data blocks of an unstuffed EA
205 * @ip: the inode
206 * @bh: the buffer head containing the EA
207 * @ea: the EA header
208 * @prev: the previous EA header in the block, if any
209 * @private: pointer to the "leave" flag, or NULL
210 *
211 * Takes advantage of the fact that all unstuffed blocks are
212 * allocated from the same resource group. Beware: this may not
213 * always hold.
214 *
215 * Returns: errno
216 */
217
218static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
219 struct gfs2_ea_header *ea,
220 struct gfs2_ea_header *prev, void *private)
221{
222 int *leave = private;
223 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
224 struct gfs2_rgrpd *rgd;
225 struct gfs2_holder rg_gh;
226 struct buffer_head *dibh;
227 u64 *dataptrs, bn = 0;
228 u64 bstart = 0;
229 unsigned int blen = 0;
230 unsigned int blks = 0;
231 unsigned int x;
232 int error;
233
234 if (GFS2_EA_IS_STUFFED(ea))
235 return 0;
236
237 dataptrs = GFS2_EA2DATAPTRS(ea);
238 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
239 if (*dataptrs) {
240 blks++;
241 bn = be64_to_cpu(*dataptrs);
242 }
243 }
244 if (!blks)
245 return 0;
246
247 rgd = gfs2_blk2rgrpd(sdp, bn);
248 if (!rgd) {
249 gfs2_consist_inode(ip);
250 return -EIO;
251 }
252
253 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
254 if (error)
255 return error;
256
257 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE +
258 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
259 if (error)
260 goto out_gunlock;
261
262 gfs2_trans_add_bh(ip->i_gl, bh, 1);
263
264 dataptrs = GFS2_EA2DATAPTRS(ea);
265 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
266 if (!*dataptrs)
267 break;
268 bn = be64_to_cpu(*dataptrs);
269
270 if (bstart + blen == bn)
271 blen++;
272 else {
273 if (bstart)
274 gfs2_free_meta(ip, bstart, blen);
275 bstart = bn;
276 blen = 1;
277 }
278
279 *dataptrs = 0;
280 if (!ip->i_di.di_blocks)
281 gfs2_consist_inode(ip);
282 ip->i_di.di_blocks--;
283 }
284 if (bstart)
285 gfs2_free_meta(ip, bstart, blen);
286
287 if (prev && !leave) {
288 u32 len;
289
290 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
291 prev->ea_rec_len = cpu_to_be32(len);
292
293 if (GFS2_EA_IS_LAST(ea))
294 prev->ea_flags |= GFS2_EAFLAG_LAST;
295 } else {
296 ea->ea_type = GFS2_EATYPE_UNUSED;
297 ea->ea_num_ptrs = 0;
298 }
299
300 error = gfs2_meta_inode_buffer(ip, &dibh);
301 if (!error) {
302 ip->i_di.di_ctime = get_seconds();
303 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
304 gfs2_dinode_out(&ip->i_di, dibh->b_data);
305 brelse(dibh);
306 }
307
308 gfs2_trans_end(sdp);
309
310out_gunlock:
311 gfs2_glock_dq_uninit(&rg_gh);
312 return error;
313}
314
315static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
316 struct gfs2_ea_header *ea,
317 struct gfs2_ea_header *prev, int leave)
318{
319 struct gfs2_alloc *al;
320 int error;
321
322 al = gfs2_alloc_get(ip);
323
324 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
325 if (error)
326 goto out_alloc;
327
328 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
329 if (error)
330 goto out_quota;
331
332 error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
333
334 gfs2_glock_dq_uninit(&al->al_ri_gh);
335
336out_quota:
337 gfs2_quota_unhold(ip);
338out_alloc:
339 gfs2_alloc_put(ip);
340 return error;
341}
342
343struct ea_list {
344 struct gfs2_ea_request *ei_er;
345 unsigned int ei_size;
346};
347
348static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
349 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
350 void *private)
351{
352 struct ea_list *ei = private;
353 struct gfs2_ea_request *er = ei->ei_er;
354 unsigned int ea_size = gfs2_ea_strlen(ea);
355
356 if (ea->ea_type == GFS2_EATYPE_UNUSED)
357 return 0;
358
359 if (er->er_data_len) {
360 char *prefix = NULL;
361 unsigned int l = 0;
362 char c = 0;
363
364 if (ei->ei_size + ea_size > er->er_data_len)
365 return -ERANGE;
366
367 switch (ea->ea_type) {
368 case GFS2_EATYPE_USR:
369 prefix = "user.";
370 l = 5;
371 break;
372 case GFS2_EATYPE_SYS:
373 prefix = "system.";
374 l = 7;
375 break;
376 case GFS2_EATYPE_SECURITY:
377 prefix = "security.";
378 l = 9;
379 break;
380 }
381
382 BUG_ON(l == 0);
383
384 memcpy(er->er_data + ei->ei_size, prefix, l);
385 memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
386 ea->ea_name_len);
387 memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
388 }
389
390 ei->ei_size += ea_size;
391
392 return 0;
393}
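/*
 * Note: the prefix lengths above must stay in step with
 * gfs2_ea_strlen() in eattr.h, which accounts for the prefix length
 * plus ea_name_len plus 1 for the terminating NUL byte that
 * ea_list_i() appends.
 */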
394
395/**
396 * gfs2_ea_list - list the names of an inode's extended attributes
397 * @ip: the inode
398 * @er: the request; er_data/er_data_len describe the output buffer
399 *
400 * Returns: actual size of data on success, -errno on error
401 */
402
403int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
404{
405 struct gfs2_holder i_gh;
406 int error;
407
408 if (!er->er_data || !er->er_data_len) {
409 er->er_data = NULL;
410 er->er_data_len = 0;
411 }
412
413 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
414 if (error)
415 return error;
416
417 if (ip->i_di.di_eattr) {
418 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
419
420 error = ea_foreach(ip, ea_list_i, &ei);
421 if (!error)
422 error = ei.ei_size;
423 }
424
425 gfs2_glock_dq_uninit(&i_gh);
426
427 return error;
428}
429
430/**
431 * ea_get_unstuffed - actually copies the unstuffed data into the
432 * request buffer
433 * @ip: The GFS2 inode
434 * @ea: The extended attribute header structure
435 * @data: The data to be copied
436 *
437 * Returns: errno
438 */
439
440static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
441 char *data)
442{
443 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
444 struct buffer_head **bh;
445 unsigned int amount = GFS2_EA_DATA_LEN(ea);
446 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
447 u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
448 unsigned int x;
449 int error = 0;
450
451 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
452 if (!bh)
453 return -ENOMEM;
454
455 for (x = 0; x < nptrs; x++) {
456 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
457 bh + x);
458 if (error) {
459 while (x--)
460 brelse(bh[x]);
461 goto out;
462 }
463 dataptrs++;
464 }
465
466 for (x = 0; x < nptrs; x++) {
467 error = gfs2_meta_wait(sdp, bh[x]);
468 if (error) {
469 for (; x < nptrs; x++)
470 brelse(bh[x]);
471 goto out;
472 }
473 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
474 for (; x < nptrs; x++)
475 brelse(bh[x]);
476 error = -EIO;
477 goto out;
478 }
479
480 memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
481 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
482
483 amount -= sdp->sd_jbsize;
484 data += sdp->sd_jbsize;
485
486 brelse(bh[x]);
487 }
488
489out:
490 kfree(bh);
491 return error;
492}
493
494int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
495 char *data)
496{
497 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
498 memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
499 return 0;
500 } else
501 return ea_get_unstuffed(ip, el->el_ea, data);
502}
503
504/**
505 * gfs2_ea_get_i - read an extended attribute into the request buffer
506 * @ip: The GFS2 inode
507 * @er: The request structure
508 *
509 * Returns: actual size of data on success, -errno on error
510 */
511
512int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
513{
514 struct gfs2_ea_location el;
515 int error;
516
517 if (!ip->i_di.di_eattr)
518 return -ENODATA;
519
520 error = gfs2_ea_find(ip, er, &el);
521 if (error)
522 return error;
523 if (!el.el_ea)
524 return -ENODATA;
525
526 if (er->er_data_len) {
527 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
528 error = -ERANGE;
529 else
530 error = gfs2_ea_get_copy(ip, &el, er->er_data);
531 }
532 if (!error)
533 error = GFS2_EA_DATA_LEN(el.el_ea);
534
535 brelse(el.el_bh);
536
537 return error;
538}
539
540/**
541 * gfs2_ea_get - read an extended attribute, taking the inode glock
542 * @ip: The GFS2 inode
543 * @er: The request structure
544 *
545 * Returns: actual size of data on success, -errno on error
546 */
547
548int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
549{
550 struct gfs2_holder i_gh;
551 int error;
552
553 if (!er->er_name_len ||
554 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
555 return -EINVAL;
556 if (!er->er_data || !er->er_data_len) {
557 er->er_data = NULL;
558 er->er_data_len = 0;
559 }
560
561 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
562 if (error)
563 return error;
564
565 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
566
567 gfs2_glock_dq_uninit(&i_gh);
568
569 return error;
570}
571
572/**
573 * ea_alloc_blk - allocates a new block for extended attributes.
574 * @ip: A pointer to the inode that's getting extended attributes
575 * @bhp: Pointer to pointer to a struct buffer_head
576 *
577 * Returns: errno
578 */
579
580static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
581{
582 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
583 struct gfs2_ea_header *ea;
584 u64 block;
585
586 block = gfs2_alloc_meta(ip);
587
588 *bhp = gfs2_meta_new(ip->i_gl, block);
589 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
590 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
591 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
592
593 ea = GFS2_EA_BH2FIRST(*bhp);
594 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
595 ea->ea_type = GFS2_EATYPE_UNUSED;
596 ea->ea_flags = GFS2_EAFLAG_LAST;
597 ea->ea_num_ptrs = 0;
598
599 ip->i_di.di_blocks++;
600
601 return 0;
602}
603
604/**
605 * ea_write - writes the request info to an ea, creating new blocks if
606 * necessary
607 * @ip: inode that is being modified
608 * @ea: the location of the new ea in a block
609 * @er: the write request
610 *
611 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
612 *
613 * Returns: errno
614 */
615
616static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
617 struct gfs2_ea_request *er)
618{
619 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
620
621 ea->ea_data_len = cpu_to_be32(er->er_data_len);
622 ea->ea_name_len = er->er_name_len;
623 ea->ea_type = er->er_type;
624 ea->__pad = 0;
625
626 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
627
628 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
629 ea->ea_num_ptrs = 0;
630 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
631 } else {
632 u64 *dataptr = GFS2_EA2DATAPTRS(ea);
633 const char *data = er->er_data;
634 unsigned int data_len = er->er_data_len;
635 unsigned int copy;
636 unsigned int x;
637
638 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
639 for (x = 0; x < ea->ea_num_ptrs; x++) {
640 struct buffer_head *bh;
641 u64 block;
642 int mh_size = sizeof(struct gfs2_meta_header);
643
644 block = gfs2_alloc_meta(ip);
645
646 bh = gfs2_meta_new(ip->i_gl, block);
647 gfs2_trans_add_bh(ip->i_gl, bh, 1);
648 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
649
650 ip->i_di.di_blocks++;
651
652 copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
653 data_len;
654 memcpy(bh->b_data + mh_size, data, copy);
655 if (copy < sdp->sd_jbsize)
656 memset(bh->b_data + mh_size + copy, 0,
657 sdp->sd_jbsize - copy);
658
659 *dataptr++ = cpu_to_be64(bh->b_blocknr);
660 data += copy;
661 data_len -= copy;
662
663 brelse(bh);
664 }
665
666 gfs2_assert_withdraw(sdp, !data_len);
667 }
668
669 return 0;
670}
671
672typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
673 struct gfs2_ea_request *er, void *private);
674
675static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
676 unsigned int blks,
677 ea_skeleton_call_t skeleton_call, void *private)
678{
679 struct gfs2_alloc *al;
680 struct buffer_head *dibh;
681 int error;
682
683 al = gfs2_alloc_get(ip);
684
685 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
686 if (error)
687 goto out;
688
689 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
690 if (error)
691 goto out_gunlock_q;
692
693 al->al_requested = blks;
694
695 error = gfs2_inplace_reserve(ip);
696 if (error)
697 goto out_gunlock_q;
698
699 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
700 blks + al->al_rgd->rd_ri.ri_length +
701 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
702 if (error)
703 goto out_ipres;
704
705 error = skeleton_call(ip, er, private);
706 if (error)
707 goto out_end_trans;
708
709 error = gfs2_meta_inode_buffer(ip, &dibh);
710 if (!error) {
711 if (er->er_flags & GFS2_ERF_MODE) {
712 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
713 (ip->i_di.di_mode & S_IFMT) ==
714 (er->er_mode & S_IFMT));
715 ip->i_di.di_mode = er->er_mode;
716 }
717 ip->i_di.di_ctime = get_seconds();
718 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
719 gfs2_dinode_out(&ip->i_di, dibh->b_data);
720 brelse(dibh);
721 }
722
723out_end_trans:
724 gfs2_trans_end(GFS2_SB(&ip->i_inode));
725out_ipres:
726 gfs2_inplace_release(ip);
727out_gunlock_q:
728 gfs2_quota_unlock(ip);
729out:
730 gfs2_alloc_put(ip);
731 return error;
732}
733
734static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
735 void *private)
736{
737 struct buffer_head *bh;
738 int error;
739
740 error = ea_alloc_blk(ip, &bh);
741 if (error)
742 return error;
743
744 ip->i_di.di_eattr = bh->b_blocknr;
745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
746
747 brelse(bh);
748
749 return error;
750}
751
752/**
753 * ea_init - initializes a new eattr block
754 * @ip: the inode
755 * @er: the extended attribute request
756 *
757 * Returns: errno
758 */
759
760static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
761{
762 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
763 unsigned int blks = 1;
764
765 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
766 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
767
768 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
769}
770
771static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
772{
773 u32 ea_size = GFS2_EA_SIZE(ea);
774 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
775 ea_size);
776 u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
777 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
778
779 ea->ea_rec_len = cpu_to_be32(ea_size);
780 ea->ea_flags ^= last;
781
782 new->ea_rec_len = cpu_to_be32(new_size);
783 new->ea_flags = last;
784
785 return new;
786}
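/*
 * Worked example (illustrative): given a record with rec_len 64 whose
 * live portion GFS2_EA_SIZE(ea) is 24 bytes, the split leaves the
 * original EA with ea_rec_len 24 and creates a 40-byte unused record
 * directly after it; GFS2_EAFLAG_LAST migrates to the new record if
 * the original carried it.
 *
 *	before:	| ea, rec_len 64, LAST ........................... |
 *	after:	| ea, rec_len 24 | new, rec_len 40, LAST ......... |
 */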
787
788static void ea_set_remove_stuffed(struct gfs2_inode *ip,
789 struct gfs2_ea_location *el)
790{
791 struct gfs2_ea_header *ea = el->el_ea;
792 struct gfs2_ea_header *prev = el->el_prev;
793 u32 len;
794
795 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
796
797 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
798 ea->ea_type = GFS2_EATYPE_UNUSED;
799 return;
800 } else if (GFS2_EA2NEXT(prev) != ea) {
801 prev = GFS2_EA2NEXT(prev);
802 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
803 }
804
805 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
806 prev->ea_rec_len = cpu_to_be32(len);
807
808 if (GFS2_EA_IS_LAST(ea))
809 prev->ea_flags |= GFS2_EAFLAG_LAST;
810}
811
812struct ea_set {
813 int ea_split;
814
815 struct gfs2_ea_request *es_er;
816 struct gfs2_ea_location *es_el;
817
818 struct buffer_head *es_bh;
819 struct gfs2_ea_header *es_ea;
820};
821
822static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
823 struct gfs2_ea_header *ea, struct ea_set *es)
824{
825 struct gfs2_ea_request *er = es->es_er;
826 struct buffer_head *dibh;
827 int error;
828
829 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
830 if (error)
831 return error;
832
833 gfs2_trans_add_bh(ip->i_gl, bh, 1);
834
835 if (es->ea_split)
836 ea = ea_split_ea(ea);
837
838 ea_write(ip, ea, er);
839
840 if (es->es_el)
841 ea_set_remove_stuffed(ip, es->es_el);
842
843 error = gfs2_meta_inode_buffer(ip, &dibh);
844 if (error)
845 goto out;
846
847 if (er->er_flags & GFS2_ERF_MODE) {
848 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
849 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
850 ip->i_di.di_mode = er->er_mode;
851 }
852 ip->i_di.di_ctime = get_seconds();
853 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
854 gfs2_dinode_out(&ip->i_di, dibh->b_data);
855 brelse(dibh);
856out:
857 gfs2_trans_end(GFS2_SB(&ip->i_inode));
858 return error;
859}
860
861static int ea_set_simple_alloc(struct gfs2_inode *ip,
862 struct gfs2_ea_request *er, void *private)
863{
864 struct ea_set *es = private;
865 struct gfs2_ea_header *ea = es->es_ea;
866 int error;
867
868 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
869
870 if (es->ea_split)
871 ea = ea_split_ea(ea);
872
873 error = ea_write(ip, ea, er);
874 if (error)
875 return error;
876
877 if (es->es_el)
878 ea_set_remove_stuffed(ip, es->es_el);
879
880 return 0;
881}
882
883static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
884 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
885 void *private)
886{
887 struct ea_set *es = private;
888 unsigned int size;
889 int stuffed;
890 int error;
891
892 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
893
894 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
895 if (GFS2_EA_REC_LEN(ea) < size)
896 return 0;
897 if (!GFS2_EA_IS_STUFFED(ea)) {
898 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
899 if (error)
900 return error;
901 }
902 es->ea_split = 0;
903 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
904 es->ea_split = 1;
905 else
906 return 0;
907
908 if (stuffed) {
909 error = ea_set_simple_noalloc(ip, bh, ea, es);
910 if (error)
911 return error;
912 } else {
913 unsigned int blks;
914
915 es->es_bh = bh;
916 es->es_ea = ea;
917 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
918 GFS2_SB(&ip->i_inode)->sd_jbsize);
919
920 error = ea_alloc_skeleton(ip, es->es_er, blks,
921 ea_set_simple_alloc, es);
922 if (error)
923 return error;
924 }
925
926 return 1;
927}
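/*
 * Note on the return convention: ea_set_simple() returns 1 once it
 * has reused or split an in-block slot, 0 to keep scanning, and a
 * negative errno on failure. ea_foreach() stops on any non-zero
 * value, which is why ea_set_i() below treats a positive return as
 * "done" and only falls back to ea_set_block() when no usable slot
 * was found.
 */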
928
929static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
930 void *private)
931{
932 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
933 struct buffer_head *indbh, *newbh;
934 u64 *eablk;
935 int error;
936 int mh_size = sizeof(struct gfs2_meta_header);
937
938 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
939 u64 *end;
940
941 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
942 &indbh);
943 if (error)
944 return error;
945
946 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
947 error = -EIO;
948 goto out;
949 }
950
951 eablk = (u64 *)(indbh->b_data + mh_size);
952 end = eablk + sdp->sd_inptrs;
953
954 for (; eablk < end; eablk++)
955 if (!*eablk)
956 break;
957
958 if (eablk == end) {
959 error = -ENOSPC;
960 goto out;
961 }
962
963 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
964 } else {
965 u64 blk;
966
967 blk = gfs2_alloc_meta(ip);
968
969 indbh = gfs2_meta_new(ip->i_gl, blk);
970 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
971 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
972 gfs2_buffer_clear_tail(indbh, mh_size);
973
974 eablk = (u64 *)(indbh->b_data + mh_size);
975 *eablk = cpu_to_be64(ip->i_di.di_eattr);
976 ip->i_di.di_eattr = blk;
977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
978 ip->i_di.di_blocks++;
979
980 eablk++;
981 }
982
983 error = ea_alloc_blk(ip, &newbh);
984 if (error)
985 goto out;
986
987 *eablk = cpu_to_be64((u64)newbh->b_blocknr);
988 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
989 brelse(newbh);
990 if (error)
991 goto out;
992
993 if (private)
994 ea_set_remove_stuffed(ip, private);
995
996out:
997 brelse(indbh);
998 return error;
999}
1000
1001static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1002 struct gfs2_ea_location *el)
1003{
1004 struct ea_set es;
1005 unsigned int blks = 2;
1006 int error;
1007
1008 memset(&es, 0, sizeof(struct ea_set));
1009 es.es_er = er;
1010 es.es_el = el;
1011
1012 error = ea_foreach(ip, ea_set_simple, &es);
1013 if (error > 0)
1014 return 0;
1015 if (error)
1016 return error;
1017
1018 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1019 blks++;
1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1022
1023 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1024}
1025
1026static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1027 struct gfs2_ea_location *el)
1028{
1029 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1030 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1031 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
1032 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1033 }
1034
1035	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1036}
1037
1038int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1039{
1040 struct gfs2_ea_location el;
1041 int error;
1042
1043 if (!ip->i_di.di_eattr) {
1044 if (er->er_flags & XATTR_REPLACE)
1045 return -ENODATA;
1046 return ea_init(ip, er);
1047 }
1048
1049 error = gfs2_ea_find(ip, er, &el);
1050 if (error)
1051 return error;
1052
1053 if (el.el_ea) {
1054 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1055 brelse(el.el_bh);
1056 return -EPERM;
1057 }
1058
1059 error = -EEXIST;
1060 if (!(er->er_flags & XATTR_CREATE)) {
1061 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1062 error = ea_set_i(ip, er, &el);
1063 if (!error && unstuffed)
1064 ea_set_remove_unstuffed(ip, &el);
1065 }
1066
1067 brelse(el.el_bh);
1068 } else {
1069 error = -ENODATA;
1070 if (!(er->er_flags & XATTR_REPLACE))
1071 error = ea_set_i(ip, er, NULL);
1072 }
1073
1074 return error;
1075}
1076
1077int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1078{
1079 struct gfs2_holder i_gh;
1080 int error;
1081
1082 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1083 return -EINVAL;
1084 if (!er->er_data || !er->er_data_len) {
1085 er->er_data = NULL;
1086 er->er_data_len = 0;
1087 }
1088 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1089 if (error)
1090 return error;
1091
1092 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1093 if (error)
1094 return error;
1095
1096 if (IS_IMMUTABLE(&ip->i_inode))
1097 error = -EPERM;
1098 else
1099 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1100
1101 gfs2_glock_dq_uninit(&i_gh);
1102
1103 return error;
1104}
1105
1106static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1107{
1108 struct gfs2_ea_header *ea = el->el_ea;
1109 struct gfs2_ea_header *prev = el->el_prev;
1110 struct buffer_head *dibh;
1111 int error;
1112
1113 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1114 if (error)
1115 return error;
1116
1117 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1118
1119 if (prev) {
1120 u32 len;
1121
1122 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1123 prev->ea_rec_len = cpu_to_be32(len);
1124
1125 if (GFS2_EA_IS_LAST(ea))
1126 prev->ea_flags |= GFS2_EAFLAG_LAST;
1127 } else
1128 ea->ea_type = GFS2_EATYPE_UNUSED;
1129
1130 error = gfs2_meta_inode_buffer(ip, &dibh);
1131 if (!error) {
1132 ip->i_di.di_ctime = get_seconds();
1133 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1134 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1135 brelse(dibh);
1136 }
1137
1138 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1139
1140 return error;
1141}
1142
1143int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1144{
1145 struct gfs2_ea_location el;
1146 int error;
1147
1148 if (!ip->i_di.di_eattr)
1149 return -ENODATA;
1150
1151 error = gfs2_ea_find(ip, er, &el);
1152 if (error)
1153 return error;
1154 if (!el.el_ea)
1155 return -ENODATA;
1156
1157 if (GFS2_EA_IS_STUFFED(el.el_ea))
1158 error = ea_remove_stuffed(ip, &el);
1159 else
1160 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1161 0);
1162
1163 brelse(el.el_bh);
1164
1165 return error;
1166}
1167
1168/**
1169 * gfs2_ea_remove - remove an extended attribute from an inode
1170 * @ip: pointer to the inode of the target file
1171 * @er: request information
1172 *
1173 * Returns: errno
1174 */
1175
1176int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1177{
1178 struct gfs2_holder i_gh;
1179 int error;
1180
1181 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1182 return -EINVAL;
1183
1184 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1185 if (error)
1186 return error;
1187
1188 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1189 error = -EPERM;
1190 else
1191 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1192
1193 gfs2_glock_dq_uninit(&i_gh);
1194
1195 return error;
1196}
1197
1198static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1199 struct gfs2_ea_header *ea, char *data)
1200{
1201 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1202 struct buffer_head **bh;
1203 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1204 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1205 u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
1206 unsigned int x;
1207 int error;
1208
1209 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1210 if (!bh)
1211 return -ENOMEM;
1212
1213 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1214 if (error)
1215 goto out;
1216
1217 for (x = 0; x < nptrs; x++) {
1218 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
1219 bh + x);
1220 if (error) {
1221 while (x--)
1222 brelse(bh[x]);
1223 goto fail;
1224 }
1225 dataptrs++;
1226 }
1227
1228 for (x = 0; x < nptrs; x++) {
1229 error = gfs2_meta_wait(sdp, bh[x]);
1230 if (error) {
1231 for (; x < nptrs; x++)
1232 brelse(bh[x]);
1233 goto fail;
1234 }
1235 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1236 for (; x < nptrs; x++)
1237 brelse(bh[x]);
1238 error = -EIO;
1239 goto fail;
1240 }
1241
1242 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1243
1244 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
1245 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1246
1247 amount -= sdp->sd_jbsize;
1248 data += sdp->sd_jbsize;
1249
1250 brelse(bh[x]);
1251 }
1252
1253out:
1254 kfree(bh);
1255 return error;
1256
1257fail:
1258 gfs2_trans_end(sdp);
1259 kfree(bh);
1260 return error;
1261}
1262
1263int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1264 struct iattr *attr, char *data)
1265{
1266 struct buffer_head *dibh;
1267 int error;
1268
1269 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1270 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
1271 if (error)
1272 return error;
1273
1274 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1275 memcpy(GFS2_EA2DATA(el->el_ea), data,
1276 GFS2_EA_DATA_LEN(el->el_ea));
1277 } else
1278 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1279
1280 if (error)
1281 return error;
1282
1283 error = gfs2_meta_inode_buffer(ip, &dibh);
1284 if (!error) {
1285 error = inode_setattr(&ip->i_inode, attr);
1286 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1287 gfs2_inode_attr_out(ip);
1288 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1289 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1290 brelse(dibh);
1291 }
1292
1293 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1294
1295 return error;
1296}
1297
1298static int ea_dealloc_indirect(struct gfs2_inode *ip)
1299{
1300 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1301 struct gfs2_rgrp_list rlist;
1302 struct buffer_head *indbh, *dibh;
1303 u64 *eablk, *end;
1304 unsigned int rg_blocks = 0;
1305 u64 bstart = 0;
1306 unsigned int blen = 0;
1307 unsigned int blks = 0;
1308 unsigned int x;
1309 int error;
1310
1311 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1312
1313 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
1314 if (error)
1315 return error;
1316
1317 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1318 error = -EIO;
1319 goto out;
1320 }
1321
1322 eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1323 end = eablk + sdp->sd_inptrs;
1324
1325 for (; eablk < end; eablk++) {
1326 u64 bn;
1327
1328 if (!*eablk)
1329 break;
1330 bn = be64_to_cpu(*eablk);
1331
1332 if (bstart + blen == bn)
1333 blen++;
1334 else {
1335 if (bstart)
1336 gfs2_rlist_add(sdp, &rlist, bstart);
1337 bstart = bn;
1338 blen = 1;
1339 }
1340 blks++;
1341 }
1342 if (bstart)
1343 gfs2_rlist_add(sdp, &rlist, bstart);
1344 else
1345 goto out;
1346
1347 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1348
1349 for (x = 0; x < rlist.rl_rgrps; x++) {
1350 struct gfs2_rgrpd *rgd;
1351 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1352 rg_blocks += rgd->rd_ri.ri_length;
1353 }
1354
1355 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1356 if (error)
1357 goto out_rlist_free;
1358
1359 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
1360 RES_STATFS + RES_QUOTA, blks);
1361 if (error)
1362 goto out_gunlock;
1363
1364 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1365
1366 eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1367 bstart = 0;
1368 blen = 0;
1369
1370 for (; eablk < end; eablk++) {
1371 u64 bn;
1372
1373 if (!*eablk)
1374 break;
1375 bn = be64_to_cpu(*eablk);
1376
1377 if (bstart + blen == bn)
1378 blen++;
1379 else {
1380 if (bstart)
1381 gfs2_free_meta(ip, bstart, blen);
1382 bstart = bn;
1383 blen = 1;
1384 }
1385
1386 *eablk = 0;
1387 if (!ip->i_di.di_blocks)
1388 gfs2_consist_inode(ip);
1389 ip->i_di.di_blocks--;
1390 }
1391 if (bstart)
1392 gfs2_free_meta(ip, bstart, blen);
1393
1394 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1395
1396 error = gfs2_meta_inode_buffer(ip, &dibh);
1397 if (!error) {
1398 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1399 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1400 brelse(dibh);
1401 }
1402
1403 gfs2_trans_end(sdp);
1404
1405out_gunlock:
1406 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1407out_rlist_free:
1408 gfs2_rlist_free(&rlist);
1409out:
1410 brelse(indbh);
1411 return error;
1412}
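/*
 * Note: ea_dealloc_indirect() walks the indirect block twice: the
 * first pass only collects the resource groups that will be touched
 * (gfs2_rlist_add) so that every rgrp glock can be acquired and the
 * transaction sized up front; the second pass then frees the blocks
 * and clears the pointers under that single transaction.
 */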
1413
1414static int ea_dealloc_block(struct gfs2_inode *ip)
1415{
1416 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1417 struct gfs2_alloc *al = &ip->i_alloc;
1418 struct gfs2_rgrpd *rgd;
1419 struct buffer_head *dibh;
1420 int error;
1421
1422 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1423 if (!rgd) {
1424 gfs2_consist_inode(ip);
1425 return -EIO;
1426 }
1427
1428 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1429 &al->al_rgd_gh);
1430 if (error)
1431 return error;
1432
1433 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
1434 RES_QUOTA, 1);
1435 if (error)
1436 goto out_gunlock;
1437
1438 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1439
1440 ip->i_di.di_eattr = 0;
1441 if (!ip->i_di.di_blocks)
1442 gfs2_consist_inode(ip);
1443 ip->i_di.di_blocks--;
1444
1445 error = gfs2_meta_inode_buffer(ip, &dibh);
1446 if (!error) {
1447 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1448 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1449 brelse(dibh);
1450 }
1451
1452 gfs2_trans_end(sdp);
1453
1454out_gunlock:
1455 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1456 return error;
1457}
1458
1459/**
1460 * gfs2_ea_dealloc - deallocate the extended attribute fork
1461 * @ip: the inode
1462 *
1463 * Returns: errno
1464 */
1465
1466int gfs2_ea_dealloc(struct gfs2_inode *ip)
1467{
1468 struct gfs2_alloc *al;
1469 int error;
1470
1471 al = gfs2_alloc_get(ip);
1472
1473 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1474 if (error)
1475 goto out_alloc;
1476
1477 error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
1478 if (error)
1479 goto out_quota;
1480
1481 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1482 if (error)
1483 goto out_rindex;
1484
1485 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1486 error = ea_dealloc_indirect(ip);
1487 if (error)
1488 goto out_rindex;
1489 }
1490
1491 error = ea_dealloc_block(ip);
1492
1493out_rindex:
1494 gfs2_glock_dq_uninit(&al->al_ri_gh);
1495out_quota:
1496 gfs2_quota_unhold(ip);
1497out_alloc:
1498 gfs2_alloc_put(ip);
1499 return error;
1500}
1501
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ffa65947d686
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,100 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
13struct gfs2_inode;
14struct iattr;
15
16#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
17#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
18
19#define GFS2_EA_SIZE(ea) \
20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
22 (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
26
27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36
37#define GFS2_EA2DATAPTRS(ea) \
38((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
39
40#define GFS2_EA2NEXT(ea) \
41((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
42
43#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45
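/*
 * Layout implied by the macros above (illustrative, ea_name_len = 4):
 *
 *	stuffed:   | gfs2_ea_header | name[4] | data ...          |
 *	unstuffed: | gfs2_ea_header | name[4] pad[4] | u64 ptrs[] |
 *
 * The name is padded to an 8-byte boundary before the block pointer
 * array, which is why GFS2_EA2DATAPTRS uses ALIGN(ea_name_len, 8).
 */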
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request {
49 const char *er_name;
50 char *er_data;
51 unsigned int er_name_len;
52 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56};
57
58struct gfs2_ea_location {
59 struct buffer_head *el_bh;
60 struct gfs2_ea_header *el_ea;
61 struct gfs2_ea_header *el_prev;
62};
63
64int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
65int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
66int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67
68int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
70int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72
73int gfs2_ea_dealloc(struct gfs2_inode *ip);
74
75/* Exported to acl.c */
76
77int gfs2_ea_find(struct gfs2_inode *ip,
78 struct gfs2_ea_request *er,
79 struct gfs2_ea_location *el);
80int gfs2_ea_get_copy(struct gfs2_inode *ip,
81 struct gfs2_ea_location *el,
82 char *data);
83int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
84 struct iattr *attr, char *data);
85
86static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
87{
88 switch (ea->ea_type) {
89 case GFS2_EATYPE_USR:
90 return 5 + ea->ea_name_len + 1;
91 case GFS2_EATYPE_SYS:
92 return 7 + ea->ea_name_len + 1;
93 case GFS2_EATYPE_SECURITY:
94 return 9 + ea->ea_name_len + 1;
95 default:
96 return 0;
97 }
98}
99
100#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..3bb11c0f8b56
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..78fe0fae23ff
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2231 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kallsyms.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/list.h>
21#include <linux/lm_interface.h>
22#include <asm/uaccess.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "super.h"
34#include "util.h"
35
36struct greedy {
37 struct gfs2_holder gr_gh;
38 struct work_struct gr_work;
39};
40
41struct gfs2_gl_hash_bucket {
42 struct hlist_head hb_list;
43};
44
45typedef void (*glock_examiner) (struct gfs2_glock * gl);
46
47static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
48static int dump_glock(struct gfs2_glock *gl);
49static int dump_inode(struct gfs2_inode *ip);
50
51#define GFS2_GL_HASH_SHIFT 15
52#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
53#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
54
55static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
56
57/*
58 * Despite what you might think, the numbers below are not arbitrary :-)
59 * They are taken from the ipv4 routing hash code, which is well tested
60 * and thus should be nearly optimal. Later on we might tweak the numbers
61 * but for now this should be fine.
62 *
63 * The reason for putting the locks in a separate array from the list heads
64 * is that we can have fewer locks than list heads and save memory. We use
65 * the same hash function for both, but with a different hash mask.
66 */
67#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
68 defined(CONFIG_PROVE_LOCKING)
69
70#ifdef CONFIG_LOCKDEP
71# define GL_HASH_LOCK_SZ 256
72#else
73# if NR_CPUS >= 32
74# define GL_HASH_LOCK_SZ 4096
75# elif NR_CPUS >= 16
76# define GL_HASH_LOCK_SZ 2048
77# elif NR_CPUS >= 8
78# define GL_HASH_LOCK_SZ 1024
79# elif NR_CPUS >= 4
80# define GL_HASH_LOCK_SZ 512
81# else
82# define GL_HASH_LOCK_SZ 256
83# endif
84#endif
85
86/* We never want more locks than chains */
87#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
88# undef GL_HASH_LOCK_SZ
89# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
90#endif
91
92static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
93
94static inline rwlock_t *gl_lock_addr(unsigned int x)
95{
96 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
97}
98#else /* not SMP, so no spinlocks required */
99static inline rwlock_t *gl_lock_addr(unsigned int x)
100{
101 return NULL;
102}
103#endif
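/*
 * Example (illustrative): with GL_HASH_LOCK_SZ == 256 the low eight
 * bits of the bucket number select the lock, so buckets 0x0034 and
 * 0x4d34 share gl_hash_locks[0x34]. The build-time clamp above keeps
 * GL_HASH_LOCK_SZ from ever exceeding GFS2_GL_HASH_SIZE.
 */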
104
105/**
106 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
107 * @actual: the current state of the lock
108 * @requested: the lock state that was requested by the caller
109 * @flags: the modifier flags passed in by the caller
110 *
111 * Returns: 1 if the locks are compatible, 0 otherwise
112 */
113
114static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
115 int flags)
116{
117 if (actual == requested)
118 return 1;
119
120 if (flags & GL_EXACT)
121 return 0;
122
123 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
124 return 1;
125
126 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
127 return 1;
128
129 return 0;
130}
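/*
 * Examples (illustrative): an exclusive glock satisfies a shared
 * request unless GL_EXACT is set, and LM_FLAG_ANY accepts any held
 * state:
 *
 *	relaxed_state_ok(LM_ST_EXCLUSIVE, LM_ST_SHARED, 0);          returns 1
 *	relaxed_state_ok(LM_ST_EXCLUSIVE, LM_ST_SHARED, GL_EXACT);   returns 0
 *	relaxed_state_ok(LM_ST_DEFERRED, LM_ST_SHARED, LM_FLAG_ANY); returns 1
 */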
131
132/**
133 * gl_hash() - Turn a glock name into a hash bucket number
134 * @sdp: The GFS2 superblock
135 * @name: The lock name
136 * Returns: The number of the corresponding hash bucket
137 */
138
139static unsigned int gl_hash(const struct gfs2_sbd *sdp,
140 const struct lm_lockname *name)
141{
142 unsigned int h;
143
144 h = jhash(&name->ln_number, sizeof(u64), 0);
145 h = jhash(&name->ln_type, sizeof(unsigned int), h);
146 h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
147 h &= GFS2_GL_HASH_MASK;
148
149 return h;
150}
151
152/**
153 * glock_free() - Perform a few checks and then release struct gfs2_glock
154 * @gl: The glock to release
155 *
156 * Also calls lock module to release its internal structure for this glock.
157 *
158 */
159
160static void glock_free(struct gfs2_glock *gl)
161{
162 struct gfs2_sbd *sdp = gl->gl_sbd;
163 struct inode *aspace = gl->gl_aspace;
164
165 gfs2_lm_put_lock(sdp, gl->gl_lock);
166
167 if (aspace)
168 gfs2_aspace_put(aspace);
169
170 kmem_cache_free(gfs2_glock_cachep, gl);
171}
172
173/**
174 * gfs2_glock_hold() - increment reference count on glock
175 * @gl: The glock to hold
176 *
177 */
178
179void gfs2_glock_hold(struct gfs2_glock *gl)
180{
181 atomic_inc(&gl->gl_ref);
182}
183
184/**
185 * gfs2_glock_put() - Decrement reference count on glock
186 * @gl: The glock to put
187 *
188 */
189
190int gfs2_glock_put(struct gfs2_glock *gl)
191{
192 int rv = 0;
193 struct gfs2_sbd *sdp = gl->gl_sbd;
194
195 write_lock(gl_lock_addr(gl->gl_hash));
196 if (atomic_dec_and_test(&gl->gl_ref)) {
197 hlist_del(&gl->gl_list);
198 write_unlock(gl_lock_addr(gl->gl_hash));
199 BUG_ON(spin_is_locked(&gl->gl_spin));
200 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
201 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
202 gfs2_assert(sdp, list_empty(&gl->gl_holders));
203 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
204 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
205 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
206 glock_free(gl);
207 rv = 1;
208 goto out;
209 }
210 write_unlock(gl_lock_addr(gl->gl_hash));
211out:
212 return rv;
213}
214
215/**
216 * queue_empty - check to see if a glock's queue is empty
217 * @gl: the glock
218 * @head: the head of the queue to check
219 *
220 * This function protects the list in the event that a process already
221 * has a holder on the list and is adding a second holder for itself.
222 * The glmutex lock is what generally prevents processes from working
223 * on the same glock at once, but the special case of adding a second
224 * holder for yourself ("recursive" locking) doesn't involve locking
225 * glmutex, making the spin lock necessary.
226 *
227 * Returns: 1 if the queue is empty
228 */
229
230static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
231{
232 int empty;
233 spin_lock(&gl->gl_spin);
234 empty = list_empty(head);
235 spin_unlock(&gl->gl_spin);
236 return empty;
237}
238
239/**
240 * search_bucket() - Find struct gfs2_glock by lock number
241 * @hash: the hash bucket to search
242 * @name: The lock name
243 *
244 * Returns: NULL, or the struct gfs2_glock with the requested number
245 */
246
247static struct gfs2_glock *search_bucket(unsigned int hash,
248 const struct gfs2_sbd *sdp,
249 const struct lm_lockname *name)
250{
251 struct gfs2_glock *gl;
252 struct hlist_node *h;
253
254 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
255 if (!lm_name_equal(&gl->gl_name, name))
256 continue;
257 if (gl->gl_sbd != sdp)
258 continue;
259
260 atomic_inc(&gl->gl_ref);
261
262 return gl;
263 }
264
265 return NULL;
266}
267
268/**
269 * gfs2_glock_find() - Find glock by lock number
270 * @sdp: The GFS2 superblock
271 * @name: The lock name
272 *
273 * Returns: NULL, or the struct gfs2_glock with the requested number
274 */
275
276static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
277 const struct lm_lockname *name)
278{
279 unsigned int hash = gl_hash(sdp, name);
280 struct gfs2_glock *gl;
281
282 read_lock(gl_lock_addr(hash));
283 gl = search_bucket(hash, sdp, name);
284 read_unlock(gl_lock_addr(hash));
285
286 return gl;
287}
288
289/**
290 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
291 * @sdp: The GFS2 superblock
292 * @number: the lock number
293 * @glops: The glock_operations to use
294 * @create: If 0, don't create the glock if it doesn't exist
295 * @glp: the glock is returned here
296 *
297 * This does not lock a glock, just finds/creates structures for one.
298 *
299 * Returns: errno
300 */
301
302int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
303 const struct gfs2_glock_operations *glops, int create,
304 struct gfs2_glock **glp)
305{
306 struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
307 struct gfs2_glock *gl, *tmp;
308 unsigned int hash = gl_hash(sdp, &name);
309 int error;
310
311 read_lock(gl_lock_addr(hash));
312 gl = search_bucket(hash, sdp, &name);
313 read_unlock(gl_lock_addr(hash));
314
315 if (gl || !create) {
316 *glp = gl;
317 return 0;
318 }
319
320 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
321 if (!gl)
322 return -ENOMEM;
323
324 gl->gl_flags = 0;
325 gl->gl_name = name;
326 atomic_set(&gl->gl_ref, 1);
327 gl->gl_state = LM_ST_UNLOCKED;
328 gl->gl_hash = hash;
329 gl->gl_owner = NULL;
330 gl->gl_ip = 0;
331 gl->gl_ops = glops;
332 gl->gl_req_gh = NULL;
333 gl->gl_req_bh = NULL;
334 gl->gl_vn = 0;
335 gl->gl_stamp = jiffies;
336 gl->gl_object = NULL;
337 gl->gl_sbd = sdp;
338 gl->gl_aspace = NULL;
339 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
340
341 /* If this glock protects actual on-disk data or metadata blocks,
342 create a VFS inode to manage the pages/buffers holding them. */
343 if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
344 gl->gl_aspace = gfs2_aspace_get(sdp);
345 if (!gl->gl_aspace) {
346 error = -ENOMEM;
347 goto fail;
348 }
349 }
350
351 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
352 if (error)
353 goto fail_aspace;
354
355 write_lock(gl_lock_addr(hash));
356 tmp = search_bucket(hash, sdp, &name);
357 if (tmp) {
358 write_unlock(gl_lock_addr(hash));
359 glock_free(gl);
360 gl = tmp;
361 } else {
362 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
363 write_unlock(gl_lock_addr(hash));
364 }
365
366 *glp = gl;
367
368 return 0;
369
370fail_aspace:
371 if (gl->gl_aspace)
372 gfs2_aspace_put(gl->gl_aspace);
373fail:
374 kmem_cache_free(gfs2_glock_cachep, gl);
375 return error;
376}
377
378/**
379 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
380 * @gl: the glock
381 * @state: the state we're requesting
382 * @flags: the modifier flags
383 * @gh: the holder structure
384 *
385 */
386
387void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
388 struct gfs2_holder *gh)
389{
390 INIT_LIST_HEAD(&gh->gh_list);
391 gh->gh_gl = gl;
392 gh->gh_ip = (unsigned long)__builtin_return_address(0);
393 gh->gh_owner = current;
394 gh->gh_state = state;
395 gh->gh_flags = flags;
396 gh->gh_error = 0;
397 gh->gh_iflags = 0;
398 init_completion(&gh->gh_wait);
399
400 if (gh->gh_state == LM_ST_EXCLUSIVE)
401 gh->gh_flags |= GL_LOCAL_EXCL;
402
403 gfs2_glock_hold(gl);
404}
405
406/**
407 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
408 * @state: the state we're requesting
409 * @flags: the modifier flags
410 * @gh: the holder structure
411 *
412 * Don't mess with the glock.
413 *
414 */
415
416void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
417{
418 gh->gh_state = state;
419 gh->gh_flags = flags;
420 if (gh->gh_state == LM_ST_EXCLUSIVE)
421 gh->gh_flags |= GL_LOCAL_EXCL;
422
423 gh->gh_iflags &= 1 << HIF_ALLOCED;
424 gh->gh_ip = (unsigned long)__builtin_return_address(0);
425}
426
427/**
428 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
429 * @gh: the holder structure
430 *
431 */
432
433void gfs2_holder_uninit(struct gfs2_holder *gh)
434{
435 gfs2_glock_put(gh->gh_gl);
436 gh->gh_gl = NULL;
437 gh->gh_ip = 0;
438}
439
440/**
441 * gfs2_holder_get - get a struct gfs2_holder structure
442 * @gl: the glock
443 * @state: the state we're requesting
444 * @flags: the modifier flags
445 * @gfp_flags: memory allocation flags
446 *
447 * Figure out how big an impact this function has. Either:
448 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
449 * 2) Leave it like it is
450 *
451 * Returns: the holder structure, NULL on ENOMEM
452 */
453
454static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
455 unsigned int state,
456 int flags, gfp_t gfp_flags)
457{
458 struct gfs2_holder *gh;
459
460 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
461 if (!gh)
462 return NULL;
463
464 gfs2_holder_init(gl, state, flags, gh);
465 set_bit(HIF_ALLOCED, &gh->gh_iflags);
466 gh->gh_ip = (unsigned long)__builtin_return_address(0);
467 return gh;
468}
469
470/**
471 * gfs2_holder_put - get rid of a struct gfs2_holder structure
472 * @gh: the holder structure
473 *
474 */
475
476static void gfs2_holder_put(struct gfs2_holder *gh)
477{
478 gfs2_holder_uninit(gh);
479 kfree(gh);
480}
481
482/**
483 * rq_mutex - process a mutex request in the queue
484 * @gh: the glock holder
485 *
486 * Returns: 1 if the queue is blocked
487 */
488
489static int rq_mutex(struct gfs2_holder *gh)
490{
491 struct gfs2_glock *gl = gh->gh_gl;
492
493 list_del_init(&gh->gh_list);
494 /* gh->gh_error never examined. */
495 set_bit(GLF_LOCK, &gl->gl_flags);
496 complete(&gh->gh_wait);
497
498 return 1;
499}
500
501/**
502 * rq_promote - process a promote request in the queue
503 * @gh: the glock holder
504 *
505 * Acquire a new inter-node lock, or change a lock state to more restrictive.
506 *
507 * Returns: 1 if the queue is blocked
508 */
509
510static int rq_promote(struct gfs2_holder *gh)
511{
512 struct gfs2_glock *gl = gh->gh_gl;
513 struct gfs2_sbd *sdp = gl->gl_sbd;
514 const struct gfs2_glock_operations *glops = gl->gl_ops;
515
516 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
517 if (list_empty(&gl->gl_holders)) {
518 gl->gl_req_gh = gh;
519 set_bit(GLF_LOCK, &gl->gl_flags);
520 spin_unlock(&gl->gl_spin);
521
522 if (atomic_read(&sdp->sd_reclaim_count) >
523 gfs2_tune_get(sdp, gt_reclaim_limit) &&
524 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
525 gfs2_reclaim_glock(sdp);
526 gfs2_reclaim_glock(sdp);
527 }
528
529 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
530 spin_lock(&gl->gl_spin);
531 }
532 return 1;
533 }
534
535 if (list_empty(&gl->gl_holders)) {
536 set_bit(HIF_FIRST, &gh->gh_iflags);
537 set_bit(GLF_LOCK, &gl->gl_flags);
538 } else {
539 struct gfs2_holder *next_gh;
540 if (gh->gh_flags & GL_LOCAL_EXCL)
541 return 1;
542 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
543 gh_list);
544 if (next_gh->gh_flags & GL_LOCAL_EXCL)
545 return 1;
546 }
547
548 list_move_tail(&gh->gh_list, &gl->gl_holders);
549 gh->gh_error = 0;
550 set_bit(HIF_HOLDER, &gh->gh_iflags);
551
552 complete(&gh->gh_wait);
553
554 return 0;
555}
556
557/**
558 * rq_demote - process a demote request in the queue
559 * @gh: the glock holder
560 *
561 * Returns: 1 if the queue is blocked
562 */
563
564static int rq_demote(struct gfs2_holder *gh)
565{
566 struct gfs2_glock *gl = gh->gh_gl;
567 const struct gfs2_glock_operations *glops = gl->gl_ops;
568
569 if (!list_empty(&gl->gl_holders))
570 return 1;
571
572 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
573 list_del_init(&gh->gh_list);
574 gh->gh_error = 0;
575 spin_unlock(&gl->gl_spin);
576 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
577 gfs2_holder_put(gh);
578 else
579 complete(&gh->gh_wait);
580 spin_lock(&gl->gl_spin);
581 } else {
582 gl->gl_req_gh = gh;
583 set_bit(GLF_LOCK, &gl->gl_flags);
584 spin_unlock(&gl->gl_spin);
585
586 if (gh->gh_state == LM_ST_UNLOCKED ||
587 gl->gl_state != LM_ST_EXCLUSIVE)
588 glops->go_drop_th(gl);
589 else
590 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
591
592 spin_lock(&gl->gl_spin);
593 }
594
595 return 0;
596}
597
598/**
599 * rq_greedy - process a queued request to drop greedy status
600 * @gh: the glock holder
601 *
602 * Returns: 1 if the queue is blocked
603 */
604
605static int rq_greedy(struct gfs2_holder *gh)
606{
607 struct gfs2_glock *gl = gh->gh_gl;
608
609 list_del_init(&gh->gh_list);
610 /* gh->gh_error never examined. */
611 clear_bit(GLF_GREEDY, &gl->gl_flags);
612 spin_unlock(&gl->gl_spin);
613
614 gfs2_holder_uninit(gh);
615 kfree(container_of(gh, struct greedy, gr_gh));
616
617 spin_lock(&gl->gl_spin);
618
619 return 0;
620}
621
622/**
623 * run_queue - process holder structures on a glock
624 * @gl: the glock
625 *
626 */
627static void run_queue(struct gfs2_glock *gl)
628{
629 struct gfs2_holder *gh;
630 int blocked = 1;
631
632 for (;;) {
633 if (test_bit(GLF_LOCK, &gl->gl_flags))
634 break;
635
636 if (!list_empty(&gl->gl_waiters1)) {
637 gh = list_entry(gl->gl_waiters1.next,
638 struct gfs2_holder, gh_list);
639
640 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
641 blocked = rq_mutex(gh);
642 else
643 gfs2_assert_warn(gl->gl_sbd, 0);
644
645 } else if (!list_empty(&gl->gl_waiters2) &&
646 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
647 gh = list_entry(gl->gl_waiters2.next,
648 struct gfs2_holder, gh_list);
649
650 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
651 blocked = rq_demote(gh);
652 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
653 blocked = rq_greedy(gh);
654 else
655 gfs2_assert_warn(gl->gl_sbd, 0);
656
657 } else if (!list_empty(&gl->gl_waiters3)) {
658 gh = list_entry(gl->gl_waiters3.next,
659 struct gfs2_holder, gh_list);
660
661 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
662 blocked = rq_promote(gh);
663 else
664 gfs2_assert_warn(gl->gl_sbd, 0);
665
666 } else
667 break;
668
669 if (blocked)
670 break;
671 }
672}
673
674/**
675 * gfs2_glmutex_lock - acquire a local lock on a glock
676 * @gl: the glock
677 *
678 * Gives caller exclusive access to manipulate a glock structure.
679 */
680
681static void gfs2_glmutex_lock(struct gfs2_glock *gl)
682{
683 struct gfs2_holder gh;
684
685 gfs2_holder_init(gl, 0, 0, &gh);
686 set_bit(HIF_MUTEX, &gh.gh_iflags);
687
688 spin_lock(&gl->gl_spin);
689 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
690 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
691 } else {
692 gl->gl_owner = current;
693 gl->gl_ip = (unsigned long)__builtin_return_address(0);
694 complete(&gh.gh_wait);
695 }
696 spin_unlock(&gl->gl_spin);
697
698 wait_for_completion(&gh.gh_wait);
699 gfs2_holder_uninit(&gh);
700}
701
702/**
703 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
704 * @gl: the glock
705 *
706 * Returns: 1 if the glock is acquired
707 */
708
709static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
710{
711 int acquired = 1;
712
713 spin_lock(&gl->gl_spin);
714 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
715 acquired = 0;
716 } else {
717 gl->gl_owner = current;
718 gl->gl_ip = (unsigned long)__builtin_return_address(0);
719 }
720 spin_unlock(&gl->gl_spin);
721
722 return acquired;
723}
724
725/**
726 * gfs2_glmutex_unlock - release a local lock on a glock
727 * @gl: the glock
728 *
729 */
730
731static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
732{
733 spin_lock(&gl->gl_spin);
734 clear_bit(GLF_LOCK, &gl->gl_flags);
735 gl->gl_owner = NULL;
736 gl->gl_ip = 0;
737 run_queue(gl);
738 BUG_ON(!spin_is_locked(&gl->gl_spin));
739 spin_unlock(&gl->gl_spin);
740}
741
742/**
743 * handle_callback - add a demote request to a lock's queue
744 * @gl: the glock
745 * @state: the state the caller wants us to change to
746 *
747 * Note: This may fail silently if we are out of memory.
748 */
749
750static void handle_callback(struct gfs2_glock *gl, unsigned int state)
751{
752 struct gfs2_holder *gh, *new_gh = NULL;
753
754restart:
755 spin_lock(&gl->gl_spin);
756
757 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
758 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
759 gl->gl_req_gh != gh) {
760 if (gh->gh_state != state)
761 gh->gh_state = LM_ST_UNLOCKED;
762 goto out;
763 }
764 }
765
766 if (new_gh) {
767 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
768 new_gh = NULL;
769 } else {
770 spin_unlock(&gl->gl_spin);
771
772 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
773 if (!new_gh)
774 return;
775 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
776 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
777
778 goto restart;
779 }
780
781out:
782 spin_unlock(&gl->gl_spin);
783
784 if (new_gh)
785 gfs2_holder_put(new_gh);
786}
787
788void gfs2_glock_inode_squish(struct inode *inode)
789{
790 struct gfs2_holder gh;
791 struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
792 gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
793 set_bit(HIF_DEMOTE, &gh.gh_iflags);
794 spin_lock(&gl->gl_spin);
795 gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
796 list_add_tail(&gh.gh_list, &gl->gl_waiters2);
797 run_queue(gl);
798 spin_unlock(&gl->gl_spin);
799 wait_for_completion(&gh.gh_wait);
800 gfs2_holder_uninit(&gh);
801}
802
803/**
804 * state_change - record that the glock is now in a different state
805 * @gl: the glock
806 * @new_state: the new state
807 *
808 */
809
810static void state_change(struct gfs2_glock *gl, unsigned int new_state)
811{
812 int held1, held2;
813
814 held1 = (gl->gl_state != LM_ST_UNLOCKED);
815 held2 = (new_state != LM_ST_UNLOCKED);
816
817 if (held1 != held2) {
818 if (held2)
819 gfs2_glock_hold(gl);
820 else
821 gfs2_glock_put(gl);
822 }
823
824 gl->gl_state = new_state;
825}
826
827/**
828 * xmote_bh - Called after the lock module is done acquiring a lock
829 * @gl: The glock in question
830 * @ret: the int returned from the lock module
831 *
832 */
833
834static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
835{
836 struct gfs2_sbd *sdp = gl->gl_sbd;
837 const struct gfs2_glock_operations *glops = gl->gl_ops;
838 struct gfs2_holder *gh = gl->gl_req_gh;
839 int prev_state = gl->gl_state;
840 int op_done = 1;
841
842 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
843 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
844 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
845
846 state_change(gl, ret & LM_OUT_ST_MASK);
847
848 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
849 if (glops->go_inval)
850 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
851 } else if (gl->gl_state == LM_ST_DEFERRED) {
852 /* We might not want to do this here.
853 Look at moving to the inode glops. */
854 if (glops->go_inval)
855 glops->go_inval(gl, DIO_DATA);
856 }
857
858 /* Deal with each possible exit condition */
859
860 if (!gh)
861 gl->gl_stamp = jiffies;
862 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
863 spin_lock(&gl->gl_spin);
864 list_del_init(&gh->gh_list);
865 gh->gh_error = -EIO;
866 spin_unlock(&gl->gl_spin);
867 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
868 spin_lock(&gl->gl_spin);
869 list_del_init(&gh->gh_list);
870 if (gl->gl_state == gh->gh_state ||
871 gl->gl_state == LM_ST_UNLOCKED) {
872 gh->gh_error = 0;
873 } else {
874 if (gfs2_assert_warn(sdp, gh->gh_flags &
875 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
876 fs_warn(sdp, "ret = 0x%.8X\n", ret);
877 gh->gh_error = GLR_TRYFAILED;
878 }
879 spin_unlock(&gl->gl_spin);
880
881 if (ret & LM_OUT_CANCELED)
882 handle_callback(gl, LM_ST_UNLOCKED);
883
884 } else if (ret & LM_OUT_CANCELED) {
885 spin_lock(&gl->gl_spin);
886 list_del_init(&gh->gh_list);
887 gh->gh_error = GLR_CANCELED;
888 spin_unlock(&gl->gl_spin);
889
890 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
891 spin_lock(&gl->gl_spin);
892 list_move_tail(&gh->gh_list, &gl->gl_holders);
893 gh->gh_error = 0;
894 set_bit(HIF_HOLDER, &gh->gh_iflags);
895 spin_unlock(&gl->gl_spin);
896
897 set_bit(HIF_FIRST, &gh->gh_iflags);
898
899 op_done = 0;
900
901 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
902 spin_lock(&gl->gl_spin);
903 list_del_init(&gh->gh_list);
904 gh->gh_error = GLR_TRYFAILED;
905 spin_unlock(&gl->gl_spin);
906
907 } else {
908 if (gfs2_assert_withdraw(sdp, 0) == -1)
909 fs_err(sdp, "ret = 0x%.8X\n", ret);
910 }
911
912 if (glops->go_xmote_bh)
913 glops->go_xmote_bh(gl);
914
915 if (op_done) {
916 spin_lock(&gl->gl_spin);
917 gl->gl_req_gh = NULL;
918 gl->gl_req_bh = NULL;
919 clear_bit(GLF_LOCK, &gl->gl_flags);
920 run_queue(gl);
921 spin_unlock(&gl->gl_spin);
922 }
923
924 gfs2_glock_put(gl);
925
926 if (gh) {
927 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
928 gfs2_holder_put(gh);
929 else
930 complete(&gh->gh_wait);
931 }
932}
933
934/**
935 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
936 * @gl: The glock in question
937 * @state: the requested state
938 * @flags: modifier flags to the lock call
939 *
940 */
941
942void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
943{
944 struct gfs2_sbd *sdp = gl->gl_sbd;
945 const struct gfs2_glock_operations *glops = gl->gl_ops;
946 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
947 LM_FLAG_NOEXP | LM_FLAG_ANY |
948 LM_FLAG_PRIORITY);
949 unsigned int lck_ret;
950
951 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
952 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
953 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
954 gfs2_assert_warn(sdp, state != gl->gl_state);
955
956 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
957 glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
958
959 gfs2_glock_hold(gl);
960 gl->gl_req_bh = xmote_bh;
961
962 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
963
964 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
965 return;
966
967 if (lck_ret & LM_OUT_ASYNC)
968 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
969 else
970 xmote_bh(gl, lck_ret);
971}
972
973/**
974 * drop_bh - Called after a lock module unlock completes
975 * @gl: the glock
976 * @ret: the return status
977 *
978 * Wakes up the process waiting on the struct gfs2_holder (if any)
979 * Drops the reference on the glock that the top half took out
980 *
981 */
982
983static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
984{
985 struct gfs2_sbd *sdp = gl->gl_sbd;
986 const struct gfs2_glock_operations *glops = gl->gl_ops;
987 struct gfs2_holder *gh = gl->gl_req_gh;
988
989 clear_bit(GLF_PREFETCH, &gl->gl_flags);
990
991 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
992 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
993 gfs2_assert_warn(sdp, !ret);
994
995 state_change(gl, LM_ST_UNLOCKED);
996
997 if (glops->go_inval)
998 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
999
1000 if (gh) {
1001 spin_lock(&gl->gl_spin);
1002 list_del_init(&gh->gh_list);
1003 gh->gh_error = 0;
1004 spin_unlock(&gl->gl_spin);
1005 }
1006
1007 if (glops->go_drop_bh)
1008 glops->go_drop_bh(gl);
1009
1010 spin_lock(&gl->gl_spin);
1011 gl->gl_req_gh = NULL;
1012 gl->gl_req_bh = NULL;
1013 clear_bit(GLF_LOCK, &gl->gl_flags);
1014 run_queue(gl);
1015 spin_unlock(&gl->gl_spin);
1016
1017 gfs2_glock_put(gl);
1018
1019 if (gh) {
1020 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
1021 gfs2_holder_put(gh);
1022 else
1023 complete(&gh->gh_wait);
1024 }
1025}
1026
1027/**
1028 * gfs2_glock_drop_th - call into the lock module to unlock a lock
1029 * @gl: the glock
1030 *
1031 */
1032
1033void gfs2_glock_drop_th(struct gfs2_glock *gl)
1034{
1035 struct gfs2_sbd *sdp = gl->gl_sbd;
1036 const struct gfs2_glock_operations *glops = gl->gl_ops;
1037 unsigned int ret;
1038
1039 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1040 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1041 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1042
1043 if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
1044 glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
1045
1046 gfs2_glock_hold(gl);
1047 gl->gl_req_bh = drop_bh;
1048
1049 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1050
1051 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1052 return;
1053
1054 if (!ret)
1055 drop_bh(gl, ret);
1056 else
1057 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1058}
1059
1060/**
1061 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1062 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1063 *
1064 * Don't cancel GL_NOCANCEL requests.
1065 */
1066
1067static void do_cancels(struct gfs2_holder *gh)
1068{
1069 struct gfs2_glock *gl = gh->gh_gl;
1070
1071 spin_lock(&gl->gl_spin);
1072
1073 while (gl->gl_req_gh != gh &&
1074 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1075 !list_empty(&gh->gh_list)) {
1076 if (gl->gl_req_bh && !(gl->gl_req_gh &&
1077 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1078 spin_unlock(&gl->gl_spin);
1079 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1080 msleep(100);
1081 spin_lock(&gl->gl_spin);
1082 } else {
1083 spin_unlock(&gl->gl_spin);
1084 msleep(100);
1085 spin_lock(&gl->gl_spin);
1086 }
1087 }
1088
1089 spin_unlock(&gl->gl_spin);
1090}
1091
1092/**
1093 * glock_wait_internal - wait on a glock acquisition
1094 * @gh: the glock holder
1095 *
1096 * Returns: 0 on success
1097 */
1098
1099static int glock_wait_internal(struct gfs2_holder *gh)
1100{
1101 struct gfs2_glock *gl = gh->gh_gl;
1102 struct gfs2_sbd *sdp = gl->gl_sbd;
1103 const struct gfs2_glock_operations *glops = gl->gl_ops;
1104
1105 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1106 return -EIO;
1107
1108 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1109 spin_lock(&gl->gl_spin);
1110 if (gl->gl_req_gh != gh &&
1111 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1112 !list_empty(&gh->gh_list)) {
1113 list_del_init(&gh->gh_list);
1114 gh->gh_error = GLR_TRYFAILED;
1115 run_queue(gl);
1116 spin_unlock(&gl->gl_spin);
1117 return gh->gh_error;
1118 }
1119 spin_unlock(&gl->gl_spin);
1120 }
1121
1122 if (gh->gh_flags & LM_FLAG_PRIORITY)
1123 do_cancels(gh);
1124
1125 wait_for_completion(&gh->gh_wait);
1126
1127 if (gh->gh_error)
1128 return gh->gh_error;
1129
1130 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1131 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
1132 gh->gh_flags));
1133
1134 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1135 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1136
1137 if (glops->go_lock) {
1138 gh->gh_error = glops->go_lock(gh);
1139 if (gh->gh_error) {
1140 spin_lock(&gl->gl_spin);
1141 list_del_init(&gh->gh_list);
1142 spin_unlock(&gl->gl_spin);
1143 }
1144 }
1145
1146 spin_lock(&gl->gl_spin);
1147 gl->gl_req_gh = NULL;
1148 gl->gl_req_bh = NULL;
1149 clear_bit(GLF_LOCK, &gl->gl_flags);
1150 run_queue(gl);
1151 spin_unlock(&gl->gl_spin);
1152 }
1153
1154 return gh->gh_error;
1155}
1156
1157static inline struct gfs2_holder *
1158find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1159{
1160 struct gfs2_holder *gh;
1161
1162 list_for_each_entry(gh, head, gh_list) {
1163 if (gh->gh_owner == owner)
1164 return gh;
1165 }
1166
1167 return NULL;
1168}
1169
1170/**
1171 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1172 * @gh: the holder structure to add
1173 *
1174 */
1175
1176static void add_to_queue(struct gfs2_holder *gh)
1177{
1178 struct gfs2_glock *gl = gh->gh_gl;
1179 struct gfs2_holder *existing;
1180
1181 BUG_ON(!gh->gh_owner);
1182
1183 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1184 if (existing) {
1185 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1186 printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
1187 printk(KERN_INFO "lock type : %d lock state : %d\n",
1188 existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
1189 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1190 printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
1191 printk(KERN_INFO "lock type : %d lock state : %d\n",
1192 gl->gl_name.ln_type, gl->gl_state);
1193 BUG();
1194 }
1195
1196 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1197 if (existing) {
1198 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1199 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1200 BUG();
1201 }
1202
1203 if (gh->gh_flags & LM_FLAG_PRIORITY)
1204 list_add(&gh->gh_list, &gl->gl_waiters3);
1205 else
1206 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1207}
1208
1209/**
1210 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1211 * @gh: the holder structure
1212 *
1213 * if (gh->gh_flags & GL_ASYNC), this only returns an error on shutdown (-EIO)
1214 *
1215 * Returns: 0, GLR_TRYFAILED, or errno on failure
1216 */
1217
1218int gfs2_glock_nq(struct gfs2_holder *gh)
1219{
1220 struct gfs2_glock *gl = gh->gh_gl;
1221 struct gfs2_sbd *sdp = gl->gl_sbd;
1222 int error = 0;
1223
1224restart:
1225 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1226 set_bit(HIF_ABORTED, &gh->gh_iflags);
1227 return -EIO;
1228 }
1229
1230 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1231
1232 spin_lock(&gl->gl_spin);
1233 add_to_queue(gh);
1234 run_queue(gl);
1235 spin_unlock(&gl->gl_spin);
1236
1237 if (!(gh->gh_flags & GL_ASYNC)) {
1238 error = glock_wait_internal(gh);
1239 if (error == GLR_CANCELED) {
1240 msleep(100);
1241 goto restart;
1242 }
1243 }
1244
1245 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1246
1247 if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
1248 dump_glock(gl);
1249
1250 return error;
1251}
1252
1253/**
1254 * gfs2_glock_poll - poll to see if an async request has been completed
1255 * @gh: the holder
1256 *
1257 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1258 */
1259
1260int gfs2_glock_poll(struct gfs2_holder *gh)
1261{
1262 struct gfs2_glock *gl = gh->gh_gl;
1263 int ready = 0;
1264
1265 spin_lock(&gl->gl_spin);
1266
1267 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1268 ready = 1;
1269 else if (list_empty(&gh->gh_list)) {
1270 if (gh->gh_error == GLR_CANCELED) {
1271 spin_unlock(&gl->gl_spin);
1272 msleep(100);
1273 if (gfs2_glock_nq(gh))
1274 return 1;
1275 return 0;
1276 } else
1277 ready = 1;
1278 }
1279
1280 spin_unlock(&gl->gl_spin);
1281
1282 return ready;
1283}
1284
1285/**
1286 * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
1287 * @gh: the holder structure
1288 *
1289 * Returns: 0, GLR_TRYFAILED, or errno on failure
1290 */
1291
1292int gfs2_glock_wait(struct gfs2_holder *gh)
1293{
1294 int error;
1295
1296 error = glock_wait_internal(gh);
1297 if (error == GLR_CANCELED) {
1298 msleep(100);
1299 gh->gh_flags &= ~GL_ASYNC;
1300 error = gfs2_glock_nq(gh);
1301 }
1302
1303 return error;
1304}
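
Taken together, gfs2_glock_nq(), gfs2_glock_poll() and gfs2_glock_wait()
support an asynchronous acquisition pattern. A sketch (the function name,
requested state and sleep interval are illustrative only):

	/* Sketch: start an async request, overlap other work, then reap it. */
	static int async_lock_example(struct gfs2_glock *gl)
	{
		struct gfs2_holder gh;
		int error;

		gfs2_holder_init(gl, LM_ST_SHARED, GL_ASYNC, &gh);

		error = gfs2_glock_nq(&gh);	/* with GL_ASYNC, fails only on shutdown */
		if (error)
			goto out;

		/* ... do other work while the request is in flight ... */

		while (!gfs2_glock_poll(&gh))
			msleep(10);		/* or poll from an event loop */

		error = gfs2_glock_wait(&gh);	/* collect the final result */
		if (!error)
			gfs2_glock_dq(&gh);
	out:
		gfs2_holder_uninit(&gh);
		return error;
	}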
1305
1306/**
1307 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1308 * @gh: the glock holder
1309 *
1310 */
1311
1312void gfs2_glock_dq(struct gfs2_holder *gh)
1313{
1314 struct gfs2_glock *gl = gh->gh_gl;
1315 const struct gfs2_glock_operations *glops = gl->gl_ops;
1316
1317 if (gh->gh_flags & GL_NOCACHE)
1318 handle_callback(gl, LM_ST_UNLOCKED);
1319
1320 gfs2_glmutex_lock(gl);
1321
1322 spin_lock(&gl->gl_spin);
1323 list_del_init(&gh->gh_list);
1324
1325 if (list_empty(&gl->gl_holders)) {
1326 spin_unlock(&gl->gl_spin);
1327
1328 if (glops->go_unlock)
1329 glops->go_unlock(gh);
1330
1331 gl->gl_stamp = jiffies;
1332
1333 spin_lock(&gl->gl_spin);
1334 }
1335
1336 clear_bit(GLF_LOCK, &gl->gl_flags);
1337 run_queue(gl);
1338 spin_unlock(&gl->gl_spin);
1339}
1340
1341/**
1342 * gfs2_glock_prefetch - Try to prefetch a glock
1343 * @gl: the glock
1344 * @state: the state to prefetch in
1345 * @flags: flags passed to go_xmote_th()
1346 *
1347 */
1348
1349static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1350 int flags)
1351{
1352 const struct gfs2_glock_operations *glops = gl->gl_ops;
1353
1354 spin_lock(&gl->gl_spin);
1355
1356 if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
1357 !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
1358 !list_empty(&gl->gl_waiters3) ||
1359 relaxed_state_ok(gl->gl_state, state, flags)) {
1360 spin_unlock(&gl->gl_spin);
1361 return;
1362 }
1363
1364 set_bit(GLF_PREFETCH, &gl->gl_flags);
1365 set_bit(GLF_LOCK, &gl->gl_flags);
1366 spin_unlock(&gl->gl_spin);
1367
1368 glops->go_xmote_th(gl, state, flags);
1369}
1370
1371static void greedy_work(void *data)
1372{
1373 struct greedy *gr = data;
1374 struct gfs2_holder *gh = &gr->gr_gh;
1375 struct gfs2_glock *gl = gh->gh_gl;
1376 const struct gfs2_glock_operations *glops = gl->gl_ops;
1377
1378 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1379
1380 if (glops->go_greedy)
1381 glops->go_greedy(gl);
1382
1383 spin_lock(&gl->gl_spin);
1384
1385 if (list_empty(&gl->gl_waiters2)) {
1386 clear_bit(GLF_GREEDY, &gl->gl_flags);
1387 spin_unlock(&gl->gl_spin);
1388 gfs2_holder_uninit(gh);
1389 kfree(gr);
1390 } else {
1391 gfs2_glock_hold(gl);
1392 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1393 run_queue(gl);
1394 spin_unlock(&gl->gl_spin);
1395 gfs2_glock_put(gl);
1396 }
1397}
1398
1399/**
1400 * gfs2_glock_be_greedy - hold off demote requests on a glock for a while
1401 * @gl: the glock
1402 * @time: the delay, in jiffies, before the greedy work runs
1403 *
1404 * Returns: 0 if go_greedy will be called, 1 otherwise
1405 */
1406
1407int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1408{
1409 struct greedy *gr;
1410 struct gfs2_holder *gh;
1411
1412 if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
1413 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1414 return 1;
1415
1416 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1417 if (!gr) {
1418 clear_bit(GLF_GREEDY, &gl->gl_flags);
1419 return 1;
1420 }
1421 gh = &gr->gr_gh;
1422
1423 gfs2_holder_init(gl, 0, 0, gh);
1424 set_bit(HIF_GREEDY, &gh->gh_iflags);
1425 INIT_WORK(&gr->gr_work, greedy_work, gr);
1426
1427 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1428 schedule_delayed_work(&gr->gr_work, time);
1429
1430 return 0;
1431}
1432
1433/**
1434 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
1435 * @gh: the holder structure
1436 *
1437 */
1438
1439void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1440{
1441 gfs2_glock_dq(gh);
1442 gfs2_holder_uninit(gh);
1443}
1444
1445/**
1446 * gfs2_glock_nq_num - acquire a glock based on lock number
1447 * @sdp: the filesystem
1448 * @number: the lock number
1449 * @glops: the glock operations for the type of glock
1450 * @state: the state to acquire the glock in
1451 * @flags: modifier flags for the acquisition
1452 * @gh: the struct gfs2_holder
1453 *
1454 * Returns: errno
1455 */
1456
1457int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
1458 const struct gfs2_glock_operations *glops,
1459 unsigned int state, int flags, struct gfs2_holder *gh)
1460{
1461 struct gfs2_glock *gl;
1462 int error;
1463
1464 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1465 if (!error) {
1466 error = gfs2_glock_nq_init(gl, state, flags, gh);
1467 gfs2_glock_put(gl);
1468 }
1469
1470 return error;
1471}
1472
1473/**
1474 * glock_compare - Compare two struct gfs2_glock structures for sorting
1475 * @arg_a: the first structure
1476 * @arg_b: the second structure
1477 *
1478 */
1479
1480static int glock_compare(const void *arg_a, const void *arg_b)
1481{
1482 const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
1483 const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
1484 const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1485 const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1486
1487 if (a->ln_number > b->ln_number)
1488 return 1;
1489 if (a->ln_number < b->ln_number)
1490 return -1;
1491 if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
1492 return 1;
1493 if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL))
1494 return 1;
1495 return 0;
1496}
1497
1498/**
1499 * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
1500 * @num_gh: the number of structures
1501 * @ghs: an array of struct gfs2_holder structures
1502 *
1503 * Returns: 0 on success (all glocks acquired),
1504 * errno on failure (no glocks acquired)
1505 */
1506
1507static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1508 struct gfs2_holder **p)
1509{
1510 unsigned int x;
1511 int error = 0;
1512
1513 for (x = 0; x < num_gh; x++)
1514 p[x] = &ghs[x];
1515
1516 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1517
1518 for (x = 0; x < num_gh; x++) {
1519 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1520
1521 error = gfs2_glock_nq(p[x]);
1522 if (error) {
1523 while (x--)
1524 gfs2_glock_dq(p[x]);
1525 break;
1526 }
1527 }
1528
1529 return error;
1530}
1531
1532/**
1533 * gfs2_glock_nq_m - acquire multiple glocks
1534 * @num_gh: the number of structures
1535 * @ghs: an array of struct gfs2_holder structures
1536 *
1537 * Figure out how big an impact this function has. Either:
1538 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1539 * 2) Forget async stuff and just call nq_m_sync()
1540 * 3) Leave it like it is
1541 *
1542 * Returns: 0 on success (all glocks acquired),
1543 * errno on failure (no glocks acquired)
1544 */
1545
1546int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1547{
1548 int *e;
1549 unsigned int x;
1550 int borked = 0, serious = 0;
1551 int error = 0;
1552
1553 if (!num_gh)
1554 return 0;
1555
1556 if (num_gh == 1) {
1557 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1558 return gfs2_glock_nq(ghs);
1559 }
1560
1561 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1562 if (!e)
1563 return -ENOMEM;
1564
1565 for (x = 0; x < num_gh; x++) {
1566 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1567 error = gfs2_glock_nq(&ghs[x]);
1568 if (error) {
1569 borked = 1;
1570 serious = error;
1571 num_gh = x;
1572 break;
1573 }
1574 }
1575
1576 for (x = 0; x < num_gh; x++) {
1577 error = e[x] = glock_wait_internal(&ghs[x]);
1578 if (error) {
1579 borked = 1;
1580 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1581 serious = error;
1582 }
1583 }
1584
1585 if (!borked) {
1586 kfree(e);
1587 return 0;
1588 }
1589
1590 for (x = 0; x < num_gh; x++)
1591 if (!e[x])
1592 gfs2_glock_dq(&ghs[x]);
1593
1594 if (serious)
1595 error = serious;
1596 else {
1597 for (x = 0; x < num_gh; x++)
1598 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1599 &ghs[x]);
1600 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1601 }
1602
1603 kfree(e);
1604
1605 return error;
1606}
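
A sketch of the intended caller pattern for the multiple-glock interfaces
(two glocks and the EXCLUSIVE state are illustrative only):

	/* Sketch: acquire two glocks together, all-or-nothing. The ordering
	 * and retry logic live inside gfs2_glock_nq_m()/nq_m_sync(). */
	static int two_lock_example(struct gfs2_glock *a, struct gfs2_glock *b)
	{
		struct gfs2_holder ghs[2];
		int error;

		gfs2_holder_init(a, LM_ST_EXCLUSIVE, 0, &ghs[0]);
		gfs2_holder_init(b, LM_ST_EXCLUSIVE, 0, &ghs[1]);

		error = gfs2_glock_nq_m(2, ghs);
		if (!error)
			gfs2_glock_dq_m(2, ghs);

		gfs2_holder_uninit(&ghs[0]);
		gfs2_holder_uninit(&ghs[1]);
		return error;
	}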
1607
1608/**
1609 * gfs2_glock_dq_m - release multiple glocks
1610 * @num_gh: the number of structures
1611 * @ghs: an array of struct gfs2_holder structures
1612 *
1613 */
1614
1615void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1616{
1617 unsigned int x;
1618
1619 for (x = 0; x < num_gh; x++)
1620 gfs2_glock_dq(&ghs[x]);
1621}
1622
1623/**
1624 * gfs2_glock_dq_uninit_m - release multiple glocks
1625 * @num_gh: the number of structures
1626 * @ghs: an array of struct gfs2_holder structures
1627 *
1628 */
1629
1630void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1631{
1632 unsigned int x;
1633
1634 for (x = 0; x < num_gh; x++)
1635 gfs2_glock_dq_uninit(&ghs[x]);
1636}
1637
1638/**
1639 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1640 * @sdp: the filesystem
1641 * @number: the lock number
1642 * @glops: the glock operations for the type of glock
1643 * @state: the state to prefetch the glock in
1644 * @flags: modifier flags for the acquisition
1645 *
1646 * Unlike gfs2_glock_nq_num(), this returns nothing; errors are ignored.
1647 */
1648
1649void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
1650 const struct gfs2_glock_operations *glops,
1651 unsigned int state, int flags)
1652{
1653 struct gfs2_glock *gl;
1654 int error;
1655
1656 if (atomic_read(&sdp->sd_reclaim_count) <
1657 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1658 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1659 if (!error) {
1660 gfs2_glock_prefetch(gl, state, flags);
1661 gfs2_glock_put(gl);
1662 }
1663 }
1664}
1665
1666/**
1667 * gfs2_lvb_hold - attach a LVB to a glock
1668 * @gl: The glock in question
1669 *
1670 */
1671
1672int gfs2_lvb_hold(struct gfs2_glock *gl)
1673{
1674 int error;
1675
1676 gfs2_glmutex_lock(gl);
1677
1678 if (!atomic_read(&gl->gl_lvb_count)) {
1679 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1680 if (error) {
1681 gfs2_glmutex_unlock(gl);
1682 return error;
1683 }
1684 gfs2_glock_hold(gl);
1685 }
1686 atomic_inc(&gl->gl_lvb_count);
1687
1688 gfs2_glmutex_unlock(gl);
1689
1690 return 0;
1691}
1692
1693/**
1694 * gfs2_lvb_unhold - detach a LVB from a glock
1695 * @gl: The glock in question
1696 *
1697 */
1698
1699void gfs2_lvb_unhold(struct gfs2_glock *gl)
1700{
1701 gfs2_glock_hold(gl);
1702 gfs2_glmutex_lock(gl);
1703
1704 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1705 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1706 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1707 gl->gl_lvb = NULL;
1708 gfs2_glock_put(gl);
1709 }
1710
1711 gfs2_glmutex_unlock(gl);
1712 gfs2_glock_put(gl);
1713}
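
The two LVB calls are reference counted and must be paired. A sketch of a
reader (the helper name and length handling are assumptions):

	/* Sketch: pin the glock's LVB, copy it out, release it. gl_lvb stays
	 * valid between hold and unhold because gl_lvb_count keeps both the
	 * lock-module buffer and a glock reference alive. */
	static int peek_lvb(struct gfs2_glock *gl, void *buf, unsigned int len)
	{
		int error = gfs2_lvb_hold(gl);
		if (error)
			return error;
		memcpy(buf, gl->gl_lvb, len);	/* caller bounds len to the LVB size */
		gfs2_lvb_unhold(gl);
		return 0;
	}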
1714
1715static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1716 unsigned int state)
1717{
1718 struct gfs2_glock *gl;
1719
1720 gl = gfs2_glock_find(sdp, name);
1721 if (!gl)
1722 return;
1723
1724 if (gl->gl_ops->go_callback)
1725 gl->gl_ops->go_callback(gl, state);
1726 handle_callback(gl, state);
1727
1728 spin_lock(&gl->gl_spin);
1729 run_queue(gl);
1730 spin_unlock(&gl->gl_spin);
1731
1732 gfs2_glock_put(gl);
1733}
1734
1735/**
1736 * gfs2_glock_cb - Callback used by locking module
1737 * @sdp: Pointer to the superblock
1738 * @type: Type of callback
1739 * @data: Type dependent data pointer
1740 *
1741 * Called by the locking module when it wants to tell us something.
1742 * Either we need to drop a lock, one of our ASYNC requests completed, or
1743 * a journal from another client needs to be recovered.
1744 */
1745
1746void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1747{
1748 struct gfs2_sbd *sdp = cb_data;
1749
1750 switch (type) {
1751 case LM_CB_NEED_E:
1752 blocking_cb(sdp, data, LM_ST_UNLOCKED);
1753 return;
1754
1755 case LM_CB_NEED_D:
1756 blocking_cb(sdp, data, LM_ST_DEFERRED);
1757 return;
1758
1759 case LM_CB_NEED_S:
1760 blocking_cb(sdp, data, LM_ST_SHARED);
1761 return;
1762
1763 case LM_CB_ASYNC: {
1764 struct lm_async_cb *async = data;
1765 struct gfs2_glock *gl;
1766
1767 gl = gfs2_glock_find(sdp, &async->lc_name);
1768 if (gfs2_assert_warn(sdp, gl))
1769 return;
1770 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1771 gl->gl_req_bh(gl, async->lc_ret);
1772 gfs2_glock_put(gl);
1773 return;
1774 }
1775
1776 case LM_CB_NEED_RECOVERY:
1777 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1778 if (sdp->sd_recoverd_process)
1779 wake_up_process(sdp->sd_recoverd_process);
1780 return;
1781
1782 case LM_CB_DROPLOCKS:
1783 gfs2_gl_hash_clear(sdp, NO_WAIT);
1784 gfs2_quota_scan(sdp);
1785 return;
1786
1787 default:
1788 gfs2_assert_warn(sdp, 0);
1789 return;
1790 }
1791}
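
For reference, a lock module reports events by invoking this callback with
the cb_data it was handed at mount time. A hypothetical ASYNC completion
from the module side (the module function is invented; struct lm_async_cb
comes from lm_interface.h):

	/* Sketch: tell GFS2 that an asynchronous lock request finished. */
	static void my_module_complete(void *cb_data, struct lm_lockname *name,
				       unsigned int ret)
	{
		struct lm_async_cb async;

		async.lc_name = *name;
		async.lc_ret = ret;		/* LM_OUT_* state and flags */
		gfs2_glock_cb(cb_data, LM_CB_ASYNC, &async);
	}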
1792
1793/**
1794 * demote_ok - Check to see if it's ok to unlock a glock
1795 * @gl: the glock
1796 *
1797 * Returns: 1 if it's ok
1798 */
1799
1800static int demote_ok(struct gfs2_glock *gl)
1801{
1802 struct gfs2_sbd *sdp = gl->gl_sbd;
1803 const struct gfs2_glock_operations *glops = gl->gl_ops;
1804 int demote = 1;
1805
1806 if (test_bit(GLF_STICKY, &gl->gl_flags))
1807 demote = 0;
1808 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1809 demote = time_after_eq(jiffies, gl->gl_stamp +
1810 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1811 else if (glops->go_demote_ok)
1812 demote = glops->go_demote_ok(gl);
1813
1814 return demote;
1815}
1816
1817/**
1818 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1819 * @gl: the glock
1820 *
1821 */
1822
1823void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1824{
1825 struct gfs2_sbd *sdp = gl->gl_sbd;
1826
1827 spin_lock(&sdp->sd_reclaim_lock);
1828 if (list_empty(&gl->gl_reclaim)) {
1829 gfs2_glock_hold(gl);
1830 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1831 atomic_inc(&sdp->sd_reclaim_count);
1832 }
1833 spin_unlock(&sdp->sd_reclaim_lock);
1834
1835 wake_up(&sdp->sd_reclaim_wq);
1836}
1837
1838/**
1839 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1840 * @sdp: the filesystem
1841 *
1842 * Called from the gfs2_glockd() glock reclaim daemon, or when promoting a
1843 * different glock and we notice that there are a lot of glocks in the
1844 * reclaim list.
1845 *
1846 */
1847
1848void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1849{
1850 struct gfs2_glock *gl;
1851
1852 spin_lock(&sdp->sd_reclaim_lock);
1853 if (list_empty(&sdp->sd_reclaim_list)) {
1854 spin_unlock(&sdp->sd_reclaim_lock);
1855 return;
1856 }
1857 gl = list_entry(sdp->sd_reclaim_list.next,
1858 struct gfs2_glock, gl_reclaim);
1859 list_del_init(&gl->gl_reclaim);
1860 spin_unlock(&sdp->sd_reclaim_lock);
1861
1862 atomic_dec(&sdp->sd_reclaim_count);
1863 atomic_inc(&sdp->sd_reclaimed);
1864
1865 if (gfs2_glmutex_trylock(gl)) {
1866 if (queue_empty(gl, &gl->gl_holders) &&
1867 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1868 handle_callback(gl, LM_ST_UNLOCKED);
1869 gfs2_glmutex_unlock(gl);
1870 }
1871
1872 gfs2_glock_put(gl);
1873}
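
A daemon built on this helper could be as simple as the following sketch
(the loop is illustrative; the real gfs2_glockd lives elsewhere in this
series):

	/* Sketch: drain the reclaim list one glock at a time. */
	static void reclaim_all(struct gfs2_sbd *sdp)
	{
		while (atomic_read(&sdp->sd_reclaim_count))
			gfs2_reclaim_glock(sdp);
	}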
1874
1875/**
1876 * examine_bucket - Call a function for each glock in a hash bucket
1877 * @examiner: the function
1878 * @sdp: the filesystem
1879 * @bucket: the bucket
1880 *
1881 * Returns: 1 if the bucket has entries
1882 */
1883
1884static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1885 unsigned int hash)
1886{
1887 struct gfs2_glock *gl, *prev = NULL;
1888 int has_entries = 0;
1889 struct hlist_head *head = &gl_hash_table[hash].hb_list;
1890
1891 read_lock(gl_lock_addr(hash));
1892 /* Can't use hlist_for_each_entry - don't want prefetch here */
1893 if (hlist_empty(head))
1894 goto out;
1895 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1896 while(1) {
1897 if (gl->gl_sbd == sdp) {
1898 gfs2_glock_hold(gl);
1899 read_unlock(gl_lock_addr(hash));
1900 if (prev)
1901 gfs2_glock_put(prev);
1902 prev = gl;
1903 examiner(gl);
1904 has_entries = 1;
1905 read_lock(gl_lock_addr(hash));
1906 }
1907 if (gl->gl_list.next == NULL)
1908 break;
1909 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1910 }
1911out:
1912 read_unlock(gl_lock_addr(hash));
1913 if (prev)
1914 gfs2_glock_put(prev);
1915 return has_entries;
1916}
1917
1918/**
1919 * scan_glock - look at a glock and see if we can reclaim it
1920 * @gl: the glock to look at
1921 *
1922 */
1923
1924static void scan_glock(struct gfs2_glock *gl)
1925{
1926 if (gl->gl_ops == &gfs2_inode_glops)
1927 return;
1928
1929 if (gfs2_glmutex_trylock(gl)) {
1930 if (queue_empty(gl, &gl->gl_holders) &&
1931 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1932 goto out_schedule;
1933 gfs2_glmutex_unlock(gl);
1934 }
1935 return;
1936
1937out_schedule:
1938 gfs2_glmutex_unlock(gl);
1939 gfs2_glock_schedule_for_reclaim(gl);
1940}
1941
1942/**
1943 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
1944 * @sdp: the filesystem
1945 *
1946 */
1947
1948void gfs2_scand_internal(struct gfs2_sbd *sdp)
1949{
1950 unsigned int x;
1951
1952 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1953 examine_bucket(scan_glock, sdp, x);
1954}
1955
1956/**
1957 * clear_glock - look at a glock and see if we can free it from glock cache
1958 * @gl: the glock to look at
1959 *
1960 */
1961
1962static void clear_glock(struct gfs2_glock *gl)
1963{
1964 struct gfs2_sbd *sdp = gl->gl_sbd;
1965 int released;
1966
1967 spin_lock(&sdp->sd_reclaim_lock);
1968 if (!list_empty(&gl->gl_reclaim)) {
1969 list_del_init(&gl->gl_reclaim);
1970 atomic_dec(&sdp->sd_reclaim_count);
1971 spin_unlock(&sdp->sd_reclaim_lock);
1972 released = gfs2_glock_put(gl);
1973 gfs2_assert(sdp, !released);
1974 } else {
1975 spin_unlock(&sdp->sd_reclaim_lock);
1976 }
1977
1978 if (gfs2_glmutex_trylock(gl)) {
1979 if (queue_empty(gl, &gl->gl_holders) &&
1980 gl->gl_state != LM_ST_UNLOCKED)
1981 handle_callback(gl, LM_ST_UNLOCKED);
1982 gfs2_glmutex_unlock(gl);
1983 }
1984}
1985
1986/**
1987 * gfs2_gl_hash_clear - Empty out the glock hash table
1988 * @sdp: the filesystem
1989 * @wait: wait until it's all gone
1990 *
1991 * Called when unmounting the filesystem, or when inter-node lock manager
1992 * requests DROPLOCKS because it is running out of capacity.
1993 */
1994
1995void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
1996{
1997 unsigned long t;
1998 unsigned int x;
1999 int cont;
2000
2001 t = jiffies;
2002
2003 for (;;) {
2004 cont = 0;
2005 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2006 if (examine_bucket(clear_glock, sdp, x))
2007 cont = 1;
2008 }
2009
2010 if (!wait || !cont)
2011 break;
2012
2013 if (time_after_eq(jiffies,
2014 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2015 fs_warn(sdp, "Unmount seems to be stalled. "
2016 "Dumping lock state...\n");
2017 gfs2_dump_lockstate(sdp);
2018 t = jiffies;
2019 }
2020
2021 invalidate_inodes(sdp->sd_vfs);
2022 msleep(10);
2023 }
2024}
2025
2026/*
2027 * Diagnostic routines to help debug distributed deadlock
2028 */
2029
2030/**
2031 * dump_holder - print information about a glock holder
2032 * @str: a string naming the type of holder
2033 * @gh: the glock holder
2034 *
2035 * Returns: 0 on success, -ENOBUFS when we run out of space
2036 */
2037
2038static int dump_holder(char *str, struct gfs2_holder *gh)
2039{
2040 unsigned int x;
2041 int error = -ENOBUFS;
2042
2043 printk(KERN_INFO " %s\n", str);
2044 printk(KERN_INFO " owner = %ld\n",
2045 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2046 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2047 printk(KERN_INFO " gh_flags =");
2048 for (x = 0; x < 32; x++)
2049 if (gh->gh_flags & (1 << x))
2050 printk(" %u", x);
2051 printk(" \n");
2052 printk(KERN_INFO " error = %d\n", gh->gh_error);
2053 printk(KERN_INFO " gh_iflags =");
2054 for (x = 0; x < 32; x++)
2055 if (test_bit(x, &gh->gh_iflags))
2056 printk(" %u", x);
2057 printk(" \n");
2058 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2059
2060 error = 0;
2061
2062 return error;
2063}
2064
2065/**
2066 * dump_inode - print information about an inode
2067 * @ip: the inode
2068 *
2069 * Returns: 0 on success, -ENOBUFS when we run out of space
2070 */
2071
2072static int dump_inode(struct gfs2_inode *ip)
2073{
2074 unsigned int x;
2075 int error = -ENOBUFS;
2076
2077 printk(KERN_INFO " Inode:\n");
2078 printk(KERN_INFO " num = %llu %llu\n",
2079 (unsigned long long)ip->i_num.no_formal_ino,
2080 (unsigned long long)ip->i_num.no_addr);
2081 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2082 printk(KERN_INFO " i_flags =");
2083 for (x = 0; x < 32; x++)
2084 if (test_bit(x, &ip->i_flags))
2085 printk(" %u", x);
2086 printk(" \n");
2087
2088 error = 0;
2089
2090 return error;
2091}
2092
2093/**
2094 * dump_glock - print information about a glock
2095 * @gl: the glock
2096 *
2097 *
2098 * Returns: 0 on success, -ENOBUFS when we run out of space
2099 */
2100
2101static int dump_glock(struct gfs2_glock *gl)
2102{
2103 struct gfs2_holder *gh;
2104 unsigned int x;
2105 int error = -ENOBUFS;
2106
2107 spin_lock(&gl->gl_spin);
2108
2109 printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
2110 (unsigned long long)gl->gl_name.ln_number);
2111 printk(KERN_INFO " gl_flags =");
2112 for (x = 0; x < 32; x++) {
2113 if (test_bit(x, &gl->gl_flags))
2114 printk(" %u", x);
2115 }
2116 printk(" \n");
2117 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref));
2118 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2119 printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm);
2120 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
2121 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2122 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2123 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2124 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2125 printk(KERN_INFO " le = %s\n",
2126 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2127 printk(KERN_INFO " reclaim = %s\n",
2128 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2129 if (gl->gl_aspace)
2130 printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
2131 gl->gl_aspace->i_mapping->nrpages);
2132 else
2133 printk(KERN_INFO " aspace = no\n");
2134 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2135 if (gl->gl_req_gh) {
2136 error = dump_holder("Request", gl->gl_req_gh);
2137 if (error)
2138 goto out;
2139 }
2140 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2141 error = dump_holder("Holder", gh);
2142 if (error)
2143 goto out;
2144 }
2145 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2146 error = dump_holder("Waiter1", gh);
2147 if (error)
2148 goto out;
2149 }
2150 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2151 error = dump_holder("Waiter2", gh);
2152 if (error)
2153 goto out;
2154 }
2155 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2156 error = dump_holder("Waiter3", gh);
2157 if (error)
2158 goto out;
2159 }
2160 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2161 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2162 list_empty(&gl->gl_holders)) {
2163 error = dump_inode(gl->gl_object);
2164 if (error)
2165 goto out;
2166 } else {
2167 error = -ENOBUFS;
2168 printk(KERN_INFO " Inode: busy\n");
2169 }
2170 }
2171
2172 error = 0;
2173
2174out:
2175 spin_unlock(&gl->gl_spin);
2176 return error;
2177}
2178
2179/**
2180 * gfs2_dump_lockstate - print out the current lockstate
2181 * @sdp: the filesystem
2182 *
2183 * Dumps the lockstate to the console.
2184 *
2185 *
2186 */
2187
2188static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2189{
2190 struct gfs2_glock *gl;
2191 struct hlist_node *h;
2192 unsigned int x;
2193 int error = 0;
2194
2195 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2196
2197 read_lock(gl_lock_addr(x));
2198
2199 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
2200 if (gl->gl_sbd != sdp)
2201 continue;
2202
2203 error = dump_glock(gl);
2204 if (error)
2205 break;
2206 }
2207
2208 read_unlock(gl_lock_addr(x));
2209
2210 if (error)
2211 break;
2212 }
2213
2214
2215 return error;
2216}
2217
2218int __init gfs2_glock_init(void)
2219{
2220 unsigned i;
2221 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
2222 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
2223 }
2224#ifdef GL_HASH_LOCK_SZ
2225 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
2226 rwlock_init(&gl_hash_locks[i]);
2227 }
2228#endif
2229 return 0;
2230}
2231
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2b2a889ee2cc
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,153 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13#include "incore.h"
14
15/* Flags for lock requests; used in gfs2_holder gh_flags field.
16 From lm_interface.h:
17#define LM_FLAG_TRY 0x00000001
18#define LM_FLAG_TRY_1CB 0x00000002
19#define LM_FLAG_NOEXP 0x00000004
20#define LM_FLAG_ANY 0x00000008
21#define LM_FLAG_PRIORITY 0x00000010 */
22
23#define GL_LOCAL_EXCL 0x00000020
24#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400
29#define GL_NOCANCEL 0x00001000
30#define GL_AOP 0x00004000
31#define GL_DUMP 0x00008000
32
33#define GLR_TRYFAILED 13
34#define GLR_CANCELED 14
35
36static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
37{
38 struct gfs2_holder *gh;
39 int locked = 0;
40
41 /* Look in glock's list of holders for one with current task as owner */
42 spin_lock(&gl->gl_spin);
43 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
44 if (gh->gh_owner == current) {
45 locked = 1;
46 break;
47 }
48 }
49 spin_unlock(&gl->gl_spin);
50
51 return locked;
52}
53
54static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
55{
56 return gl->gl_state == LM_ST_EXCLUSIVE;
57}
58
59static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
60{
61 return gl->gl_state == LM_ST_DEFERRED;
62}
63
64static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
65{
66 return gl->gl_state == LM_ST_SHARED;
67}
68
69static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
70{
71 int ret;
72 spin_lock(&gl->gl_spin);
73 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
74 spin_unlock(&gl->gl_spin);
75 return ret;
76}
77
78int gfs2_glock_get(struct gfs2_sbd *sdp,
79 u64 number, const struct gfs2_glock_operations *glops,
80 int create, struct gfs2_glock **glp);
81void gfs2_glock_hold(struct gfs2_glock *gl);
82int gfs2_glock_put(struct gfs2_glock *gl);
83void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
84 struct gfs2_holder *gh);
85void gfs2_holder_reinit(unsigned int state, unsigned flags,
86 struct gfs2_holder *gh);
87void gfs2_holder_uninit(struct gfs2_holder *gh);
88
89void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
90void gfs2_glock_drop_th(struct gfs2_glock *gl);
91
92int gfs2_glock_nq(struct gfs2_holder *gh);
93int gfs2_glock_poll(struct gfs2_holder *gh);
94int gfs2_glock_wait(struct gfs2_holder *gh);
95void gfs2_glock_dq(struct gfs2_holder *gh);
96
97int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
98
99void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
100int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
101 u64 number, const struct gfs2_glock_operations *glops,
102 unsigned int state, int flags, struct gfs2_holder *gh);
103
104int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
105void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
106void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
107
108void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
109 const struct gfs2_glock_operations *glops,
110 unsigned int state, int flags);
111void gfs2_glock_inode_squish(struct inode *inode);
112
113/**
114 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
115 * @gl: the glock
116 * @state: the state we're requesting
117 * @flags: the modifier flags
118 * @gh: the holder structure
119 *
120 * Returns: 0, GLR_*, or errno
121 */
122
123static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
124 unsigned int state, int flags,
125 struct gfs2_holder *gh)
126{
127 int error;
128
129 gfs2_holder_init(gl, state, flags, gh);
130
131 error = gfs2_glock_nq(gh);
132 if (error)
133 gfs2_holder_uninit(gh);
134
135 return error;
136}
137
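A sketch of the usual one-shot pattern built from gfs2_glock_nq_init() and
gfs2_glock_dq_uninit() (the wrapper name and SHARED state are illustrative):

	/* Sketch: take a shared lock around a read-side operation. On error,
	 * gfs2_glock_nq_init() has already uninitialized the holder. */
	static inline int read_under_glock(struct gfs2_glock *gl)
	{
		struct gfs2_holder gh;
		int error;

		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
		if (error)
			return error;
		/* ... read state protected by the glock ... */
		gfs2_glock_dq_uninit(&gh);
		return 0;
	}
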
138/* Lock Value Block functions */
139
140int gfs2_lvb_hold(struct gfs2_glock *gl);
141void gfs2_lvb_unhold(struct gfs2_glock *gl);
142
143void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
144
145void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
146void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
147
148void gfs2_scand_internal(struct gfs2_sbd *sdp);
149void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
150
151int __init gfs2_glock_init(void);
152
153#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..41a6b6818a50
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,615 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "bmap.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "recovery.h"
27#include "rgrp.h"
28#include "util.h"
29#include "trans.h"
30
31/**
32 * gfs2_ail_empty_gl - remove all buffers for a given lock from the AIL
33 * @gl: the glock
34 *
35 * None of the buffers should be dirty, locked, or pinned.
36 */
37
38static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
39{
40 struct gfs2_sbd *sdp = gl->gl_sbd;
41 unsigned int blocks;
42 struct list_head *head = &gl->gl_ail_list;
43 struct gfs2_bufdata *bd;
44 struct buffer_head *bh;
45 u64 blkno;
46 int error;
47
48 blocks = atomic_read(&gl->gl_ail_count);
49 if (!blocks)
50 return;
51
52 error = gfs2_trans_begin(sdp, 0, blocks);
53 if (gfs2_assert_withdraw(sdp, !error))
54 return;
55
56 gfs2_log_lock(sdp);
57 while (!list_empty(head)) {
58 bd = list_entry(head->next, struct gfs2_bufdata,
59 bd_ail_gl_list);
60 bh = bd->bd_bh;
61 blkno = bh->b_blocknr;
62 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
63
64 bd->bd_ail = NULL;
65 list_del(&bd->bd_ail_st_list);
66 list_del(&bd->bd_ail_gl_list);
67 atomic_dec(&gl->gl_ail_count);
68 brelse(bh);
69 gfs2_log_unlock(sdp);
70
71 gfs2_trans_add_revoke(sdp, blkno);
72
73 gfs2_log_lock(sdp);
74 }
75 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
76 gfs2_log_unlock(sdp);
77
78 gfs2_trans_end(sdp);
79 gfs2_log_flush(sdp, NULL);
80}
81
82/**
83 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
84 * @gl: the glock
85 *
86 */
87
88static void gfs2_pte_inval(struct gfs2_glock *gl)
89{
90 struct gfs2_inode *ip;
91 struct inode *inode;
92
93	ip = gl->gl_object;
94	if (!ip || !S_ISREG(ip->i_di.di_mode))
95		return;
96	inode = &ip->i_inode;
97
98 if (!test_bit(GIF_PAGED, &ip->i_flags))
99 return;
100
101 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
102
103 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
104 set_bit(GLF_DIRTY, &gl->gl_flags);
105
106 clear_bit(GIF_SW_PAGED, &ip->i_flags);
107}
108
109/**
110 * gfs2_page_inval - Invalidate all pages associated with a glock
111 * @gl: the glock
112 *
113 */
114
115static void gfs2_page_inval(struct gfs2_glock *gl)
116{
117 struct gfs2_inode *ip;
118 struct inode *inode;
119
120	ip = gl->gl_object;
121	if (!ip || !S_ISREG(ip->i_di.di_mode))
122		return;
123	inode = &ip->i_inode;
124
125 truncate_inode_pages(inode->i_mapping, 0);
126 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
127 clear_bit(GIF_PAGED, &ip->i_flags);
128}
129
130/**
131 * gfs2_page_wait - Wait for writeback of data
132 * @gl: the glock
133 *
134 * Syncs data (not metadata) for a regular file.
135 * No-op for all other types.
136 */
137
138static void gfs2_page_wait(struct gfs2_glock *gl)
139{
140 struct gfs2_inode *ip = gl->gl_object;
141 struct inode *inode = &ip->i_inode;
142 struct address_space *mapping = inode->i_mapping;
143 int error;
144
145 if (!S_ISREG(ip->i_di.di_mode))
146 return;
147
148 error = filemap_fdatawait(mapping);
149
150 /* Put back any errors cleared by filemap_fdatawait()
151 so they can be caught by someone who can pass them
152 up to user space. */
153
154 if (error == -ENOSPC)
155 set_bit(AS_ENOSPC, &mapping->flags);
156 else if (error)
157 set_bit(AS_EIO, &mapping->flags);
158
159}
160
161static void gfs2_page_writeback(struct gfs2_glock *gl)
162{
163 struct gfs2_inode *ip = gl->gl_object;
164 struct inode *inode = &ip->i_inode;
165 struct address_space *mapping = inode->i_mapping;
166
167 if (!S_ISREG(ip->i_di.di_mode))
168 return;
169
170 filemap_fdatawrite(mapping);
171}
172
173/**
174 * meta_go_sync - sync out the metadata for this glock
175 * @gl: the glock
176 * @flags: DIO_*
177 *
178 * Called when demoting or unlocking an EX glock. We must flush
179 * to disk all dirty buffers/pages relating to this glock, and must not
180 * return to the caller to demote/unlock the glock until I/O is complete.
181 */
182
183static void meta_go_sync(struct gfs2_glock *gl, int flags)
184{
185 if (!(flags & DIO_METADATA))
186 return;
187
188 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
189 gfs2_log_flush(gl->gl_sbd, gl);
190 gfs2_meta_sync(gl);
191 if (flags & DIO_RELEASE)
192 gfs2_ail_empty_gl(gl);
193 }
194
195}
196
197/**
198 * meta_go_inval - invalidate the metadata for this glock
199 * @gl: the glock
200 * @flags:
201 *
202 */
203
204static void meta_go_inval(struct gfs2_glock *gl, int flags)
205{
206 if (!(flags & DIO_METADATA))
207 return;
208
209 gfs2_meta_inval(gl);
210 gl->gl_vn++;
211}
212
213/**
214 * inode_go_xmote_th - promote/demote a glock
215 * @gl: the glock
216 * @state: the requested state
217 * @flags:
218 *
219 */
220
221static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
222 int flags)
223{
224 if (gl->gl_state != LM_ST_UNLOCKED)
225 gfs2_pte_inval(gl);
226 gfs2_glock_xmote_th(gl, state, flags);
227}
228
229/**
230 * inode_go_xmote_bh - After promoting/demoting a glock
231 * @gl: the glock
232 *
233 */
234
235static void inode_go_xmote_bh(struct gfs2_glock *gl)
236{
237 struct gfs2_holder *gh = gl->gl_req_gh;
238 struct buffer_head *bh;
239 int error;
240
241 if (gl->gl_state != LM_ST_UNLOCKED &&
242 (!gh || !(gh->gh_flags & GL_SKIP))) {
243 error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
244 if (!error)
245 brelse(bh);
246 }
247}
248
249/**
250 * inode_go_drop_th - unlock a glock
251 * @gl: the glock
252 *
253 * Invoked from rq_demote().
254 * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for
255 * too long) is being purged from our node's glock cache; we drop the lock.
256 */
257
258static void inode_go_drop_th(struct gfs2_glock *gl)
259{
260 gfs2_pte_inval(gl);
261 gfs2_glock_drop_th(gl);
262}
263
264/**
265 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
266 * @gl: the glock protecting the inode
267 * @flags:
268 *
269 */
270
271static void inode_go_sync(struct gfs2_glock *gl, int flags)
272{
273 int meta = (flags & DIO_METADATA);
274 int data = (flags & DIO_DATA);
275
276 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
277 if (meta && data) {
278 gfs2_page_writeback(gl);
279 gfs2_log_flush(gl->gl_sbd, gl);
280 gfs2_meta_sync(gl);
281 gfs2_page_wait(gl);
282 clear_bit(GLF_DIRTY, &gl->gl_flags);
283 } else if (meta) {
284 gfs2_log_flush(gl->gl_sbd, gl);
285 gfs2_meta_sync(gl);
286 } else if (data) {
287 gfs2_page_writeback(gl);
288 gfs2_page_wait(gl);
289 }
290 if (flags & DIO_RELEASE)
291 gfs2_ail_empty_gl(gl);
292 }
293}
294
295/**
296 * inode_go_inval - prepare an inode glock to be released
297 * @gl: the glock
298 * @flags:
299 *
300 */
301
302static void inode_go_inval(struct gfs2_glock *gl, int flags)
303{
304 int meta = (flags & DIO_METADATA);
305 int data = (flags & DIO_DATA);
306
307 if (meta) {
308 gfs2_meta_inval(gl);
309 gl->gl_vn++;
310 }
311 if (data)
312 gfs2_page_inval(gl);
313}
314
315/**
316 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
317 * @gl: the glock
318 *
319 * Returns: 1 if it's ok
320 */
321
322static int inode_go_demote_ok(struct gfs2_glock *gl)
323{
324 struct gfs2_sbd *sdp = gl->gl_sbd;
325 int demote = 0;
326
327 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
328 demote = 1;
329 else if (!sdp->sd_args.ar_localcaching &&
330 time_after_eq(jiffies, gl->gl_stamp +
331 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
332 demote = 1;
333
334 return demote;
335}
336
337/**
338 * inode_go_lock - operation done after an inode lock is locked by a process
339 * @gh: the holder
340 * @flags:
341 *
342 * Returns: errno
343 */
344
345static int inode_go_lock(struct gfs2_holder *gh)
346{
347 struct gfs2_glock *gl = gh->gh_gl;
348 struct gfs2_inode *ip = gl->gl_object;
349 int error = 0;
350
351 if (!ip)
352 return 0;
353
354 if (ip->i_vn != gl->gl_vn) {
355 error = gfs2_inode_refresh(ip);
356 if (error)
357 return error;
358 gfs2_inode_attr_in(ip);
359 }
360
361 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
362 (gl->gl_state == LM_ST_EXCLUSIVE) &&
363 (gh->gh_flags & GL_LOCAL_EXCL))
364 error = gfs2_truncatei_resume(ip);
365
366 return error;
367}
368
369/**
370 * inode_go_unlock - operation done before an inode lock is unlocked by a
371 * process
372 * @gh: the holder
373 * @flags:
374 *
375 */
376
377static void inode_go_unlock(struct gfs2_holder *gh)
378{
379 struct gfs2_glock *gl = gh->gh_gl;
380 struct gfs2_inode *ip = gl->gl_object;
381
382 if (ip == NULL)
383 return;
384 if (test_bit(GLF_DIRTY, &gl->gl_flags))
385 gfs2_inode_attr_in(ip);
386 gfs2_meta_cache_flush(ip);
387}
388
389/**
390 * inode_greedy -
391 * @gl: the glock
392 *
393 */
394
395static void inode_greedy(struct gfs2_glock *gl)
396{
397 struct gfs2_sbd *sdp = gl->gl_sbd;
398 struct gfs2_inode *ip = gl->gl_object;
399 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
400 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
401 unsigned int new_time;
402
403 spin_lock(&ip->i_spin);
404
405 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
406 new_time = ip->i_greedy + quantum;
407 if (new_time > max)
408 new_time = max;
409 } else {
410 new_time = ip->i_greedy - quantum;
411 if (!new_time || new_time > max)
412 new_time = 1;
413 }
414
415 ip->i_greedy = new_time;
416
417 spin_unlock(&ip->i_spin);
418
419 iput(&ip->i_inode);
420}
421
422/**
423 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
424 * @gl: the glock
425 *
426 * Returns: 1 if it's ok
427 */
428
429static int rgrp_go_demote_ok(struct gfs2_glock *gl)
430{
431 return !gl->gl_aspace->i_mapping->nrpages;
432}
433
434/**
435 * rgrp_go_lock - operation done after an rgrp lock is locked by
436 * a first holder on this node.
437 * @gh: the holder
438 * @flags:
439 *
440 * Returns: errno
441 */
442
443static int rgrp_go_lock(struct gfs2_holder *gh)
444{
445 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
446}
447
448/**
449 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
450 * a last holder on this node.
451 * @gh: the holder
452 * @flags:
453 *
454 */
455
456static void rgrp_go_unlock(struct gfs2_holder *gh)
457{
458 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
459}
460
461/**
462 * trans_go_xmote_th - promote/demote the transaction glock
463 * @gl: the glock
464 * @state: the requested state
465 * @flags:
466 *
467 */
468
469static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
470 int flags)
471{
472 struct gfs2_sbd *sdp = gl->gl_sbd;
473
474 if (gl->gl_state != LM_ST_UNLOCKED &&
475 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
476 gfs2_meta_syncfs(sdp);
477 gfs2_log_shutdown(sdp);
478 }
479
480 gfs2_glock_xmote_th(gl, state, flags);
481}
482
483/**
484 * trans_go_xmote_bh - After promoting/demoting the transaction glock
485 * @gl: the glock
486 *
487 */
488
489static void trans_go_xmote_bh(struct gfs2_glock *gl)
490{
491 struct gfs2_sbd *sdp = gl->gl_sbd;
492 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
493 struct gfs2_glock *j_gl = ip->i_gl;
494 struct gfs2_log_header head;
495 int error;
496
497 if (gl->gl_state != LM_ST_UNLOCKED &&
498 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
499 gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
500 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
501
502 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
503 if (error)
504 gfs2_consist(sdp);
505 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
506 gfs2_consist(sdp);
507
508		/* Initialize the head of the log */
509 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
510 sdp->sd_log_sequence = head.lh_sequence + 1;
511 gfs2_log_pointers_init(sdp, head.lh_blkno);
512 }
513 }
514}
515
516/**
517 * trans_go_drop_th - unlock the transaction glock
518 * @gl: the glock
519 *
520 * We want to sync the device even with localcaching. Remember
521 * that localcaching journal replay only marks buffers dirty.
522 */
523
524static void trans_go_drop_th(struct gfs2_glock *gl)
525{
526 struct gfs2_sbd *sdp = gl->gl_sbd;
527
528 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
529 gfs2_meta_syncfs(sdp);
530 gfs2_log_shutdown(sdp);
531 }
532
533 gfs2_glock_drop_th(gl);
534}
535
536/**
537 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
538 * @gl: the glock
539 *
540 * Returns: 1 if it's ok
541 */
542
543static int quota_go_demote_ok(struct gfs2_glock *gl)
544{
545 return !atomic_read(&gl->gl_lvb_count);
546}
547
548const struct gfs2_glock_operations gfs2_meta_glops = {
549 .go_xmote_th = gfs2_glock_xmote_th,
550 .go_drop_th = gfs2_glock_drop_th,
551 .go_type = LM_TYPE_META,
552};
553
554const struct gfs2_glock_operations gfs2_inode_glops = {
555 .go_xmote_th = inode_go_xmote_th,
556 .go_xmote_bh = inode_go_xmote_bh,
557 .go_drop_th = inode_go_drop_th,
558 .go_sync = inode_go_sync,
559 .go_inval = inode_go_inval,
560 .go_demote_ok = inode_go_demote_ok,
561 .go_lock = inode_go_lock,
562 .go_unlock = inode_go_unlock,
563 .go_greedy = inode_greedy,
564 .go_type = LM_TYPE_INODE,
565};
566
567const struct gfs2_glock_operations gfs2_rgrp_glops = {
568 .go_xmote_th = gfs2_glock_xmote_th,
569 .go_drop_th = gfs2_glock_drop_th,
570 .go_sync = meta_go_sync,
571 .go_inval = meta_go_inval,
572 .go_demote_ok = rgrp_go_demote_ok,
573 .go_lock = rgrp_go_lock,
574 .go_unlock = rgrp_go_unlock,
575 .go_type = LM_TYPE_RGRP,
576};
577
578const struct gfs2_glock_operations gfs2_trans_glops = {
579 .go_xmote_th = trans_go_xmote_th,
580 .go_xmote_bh = trans_go_xmote_bh,
581 .go_drop_th = trans_go_drop_th,
582 .go_type = LM_TYPE_NONDISK,
583};
584
585const struct gfs2_glock_operations gfs2_iopen_glops = {
586 .go_xmote_th = gfs2_glock_xmote_th,
587 .go_drop_th = gfs2_glock_drop_th,
588 .go_type = LM_TYPE_IOPEN,
589};
590
591const struct gfs2_glock_operations gfs2_flock_glops = {
592 .go_xmote_th = gfs2_glock_xmote_th,
593 .go_drop_th = gfs2_glock_drop_th,
594 .go_type = LM_TYPE_FLOCK,
595};
596
597const struct gfs2_glock_operations gfs2_nondisk_glops = {
598 .go_xmote_th = gfs2_glock_xmote_th,
599 .go_drop_th = gfs2_glock_drop_th,
600 .go_type = LM_TYPE_NONDISK,
601};
602
603const struct gfs2_glock_operations gfs2_quota_glops = {
604 .go_xmote_th = gfs2_glock_xmote_th,
605 .go_drop_th = gfs2_glock_drop_th,
606 .go_demote_ok = quota_go_demote_ok,
607 .go_type = LM_TYPE_QUOTA,
608};
609
610const struct gfs2_glock_operations gfs2_journal_glops = {
611 .go_xmote_th = gfs2_glock_xmote_th,
612 .go_drop_th = gfs2_glock_drop_th,
613 .go_type = LM_TYPE_JOURNAL,
614};
615
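A minimal sketch (illustrative only; example_rgrp_read is hypothetical) of how the operations vectors above are consumed: a caller names a vector when a glock is created, and the glock core calls back through gl->gl_ops on each state transition. gfs2_glock_get(), gfs2_glock_nq_init(), gfs2_glock_dq_uninit() and gfs2_glock_put() are the interfaces used elsewhere in this patch.

static int example_rgrp_read(struct gfs2_sbd *sdp, u64 blkno)
{
	struct gfs2_glock *gl;
	struct gfs2_holder gh;
	int error;

	/* Bind the rgrp operations vector to the new (or cached) glock */
	error = gfs2_glock_get(sdp, blkno, &gfs2_rgrp_glops, CREATE, &gl);
	if (error)
		return error;

	/* Enqueueing invokes rgrp_go_lock() once the lock is granted... */
	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
	if (!error)
		/* ...and dequeueing invokes rgrp_go_unlock() when the
		   last holder on this node lets go */
		gfs2_glock_dq_uninit(&gh);

	gfs2_glock_put(gl);
	return error;
}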
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..a1d9b5b024e6
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13#include "incore.h"
14
15extern const struct gfs2_glock_operations gfs2_meta_glops;
16extern const struct gfs2_glock_operations gfs2_inode_glops;
17extern const struct gfs2_glock_operations gfs2_rgrp_glops;
18extern const struct gfs2_glock_operations gfs2_trans_glops;
19extern const struct gfs2_glock_operations gfs2_iopen_glops;
20extern const struct gfs2_glock_operations gfs2_flock_glops;
21extern const struct gfs2_glock_operations gfs2_nondisk_glops;
22extern const struct gfs2_glock_operations gfs2_quota_glops;
23extern const struct gfs2_glock_operations gfs2_journal_glops;
24
25#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..118dc693d111
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,634 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#include <linux/fs.h>
14
15#define DIO_WAIT 0x00000010
16#define DIO_METADATA 0x00000020
17#define DIO_DATA 0x00000040
18#define DIO_RELEASE 0x00000080
19#define DIO_ALL 0x00000100
20
21struct gfs2_log_operations;
22struct gfs2_log_element;
23struct gfs2_holder;
24struct gfs2_glock;
25struct gfs2_quota_data;
26struct gfs2_trans;
27struct gfs2_ail;
28struct gfs2_jdesc;
29struct gfs2_sbd;
30
31typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
32
33/*
34 * Structure of operations that are associated with each
35 * type of element in the log.
36 */
37
38struct gfs2_log_operations {
39 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
40 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
41 void (*lo_before_commit) (struct gfs2_sbd *sdp);
42 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
43 void (*lo_before_scan) (struct gfs2_jdesc *jd,
44 struct gfs2_log_header *head, int pass);
45 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
46 struct gfs2_log_descriptor *ld, __be64 *ptr,
47 int pass);
48 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
49 const char *lo_name;
50};
51
52struct gfs2_log_element {
53 struct list_head le_list;
54 const struct gfs2_log_operations *le_ops;
55};
56
57struct gfs2_bitmap {
58 struct buffer_head *bi_bh;
59 char *bi_clone;
60 u32 bi_offset;
61 u32 bi_start;
62 u32 bi_len;
63};
64
65struct gfs2_rgrpd {
66 struct list_head rd_list; /* Link with superblock */
67 struct list_head rd_list_mru;
68 struct list_head rd_recent; /* Recently used rgrps */
69 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
70 struct gfs2_rindex rd_ri;
71 struct gfs2_rgrp rd_rg;
72 u64 rd_rg_vn;
73 struct gfs2_bitmap *rd_bits;
74 unsigned int rd_bh_count;
75 struct mutex rd_mutex;
76 u32 rd_free_clone;
77 struct gfs2_log_element rd_le;
78 u32 rd_last_alloc_data;
79 u32 rd_last_alloc_meta;
80 struct gfs2_sbd *rd_sbd;
81};
82
83enum gfs2_state_bits {
84 BH_Pinned = BH_PrivateStart,
85 BH_Escaped = BH_PrivateStart + 1,
86};
87
88BUFFER_FNS(Pinned, pinned)
89TAS_BUFFER_FNS(Pinned, pinned)
90BUFFER_FNS(Escaped, escaped)
91TAS_BUFFER_FNS(Escaped, escaped)
92
93struct gfs2_bufdata {
94 struct buffer_head *bd_bh;
95 struct gfs2_glock *bd_gl;
96
97 struct list_head bd_list_tr;
98 struct gfs2_log_element bd_le;
99
100 struct gfs2_ail *bd_ail;
101 struct list_head bd_ail_st_list;
102 struct list_head bd_ail_gl_list;
103};
104
105struct gfs2_glock_operations {
106 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
107 int flags);
108 void (*go_xmote_bh) (struct gfs2_glock * gl);
109 void (*go_drop_th) (struct gfs2_glock * gl);
110 void (*go_drop_bh) (struct gfs2_glock * gl);
111 void (*go_sync) (struct gfs2_glock * gl, int flags);
112 void (*go_inval) (struct gfs2_glock * gl, int flags);
113 int (*go_demote_ok) (struct gfs2_glock * gl);
114 int (*go_lock) (struct gfs2_holder * gh);
115 void (*go_unlock) (struct gfs2_holder * gh);
116 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
117 void (*go_greedy) (struct gfs2_glock * gl);
118 const int go_type;
119};
120
121enum {
122 /* Actions */
123 HIF_MUTEX = 0,
124 HIF_PROMOTE = 1,
125 HIF_DEMOTE = 2,
126 HIF_GREEDY = 3,
127
128 /* States */
129 HIF_ALLOCED = 4,
130 HIF_DEALLOC = 5,
131 HIF_HOLDER = 6,
132 HIF_FIRST = 7,
133 HIF_ABORTED = 9,
134};
135
136struct gfs2_holder {
137 struct list_head gh_list;
138
139 struct gfs2_glock *gh_gl;
140 struct task_struct *gh_owner;
141 unsigned int gh_state;
142 unsigned gh_flags;
143
144 int gh_error;
145 unsigned long gh_iflags;
146 struct completion gh_wait;
147 unsigned long gh_ip;
148};
149
150enum {
151 GLF_LOCK = 1,
152 GLF_STICKY = 2,
153 GLF_PREFETCH = 3,
154 GLF_DIRTY = 5,
155 GLF_SKIP_WAITERS2 = 6,
156 GLF_GREEDY = 7,
157};
158
159struct gfs2_glock {
160 struct hlist_node gl_list;
161 unsigned long gl_flags; /* GLF_... */
162 struct lm_lockname gl_name;
163 atomic_t gl_ref;
164
165 spinlock_t gl_spin;
166
167 unsigned int gl_state;
168 unsigned int gl_hash;
169 struct task_struct *gl_owner;
170 unsigned long gl_ip;
171 struct list_head gl_holders;
172 struct list_head gl_waiters1; /* HIF_MUTEX */
173 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
174 struct list_head gl_waiters3; /* HIF_PROMOTE */
175
176 const struct gfs2_glock_operations *gl_ops;
177
178 struct gfs2_holder *gl_req_gh;
179 gfs2_glop_bh_t gl_req_bh;
180
181 void *gl_lock;
182 char *gl_lvb;
183 atomic_t gl_lvb_count;
184
185 u64 gl_vn;
186 unsigned long gl_stamp;
187 void *gl_object;
188
189 struct list_head gl_reclaim;
190
191 struct gfs2_sbd *gl_sbd;
192
193 struct inode *gl_aspace;
194 struct gfs2_log_element gl_le;
195 struct list_head gl_ail_list;
196 atomic_t gl_ail_count;
197};
198
199struct gfs2_alloc {
200 /* Quota stuff */
201
202 struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
203 struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
204 unsigned int al_qd_num;
205
206 u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
207 u32 al_alloced; /* Filled in by gfs2_alloc_*() */
208
209 /* Filled in by gfs2_inplace_reserve() */
210
211 unsigned int al_line;
212 char *al_file;
213 struct gfs2_holder al_ri_gh;
214 struct gfs2_holder al_rgd_gh;
215 struct gfs2_rgrpd *al_rgd;
216
217};
218
219enum {
220 GIF_QD_LOCKED = 1,
221 GIF_PAGED = 2,
222 GIF_SW_PAGED = 3,
223};
224
225struct gfs2_inode {
226 struct inode i_inode;
227 struct gfs2_inum i_num;
228
229 unsigned long i_flags; /* GIF_... */
230
231 u64 i_vn;
232 struct gfs2_dinode i_di; /* To be replaced by ref to block */
233
234 struct gfs2_glock *i_gl; /* Move into i_gh? */
235 struct gfs2_holder i_iopen_gh;
236 struct gfs2_holder i_gh; /* for prepare/commit_write only */
237 struct gfs2_alloc i_alloc;
238 u64 i_last_rg_alloc;
239
240 spinlock_t i_spin;
241 struct rw_semaphore i_rw_mutex;
242 unsigned int i_greedy;
243 unsigned long i_last_pfault;
244
245 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
246};
247
248/*
249 * Since i_inode is the first element of struct gfs2_inode,
250 * this is effectively a cast.
251 */
252static inline struct gfs2_inode *GFS2_I(struct inode *inode)
253{
254 return container_of(inode, struct gfs2_inode, i_inode);
255}
256
257/* To be removed? */
258static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
259{
260 return inode->i_sb->s_fs_info;
261}
262
263enum {
264 GFF_DID_DIRECT_ALLOC = 0,
265 GFF_EXLOCK = 1,
266};
267
268struct gfs2_file {
269 unsigned long f_flags; /* GFF_... */
270 struct mutex f_fl_mutex;
271 struct gfs2_holder f_fl_gh;
272};
273
274struct gfs2_revoke {
275 struct gfs2_log_element rv_le;
276 u64 rv_blkno;
277};
278
279struct gfs2_revoke_replay {
280 struct list_head rr_list;
281 u64 rr_blkno;
282 unsigned int rr_where;
283};
284
285enum {
286 QDF_USER = 0,
287 QDF_CHANGE = 1,
288 QDF_LOCKED = 2,
289};
290
291struct gfs2_quota_lvb {
292 __be32 qb_magic;
293 u32 __pad;
294 __be64 qb_limit; /* Hard limit of # blocks to alloc */
295 __be64 qb_warn; /* Warn user when alloc is above this # */
296 __be64 qb_value; /* Current # blocks allocated */
297};
298
299struct gfs2_quota_data {
300 struct list_head qd_list;
301 unsigned int qd_count;
302
303 u32 qd_id;
304 unsigned long qd_flags; /* QDF_... */
305
306 s64 qd_change;
307 s64 qd_change_sync;
308
309 unsigned int qd_slot;
310 unsigned int qd_slot_count;
311
312 struct buffer_head *qd_bh;
313 struct gfs2_quota_change *qd_bh_qc;
314 unsigned int qd_bh_count;
315
316 struct gfs2_glock *qd_gl;
317 struct gfs2_quota_lvb qd_qb;
318
319 u64 qd_sync_gen;
320 unsigned long qd_last_warn;
321 unsigned long qd_last_touched;
322};
323
324struct gfs2_log_buf {
325 struct list_head lb_list;
326 struct buffer_head *lb_bh;
327 struct buffer_head *lb_real;
328};
329
330struct gfs2_trans {
331 unsigned long tr_ip;
332
333 unsigned int tr_blocks;
334 unsigned int tr_revokes;
335 unsigned int tr_reserved;
336
337 struct gfs2_holder tr_t_gh;
338
339 int tr_touched;
340
341 unsigned int tr_num_buf;
342 unsigned int tr_num_buf_new;
343 unsigned int tr_num_buf_rm;
344 struct list_head tr_list_buf;
345
346 unsigned int tr_num_revoke;
347 unsigned int tr_num_revoke_rm;
348};
349
350struct gfs2_ail {
351 struct list_head ai_list;
352
353 unsigned int ai_first;
354 struct list_head ai_ail1_list;
355 struct list_head ai_ail2_list;
356
357 u64 ai_sync_gen;
358};
359
360struct gfs2_jdesc {
361 struct list_head jd_list;
362
363 struct inode *jd_inode;
364 unsigned int jd_jid;
365 int jd_dirty;
366
367 unsigned int jd_blocks;
368};
369
370#define GFS2_GLOCKD_DEFAULT 1
371#define GFS2_GLOCKD_MAX 16
372
373#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
374#define GFS2_QUOTA_OFF 0
375#define GFS2_QUOTA_ACCOUNT 1
376#define GFS2_QUOTA_ON 2
377
378#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
379#define GFS2_DATA_WRITEBACK 1
380#define GFS2_DATA_ORDERED 2
381
382struct gfs2_args {
383 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
384 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
385 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
386 int ar_spectator; /* Don't get a journal because we're always RO */
387 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
388 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
389 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
390 int ar_debug; /* Oops on errors instead of trying to be graceful */
391 int ar_upgrade; /* Upgrade ondisk/multihost format */
392 unsigned int ar_num_glockd; /* Number of glockd threads */
393 int ar_posix_acl; /* Enable posix acls */
394 int ar_quota; /* off/account/on */
395 int ar_suiddir; /* suiddir support */
396 int ar_data; /* ordered/writeback */
397};
398
399struct gfs2_tune {
400 spinlock_t gt_spin;
401
402 unsigned int gt_ilimit;
403 unsigned int gt_ilimit_tries;
404 unsigned int gt_ilimit_min;
405 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
406 unsigned int gt_incore_log_blocks;
407 unsigned int gt_log_flush_secs;
408 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
409
410 unsigned int gt_scand_secs;
411 unsigned int gt_recoverd_secs;
412 unsigned int gt_logd_secs;
413 unsigned int gt_quotad_secs;
414
415 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
416 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
417 unsigned int gt_quota_scale_num; /* Numerator */
418 unsigned int gt_quota_scale_den; /* Denominator */
419 unsigned int gt_quota_cache_secs;
420 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
421 unsigned int gt_atime_quantum; /* Min secs between atime updates */
422 unsigned int gt_new_files_jdata;
423 unsigned int gt_new_files_directio;
424 unsigned int gt_max_atomic_write; /* Split big writes into this size */
425 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
426 unsigned int gt_lockdump_size;
427 unsigned int gt_stall_secs; /* Detects trouble! */
428 unsigned int gt_complain_secs;
429 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
430 unsigned int gt_entries_per_readdir;
431 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
432 unsigned int gt_greedy_default;
433 unsigned int gt_greedy_quantum;
434 unsigned int gt_greedy_max;
435 unsigned int gt_statfs_quantum;
436 unsigned int gt_statfs_slow;
437};
438
439enum {
440 SDF_JOURNAL_CHECKED = 0,
441 SDF_JOURNAL_LIVE = 1,
442 SDF_SHUTDOWN = 2,
443 SDF_NOATIME = 3,
444};
445
446#define GFS2_FSNAME_LEN 256
447
448struct gfs2_sbd {
449 struct super_block *sd_vfs;
450 struct super_block *sd_vfs_meta;
451 struct kobject sd_kobj;
452 unsigned long sd_flags; /* SDF_... */
453 struct gfs2_sb sd_sb;
454
455 /* Constants computed on mount */
456
457 u32 sd_fsb2bb;
458 u32 sd_fsb2bb_shift;
459 u32 sd_diptrs; /* Number of pointers in a dinode */
460 u32 sd_inptrs; /* Number of pointers in an indirect block */
461 u32 sd_jbsize; /* Size of a journaled data block */
462 u32 sd_hash_bsize; /* sizeof(exhash block) */
463 u32 sd_hash_bsize_shift;
464 u32 sd_hash_ptrs; /* Number of pointers in a hash block */
465 u32 sd_qc_per_block;
466 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
467 u32 sd_max_height; /* Max height of a file's metadata tree */
468 u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
469 u32 sd_max_jheight; /* Max height of journaled file's meta tree */
470 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
471
472 struct gfs2_args sd_args; /* Mount arguments */
473 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
474
475 /* Lock Stuff */
476
477 struct lm_lockstruct sd_lockstruct;
478 struct list_head sd_reclaim_list;
479 spinlock_t sd_reclaim_lock;
480 wait_queue_head_t sd_reclaim_wq;
481 atomic_t sd_reclaim_count;
482 struct gfs2_holder sd_live_gh;
483 struct gfs2_glock *sd_rename_gl;
484 struct gfs2_glock *sd_trans_gl;
485
486 /* Inode Stuff */
487
488 struct inode *sd_master_dir;
489 struct inode *sd_jindex;
490 struct inode *sd_inum_inode;
491 struct inode *sd_statfs_inode;
492 struct inode *sd_ir_inode;
493 struct inode *sd_sc_inode;
494 struct inode *sd_qc_inode;
495 struct inode *sd_rindex;
496 struct inode *sd_quota_inode;
497
498 /* Inum stuff */
499
500 struct mutex sd_inum_mutex;
501
502 /* StatFS stuff */
503
504 spinlock_t sd_statfs_spin;
505 struct mutex sd_statfs_mutex;
506 struct gfs2_statfs_change sd_statfs_master;
507 struct gfs2_statfs_change sd_statfs_local;
508 unsigned long sd_statfs_sync_time;
509
510 /* Resource group stuff */
511
512 u64 sd_rindex_vn;
513 spinlock_t sd_rindex_spin;
514 struct mutex sd_rindex_mutex;
515 struct list_head sd_rindex_list;
516 struct list_head sd_rindex_mru_list;
517 struct list_head sd_rindex_recent_list;
518 struct gfs2_rgrpd *sd_rindex_forward;
519 unsigned int sd_rgrps;
520
521 /* Journal index stuff */
522
523 struct list_head sd_jindex_list;
524 spinlock_t sd_jindex_spin;
525 struct mutex sd_jindex_mutex;
526 unsigned int sd_journals;
527 unsigned long sd_jindex_refresh_time;
528
529 struct gfs2_jdesc *sd_jdesc;
530 struct gfs2_holder sd_journal_gh;
531 struct gfs2_holder sd_jinode_gh;
532
533 struct gfs2_holder sd_ir_gh;
534 struct gfs2_holder sd_sc_gh;
535 struct gfs2_holder sd_qc_gh;
536
537 /* Daemon stuff */
538
539 struct task_struct *sd_scand_process;
540 struct task_struct *sd_recoverd_process;
541 struct task_struct *sd_logd_process;
542 struct task_struct *sd_quotad_process;
543 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
544 unsigned int sd_glockd_num;
545
546 /* Quota stuff */
547
548 struct list_head sd_quota_list;
549 atomic_t sd_quota_count;
550 spinlock_t sd_quota_spin;
551 struct mutex sd_quota_mutex;
552
553 unsigned int sd_quota_slots;
554 unsigned int sd_quota_chunks;
555 unsigned char **sd_quota_bitmap;
556
557 u64 sd_quota_sync_gen;
558 unsigned long sd_quota_sync_time;
559
560 /* Log stuff */
561
562 spinlock_t sd_log_lock;
563
564 unsigned int sd_log_blks_reserved;
565 unsigned int sd_log_commited_buf;
566 unsigned int sd_log_commited_revoke;
567
568 unsigned int sd_log_num_gl;
569 unsigned int sd_log_num_buf;
570 unsigned int sd_log_num_revoke;
571 unsigned int sd_log_num_rg;
572 unsigned int sd_log_num_databuf;
573 unsigned int sd_log_num_jdata;
574 unsigned int sd_log_num_hdrs;
575
576 struct list_head sd_log_le_gl;
577 struct list_head sd_log_le_buf;
578 struct list_head sd_log_le_revoke;
579 struct list_head sd_log_le_rg;
580 struct list_head sd_log_le_databuf;
581
582 unsigned int sd_log_blks_free;
583 struct mutex sd_log_reserve_mutex;
584
585 u64 sd_log_sequence;
586 unsigned int sd_log_head;
587 unsigned int sd_log_tail;
588 int sd_log_idle;
589
590 unsigned long sd_log_flush_time;
591 struct rw_semaphore sd_log_flush_lock;
592 struct list_head sd_log_flush_list;
593
594 unsigned int sd_log_flush_head;
595 u64 sd_log_flush_wrapped;
596
597 struct list_head sd_ail1_list;
598 struct list_head sd_ail2_list;
599 u64 sd_ail_sync_gen;
600
601 /* Replay stuff */
602
603 struct list_head sd_revoke_list;
604 unsigned int sd_replay_tail;
605
606 unsigned int sd_found_blocks;
607 unsigned int sd_found_revokes;
608 unsigned int sd_replayed_blocks;
609
610 /* For quiescing the filesystem */
611
612 struct gfs2_holder sd_freeze_gh;
613 struct mutex sd_freeze_lock;
614 unsigned int sd_freeze_count;
615
616 /* Counters */
617
618 atomic_t sd_glock_count;
619 atomic_t sd_glock_held_count;
620 atomic_t sd_inode_count;
621 atomic_t sd_reclaimed;
622
623 char sd_fsname[GFS2_FSNAME_LEN];
624 char sd_table_name[GFS2_FSNAME_LEN];
625 char sd_proto_name[GFS2_FSNAME_LEN];
626
627 /* Debugging crud */
628
629 unsigned long sd_last_warning;
630 struct vfsmount *sd_gfs2mnt;
631};
632
633#endif /* __INCORE_DOT_H__ */
634
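A minimal sketch (hypothetical function, not part of the patch) of the embedded-inode pattern that GFS2_I() above relies on: because i_inode is the first member of struct gfs2_inode, container_of() reduces to a pointer cast.

static void example_embedded_inode(struct inode *inode)
{
	/* Equivalent to GFS2_I(inode): subtracts offsetof(struct gfs2_inode,
	   i_inode), which is zero here, so this compiles down to a cast */
	struct gfs2_inode *ip = container_of(inode, struct gfs2_inode,
					     i_inode);

	/* The reverse mapping is a plain member access */
	BUG_ON(&ip->i_inode != inode);
}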
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..57c43ac47925
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1379 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19#include <linux/lm_interface.h>
20#include <linux/security.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "acl.h"
25#include "bmap.h"
26#include "dir.h"
27#include "eattr.h"
28#include "glock.h"
29#include "glops.h"
30#include "inode.h"
31#include "log.h"
32#include "meta_io.h"
33#include "ops_address.h"
34#include "ops_file.h"
35#include "ops_inode.h"
36#include "quota.h"
37#include "rgrp.h"
38#include "trans.h"
39#include "util.h"
40
41/**
42 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
43 * @ip: The GFS2 inode (with embedded disk inode data)
44 *
45 * The VFS inode is embedded in @ip as ip->i_inode.
46 */
47
48void gfs2_inode_attr_in(struct gfs2_inode *ip)
49{
50 struct inode *inode = &ip->i_inode;
51 struct gfs2_dinode *di = &ip->i_di;
52
53 inode->i_ino = ip->i_num.no_addr;
54
55 switch (di->di_mode & S_IFMT) {
56 case S_IFBLK:
57 case S_IFCHR:
58 inode->i_rdev = MKDEV(di->di_major, di->di_minor);
59 break;
60 default:
61 inode->i_rdev = 0;
62 break;
63 }
64
65 inode->i_mode = di->di_mode;
66 inode->i_nlink = di->di_nlink;
67 inode->i_uid = di->di_uid;
68 inode->i_gid = di->di_gid;
69 i_size_write(inode, di->di_size);
70 inode->i_atime.tv_sec = di->di_atime;
71 inode->i_mtime.tv_sec = di->di_mtime;
72 inode->i_ctime.tv_sec = di->di_ctime;
73 inode->i_atime.tv_nsec = 0;
74 inode->i_mtime.tv_nsec = 0;
75 inode->i_ctime.tv_nsec = 0;
76 inode->i_blocks = di->di_blocks <<
77 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
78
79 if (di->di_flags & GFS2_DIF_IMMUTABLE)
80 inode->i_flags |= S_IMMUTABLE;
81 else
82 inode->i_flags &= ~S_IMMUTABLE;
83
84 if (di->di_flags & GFS2_DIF_APPENDONLY)
85 inode->i_flags |= S_APPEND;
86 else
87 inode->i_flags &= ~S_APPEND;
88}
89
90/**
91 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
92 * @ip: The GFS2 inode
93 *
94 * Only copy out the attributes that we want the VFS layer
95 * to be able to modify.
96 */
97
98void gfs2_inode_attr_out(struct gfs2_inode *ip)
99{
100 struct inode *inode = &ip->i_inode;
101 struct gfs2_dinode *di = &ip->i_di;
102 gfs2_assert_withdraw(GFS2_SB(inode),
103 (di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
104 di->di_mode = inode->i_mode;
105 di->di_uid = inode->i_uid;
106 di->di_gid = inode->i_gid;
107 di->di_atime = inode->i_atime.tv_sec;
108 di->di_mtime = inode->i_mtime.tv_sec;
109 di->di_ctime = inode->i_ctime.tv_sec;
110}
111
112static int iget_test(struct inode *inode, void *opaque)
113{
114 struct gfs2_inode *ip = GFS2_I(inode);
115 struct gfs2_inum *inum = opaque;
116
117 if (ip && ip->i_num.no_addr == inum->no_addr)
118 return 1;
119
120 return 0;
121}
122
123static int iget_set(struct inode *inode, void *opaque)
124{
125 struct gfs2_inode *ip = GFS2_I(inode);
126 struct gfs2_inum *inum = opaque;
127
128 ip->i_num = *inum;
129 return 0;
130}
131
132struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
133{
134 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
135 iget_test, inum);
136}
137
138static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
139{
140 return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
141 iget_test, iget_set, inum);
142}
143
144/**
145 * gfs2_inode_lookup - Lookup an inode
146 * @sb: The super block
147 * @inum: The inode number
148 * @type: The type of the inode
149 *
150 * Returns: A VFS inode, or an error
151 */
152
153struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
154{
155 struct inode *inode = gfs2_iget(sb, inum);
156 struct gfs2_inode *ip = GFS2_I(inode);
157 struct gfs2_glock *io_gl;
158 int error;
159
160 if (inode->i_state & I_NEW) {
161 struct gfs2_sbd *sdp = GFS2_SB(inode);
162 umode_t mode = DT2IF(type);
163 inode->i_private = ip;
164 inode->i_mode = mode;
165
166 if (S_ISREG(mode)) {
167 inode->i_op = &gfs2_file_iops;
168 inode->i_fop = &gfs2_file_fops;
169 inode->i_mapping->a_ops = &gfs2_file_aops;
170 } else if (S_ISDIR(mode)) {
171 inode->i_op = &gfs2_dir_iops;
172 inode->i_fop = &gfs2_dir_fops;
173 } else if (S_ISLNK(mode)) {
174 inode->i_op = &gfs2_symlink_iops;
175 } else {
176 inode->i_op = &gfs2_dev_iops;
177 }
178
179 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
180 if (unlikely(error))
181 goto fail;
182 ip->i_gl->gl_object = ip;
183
184 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
185 if (unlikely(error))
186 goto fail_put;
187
188 ip->i_vn = ip->i_gl->gl_vn - 1;
189 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
190 if (unlikely(error))
191 goto fail_iopen;
192
193 gfs2_glock_put(io_gl);
194 unlock_new_inode(inode);
195 }
196
197 return inode;
198fail_iopen:
199 gfs2_glock_put(io_gl);
200fail_put:
201 ip->i_gl->gl_object = NULL;
202 gfs2_glock_put(ip->i_gl);
203fail:
204 iput(inode);
205 return ERR_PTR(error);
206}
207
208/**
209 * gfs2_inode_refresh - Refresh the incore copy of the dinode
210 * @ip: The GFS2 inode
211 *
212 * Returns: errno
213 */
214
215int gfs2_inode_refresh(struct gfs2_inode *ip)
216{
217 struct buffer_head *dibh;
218 int error;
219
220 error = gfs2_meta_inode_buffer(ip, &dibh);
221 if (error)
222 return error;
223
224 if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
225 brelse(dibh);
226 return -EIO;
227 }
228
229 gfs2_dinode_in(&ip->i_di, dibh->b_data);
230
231 brelse(dibh);
232
233 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
234 if (gfs2_consist_inode(ip))
235 gfs2_dinode_print(&ip->i_di);
236 return -EIO;
237 }
238 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
239 return -ESTALE;
240
241 ip->i_vn = ip->i_gl->gl_vn;
242
243 return 0;
244}
245
246int gfs2_dinode_dealloc(struct gfs2_inode *ip)
247{
248 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
249 struct gfs2_alloc *al;
250 struct gfs2_rgrpd *rgd;
251 int error;
252
253 if (ip->i_di.di_blocks != 1) {
254 if (gfs2_consist_inode(ip))
255 gfs2_dinode_print(&ip->i_di);
256 return -EIO;
257 }
258
259 al = gfs2_alloc_get(ip);
260
261 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
262 if (error)
263 goto out;
264
265 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
266 if (error)
267 goto out_qs;
268
269 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
270 if (!rgd) {
271 gfs2_consist_inode(ip);
272 error = -EIO;
273 goto out_rindex_relse;
274 }
275
276 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
277 &al->al_rgd_gh);
278 if (error)
279 goto out_rindex_relse;
280
281 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
282 if (error)
283 goto out_rg_gunlock;
284
285 gfs2_trans_add_gl(ip->i_gl);
286
287 gfs2_free_di(rgd, ip);
288
289 gfs2_trans_end(sdp);
290 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
291
292out_rg_gunlock:
293 gfs2_glock_dq_uninit(&al->al_rgd_gh);
294out_rindex_relse:
295 gfs2_glock_dq_uninit(&al->al_ri_gh);
296out_qs:
297 gfs2_quota_unhold(ip);
298out:
299 gfs2_alloc_put(ip);
300 return error;
301}
302
303/**
304 * gfs2_change_nlink - Change nlink count on inode
305 * @ip: The GFS2 inode
306 * @diff: The change in the nlink count required
307 *
308 * Returns: errno
309 */
310
311int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
312{
313 struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
314 struct buffer_head *dibh;
315 u32 nlink;
316 int error;
317
318 BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
319 nlink = ip->i_di.di_nlink + diff;
320
321 /* If we are reducing the nlink count, but the new value ends up being
322 bigger than the old one, we must have underflowed. */
323 if (diff < 0 && nlink > ip->i_di.di_nlink) {
324 if (gfs2_consist_inode(ip))
325 gfs2_dinode_print(&ip->i_di);
326 return -EIO;
327 }
328
329 error = gfs2_meta_inode_buffer(ip, &dibh);
330 if (error)
331 return error;
332
333 ip->i_di.di_nlink = nlink;
334 ip->i_di.di_ctime = get_seconds();
335 ip->i_inode.i_nlink = nlink;
336
337 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
338 gfs2_dinode_out(&ip->i_di, dibh->b_data);
339 brelse(dibh);
340 mark_inode_dirty(&ip->i_inode);
341
342 if (ip->i_di.di_nlink == 0) {
343 struct gfs2_rgrpd *rgd;
344 struct gfs2_holder ri_gh, rg_gh;
345
346 error = gfs2_rindex_hold(sdp, &ri_gh);
347 if (error)
348 goto out;
349 error = -EIO;
350 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
351 if (!rgd)
352 goto out_norgrp;
353 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
354 if (error)
355 goto out_norgrp;
356
357 clear_nlink(&ip->i_inode);
358 gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
359 gfs2_glock_dq_uninit(&rg_gh);
360out_norgrp:
361 gfs2_glock_dq_uninit(&ri_gh);
362 }
363out:
364 return error;
365}
366
367struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
368{
369 struct qstr qstr;
370 gfs2_str2qstr(&qstr, name);
371 return gfs2_lookupi(dip, &qstr, 1, NULL);
372}
373
374
375/**
376 * gfs2_lookupi - Look up a filename in a directory and return its inode
377 * @dir: The inode of the directory to search
378 * @name: The name of the inode to look for
379 * @is_root: If 1, ignore the caller's permissions
380 * @nd: The nameidata from the VFS lookup, or NULL
381 *
382 * There will always be a vnode (Linux VFS inode) for the @dir inode unless
383 * @is_root is true.
384 *
385 * Returns: the inode if found, NULL for -ENOENT, or ERR_PTR(errno)
386 */
387
388struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
389 int is_root, struct nameidata *nd)
390{
391 struct super_block *sb = dir->i_sb;
392 struct gfs2_inode *dip = GFS2_I(dir);
393 struct gfs2_holder d_gh;
394 struct gfs2_inum inum;
395 unsigned int type;
396 int error = 0;
397 struct inode *inode = NULL;
398
399 if (!name->len || name->len > GFS2_FNAMESIZE)
400 return ERR_PTR(-ENAMETOOLONG);
401
402 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
403 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
404 dir == sb->s_root->d_inode)) {
405 igrab(dir);
406 return dir;
407 }
408
409 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
410 if (error)
411 return ERR_PTR(error);
412
413 if (!is_root) {
414 error = permission(dir, MAY_EXEC, NULL);
415 if (error)
416 goto out;
417 }
418
419 error = gfs2_dir_search(dir, name, &inum, &type);
420 if (error)
421 goto out;
422
423 inode = gfs2_inode_lookup(sb, &inum, type);
424
425out:
426 gfs2_glock_dq_uninit(&d_gh);
427 if (error == -ENOENT)
428 return NULL;
429 return inode;
430}
431
432static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
433{
434 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
435 struct buffer_head *bh;
436 struct gfs2_inum_range ir;
437 int error;
438
439 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
440 if (error)
441 return error;
442 mutex_lock(&sdp->sd_inum_mutex);
443
444 error = gfs2_meta_inode_buffer(ip, &bh);
445 if (error) {
446 mutex_unlock(&sdp->sd_inum_mutex);
447 gfs2_trans_end(sdp);
448 return error;
449 }
450
451 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
452
453 if (ir.ir_length) {
454 *formal_ino = ir.ir_start++;
455 ir.ir_length--;
456 gfs2_trans_add_bh(ip->i_gl, bh, 1);
457 gfs2_inum_range_out(&ir,
458 bh->b_data + sizeof(struct gfs2_dinode));
459 brelse(bh);
460 mutex_unlock(&sdp->sd_inum_mutex);
461 gfs2_trans_end(sdp);
462 return 0;
463 }
464
465 brelse(bh);
466
467 mutex_unlock(&sdp->sd_inum_mutex);
468 gfs2_trans_end(sdp);
469
470 return 1;
471}
472
473static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
474{
475 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
476 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
477 struct gfs2_holder gh;
478 struct buffer_head *bh;
479 struct gfs2_inum_range ir;
480 int error;
481
482 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
483 if (error)
484 return error;
485
486 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
487 if (error)
488 goto out;
489 mutex_lock(&sdp->sd_inum_mutex);
490
491 error = gfs2_meta_inode_buffer(ip, &bh);
492 if (error)
493 goto out_end_trans;
494
495 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
496
497 if (!ir.ir_length) {
498 struct buffer_head *m_bh;
499 u64 x, y;
500
501 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
502 if (error)
503 goto out_brelse;
504
505 x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
506 x = y = be64_to_cpu(x);
507 ir.ir_start = x;
508 ir.ir_length = GFS2_INUM_QUANTUM;
509 x += GFS2_INUM_QUANTUM;
510 if (x < y)
511 gfs2_consist_inode(m_ip);
512 x = cpu_to_be64(x);
513 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
514 *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
515
516 brelse(m_bh);
517 }
518
519 *formal_ino = ir.ir_start++;
520 ir.ir_length--;
521
522 gfs2_trans_add_bh(ip->i_gl, bh, 1);
523 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
524
525out_brelse:
526 brelse(bh);
527out_end_trans:
528 mutex_unlock(&sdp->sd_inum_mutex);
529 gfs2_trans_end(sdp);
530out:
531 gfs2_glock_dq_uninit(&gh);
532 return error;
533}
534
535static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
536{
537 int error;
538
539 error = pick_formal_ino_1(sdp, inum);
540 if (error <= 0)
541 return error;
542
543 error = pick_formal_ino_2(sdp, inum);
544
545 return error;
546}
547
548/**
549 * create_ok - OK to create a new on-disk inode here?
550 * @dip: Directory in which dinode is to be created
551 * @name: Name of new dinode
552 * @mode:
553 *
554 * Returns: errno
555 */
556
557static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
558 unsigned int mode)
559{
560 int error;
561
562 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
563 if (error)
564 return error;
565
566 /* Don't create entries in an unlinked directory */
567 if (!dip->i_di.di_nlink)
568 return -EPERM;
569
570 error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
571 switch (error) {
572 case -ENOENT:
573 error = 0;
574 break;
575 case 0:
576 return -EEXIST;
577 default:
578 return error;
579 }
580
581 if (dip->i_di.di_entries == (u32)-1)
582 return -EFBIG;
583 if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1)
584 return -EMLINK;
585
586 return 0;
587}
588
589static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
590 unsigned int *uid, unsigned int *gid)
591{
592 if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
593 (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) {
594 if (S_ISDIR(*mode))
595 *mode |= S_ISUID;
596 else if (dip->i_di.di_uid != current->fsuid)
597 *mode &= ~07111;
598 *uid = dip->i_di.di_uid;
599 } else
600 *uid = current->fsuid;
601
602 if (dip->i_di.di_mode & S_ISGID) {
603 if (S_ISDIR(*mode))
604 *mode |= S_ISGID;
605 *gid = dip->i_di.di_gid;
606 } else
607 *gid = current->fsgid;
608}
609
610static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
611 u64 *generation)
612{
613 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
614 int error;
615
616 gfs2_alloc_get(dip);
617
618 dip->i_alloc.al_requested = RES_DINODE;
619 error = gfs2_inplace_reserve(dip);
620 if (error)
621 goto out;
622
623 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
624 if (error)
625 goto out_ipreserv;
626
627 inum->no_addr = gfs2_alloc_di(dip, generation);
628
629 gfs2_trans_end(sdp);
630
631out_ipreserv:
632 gfs2_inplace_release(dip);
633out:
634 gfs2_alloc_put(dip);
635 return error;
636}
637
638/**
639 * init_dinode - Fill in a new dinode structure
640 * @dip: the directory this inode is being created in
641 * @gl: The glock covering the new inode
642 * @inum: the inode number
643 * @mode: the file permissions
644 * @uid: the uid of the new dinode
645 * @gid: the gid of the new dinode
646 * @generation: the generation number of the new dinode
647 */
648
649static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
650 const struct gfs2_inum *inum, unsigned int mode,
651 unsigned int uid, unsigned int gid,
652 const u64 *generation)
653{
654 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
655 struct gfs2_dinode *di;
656 struct buffer_head *dibh;
657
658 dibh = gfs2_meta_new(gl, inum->no_addr);
659 gfs2_trans_add_bh(gl, dibh, 1);
660 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
661 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
662 di = (struct gfs2_dinode *)dibh->b_data;
663
664 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
665 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
666 di->di_mode = cpu_to_be32(mode);
667 di->di_uid = cpu_to_be32(uid);
668 di->di_gid = cpu_to_be32(gid);
669 di->di_nlink = cpu_to_be32(0);
670 di->di_size = cpu_to_be64(0);
671 di->di_blocks = cpu_to_be64(1);
672 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
673 di->di_major = di->di_minor = cpu_to_be32(0);
674 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
675 di->di_generation = cpu_to_be64(*generation);
676 di->di_flags = cpu_to_be32(0);
677
678 if (S_ISREG(mode)) {
679 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
680 gfs2_tune_get(sdp, gt_new_files_jdata))
681 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
682 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
683 gfs2_tune_get(sdp, gt_new_files_directio))
684 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
685 } else if (S_ISDIR(mode)) {
686 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
687 GFS2_DIF_INHERIT_DIRECTIO);
688 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
689 GFS2_DIF_INHERIT_JDATA);
690 }
691
692 di->__pad1 = 0;
693 di->di_payload_format = cpu_to_be32(0);
694 di->di_height = cpu_to_be32(0);
695 di->__pad2 = 0;
696 di->__pad3 = 0;
697 di->di_depth = cpu_to_be16(0);
698 di->di_entries = cpu_to_be32(0);
699 memset(&di->__pad4, 0, sizeof(di->__pad4));
700 di->di_eattr = cpu_to_be64(0);
701 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
702
703 brelse(dibh);
704}
705
706static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
707 unsigned int mode, const struct gfs2_inum *inum,
708 const u64 *generation)
709{
710 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
711 unsigned int uid, gid;
712 int error;
713
714 munge_mode_uid_gid(dip, &mode, &uid, &gid);
715 gfs2_alloc_get(dip);
716
717 error = gfs2_quota_lock(dip, uid, gid);
718 if (error)
719 goto out;
720
721 error = gfs2_quota_check(dip, uid, gid);
722 if (error)
723 goto out_quota;
724
725 error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
726 if (error)
727 goto out_quota;
728
729 init_dinode(dip, gl, inum, mode, uid, gid, generation);
730 gfs2_quota_change(dip, +1, uid, gid);
731 gfs2_trans_end(sdp);
732
733out_quota:
734 gfs2_quota_unlock(dip);
735out:
736 gfs2_alloc_put(dip);
737 return error;
738}
739
740static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
741 struct gfs2_inode *ip)
742{
743 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
744 struct gfs2_alloc *al;
745 int alloc_required;
746 struct buffer_head *dibh;
747 int error;
748
749 al = gfs2_alloc_get(dip);
750
751 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
752 if (error)
753 goto fail;
754
755 error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
756 if (alloc_required < 0)
757 goto fail;
758 if (alloc_required) {
759 error = gfs2_quota_check(dip, dip->i_di.di_uid,
760 dip->i_di.di_gid);
761 if (error)
762 goto fail_quota_locks;
763
764 al->al_requested = sdp->sd_max_dirres;
765
766 error = gfs2_inplace_reserve(dip);
767 if (error)
768 goto fail_quota_locks;
769
770 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
771 al->al_rgd->rd_ri.ri_length +
772 2 * RES_DINODE +
773 RES_STATFS + RES_QUOTA, 0);
774 if (error)
775 goto fail_ipreserv;
776 } else {
777 error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
778 if (error)
779 goto fail_quota_locks;
780 }
781
782 error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
783 if (error)
784 goto fail_end_trans;
785
786 error = gfs2_meta_inode_buffer(ip, &dibh);
787 if (error)
788 goto fail_end_trans;
789 ip->i_di.di_nlink = 1;
790 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
791 gfs2_dinode_out(&ip->i_di, dibh->b_data);
792 brelse(dibh);
793 return 0;
794
795fail_end_trans:
796 gfs2_trans_end(sdp);
797
798fail_ipreserv:
799 if (dip->i_alloc.al_rgd)
800 gfs2_inplace_release(dip);
801
802fail_quota_locks:
803 gfs2_quota_unlock(dip);
804
805fail:
806 gfs2_alloc_put(dip);
807 return error;
808}
809
810static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
811{
812 int err;
813 size_t len;
814 void *value;
815 char *name;
816 struct gfs2_ea_request er;
817
818 err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
819 &name, &value, &len);
820
821 if (err) {
822 if (err == -EOPNOTSUPP)
823 return 0;
824 return err;
825 }
826
827 memset(&er, 0, sizeof(struct gfs2_ea_request));
828
829 er.er_type = GFS2_EATYPE_SECURITY;
830 er.er_name = name;
831 er.er_data = value;
832 er.er_name_len = strlen(name);
833 er.er_data_len = len;
834
835 err = gfs2_ea_set_i(ip, &er);
836
837 kfree(value);
838 kfree(name);
839
840 return err;
841}
842
843/**
844 * gfs2_createi - Create a new inode
845 * @ghs: An array of two holders
846 * @name: The name of the new file
847 * @mode: the permissions on the new inode
848 *
849 * @ghs[0] is an initialized holder for the directory
850 * @ghs[1] is the holder for the inode lock
851 *
852 * If the return value is not an error, the glocks on both the directory and the new
853 * file are held. A transaction has been started and an inplace reservation
854 * is held, as well.
855 *
856 * Returns: An inode, or ERR_PTR(errno) on failure
857 */
858
859struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
860 unsigned int mode)
861{
862 struct inode *inode;
863 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
864 struct inode *dir = &dip->i_inode;
865 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
866 struct gfs2_inum inum;
867 int error;
868 u64 generation;
869
870 if (!name->len || name->len > GFS2_FNAMESIZE)
871 return ERR_PTR(-ENAMETOOLONG);
872
873 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
874 error = gfs2_glock_nq(ghs);
875 if (error)
876 goto fail;
877
878 error = create_ok(dip, name, mode);
879 if (error)
880 goto fail_gunlock;
881
882 error = pick_formal_ino(sdp, &inum.no_formal_ino);
883 if (error)
884 goto fail_gunlock;
885
886 error = alloc_dinode(dip, &inum, &generation);
887 if (error)
888 goto fail_gunlock;
889
890 if (inum.no_addr < dip->i_num.no_addr) {
891 gfs2_glock_dq(ghs);
892
893 error = gfs2_glock_nq_num(sdp, inum.no_addr,
894 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
895 GL_SKIP, ghs + 1);
896 if (error) {
897 return ERR_PTR(error);
898 }
899
900 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
901 error = gfs2_glock_nq(ghs);
902 if (error) {
903 gfs2_glock_dq_uninit(ghs + 1);
904 return ERR_PTR(error);
905 }
906
907 error = create_ok(dip, name, mode);
908 if (error)
909 goto fail_gunlock2;
910 } else {
911 error = gfs2_glock_nq_num(sdp, inum.no_addr,
912 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
913 GL_SKIP, ghs + 1);
914 if (error)
915 goto fail_gunlock;
916 }
917
918 error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
919 if (error)
920 goto fail_gunlock2;
921
922 inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
923 error = PTR_ERR(inode);
924 if (IS_ERR(inode))
925 goto fail_gunlock2;
926 error = gfs2_inode_refresh(GFS2_I(inode));
927 if (error)
928 goto fail_iput;
929
930 error = gfs2_acl_create(dip, GFS2_I(inode));
931 if (error)
932 goto fail_iput;
933
934 error = gfs2_security_init(dip, GFS2_I(inode));
935 if (error)
936 goto fail_iput;
937
938 error = link_dinode(dip, name, GFS2_I(inode));
939 if (error)
940 goto fail_iput;
941
942 if (!inode)
943 return ERR_PTR(-ENOMEM);
944 return inode;
945
946fail_iput:
947 iput(inode);
948fail_gunlock2:
949 gfs2_glock_dq_uninit(ghs + 1);
950fail_gunlock:
951 gfs2_glock_dq(ghs);
952fail:
953 return ERR_PTR(error);
954}
955
956/**
957 * gfs2_rmdiri - Remove a directory
958 * @dip: The parent directory of the directory to be removed
959 * @name: The name of the directory to be removed
960 * @ip: The GFS2 inode of the directory to be removed
961 *
962 * Assumes Glocks on dip and ip are held
963 *
964 * Returns: errno
965 */
966
967int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
968 struct gfs2_inode *ip)
969{
970 struct qstr dotname;
971 int error;
972
973 if (ip->i_di.di_entries != 2) {
974 if (gfs2_consist_inode(ip))
975 gfs2_dinode_print(&ip->i_di);
976 return -EIO;
977 }
978
979 error = gfs2_dir_del(dip, name);
980 if (error)
981 return error;
982
983 error = gfs2_change_nlink(dip, -1);
984 if (error)
985 return error;
986
987 gfs2_str2qstr(&dotname, ".");
988 error = gfs2_dir_del(ip, &dotname);
989 if (error)
990 return error;
991
992 gfs2_str2qstr(&dotname, "..");
993 error = gfs2_dir_del(ip, &dotname);
994 if (error)
995 return error;
996
997 error = gfs2_change_nlink(ip, -2);
998 if (error)
999 return error;
1000
1001 return error;
1002}
1003
1004/**
1005 * gfs2_unlink_ok - check to see that an inode is still in a directory
1006 * @dip: the directory
1007 * @name: the name of the file
1008 * @ip: the inode
1009 *
1010 * Assumes that the lock on (at least) @dip is held.
1011 *
1012 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1013 */
1014
1015int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1016 struct gfs2_inode *ip)
1017{
1018 struct gfs2_inum inum;
1019 unsigned int type;
1020 int error;
1021
1022 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
1023 return -EPERM;
1024
1025 if ((dip->i_di.di_mode & S_ISVTX) &&
1026 dip->i_di.di_uid != current->fsuid &&
1027 ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
1028 return -EPERM;
1029
1030 if (IS_APPEND(&dip->i_inode))
1031 return -EPERM;
1032
1033 error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
1034 if (error)
1035 return error;
1036
1037 error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
1038 if (error)
1039 return error;
1040
1041 if (!gfs2_inum_equal(&inum, &ip->i_num))
1042 return -ENOENT;
1043
1044 if (IF2DT(ip->i_di.di_mode) != type) {
1045 gfs2_consist_inode(dip);
1046 return -EIO;
1047 }
1048
1049 return 0;
1050}
1051
1052/**
1053 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1054 * @this: move this
1055 * @to: to here
1056 *
1057 * Follow @to back to the root and make sure we don't encounter @this.
1058 * Assumes we already hold the rename lock.
1059 *
1060 * Returns: errno
1061 */
1062
1063int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1064{
1065 struct inode *dir = &to->i_inode;
1066 struct super_block *sb = dir->i_sb;
1067 struct inode *tmp;
1068 struct qstr dotdot;
1069 int error = 0;
1070
1071 gfs2_str2qstr(&dotdot, "..");
1072
1073 igrab(dir);
1074
1075 for (;;) {
1076 if (dir == &this->i_inode) {
1077 error = -EINVAL;
1078 break;
1079 }
1080 if (dir == sb->s_root->d_inode) {
1081 error = 0;
1082 break;
1083 }
1084
1085 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1086 if (IS_ERR(tmp)) {
1087 error = PTR_ERR(tmp);
1088 break;
1089 }
1090
1091 iput(dir);
1092 dir = tmp;
1093 }
1094
1095 iput(dir);
1096
1097 return error;
1098}
1099
1100/**
1101 * gfs2_readlinki - return the contents of a symlink
1102 * @ip: the symlink's inode
1103 * @buf: a pointer to the buffer to be filled
1104 * @len: a pointer to the length of @buf
1105 *
1106 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1107 * to be freed by the caller.
1108 *
1109 * Returns: errno
1110 */
1111
1112int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1113{
1114 struct gfs2_holder i_gh;
1115 struct buffer_head *dibh;
1116 unsigned int x;
1117 int error;
1118
1119 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1120 error = gfs2_glock_nq_atime(&i_gh);
1121 if (error) {
1122 gfs2_holder_uninit(&i_gh);
1123 return error;
1124 }
1125
1126 if (!ip->i_di.di_size) {
1127 gfs2_consist_inode(ip);
1128 error = -EIO;
1129 goto out;
1130 }
1131
1132 error = gfs2_meta_inode_buffer(ip, &dibh);
1133 if (error)
1134 goto out;
1135
1136 x = ip->i_di.di_size + 1;
1137 if (x > *len) {
1138 *buf = kmalloc(x, GFP_KERNEL);
1139 if (!*buf) {
1140 error = -ENOMEM;
1141 goto out_brelse;
1142 }
1143 }
1144
1145 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1146 *len = x;
1147
1148out_brelse:
1149 brelse(dibh);
1150out:
1151 gfs2_glock_dq_uninit(&i_gh);
1152 return error;
1153}
1154
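/*
 * A hypothetical caller of gfs2_readlinki(), illustrating the contract
 * documented above: if the supplied buffer is too small the function
 * kmalloc()s a larger one, which the caller must free. example_read_link()
 * is not part of this patch.
 */
static int example_read_link(struct gfs2_inode *ip)
{
	char small[64], *buf = small;
	unsigned int len = sizeof(small);
	int error;

	error = gfs2_readlinki(ip, &buf, &len);
	if (!error && buf != small)
		kfree(buf);	/* gfs2_readlinki() allocated a bigger buffer */
	return error;
}
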
1155/**
1156 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1157 * conditionally update the inode's atime
1158 * @gh: the holder to acquire
1159 *
1160 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap.
1161 * The atime is updated if the difference between the current time and the
1162 * inode's current atime is greater than an interval specified at mount.
1163 *
1164 * Returns: errno
1165 */
1166
1167int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1168{
1169 struct gfs2_glock *gl = gh->gh_gl;
1170 struct gfs2_sbd *sdp = gl->gl_sbd;
1171 struct gfs2_inode *ip = gl->gl_object;
1172 s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1173 unsigned int state;
1174 int flags;
1175 int error;
1176
1177 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1178 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1179 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1180 return -EINVAL;
1181
1182 state = gh->gh_state;
1183 flags = gh->gh_flags;
1184
1185 error = gfs2_glock_nq(gh);
1186 if (error)
1187 return error;
1188
1189 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1190 (sdp->sd_vfs->s_flags & MS_RDONLY))
1191 return 0;
1192
1193 curtime = get_seconds();
1194 if (curtime - ip->i_di.di_atime >= quantum) {
1195 gfs2_glock_dq(gh);
1196 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1197 gh);
1198 error = gfs2_glock_nq(gh);
1199 if (error)
1200 return error;
1201
1202 /* Verify that atime hasn't been updated while we were
1203 trying to get the exclusive lock. */
1204
1205 curtime = get_seconds();
1206 if (curtime - ip->i_di.di_atime >= quantum) {
1207 struct buffer_head *dibh;
1208 struct gfs2_dinode *di;
1209
1210 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1211 if (error == -EROFS)
1212 return 0;
1213 if (error)
1214 goto fail;
1215
1216 error = gfs2_meta_inode_buffer(ip, &dibh);
1217 if (error)
1218 goto fail_end_trans;
1219
1220 ip->i_di.di_atime = curtime;
1221
1222 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1223 di = (struct gfs2_dinode *)dibh->b_data;
1224 di->di_atime = cpu_to_be64(ip->i_di.di_atime);
1225 brelse(dibh);
1226
1227 gfs2_trans_end(sdp);
1228 }
1229
1230 /* If someone else has asked for the glock,
1231 unlock and let them have it. Then reacquire
1232 in the original state. */
1233 if (gfs2_glock_is_blocking(gl)) {
1234 gfs2_glock_dq(gh);
1235 gfs2_holder_reinit(state, flags, gh);
1236 return gfs2_glock_nq(gh);
1237 }
1238 }
1239
1240 return 0;
1241
1242fail_end_trans:
1243 gfs2_trans_end(sdp);
1244fail:
1245 gfs2_glock_dq(gh);
1246 return error;
1247}
1248
1249/**
1250 * glock_compare_atime - Compare two struct gfs2_glock structures for sort
1251 * @arg_a: the first structure
1252 * @arg_b: the second structure
1253 *
1254 * Returns: 1 if A > B
1255 * -1 if A < B
1256 * 0 if A == B
1257 */
1258
1259static int glock_compare_atime(const void *arg_a, const void *arg_b)
1260{
1261 const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
1262 const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
1263 const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1264 const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1265
1266 if (a->ln_number > b->ln_number)
1267 return 1;
1268 if (a->ln_number < b->ln_number)
1269 return -1;
1270 if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
1271 return 1;
1272 if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME))
1273 return 1;
1274
1275 return 0;
1276}
1277
1278/**
1279 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1280 * atime update
1281 * @num_gh: the number of structures
1282 * @ghs: an array of struct gfs2_holder structures
1283 *
1284 * Returns: 0 on success (all glocks acquired),
1285 * errno on failure (no glocks acquired)
1286 */
1287
1288int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1289{
1290 struct gfs2_holder **p;
1291 unsigned int x;
1292 int error = 0;
1293
1294 if (!num_gh)
1295 return 0;
1296
1297 if (num_gh == 1) {
1298 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1299 if (ghs->gh_flags & GL_ATIME)
1300 error = gfs2_glock_nq_atime(ghs);
1301 else
1302 error = gfs2_glock_nq(ghs);
1303 return error;
1304 }
1305
1306 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1307 if (!p)
1308 return -ENOMEM;
1309
1310 for (x = 0; x < num_gh; x++)
1311 p[x] = &ghs[x];
1312
1313 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime, NULL);
1314
1315 for (x = 0; x < num_gh; x++) {
1316 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1317
1318 if (p[x]->gh_flags & GL_ATIME)
1319 error = gfs2_glock_nq_atime(p[x]);
1320 else
1321 error = gfs2_glock_nq(p[x]);
1322
1323 if (error) {
1324 while (x--)
1325 gfs2_glock_dq(p[x]);
1326 break;
1327 }
1328 }
1329
1330 kfree(p);
1331 return error;
1332}
1333
1334
1335static int __gfs2_setattr_simple(struct gfs2_inode *ip,
1336 struct iattr *attr)
1337{
1338 struct buffer_head *dibh;
1339 int error;
1340
1341 error = gfs2_meta_inode_buffer(ip, &dibh);
1342 if (!error) {
1343 error = inode_setattr(&ip->i_inode, attr);
1344 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
1345 gfs2_inode_attr_out(ip);
1346
1347 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1348 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1349 brelse(dibh);
1350 }
1351 return error;
1352}
1353
1354/**
1355 * gfs2_setattr_simple - Change attributes on an inode
1356 * @ip: The GFS2 inode
1357 * @attr: The attributes to change
1358 *
1359 * Called with a reference on the vnode.
1360 *
1361 * Returns: errno
1362 */
1363
1364int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1365{
1366 int error;
1367
1368 if (current->journal_info)
1369 return __gfs2_setattr_simple(ip, attr);
1370
1371 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
1372 if (error)
1373 return error;
1374
1375 error = __gfs2_setattr_simple(ip, attr);
1376 gfs2_trans_end(GFS2_SB(&ip->i_inode));
1377 return error;
1378}
1379
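A minimal sketch of the gfs2_createi() calling convention documented above. example_create() and its unwind ordering are assumptions for illustration, not code from this patch; on success the transaction, the in-place reservation, the quota locks and both glocks are still held, and the caller releases them.

static struct inode *example_create(struct gfs2_inode *dip,
				    const struct qstr *name, unsigned int mode)
{
	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
	struct gfs2_holder ghs[2];	/* [0] directory, [1] new inode */
	struct inode *inode;

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	inode = gfs2_createi(ghs, name, mode);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return inode;
	}

	/* Assumed unwind order: end the transaction, release the block
	   reservation and quota locks, then drop both glocks */
	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);
	gfs2_glock_dq_uninit(ghs);
	gfs2_glock_dq_uninit(ghs + 1);
	return inode;
}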
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..f5d861760579
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
31struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
32
33int gfs2_inode_refresh(struct gfs2_inode *ip);
34
35int gfs2_dinode_dealloc(struct gfs2_inode *inode);
36int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
37struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
38 int is_root, struct nameidata *nd);
39struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
40 unsigned int mode);
41int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
42 struct gfs2_inode *ip);
43int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
44 struct gfs2_inode *ip);
45int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
46int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
47
48int gfs2_glock_nq_atime(struct gfs2_holder *gh);
49int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
50
51int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
52
53struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
54
55#endif /* __INODE_DOT_H__ */
56
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..effe4a337c1d
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,217 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "lm.h"
23#include "super.h"
24#include "util.h"
25
26/**
27 * gfs2_lm_mount - mount a locking protocol
28 * @sdp: the filesystem
29 * @args: mount arguments (held in @sdp->sd_args)
30 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
31 *
32 * Returns: errno
33 */
34
35int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
36{
37 char *proto = sdp->sd_proto_name;
38 char *table = sdp->sd_table_name;
39 int flags = 0;
40 int error;
41
42 if (sdp->sd_args.ar_spectator)
43 flags |= LM_MFLAG_SPECTATOR;
44
45 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
46
47 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
48 gfs2_glock_cb, sdp,
49 GFS2_MIN_LVB_SIZE, flags,
50 &sdp->sd_lockstruct, &sdp->sd_kobj);
51 if (error) {
52 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
53 proto, table, sdp->sd_args.ar_hostdata);
54 goto out;
55 }
56
57 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
58 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
59 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
60 GFS2_MIN_LVB_SIZE)) {
61 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
62 goto out;
63 }
64
65 if (sdp->sd_args.ar_spectator)
66 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
67 else
68 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
69 sdp->sd_lockstruct.ls_jid);
70
71 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
72
73 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
74 !sdp->sd_args.ar_ignore_local_fs) {
75 sdp->sd_args.ar_localflocks = 1;
76 sdp->sd_args.ar_localcaching = 1;
77 }
78
79out:
80 return error;
81}
82
83void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
84{
85 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
86 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
87 sdp->sd_lockstruct.ls_lockspace);
88}
89
90void gfs2_lm_unmount(struct gfs2_sbd *sdp)
91{
92 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
93 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
94}
95
96int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
97{
98 va_list args;
99
100 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
101 return 0;
102
103 va_start(args, fmt);
104 vprintk(fmt, args);
105 va_end(args);
106
107 fs_err(sdp, "about to withdraw from the cluster\n");
108 BUG_ON(sdp->sd_args.ar_debug);
109
110
111 fs_err(sdp, "waiting for outstanding I/O\n");
112
113 /* FIXME: suspend dm device so outstanding bios complete
114 and all further I/O requests fail */
115
116 fs_err(sdp, "telling LM to withdraw\n");
117 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
118 fs_err(sdp, "withdrawn\n");
119 dump_stack();
120
121 return -1;
122}
123
124int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
125 void **lockp)
126{
127 int error = -EIO;
128 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
129 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
130 sdp->sd_lockstruct.ls_lockspace, name, lockp);
131 return error;
132}
133
134void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
135{
136 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
137 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
138}
139
140unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
141 unsigned int cur_state, unsigned int req_state,
142 unsigned int flags)
143{
144 int ret = 0;
145 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
146 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
147 req_state, flags);
148 return ret;
149}
150
151unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
152 unsigned int cur_state)
153{
154 int ret = 0;
155 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
156 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
157 return ret;
158}
159
160void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
161{
162 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
163 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
164}
165
166int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
167{
168 int error = -EIO;
169 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
170 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
171 return error;
172}
173
174void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
175{
176 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
177 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
178}
179
180int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
181 struct file *file, struct file_lock *fl)
182{
183 int error = -EIO;
184 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
185 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
186 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
187 return error;
188}
189
190int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error = -EIO;
194 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
195 error = sdp->sd_lockstruct.ls_ops->lm_plock(
196 sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
197 return error;
198}
199
200int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
201 struct file *file, struct file_lock *fl)
202{
203 int error = -EIO;
204 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
205 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
206 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
207 return error;
208}
209
210void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
211 unsigned int message)
212{
213 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
214 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
215 sdp->sd_lockstruct.ls_lockspace, jid, message);
216}
217
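
Every wrapper in lm.c above follows one guard: once SDF_SHUTDOWN is set by gfs2_lm_withdraw(), no further calls reach the lock module, and lock operations either become no-ops or fail with -EIO. A hedged userspace sketch of that guard (the test_bit/set_bit stand-ins are simplified, not the kernel bitops):

    #include <stdio.h>

    #define SDF_SHUTDOWN 0

    static unsigned long sd_flags;

    static int test_bit(int nr, unsigned long *addr) { return (*addr >> nr) & 1; }
    static void set_bit(int nr, unsigned long *addr) { *addr |= 1UL << nr; }

    static int backend_get_lock(void) { puts("lock module called"); return 0; }

    /* Mirror of the gfs2_lm_* guard: refuse with -EIO after shutdown. */
    static int lm_get_lock(void)
    {
            int error = -5;                 /* -5 is -EIO on Linux */
            if (!test_bit(SDF_SHUTDOWN, &sd_flags))
                    error = backend_get_lock();
            return error;
    }

    int main(void)
    {
            printf("before withdraw: %d\n", lm_get_lock());  /* 0 */
            set_bit(SDF_SHUTDOWN, &sd_flags);                /* withdraw */
            printf("after withdraw:  %d\n", lm_get_lock());  /* -5 */
            return 0;
    }
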
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..21cdc30ee08c
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13struct gfs2_sbd;
14
15#define GFS2_MIN_LVB_SIZE 32
16
17int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
18void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
19void gfs2_lm_unmount(struct gfs2_sbd *sdp);
20int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
21 __attribute__ ((format(printf, 2, 3)));
22int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
23 void **lockp);
24void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
25unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
26 unsigned int cur_state, unsigned int req_state,
27 unsigned int flags);
28unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
29 unsigned int cur_state);
30void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
31int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
32void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
33int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
34 struct file *file, struct file_lock *fl);
35int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
36 struct file *file, int cmd, struct file_lock *fl);
37int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
38 struct file *file, struct file_lock *fl);
39void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
40 unsigned int message);
41
42#endif /* __LM_DOT_H__ */
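
The format(printf, 2, 3) attribute on gfs2_lm_withdraw() above makes GCC check the format string against the variadic arguments at every call site. A minimal standalone demonstration (compile with -Wformat to see the warning on the commented-out bad call):

    #include <stdarg.h>
    #include <stdio.h>

    /* Argument 2 is the format string; argument checking starts at 3. */
    static int my_log(void *ctx, const char *fmt, ...)
            __attribute__((format(printf, 2, 3)));

    static int my_log(void *ctx, const char *fmt, ...)
    {
            va_list args;
            int n;

            (void)ctx;
            va_start(args, fmt);
            n = vprintf(fmt, args);
            va_end(args);
            return n;
    }

    int main(void)
    {
            my_log(NULL, "jid %u\n", 7U);        /* ok */
            /* my_log(NULL, "jid %u\n", "x");       would warn: %u vs char * */
            return 0;
    }
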
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..663fee728783
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19#include <linux/lm_interface.h>
20
21struct lmh_wrapper {
22 struct list_head lw_list;
23 const struct lm_lockops *lw_ops;
24};
25
26/* List of registered low-level locking protocols. A file system selects one
27 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
28
29static LIST_HEAD(lmh_list);
30static DEFINE_MUTEX(lmh_lock);
31
32/**
33 * gfs2_register_lockproto - Register a low-level locking protocol
34 * @proto: the protocol definition
35 *
36 * Returns: 0 on success, -EXXX on failure
37 */
38
39int gfs2_register_lockproto(const struct lm_lockops *proto)
40{
41 struct lmh_wrapper *lw;
42
43 mutex_lock(&lmh_lock);
44
45 list_for_each_entry(lw, &lmh_list, lw_list) {
46 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
47 mutex_unlock(&lmh_lock);
48 printk(KERN_INFO "GFS2: protocol %s already exists\n",
49 proto->lm_proto_name);
50 return -EEXIST;
51 }
52 }
53
54 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
55 if (!lw) {
56 mutex_unlock(&lmh_lock);
57 return -ENOMEM;
58 }
59
60 lw->lw_ops = proto;
61 list_add(&lw->lw_list, &lmh_list);
62
63 mutex_unlock(&lmh_lock);
64
65 return 0;
66}
67
68/**
69 * gfs2_unregister_lockproto - Unregister a low-level locking protocol
70 * @proto: the protocol definition
71 *
72 */
73
74void gfs2_unregister_lockproto(const struct lm_lockops *proto)
75{
76 struct lmh_wrapper *lw;
77
78 mutex_lock(&lmh_lock);
79
80 list_for_each_entry(lw, &lmh_list, lw_list) {
81 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
82 list_del(&lw->lw_list);
83 mutex_unlock(&lmh_lock);
84 kfree(lw);
85 return;
86 }
87 }
88
89 mutex_unlock(&lmh_lock);
90
91 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
92 proto->lm_proto_name);
93}
94
95/**
96 * gfs2_mount_lockproto - Mount a lock protocol
97 * @proto_name: the name of the protocol
98 * @table_name: the name of the lock space
99 * @host_data: data specific to this host
100 * @cb: the callback to the code using the lock module
101 * @cb_data: the GFS2 superblock, passed back through the callback
102 * @min_lvb_size: the minimum LVB size that the caller can deal with
103 * @flags: LM_MFLAG_*
104 * @lockstruct: a structure returned describing the mount
105 *
106 * Returns: 0 on success, -EXXX on failure
107 */
108
109int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
110 lm_callback_t cb, void *cb_data,
111 unsigned int min_lvb_size, int flags,
112 struct lm_lockstruct *lockstruct,
113 struct kobject *fskobj)
114{
115 struct lmh_wrapper *lw = NULL;
116 int try = 0;
117 int error, found;
118
119retry:
120 mutex_lock(&lmh_lock);
121
122 found = 0;
123 list_for_each_entry(lw, &lmh_list, lw_list) {
124 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
125 found = 1;
126 break;
127 }
128 }
129
130 if (!found) {
131 if (!try && capable(CAP_SYS_MODULE)) {
132 try = 1;
133 mutex_unlock(&lmh_lock);
134 request_module(proto_name);
135 goto retry;
136 }
137 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
138 error = -ENOENT;
139 goto out;
140 }
141
142 if (!try_module_get(lw->lw_ops->lm_owner)) {
143 try = 0;
144 mutex_unlock(&lmh_lock);
145 msleep(1000);
146 goto retry;
147 }
148
149 error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
150 min_lvb_size, flags, lockstruct, fskobj);
151 if (error)
152 module_put(lw->lw_ops->lm_owner);
153out:
154 mutex_unlock(&lmh_lock);
155 return error;
156}
157
158void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
159{
160 mutex_lock(&lmh_lock);
161 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
162 if (lockstruct->ls_ops->lm_owner)
163 module_put(lockstruct->ls_ops->lm_owner);
164 mutex_unlock(&lmh_lock);
165}
166
167/**
168 * gfs2_withdraw_lockproto - abnormally unmount a lock module
169 * @lockstruct: the lockstruct passed into mount
170 *
171 */
172
173void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
174{
175 mutex_lock(&lmh_lock);
176 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
177 if (lockstruct->ls_ops->lm_owner)
178 module_put(lockstruct->ls_ops->lm_owner);
179 mutex_unlock(&lmh_lock);
180}
181
182EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
183EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
184
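
gfs2_mount_lockproto() above encodes two retry rules worth noting: an unknown protocol name triggers one request_module() attempt before failing with -ENOENT, and a module that is mid-unload (try_module_get() failing) is retried after a delay. A simplified userspace sketch of the first rule, with load_module() standing in for request_module() (illustrative names, not the kernel API):

    #include <stdio.h>
    #include <string.h>

    static const char *registry[4];
    static int nregistered;

    static int find(const char *name)
    {
            for (int i = 0; i < nregistered; i++)
                    if (!strcmp(registry[i], name))
                            return 1;
            return 0;
    }

    /* Stand-in for request_module(): only knows how to "load" lock_dlm. */
    static void load_module(const char *name)
    {
            if (!strcmp(name, "lock_dlm"))
                    registry[nregistered++] = "lock_dlm";
    }

    static int mount_proto(const char *name)
    {
            int try = 0;
    retry:
            if (!find(name)) {
                    if (!try) {             /* one module-load attempt only */
                            try = 1;
                            load_module(name);
                            goto retry;
                    }
                    return -2;              /* -ENOENT */
            }
            printf("mounted %s\n", name);
            return 0;
    }

    int main(void)
    {
            mount_proto("lock_dlm");        /* found after one load attempt */
            return mount_proto("lock_foo"); /* still unknown: -ENOENT */
    }
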
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..89b93b6b45cf
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..b167addf9fd1
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,524 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
26static inline void gdlm_ast(void *astarg)
27{
28 queue_complete(astarg);
29}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) {
45 list_add_tail(&lp->blist, &ls->blocking);
46 lp->bast_mode = mode;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait);
51}
52
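
gdlm_bast() above coalesces blocking callbacks: a lock sits on ls->blocking at most once, and when a second callback arrives first, its mode is only recorded if it is stricter (numerically higher in DLM terms) than the one already pending. A small standalone model of that rule:

    #include <stdio.h>

    static int bast_mode;   /* 0 means "not queued", as in struct gdlm_lock */
    static int queued;

    /* Record a blocking request, keeping only the highest pending mode. */
    static void bast(int mode)
    {
            if (!bast_mode) {
                    queued = 1;             /* list_add_tail(&lp->blist, ...) */
                    bast_mode = mode;
            } else if (bast_mode < mode) {
                    bast_mode = mode;       /* upgrade in place, no re-queue */
            }
    }

    int main(void)
    {
            bast(3);        /* roughly DLM_LOCK_PR: queued, mode 3 */
            bast(2);        /* weaker request: ignored */
            bast(5);        /* stricter: pending mode upgraded to 5 */
            printf("queued=%d mode=%d\n", queued, bast_mode); /* queued=1 mode=5 */
            return 0;
    }
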
53void gdlm_queue_delayed(struct gdlm_lock *lp)
54{
55 struct gdlm_ls *ls = lp->ls;
56
57 spin_lock(&ls->async_lock);
58 list_add_tail(&lp->delay_list, &ls->delayed);
59 spin_unlock(&ls->async_lock);
60}
61
62/* convert gfs lock-state to dlm lock-mode */
63
64static s16 make_mode(s16 lmstate)
65{
66 switch (lmstate) {
67 case LM_ST_UNLOCKED:
68 return DLM_LOCK_NL;
69 case LM_ST_EXCLUSIVE:
70 return DLM_LOCK_EX;
71 case LM_ST_DEFERRED:
72 return DLM_LOCK_CW;
73 case LM_ST_SHARED:
74 return DLM_LOCK_PR;
75 }
76 gdlm_assert(0, "unknown LM state %d", lmstate);
77 return -1;
78}
79
80/* convert dlm lock-mode to gfs lock-state */
81
82s16 gdlm_make_lmstate(s16 dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98
99/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
101
102static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
103{
104 s16 cur = make_mode(cur_state);
105 if (lp->cur != DLM_LOCK_IV)
106 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
107}
108
109static inline unsigned int make_flags(struct gdlm_lock *lp,
110 unsigned int gfs_flags,
111 s16 cur, s16 req)
112{
113 unsigned int lkf = 0;
114
115 if (gfs_flags & LM_FLAG_TRY)
116 lkf |= DLM_LKF_NOQUEUE;
117
118 if (gfs_flags & LM_FLAG_TRY_1CB) {
119 lkf |= DLM_LKF_NOQUEUE;
120 lkf |= DLM_LKF_NOQUEUEBAST;
121 }
122
123 if (gfs_flags & LM_FLAG_PRIORITY) {
124 lkf |= DLM_LKF_NOORDER;
125 lkf |= DLM_LKF_HEADQUE;
126 }
127
128 if (gfs_flags & LM_FLAG_ANY) {
129 if (req == DLM_LOCK_PR)
130 lkf |= DLM_LKF_ALTCW;
131 else if (req == DLM_LOCK_CW)
132 lkf |= DLM_LKF_ALTPR;
133 }
134
135 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK;
144 }
145
146 if (lp->lvb)
147 lkf |= DLM_LKF_VALBLK;
148
149 return lkf;
150}
151
152/* make_strname - convert GFS lock numbers to a string */
153
154static inline void make_strname(struct lm_lockname *lockname,
155 struct gdlm_strname *str)
156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
158 (unsigned long long)lockname->ln_number);
159 str->namelen = GDLM_STRNAME_BYTES;
160}
161
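
make_strname() above turns the 12-byte GFS lock name (32-bit type plus 64-bit number) into a fixed-width 24-character hex string, which is what the DLM sees as the resource name. A runnable illustration of the exact format (note the space padding produced by %8x/%16llx):

    #include <stdio.h>

    int main(void)
    {
            unsigned int ln_type = 2;               /* e.g. an inode lock type */
            unsigned long long ln_number = 0x1017;
            char name[25];                          /* 24 chars + NUL */

            sprintf(name, "%8x%16llx", ln_type, ln_number);
            printf("[%s] len=%zu\n", name, sizeof(name) - 1);
            /* prints: [       2            1017] len=24 */
            return 0;
    }
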
162static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
163 struct gdlm_lock **lpp)
164{
165 struct gdlm_lock *lp;
166
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
168 if (!lp)
169 return -ENOMEM;
170
171 lp->lockname = *name;
172 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL;
175 lp->hold_null = NULL;
176 init_completion(&lp->ast_wait);
177 INIT_LIST_HEAD(&lp->clist);
178 INIT_LIST_HEAD(&lp->blist);
179 INIT_LIST_HEAD(&lp->delay_list);
180
181 spin_lock(&ls->async_lock);
182 list_add(&lp->all_list, &ls->all_locks);
183 ls->all_locks_count++;
184 spin_unlock(&ls->async_lock);
185
186 *lpp = lp;
187 return 0;
188}
189
190void gdlm_delete_lp(struct gdlm_lock *lp)
191{
192 struct gdlm_ls *ls = lp->ls;
193
194 spin_lock(&ls->async_lock);
195 if (!list_empty(&lp->clist))
196 list_del_init(&lp->clist);
197 if (!list_empty(&lp->blist))
198 list_del_init(&lp->blist);
199 if (!list_empty(&lp->delay_list))
200 list_del_init(&lp->delay_list);
201 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
202 (unsigned long long)lp->lockname.ln_number);
203 list_del_init(&lp->all_list);
204 ls->all_locks_count--;
205 spin_unlock(&ls->async_lock);
206
207 kfree(lp);
208}
209
210int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
211 void **lockp)
212{
213 struct gdlm_lock *lp;
214 int error;
215
216 error = gdlm_create_lp(lockspace, name, &lp);
217
218 *lockp = lp;
219 return error;
220}
221
222void gdlm_put_lock(void *lock)
223{
224 gdlm_delete_lp(lock);
225}
226
227unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{
229 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1;
232
233 /*
234 * When recovery is in progress, delay lock requests and submit them
235 * once recovery is done. Requests for recovery (NOEXP) and unlocks
236 * can pass.
237 */
238
239 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
240 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
241 gdlm_queue_delayed(lp);
242 return LM_OUT_ASYNC;
243 }
244
245 /*
246 * Submit the actual lock request.
247 */
248
249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0;
251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags);
255
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
257 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
258 lp->cur, lp->req, lp->lkf);
259
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp,
262 bast ? gdlm_bast : NULL);
263
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN;
266 queue_complete(lp);
267 error = 0;
268 }
269
270 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags);
275 return LM_OUT_ERROR;
276 }
277 return LM_OUT_ASYNC;
278}
279
280static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
281{
282 struct gdlm_ls *ls = lp->ls;
283 unsigned int lkf = 0;
284 int error;
285
286 set_bit(LFL_DLM_UNLOCK, &lp->flags);
287 set_bit(LFL_ACTIVE, &lp->flags);
288
289 if (lp->lvb)
290 lkf = DLM_LKF_VALBLK;
291
292 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
293 (unsigned long long)lp->lockname.ln_number,
294 lp->lksb.sb_lkid, lp->cur, lkf);
295
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297
298 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags);
303 return LM_OUT_ERROR;
304 }
305 return LM_OUT_ASYNC;
306}
307
308unsigned int gdlm_lock(void *lock, unsigned int cur_state,
309 unsigned int req_state, unsigned int flags)
310{
311 struct gdlm_lock *lp = lock;
312
313 clear_bit(LFL_DLM_CANCEL, &lp->flags);
314 if (flags & LM_FLAG_NOEXP)
315 set_bit(LFL_NOBLOCK, &lp->flags);
316
317 check_cur_state(lp, cur_state);
318 lp->req = make_mode(req_state);
319 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
320
321 return gdlm_do_lock(lp);
322}
323
324unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
325{
326 struct gdlm_lock *lp = lock;
327
328 clear_bit(LFL_DLM_CANCEL, &lp->flags);
329 if (lp->cur == DLM_LOCK_IV)
330 return 0;
331 return gdlm_do_unlock(lp);
332}
333
334void gdlm_cancel(void *lock)
335{
336 struct gdlm_lock *lp = lock;
337 struct gdlm_ls *ls = lp->ls;
338 int error, delay_list = 0;
339
340 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
341 return;
342
343 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
344 (unsigned long long)lp->lockname.ln_number, lp->flags);
345
346 spin_lock(&ls->async_lock);
347 if (!list_empty(&lp->delay_list)) {
348 list_del_init(&lp->delay_list);
349 delay_list = 1;
350 }
351 spin_unlock(&ls->async_lock);
352
353 if (delay_list) {
354 set_bit(LFL_CANCEL, &lp->flags);
355 set_bit(LFL_ACTIVE, &lp->flags);
356 queue_complete(lp);
357 return;
358 }
359
360 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
361 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
362 log_info("gdlm_cancel skip %x,%llx flags %lx",
363 lp->lockname.ln_type,
364 (unsigned long long)lp->lockname.ln_number, lp->flags);
365 return;
366 }
367
368 /* the lock is blocked in the dlm */
369
370 set_bit(LFL_DLM_CANCEL, &lp->flags);
371 set_bit(LFL_ACTIVE, &lp->flags);
372
373 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
374 NULL, lp);
375
376 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
377 lp->lockname.ln_type,
378 (unsigned long long)lp->lockname.ln_number, lp->flags);
379
380 if (error == -EBUSY)
381 clear_bit(LFL_DLM_CANCEL, &lp->flags);
382}
383
384static int gdlm_add_lvb(struct gdlm_lock *lp)
385{
386 char *lvb;
387
388 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
389 if (!lvb)
390 return -ENOMEM;
391
392 lp->lksb.sb_lvbptr = lvb;
393 lp->lvb = lvb;
394 return 0;
395}
396
397static void gdlm_del_lvb(struct gdlm_lock *lp)
398{
399 kfree(lp->lvb);
400 lp->lvb = NULL;
401 lp->lksb.sb_lvbptr = NULL;
402}
403
404/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
405 the completion) because gfs won't call hold_lvb() during a callback (from
406 the context of a lock_dlm thread). */
407
408static int hold_null_lock(struct gdlm_lock *lp)
409{
410 struct gdlm_lock *lpn = NULL;
411 int error;
412
413 if (lp->hold_null) {
414 printk(KERN_INFO "lock_dlm: lvb already held\n");
415 return 0;
416 }
417
418 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
419 if (error)
420 goto out;
421
422 lpn->lksb.sb_lvbptr = junk_lvb;
423 lpn->lvb = junk_lvb;
424
425 lpn->req = DLM_LOCK_NL;
426 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
427 set_bit(LFL_NOBAST, &lpn->flags);
428 set_bit(LFL_INLOCK, &lpn->flags);
429
430 init_completion(&lpn->ast_wait);
431 gdlm_do_lock(lpn);
432 wait_for_completion(&lpn->ast_wait);
433 error = lpn->lksb.sb_status;
434 if (error) {
435 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
436 error);
437 gdlm_delete_lp(lpn);
438 lpn = NULL;
439 }
440out:
441 lp->hold_null = lpn;
442 return error;
443}
444
445/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
446 the completion) because gfs may call unhold_lvb() during a callback (from
447 the context of a lock_dlm thread) which could cause a deadlock since the
448 other lock_dlm thread could be engaged in recovery. */
449
450static void unhold_null_lock(struct gdlm_lock *lp)
451{
452 struct gdlm_lock *lpn = lp->hold_null;
453
454 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
455 (unsigned long long)lp->lockname.ln_number);
456 lpn->lksb.sb_lvbptr = NULL;
457 lpn->lvb = NULL;
458 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
459 gdlm_do_unlock(lpn);
460 lp->hold_null = NULL;
461}
462
463/* Acquire a NL lock because gfs requires the value block to remain
464 intact on the resource while the lvb is "held" even if it's holding no locks
465 on the resource. */
466
467int gdlm_hold_lvb(void *lock, char **lvbp)
468{
469 struct gdlm_lock *lp = lock;
470 int error;
471
472 error = gdlm_add_lvb(lp);
473 if (error)
474 return error;
475
476 *lvbp = lp->lvb;
477
478 error = hold_null_lock(lp);
479 if (error)
480 gdlm_del_lvb(lp);
481
482 return error;
483}
484
485void gdlm_unhold_lvb(void *lock, char *lvb)
486{
487 struct gdlm_lock *lp = lock;
488
489 unhold_null_lock(lp);
490 gdlm_del_lvb(lp);
491}
492
493void gdlm_submit_delayed(struct gdlm_ls *ls)
494{
495 struct gdlm_lock *lp, *safe;
496
497 spin_lock(&ls->async_lock);
498 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
499 list_del_init(&lp->delay_list);
500 list_add_tail(&lp->delay_list, &ls->submit);
501 }
502 spin_unlock(&ls->async_lock);
503 wake_up(&ls->thread_wait);
504}
505
506int gdlm_release_all_locks(struct gdlm_ls *ls)
507{
508 struct gdlm_lock *lp, *safe;
509 int count = 0;
510
511 spin_lock(&ls->async_lock);
512 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
513 list_del_init(&lp->all_list);
514
515 if (lp->lvb && lp->lvb != junk_lvb)
516 kfree(lp->lvb);
517 kfree(lp);
518 count++;
519 }
520 spin_unlock(&ls->async_lock);
521
522 return count;
523}
524
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..33af707a4d3f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,187 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include <linux/lm_interface.h>
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler). */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 u32 id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 struct gfs2_sbd *sdp;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed;
78 struct list_head submit;
79 struct list_head all_locks;
80 u32 all_locks_count;
81 wait_queue_head_t wait_control;
82 struct task_struct *thread1;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88};
89
90enum {
91 LFL_NOBLOCK = 0,
92 LFL_NOCACHE = 1,
93 LFL_DLM_UNLOCK = 2,
94 LFL_DLM_CANCEL = 3,
95 LFL_SYNC_LVB = 4,
96 LFL_FORCE_PROMOTE = 5,
97 LFL_REREQUEST = 6,
98 LFL_ACTIVE = 7,
99 LFL_INLOCK = 8,
100 LFL_CANCEL = 9,
101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12,
104};
105
106struct gdlm_lock {
107 struct gdlm_ls *ls;
108 struct lm_lockname lockname;
109 char *lvb;
110 struct dlm_lksb lksb;
111
112 s16 cur;
113 s16 req;
114 s16 prev_req;
115 u32 lkf; /* dlm flags DLM_LKF_ */
116 unsigned long flags; /* lock_dlm flags LFL_ */
117
118 int bast_mode; /* protected by async_lock */
119 struct completion ast_wait;
120
121 struct list_head clist; /* complete */
122 struct list_head blist; /* blocking */
123 struct list_head delay_list; /* delayed */
124 struct list_head all_list; /* all locks for the fs */
125 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
126};
127
128#define gdlm_assert(assertion, fmt, args...) \
129do { \
130 if (unlikely(!(assertion))) { \
131 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
132 "lock_dlm: " fmt "\n", \
133 #assertion, ##args); \
134 BUG(); \
135 } \
136} while (0)
137
138#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
139#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
140#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
141#ifdef LOCK_DLM_LOG_DEBUG
142#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
143#else
144#define log_debug(fmt, arg...)
145#endif
146
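
The log_* macros above rely on two GNU C idioms used throughout the kernel: pasting the level string onto the format literal, and ", ## arg", which swallows the comma when the argument list is empty. A standalone sketch of the same construction:

    #include <stdio.h>

    #define log_print(lev, fmt, arg...) \
            printf(lev "lock_dlm: " fmt "\n", ## arg)
    #define log_info(fmt, arg...)  log_print("<info> ",  fmt, ## arg)
    #define log_error(fmt, arg...) log_print("<error> ", fmt, ## arg)

    int main(void)
    {
            log_info("mounted");            /* no varargs: ## drops the comma */
            log_error("bad jid %d", 42);    /* with varargs */
            return 0;
    }
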
147/* sysfs.c */
148
149int gdlm_sysfs_init(void);
150void gdlm_sysfs_exit(void);
151int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
152void gdlm_kobject_release(struct gdlm_ls *);
153
154/* thread.c */
155
156int gdlm_init_threads(struct gdlm_ls *);
157void gdlm_release_threads(struct gdlm_ls *);
158
159/* lock.c */
160
161s16 gdlm_make_lmstate(s16);
162void gdlm_queue_delayed(struct gdlm_lock *);
163void gdlm_submit_delayed(struct gdlm_ls *);
164int gdlm_release_all_locks(struct gdlm_ls *);
165void gdlm_delete_lp(struct gdlm_lock *);
166unsigned int gdlm_do_lock(struct gdlm_lock *);
167
168int gdlm_get_lock(void *, struct lm_lockname *, void **);
169void gdlm_put_lock(void *);
170unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
171unsigned int gdlm_unlock(void *, unsigned int);
172void gdlm_cancel(void *);
173int gdlm_hold_lvb(void *, char **);
174void gdlm_unhold_lvb(void *, char *);
175
176/* plock.c */
177
178int gdlm_plock_init(void);
179void gdlm_plock_exit(void);
180int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
181 struct file_lock *);
182int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
183 struct file_lock *);
184int gdlm_punlock(void *, struct lm_lockname *, struct file *,
185 struct file_lock *);
186#endif /* LOCK_DLM_DOT_H */
187
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..2194b1d5b5ec
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19static int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs2_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs2_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs2_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51static void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs2_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
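
init_lock_dlm() above shows the standard kernel init/unwind ladder: each setup step that fails tears down exactly the steps that already succeeded, in reverse order, and exit_lock_dlm() is the same ladder run in full. A compact userspace sketch of the shape (step names are illustrative):

    #include <stdio.h>

    static int step_a(void) { puts("a up");    return 0; }
    static int step_b(void) { puts("b up");    return 0; }
    static int step_c(void) { puts("c fails"); return -1; }  /* force a failure */
    static void undo_a(void) { puts("a down"); }
    static void undo_b(void) { puts("b down"); }

    static int init_all(void)
    {
            int error;

            error = step_a();
            if (error)
                    return error;

            error = step_b();
            if (error) {
                    undo_a();               /* unwind only what succeeded */
                    return error;
            }

            error = step_c();
            if (error) {
                    undo_b();
                    undo_a();
                    return error;
            }
            return 0;
    }

    int main(void)
    {
            return init_all() ? 1 : 0;  /* prints: a up, b up, c fails, b down, a down */
    }
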
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..1f94dd35a943
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,255 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14const struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->sdp = sdp;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strstr(buf, ":");
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%d", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%d", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%d", nodir);
111
112 } else {
113 log_error("unkonwn option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
121
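
make_args() above parses host_data as colon-separated key=value options (e.g. "jid=0:first=1:id=..."). The strsep()/strchr() combination it uses lifts cleanly into a standalone parser; a minimal runnable version with the same loop structure and simplified error handling:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char data[] = "jid=0:first=1:id=3141";
            char *options = data, *x, *y;

            while ((x = strsep(&options, ":"))) {
                    if (!*x)        /* skip empty fields, as make_args does */
                            continue;
                    y = strchr(x, '=');
                    if (y)
                            *y++ = '\0';
                    printf("option '%s' value '%s'\n", x, y ? y : "(none)");
            }
            return 0;
    }
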
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, void *cb_data,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, cb_data, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = gdlm_kobject_setup(ls, fskobj);
147 if (error)
148 goto out_thread;
149
150 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
151 &ls->dlm_lockspace,
152 nodir ? DLM_LSFL_NODIR : 0,
153 GDLM_LVB_SIZE);
154 if (error) {
155 log_error("dlm_new_lockspace error %d", error);
156 goto out_kobj;
157 }
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167out_kobj:
168 gdlm_kobject_release(ls);
169out_thread:
170 gdlm_release_threads(ls);
171out_free:
172 kfree(ls);
173out:
174 return error;
175}
176
177static void gdlm_unmount(void *lockspace)
178{
179 struct gdlm_ls *ls = lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(void *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = lockspace;
205 ls->recover_jid_done = jid;
206 ls->recover_jid_status = message;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
208}
209
210static void gdlm_others_may_mount(void *lockspace)
211{
212 struct gdlm_ls *ls = lockspace;
213 ls->first_done = 1;
214 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
215}
216
217/* Userspace gets the offline uevent, blocks new gfs locks on
218 other mounters, and lets us know (sets WITHDRAW flag). Then,
219 userspace leaves the mount group while we leave the lockspace. */
220
221static void gdlm_withdraw(void *lockspace)
222{
223 struct gdlm_ls *ls = lockspace;
224
225 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
226
227 wait_event_interruptible(ls->wait_control,
228 test_bit(DFL_WITHDRAW, &ls->flags));
229
230 dlm_release_lockspace(ls->dlm_lockspace, 2);
231 gdlm_release_threads(ls);
232 gdlm_release_all_locks(ls);
233 gdlm_kobject_release(ls);
234}
235
236const struct lm_lockops gdlm_ops = {
237 .lm_proto_name = "lock_dlm",
238 .lm_mount = gdlm_mount,
239 .lm_others_may_mount = gdlm_others_may_mount,
240 .lm_unmount = gdlm_unmount,
241 .lm_withdraw = gdlm_withdraw,
242 .lm_get_lock = gdlm_get_lock,
243 .lm_put_lock = gdlm_put_lock,
244 .lm_lock = gdlm_lock,
245 .lm_unlock = gdlm_unlock,
246 .lm_plock = gdlm_plock,
247 .lm_punlock = gdlm_punlock,
248 .lm_plock_get = gdlm_plock_get,
249 .lm_cancel = gdlm_cancel,
250 .lm_hold_lvb = gdlm_hold_lvb,
251 .lm_unhold_lvb = gdlm_unhold_lvb,
252 .lm_recovery_done = gdlm_recovery_done,
253 .lm_owner = THIS_MODULE,
254};
255
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..7365aec9511b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,301 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License version 2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(void *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = fl->fl_pid;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80 op->info.owner = (__u64)(long) fl->fl_owner;
81
82 send_op(op);
83 wait_event(recv_wq, (op->done != 0));
84
85 spin_lock(&ops_lock);
86 if (!list_empty(&op->list)) {
87 printk(KERN_INFO "plock op on list\n");
88 list_del(&op->list);
89 }
90 spin_unlock(&ops_lock);
91
92 rv = op->info.rv;
93
94 if (!rv) {
95 if (posix_lock_file_wait(file, fl) < 0)
96 log_error("gdlm_plock: vfs lock error %x,%llx",
97 name->ln_type,
98 (unsigned long long)name->ln_number);
99 }
100
101 kfree(op);
102 return rv;
103}
104
105int gdlm_punlock(void *lockspace, struct lm_lockname *name,
106 struct file *file, struct file_lock *fl)
107{
108 struct gdlm_ls *ls = lockspace;
109 struct plock_op *op;
110 int rv;
111
112 op = kzalloc(sizeof(*op), GFP_KERNEL);
113 if (!op)
114 return -ENOMEM;
115
116 if (posix_lock_file_wait(file, fl) < 0)
117 log_error("gdlm_punlock: vfs unlock error %x,%llx",
118 name->ln_type, (unsigned long long)name->ln_number);
119
120 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
121 op->info.pid = fl->fl_pid;
122 op->info.fsid = ls->id;
123 op->info.number = name->ln_number;
124 op->info.start = fl->fl_start;
125 op->info.end = fl->fl_end;
126 op->info.owner = (__u64)(long) fl->fl_owner;
127
128 send_op(op);
129 wait_event(recv_wq, (op->done != 0));
130
131 spin_lock(&ops_lock);
132 if (!list_empty(&op->list)) {
133 printk(KERN_INFO "punlock op on list\n");
134 list_del(&op->list);
135 }
136 spin_unlock(&ops_lock);
137
138 rv = op->info.rv;
139
140 kfree(op);
141 return rv;
142}
143
144int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
145 struct file *file, struct file_lock *fl)
146{
147 struct gdlm_ls *ls = lockspace;
148 struct plock_op *op;
149 int rv;
150
151 op = kzalloc(sizeof(*op), GFP_KERNEL);
152 if (!op)
153 return -ENOMEM;
154
155 op->info.optype = GDLM_PLOCK_OP_GET;
156 op->info.pid = fl->fl_pid;
157 op->info.ex = (fl->fl_type == F_WRLCK);
158 op->info.fsid = ls->id;
159 op->info.number = name->ln_number;
160 op->info.start = fl->fl_start;
161 op->info.end = fl->fl_end;
162
163 send_op(op);
164 wait_event(recv_wq, (op->done != 0));
165
166 spin_lock(&ops_lock);
167 if (!list_empty(&op->list)) {
168 printk(KERN_INFO "plock_get op on list\n");
169 list_del(&op->list);
170 }
171 spin_unlock(&ops_lock);
172
173 rv = op->info.rv;
174
175 if (rv == 0)
176 fl->fl_type = F_UNLCK;
177 else if (rv > 0) {
178 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
179 fl->fl_pid = op->info.pid;
180 fl->fl_start = op->info.start;
181 fl->fl_end = op->info.end;
182 }
183
184 kfree(op);
185 return rv;
186}
187
188/* a read copies out one plock request from the send list */
189static ssize_t dev_read(struct file *file, char __user *u, size_t count,
190 loff_t *ppos)
191{
192 struct gdlm_plock_info info;
193 struct plock_op *op = NULL;
194
195 if (count < sizeof(info))
196 return -EINVAL;
197
198 spin_lock(&ops_lock);
199 if (!list_empty(&send_list)) {
200 op = list_entry(send_list.next, struct plock_op, list);
201 list_move(&op->list, &recv_list);
202 memcpy(&info, &op->info, sizeof(info));
203 }
204 spin_unlock(&ops_lock);
205
206 if (!op)
207 return -EAGAIN;
208
209 if (copy_to_user(u, &info, sizeof(info)))
210 return -EFAULT;
211 return sizeof(info);
212}
213
214/* a write copies in one plock result that should match a plock_op
215 on the recv list */
216static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
217 loff_t *ppos)
218{
219 struct gdlm_plock_info info;
220 struct plock_op *op;
221 int found = 0;
222
223 if (count != sizeof(info))
224 return -EINVAL;
225
226 if (copy_from_user(&info, u, sizeof(info)))
227 return -EFAULT;
228
229 if (check_version(&info))
230 return -EINVAL;
231
232 spin_lock(&ops_lock);
233 list_for_each_entry(op, &recv_list, list) {
234 if (op->info.fsid == info.fsid && op->info.number == info.number &&
235 op->info.owner == info.owner) {
236 list_del_init(&op->list);
237 found = 1;
238 op->done = 1;
239 memcpy(&op->info, &info, sizeof(info));
240 break;
241 }
242 }
243 spin_unlock(&ops_lock);
244
245 if (found)
246 wake_up(&recv_wq);
247 else
248 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
249 (unsigned long long)info.number);
250 return count;
251}
252
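
dev_read() and dev_write() above define a small request/response protocol with a userspace daemon: the kernel queues one struct gdlm_plock_info per operation, the daemon reads it, resolves the lock cluster-wide, fills in rv, and writes the same struct back; fsid, number, and owner form the match key. A hedged sketch of the daemon side, using a simplified mirror of the info struct and a hypothetical device path (the real node name comes from GDLM_PLOCK_MISC_NAME, and the real struct layout is defined in lock_dlm_plock.h):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Simplified mirror of struct gdlm_plock_info; field set follows plock.c,
       not the authoritative kernel header. */
    struct plock_info {
            uint32_t version[3];
            uint8_t  optype;
            uint8_t  ex;
            uint8_t  wait;
            uint32_t pid;
            int32_t  rv;
            uint32_t fsid;
            uint64_t number;
            uint64_t start, end;
            uint64_t owner;
    };

    int main(void)
    {
            /* hypothetical path; depends on how the misc device is exposed */
            int fd = open("/dev/misc/lock_dlm_plock", O_RDWR);
            struct plock_info info;

            if (fd < 0)
                    return 1;
            while (read(fd, &info, sizeof(info)) == sizeof(info)) {
                    /* ...decide cluster-wide whether to grant the lock... */
                    info.rv = 0;                    /* 0 = granted/ok */
                    write(fd, &info, sizeof(info)); /* wakes the waiter */
            }
            close(fd);
            return 0;
    }
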
253static unsigned int dev_poll(struct file *file, poll_table *wait)
254{
255 poll_wait(file, &send_wq, wait);
256
257 spin_lock(&ops_lock);
258 if (!list_empty(&send_list)) {
259 spin_unlock(&ops_lock);
260 return POLLIN | POLLRDNORM;
261 }
262 spin_unlock(&ops_lock);
263 return 0;
264}
265
266static struct file_operations dev_fops = {
267 .read = dev_read,
268 .write = dev_write,
269 .poll = dev_poll,
270 .owner = THIS_MODULE
271};
272
273static struct miscdevice plock_dev_misc = {
274 .minor = MISC_DYNAMIC_MINOR,
275 .name = GDLM_PLOCK_MISC_NAME,
276 .fops = &dev_fops
277};
278
279int gdlm_plock_init(void)
280{
281 int rv;
282
283 spin_lock_init(&ops_lock);
284 INIT_LIST_HEAD(&send_list);
285 INIT_LIST_HEAD(&recv_list);
286 init_waitqueue_head(&send_wq);
287 init_waitqueue_head(&recv_wq);
288
289 rv = misc_register(&plock_dev_misc);
290 if (rv)
291 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
292 rv);
293 return rv;
294}
295
296void gdlm_plock_exit(void)
297{
298 if (misc_deregister(&plock_dev_misc) < 0)
299 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
300}
301
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..29ae06f94944
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,226 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else {
46 ret = -EINVAL;
47 }
48 return ret;
49}
50
51static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
52{
53 ssize_t ret;
54 int val = 0;
55
56 if (test_bit(DFL_WITHDRAW, &ls->flags))
57 val = 1;
58 ret = sprintf(buf, "%d\n", val);
59 return ret;
60}
61
62static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
63{
64 ssize_t ret = len;
65 int val;
66
67 val = simple_strtol(buf, NULL, 0);
68
69 if (val == 1)
70 set_bit(DFL_WITHDRAW, &ls->flags);
71 else
72 ret = -EINVAL;
73 wake_up(&ls->wait_control);
74 return ret;
75}
76
77static ssize_t id_show(struct gdlm_ls *ls, char *buf)
78{
79 return sprintf(buf, "%u\n", ls->id);
80}
81
82static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
83{
84 return sprintf(buf, "%d\n", ls->jid);
85}
86
87static ssize_t first_show(struct gdlm_ls *ls, char *buf)
88{
89 return sprintf(buf, "%d\n", ls->first);
90}
91
92static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
93{
94 return sprintf(buf, "%d\n", ls->first_done);
95}
96
97static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
98{
99 return sprintf(buf, "%d\n", ls->recover_jid);
100}
101
102static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
103{
104 ls->recover_jid = simple_strtol(buf, NULL, 0);
105 ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
106 return len;
107}
108
109static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
110{
111 return sprintf(buf, "%d\n", ls->recover_jid_done);
112}
113
114static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
115{
116 return sprintf(buf, "%d\n", ls->recover_jid_status);
117}
118
119struct gdlm_attr {
120 struct attribute attr;
121 ssize_t (*show)(struct gdlm_ls *, char *);
122 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
123};
124
125#define GDLM_ATTR(_name,_mode,_show,_store) \
126static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
127
128GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
129GDLM_ATTR(block, 0644, block_show, block_store);
130GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
131GDLM_ATTR(id, 0444, id_show, NULL);
132GDLM_ATTR(jid, 0444, jid_show, NULL);
133GDLM_ATTR(first, 0444, first_show, NULL);
134GDLM_ATTR(first_done, 0444, first_done_show, NULL);
135GDLM_ATTR(recover, 0644, recover_show, recover_store);
136GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
137GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
138
139static struct attribute *gdlm_attrs[] = {
140 &gdlm_attr_proto_name.attr,
141 &gdlm_attr_block.attr,
142 &gdlm_attr_withdraw.attr,
143 &gdlm_attr_id.attr,
144 &gdlm_attr_jid.attr,
145 &gdlm_attr_first.attr,
146 &gdlm_attr_first_done.attr,
147 &gdlm_attr_recover.attr,
148 &gdlm_attr_recover_done.attr,
149 &gdlm_attr_recover_status.attr,
150 NULL,
151};
152
153static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
154 char *buf)
155{
156 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
157 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
158 return a->show ? a->show(ls, buf) : 0;
159}
160
161static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
162 const char *buf, size_t len)
163{
164 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
165 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
166 return a->store ? a->store(ls, buf, len) : len;
167}
168
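
gdlm_attr_show() and gdlm_attr_store() above use the standard sysfs dispatch trick: sysfs hands back only the embedded kobject and attribute pointers, and container_of() recovers the enclosing gdlm_ls and gdlm_attr to find the right per-attribute handler. A self-contained userspace demonstration of that recovery step:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct kobject { int dummy; };

    struct gdlm_ls_demo {
            int jid;
            struct kobject kobj;    /* embedded, as in struct gdlm_ls */
    };

    /* sysfs would call us with only the embedded kobject pointer. */
    static void show(struct kobject *kobj)
    {
            struct gdlm_ls_demo *ls =
                    container_of(kobj, struct gdlm_ls_demo, kobj);
            printf("jid = %d\n", ls->jid);
    }

    int main(void)
    {
            struct gdlm_ls_demo ls = { .jid = 3 };
            show(&ls.kobj);         /* prints: jid = 3 */
            return 0;
    }
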
169static struct sysfs_ops gdlm_attr_ops = {
170 .show = gdlm_attr_show,
171 .store = gdlm_attr_store,
172};
173
174static struct kobj_type gdlm_ktype = {
175 .default_attrs = gdlm_attrs,
176 .sysfs_ops = &gdlm_attr_ops,
177};
178
179static struct kset gdlm_kset = {
180 .subsys = &kernel_subsys,
181 .kobj = {.name = "lock_dlm",},
182 .ktype = &gdlm_ktype,
183};
184
185int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
186{
187 int error;
188
189 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
190 if (error) {
191 log_error("can't set kobj name %d", error);
192 return error;
193 }
194
195 ls->kobj.kset = &gdlm_kset;
196 ls->kobj.ktype = &gdlm_ktype;
197 ls->kobj.parent = fskobj;
198
199 error = kobject_register(&ls->kobj);
200 if (error)
201 log_error("can't register kobj %d", error);
202
203 return error;
204}
205
206void gdlm_kobject_release(struct gdlm_ls *ls)
207{
208 kobject_unregister(&ls->kobj);
209}
210
211int gdlm_sysfs_init(void)
212{
213 int error;
214
215 error = kset_register(&gdlm_kset);
216 if (error)
217 printk(KERN_ERR "lock_dlm: cannot register kset %d\n", error);
218
219 return error;
220}
221
222void gdlm_sysfs_exit(void)
223{
224 kset_unregister(&gdlm_kset);
225}
226
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..9cf1f168eaf8
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
42 }
43
44 ls->fscb(ls->sdp, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 s16 prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type,
58 (unsigned long long)lp->lockname.ln_number,
59 lp->flags);
60
61 lp->req = lp->cur;
62 acb.lc_ret |= LM_OUT_CANCELED;
63 if (lp->cur == DLM_LOCK_IV)
64 lp->lksb.sb_lkid = 0;
65 goto out;
66 }
67
68 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
69 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
70 log_info("unlock sb_status %d %x,%llx flags %lx",
71 lp->lksb.sb_status, lp->lockname.ln_type,
72 (unsigned long long)lp->lockname.ln_number,
73 lp->flags);
74 return;
75 }
76
77 lp->cur = DLM_LOCK_IV;
78 lp->req = DLM_LOCK_IV;
79 lp->lksb.sb_lkid = 0;
80
81 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
82 gdlm_delete_lp(lp);
83 return;
84 }
85 goto out;
86 }
87
88 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
89 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
90
91 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
92 if (lp->req == DLM_LOCK_PR)
93 lp->req = DLM_LOCK_CW;
94 else if (lp->req == DLM_LOCK_CW)
95 lp->req = DLM_LOCK_PR;
96 }
97
98 /*
99 * A canceled lock request. The lock was just taken off the delayed
100 * list and was never even submitted to dlm.
101 */
102
103 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
104 log_info("complete internal cancel %x,%llx",
105 lp->lockname.ln_type,
106 (unsigned long long)lp->lockname.ln_number);
107 lp->req = lp->cur;
108 acb.lc_ret |= LM_OUT_CANCELED;
109 goto out;
110 }
111
112 /*
113 * An error occurred.
114 */
115
116 if (lp->lksb.sb_status) {
117 /* a "normal" error */
118 if ((lp->lksb.sb_status == -EAGAIN) &&
119 (lp->lkf & DLM_LKF_NOQUEUE)) {
120 lp->req = lp->cur;
121 if (lp->cur == DLM_LOCK_IV)
122 lp->lksb.sb_lkid = 0;
123 goto out;
124 }
125
126 /* this could only happen with cancels I think */
127 log_info("ast sb_status %d %x,%llx flags %lx",
128 lp->lksb.sb_status, lp->lockname.ln_type,
129 (unsigned long long)lp->lockname.ln_number,
130 lp->flags);
131 return;
132 }
133
134 /*
135 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
136 */
137
138 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
139 complete(&lp->ast_wait);
140 return;
141 }
142
143 /*
144 * A lock has been demoted to NL because it initially completed during
145 * BLOCK_LOCKS. Now it must be requested in the originally requested
146 * mode.
147 */
148
149 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
150 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
151 lp->lockname.ln_type,
152 (unsigned long long)lp->lockname.ln_number);
153 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
154 lp->lockname.ln_type,
155 (unsigned long long)lp->lockname.ln_number);
156
157 lp->cur = DLM_LOCK_NL;
158 lp->req = lp->prev_req;
159 lp->prev_req = DLM_LOCK_IV;
160 lp->lkf &= ~DLM_LKF_CONVDEADLK;
161
162 set_bit(LFL_NOCACHE, &lp->flags);
163
164 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
165 !test_bit(LFL_NOBLOCK, &lp->flags))
166 gdlm_queue_delayed(lp);
167 else
168 queue_submit(lp);
169 return;
170 }
171
172 /*
173 * A request is granted during dlm recovery. It may be granted
174 * because the locks of a failed node were cleared. In that case,
175 * there may be inconsistent data beneath this lock and we must wait
176 * for recovery to complete to use it. When gfs recovery is done this
177 * granted lock will be converted to NL and then reacquired in this
178 * granted state.
179 */
180
181 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
182 !test_bit(LFL_NOBLOCK, &lp->flags) &&
183 lp->req != DLM_LOCK_NL) {
184
185 lp->cur = lp->req;
186 lp->prev_req = lp->req;
187 lp->req = DLM_LOCK_NL;
188 lp->lkf |= DLM_LKF_CONVERT;
189 lp->lkf &= ~DLM_LKF_CONVDEADLK;
190
191 log_debug("rereq %x,%llx id %x %d,%d",
192 lp->lockname.ln_type,
193 (unsigned long long)lp->lockname.ln_number,
194 lp->lksb.sb_lkid, lp->cur, lp->req);
195
196 set_bit(LFL_REREQUEST, &lp->flags);
197 queue_submit(lp);
198 return;
199 }
200
201 /*
202 * DLM demoted the lock to NL before it was granted so GFS must be
203 * told it cannot cache data for this lock.
204 */
205
206 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
207 set_bit(LFL_NOCACHE, &lp->flags);
208
209out:
210 /*
211 * This is an internal lock_dlm lock
212 */
213
214 if (test_bit(LFL_INLOCK, &lp->flags)) {
215 clear_bit(LFL_NOBLOCK, &lp->flags);
216 lp->cur = lp->req;
217 complete(&lp->ast_wait);
218 return;
219 }
220
221 /*
222 * Normal completion of a lock request. Tell GFS it now has the lock.
223 */
224
225 clear_bit(LFL_NOBLOCK, &lp->flags);
226 lp->cur = lp->req;
227
228 acb.lc_name = lp->lockname;
229 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
230
231 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
232 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
233 acb.lc_ret |= LM_OUT_CACHEABLE;
234
235 ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
236}
237
238static inline int no_work(struct gdlm_ls *ls, int blocking)
239{
240 int ret;
241
242 spin_lock(&ls->async_lock);
243 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
244 if (ret && blocking)
245 ret = list_empty(&ls->blocking);
246 spin_unlock(&ls->async_lock);
247
248 return ret;
249}
250
251static inline int check_drop(struct gdlm_ls *ls)
252{
253 if (!ls->drop_locks_count)
254 return 0;
255
256 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
257 ls->drop_time = jiffies;
258 if (ls->all_locks_count >= ls->drop_locks_count)
259 return 1;
260 }
261 return 0;
262}
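
check_drop() leans on time_after(), which stays correct even when the jiffies counter wraps: subtract as unsigned, compare as signed. A standalone sketch of that idiom follows; my_time_after is an illustrative stand-in, not the kernel macro itself.

    /* Wrap-safe "has deadline passed?" test in the style of the kernel's
     * time_after(): subtract as unsigned, compare as signed. */
    #include <assert.h>

    static int my_time_after(unsigned long a, unsigned long b)
    {
        return (long)(b - a) < 0;       /* true iff a is later than b */
    }

    int main(void)
    {
        unsigned long near_wrap = (unsigned long)-10;  /* counter about to wrap */

        assert(my_time_after(near_wrap + 20, near_wrap));   /* crosses the wrap */
        assert(!my_time_after(near_wrap, near_wrap + 20));
        return 0;
    }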
263
264static int gdlm_thread(void *data)
265{
266 struct gdlm_ls *ls = (struct gdlm_ls *) data;
267 struct gdlm_lock *lp = NULL;
268 int blist = 0;
269 uint8_t complete, blocking, submit, drop;
270 DECLARE_WAITQUEUE(wait, current);
271
272 /* Only thread1 is allowed to do blocking callbacks since gfs
273 may wait for a completion callback within a blocking cb. */
274
275 if (current == ls->thread1)
276 blist = 1;
277
278 while (!kthread_should_stop()) {
279 set_current_state(TASK_INTERRUPTIBLE);
280 add_wait_queue(&ls->thread_wait, &wait);
281 if (no_work(ls, blist))
282 schedule();
283 remove_wait_queue(&ls->thread_wait, &wait);
284 set_current_state(TASK_RUNNING);
285
286 complete = blocking = submit = drop = 0;
287
288 spin_lock(&ls->async_lock);
289
290 if (blist && !list_empty(&ls->blocking)) {
291 lp = list_entry(ls->blocking.next, struct gdlm_lock,
292 blist);
293 list_del_init(&lp->blist);
294 blocking = lp->bast_mode;
295 lp->bast_mode = 0;
296 } else if (!list_empty(&ls->complete)) {
297 lp = list_entry(ls->complete.next, struct gdlm_lock,
298 clist);
299 list_del_init(&lp->clist);
300 complete = 1;
301 } else if (!list_empty(&ls->submit)) {
302 lp = list_entry(ls->submit.next, struct gdlm_lock,
303 delay_list);
304 list_del_init(&lp->delay_list);
305 submit = 1;
306 }
307
308 drop = check_drop(ls);
309 spin_unlock(&ls->async_lock);
310
311 if (complete)
312 process_complete(lp);
313
314 else if (blocking)
315 process_blocking(lp, blocking);
316
317 else if (submit)
318 gdlm_do_lock(lp);
319
320 if (drop)
321 ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
322
323 schedule();
324 }
325
326 return 0;
327}
328
329int gdlm_init_threads(struct gdlm_ls *ls)
330{
331 struct task_struct *p;
332 int error;
333
334 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
335 if (IS_ERR(p)) {
336 error = PTR_ERR(p);
337 log_error("can't start lock_dlm1 thread %d", error);
338 return error;
339 }
340 ls->thread1 = p;
341
342 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
343 if (IS_ERR(p)) {
344 error = PTR_ERR(p);
345 log_error("can't start lock_dlm2 thread %d", error);
346 kthread_stop(ls->thread1);
347 return error;
348 }
349 ls->thread2 = p;
350
351 return 0;
352}
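
kthread_run() returns either a valid task pointer or a small negative errno encoded in the pointer value, which is why the code above decodes failures with IS_ERR()/PTR_ERR(). A freestanding model of that convention is sketched below; it is simplified (the real macros live in <linux/err.h>) and the names are illustrative.

    /* Simplified model of the kernel's ERR_PTR/IS_ERR/PTR_ERR convention:
     * errno values are smuggled through the top page of the address space. */
    #include <stdio.h>
    #include <errno.h>

    #define MAX_ERRNO 4095

    static void *err_ptr(long error)    { return (void *)error; }
    static long  is_err(const void *p)  { return (unsigned long)p >= (unsigned long)-MAX_ERRNO; }
    static long  ptr_err(const void *p) { return (long)p; }

    static void *fake_kthread_run(int fail)
    {
        static int task;                 /* stands in for a task_struct */
        return fail ? err_ptr(-ENOMEM) : (void *)&task;
    }

    int main(void)
    {
        void *p = fake_kthread_run(1);

        if (is_err(p))
            printf("can't start thread: %ld\n", ptr_err(p));  /* -12 */
        return 0;
    }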
353
354void gdlm_release_threads(struct gdlm_ls *ls)
355{
356 kthread_stop(ls->thread1);
357 kthread_stop(ls->thread2);
358}
359
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..35e9730bc3a8
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..acfbc941f319
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,246 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/init.h>
13#include <linux/types.h>
14#include <linux/fs.h>
15#include <linux/smp_lock.h>
16#include <linux/lm_interface.h>
18
19struct nolock_lockspace {
20 unsigned int nl_lvb_size;
21};
22
23static const struct lm_lockops nolock_ops;
24
25static int nolock_mount(char *table_name, char *host_data,
26 lm_callback_t cb, void *cb_data,
27 unsigned int min_lvb_size, int flags,
28 struct lm_lockstruct *lockstruct,
29 struct kobject *fskobj)
30{
31 char *c;
32 unsigned int jid;
33 struct nolock_lockspace *nl;
34
35 c = strstr(host_data, "jid=");
36 if (!c)
37 jid = 0;
38 else {
39 c += 4;
40 sscanf(c, "%u", &jid);
41 }
42
43 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
44 if (!nl)
45 return -ENOMEM;
46
47 nl->nl_lvb_size = min_lvb_size;
48
49 lockstruct->ls_jid = jid;
50 lockstruct->ls_first = 1;
51 lockstruct->ls_lvb_size = min_lvb_size;
52 lockstruct->ls_lockspace = nl;
53 lockstruct->ls_ops = &nolock_ops;
54 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
55
56 return 0;
57}
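
The jid extraction in nolock_mount() is plain strstr()+sscanf() with a default of zero. The same parsing, runnable standalone (the sample hostdata strings are invented):

    /* Standalone demo of the "jid=" extraction used by nolock_mount(). */
    #include <stdio.h>
    #include <string.h>

    static unsigned int parse_jid(const char *host_data)
    {
        unsigned int jid = 0;
        const char *c = strstr(host_data, "jid=");

        if (c)
            sscanf(c + 4, "%u", &jid);   /* stays 0 when "jid=" is absent */
        return jid;
    }

    int main(void)
    {
        printf("%u\n", parse_jid("hostdata=foo:jid=3"));  /* prints 3 */
        printf("%u\n", parse_jid("hostdata=foo"));        /* prints 0 */
        return 0;
    }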
58
59static void nolock_others_may_mount(void *lockspace)
60{
61}
62
63static void nolock_unmount(void *lockspace)
64{
65 struct nolock_lockspace *nl = lockspace;
66 kfree(nl);
67}
68
69static void nolock_withdraw(void *lockspace)
70{
71}
72
73/**
74 * nolock_get_lock - get a lm_lock_t given a description of the lock
75 * @lockspace: the lockspace the lock lives in
76 * @name: the name of the lock
77 * @lockp: return the lm_lock_t here
78 *
79 * Returns: 0 on success, -EXXX on failure
80 */
81
82static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
83 void **lockp)
84{
85 *lockp = lockspace;
86 return 0;
87}
88
89/**
90 * nolock_put_lock - get rid of a lock structure
91 * @lock: the lock to throw away
92 *
93 */
94
95static void nolock_put_lock(void *lock)
96{
97}
98
99/**
100 * nolock_lock - acquire a lock
101 * @lock: the lock to manipulate
102 * @cur_state: the current state
103 * @req_state: the requested state
104 * @flags: modifier flags
105 *
106 * Returns: A bitmap of LM_OUT_*
107 */
108
109static unsigned int nolock_lock(void *lock, unsigned int cur_state,
110 unsigned int req_state, unsigned int flags)
111{
112 return req_state | LM_OUT_CACHEABLE;
113}
114
115/**
116 * nolock_unlock - unlock a lock
117 * @lock: the lock to manipulate
118 * @cur_state: the current state
119 *
120 * Returns: 0
121 */
122
123static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
124{
125 return 0;
126}
127
128static void nolock_cancel(void *lock)
129{
130}
131
132/**
133 * nolock_hold_lvb - hold on to a lock value block
134 * @lock: the lock the LVB is associated with
135 * @lvbp: return the lm_lvb_t here
136 *
137 * Returns: 0 on success, -EXXX on failure
138 */
139
140static int nolock_hold_lvb(void *lock, char **lvbp)
141{
142 struct nolock_lockspace *nl = lock;
143 int error = 0;
144
145 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
146 if (!*lvbp)
147 error = -ENOMEM;
148
149 return error;
150}
151
152/**
153 * nolock_unhold_lvb - release a LVB
154 * @lock: the lock the LVB is associated with
155 * @lvb: the lock value block
156 *
157 */
158
159static void nolock_unhold_lvb(void *lock, char *lvb)
160{
161 kfree(lvb);
162}
163
164static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
165 struct file *file, struct file_lock *fl)
166{
167 struct file_lock tmp;
168 int ret;
169
170 ret = posix_test_lock(file, fl, &tmp);
171 fl->fl_type = F_UNLCK;
172 if (ret)
173 memcpy(fl, &tmp, sizeof(struct file_lock));
174
175 return 0;
176}
177
178static int nolock_plock(void *lockspace, struct lm_lockname *name,
179 struct file *file, int cmd, struct file_lock *fl)
180{
181 int error;
182 error = posix_lock_file_wait(file, fl);
183 return error;
184}
185
186static int nolock_punlock(void *lockspace, struct lm_lockname *name,
187 struct file *file, struct file_lock *fl)
188{
189 int error;
190 error = posix_lock_file_wait(file, fl);
191 return error;
192}
193
194static void nolock_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message)
196{
197}
198
199static const struct lm_lockops nolock_ops = {
200 .lm_proto_name = "lock_nolock",
201 .lm_mount = nolock_mount,
202 .lm_others_may_mount = nolock_others_may_mount,
203 .lm_unmount = nolock_unmount,
204 .lm_withdraw = nolock_withdraw,
205 .lm_get_lock = nolock_get_lock,
206 .lm_put_lock = nolock_put_lock,
207 .lm_lock = nolock_lock,
208 .lm_unlock = nolock_unlock,
209 .lm_cancel = nolock_cancel,
210 .lm_hold_lvb = nolock_hold_lvb,
211 .lm_unhold_lvb = nolock_unhold_lvb,
212 .lm_plock_get = nolock_plock_get,
213 .lm_plock = nolock_plock,
214 .lm_punlock = nolock_punlock,
215 .lm_recovery_done = nolock_recovery_done,
216 .lm_owner = THIS_MODULE,
217};
218
219static int __init init_nolock(void)
220{
221 int error;
222
223 error = gfs2_register_lockproto(&nolock_ops);
224 if (error) {
225 printk(KERN_WARNING
226 "lock_nolock: can't register protocol: %d\n", error);
227 return error;
228 }
229
230 printk(KERN_INFO
231 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
232 return 0;
233}
234
235static void __exit exit_nolock(void)
236{
237 gfs2_unregister_lockproto(&nolock_ops);
238}
239
240module_init(init_nolock);
241module_exit(exit_nolock);
242
243MODULE_DESCRIPTION("GFS Nolock Locking Module");
244MODULE_AUTHOR("Red Hat, Inc.");
245MODULE_LICENSE("GPL");
246
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..554fe5bd1b72
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,687 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "util.h"
27#include "dir.h"
28
29#define PULL 1
30
31/**
32 * gfs2_struct2blk - compute the number of log blocks needed for structures
33 * @sdp: the filesystem
34 * @nstruct: the number of structures
35 * @ssize: the size of the structures
36 *
37 * Compute the number of log descriptor blocks needed to hold a certain number
38 * of structures of a certain size.
39 *
40 * Returns: the number of blocks needed (minimum is always 1)
41 */
42
43unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
44 unsigned int ssize)
45{
46 unsigned int blks;
47 unsigned int first, second;
48
49 blks = 1;
50 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;
51
52 if (nstruct > first) {
53 second = (sdp->sd_sb.sb_bsize -
54 sizeof(struct gfs2_meta_header)) / ssize;
55 blks += DIV_ROUND_UP(nstruct - first, second);
56 }
57
58 return blks;
59}
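
A worked instance of the formula: the first block holds fewer entries because it carries the full log descriptor, while continuation blocks only carry a meta header. The sizes below (4096-byte block, 72-byte descriptor, 24-byte header) are inferred from comments elsewhere in this series, so treat them as illustrative.

    /* Worked example of gfs2_struct2blk(): how many log blocks does it
     * take to hold nstruct entries of ssize bytes each? The block and
     * header sizes below are assumptions for illustration. */
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned int bsize = 4096, desc = 72, mh = 24, ssize = 8;
        unsigned int nstruct = 1200;
        unsigned int first = (bsize - desc) / ssize;    /* 503 in block 1 */
        unsigned int second = (bsize - mh) / ssize;     /* 509 thereafter */
        unsigned int blks = 1;

        if (nstruct > first)
            blks += DIV_ROUND_UP(nstruct - first, second);
        printf("%u structs -> %u blocks\n", nstruct, blks);  /* 1200 -> 3 */
        return 0;
    }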
60
61/**
62 * gfs2_ail1_start_one - Start I/O on a part of the AIL
63 * @sdp: the filesystem
64 * @tr: the part of the AIL
65 *
66 */
67
68static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
69{
70 struct gfs2_bufdata *bd, *s;
71 struct buffer_head *bh;
72 int retry;
73
74 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
75
76 do {
77 retry = 0;
78
79 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
80 bd_ail_st_list) {
81 bh = bd->bd_bh;
82
83 gfs2_assert(sdp, bd->bd_ail == ai);
84
85 if (!buffer_busy(bh)) {
86 if (!buffer_uptodate(bh)) {
87 gfs2_log_unlock(sdp);
88 gfs2_io_error_bh(sdp, bh);
89 gfs2_log_lock(sdp);
90 }
91 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
92 continue;
93 }
94
95 if (!buffer_dirty(bh))
96 continue;
97
98 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
99
100 gfs2_log_unlock(sdp);
101 wait_on_buffer(bh);
102 ll_rw_block(WRITE, 1, &bh);
103 gfs2_log_lock(sdp);
104
105 retry = 1;
106 break;
107 }
108 } while (retry);
109}
110
111/**
112 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
113 * @sdp: the filesystem
114 * @ai: the AIL entry
115 *
116 */
117
118static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
119{
120 struct gfs2_bufdata *bd, *s;
121 struct buffer_head *bh;
122
123 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
124 bd_ail_st_list) {
125 bh = bd->bd_bh;
126
127 gfs2_assert(sdp, bd->bd_ail == ai);
128
129 if (buffer_busy(bh)) {
130 if (flags & DIO_ALL)
131 continue;
132 else
133 break;
134 }
135
136 if (!buffer_uptodate(bh))
137 gfs2_io_error_bh(sdp, bh);
138
139 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
140 }
141
142 return list_empty(&ai->ai_ail1_list);
143}
144
145void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
146{
147 struct list_head *head = &sdp->sd_ail1_list;
148 u64 sync_gen;
149 struct list_head *first;
150 struct gfs2_ail *first_ai, *ai, *tmp;
151 int done = 0;
152
153 gfs2_log_lock(sdp);
154 if (list_empty(head)) {
155 gfs2_log_unlock(sdp);
156 return;
157 }
158 sync_gen = sdp->sd_ail_sync_gen++;
159
160 first = head->prev;
161 first_ai = list_entry(first, struct gfs2_ail, ai_list);
162 first_ai->ai_sync_gen = sync_gen;
163 gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
164
165 if (flags & DIO_ALL)
166 first = NULL;
167
168 while(!done) {
169 if (first && (head->prev != first ||
170 gfs2_ail1_empty_one(sdp, first_ai, 0)))
171 break;
172
173 done = 1;
174 list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
175 if (ai->ai_sync_gen >= sync_gen)
176 continue;
177 ai->ai_sync_gen = sync_gen;
178 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
179 done = 0;
180 break;
181 }
182 }
183
184 gfs2_log_unlock(sdp);
185}
186
187int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
188{
189 struct gfs2_ail *ai, *s;
190 int ret;
191
192 gfs2_log_lock(sdp);
193
194 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
195 if (gfs2_ail1_empty_one(sdp, ai, flags))
196 list_move(&ai->ai_list, &sdp->sd_ail2_list);
197 else if (!(flags & DIO_ALL))
198 break;
199 }
200
201 ret = list_empty(&sdp->sd_ail1_list);
202
203 gfs2_log_unlock(sdp);
204
205 return ret;
206}
207
208
209/**
210 * gfs2_ail2_empty_one - Release the buffers of a synced transaction in the AIL
211 * @sdp: the filesystem
212 * @ai: the AIL entry
213 *
214 */
215
216static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
217{
218 struct list_head *head = &ai->ai_ail2_list;
219 struct gfs2_bufdata *bd;
220
221 while (!list_empty(head)) {
222 bd = list_entry(head->prev, struct gfs2_bufdata,
223 bd_ail_st_list);
224 gfs2_assert(sdp, bd->bd_ail == ai);
225 bd->bd_ail = NULL;
226 list_del(&bd->bd_ail_st_list);
227 list_del(&bd->bd_ail_gl_list);
228 atomic_dec(&bd->bd_gl->gl_ail_count);
229 brelse(bd->bd_bh);
230 }
231}
232
233static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
234{
235 struct gfs2_ail *ai, *safe;
236 unsigned int old_tail = sdp->sd_log_tail;
237 int wrap = (new_tail < old_tail);
238 int a, b, rm;
239
240 gfs2_log_lock(sdp);
241
242 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
243 a = (old_tail <= ai->ai_first);
244 b = (ai->ai_first < new_tail);
245 rm = (wrap) ? (a || b) : (a && b);
246 if (!rm)
247 continue;
248
249 gfs2_ail2_empty_one(sdp, ai);
250 list_del(&ai->ai_list);
251 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
252 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
253 kfree(ai);
254 }
255
256 gfs2_log_unlock(sdp);
257}
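
The a/b/wrap test above decides whether ai_first falls in the half-open interval [old_tail, new_tail) on the circular journal; when the interval wraps past the end of the journal the conjunction becomes a disjunction. The same predicate, checkable standalone (the journal positions are invented):

    /* Membership test for a half-open interval [old, new) on a circular
     * journal, mirroring the wrap logic in ail2_empty(). */
    #include <assert.h>

    static int in_range(unsigned int blk, unsigned int old, unsigned int new)
    {
        int wrap = new < old;
        int a = old <= blk;
        int b = blk < new;

        return wrap ? (a || b) : (a && b);
    }

    int main(void)
    {
        /* non-wrapped interval: [10, 20) */
        assert(in_range(15, 10, 20) && !in_range(25, 10, 20));
        /* wrapped interval: [90, 5) on a journal that cycles past its end */
        assert(in_range(95, 90, 5) && in_range(2, 90, 5) && !in_range(50, 90, 5));
        return 0;
    }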
258
259/**
260 * gfs2_log_reserve - Make a log reservation
261 * @sdp: The GFS2 superblock
262 * @blks: The number of blocks to reserve
263 *
264 * Returns: errno
265 */
266
267int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
268{
269 unsigned int try = 0;
270
271 if (gfs2_assert_warn(sdp, blks) ||
272 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
273 return -EINVAL;
274
275 mutex_lock(&sdp->sd_log_reserve_mutex);
276 gfs2_log_lock(sdp);
277 while(sdp->sd_log_blks_free <= blks) {
278 gfs2_log_unlock(sdp);
279 gfs2_ail1_empty(sdp, 0);
280 gfs2_log_flush(sdp, NULL);
281
282 if (try++)
283 gfs2_ail1_start(sdp, 0);
284 gfs2_log_lock(sdp);
285 }
286 sdp->sd_log_blks_free -= blks;
287 gfs2_log_unlock(sdp);
288 mutex_unlock(&sdp->sd_log_reserve_mutex);
289
290 down_read(&sdp->sd_log_flush_lock);
291
292 return 0;
293}
294
295/**
296 * gfs2_log_release - Release a given number of log blocks
297 * @sdp: The GFS2 superblock
298 * @blks: The number of blocks
299 *
300 */
301
302void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
303{
304
305 gfs2_log_lock(sdp);
306 sdp->sd_log_blks_free += blks;
307 gfs2_assert_withdraw(sdp,
308 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
309 gfs2_log_unlock(sdp);
310 up_read(&sdp->sd_log_flush_lock);
311}
312
313static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
314{
315 int error;
316 struct buffer_head bh_map;
317
318 error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, 0, &bh_map, 1);
319 if (error || !bh_map.b_blocknr)
320 printk(KERN_INFO "error=%d, dbn=%llu lbn=%u\n", error, (unsigned long long)bh_map.b_blocknr, lbn);
321 gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
322
323 return bh_map.b_blocknr;
324}
325
326/**
327 * log_distance - Compute distance between two journal blocks
328 * @sdp: The GFS2 superblock
329 * @newer: The most recent journal block of the pair
330 * @older: The older journal block of the pair
331 *
332 * Compute the distance (in the journal direction) between two
333 * blocks in the journal
334 *
335 * Returns: the distance in blocks
336 */
337
338static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
339 unsigned int older)
340{
341 int dist;
342
343 dist = newer - older;
344 if (dist < 0)
345 dist += sdp->sd_jdesc->jd_blocks;
346
347 return dist;
348}
349
350static unsigned int current_tail(struct gfs2_sbd *sdp)
351{
352 struct gfs2_ail *ai;
353 unsigned int tail;
354
355 gfs2_log_lock(sdp);
356
357 if (list_empty(&sdp->sd_ail1_list)) {
358 tail = sdp->sd_log_head;
359 } else {
360 ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
361 tail = ai->ai_first;
362 }
363
364 gfs2_log_unlock(sdp);
365
366 return tail;
367}
368
369static inline void log_incr_head(struct gfs2_sbd *sdp)
370{
371 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
372 gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head);
373
374 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
375 sdp->sd_log_flush_head = 0;
376 sdp->sd_log_flush_wrapped = 1;
377 }
378}
379
380/**
381 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
382 * @sdp: The GFS2 superblock
383 *
384 * Returns: the buffer_head
385 */
386
387struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
388{
389 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
390 struct gfs2_log_buf *lb;
391 struct buffer_head *bh;
392
393 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
394 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
395
396 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
397 lock_buffer(bh);
398 memset(bh->b_data, 0, bh->b_size);
399 set_buffer_uptodate(bh);
400 clear_buffer_dirty(bh);
401 unlock_buffer(bh);
402
403 log_incr_head(sdp);
404
405 return bh;
406}
407
408/**
409 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
410 * @sdp: the filesystem
411 * @real: the buffer_head whose data the log buffer should point to
412 *
413 * Returns: the new buffer_head
414 */
415
416struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
417 struct buffer_head *real)
418{
419 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
420 struct gfs2_log_buf *lb;
421 struct buffer_head *bh;
422
423 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
424 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
425 lb->lb_real = real;
426
427 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
428 atomic_set(&bh->b_count, 1);
429 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
430 set_bh_page(bh, real->b_page, bh_offset(real));
431 bh->b_blocknr = blkno;
432 bh->b_size = sdp->sd_sb.sb_bsize;
433 bh->b_bdev = sdp->sd_vfs->s_bdev;
434
435 log_incr_head(sdp);
436
437 return bh;
438}
439
440static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
441{
442 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
443
444 ail2_empty(sdp, new_tail);
445
446 gfs2_log_lock(sdp);
447 sdp->sd_log_blks_free += dist - (pull ? 1 : 0);
448 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
449 gfs2_log_unlock(sdp);
450
451 sdp->sd_log_tail = new_tail;
452}
453
454/**
455 * log_write_header - Write a journal log header at the current flush head
456 * @sdp: The GFS2 superblock
457 *
458 * Also pulls the log tail forward when it has moved.
459 */
460
461static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
462{
463 u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
464 struct buffer_head *bh;
465 struct gfs2_log_header *lh;
466 unsigned int tail;
467 u32 hash;
468
469 bh = sb_getblk(sdp->sd_vfs, blkno);
470 lock_buffer(bh);
471 memset(bh->b_data, 0, bh->b_size);
472 set_buffer_uptodate(bh);
473 clear_buffer_dirty(bh);
474 unlock_buffer(bh);
475
476 gfs2_ail1_empty(sdp, 0);
477 tail = current_tail(sdp);
478
479 lh = (struct gfs2_log_header *)bh->b_data;
480 memset(lh, 0, sizeof(struct gfs2_log_header));
481 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
482 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
483 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
484 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
485 lh->lh_flags = cpu_to_be32(flags);
486 lh->lh_tail = cpu_to_be32(tail);
487 lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
488 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
489 lh->lh_hash = cpu_to_be32(hash);
490
491 set_buffer_dirty(bh);
492 if (sync_dirty_buffer(bh))
493 gfs2_io_error_bh(sdp, bh);
494 brelse(bh);
495
496 if (sdp->sd_log_tail != tail)
497 log_pull_tail(sdp, tail, pull);
498 else
499 gfs2_assert_withdraw(sdp, !pull);
500
501 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
502 log_incr_head(sdp);
503}
504
505static void log_flush_commit(struct gfs2_sbd *sdp)
506{
507 struct list_head *head = &sdp->sd_log_flush_list;
508 struct gfs2_log_buf *lb;
509 struct buffer_head *bh;
510
511 while (!list_empty(head)) {
512 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
513 list_del(&lb->lb_list);
514 bh = lb->lb_bh;
515
516 wait_on_buffer(bh);
517 if (!buffer_uptodate(bh))
518 gfs2_io_error_bh(sdp, bh);
519 if (lb->lb_real) {
520 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
521 schedule();
522 free_buffer_head(bh);
523 } else
524 brelse(bh);
525 kfree(lb);
526 }
527
528 log_write_header(sdp, 0, 0);
529}
530
531/**
532 * gfs2_log_flush - flush incore transaction(s)
533 * @sdp: the filesystem
534 * @gl: The glock structure to flush. If NULL, flush the whole incore log
535 *
536 */
537
538void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
539{
540 struct gfs2_ail *ai;
541
542 down_write(&sdp->sd_log_flush_lock);
543
544 if (gl) {
545 gfs2_log_lock(sdp);
546 if (list_empty(&gl->gl_le.le_list)) {
547 gfs2_log_unlock(sdp);
548 up_write(&sdp->sd_log_flush_lock);
549 return;
550 }
551 gfs2_log_unlock(sdp);
552 }
553
554 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
555 INIT_LIST_HEAD(&ai->ai_ail1_list);
556 INIT_LIST_HEAD(&ai->ai_ail2_list);
557
558 gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
559 gfs2_assert_withdraw(sdp,
560 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
561
562 sdp->sd_log_flush_head = sdp->sd_log_head;
563 sdp->sd_log_flush_wrapped = 0;
564 ai->ai_first = sdp->sd_log_flush_head;
565
566 lops_before_commit(sdp);
567 if (!list_empty(&sdp->sd_log_flush_list))
568 log_flush_commit(sdp);
569 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
570 log_write_header(sdp, 0, PULL);
571 lops_after_commit(sdp, ai);
572 sdp->sd_log_head = sdp->sd_log_flush_head;
573
574 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
575
576 sdp->sd_log_blks_reserved = 0;
577 sdp->sd_log_commited_buf = 0;
578 sdp->sd_log_num_hdrs = 0;
579 sdp->sd_log_commited_revoke = 0;
580
581 gfs2_log_lock(sdp);
582 if (!list_empty(&ai->ai_ail1_list)) {
583 list_add(&ai->ai_list, &sdp->sd_ail1_list);
584 ai = NULL;
585 }
586 gfs2_log_unlock(sdp);
587
588 sdp->sd_vfs->s_dirt = 0;
589 up_write(&sdp->sd_log_flush_lock);
590
591 kfree(ai);
592}
593
594static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
595{
596 unsigned int reserved = 0;
597 unsigned int old;
598
599 gfs2_log_lock(sdp);
600
601 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
602 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
603 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
604 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
605
606 if (sdp->sd_log_commited_buf)
607 reserved += sdp->sd_log_commited_buf;
608 if (sdp->sd_log_commited_revoke)
609 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
610 sizeof(u64));
611 if (reserved)
612 reserved++;
613
614 old = sdp->sd_log_blks_free;
615 sdp->sd_log_blks_free += tr->tr_reserved -
616 (reserved - sdp->sd_log_blks_reserved);
617
618 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
619 gfs2_assert_withdraw(sdp,
620 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
621 sdp->sd_log_num_hdrs);
622
623 sdp->sd_log_blks_reserved = reserved;
624
625 gfs2_log_unlock(sdp);
626}
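
Concretely, log_refund() re-prices the reservation as one block per committed buffer, plus descriptor blocks for the revokes, plus one extra block (presumably the header) whenever anything is outstanding. A numeric sketch follows, reusing the illustrative per-block capacities from the earlier struct2blk example; the counts are invented, and the variable spelling mirrors the source's sd_log_commited_* fields.

    /* Numeric sketch of log_refund()'s re-pricing. */
    #include <stdio.h>

    static unsigned int struct2blk(unsigned int n)  /* revokes per block: 503/509 */
    {
        unsigned int first = 503, second = 509, blks = 1;

        if (n > first)
            blks += (n - first + second - 1) / second;
        return blks;
    }

    int main(void)
    {
        unsigned int commited_buf = 40, commited_revoke = 10, reserved = 0;

        if (commited_buf)
            reserved += commited_buf;             /* one log block per buffer */
        if (commited_revoke)
            reserved += struct2blk(commited_revoke);  /* 1 descriptor block */
        if (reserved)
            reserved++;                           /* the extra header block */
        printf("reserved = %u blocks\n", reserved);   /* 40 + 1 + 1 = 42 */
        return 0;
    }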
627
628/**
629 * gfs2_log_commit - Commit a transaction to the log
630 * @sdp: the filesystem
631 * @tr: the transaction
632 *
633 * Returns: errno
634 */
635
636void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
637{
638 log_refund(sdp, tr);
639 lops_incore_commit(sdp, tr);
640
641 sdp->sd_vfs->s_dirt = 1;
642 up_read(&sdp->sd_log_flush_lock);
643
644 gfs2_log_lock(sdp);
645 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
646 gfs2_log_unlock(sdp);
647 gfs2_log_flush(sdp, NULL);
648 } else {
649 gfs2_log_unlock(sdp);
650 }
651}
652
653/**
654 * gfs2_log_shutdown - write a shutdown header into a journal
655 * @sdp: the filesystem
656 *
657 */
658
659void gfs2_log_shutdown(struct gfs2_sbd *sdp)
660{
661 down_write(&sdp->sd_log_flush_lock);
662
663 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
664 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
665 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
666 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
667 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
668 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
669 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
670 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
671 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
672
673 sdp->sd_log_flush_head = sdp->sd_log_head;
674 sdp->sd_log_flush_wrapped = 0;
675
676 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
677
678 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
679 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
680 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
681
682 sdp->sd_log_head = sdp->sd_log_flush_head;
683 sdp->sd_log_tail = sdp->sd_log_head;
684
685 up_write(&sdp->sd_log_flush_lock);
686}
687
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..7f5737d55612
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,65 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13#include <linux/list.h>
14#include <linux/spinlock.h>
15#include "incore.h"
16
17/**
18 * gfs2_log_lock - acquire the right to mess with the log manager
19 * @sdp: the filesystem
20 *
21 */
22
23static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
24{
25 spin_lock(&sdp->sd_log_lock);
26}
27
28/**
29 * gfs2_log_unlock - release the right to mess with the log manager
30 * @sdp: the filesystem
31 *
32 */
33
34static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
35{
36 spin_unlock(&sdp->sd_log_lock);
37}
38
39static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
40 unsigned int value)
41{
42 if (++value == sdp->sd_jdesc->jd_blocks) {
43 value = 0;
44 }
45 sdp->sd_log_head = sdp->sd_log_tail = value;
46}
47
48unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
49 unsigned int ssize);
50
51void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
52int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
53
54int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
55void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
56
57struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
58struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
59 struct buffer_head *real);
60void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
61void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
62
63void gfs2_log_shutdown(struct gfs2_sbd *sdp);
64
65#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..881e337b6a70
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,809 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "glock.h"
21#include "log.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "rgrp.h"
26#include "trans.h"
27#include "util.h"
28
29static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
30{
31 struct gfs2_glock *gl;
32 struct gfs2_trans *tr = current->journal_info;
33
34 tr->tr_touched = 1;
35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44
45 gfs2_log_lock(sdp);
46 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp);
49}
50
51static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 struct list_head *head = &sdp->sd_log_le_gl;
54 struct gfs2_glock *gl;
55
56 while (!list_empty(head)) {
57 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
58 list_del_init(&gl->gl_le.le_list);
59 sdp->sd_log_num_gl--;
60
61 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
62 gfs2_glock_put(gl);
63 }
64 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
65}
66
67static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
68{
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr;
71
72 if (!list_empty(&bd->bd_list_tr))
73 return;
74
75 tr = current->journal_info;
76 tr->tr_touched = 1;
77 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
79
80 if (!list_empty(&le->le_list))
81 return;
82
83 gfs2_trans_add_gl(bd->bd_gl);
84
85 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf);
91 gfs2_log_unlock(sdp);
92
93 tr->tr_num_buf_new++;
94}
95
96static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
97{
98 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd;
100
101 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--;
105 }
106 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107}
108
109static void buf_lo_before_commit(struct gfs2_sbd *sdp)
110{
111 struct buffer_head *bh;
112 struct gfs2_log_descriptor *ld;
113 struct gfs2_bufdata *bd1 = NULL, *bd2;
114 unsigned int total = sdp->sd_log_num_buf;
115 unsigned int offset = sizeof(struct gfs2_log_descriptor);
116 unsigned int limit;
117 unsigned int num;
118 unsigned n;
119 __be64 *ptr;
120
121 offset += sizeof(__be64) - 1;
122 offset &= ~(sizeof(__be64) - 1);
123 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
124 /* for 4k blocks, limit = 503 */
125
126 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
127 while(total) {
128 num = total;
129 if (total > limit)
130 num = limit;
131 bh = gfs2_log_get_buf(sdp);
132 sdp->sd_log_num_hdrs++;
133 ld = (struct gfs2_log_descriptor *)bh->b_data;
134 ptr = (__be64 *)(bh->b_data + offset);
135 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
136 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
137 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
138 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
139 ld->ld_length = cpu_to_be32(num + 1);
140 ld->ld_data1 = cpu_to_be32(num);
141 ld->ld_data2 = cpu_to_be32(0);
142 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
143
144 n = 0;
145 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
146 bd_le.le_list) {
147 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
148 if (++n >= num)
149 break;
150 }
151
152 set_buffer_dirty(bh);
153 ll_rw_block(WRITE, 1, &bh);
154
155 n = 0;
156 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
157 bd_le.le_list) {
158 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
159 set_buffer_dirty(bh);
160 ll_rw_block(WRITE, 1, &bh);
161 if (++n >= num)
162 break;
163 }
164
165 total -= num;
166 }
167}
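
The "limit = 503" comment falls out of the alignment arithmetic at the top of this function: round the descriptor size up to an 8-byte boundary, then pack one __be64 block number into each remaining slot. The 72-byte descriptor size below is inferred from the 503 figure rather than read from gfs2_ondisk.h, so treat it as an assumption.

    /* Derivation of "for 4k blocks, limit = 503". */
    #include <stdio.h>

    int main(void)
    {
        unsigned int bsize = 4096;
        unsigned int offset = 72;                  /* assumed descriptor size */

        offset += sizeof(unsigned long long) - 1;  /* round up to __be64 */
        offset &= ~(sizeof(unsigned long long) - 1);
        printf("limit = %u\n", (bsize - offset) / 8);  /* (4096 - 72) / 8 = 503 */
        return 0;
    }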
168
169static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
170{
171 struct list_head *head = &sdp->sd_log_le_buf;
172 struct gfs2_bufdata *bd;
173
174 while (!list_empty(head)) {
175 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
176 list_del_init(&bd->bd_le.le_list);
177 sdp->sd_log_num_buf--;
178
179 gfs2_unpin(sdp, bd->bd_bh, ai);
180 }
181 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
182}
183
184static void buf_lo_before_scan(struct gfs2_jdesc *jd,
185 struct gfs2_log_header *head, int pass)
186{
187 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
188
189 if (pass != 0)
190 return;
191
192 sdp->sd_found_blocks = 0;
193 sdp->sd_replayed_blocks = 0;
194}
195
196static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
197 struct gfs2_log_descriptor *ld, __be64 *ptr,
198 int pass)
199{
200 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
201 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
202 struct gfs2_glock *gl = ip->i_gl;
203 unsigned int blks = be32_to_cpu(ld->ld_data1);
204 struct buffer_head *bh_log, *bh_ip;
205 u64 blkno;
206 int error = 0;
207
208 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
209 return 0;
210
211 gfs2_replay_incr_blk(sdp, &start);
212
213 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
214 blkno = be64_to_cpu(*ptr++);
215
216 sdp->sd_found_blocks++;
217
218 if (gfs2_revoke_check(sdp, blkno, start))
219 continue;
220
221 error = gfs2_replay_read_block(jd, start, &bh_log);
222 if (error)
223 return error;
224
225 bh_ip = gfs2_meta_new(gl, blkno);
226 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
227
228 if (gfs2_meta_check(sdp, bh_ip))
229 error = -EIO;
230 else
231 mark_buffer_dirty(bh_ip);
232
233 brelse(bh_log);
234 brelse(bh_ip);
235
236 if (error)
237 break;
238
239 sdp->sd_replayed_blocks++;
240 }
241
242 return error;
243}
244
245static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
246{
247 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
248 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
249
250 if (error) {
251 gfs2_meta_sync(ip->i_gl);
252 return;
253 }
254 if (pass != 1)
255 return;
256
257 gfs2_meta_sync(ip->i_gl);
258
259 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
260 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
261}
262
263static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
264{
265 struct gfs2_trans *tr;
266
267 tr = current->journal_info;
268 tr->tr_touched = 1;
269 tr->tr_num_revoke++;
270
271 gfs2_log_lock(sdp);
272 sdp->sd_log_num_revoke++;
273 list_add(&le->le_list, &sdp->sd_log_le_revoke);
274 gfs2_log_unlock(sdp);
275}
276
277static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
278{
279 struct gfs2_log_descriptor *ld;
280 struct gfs2_meta_header *mh;
281 struct buffer_head *bh;
282 unsigned int offset;
283 struct list_head *head = &sdp->sd_log_le_revoke;
284 struct gfs2_revoke *rv;
285
286 if (!sdp->sd_log_num_revoke)
287 return;
288
289 bh = gfs2_log_get_buf(sdp);
290 ld = (struct gfs2_log_descriptor *)bh->b_data;
291 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
292 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
293 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
294 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
295 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
296 sizeof(u64)));
297 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
298 ld->ld_data2 = cpu_to_be32(0);
299 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
300 offset = sizeof(struct gfs2_log_descriptor);
301
302 while (!list_empty(head)) {
303 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
304 list_del_init(&rv->rv_le.le_list);
305 sdp->sd_log_num_revoke--;
306
307 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
308 set_buffer_dirty(bh);
309 ll_rw_block(WRITE, 1, &bh);
310
311 bh = gfs2_log_get_buf(sdp);
312 mh = (struct gfs2_meta_header *)bh->b_data;
313 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
314 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
315 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
316 offset = sizeof(struct gfs2_meta_header);
317 }
318
319 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
320 kfree(rv);
321
322 offset += sizeof(u64);
323 }
324 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
325
326 set_buffer_dirty(bh);
327 ll_rw_block(WRITE, 1, &bh);
328}
329
330static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
331 struct gfs2_log_header *head, int pass)
332{
333 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
334
335 if (pass != 0)
336 return;
337
338 sdp->sd_found_revokes = 0;
339 sdp->sd_replay_tail = head->lh_tail;
340}
341
342static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
343 struct gfs2_log_descriptor *ld, __be64 *ptr,
344 int pass)
345{
346 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
347 unsigned int blks = be32_to_cpu(ld->ld_length);
348 unsigned int revokes = be32_to_cpu(ld->ld_data1);
349 struct buffer_head *bh;
350 unsigned int offset;
351 u64 blkno;
352 int first = 1;
353 int error;
354
355 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
356 return 0;
357
358 offset = sizeof(struct gfs2_log_descriptor);
359
360 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
361 error = gfs2_replay_read_block(jd, start, &bh);
362 if (error)
363 return error;
364
365 if (!first)
366 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
367
368 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
369 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
370
371 error = gfs2_revoke_add(sdp, blkno, start);
372 if (error < 0)
373 return error;
374 else if (error)
375 sdp->sd_found_revokes++;
376
377 if (!--revokes)
378 break;
379 offset += sizeof(u64);
380 }
381
382 brelse(bh);
383 offset = sizeof(struct gfs2_meta_header);
384 first = 0;
385 }
386
387 return 0;
388}
389
390static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
391{
392 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
393
394 if (error) {
395 gfs2_revoke_clean(sdp);
396 return;
397 }
398 if (pass != 1)
399 return;
400
401 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
402 jd->jd_jid, sdp->sd_found_revokes);
403
404 gfs2_revoke_clean(sdp);
405}
406
407static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
408{
409 struct gfs2_rgrpd *rgd;
410 struct gfs2_trans *tr = current->journal_info;
411
412 tr->tr_touched = 1;
413
414 if (!list_empty(&le->le_list))
415 return;
416
417 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
418 gfs2_rgrp_bh_hold(rgd);
419
420 gfs2_log_lock(sdp);
421 sdp->sd_log_num_rg++;
422 list_add(&le->le_list, &sdp->sd_log_le_rg);
423 gfs2_log_unlock(sdp);
424}
425
426static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
427{
428 struct list_head *head = &sdp->sd_log_le_rg;
429 struct gfs2_rgrpd *rgd;
430
431 while (!list_empty(head)) {
432 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
433 list_del_init(&rgd->rd_le.le_list);
434 sdp->sd_log_num_rg--;
435
436 gfs2_rgrp_repolish_clones(rgd);
437 gfs2_rgrp_bh_put(rgd);
438 }
439 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
440}
441
442/**
443 * databuf_lo_add - Add a databuf to the transaction.
444 *
445 * This is used in two distinct cases:
446 * i) In ordered write mode
447 * We put the data buffer on a list so that we can ensure that it's
448 * synced to disk at the right time.
449 * ii) In journaled data mode
450 * We need to journal the data block in the same way as metadata in
451 * the functions above. The difference is that here we have a tag
452 * which is two __be64's being the block number (as per meta data)
453 * and a flag which says whether the data block needs escaping or
454 * not. This means we need a new log entry for each 251 or so data
455 * blocks, which isn't an enormous overhead but twice as much as
456 * for normal metadata blocks.
457 */
458static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
459{
460 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
461 struct gfs2_trans *tr = current->journal_info;
462 struct address_space *mapping = bd->bd_bh->b_page->mapping;
463 struct gfs2_inode *ip = GFS2_I(mapping->host);
464
465 tr->tr_touched = 1;
466 if (list_empty(&bd->bd_list_tr) &&
467 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
468 tr->tr_num_buf++;
469 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
470 gfs2_pin(sdp, bd->bd_bh);
471 tr->tr_num_buf_new++;
472 }
473 gfs2_trans_add_gl(bd->bd_gl);
474 gfs2_log_lock(sdp);
475 if (list_empty(&le->le_list)) {
476 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
477 sdp->sd_log_num_jdata++;
478 sdp->sd_log_num_databuf++;
479 list_add(&le->le_list, &sdp->sd_log_le_databuf);
480 }
481 gfs2_log_unlock(sdp);
482}
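
The "251 or so" in the comment above follows the same arithmetic as the metadata case, but with two __be64 tags per data block and the descriptor offset rounded to a 16-byte boundary (again assuming a 72-byte descriptor, which is an inference, not a quoted size):

    /* Why journaled data needs a new descriptor every ~251 blocks. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int bsize = 4096, offset = 72;    /* assumed descriptor size */

        offset = (offset + 15) & ~15u;             /* align to 2 * sizeof(__be64) */
        printf("limit = %u\n", (bsize - offset) / 16);  /* (4096 - 80) / 16 = 251 */
        return 0;
    }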
483
484static int gfs2_check_magic(struct buffer_head *bh)
485{
486 struct page *page = bh->b_page;
487 void *kaddr;
488 __be32 *ptr;
489 int rv = 0;
490
491 kaddr = kmap_atomic(page, KM_USER0);
492 ptr = kaddr + bh_offset(bh);
493 if (*ptr == cpu_to_be32(GFS2_MAGIC))
494 rv = 1;
495 kunmap_atomic(page, KM_USER0);
496
497 return rv;
498}
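
gfs2_check_magic() exists because a journaled data block that happens to begin with GFS2_MAGIC would look like metadata to journal replay. The escape scheme zeroes that first word in the logged copy and records a flag in the tag; replay restores the magic (see databuf_lo_scan_elements() below). A freestanding round-trip of the idea, ignoring the endianness handling of the real code:

    /* Round-trip of the data-block escaping used for journaled data. */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    #define GFS2_MAGIC 0x01161970u

    static int escape(uint32_t *block)             /* before writing to the log */
    {
        if (*block != GFS2_MAGIC)
            return 0;
        *block = 0;                                /* zero the colliding word */
        return 1;                                  /* remembered in the log tag */
    }

    static void unescape(uint32_t *block, int esc) /* during journal replay */
    {
        if (esc)
            *block = GFS2_MAGIC;
    }

    int main(void)
    {
        uint32_t block[4] = { GFS2_MAGIC, 1, 2, 3 }, copy[4];
        int esc;

        memcpy(copy, block, sizeof(copy));
        esc = escape(copy);
        unescape(copy, esc);
        assert(memcmp(copy, block, sizeof(copy)) == 0);
        return 0;
    }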
499
500/**
501 * databuf_lo_before_commit - Scan the data buffers, writing as we go
502 *
503 * Here we scan through the lists of buffers and make the assumption
504 * that any buffer that's been pinned is being journaled, and that
505 * any unpinned buffer is an ordered write data buffer and therefore
506 * will be written back rather than journaled.
507 */
508static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
509{
510 LIST_HEAD(started);
511 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
512 struct buffer_head *bh = NULL;
513 unsigned int offset = sizeof(struct gfs2_log_descriptor);
514 struct gfs2_log_descriptor *ld;
515 unsigned int limit;
516 unsigned int total_dbuf = sdp->sd_log_num_databuf;
517 unsigned int total_jdata = sdp->sd_log_num_jdata;
518 unsigned int num, n;
519 __be64 *ptr = NULL;
520
521 offset += 2*sizeof(__be64) - 1;
522 offset &= ~(2*sizeof(__be64) - 1);
523 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
524
525 /*
526 * Start writing ordered buffers, write journaled buffers
527 * into the log along with a header
528 */
529 gfs2_log_lock(sdp);
530 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
531 bd_le.le_list);
532 while(total_dbuf) {
533 num = total_jdata;
534 if (num > limit)
535 num = limit;
536 n = 0;
537 list_for_each_entry_safe_continue(bd1, bdt,
538 &sdp->sd_log_le_databuf,
539 bd_le.le_list) {
540 /* An ordered write buffer */
541 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
542 list_move(&bd1->bd_le.le_list, &started);
543 if (bd1 == bd2) {
544 bd2 = NULL;
545 bd2 = list_prepare_entry(bd2,
546 &sdp->sd_log_le_databuf,
547 bd_le.le_list);
548 }
549 total_dbuf--;
550 if (bd1->bd_bh) {
551 get_bh(bd1->bd_bh);
552 if (buffer_dirty(bd1->bd_bh)) {
553 gfs2_log_unlock(sdp);
554 wait_on_buffer(bd1->bd_bh);
555 ll_rw_block(WRITE, 1,
556 &bd1->bd_bh);
557 gfs2_log_lock(sdp);
558 }
559 brelse(bd1->bd_bh);
560 continue;
561 }
562 continue;
563 } else if (bd1->bd_bh) { /* A journaled buffer */
564 int magic;
565 gfs2_log_unlock(sdp);
566 if (!bh) {
567 bh = gfs2_log_get_buf(sdp);
568 sdp->sd_log_num_hdrs++;
569 ld = (struct gfs2_log_descriptor *)
570 bh->b_data;
571 ptr = (__be64 *)(bh->b_data + offset);
572 ld->ld_header.mh_magic =
573 cpu_to_be32(GFS2_MAGIC);
574 ld->ld_header.mh_type =
575 cpu_to_be32(GFS2_METATYPE_LD);
576 ld->ld_header.mh_format =
577 cpu_to_be32(GFS2_FORMAT_LD);
578 ld->ld_type =
579 cpu_to_be32(GFS2_LOG_DESC_JDATA);
580 ld->ld_length = cpu_to_be32(num + 1);
581 ld->ld_data1 = cpu_to_be32(num);
582 ld->ld_data2 = cpu_to_be32(0);
583 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
584 }
585 magic = gfs2_check_magic(bd1->bd_bh);
586 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
587 *ptr++ = cpu_to_be64((__u64)magic);
588 clear_buffer_escaped(bd1->bd_bh);
589 if (unlikely(magic != 0))
590 set_buffer_escaped(bd1->bd_bh);
591 gfs2_log_lock(sdp);
592 if (n++ > num)
593 break;
594 } else if (!bd1->bd_bh) {
595 total_dbuf--;
596 sdp->sd_log_num_databuf--;
597 list_del_init(&bd1->bd_le.le_list);
598 if (bd1 == bd2) {
599 bd2 = NULL;
600 bd2 = list_prepare_entry(bd2,
601 &sdp->sd_log_le_databuf,
602 bd_le.le_list);
603 }
604 kmem_cache_free(gfs2_bufdata_cachep, bd1);
605 }
606 }
607 gfs2_log_unlock(sdp);
608 if (bh) {
609 set_buffer_dirty(bh);
610 ll_rw_block(WRITE, 1, &bh);
611 bh = NULL;
612 }
613 n = 0;
614 gfs2_log_lock(sdp);
615 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
616 bd_le.le_list) {
617 if (!bd2->bd_bh)
618 continue;
619 /* copy buffer if it needs escaping */
620 gfs2_log_unlock(sdp);
621 if (unlikely(buffer_escaped(bd2->bd_bh))) {
622 void *kaddr;
623 struct page *page = bd2->bd_bh->b_page;
624 bh = gfs2_log_get_buf(sdp);
625 kaddr = kmap_atomic(page, KM_USER0);
626 memcpy(bh->b_data,
627 kaddr + bh_offset(bd2->bd_bh),
628 sdp->sd_sb.sb_bsize);
629 kunmap_atomic(page, KM_USER0);
630 *(__be32 *)bh->b_data = 0;
631 } else {
632 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
633 }
634 set_buffer_dirty(bh);
635 ll_rw_block(WRITE, 1, &bh);
636 gfs2_log_lock(sdp);
637 if (++n >= num)
638 break;
639 }
640 bh = NULL;
641 total_dbuf -= num;
642 total_jdata -= num;
643 }
644 gfs2_log_unlock(sdp);
645
646 /* Wait on all ordered buffers */
647 while (!list_empty(&started)) {
648 gfs2_log_lock(sdp);
649 bd1 = list_entry(started.next, struct gfs2_bufdata,
650 bd_le.le_list);
651 list_del_init(&bd1->bd_le.le_list);
652 sdp->sd_log_num_databuf--;
653 bh = bd1->bd_bh;
654 if (bh) {
655 bh->b_private = NULL;
656 get_bh(bh);
657 gfs2_log_unlock(sdp);
658 wait_on_buffer(bh);
659 brelse(bh);
660 } else
661 gfs2_log_unlock(sdp);
662
663 kmem_cache_free(gfs2_bufdata_cachep, bd1);
664 }
665
666 /* We've removed all the ordered write bufs here, so only jdata left */
667 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
668}
669
670static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
671 struct gfs2_log_descriptor *ld,
672 __be64 *ptr, int pass)
673{
674 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
675 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
676 struct gfs2_glock *gl = ip->i_gl;
677 unsigned int blks = be32_to_cpu(ld->ld_data1);
678 struct buffer_head *bh_log, *bh_ip;
679 u64 blkno;
680 u64 esc;
681 int error = 0;
682
683 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
684 return 0;
685
686 gfs2_replay_incr_blk(sdp, &start);
687 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
688 blkno = be64_to_cpu(*ptr++);
689 esc = be64_to_cpu(*ptr++);
690
691 sdp->sd_found_blocks++;
692
693 if (gfs2_revoke_check(sdp, blkno, start))
694 continue;
695
696 error = gfs2_replay_read_block(jd, start, &bh_log);
697 if (error)
698 return error;
699
700 bh_ip = gfs2_meta_new(gl, blkno);
701 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
702
703 /* Unescape */
704 if (esc) {
705 __be32 *eptr = (__be32 *)bh_ip->b_data;
706 *eptr = cpu_to_be32(GFS2_MAGIC);
707 }
708 mark_buffer_dirty(bh_ip);
709
710 brelse(bh_log);
711 brelse(bh_ip);
712 if (error)
713 break;
714
715 sdp->sd_replayed_blocks++;
716 }
717
718 return error;
719}
720
721/* FIXME: sort out accounting for log blocks etc. */
722
723static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
724{
725 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
726 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
727
728 if (error) {
729 gfs2_meta_sync(ip->i_gl);
730 return;
731 }
732 if (pass != 1)
733 return;
734
735 /* data sync? */
736 gfs2_meta_sync(ip->i_gl);
737
738 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
739 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
740}
741
742static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
743{
744 struct list_head *head = &sdp->sd_log_le_databuf;
745 struct gfs2_bufdata *bd;
746
747 while (!list_empty(head)) {
748 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
749 list_del_init(&bd->bd_le.le_list);
750 sdp->sd_log_num_databuf--;
751 sdp->sd_log_num_jdata--;
752 gfs2_unpin(sdp, bd->bd_bh, ai);
753 }
754 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
755 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
756}
757
758
759const struct gfs2_log_operations gfs2_glock_lops = {
760 .lo_add = glock_lo_add,
761 .lo_after_commit = glock_lo_after_commit,
762 .lo_name = "glock",
763};
764
765const struct gfs2_log_operations gfs2_buf_lops = {
766 .lo_add = buf_lo_add,
767 .lo_incore_commit = buf_lo_incore_commit,
768 .lo_before_commit = buf_lo_before_commit,
769 .lo_after_commit = buf_lo_after_commit,
770 .lo_before_scan = buf_lo_before_scan,
771 .lo_scan_elements = buf_lo_scan_elements,
772 .lo_after_scan = buf_lo_after_scan,
773 .lo_name = "buf",
774};
775
776const struct gfs2_log_operations gfs2_revoke_lops = {
777 .lo_add = revoke_lo_add,
778 .lo_before_commit = revoke_lo_before_commit,
779 .lo_before_scan = revoke_lo_before_scan,
780 .lo_scan_elements = revoke_lo_scan_elements,
781 .lo_after_scan = revoke_lo_after_scan,
782 .lo_name = "revoke",
783};
784
785const struct gfs2_log_operations gfs2_rg_lops = {
786 .lo_add = rg_lo_add,
787 .lo_after_commit = rg_lo_after_commit,
788 .lo_name = "rg",
789};
790
791const struct gfs2_log_operations gfs2_databuf_lops = {
792 .lo_add = databuf_lo_add,
793 .lo_incore_commit = buf_lo_incore_commit,
794 .lo_before_commit = databuf_lo_before_commit,
795 .lo_after_commit = databuf_lo_after_commit,
796 .lo_scan_elements = databuf_lo_scan_elements,
797 .lo_after_scan = databuf_lo_after_scan,
798 .lo_name = "databuf",
799};
800
801const struct gfs2_log_operations *gfs2_log_ops[] = {
802 &gfs2_glock_lops,
803 &gfs2_buf_lops,
804 &gfs2_revoke_lops,
805 &gfs2_rg_lops,
806 &gfs2_databuf_lops,
807 NULL,
808};
809
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..5839c05ae6be
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,99 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13#include <linux/list.h>
14#include "incore.h"
15
16extern const struct gfs2_log_operations gfs2_glock_lops;
17extern const struct gfs2_log_operations gfs2_buf_lops;
18extern const struct gfs2_log_operations gfs2_revoke_lops;
19extern const struct gfs2_log_operations gfs2_rg_lops;
20extern const struct gfs2_log_operations gfs2_databuf_lops;
21
22extern const struct gfs2_log_operations *gfs2_log_ops[];
23
24static inline void lops_init_le(struct gfs2_log_element *le,
25 const struct gfs2_log_operations *lops)
26{
27 INIT_LIST_HEAD(&le->le_list);
28 le->le_ops = lops;
29}
30
31static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
32{
33 if (le->le_ops->lo_add)
34 le->le_ops->lo_add(sdp, le);
35}
36
37static inline void lops_incore_commit(struct gfs2_sbd *sdp,
38 struct gfs2_trans *tr)
39{
40 int x;
41 for (x = 0; gfs2_log_ops[x]; x++)
42 if (gfs2_log_ops[x]->lo_incore_commit)
43 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
44}
45
46static inline void lops_before_commit(struct gfs2_sbd *sdp)
47{
48 int x;
49 for (x = 0; gfs2_log_ops[x]; x++)
50 if (gfs2_log_ops[x]->lo_before_commit)
51 gfs2_log_ops[x]->lo_before_commit(sdp);
52}
53
54static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
55{
56 int x;
57 for (x = 0; gfs2_log_ops[x]; x++)
58 if (gfs2_log_ops[x]->lo_after_commit)
59 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
60}
61
62static inline void lops_before_scan(struct gfs2_jdesc *jd,
63 struct gfs2_log_header *head,
64 unsigned int pass)
65{
66 int x;
67 for (x = 0; gfs2_log_ops[x]; x++)
68 if (gfs2_log_ops[x]->lo_before_scan)
69 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
70}
71
72static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
73 struct gfs2_log_descriptor *ld,
74 __be64 *ptr,
75 unsigned int pass)
76{
77 int x, error;
78 for (x = 0; gfs2_log_ops[x]; x++)
79 if (gfs2_log_ops[x]->lo_scan_elements) {
80 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
81 ld, ptr, pass);
82 if (error)
83 return error;
84 }
85
86 return 0;
87}
88
89static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
90 unsigned int pass)
91{
92 int x;
93 for (x = 0; gfs2_log_ops[x]; x++)
94		if (gfs2_log_ops[x]->lo_after_scan)
95 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
96}
97
98#endif /* __LOPS_DOT_H__ */
99
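Every helper in lops.h follows the same dispatch idiom: walk the NULL-terminated gfs2_log_ops[] table and call a hook only where the log element type actually implements it, so each element type pays only for the journal phases it cares about. A minimal, self-contained sketch of that idiom (the structure and names below are illustrative stand-ins, not GFS2 symbols):

#include <stdio.h>

struct log_ops {
	void (*before_commit)(void);	/* optional hook; may be NULL */
	const char *name;
};

static void buf_before_commit(void)
{
	printf("buf: write pinned buffers to the journal\n");
}

static const struct log_ops buf_ops = { buf_before_commit, "buf" };
static const struct log_ops rg_ops = { NULL, "rg" };	/* no hook */

static const struct log_ops *ops_table[] = { &buf_ops, &rg_ops, NULL };

int main(void)
{
	int x;

	/* same shape as lops_before_commit(): scan to the NULL
	   sentinel and skip entries that leave the hook unset */
	for (x = 0; ops_table[x]; x++)
		if (ops_table[x]->before_commit)
			ops_table[x]->before_commit();
	return 0;
}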
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..21508a13bb78
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,150 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/atomic.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "ops_fstype.h"
24#include "sys.h"
25#include "util.h"
26#include "glock.h"
27
28static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
29{
30 struct gfs2_inode *ip = foo;
31 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
32 SLAB_CTOR_CONSTRUCTOR) {
33 inode_init_once(&ip->i_inode);
34 spin_lock_init(&ip->i_spin);
35 init_rwsem(&ip->i_rw_mutex);
36 memset(ip->i_cache, 0, sizeof(ip->i_cache));
37 }
38}
39
40static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
41{
42 struct gfs2_glock *gl = foo;
43 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
44 SLAB_CTOR_CONSTRUCTOR) {
45 INIT_HLIST_NODE(&gl->gl_list);
46 spin_lock_init(&gl->gl_spin);
47 INIT_LIST_HEAD(&gl->gl_holders);
48 INIT_LIST_HEAD(&gl->gl_waiters1);
49 INIT_LIST_HEAD(&gl->gl_waiters2);
50 INIT_LIST_HEAD(&gl->gl_waiters3);
51 gl->gl_lvb = NULL;
52 atomic_set(&gl->gl_lvb_count, 0);
53 INIT_LIST_HEAD(&gl->gl_reclaim);
54 INIT_LIST_HEAD(&gl->gl_ail_list);
55 atomic_set(&gl->gl_ail_count, 0);
56 }
57}
58
59/**
60 * init_gfs2_fs - Register GFS2 as a filesystem
61 *
62 * Returns: 0 on success, error code on failure
63 */
64
65static int __init init_gfs2_fs(void)
66{
67 int error;
68
69 error = gfs2_sys_init();
70 if (error)
71 return error;
72
73 error = gfs2_glock_init();
74 if (error)
75 goto fail;
76
77 error = -ENOMEM;
78 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
79 sizeof(struct gfs2_glock),
80 0, 0,
81 gfs2_init_glock_once, NULL);
82 if (!gfs2_glock_cachep)
83 goto fail;
84
85 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
86 sizeof(struct gfs2_inode),
87 0, (SLAB_RECLAIM_ACCOUNT|
88 SLAB_PANIC|SLAB_MEM_SPREAD),
89 gfs2_init_inode_once, NULL);
90 if (!gfs2_inode_cachep)
91 goto fail;
92
93 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
94 sizeof(struct gfs2_bufdata),
95 0, 0, NULL, NULL);
96 if (!gfs2_bufdata_cachep)
97 goto fail;
98
99 error = register_filesystem(&gfs2_fs_type);
100 if (error)
101 goto fail;
102
103 error = register_filesystem(&gfs2meta_fs_type);
104 if (error)
105 goto fail_unregister;
106
107 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
108
109 return 0;
110
111fail_unregister:
112 unregister_filesystem(&gfs2_fs_type);
113fail:
114 if (gfs2_bufdata_cachep)
115 kmem_cache_destroy(gfs2_bufdata_cachep);
116
117 if (gfs2_inode_cachep)
118 kmem_cache_destroy(gfs2_inode_cachep);
119
120 if (gfs2_glock_cachep)
121 kmem_cache_destroy(gfs2_glock_cachep);
122
123 gfs2_sys_uninit();
124 return error;
125}
126
127/**
128 * exit_gfs2_fs - Unregister the file system
129 *
130 */
131
132static void __exit exit_gfs2_fs(void)
133{
134 unregister_filesystem(&gfs2_fs_type);
135 unregister_filesystem(&gfs2meta_fs_type);
136
137 kmem_cache_destroy(gfs2_bufdata_cachep);
138 kmem_cache_destroy(gfs2_inode_cachep);
139 kmem_cache_destroy(gfs2_glock_cachep);
140
141 gfs2_sys_uninit();
142}
143
144MODULE_DESCRIPTION("Global File System");
145MODULE_AUTHOR("Red Hat, Inc.");
146MODULE_LICENSE("GPL");
147
148module_init(init_gfs2_fs);
149module_exit(exit_gfs2_fs);
150
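The two init-once routines above depend on the 2.6.x slab contract: the constructor passed to kmem_cache_create() runs when a slab slot is first populated, not on every allocation, so list heads and spinlocks survive free/alloc cycles already initialised. A rough userspace analogue of that design choice (obj, obj_ctor and the pool are made-up names, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct obj {
	int ctor_runs;		/* how many times init-once ran for this slot */
};

/* plays the role of a slab constructor: called once per slot */
static void obj_ctor(struct obj *o)
{
	memset(o, 0, sizeof(*o));
	o->ctor_runs++;
}

int main(void)
{
	enum { POOL = 4 };
	struct obj *pool = calloc(POOL, sizeof(*pool));
	int i;

	if (!pool)
		return 1;
	for (i = 0; i < POOL; i++)
		obj_ctor(&pool[i]);	/* once per slot, like SLAB_CTOR_CONSTRUCTOR */

	/* "allocating" and "freeing" a slot does not rerun the ctor */
	printf("slot 0 ctor ran %d time(s)\n", pool[0].ctor_runs);
	free(pool);
	return 0;
}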
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..3912d6a4b1e6
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,590 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/bio.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "log.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "rgrp.h"
33#include "trans.h"
34#include "util.h"
35#include "ops_address.h"
36
37static int aspace_get_block(struct inode *inode, sector_t lblock,
38 struct buffer_head *bh_result, int create)
39{
40 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
41 return -EOPNOTSUPP;
42}
43
44static int gfs2_aspace_writepage(struct page *page,
45 struct writeback_control *wbc)
46{
47 return block_write_full_page(page, aspace_get_block, wbc);
48}
49
50static const struct address_space_operations aspace_aops = {
51 .writepage = gfs2_aspace_writepage,
52 .releasepage = gfs2_releasepage,
53};
54
55/**
56 * gfs2_aspace_get - Create and initialize an inode to act as an address space
57 * @sdp: the filesystem the aspace is in
58 *
59 * Right now a struct inode is just a struct inode. Maybe Linux
60 * will supply a more lightweight address space construct (that works)
61 * in the future.
62 *
63 * Make sure pages/buffers in this aspace aren't in high memory.
64 *
65 * Returns: the aspace
66 */
67
68struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
69{
70 struct inode *aspace;
71
72 aspace = new_inode(sdp->sd_vfs);
73 if (aspace) {
74 mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
75 aspace->i_mapping->a_ops = &aspace_aops;
76 aspace->i_size = ~0ULL;
77 aspace->i_private = NULL;
78 insert_inode_hash(aspace);
79 }
80 return aspace;
81}
82
83void gfs2_aspace_put(struct inode *aspace)
84{
85 remove_inode_hash(aspace);
86 iput(aspace);
87}
88
89/**
90 * gfs2_meta_inval - Invalidate all buffers associated with a glock
91 * @gl: the glock
92 *
93 */
94
95void gfs2_meta_inval(struct gfs2_glock *gl)
96{
97 struct gfs2_sbd *sdp = gl->gl_sbd;
98 struct inode *aspace = gl->gl_aspace;
99 struct address_space *mapping = gl->gl_aspace->i_mapping;
100
101 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
102
103 atomic_inc(&aspace->i_writecount);
104 truncate_inode_pages(mapping, 0);
105 atomic_dec(&aspace->i_writecount);
106
107 gfs2_assert_withdraw(sdp, !mapping->nrpages);
108}
109
110/**
111 * gfs2_meta_sync - Sync all buffers associated with a glock
112 * @gl: The glock
113 *
114 */
115
116void gfs2_meta_sync(struct gfs2_glock *gl)
117{
118 struct address_space *mapping = gl->gl_aspace->i_mapping;
119 int error;
120
121 filemap_fdatawrite(mapping);
122 error = filemap_fdatawait(mapping);
123
124 if (error)
125 gfs2_io_error(gl->gl_sbd);
126}
127
128/**
129 * getbuf - Get a buffer with a given address space
130 * @sdp: the filesystem
131 * @aspace: the address space
132 * @blkno: the block number (filesystem scope)
133 * @create: 1 if the buffer should be created
134 *
135 * Returns: the buffer
136 */
137
138static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
139 u64 blkno, int create)
140{
141 struct page *page;
142 struct buffer_head *bh;
143 unsigned int shift;
144 unsigned long index;
145 unsigned int bufnum;
146
147 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
148 index = blkno >> shift; /* convert block to page */
149 bufnum = blkno - (index << shift); /* block buf index within page */
150
151 if (create) {
152 for (;;) {
153 page = grab_cache_page(aspace->i_mapping, index);
154 if (page)
155 break;
156 yield();
157 }
158 } else {
159 page = find_lock_page(aspace->i_mapping, index);
160 if (!page)
161 return NULL;
162 }
163
164 if (!page_has_buffers(page))
165 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
166
167 /* Locate header for our buffer within our page */
168 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
169 /* Do nothing */;
170 get_bh(bh);
171
172 if (!buffer_mapped(bh))
173 map_bh(bh, sdp->sd_vfs, blkno);
174
175 unlock_page(page);
176 mark_page_accessed(page);
177 page_cache_release(page);
178
179 return bh;
180}
181
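getbuf() turns a filesystem block number into a page index plus a buffer slot inside that page. With 4 KiB pages (PAGE_CACHE_SHIFT = 12) and 1 KiB blocks (sb_bsize_shift = 10), shift is 2: block 4097 lands in page 1024 as buffer 1. A standalone check of that arithmetic (the shift values are example figures, not read from a live superblock):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned page_shift = 12;		/* 4 KiB pages */
	const unsigned bsize_shift = 10;	/* 1 KiB blocks */
	uint64_t blkno = 4097;

	unsigned shift = page_shift - bsize_shift;	/* blocks per page = 1 << shift */
	unsigned long index = blkno >> shift;		/* which page */
	unsigned bufnum = blkno - (index << shift);	/* which buffer in it */

	/* expect "block 4097 -> page 1024, buffer 1" */
	printf("block %llu -> page %lu, buffer %u\n",
	       (unsigned long long)blkno, index, bufnum);
	return 0;
}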
182static void meta_prep_new(struct buffer_head *bh)
183{
184 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
185
186 lock_buffer(bh);
187 clear_buffer_dirty(bh);
188 set_buffer_uptodate(bh);
189 unlock_buffer(bh);
190
191 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
192}
193
194/**
195 * gfs2_meta_new - Get a block
196 * @gl: The glock associated with this block
197 * @blkno: The block number
198 *
199 * Returns: The buffer
200 */
201
202struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
203{
204 struct buffer_head *bh;
205 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
206 meta_prep_new(bh);
207 return bh;
208}
209
210/**
211 * gfs2_meta_read - Read a block from disk
212 * @gl: The glock covering the block
213 * @blkno: The block number
214 * @flags: flags
215 * @bhp: the place where the buffer is returned (NULL on failure)
216 *
217 * Returns: errno
218 */
219
220int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
221 struct buffer_head **bhp)
222{
223 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
224 if (!buffer_uptodate(*bhp))
225 ll_rw_block(READ_META, 1, bhp);
226 if (flags & DIO_WAIT) {
227 int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
228 if (error) {
229 brelse(*bhp);
230 return error;
231 }
232 }
233
234 return 0;
235}
236
237/**
238 * gfs2_meta_wait - Wait for a block read to complete
239 * @sdp: the filesystem
240 * @bh: The block to wait for
241 *
242 * Returns: errno
243 */
244
245int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
246{
247 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
248 return -EIO;
249
250 wait_on_buffer(bh);
251
252 if (!buffer_uptodate(bh)) {
253 struct gfs2_trans *tr = current->journal_info;
254 if (tr && tr->tr_touched)
255 gfs2_io_error_bh(sdp, bh);
256 return -EIO;
257 }
258 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
259 return -EIO;
260
261 return 0;
262}
263
264/**
265 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
266 * @gl: the glock the buffer belongs to
267 * @bh: The buffer to be attached to
268 * @meta: Flag to indicate whether the buffer holds metadata
269 */
270
271void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
272 int meta)
273{
274 struct gfs2_bufdata *bd;
275
276 if (meta)
277 lock_page(bh->b_page);
278
279 if (bh->b_private) {
280 if (meta)
281 unlock_page(bh->b_page);
282 return;
283 }
284
285	bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
286	memset(bd, 0, sizeof(struct gfs2_bufdata));
287 bd->bd_bh = bh;
288 bd->bd_gl = gl;
289
290 INIT_LIST_HEAD(&bd->bd_list_tr);
291 if (meta)
292 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
293 else
294 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
295 bh->b_private = bd;
296
297 if (meta)
298 unlock_page(bh->b_page);
299}
300
301/**
302 * gfs2_pin - Pin a buffer in memory
303 * @sdp: the filesystem the buffer belongs to
304 * @bh: The buffer to be pinned
305 *
306 */
307
308void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
309{
310 struct gfs2_bufdata *bd = bh->b_private;
311
312 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
313
314 if (test_set_buffer_pinned(bh))
315 gfs2_assert_withdraw(sdp, 0);
316
317 wait_on_buffer(bh);
318
319	/* If this buffer is in the AIL and it has already been written
320	   back to its in-place disk block, remove it from the AIL. */
321
322 gfs2_log_lock(sdp);
323 if (bd->bd_ail && !buffer_in_io(bh))
324 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
325 gfs2_log_unlock(sdp);
326
327 clear_buffer_dirty(bh);
328 wait_on_buffer(bh);
329
330 if (!buffer_uptodate(bh))
331 gfs2_io_error_bh(sdp, bh);
332
333 get_bh(bh);
334}
335
336/**
337 * gfs2_unpin - Unpin a buffer
338 * @sdp: the filesystem the buffer belongs to
339 * @bh: The buffer to unpin
340 * @ai:
341 *
342 */
343
344void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
345 struct gfs2_ail *ai)
346{
347 struct gfs2_bufdata *bd = bh->b_private;
348
349 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
350
351 if (!buffer_pinned(bh))
352 gfs2_assert_withdraw(sdp, 0);
353
354 mark_buffer_dirty(bh);
355 clear_buffer_pinned(bh);
356
357 gfs2_log_lock(sdp);
358 if (bd->bd_ail) {
359 list_del(&bd->bd_ail_st_list);
360 brelse(bh);
361 } else {
362 struct gfs2_glock *gl = bd->bd_gl;
363 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
364 atomic_inc(&gl->gl_ail_count);
365 }
366 bd->bd_ail = ai;
367 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
368 gfs2_log_unlock(sdp);
369}
370
371/**
372 * gfs2_meta_wipe - make sure an inode's buffers are no longer dirty or pinned
373 * @ip: the inode who owns the buffers
374 * @bstart: the first buffer in the run
375 * @blen: the number of buffers in the run
376 *
377 */
378
379void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
380{
381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
382 struct inode *aspace = ip->i_gl->gl_aspace;
383 struct buffer_head *bh;
384
385 while (blen) {
386 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
387 if (bh) {
388 struct gfs2_bufdata *bd = bh->b_private;
389
390 if (test_clear_buffer_pinned(bh)) {
391 struct gfs2_trans *tr = current->journal_info;
392 gfs2_log_lock(sdp);
393 list_del_init(&bd->bd_le.le_list);
394 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
395 sdp->sd_log_num_buf--;
396 gfs2_log_unlock(sdp);
397 tr->tr_num_buf_rm++;
398 brelse(bh);
399 }
400 if (bd) {
401 gfs2_log_lock(sdp);
402 if (bd->bd_ail) {
403 u64 blkno = bh->b_blocknr;
404 bd->bd_ail = NULL;
405 list_del(&bd->bd_ail_st_list);
406 list_del(&bd->bd_ail_gl_list);
407 atomic_dec(&bd->bd_gl->gl_ail_count);
408 brelse(bh);
409 gfs2_log_unlock(sdp);
410 gfs2_trans_add_revoke(sdp, blkno);
411 } else
412 gfs2_log_unlock(sdp);
413 }
414
415 lock_buffer(bh);
416 clear_buffer_dirty(bh);
417 clear_buffer_uptodate(bh);
418 unlock_buffer(bh);
419
420 brelse(bh);
421 }
422
423 bstart++;
424 blen--;
425 }
426}
427
428/**
429 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
430 * @ip: The GFS2 inode
431 *
432 * This releases buffers that are in the most-recently-used array of
433 * blocks used for indirect block addressing for this inode.
434 */
435
436void gfs2_meta_cache_flush(struct gfs2_inode *ip)
437{
438 struct buffer_head **bh_slot;
439 unsigned int x;
440
441 spin_lock(&ip->i_spin);
442
443 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
444 bh_slot = &ip->i_cache[x];
445 if (!*bh_slot)
446 break;
447 brelse(*bh_slot);
448 *bh_slot = NULL;
449 }
450
451 spin_unlock(&ip->i_spin);
452}
453
454/**
455 * gfs2_meta_indirect_buffer - Get a metadata buffer
456 * @ip: The GFS2 inode
457 * @height: The level of this buf in the metadata (indir addr) tree (if any)
458 * @num: The block number (device relative) of the buffer
459 * @new: Non-zero if we may create a new buffer
460 * @bhp: the buffer is returned here
461 *
462 * Try to use the gfs2_inode's MRU metadata tree cache.
463 *
464 * Returns: errno
465 */
466
467int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
468 int new, struct buffer_head **bhp)
469{
470 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
471 struct gfs2_glock *gl = ip->i_gl;
472 struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
473 int in_cache = 0;
474
475 spin_lock(&ip->i_spin);
476 if (*bh_slot && (*bh_slot)->b_blocknr == num) {
477 bh = *bh_slot;
478 get_bh(bh);
479 in_cache = 1;
480 }
481 spin_unlock(&ip->i_spin);
482
483 if (!bh)
484 bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE);
485
486 if (!bh)
487 return -ENOBUFS;
488
489 if (new) {
490 if (gfs2_assert_warn(sdp, height))
491 goto err;
492 meta_prep_new(bh);
493 gfs2_trans_add_bh(ip->i_gl, bh, 1);
494 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
495 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
496 } else {
497 u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
498 if (!buffer_uptodate(bh)) {
499 ll_rw_block(READ_META, 1, &bh);
500 if (gfs2_meta_wait(sdp, bh))
501 goto err;
502 }
503 if (gfs2_metatype_check(sdp, bh, mtype))
504 goto err;
505 }
506
507 if (!in_cache) {
508 spin_lock(&ip->i_spin);
509 if (*bh_slot)
510 brelse(*bh_slot);
511 *bh_slot = bh;
512 get_bh(bh);
513 spin_unlock(&ip->i_spin);
514 }
515
516 *bhp = bh;
517 return 0;
518err:
519 brelse(bh);
520 return -EIO;
521}
522
523/**
524 * gfs2_meta_ra - start readahead on an extent of a file
525 * @gl: the glock the blocks belong to
526 * @dblock: the starting disk block
527 * @extlen: the number of blocks in the extent
528 *
529 * Returns: the first buffer in the extent
530 */
531
532struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
533{
534 struct gfs2_sbd *sdp = gl->gl_sbd;
535 struct inode *aspace = gl->gl_aspace;
536 struct buffer_head *first_bh, *bh;
537 u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
538 sdp->sd_sb.sb_bsize_shift;
539
540 BUG_ON(!extlen);
541
542 if (max_ra < 1)
543 max_ra = 1;
544 if (extlen > max_ra)
545 extlen = max_ra;
546
547 first_bh = getbuf(sdp, aspace, dblock, CREATE);
548
549 if (buffer_uptodate(first_bh))
550 goto out;
551 if (!buffer_locked(first_bh))
552 ll_rw_block(READ_META, 1, &first_bh);
553
554 dblock++;
555 extlen--;
556
557 while (extlen) {
558 bh = getbuf(sdp, aspace, dblock, CREATE);
559
560 if (!buffer_uptodate(bh) && !buffer_locked(bh))
561 ll_rw_block(READA, 1, &bh);
562 brelse(bh);
563 dblock++;
564 extlen--;
565 if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
566 goto out;
567 }
568
569 wait_on_buffer(first_bh);
570out:
571 return first_bh;
572}
573
574/**
575 * gfs2_meta_syncfs - sync all the buffers in a filesystem
576 * @sdp: the filesystem
577 *
578 */
579
580void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
581{
582 gfs2_log_flush(sdp, NULL);
583 for (;;) {
584 gfs2_ail1_start(sdp, DIO_ALL);
585 if (gfs2_ail1_empty(sdp, DIO_ALL))
586 break;
587 msleep(10);
588 }
589}
590
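Putting the pieces above together, a typical caller reads a single metadata block synchronously by passing DIO_WAIT, which folds the gfs2_meta_wait() step into gfs2_meta_read(). A hedged sketch of that call pattern (acquiring the glock and any enclosing transaction are elided):

	struct buffer_head *bh;
	int error;

	/* blkno must be covered by the glock "gl" the caller holds */
	error = gfs2_meta_read(gl, blkno, DIO_WAIT, &bh);
	if (error)
		return error;	/* gfs2_meta_read() already dropped bh */

	/* ... examine bh->b_data while the glock is held ... */

	brelse(bh);		/* drop the reference getbuf() took */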
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..3ec939e20dff
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,78 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13#include <linux/buffer_head.h>
14#include <linux/string.h>
15#include "incore.h"
16
17static inline void gfs2_buffer_clear(struct buffer_head *bh)
18{
19 memset(bh->b_data, 0, bh->b_size);
20}
21
22static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
23{
24 BUG_ON(head > bh->b_size);
25 memset(bh->b_data + head, 0, bh->b_size - head);
26}
27
28static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
29 int to_head,
30 struct buffer_head *from_bh,
31 int from_head)
32{
33 BUG_ON(from_head < to_head);
34 memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
35 from_bh->b_size - from_head);
36 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
37 0, from_head - to_head);
38}
39
40struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
41void gfs2_aspace_put(struct inode *aspace);
42
43void gfs2_meta_inval(struct gfs2_glock *gl);
44void gfs2_meta_sync(struct gfs2_glock *gl);
45
46struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
47int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
48 int flags, struct buffer_head **bhp);
49int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
50
51void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
52 int meta);
53void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
54void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
55 struct gfs2_ail *ai);
56
57void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
58
59void gfs2_meta_cache_flush(struct gfs2_inode *ip);
60int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
61 int new, struct buffer_head **bhp);
62
63static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
64 struct buffer_head **bhp)
65{
66 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
67}
68
69struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
70void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
71
72#define buffer_busy(bh) \
73((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
74#define buffer_in_io(bh) \
75((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
76
77#endif /* __DIO_DOT_H__ */
78
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..ef3092e29607
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,217 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "mount.h"
21#include "sys.h"
22#include "util.h"
23
24/**
25 * gfs2_mount_args - Parse mount options
26 * @sdp: the filesystem superblock
27 * @data_arg: the mount options string passed in from the VFS
28 * @remount: non-zero if this is a remount
29 * Returns: errno
30 */
31
32int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
33{
34 struct gfs2_args *args = &sdp->sd_args;
35 char *data = data_arg;
36 char *options, *o, *v;
37 int error = 0;
38
39 if (!remount) {
40 /* If someone preloaded options, use those instead */
41 spin_lock(&gfs2_sys_margs_lock);
42 if (gfs2_sys_margs) {
43 data = gfs2_sys_margs;
44 gfs2_sys_margs = NULL;
45 }
46 spin_unlock(&gfs2_sys_margs_lock);
47
48 /* Set some defaults */
49 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
50 args->ar_quota = GFS2_QUOTA_DEFAULT;
51 args->ar_data = GFS2_DATA_DEFAULT;
52 }
53
54 /* Split the options into tokens with the "," character and
55 process them */
56
57 for (options = data; (o = strsep(&options, ",")); ) {
58 if (!*o)
59 continue;
60
61 v = strchr(o, '=');
62 if (v)
63 *v++ = 0;
64
65 if (!strcmp(o, "lockproto")) {
66 if (!v)
67 goto need_value;
68 if (remount && strcmp(v, args->ar_lockproto))
69 goto cant_remount;
70 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
71 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
72 }
73
74 else if (!strcmp(o, "locktable")) {
75 if (!v)
76 goto need_value;
77 if (remount && strcmp(v, args->ar_locktable))
78 goto cant_remount;
79 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
80 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
81 }
82
83 else if (!strcmp(o, "hostdata")) {
84 if (!v)
85 goto need_value;
86 if (remount && strcmp(v, args->ar_hostdata))
87 goto cant_remount;
88 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
89 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
90 }
91
92 else if (!strcmp(o, "spectator")) {
93 if (remount && !args->ar_spectator)
94 goto cant_remount;
95 args->ar_spectator = 1;
96 sdp->sd_vfs->s_flags |= MS_RDONLY;
97 }
98
99 else if (!strcmp(o, "ignore_local_fs")) {
100 if (remount && !args->ar_ignore_local_fs)
101 goto cant_remount;
102 args->ar_ignore_local_fs = 1;
103 }
104
105 else if (!strcmp(o, "localflocks")) {
106 if (remount && !args->ar_localflocks)
107 goto cant_remount;
108 args->ar_localflocks = 1;
109 }
110
111 else if (!strcmp(o, "localcaching")) {
112 if (remount && !args->ar_localcaching)
113 goto cant_remount;
114 args->ar_localcaching = 1;
115 }
116
117 else if (!strcmp(o, "debug"))
118 args->ar_debug = 1;
119
120 else if (!strcmp(o, "nodebug"))
121 args->ar_debug = 0;
122
123 else if (!strcmp(o, "upgrade")) {
124 if (remount && !args->ar_upgrade)
125 goto cant_remount;
126 args->ar_upgrade = 1;
127 }
128
129 else if (!strcmp(o, "num_glockd")) {
130 unsigned int x;
131 if (!v)
132 goto need_value;
133 sscanf(v, "%u", &x);
134 if (remount && x != args->ar_num_glockd)
135 goto cant_remount;
136 if (!x || x > GFS2_GLOCKD_MAX) {
137 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
138 GFS2_GLOCKD_MAX, x);
139 error = -EINVAL;
140 break;
141 }
142 args->ar_num_glockd = x;
143 }
144
145 else if (!strcmp(o, "acl")) {
146 args->ar_posix_acl = 1;
147 sdp->sd_vfs->s_flags |= MS_POSIXACL;
148 }
149
150 else if (!strcmp(o, "noacl")) {
151 args->ar_posix_acl = 0;
152 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
153 }
154
155 else if (!strcmp(o, "quota")) {
156 if (!v)
157 goto need_value;
158 if (!strcmp(v, "off"))
159 args->ar_quota = GFS2_QUOTA_OFF;
160 else if (!strcmp(v, "account"))
161 args->ar_quota = GFS2_QUOTA_ACCOUNT;
162 else if (!strcmp(v, "on"))
163 args->ar_quota = GFS2_QUOTA_ON;
164 else {
165 fs_info(sdp, "invalid value for quota\n");
166 error = -EINVAL;
167 break;
168 }
169 }
170
171 else if (!strcmp(o, "suiddir"))
172 args->ar_suiddir = 1;
173
174 else if (!strcmp(o, "nosuiddir"))
175 args->ar_suiddir = 0;
176
177 else if (!strcmp(o, "data")) {
178 if (!v)
179 goto need_value;
180 if (!strcmp(v, "writeback"))
181 args->ar_data = GFS2_DATA_WRITEBACK;
182 else if (!strcmp(v, "ordered"))
183 args->ar_data = GFS2_DATA_ORDERED;
184 else {
185 fs_info(sdp, "invalid value for data\n");
186 error = -EINVAL;
187 break;
188 }
189 }
190
191 else {
192 fs_info(sdp, "unknown option: %s\n", o);
193 error = -EINVAL;
194 break;
195 }
196 }
197
198	if (error)
199		fs_info(sdp, "invalid mount option(s)\n");
200
201out:
202	if (data != data_arg)
203		kfree(data);
204
205	return error;
206
207need_value:
208	fs_info(sdp, "need value for option %s\n", o);
209	error = -EINVAL;
210	goto out;
211
212cant_remount:
213	fs_info(sdp, "can't remount with option %s\n", o);
214	error = -EINVAL;
215	goto out;
216}
217
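The parser above rests on strsep() plus an in-place strchr() split, turning a string such as "a,b=c" into option/value pairs with no extra allocation. The same tokenising as a standalone program (the option string is a made-up example):

#define _DEFAULT_SOURCE		/* for strsep() in glibc */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[] = "lockproto=lock_dlm,debug,quota=on";
	char *options, *o, *v;

	for (options = data; (o = strsep(&options, ",")); ) {
		if (!*o)
			continue;	/* skip empty tokens, as the kernel code does */
		v = strchr(o, '=');
		if (v)
			*v++ = '\0';	/* split "opt=val" in place */
		printf("option \"%s\" value \"%s\"\n", o, v ? v : "(none)");
	}
	return 0;
}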
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..401288acfdf3
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13struct gfs2_sbd;
14
15int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
16
17#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..1025960b0e6e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15
16#include "gfs2.h"
17#include <linux/gfs2_ondisk.h>
18
19#define pv(struct, member, fmt) printk(KERN_INFO "  "#member" = "fmt"\n", \
20				       struct->member)
21
22/*
23 * gfs2_xxx_in - read in an xxx struct
24 * first arg: the cpu-order structure
25 * buf: the disk-order buffer
26 *
27 * gfs2_xxx_out - write out an xxx struct
28 * first arg: the cpu-order structure
29 * buf: the disk-order buffer
30 *
31 * gfs2_xxx_print - print out an xxx struct
32 * first arg: the cpu-order structure
33 */
34
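As an illustration of the convention just described, here is a userspace round trip over a hypothetical two-field structure using the glibc byte-order helpers; the real converters below do the same per-field dance with the kernel's be*_to_cpu()/cpu_to_be*() over the gfs2 on-disk structs:

#define _DEFAULT_SOURCE
#include <endian.h>	/* htobe64(), be64toh() */
#include <stdint.h>
#include <stdio.h>

struct demo_inum {		/* stand-in, not the real gfs2_inum */
	uint64_t no_formal_ino;
	uint64_t no_addr;
};

static void demo_inum_in(struct demo_inum *no, const void *buf)
{
	const struct demo_inum *str = buf;	/* disk-order view, as in the kernel code */

	no->no_formal_ino = be64toh(str->no_formal_ino);
	no->no_addr = be64toh(str->no_addr);
}

static void demo_inum_out(const struct demo_inum *no, void *buf)
{
	struct demo_inum *str = buf;

	str->no_formal_ino = htobe64(no->no_formal_ino);
	str->no_addr = htobe64(no->no_addr);
}

int main(void)
{
	struct demo_inum cpu = { 7, 4096 }, back;
	struct demo_inum disk;		/* big-endian image of the struct */

	demo_inum_out(&cpu, &disk);	/* CPU order -> disk order */
	demo_inum_in(&back, &disk);	/* and back again */
	printf("round trip: %llu %llu\n",
	       (unsigned long long)back.no_formal_ino,
	       (unsigned long long)back.no_addr);
	return 0;
}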
35void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
36{
37 const struct gfs2_inum *str = buf;
38
39 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
40 no->no_addr = be64_to_cpu(str->no_addr);
41}
42
43void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
44{
45 struct gfs2_inum *str = buf;
46
47 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
48 str->no_addr = cpu_to_be64(no->no_addr);
49}
50
51static void gfs2_inum_print(const struct gfs2_inum *no)
52{
53 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
54 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
55}
56
57static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
58{
59 const struct gfs2_meta_header *str = buf;
60
61 mh->mh_magic = be32_to_cpu(str->mh_magic);
62 mh->mh_type = be32_to_cpu(str->mh_type);
63 mh->mh_format = be32_to_cpu(str->mh_format);
64}
65
66static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
67{
68 struct gfs2_meta_header *str = buf;
69
70 str->mh_magic = cpu_to_be32(mh->mh_magic);
71 str->mh_type = cpu_to_be32(mh->mh_type);
72 str->mh_format = cpu_to_be32(mh->mh_format);
73}
74
75static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
76{
77 pv(mh, mh_magic, "0x%.8X");
78 pv(mh, mh_type, "%u");
79 pv(mh, mh_format, "%u");
80}
81
82void gfs2_sb_in(struct gfs2_sb *sb, const void *buf)
83{
84 const struct gfs2_sb *str = buf;
85
86 gfs2_meta_header_in(&sb->sb_header, buf);
87
88 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
89 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
90 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
91 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
92
93 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
94 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
95
96 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
97 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
98}
99
100void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
101{
102 const struct gfs2_rindex *str = buf;
103
104 ri->ri_addr = be64_to_cpu(str->ri_addr);
105 ri->ri_length = be32_to_cpu(str->ri_length);
106 ri->ri_data0 = be64_to_cpu(str->ri_data0);
107 ri->ri_data = be32_to_cpu(str->ri_data);
108 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
109
110}
111
112void gfs2_rindex_print(const struct gfs2_rindex *ri)
113{
114 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
115 pv(ri, ri_length, "%u");
116
117 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
118 pv(ri, ri_data, "%u");
119
120 pv(ri, ri_bitbytes, "%u");
121}
122
123void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
124{
125 const struct gfs2_rgrp *str = buf;
126
127 gfs2_meta_header_in(&rg->rg_header, buf);
128 rg->rg_flags = be32_to_cpu(str->rg_flags);
129 rg->rg_free = be32_to_cpu(str->rg_free);
130 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
131 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
132}
133
134void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf)
135{
136 struct gfs2_rgrp *str = buf;
137
138 gfs2_meta_header_out(&rg->rg_header, buf);
139 str->rg_flags = cpu_to_be32(rg->rg_flags);
140 str->rg_free = cpu_to_be32(rg->rg_free);
141 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
142 str->__pad = cpu_to_be32(0);
143 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
144 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
145}
146
147void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
148{
149 const struct gfs2_quota *str = buf;
150
151 qu->qu_limit = be64_to_cpu(str->qu_limit);
152 qu->qu_warn = be64_to_cpu(str->qu_warn);
153 qu->qu_value = be64_to_cpu(str->qu_value);
154}
155
156void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
157{
158 const struct gfs2_dinode *str = buf;
159
160 gfs2_meta_header_in(&di->di_header, buf);
161 gfs2_inum_in(&di->di_num, &str->di_num);
162
163 di->di_mode = be32_to_cpu(str->di_mode);
164 di->di_uid = be32_to_cpu(str->di_uid);
165 di->di_gid = be32_to_cpu(str->di_gid);
166 di->di_nlink = be32_to_cpu(str->di_nlink);
167 di->di_size = be64_to_cpu(str->di_size);
168 di->di_blocks = be64_to_cpu(str->di_blocks);
169 di->di_atime = be64_to_cpu(str->di_atime);
170 di->di_mtime = be64_to_cpu(str->di_mtime);
171 di->di_ctime = be64_to_cpu(str->di_ctime);
172 di->di_major = be32_to_cpu(str->di_major);
173 di->di_minor = be32_to_cpu(str->di_minor);
174
175 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
176 di->di_goal_data = be64_to_cpu(str->di_goal_data);
177 di->di_generation = be64_to_cpu(str->di_generation);
178
179 di->di_flags = be32_to_cpu(str->di_flags);
180 di->di_payload_format = be32_to_cpu(str->di_payload_format);
181 di->di_height = be16_to_cpu(str->di_height);
182
183 di->di_depth = be16_to_cpu(str->di_depth);
184 di->di_entries = be32_to_cpu(str->di_entries);
185
186 di->di_eattr = be64_to_cpu(str->di_eattr);
187
188}
189
190void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
191{
192 struct gfs2_dinode *str = buf;
193
194 gfs2_meta_header_out(&di->di_header, buf);
195 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
196
197 str->di_mode = cpu_to_be32(di->di_mode);
198 str->di_uid = cpu_to_be32(di->di_uid);
199 str->di_gid = cpu_to_be32(di->di_gid);
200 str->di_nlink = cpu_to_be32(di->di_nlink);
201 str->di_size = cpu_to_be64(di->di_size);
202 str->di_blocks = cpu_to_be64(di->di_blocks);
203 str->di_atime = cpu_to_be64(di->di_atime);
204 str->di_mtime = cpu_to_be64(di->di_mtime);
205 str->di_ctime = cpu_to_be64(di->di_ctime);
206 str->di_major = cpu_to_be32(di->di_major);
207 str->di_minor = cpu_to_be32(di->di_minor);
208
209 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
210 str->di_goal_data = cpu_to_be64(di->di_goal_data);
211 str->di_generation = cpu_to_be64(di->di_generation);
212
213 str->di_flags = cpu_to_be32(di->di_flags);
214 str->di_payload_format = cpu_to_be32(di->di_payload_format);
215 str->di_height = cpu_to_be16(di->di_height);
216
217 str->di_depth = cpu_to_be16(di->di_depth);
218 str->di_entries = cpu_to_be32(di->di_entries);
219
220 str->di_eattr = cpu_to_be64(di->di_eattr);
221
222}
223
224void gfs2_dinode_print(const struct gfs2_dinode *di)
225{
226 gfs2_meta_header_print(&di->di_header);
227 gfs2_inum_print(&di->di_num);
228
229 pv(di, di_mode, "0%o");
230 pv(di, di_uid, "%u");
231 pv(di, di_gid, "%u");
232 pv(di, di_nlink, "%u");
233 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
234 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
235 printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
236 printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
237 printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
238 pv(di, di_major, "%u");
239 pv(di, di_minor, "%u");
240
241 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
242 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
243
244 pv(di, di_flags, "0x%.8X");
245 pv(di, di_payload_format, "%u");
246 pv(di, di_height, "%u");
247
248 pv(di, di_depth, "%u");
249 pv(di, di_entries, "%u");
250
251 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
252}
253
254void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
255{
256 const struct gfs2_log_header *str = buf;
257
258 gfs2_meta_header_in(&lh->lh_header, buf);
259 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
260 lh->lh_flags = be32_to_cpu(str->lh_flags);
261 lh->lh_tail = be32_to_cpu(str->lh_tail);
262 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
263 lh->lh_hash = be32_to_cpu(str->lh_hash);
264}
265
266void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
267{
268 const struct gfs2_inum_range *str = buf;
269
270 ir->ir_start = be64_to_cpu(str->ir_start);
271 ir->ir_length = be64_to_cpu(str->ir_length);
272}
273
274void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf)
275{
276 struct gfs2_inum_range *str = buf;
277
278 str->ir_start = cpu_to_be64(ir->ir_start);
279 str->ir_length = cpu_to_be64(ir->ir_length);
280}
281
282void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
283{
284 const struct gfs2_statfs_change *str = buf;
285
286 sc->sc_total = be64_to_cpu(str->sc_total);
287 sc->sc_free = be64_to_cpu(str->sc_free);
288 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
289}
290
291void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf)
292{
293 struct gfs2_statfs_change *str = buf;
294
295 str->sc_total = cpu_to_be64(sc->sc_total);
296 str->sc_free = cpu_to_be64(sc->sc_free);
297 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
298}
299
300void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf)
301{
302 const struct gfs2_quota_change *str = buf;
303
304 qc->qc_change = be64_to_cpu(str->qc_change);
305 qc->qc_flags = be32_to_cpu(str->qc_flags);
306 qc->qc_id = be32_to_cpu(str->qc_id);
307}
308
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..4fb743f4e4a4
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,790 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/pagevec.h>
17#include <linux/mpage.h>
18#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "quota.h"
31#include "trans.h"
32#include "rgrp.h"
33#include "ops_file.h"
34#include "util.h"
35#include "glops.h"
36
37
38static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
39 unsigned int from, unsigned int to)
40{
41 struct buffer_head *head = page_buffers(page);
42 unsigned int bsize = head->b_size;
43 struct buffer_head *bh;
44 unsigned int start, end;
45
46 for (bh = head, start = 0; bh != head || !start;
47 bh = bh->b_this_page, start = end) {
48 end = start + bsize;
49 if (end <= from || start >= to)
50 continue;
51 gfs2_trans_add_bh(ip->i_gl, bh, 0);
52 }
53}
54
55/**
56 * gfs2_get_block - Fills in a buffer head with details about a block
57 * @inode: The inode
58 * @lblock: The block number to look up
59 * @bh_result: The buffer head to return the result in
60 * @create: Non-zero if we may add block to the file
61 *
62 * Returns: errno
63 */
64
65int gfs2_get_block(struct inode *inode, sector_t lblock,
66 struct buffer_head *bh_result, int create)
67{
68 return gfs2_block_map(inode, lblock, create, bh_result, 32);
69}
70
71/**
72 * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
73 * @inode: The inode
74 * @lblock: The block number to look up
75 * @bh_result: The buffer head to return the result in
76 * @create: Non-zero if we may add block to the file
77 *
78 * Returns: errno
79 */
80
81static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
82 struct buffer_head *bh_result, int create)
83{
84 int error;
85
86 error = gfs2_block_map(inode, lblock, 0, bh_result, 1);
87 if (error)
88 return error;
89 if (bh_result->b_blocknr == 0)
90 return -EIO;
91 return 0;
92}
93
94static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
95 struct buffer_head *bh_result, int create)
96{
97 return gfs2_block_map(inode, lblock, 0, bh_result, 32);
98}
99
100/**
101 * gfs2_writepage - Write complete page
102 * @page: Page to write
103 *
104 * Returns: errno
105 *
106 * Some of this is copied from block_write_full_page() although we still
107 * call it to do most of the work.
108 */
109
110static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
111{
112 struct inode *inode = page->mapping->host;
113 struct gfs2_inode *ip = GFS2_I(inode);
114 struct gfs2_sbd *sdp = GFS2_SB(inode);
115 loff_t i_size = i_size_read(inode);
116 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
117 unsigned offset;
118 int error;
119 int done_trans = 0;
120
121 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
122 unlock_page(page);
123 return -EIO;
124 }
125 if (current->journal_info)
126 goto out_ignore;
127
128 /* Is the page fully outside i_size? (truncate in progress) */
129 offset = i_size & (PAGE_CACHE_SIZE-1);
130 if (page->index > end_index || (page->index == end_index && !offset)) {
131 page->mapping->a_ops->invalidatepage(page, 0);
132 unlock_page(page);
133 return 0; /* don't care */
134 }
135
136 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
137 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
138 if (error)
139 goto out_ignore;
140 if (!page_has_buffers(page)) {
141 create_empty_buffers(page, inode->i_sb->s_blocksize,
142 (1 << BH_Dirty)|(1 << BH_Uptodate));
143 }
144 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
145 done_trans = 1;
146 }
147 error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
148 if (done_trans)
149 gfs2_trans_end(sdp);
150 gfs2_meta_cache_flush(ip);
151 return error;
152
153out_ignore:
154 redirty_page_for_writepage(wbc, page);
155 unlock_page(page);
156 return 0;
157}
158
159static int zero_readpage(struct page *page)
160{
161 void *kaddr;
162
163 kaddr = kmap_atomic(page, KM_USER0);
164 memset(kaddr, 0, PAGE_CACHE_SIZE);
165 kunmap_atomic(page, KM_USER0);
166
167 SetPageUptodate(page);
168
169 return 0;
170}
171
172/**
173 * stuffed_readpage - Fill in a Linux page with stuffed file data
174 * @ip: the inode
175 * @page: the page
176 *
177 * Returns: errno
178 */
179
180static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
181{
182 struct buffer_head *dibh;
183 void *kaddr;
184 int error;
185
186 /* Only the first page of a stuffed file might contain data */
187 if (unlikely(page->index))
188 return zero_readpage(page);
189
190 error = gfs2_meta_inode_buffer(ip, &dibh);
191 if (error)
192 return error;
193
194 kaddr = kmap_atomic(page, KM_USER0);
195 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
196 ip->i_di.di_size);
197 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
198 kunmap_atomic(page, KM_USER0);
199
200 brelse(dibh);
201
202 SetPageUptodate(page);
203
204 return 0;
205}
206
207
208/**
209 * gfs2_readpage - readpage with locking
210 * @file: The file to read a page for. N.B. This may be NULL if we are
211 * reading an internal file.
212 * @page: The page to read
213 *
214 * Returns: errno
215 */
216
217static int gfs2_readpage(struct file *file, struct page *page)
218{
219 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
220 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
221 struct gfs2_file *gf = NULL;
222 struct gfs2_holder gh;
223 int error;
224 int do_unlock = 0;
225
226 if (likely(file != &gfs2_internal_file_sentinel)) {
227 if (file) {
228 gf = file->private_data;
229 if (test_bit(GFF_EXLOCK, &gf->f_flags))
230 /* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
231 goto skip_lock;
232 }
233 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
234 do_unlock = 1;
235 error = gfs2_glock_nq_m_atime(1, &gh);
236 if (unlikely(error))
237 goto out_unlock;
238 }
239
240skip_lock:
241 if (gfs2_is_stuffed(ip)) {
242 error = stuffed_readpage(ip, page);
243 unlock_page(page);
244 } else
245 error = mpage_readpage(page, gfs2_get_block);
246
247 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
248 error = -EIO;
249
250 if (do_unlock) {
251 gfs2_glock_dq_m(1, &gh);
252 gfs2_holder_uninit(&gh);
253 }
254out:
255 return error;
256out_unlock:
257 unlock_page(page);
258 if (do_unlock)
259 gfs2_holder_uninit(&gh);
260 goto out;
261}
262
263/**
264 * gfs2_readpages - Read a bunch of pages at once
265 *
266 * Some notes:
267 * 1. This is only for readahead, so we can simply ignore anything
268 *    which is slightly inconvenient (such as locking conflicts between
269 *    the page lock and the glock) and return having done no I/O. It's
270 *    obviously not something we'd want to do on too regular a basis.
271 * Any I/O we ignore at this time will be done via readpage later.
272 * 2. We have to handle stuffed files here too.
273 * 3. mpage_readpages() does most of the heavy lifting in the common case.
274 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
275 * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
276 * well as read-ahead.
277 */
278static int gfs2_readpages(struct file *file, struct address_space *mapping,
279 struct list_head *pages, unsigned nr_pages)
280{
281 struct inode *inode = mapping->host;
282 struct gfs2_inode *ip = GFS2_I(inode);
283 struct gfs2_sbd *sdp = GFS2_SB(inode);
284 struct gfs2_holder gh;
285 unsigned page_idx;
286 int ret;
287 int do_unlock = 0;
288
289 if (likely(file != &gfs2_internal_file_sentinel)) {
290 if (file) {
291 struct gfs2_file *gf = file->private_data;
292 if (test_bit(GFF_EXLOCK, &gf->f_flags))
293 goto skip_lock;
294 }
295 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
296 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
297 do_unlock = 1;
298 ret = gfs2_glock_nq_m_atime(1, &gh);
299 if (ret == GLR_TRYFAILED)
300 goto out_noerror;
301 if (unlikely(ret))
302 goto out_unlock;
303 }
304skip_lock:
305 if (gfs2_is_stuffed(ip)) {
306 struct pagevec lru_pvec;
307 pagevec_init(&lru_pvec, 0);
308 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
309 struct page *page = list_entry(pages->prev, struct page, lru);
310 prefetchw(&page->flags);
311 list_del(&page->lru);
312 if (!add_to_page_cache(page, mapping,
313 page->index, GFP_KERNEL)) {
314 ret = stuffed_readpage(ip, page);
315 unlock_page(page);
316 if (!pagevec_add(&lru_pvec, page))
317 __pagevec_lru_add(&lru_pvec);
318 } else {
319 page_cache_release(page);
320 }
321 }
322 pagevec_lru_add(&lru_pvec);
323 ret = 0;
324 } else {
325 /* What we really want to do .... */
326 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
327 }
328
329 if (do_unlock) {
330 gfs2_glock_dq_m(1, &gh);
331 gfs2_holder_uninit(&gh);
332 }
333out:
334 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
335 ret = -EIO;
336 return ret;
337out_noerror:
338 ret = 0;
339out_unlock:
340 /* unlock all pages, we can't do any I/O right now */
341 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
342 struct page *page = list_entry(pages->prev, struct page, lru);
343 list_del(&page->lru);
344 unlock_page(page);
345 page_cache_release(page);
346 }
347 if (do_unlock)
348 gfs2_holder_uninit(&gh);
349 goto out;
350}
351
352/**
353 * gfs2_prepare_write - Prepare to write a page to a file
354 * @file: The file to write to
355 * @page: The page which is to be prepared for writing
356 * @from: From (byte range within page)
357 * @to: To (byte range within page)
358 *
359 * Returns: errno
360 */
361
362static int gfs2_prepare_write(struct file *file, struct page *page,
363 unsigned from, unsigned to)
364{
365 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
366 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
367 unsigned int data_blocks, ind_blocks, rblocks;
368 int alloc_required;
369 int error = 0;
370 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
371 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
372 struct gfs2_alloc *al;
373
374 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
375 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
376 if (error)
377 goto out_uninit;
378
379 gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
380
381	error = gfs2_write_alloc_required(ip, pos, to - from, &alloc_required);
382 if (error)
383 goto out_unlock;
384
385
386 if (alloc_required) {
387 al = gfs2_alloc_get(ip);
388
389 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
390 if (error)
391 goto out_alloc_put;
392
393 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
394 if (error)
395 goto out_qunlock;
396
397 al->al_requested = data_blocks + ind_blocks;
398 error = gfs2_inplace_reserve(ip);
399 if (error)
400 goto out_qunlock;
401 }
402
403 rblocks = RES_DINODE + ind_blocks;
404 if (gfs2_is_jdata(ip))
405 rblocks += data_blocks ? data_blocks : 1;
406 if (ind_blocks || data_blocks)
407 rblocks += RES_STATFS + RES_QUOTA;
408
409 error = gfs2_trans_begin(sdp, rblocks, 0);
410 if (error)
411 goto out;
412
413 if (gfs2_is_stuffed(ip)) {
414 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
415 error = gfs2_unstuff_dinode(ip, page);
416 if (error == 0)
417 goto prepare_write;
418 } else if (!PageUptodate(page))
419 error = stuffed_readpage(ip, page);
420 goto out;
421 }
422
423prepare_write:
424 error = block_prepare_write(page, from, to, gfs2_get_block);
425
426out:
427 if (error) {
428 gfs2_trans_end(sdp);
429 if (alloc_required) {
430 gfs2_inplace_release(ip);
431out_qunlock:
432 gfs2_quota_unlock(ip);
433out_alloc_put:
434 gfs2_alloc_put(ip);
435 }
436out_unlock:
437 gfs2_glock_dq_m(1, &ip->i_gh);
438out_uninit:
439 gfs2_holder_uninit(&ip->i_gh);
440 }
441
442 return error;
443}
444
445/**
446 * gfs2_commit_write - Commit write to a file
447 * @file: The file to write to
448 * @page: The page containing the data
449 * @from: From (byte range within page)
450 * @to: To (byte range within page)
451 *
452 * Returns: errno
453 */
454
455static int gfs2_commit_write(struct file *file, struct page *page,
456 unsigned from, unsigned to)
457{
458 struct inode *inode = page->mapping->host;
459 struct gfs2_inode *ip = GFS2_I(inode);
460 struct gfs2_sbd *sdp = GFS2_SB(inode);
461 int error = -EOPNOTSUPP;
462 struct buffer_head *dibh;
463 struct gfs2_alloc *al = &ip->i_alloc;
464 struct gfs2_dinode *di;
465
466 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
467 goto fail_nounlock;
468
469 error = gfs2_meta_inode_buffer(ip, &dibh);
470 if (error)
471 goto fail_endtrans;
472
473 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
474 di = (struct gfs2_dinode *)dibh->b_data;
475
476 if (gfs2_is_stuffed(ip)) {
477 u64 file_size;
478 void *kaddr;
479
480 file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to;
481
482 kaddr = kmap_atomic(page, KM_USER0);
483 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
484 kaddr + from, to - from);
485 kunmap_atomic(page, KM_USER0);
486
487 SetPageUptodate(page);
488
489 if (inode->i_size < file_size)
490 i_size_write(inode, file_size);
491 } else {
492 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
493 gfs2_is_jdata(ip))
494 gfs2_page_add_databufs(ip, page, from, to);
495 error = generic_commit_write(file, page, from, to);
496 if (error)
497 goto fail;
498 }
499
500 if (ip->i_di.di_size < inode->i_size) {
501 ip->i_di.di_size = inode->i_size;
502 di->di_size = cpu_to_be64(inode->i_size);
503 }
504
505 di->di_mode = cpu_to_be32(inode->i_mode);
506 di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
507 di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
508 di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
509
510 brelse(dibh);
511 gfs2_trans_end(sdp);
512 if (al->al_requested) {
513 gfs2_inplace_release(ip);
514 gfs2_quota_unlock(ip);
515 gfs2_alloc_put(ip);
516 }
517 gfs2_glock_dq_m(1, &ip->i_gh);
518 gfs2_holder_uninit(&ip->i_gh);
519 return 0;
520
521fail:
522 brelse(dibh);
523fail_endtrans:
524 gfs2_trans_end(sdp);
525 if (al->al_requested) {
526 gfs2_inplace_release(ip);
527 gfs2_quota_unlock(ip);
528 gfs2_alloc_put(ip);
529 }
530 gfs2_glock_dq_m(1, &ip->i_gh);
531 gfs2_holder_uninit(&ip->i_gh);
532fail_nounlock:
533 ClearPageUptodate(page);
534 return error;
535}
536
537/**
538 * gfs2_bmap - Block map function
539 * @mapping: Address space info
540 * @lblock: The block to map
541 *
542 * Returns: The disk address for the block or 0 on hole or error
543 */
544
545static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
546{
547 struct gfs2_inode *ip = GFS2_I(mapping->host);
548 struct gfs2_holder i_gh;
549 sector_t dblock = 0;
550 int error;
551
552 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
553 if (error)
554 return 0;
555
556 if (!gfs2_is_stuffed(ip))
557 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
558
559 gfs2_glock_dq_uninit(&i_gh);
560
561 return dblock;
562}
563
564static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
565{
566 struct gfs2_bufdata *bd;
567
568 gfs2_log_lock(sdp);
569 bd = bh->b_private;
570 if (bd) {
571 bd->bd_bh = NULL;
572 bh->b_private = NULL;
573 }
574 gfs2_log_unlock(sdp);
575
576 lock_buffer(bh);
577 clear_buffer_dirty(bh);
578 bh->b_bdev = NULL;
579 clear_buffer_mapped(bh);
580 clear_buffer_req(bh);
581 clear_buffer_new(bh);
582 clear_buffer_delay(bh);
583 unlock_buffer(bh);
584}
585
586static void gfs2_invalidatepage(struct page *page, unsigned long offset)
587{
588 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
589 struct buffer_head *head, *bh, *next;
590 unsigned int curr_off = 0;
591
592 BUG_ON(!PageLocked(page));
593 if (!page_has_buffers(page))
594 return;
595
596 bh = head = page_buffers(page);
597 do {
598 unsigned int next_off = curr_off + bh->b_size;
599 next = bh->b_this_page;
600
601 if (offset <= curr_off)
602 discard_buffer(sdp, bh);
603
604 curr_off = next_off;
605 bh = next;
606 } while (bh != head);
607
608 if (!offset)
609 try_to_release_page(page, 0);
610
611 return;
612}
613
614static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
615 const struct iovec *iov, loff_t offset,
616 unsigned long nr_segs)
617{
618 struct file *file = iocb->ki_filp;
619 struct inode *inode = file->f_mapping->host;
620 struct gfs2_inode *ip = GFS2_I(inode);
621 struct gfs2_holder gh;
622 int rv;
623
624 if (rw == READ)
625 mutex_lock(&inode->i_mutex);
626 /*
627	 * Shared lock, even if it's a write, since we do no allocation
628	 * on this path. All we need to change is atime.
629 */
630 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
631 rv = gfs2_glock_nq_m_atime(1, &gh);
632 if (rv)
633 goto out;
634
635 if (offset > i_size_read(inode))
636 goto out;
637
638 /*
639 * Should we return an error here? I can't see that O_DIRECT for
640 * a journaled file makes any sense. For now we'll silently fall
641 * back to buffered I/O, likewise we do the same for stuffed
642 * files since they are (a) small and (b) unaligned.
643 */
644 if (gfs2_is_jdata(ip))
645 goto out;
646
647 if (gfs2_is_stuffed(ip))
648 goto out;
649
650 rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
651 inode->i_sb->s_bdev,
652 iov, offset, nr_segs,
653 gfs2_get_block_direct, NULL);
654out:
655 gfs2_glock_dq_m(1, &gh);
656 gfs2_holder_uninit(&gh);
657 if (rw == READ)
658 mutex_unlock(&inode->i_mutex);
659
660 return rv;
661}
662
663/**
664 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
665 * @bh: the buffer we're stuck on
666 *
667 */
668
669static void stuck_releasepage(struct buffer_head *bh)
670{
671 struct inode *inode = bh->b_page->mapping->host;
672 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
673 struct gfs2_bufdata *bd = bh->b_private;
674 struct gfs2_glock *gl;
675 static unsigned limit = 0;
676
677 if (limit > 3)
678 return;
679 limit++;
680
681 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
682 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
683 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
684 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
685 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
686
687 if (!bd)
688 return;
689
690 gl = bd->bd_gl;
691
692 fs_warn(sdp, "gl = (%u, %llu)\n",
693 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
694
695 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
696 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
697 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
698
699 if (gl->gl_ops == &gfs2_inode_glops) {
700 struct gfs2_inode *ip = gl->gl_object;
701 unsigned int x;
702
703 if (!ip)
704 return;
705
706 fs_warn(sdp, "ip = %llu %llu\n",
707 (unsigned long long)ip->i_num.no_formal_ino,
708 (unsigned long long)ip->i_num.no_addr);
709
710 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
711 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
712 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
713 }
714}
715
716/**
717 * gfs2_releasepage - free the metadata associated with a page
718 * @page: the page that's being released
719 * @gfp_mask: passed from Linux VFS, ignored by us
720 *
721 * Call try_to_free_buffers() if the buffers in this page can be
722 * released.
723 *
724 * Returns: 0 if any buffer is still busy, else the try_to_free_buffers() result
725 */
726
727int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
728{
729 struct inode *aspace = page->mapping->host;
730 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
731 struct buffer_head *bh, *head;
732 struct gfs2_bufdata *bd;
733 unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
734
735 if (!page_has_buffers(page))
736 goto out;
737
738 head = bh = page_buffers(page);
739 do {
740 while (atomic_read(&bh->b_count)) {
741 if (!atomic_read(&aspace->i_writecount))
742 return 0;
743
744 if (time_after_eq(jiffies, t)) {
745 stuck_releasepage(bh);
746 /* should we withdraw here? */
747 return 0;
748 }
749
750 yield();
751 }
752
753 gfs2_assert_warn(sdp, !buffer_pinned(bh));
754 gfs2_assert_warn(sdp, !buffer_dirty(bh));
755
756 gfs2_log_lock(sdp);
757 bd = bh->b_private;
758 if (bd) {
759 gfs2_assert_warn(sdp, bd->bd_bh == bh);
760 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
761 gfs2_assert_warn(sdp, !bd->bd_ail);
762 bd->bd_bh = NULL;
763 if (!list_empty(&bd->bd_le.le_list))
764 bd = NULL;
765 bh->b_private = NULL;
766 }
767 gfs2_log_unlock(sdp);
768 if (bd)
769 kmem_cache_free(gfs2_bufdata_cachep, bd);
770
771 bh = bh->b_this_page;
772 } while (bh != head);
773
774out:
775 return try_to_free_buffers(page);
776}
777
778const struct address_space_operations gfs2_file_aops = {
779 .writepage = gfs2_writepage,
780 .readpage = gfs2_readpage,
781 .readpages = gfs2_readpages,
782 .sync_page = block_sync_page,
783 .prepare_write = gfs2_prepare_write,
784 .commit_write = gfs2_commit_write,
785 .bmap = gfs2_bmap,
786 .invalidatepage = gfs2_invalidatepage,
787 .releasepage = gfs2_releasepage,
788 .direct_IO = gfs2_direct_IO,
789};
790
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..35aaee4aa7e1
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,22 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13#include <linux/fs.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16
17extern const struct address_space_operations gfs2_file_aops;
18extern int gfs2_get_block(struct inode *inode, sector_t lblock,
19 struct buffer_head *bh_result, int create);
20extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
21
22#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..00041b1b8025
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,119 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "ops_dentry.h"
25#include "util.h"
26
27/**
28 * gfs2_drevalidate - Check directory lookup consistency
29 * @dentry: the mapping to check
30 * @nd:
31 *
32 * Check to make sure the lookup necessary to arrive at this inode from its
33 * parent is still good.
34 *
35 * Returns: 1 if the dentry is ok, 0 if it isn't
36 */
37
38static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
39{
40 struct dentry *parent = dget_parent(dentry);
41 struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
42 struct gfs2_inode *dip = GFS2_I(parent->d_inode);
43 struct inode *inode = dentry->d_inode;
44 struct gfs2_holder d_gh;
45 struct gfs2_inode *ip;
46 struct gfs2_inum inum;
47 unsigned int type;
48 int error;
49
50 if (inode && is_bad_inode(inode))
51 goto invalid;
52
53 if (sdp->sd_args.ar_localcaching)
54 goto valid;
55
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
57 if (error)
58 goto fail;
59
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) {
62 case 0:
63 if (!inode)
64 goto invalid_gunlock;
65 break;
66 case -ENOENT:
67 if (!inode)
68 goto valid_gunlock;
69 goto invalid_gunlock;
70 default:
71 goto fail_gunlock;
72 }
73
74 ip = GFS2_I(inode);
75
76 if (!gfs2_inum_equal(&ip->i_num, &inum))
77 goto invalid_gunlock;
78
79 if (IF2DT(ip->i_di.di_mode) != type) {
80 gfs2_consist_inode(dip);
81 goto fail_gunlock;
82 }
83
84valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh);
86valid:
87 dput(parent);
88 return 1;
89
90invalid_gunlock:
91 gfs2_glock_dq_uninit(&d_gh);
92invalid:
93 if (inode && S_ISDIR(inode->i_mode)) {
94 if (have_submounts(dentry))
95 goto valid;
96 shrink_dcache_parent(dentry);
97 }
98 d_drop(dentry);
99 dput(parent);
100 return 0;
101
102fail_gunlock:
103 gfs2_glock_dq_uninit(&d_gh);
104fail:
105 dput(parent);
106 return 0;
107}
108
109static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
110{
111 str->hash = gfs2_disk_hash(str->name, str->len);
112 return 0;
113}
114
115struct dentry_operations gfs2_dops = {
116 .d_revalidate = gfs2_drevalidate,
117 .d_hash = gfs2_dhash,
118};
119
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..5caa3db4d3f5
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13#include <linux/dcache.h>
14
15extern struct dentry_operations gfs2_dops;
16
17#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..86127d93bd35
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,298 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "dir.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "ops_export.h"
26#include "rgrp.h"
27#include "util.h"
28
29static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *fh,
31 int fh_len,
32 int fh_type,
33 int (*acceptable)(void *context,
34 struct dentry *dentry),
35 void *context)
36{
37 struct gfs2_fh_obj fh_obj;
38 struct gfs2_inum *this, parent;
39
40 if (fh_type != fh_len)
41 return NULL;
42
43 this = &fh_obj.this;
44 fh_obj.imode = DT_UNKNOWN;
45 memset(&parent, 0, sizeof(struct gfs2_inum));
46
47 switch (fh_type) {
48 case GFS2_LARGE_FH_SIZE:
49 parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
50 parent.no_formal_ino |= be32_to_cpu(fh[5]);
51 parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
52 parent.no_addr |= be32_to_cpu(fh[7]);
53 fh_obj.imode = be32_to_cpu(fh[8]);
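 /* fall through */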
54 case GFS2_SMALL_FH_SIZE:
55 this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
56 this->no_formal_ino |= be32_to_cpu(fh[1]);
57 this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
58 this->no_addr |= be32_to_cpu(fh[3]);
59 break;
60 default:
61 return NULL;
62 }
63
64 return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
65 acceptable, context);
66}
67
68static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
69 int connectable)
70{
71 struct inode *inode = dentry->d_inode;
72 struct super_block *sb = inode->i_sb;
73 struct gfs2_inode *ip = GFS2_I(inode);
74
75 if (*len < GFS2_SMALL_FH_SIZE ||
76 (connectable && *len < GFS2_LARGE_FH_SIZE))
77 return 255;
78
79 fh[0] = ip->i_num.no_formal_ino >> 32;
80 fh[0] = cpu_to_be32(fh[0]);
81 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
82 fh[1] = cpu_to_be32(fh[1]);
83 fh[2] = ip->i_num.no_addr >> 32;
84 fh[2] = cpu_to_be32(fh[2]);
85 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
86 fh[3] = cpu_to_be32(fh[3]);
87 *len = GFS2_SMALL_FH_SIZE;
88
89 if (!connectable || inode == sb->s_root->d_inode)
90 return *len;
91
92 spin_lock(&dentry->d_lock);
93 inode = dentry->d_parent->d_inode;
94 ip = GFS2_I(inode);
95 igrab(inode);
96 spin_unlock(&dentry->d_lock);
97
98 fh[4] = ip->i_num.no_formal_ino >> 32;
99 fh[4] = cpu_to_be32(fh[4]);
100 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
101 fh[5] = cpu_to_be32(fh[5]);
102 fh[6] = ip->i_num.no_addr >> 32;
103 fh[6] = cpu_to_be32(fh[6]);
104 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
105 fh[7] = cpu_to_be32(fh[7]);
106
107 fh[8] = cpu_to_be32(inode->i_mode);
108 fh[9] = 0; /* pad to double word */
109 *len = GFS2_LARGE_FH_SIZE;
110
111 iput(inode);
112
113 return *len;
114}
115
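Both the decode and encode paths above hand-split each 64-bit field into two big-endian 32-bit words. The round trip, as a minimal in-kernel sketch (both helper names are made up for illustration):

static inline void fh_pack64(__be32 *fh, u64 v)         /* hypothetical helper */
{
        fh[0] = cpu_to_be32(v >> 32);
        fh[1] = cpu_to_be32(v & 0xFFFFFFFF);
}

static inline u64 fh_unpack64(const __be32 *fh)         /* hypothetical helper */
{
        return ((u64)be32_to_cpu(fh[0]) << 32) | be32_to_cpu(fh[1]);
}
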
116struct get_name_filldir {
117 struct gfs2_inum inum;
118 char *name;
119};
120
121static int get_name_filldir(void *opaque, const char *name, unsigned int length,
122 u64 offset, struct gfs2_inum *inum,
123 unsigned int type)
124{
125 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
126
127 if (!gfs2_inum_equal(inum, &gnfd->inum))
128 return 0;
129
130 memcpy(gnfd->name, name, length);
131 gnfd->name[length] = 0;
132
133 return 1;
134}
135
136static int gfs2_get_name(struct dentry *parent, char *name,
137 struct dentry *child)
138{
139 struct inode *dir = parent->d_inode;
140 struct inode *inode = child->d_inode;
141 struct gfs2_inode *dip, *ip;
142 struct get_name_filldir gnfd;
143 struct gfs2_holder gh;
144 u64 offset = 0;
145 int error;
146
147 if (!dir)
148 return -EINVAL;
149
150 if (!S_ISDIR(dir->i_mode) || !inode)
151 return -EINVAL;
152
153 dip = GFS2_I(dir);
154 ip = GFS2_I(inode);
155
156 *name = 0;
157 gnfd.inum = ip->i_num;
158 gnfd.name = name;
159
160 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
161 if (error)
162 return error;
163
164 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
165
166 gfs2_glock_dq_uninit(&gh);
167
168 if (!error && !*name)
169 error = -ENOENT;
170
171 return error;
172}
173
174static struct dentry *gfs2_get_parent(struct dentry *child)
175{
176 struct qstr dotdot;
177 struct inode *inode;
178 struct dentry *dentry;
179
180 gfs2_str2qstr(&dotdot, "..");
181 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
182
183 if (!inode)
184 return ERR_PTR(-ENOENT);
185 /*
186 * In case of an error, @inode carries the error value, and we
187 * have to return it as an (invalid) dentry pointer.
188 */
189 if (IS_ERR(inode))
190 return ERR_PTR(PTR_ERR(inode));
191
192 dentry = d_alloc_anon(inode);
193 if (!dentry) {
194 iput(inode);
195 return ERR_PTR(-ENOMEM);
196 }
197
198 return dentry;
199}
200
201static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
202{
203 struct gfs2_sbd *sdp = sb->s_fs_info;
204 struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
205 struct gfs2_inum *inum = &fh_obj->this;
206 struct gfs2_holder i_gh, ri_gh, rgd_gh;
207 struct gfs2_rgrpd *rgd;
208 struct inode *inode;
209 struct dentry *dentry;
210 int error;
211
212 /* System files? */
213
214 inode = gfs2_ilookup(sb, inum);
215 if (inode) {
216 if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
217 iput(inode);
218 return ERR_PTR(-ESTALE);
219 }
220 goto out_inode;
221 }
222
223 error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
224 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
225 &i_gh);
226 if (error)
227 return ERR_PTR(error);
228
229 error = gfs2_rindex_hold(sdp, &ri_gh);
230 if (error)
231 goto fail;
232
233 error = -EINVAL;
234 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
235 if (!rgd)
236 goto fail_rindex;
237
238 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
239 if (error)
240 goto fail_rindex;
241
242 error = -ESTALE;
243 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
244 goto fail_rgd;
245
246 gfs2_glock_dq_uninit(&rgd_gh);
247 gfs2_glock_dq_uninit(&ri_gh);
248
249 inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
250 if (!inode)
251 goto fail;
252 if (IS_ERR(inode)) {
253 error = PTR_ERR(inode);
254 goto fail;
255 }
256
257 error = gfs2_inode_refresh(GFS2_I(inode));
258 if (error) {
259 iput(inode);
260 goto fail;
261 }
262
263 error = -EIO;
264 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
265 iput(inode);
266 goto fail;
267 }
268
269 gfs2_glock_dq_uninit(&i_gh);
270
271out_inode:
272 dentry = d_alloc_anon(inode);
273 if (!dentry) {
274 iput(inode);
275 return ERR_PTR(-ENOMEM);
276 }
277
278 return dentry;
279
280fail_rgd:
281 gfs2_glock_dq_uninit(&rgd_gh);
282
283fail_rindex:
284 gfs2_glock_dq_uninit(&ri_gh);
285
286fail:
287 gfs2_glock_dq_uninit(&i_gh);
288 return ERR_PTR(error);
289}
290
291struct export_operations gfs2_export_ops = {
292 .decode_fh = gfs2_decode_fh,
293 .encode_fh = gfs2_encode_fh,
294 .get_name = gfs2_get_name,
295 .get_parent = gfs2_get_parent,
296 .get_dentry = gfs2_get_dentry,
297};
298
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09aca5046fb1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,22 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13#define GFS2_SMALL_FH_SIZE 4
14#define GFS2_LARGE_FH_SIZE 10
15
16extern struct export_operations gfs2_export_ops;
17struct gfs2_fh_obj {
18 struct gfs2_inum this;
19 __u32 imode;
20};
21
22#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..3064f133bf3c
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,661 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/lm_interface.h>
25#include <asm/uaccess.h>
26
27#include "gfs2.h"
28#include "incore.h"
29#include "bmap.h"
30#include "dir.h"
31#include "glock.h"
32#include "glops.h"
33#include "inode.h"
34#include "lm.h"
35#include "log.h"
36#include "meta_io.h"
37#include "ops_file.h"
38#include "ops_vm.h"
39#include "quota.h"
40#include "rgrp.h"
41#include "trans.h"
42#include "util.h"
43#include "eaops.h"
44
45/* For regular, non-NFS */
46struct filldir_reg {
47 struct gfs2_sbd *fdr_sbd;
48 int fdr_prefetch;
49
50 filldir_t fdr_filldir;
51 void *fdr_opaque;
52};
53
54/*
55 * Most fields left uninitialised to catch anybody who tries to
56 * use them. f_flags set to prevent file_accessed() from touching
57 * any other part of this. Its use is purely as a flag so that we
58 * know (in readpage()) whether or not to do locking.
59 */
60struct file gfs2_internal_file_sentinel = {
61 .f_flags = O_NOATIME|O_RDONLY,
62};
63
64static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
65 unsigned long offset, unsigned long size)
66{
67 char *kaddr;
68 unsigned long count = desc->count;
69
70 if (size > count)
71 size = count;
72
73 kaddr = kmap(page);
74 memcpy(desc->arg.buf, kaddr + offset, size);
75 kunmap(page);
76
77 desc->count = count - size;
78 desc->written += size;
79 desc->arg.buf += size;
80 return size;
81}
82
83int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
84 char *buf, loff_t *pos, unsigned size)
85{
86 struct inode *inode = &ip->i_inode;
87 read_descriptor_t desc;
88 desc.written = 0;
89 desc.arg.buf = buf;
90 desc.count = size;
91 desc.error = 0;
92 do_generic_mapping_read(inode->i_mapping, ra_state,
93 &gfs2_internal_file_sentinel, pos, &desc,
94 gfs2_read_actor);
95 return desc.written ? desc.written : desc.error;
96}
97
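gfs2_internal_read() is how other parts of the filesystem pull special files (the resource index, for example) through the page cache without a real struct file behind them. A hedged in-kernel sketch, assuming a gfs2_inode *ip whose glock is already held; the function name and buffer size are made up:

static int read_special_file(struct gfs2_inode *ip)     /* illustrative only */
{
        struct file_ra_state ra_state;
        char buf[256];                          /* arbitrary demo size */
        loff_t pos = 0;
        int copied;

        file_ra_state_init(&ra_state, ip->i_inode.i_mapping);
        copied = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(buf));
        return (copied < 0) ? copied : 0;       /* short reads mean EOF */
}
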
98/**
99 * gfs2_llseek - seek to a location in a file
100 * @file: the file
101 * @offset: the offset
102 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
103 *
104 * SEEK_END requires the glock for the file because it references the
105 * file's size.
106 *
107 * Returns: The new offset, or errno
108 */
109
110static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
111{
112 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
113 struct gfs2_holder i_gh;
114 loff_t error;
115
116 if (origin == 2) {
117 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
118 &i_gh);
119 if (!error) {
120 error = remote_llseek(file, offset, origin);
121 gfs2_glock_dq_uninit(&i_gh);
122 }
123 } else
124 error = remote_llseek(file, offset, origin);
125
126 return error;
127}
128
129/**
130 * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
131 * @opaque: opaque data used by the function
132 * @name: the name of the directory entry
133 * @length: the length of the name
134 * @offset: the entry's offset in the directory
135 * @inum: the inode number the entry points to
136 * @type: the type of inode the entry points to
137 *
138 * Returns: 0 on success, 1 if buffer full
139 */
140
141static int filldir_func(void *opaque, const char *name, unsigned int length,
142 u64 offset, struct gfs2_inum *inum,
143 unsigned int type)
144{
145 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
146 struct gfs2_sbd *sdp = fdr->fdr_sbd;
147 int error;
148
149 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
150 inum->no_addr, type);
151 if (error)
152 return 1;
153
154 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
155 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
156 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
157 gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
158 LM_ST_SHARED, LM_FLAG_TRY);
159 }
160
161 return 0;
162}
163
164/**
165 * gfs2_readdir - Read directory entries from a directory
166 * @file: The directory to read from
167 * @dirent: Buffer for dirents
168 * @filldir: Function used to do the copying
169 *
170 * Returns: errno
171 */
172
173static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
174{
175 struct inode *dir = file->f_mapping->host;
176 struct gfs2_inode *dip = GFS2_I(dir);
177 struct filldir_reg fdr;
178 struct gfs2_holder d_gh;
179 u64 offset = file->f_pos;
180 int error;
181
182 fdr.fdr_sbd = GFS2_SB(dir);
183 fdr.fdr_prefetch = 1;
184 fdr.fdr_filldir = filldir;
185 fdr.fdr_opaque = dirent;
186
187 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
188 error = gfs2_glock_nq_atime(&d_gh);
189 if (error) {
190 gfs2_holder_uninit(&d_gh);
191 return error;
192 }
193
194 error = gfs2_dir_read(dir, &offset, &fdr, filldir_func);
195
196 gfs2_glock_dq_uninit(&d_gh);
197
198 file->f_pos = offset;
199
200 return error;
201}
202
203/**
204 * fsflags_cvt
205 * @table: A table of 32 u32 flags
206 * @val: a 32 bit value to convert
207 *
208 * This function can be used to convert between fsflags values and
209 * GFS2's own flags values.
210 *
211 * Returns: the converted flags
212 */
213static u32 fsflags_cvt(const u32 *table, u32 val)
214{
215 u32 res = 0;
216 while(val) {
217 if (val & 1)
218 res |= *table;
219 table++;
220 val >>= 1;
221 }
222 return res;
223}
224
225static const u32 fsflags_to_gfs2[32] = {
226 [3] = GFS2_DIF_SYNC,
227 [4] = GFS2_DIF_IMMUTABLE,
228 [5] = GFS2_DIF_APPENDONLY,
229 [7] = GFS2_DIF_NOATIME,
230 [12] = GFS2_DIF_EXHASH,
231 [14] = GFS2_DIF_JDATA,
232 [20] = GFS2_DIF_DIRECTIO,
233};
234
235static const u32 gfs2_to_fsflags[32] = {
236 [gfs2fl_Sync] = FS_SYNC_FL,
237 [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
238 [gfs2fl_AppendOnly] = FS_APPEND_FL,
239 [gfs2fl_NoAtime] = FS_NOATIME_FL,
240 [gfs2fl_ExHash] = FS_INDEX_FL,
241 [gfs2fl_Jdata] = FS_JOURNAL_DATA_FL,
242 [gfs2fl_Directio] = FS_DIRECTIO_FL,
243 [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
244 [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
245};
246
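Each table above is indexed by source bit position, so the one helper serves both directions; FS_JOURNAL_DATA_FL, for instance, is bit 14, and walking fsflags_to_gfs2 with that bit set yields GFS2_DIF_JDATA. A self-contained sketch of the same walk, using an arbitrary stand-in value rather than the real flag constant:

#include <stdio.h>
#include <stdint.h>

/* same shape as fsflags_cvt(): table indexed by source bit position */
static uint32_t cvt(const uint32_t *table, uint32_t val)
{
        uint32_t res = 0;

        while (val) {
                if (val & 1)
                        res |= *table;
                table++;
                val >>= 1;
        }
        return res;
}

int main(void)
{
        static const uint32_t demo[32] = { [14] = 0x1234 };     /* stand-in value */

        printf("0x%x\n", cvt(demo, 1u << 14));  /* prints 0x1234 */
        return 0;
}
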
247static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
248{
249 struct inode *inode = filp->f_dentry->d_inode;
250 struct gfs2_inode *ip = GFS2_I(inode);
251 struct gfs2_holder gh;
252 int error;
253 u32 fsflags;
254
255 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
256 error = gfs2_glock_nq_m_atime(1, &gh);
257 if (error)
258 return error;
259
260 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
261 if (put_user(fsflags, ptr))
262 error = -EFAULT;
263
264 gfs2_glock_dq_m(1, &gh);
265 gfs2_holder_uninit(&gh);
266 return error;
267}
268
269/* Flags that can be set by user space */
270#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
271 GFS2_DIF_DIRECTIO| \
272 GFS2_DIF_IMMUTABLE| \
273 GFS2_DIF_APPENDONLY| \
274 GFS2_DIF_NOATIME| \
275 GFS2_DIF_SYNC| \
276 GFS2_DIF_SYSTEM| \
277 GFS2_DIF_INHERIT_DIRECTIO| \
278 GFS2_DIF_INHERIT_JDATA)
279
280/**
281 * do_gfs2_set_flags - set flags on an inode
282 * @filp: file pointer
283 * @reqflags: The flags to set
284 * @mask: Indicates which flags are valid
285 *
286 */
287static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
288{
289 struct inode *inode = filp->f_dentry->d_inode;
290 struct gfs2_inode *ip = GFS2_I(inode);
291 struct gfs2_sbd *sdp = GFS2_SB(inode);
292 struct buffer_head *bh;
293 struct gfs2_holder gh;
294 int error;
295 u32 new_flags, flags;
296
297 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
298 if (error)
299 return error;
300
301 flags = ip->i_di.di_flags;
302 new_flags = (flags & ~mask) | (reqflags & mask);
303 if ((new_flags ^ flags) == 0)
304 goto out;
305
306 if (S_ISDIR(inode->i_mode)) {
307 if ((new_flags ^ flags) & GFS2_DIF_JDATA)
308 new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
309 if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
310 new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
311 }
312
313 error = -EINVAL;
314 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
315 goto out;
316
317 error = -EPERM;
318 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
319 goto out;
320 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
321 goto out;
322 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
323 !capable(CAP_LINUX_IMMUTABLE))
324 goto out;
325 if (!IS_IMMUTABLE(inode)) {
326 error = permission(inode, MAY_WRITE, NULL);
327 if (error)
328 goto out;
329 }
330
331 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
332 if (error)
333 goto out;
334 error = gfs2_meta_inode_buffer(ip, &bh);
335 if (error)
336 goto out_trans_end;
337 gfs2_trans_add_bh(ip->i_gl, bh, 1);
338 ip->i_di.di_flags = new_flags;
339 gfs2_dinode_out(&ip->i_di, bh->b_data);
340 brelse(bh);
341out_trans_end:
342 gfs2_trans_end(sdp);
343out:
344 gfs2_glock_dq_uninit(&gh);
345 return error;
346}
347
348static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
349{
350 u32 fsflags, gfsflags;
351 if (get_user(fsflags, ptr))
352 return -EFAULT;
353 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
354 return do_gfs2_set_flags(filp, gfsflags, ~0);
355}
356
357static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
358{
359 switch(cmd) {
360 case FS_IOC_GETFLAGS:
361 return gfs2_get_flags(filp, (u32 __user *)arg);
362 case FS_IOC_SETFLAGS:
363 return gfs2_set_flags(filp, (u32 __user *)arg);
364 }
365 return -ENOTTY;
366}
367
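From user space these two commands are the ordinary chattr/lsattr interface. A minimal sketch that sets the noatime attribute on an open file (hypothetical path; FS_IOC_SETFLAGS normally requires file ownership or CAP_FOWNER):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FS_IOC_GETFLAGS, FS_NOATIME_FL */

int main(void)
{
        unsigned int flags;
        int fd = open("/mnt/gfs2/file", O_RDONLY);      /* hypothetical path */

        if (fd < 0)
                return 1;
        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
                flags |= FS_NOATIME_FL;
                ioctl(fd, FS_IOC_SETFLAGS, &flags);
        }
        close(fd);
        return 0;
}
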
368
369/**
370 * gfs2_mmap - set up a memory mapping for a file
371 * @file: The file to map
372 * @vma: The VMA which describes the mapping
373 *
374 * Returns: 0 or error code
375 */
376
377static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
378{
379 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
380 struct gfs2_holder i_gh;
381 int error;
382
383 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
384 error = gfs2_glock_nq_atime(&i_gh);
385 if (error) {
386 gfs2_holder_uninit(&i_gh);
387 return error;
388 }
389
390 /* This is VM_MAYWRITE instead of VM_WRITE because a call
391 to mprotect() can turn on VM_WRITE later. */
392
393 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
394 (VM_MAYSHARE | VM_MAYWRITE))
395 vma->vm_ops = &gfs2_vm_ops_sharewrite;
396 else
397 vma->vm_ops = &gfs2_vm_ops_private;
398
399 gfs2_glock_dq_uninit(&i_gh);
400
401 return error;
402}
403
404/**
405 * gfs2_open - open a file
406 * @inode: the inode to open
407 * @file: the struct file for this opening
408 *
409 * Returns: errno
410 */
411
412static int gfs2_open(struct inode *inode, struct file *file)
413{
414 struct gfs2_inode *ip = GFS2_I(inode);
415 struct gfs2_holder i_gh;
416 struct gfs2_file *fp;
417 int error;
418
419 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
420 if (!fp)
421 return -ENOMEM;
422
423 mutex_init(&fp->f_fl_mutex);
424
425 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
426 file->private_data = fp;
427
428 if (S_ISREG(ip->i_di.di_mode)) {
429 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
430 &i_gh);
431 if (error)
432 goto fail;
433
434 if (!(file->f_flags & O_LARGEFILE) &&
435 ip->i_di.di_size > MAX_NON_LFS) {
436 error = -EFBIG;
437 goto fail_gunlock;
438 }
439
440 /* Listen to the Direct I/O flag */
441
442 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
443 file->f_flags |= O_DIRECT;
444
445 gfs2_glock_dq_uninit(&i_gh);
446 }
447
448 return 0;
449
450fail_gunlock:
451 gfs2_glock_dq_uninit(&i_gh);
452fail:
453 file->private_data = NULL;
454 kfree(fp);
455 return error;
456}
457
458/**
459 * gfs2_close - called to close a struct file
460 * @inode: the inode the struct file belongs to
461 * @file: the struct file being closed
462 *
463 * Returns: errno
464 */
465
466static int gfs2_close(struct inode *inode, struct file *file)
467{
468 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
469 struct gfs2_file *fp;
470
471 fp = file->private_data;
472 file->private_data = NULL;
473
474 if (gfs2_assert_warn(sdp, fp))
475 return -EIO;
476
477 kfree(fp);
478
479 return 0;
480}
481
482/**
483 * gfs2_fsync - sync the dirty data for a file (across the cluster)
484 * @file: the file that points to the dentry (we ignore this)
485 * @dentry: the dentry that points to the inode to sync
486 *
487 * Returns: errno
488 */
489
490static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
491{
492 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
493
494 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
495
496 return 0;
497}
498
499/**
500 * gfs2_lock - acquire/release a posix lock on a file
501 * @file: the file pointer
502 * @cmd: either modify or retrieve lock state, possibly wait
503 * @fl: type and range of lock
504 *
505 * Returns: errno
506 */
507
508static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
509{
510 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
511 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
512 struct lm_lockname name =
513 { .ln_number = ip->i_num.no_addr,
514 .ln_type = LM_TYPE_PLOCK };
515
516 if (!(fl->fl_flags & FL_POSIX))
517 return -ENOLCK;
518 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
519 return -ENOLCK;
520
521 if (sdp->sd_args.ar_localflocks) {
522 if (IS_GETLK(cmd)) {
523 struct file_lock tmp;
524 int ret;
525 ret = posix_test_lock(file, fl, &tmp);
526 fl->fl_type = F_UNLCK;
527 if (ret)
528 memcpy(fl, &tmp, sizeof(struct file_lock));
529 return 0;
530 } else {
531 return posix_lock_file_wait(file, fl);
532 }
533 }
534
535 if (IS_GETLK(cmd))
536 return gfs2_lm_plock_get(sdp, &name, file, fl);
537 else if (fl->fl_type == F_UNLCK)
538 return gfs2_lm_punlock(sdp, &name, file, fl);
539 else
540 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
541}
542
543static int do_flock(struct file *file, int cmd, struct file_lock *fl)
544{
545 struct gfs2_file *fp = file->private_data;
546 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
547 struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
548 struct gfs2_glock *gl;
549 unsigned int state;
550 int flags;
551 int error = 0;
552
553 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
554 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
555
556 mutex_lock(&fp->f_fl_mutex);
557
558 gl = fl_gh->gh_gl;
559 if (gl) {
560 if (fl_gh->gh_state == state)
561 goto out;
562 gfs2_glock_hold(gl);
563 flock_lock_file_wait(file,
564 &(struct file_lock){.fl_type = F_UNLCK});
565 gfs2_glock_dq_uninit(fl_gh);
566 } else {
567 error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
568 ip->i_num.no_addr, &gfs2_flock_glops,
569 CREATE, &gl);
570 if (error)
571 goto out;
572 }
573
574 gfs2_holder_init(gl, state, flags, fl_gh);
575 gfs2_glock_put(gl);
576
577 error = gfs2_glock_nq(fl_gh);
578 if (error) {
579 gfs2_holder_uninit(fl_gh);
580 if (error == GLR_TRYFAILED)
581 error = -EAGAIN;
582 } else {
583 error = flock_lock_file_wait(file, fl);
584 gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
585 }
586
587out:
588 mutex_unlock(&fp->f_fl_mutex);
589 return error;
590}
591
592static void do_unflock(struct file *file, struct file_lock *fl)
593{
594 struct gfs2_file *fp = file->private_data;
595 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
596
597 mutex_lock(&fp->f_fl_mutex);
598 flock_lock_file_wait(file, fl);
599 if (fl_gh->gh_gl)
600 gfs2_glock_dq_uninit(fl_gh);
601 mutex_unlock(&fp->f_fl_mutex);
602}
603
604/**
605 * gfs2_flock - acquire/release a flock lock on a file
606 * @file: the file pointer
607 * @cmd: either modify or retrieve lock state, possibly wait
608 * @fl: type and range of lock
609 *
610 * Returns: errno
611 */
612
613static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
614{
615 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
616 struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
617
618 if (!(fl->fl_flags & FL_FLOCK))
619 return -ENOLCK;
620 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
621 return -ENOLCK;
622
623 if (sdp->sd_args.ar_localflocks)
624 return flock_lock_file_wait(file, fl);
625
626 if (fl->fl_type == F_UNLCK) {
627 do_unflock(file, fl);
628 return 0;
629 } else {
630 return do_flock(file, cmd, fl);
631 }
632}
633
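Both entry points implement standard advisory locking, cluster-wide unless the localflocks mount option routes them to the local VFS code. For reference, the corresponding user-space calls might look like this (descriptor assumed already open):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/file.h>

static int lock_examples(int fd)
{
        struct flock fl;

        /* POSIX byte-range lock; ends up in gfs2_lock() */
        memset(&fl, 0, sizeof(fl));
        fl.l_type = F_WRLCK;
        fl.l_whence = SEEK_SET;
        fl.l_start = 0;
        fl.l_len = 1;
        if (fcntl(fd, F_SETLKW, &fl) == -1)
                return -1;

        /* whole-file BSD lock; ends up in gfs2_flock() */
        return flock(fd, LOCK_EX);
}
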
634const struct file_operations gfs2_file_fops = {
635 .llseek = gfs2_llseek,
636 .read = do_sync_read,
637 .aio_read = generic_file_aio_read,
638 .write = do_sync_write,
639 .aio_write = generic_file_aio_write,
640 .unlocked_ioctl = gfs2_ioctl,
641 .mmap = gfs2_mmap,
642 .open = gfs2_open,
643 .release = gfs2_close,
644 .fsync = gfs2_fsync,
645 .lock = gfs2_lock,
646 .sendfile = generic_file_sendfile,
647 .flock = gfs2_flock,
648 .splice_read = generic_file_splice_read,
649 .splice_write = generic_file_splice_write,
650};
651
652const struct file_operations gfs2_dir_fops = {
653 .readdir = gfs2_readdir,
654 .unlocked_ioctl = gfs2_ioctl,
655 .open = gfs2_open,
656 .release = gfs2_close,
657 .fsync = gfs2_fsync,
658 .lock = gfs2_lock,
659 .flock = gfs2_flock,
660};
661
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..ce319f89ec8e
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12
13#include <linux/fs.h>
14struct gfs2_inode;
15
16extern struct file gfs2_internal_file_sentinel;
17extern int gfs2_internal_read(struct gfs2_inode *ip,
18 struct file_ra_state *ra_state,
19 char *buf, loff_t *pos, unsigned size);
20
21extern const struct file_operations gfs2_file_fops;
22extern const struct file_operations gfs2_dir_fops;
23
24#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..178b33911843
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,928 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/blkdev.h>
16#include <linux/kthread.h>
17#include <linux/namei.h>
18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h>
20#include <linux/lm_interface.h>
21
22#include "gfs2.h"
23#include "incore.h"
24#include "daemon.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "lm.h"
29#include "mount.h"
30#include "ops_export.h"
31#include "ops_fstype.h"
32#include "ops_super.h"
33#include "recovery.h"
34#include "rgrp.h"
35#include "super.h"
36#include "sys.h"
37#include "util.h"
38
39#define DO 0
40#define UNDO 1
41
42extern struct dentry_operations gfs2_dops;
43
44static struct gfs2_sbd *init_sbd(struct super_block *sb)
45{
46 struct gfs2_sbd *sdp;
47
48 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
49 if (!sdp)
50 return NULL;
51
52 sb->s_fs_info = sdp;
53 sdp->sd_vfs = sb;
54
55 gfs2_tune_init(&sdp->sd_tune);
56
57 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
58 spin_lock_init(&sdp->sd_reclaim_lock);
59 init_waitqueue_head(&sdp->sd_reclaim_wq);
60
61 mutex_init(&sdp->sd_inum_mutex);
62 spin_lock_init(&sdp->sd_statfs_spin);
63 mutex_init(&sdp->sd_statfs_mutex);
64
65 spin_lock_init(&sdp->sd_rindex_spin);
66 mutex_init(&sdp->sd_rindex_mutex);
67 INIT_LIST_HEAD(&sdp->sd_rindex_list);
68 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
69 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
70
71 INIT_LIST_HEAD(&sdp->sd_jindex_list);
72 spin_lock_init(&sdp->sd_jindex_spin);
73 mutex_init(&sdp->sd_jindex_mutex);
74
75 INIT_LIST_HEAD(&sdp->sd_quota_list);
76 spin_lock_init(&sdp->sd_quota_spin);
77 mutex_init(&sdp->sd_quota_mutex);
78
79 spin_lock_init(&sdp->sd_log_lock);
80
81 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
82 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
83 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
84 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
85 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
86
87 mutex_init(&sdp->sd_log_reserve_mutex);
88 INIT_LIST_HEAD(&sdp->sd_ail1_list);
89 INIT_LIST_HEAD(&sdp->sd_ail2_list);
90
91 init_rwsem(&sdp->sd_log_flush_lock);
92 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
93
94 INIT_LIST_HEAD(&sdp->sd_revoke_list);
95
96 mutex_init(&sdp->sd_freeze_lock);
97
98 return sdp;
99}
100
101static void init_vfs(struct super_block *sb, unsigned noatime)
102{
103 struct gfs2_sbd *sdp = sb->s_fs_info;
104
105 sb->s_magic = GFS2_MAGIC;
106 sb->s_op = &gfs2_super_ops;
107 sb->s_export_op = &gfs2_export_ops;
108 sb->s_maxbytes = MAX_LFS_FILESIZE;
109
110 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
111 set_bit(noatime, &sdp->sd_flags);
112
113 /* Don't let the VFS update atimes. GFS2 handles this itself. */
114 sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
115}
116
117static int init_names(struct gfs2_sbd *sdp, int silent)
118{
119 struct page *page;
120 char *proto, *table;
121 int error = 0;
122
123 proto = sdp->sd_args.ar_lockproto;
124 table = sdp->sd_args.ar_locktable;
125
126 /* Try to autodetect */
127
128 if (!proto[0] || !table[0]) {
129 struct gfs2_sb *sb;
130 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
131 if (!page)
132 return -ENOBUFS;
133 sb = kmap(page);
134 gfs2_sb_in(&sdp->sd_sb, sb);
135 kunmap(page);
136 __free_page(page);
137
138 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
139 if (error)
140 goto out;
141
142 if (!proto[0])
143 proto = sdp->sd_sb.sb_lockproto;
144 if (!table[0])
145 table = sdp->sd_sb.sb_locktable;
146 }
147
148 if (!table[0])
149 table = sdp->sd_vfs->s_id;
150
151 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
152 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
153
154out:
155 return error;
156}
157
158static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
159 int undo)
160{
161 struct task_struct *p;
162 int error = 0;
163
164 if (undo)
165 goto fail_trans;
166
167 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
168 if (IS_ERR(p)) {
169 error = PTR_ERR(p);
170 fs_err(sdp, "can't start scand thread: %d\n", error);
171 return error;
172 }
173 sdp->sd_scand_process = p;
174
175 for (sdp->sd_glockd_num = 0;
176 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
177 sdp->sd_glockd_num++) {
178 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
179 if (IS_ERR(p)) {
180 error = PTR_ERR(p);
181 fs_err(sdp, "can't start glockd thread: %d\n", error);
182 goto fail;
183 }
184 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
185 }
186
187 error = gfs2_glock_nq_num(sdp,
188 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
189 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
190 mount_gh);
191 if (error) {
192 fs_err(sdp, "can't acquire mount glock: %d\n", error);
193 goto fail;
194 }
195
196 error = gfs2_glock_nq_num(sdp,
197 GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
198 LM_ST_SHARED,
199 LM_FLAG_NOEXP | GL_EXACT,
200 &sdp->sd_live_gh);
201 if (error) {
202 fs_err(sdp, "can't acquire live glock: %d\n", error);
203 goto fail_mount;
204 }
205
206 error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
207 CREATE, &sdp->sd_rename_gl);
208 if (error) {
209 fs_err(sdp, "can't create rename glock: %d\n", error);
210 goto fail_live;
211 }
212
213 error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
214 CREATE, &sdp->sd_trans_gl);
215 if (error) {
216 fs_err(sdp, "can't create transaction glock: %d\n", error);
217 goto fail_rename;
218 }
219 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
220
221 return 0;
222
223fail_trans:
224 gfs2_glock_put(sdp->sd_trans_gl);
225fail_rename:
226 gfs2_glock_put(sdp->sd_rename_gl);
227fail_live:
228 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
229fail_mount:
230 gfs2_glock_dq_uninit(mount_gh);
231fail:
232 while (sdp->sd_glockd_num--)
233 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
234
235 kthread_stop(sdp->sd_scand_process);
236 return error;
237}
238
239static struct inode *gfs2_lookup_root(struct super_block *sb,
240 struct gfs2_inum *inum)
241{
242 return gfs2_inode_lookup(sb, inum, DT_DIR);
243}
244
245static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
246{
247 struct super_block *sb = sdp->sd_vfs;
248 struct gfs2_holder sb_gh;
249 struct gfs2_inum *inum;
250 struct inode *inode;
251 int error = 0;
252
253 if (undo) {
254 if (sb->s_root) {
255 dput(sb->s_root);
256 sb->s_root = NULL;
257 }
258 return 0;
259 }
260
261 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
262 LM_ST_SHARED, 0, &sb_gh);
263 if (error) {
264 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
265 return error;
266 }
267
268 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
269 if (error) {
270 fs_err(sdp, "can't read superblock: %d\n", error);
271 goto out;
272 }
273
274 /* Set up the buffer cache and SB for real */
275 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
276 error = -EINVAL;
277 fs_err(sdp, "FS block size (%u) is too small for device "
278 "block size (%u)\n",
279 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
280 goto out;
281 }
282 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
283 error = -EINVAL;
284 fs_err(sdp, "FS block size (%u) is too big for machine "
285 "page size (%u)\n",
286 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
287 goto out;
288 }
289 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
290
291 /* Get the root inode */
292 inum = &sdp->sd_sb.sb_root_dir;
293 if (sb->s_type == &gfs2meta_fs_type)
294 inum = &sdp->sd_sb.sb_master_dir;
295 inode = gfs2_lookup_root(sb, inum);
296 if (IS_ERR(inode)) {
297 error = PTR_ERR(inode);
298 fs_err(sdp, "can't read in root inode: %d\n", error);
299 goto out;
300 }
301
302 sb->s_root = d_alloc_root(inode);
303 if (!sb->s_root) {
304 fs_err(sdp, "can't get root dentry\n");
305 error = -ENOMEM;
306 iput(inode);
 goto out;
307 }
308 sb->s_root->d_op = &gfs2_dops;
309out:
310 gfs2_glock_dq_uninit(&sb_gh);
311 return error;
312}
313
314static int init_journal(struct gfs2_sbd *sdp, int undo)
315{
316 struct gfs2_holder ji_gh;
317 struct task_struct *p;
318 struct gfs2_inode *ip;
319 int jindex = 1;
320 int error = 0;
321
322 if (undo) {
323 jindex = 0;
324 goto fail_recoverd;
325 }
326
327 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
328 if (IS_ERR(sdp->sd_jindex)) {
329 fs_err(sdp, "can't lookup journal index: %d\n", (int)PTR_ERR(sdp->sd_jindex));
330 return PTR_ERR(sdp->sd_jindex);
331 }
332 ip = GFS2_I(sdp->sd_jindex);
333 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
334
335 /* Load in the journal index special file */
336
337 error = gfs2_jindex_hold(sdp, &ji_gh);
338 if (error) {
339 fs_err(sdp, "can't read journal index: %d\n", error);
340 goto fail;
341 }
342
343 error = -EINVAL;
344 if (!gfs2_jindex_size(sdp)) {
345 fs_err(sdp, "no journals!\n");
346 goto fail_jindex;
347 }
348
349 if (sdp->sd_args.ar_spectator) {
350 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
351 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
352 } else {
353 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
354 fs_err(sdp, "can't mount journal #%u\n",
355 sdp->sd_lockstruct.ls_jid);
356 fs_err(sdp, "there are only %u journals (0 - %u)\n",
357 gfs2_jindex_size(sdp),
358 gfs2_jindex_size(sdp) - 1);
359 goto fail_jindex;
360 }
361 sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
362
363 error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
364 &gfs2_journal_glops,
365 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
366 &sdp->sd_journal_gh);
367 if (error) {
368 fs_err(sdp, "can't acquire journal glock: %d\n", error);
369 goto fail_jindex;
370 }
371
372 ip = GFS2_I(sdp->sd_jdesc->jd_inode);
373 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
374 LM_FLAG_NOEXP | GL_EXACT,
375 &sdp->sd_jinode_gh);
376 if (error) {
377 fs_err(sdp, "can't acquire journal inode glock: %d\n",
378 error);
379 goto fail_journal_gh;
380 }
381
382 error = gfs2_jdesc_check(sdp->sd_jdesc);
383 if (error) {
384 fs_err(sdp, "my journal (%u) is bad: %d\n",
385 sdp->sd_jdesc->jd_jid, error);
386 goto fail_jinode_gh;
387 }
388 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
389 }
390
391 if (sdp->sd_lockstruct.ls_first) {
392 unsigned int x;
393 for (x = 0; x < sdp->sd_journals; x++) {
394 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
395 if (error) {
396 fs_err(sdp, "error recovering journal %u: %d\n",
397 x, error);
398 goto fail_jinode_gh;
399 }
400 }
401
402 gfs2_lm_others_may_mount(sdp);
403 } else if (!sdp->sd_args.ar_spectator) {
404 error = gfs2_recover_journal(sdp->sd_jdesc);
405 if (error) {
406 fs_err(sdp, "error recovering my journal: %d\n", error);
407 goto fail_jinode_gh;
408 }
409 }
410
411 set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
412 gfs2_glock_dq_uninit(&ji_gh);
413 jindex = 0;
414
415 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
416 if (IS_ERR(p)) {
417 error = PTR_ERR(p);
418 fs_err(sdp, "can't start recoverd thread: %d\n", error);
419 goto fail_jinode_gh;
420 }
421 sdp->sd_recoverd_process = p;
422
423 return 0;
424
425fail_recoverd:
426 kthread_stop(sdp->sd_recoverd_process);
427fail_jinode_gh:
428 if (!sdp->sd_args.ar_spectator)
429 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
430fail_journal_gh:
431 if (!sdp->sd_args.ar_spectator)
432 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
433fail_jindex:
434 gfs2_jindex_free(sdp);
435 if (jindex)
436 gfs2_glock_dq_uninit(&ji_gh);
437fail:
438 iput(sdp->sd_jindex);
439 return error;
440}
441
442
443static int init_inodes(struct gfs2_sbd *sdp, int undo)
444{
445 int error = 0;
446 struct gfs2_inode *ip;
447 struct inode *inode;
448
449 if (undo)
450 goto fail_qinode;
451
452 inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
453 if (IS_ERR(inode)) {
454 error = PTR_ERR(inode);
455 fs_err(sdp, "can't read in master directory: %d\n", error);
456 goto fail;
457 }
458 sdp->sd_master_dir = inode;
459
460 error = init_journal(sdp, undo);
461 if (error)
462 goto fail_master;
463
464 /* Read in the master inode number inode */
465 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
466 if (IS_ERR(sdp->sd_inum_inode)) {
467 error = PTR_ERR(sdp->sd_inum_inode);
468 fs_err(sdp, "can't read in inum inode: %d\n", error);
469 goto fail_journal;
470 }
471
472
473 /* Read in the master statfs inode */
474 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
475 if (IS_ERR(sdp->sd_statfs_inode)) {
476 error = PTR_ERR(sdp->sd_statfs_inode);
477 fs_err(sdp, "can't read in statfs inode: %d\n", error);
478 goto fail_inum;
479 }
480
481 /* Read in the resource index inode */
482 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
483 if (IS_ERR(sdp->sd_rindex)) {
484 error = PTR_ERR(sdp->sd_rindex);
485 fs_err(sdp, "can't get resource index inode: %d\n", error);
486 goto fail_statfs;
487 }
488 ip = GFS2_I(sdp->sd_rindex);
489 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
490 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
491
492 /* Read in the quota inode */
493 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
494 if (IS_ERR(sdp->sd_quota_inode)) {
495 error = PTR_ERR(sdp->sd_quota_inode);
496 fs_err(sdp, "can't get quota file inode: %d\n", error);
497 goto fail_rindex;
498 }
499 return 0;
500
501fail_qinode:
502 iput(sdp->sd_quota_inode);
503fail_rindex:
504 gfs2_clear_rgrpd(sdp);
505 iput(sdp->sd_rindex);
506fail_statfs:
507 iput(sdp->sd_statfs_inode);
508fail_inum:
509 iput(sdp->sd_inum_inode);
510fail_journal:
511 init_journal(sdp, UNDO);
512fail_master:
513 iput(sdp->sd_master_dir);
514fail:
515 return error;
516}
517
518static int init_per_node(struct gfs2_sbd *sdp, int undo)
519{
520 struct inode *pn = NULL;
521 char buf[30];
522 int error = 0;
523 struct gfs2_inode *ip;
524
525 if (sdp->sd_args.ar_spectator)
526 return 0;
527
528 if (undo)
529 goto fail_qc_gh;
530
531 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
532 if (IS_ERR(pn)) {
533 error = PTR_ERR(pn);
534 fs_err(sdp, "can't find per_node directory: %d\n", error);
535 return error;
536 }
537
538 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
539 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
540 if (IS_ERR(sdp->sd_ir_inode)) {
541 error = PTR_ERR(sdp->sd_ir_inode);
542 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
543 goto fail;
544 }
545
546 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
547 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
548 if (IS_ERR(sdp->sd_sc_inode)) {
549 error = PTR_ERR(sdp->sd_sc_inode);
550 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
551 goto fail_ir_i;
552 }
553
554 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
555 sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
556 if (IS_ERR(sdp->sd_qc_inode)) {
557 error = PTR_ERR(sdp->sd_qc_inode);
558 fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
559 goto fail_ut_i;
560 }
561
562 iput(pn);
563 pn = NULL;
564
565 ip = GFS2_I(sdp->sd_ir_inode);
566 error = gfs2_glock_nq_init(ip->i_gl,
567 LM_ST_EXCLUSIVE, 0,
568 &sdp->sd_ir_gh);
569 if (error) {
570 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
571 goto fail_qc_i;
572 }
573
574 ip = GFS2_I(sdp->sd_sc_inode);
575 error = gfs2_glock_nq_init(ip->i_gl,
576 LM_ST_EXCLUSIVE, 0,
577 &sdp->sd_sc_gh);
578 if (error) {
579 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
580 goto fail_ir_gh;
581 }
582
583 ip = GFS2_I(sdp->sd_qc_inode);
584 error = gfs2_glock_nq_init(ip->i_gl,
585 LM_ST_EXCLUSIVE, 0,
586 &sdp->sd_qc_gh);
587 if (error) {
588 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
589 goto fail_ut_gh;
590 }
591
592 return 0;
593
594fail_qc_gh:
595 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
596fail_ut_gh:
597 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
598fail_ir_gh:
599 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
600fail_qc_i:
601 iput(sdp->sd_qc_inode);
602fail_ut_i:
603 iput(sdp->sd_sc_inode);
604fail_ir_i:
605 iput(sdp->sd_ir_inode);
606fail:
607 if (pn)
608 iput(pn);
609 return error;
610}
611
612static int init_threads(struct gfs2_sbd *sdp, int undo)
613{
614 struct task_struct *p;
615 int error = 0;
616
617 if (undo)
618 goto fail_quotad;
619
620 sdp->sd_log_flush_time = jiffies;
621 sdp->sd_jindex_refresh_time = jiffies;
622
623 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
624 if (IS_ERR(p)) {
625 error = PTR_ERR(p);
626 fs_err(sdp, "can't start logd thread: %d\n", error);
627 return error;
628 }
629 sdp->sd_logd_process = p;
630
631 sdp->sd_statfs_sync_time = jiffies;
632 sdp->sd_quota_sync_time = jiffies;
633
634 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
635 if (IS_ERR(p)) {
636 error = PTR_ERR(p);
637 fs_err(sdp, "can't start quotad thread: %d\n", error);
638 goto fail;
639 }
640 sdp->sd_quotad_process = p;
641
642 return 0;
643
644
645fail_quotad:
646 kthread_stop(sdp->sd_quotad_process);
647fail:
648 kthread_stop(sdp->sd_logd_process);
649 return error;
650}
651
652/**
653 * fill_super - Read in superblock
654 * @sb: The VFS superblock
655 * @data: Mount options
656 * @silent: Don't complain if it's not a GFS2 filesystem
657 *
658 * Returns: errno
659 */
660
661static int fill_super(struct super_block *sb, void *data, int silent)
662{
663 struct gfs2_sbd *sdp;
664 struct gfs2_holder mount_gh;
665 int error;
666
667 sdp = init_sbd(sb);
668 if (!sdp) {
669 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
670 return -ENOMEM;
671 }
672
673 error = gfs2_mount_args(sdp, (char *)data, 0);
674 if (error) {
675 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
676 goto fail;
677 }
678
679 init_vfs(sb, SDF_NOATIME);
680
681 /* Set up the buffer cache and fill in some fake block size values
682 to allow us to read in the on-disk superblock. */
683 sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
684 sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
685 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
686 GFS2_BASIC_BLOCK_SHIFT;
687 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
688
689 error = init_names(sdp, silent);
690 if (error)
691 goto fail;
692
693 error = gfs2_sys_fs_add(sdp);
694 if (error)
695 goto fail;
696
697 error = gfs2_lm_mount(sdp, silent);
698 if (error)
699 goto fail_sys;
700
701 error = init_locking(sdp, &mount_gh, DO);
702 if (error)
703 goto fail_lm;
704
705 error = init_sb(sdp, silent, DO);
706 if (error)
707 goto fail_locking;
708
709 error = init_inodes(sdp, DO);
710 if (error)
711 goto fail_sb;
712
713 error = init_per_node(sdp, DO);
714 if (error)
715 goto fail_inodes;
716
717 error = gfs2_statfs_init(sdp);
718 if (error) {
719 fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
720 goto fail_per_node;
721 }
722
723 error = init_threads(sdp, DO);
724 if (error)
725 goto fail_per_node;
726
727 if (!(sb->s_flags & MS_RDONLY)) {
728 error = gfs2_make_fs_rw(sdp);
729 if (error) {
730 fs_err(sdp, "can't make FS RW: %d\n", error);
731 goto fail_threads;
732 }
733 }
734
735 gfs2_glock_dq_uninit(&mount_gh);
736
737 return 0;
738
739fail_threads:
740 init_threads(sdp, UNDO);
741fail_per_node:
742 init_per_node(sdp, UNDO);
743fail_inodes:
744 init_inodes(sdp, UNDO);
745fail_sb:
746 init_sb(sdp, 0, UNDO);
747fail_locking:
748 init_locking(sdp, &mount_gh, UNDO);
749fail_lm:
750 gfs2_gl_hash_clear(sdp, WAIT);
751 gfs2_lm_unmount(sdp);
752 while (invalidate_inodes(sb))
753 yield();
754fail_sys:
755 gfs2_sys_fs_del(sdp);
756fail:
757 kfree(sdp);
758 sb->s_fs_info = NULL;
759 return error;
760}
761
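Each init_*() helper above doubles as its own teardown: fill_super() calls it with DO on the way up and re-enters it with UNDO on the failure path, which keeps each unwind step next to the setup it reverses. A generic sketch of the idiom, with made-up step names:

static int init_step(int undo)                  /* illustrative only */
{
        int error = 0;

        if (undo)
                goto undo_b;

        error = setup_a();                      /* hypothetical sub-steps */
        if (error)
                return error;
        error = setup_b();
        if (error)
                goto undo_a;
        return 0;

undo_b:
        teardown_b();
undo_a:
        teardown_a();
        return error;
}
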
762static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
763 const char *dev_name, void *data, struct vfsmount *mnt)
764{
765 struct super_block *sb;
766 struct gfs2_sbd *sdp;
767 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
768 if (error)
769 goto out;
770 sb = mnt->mnt_sb;
771 sdp = sb->s_fs_info;
772 sdp->sd_gfs2mnt = mnt;
773out:
774 return error;
775}
776
777static int fill_super_meta(struct super_block *sb, struct super_block *new,
778 void *data, int silent)
779{
780 struct gfs2_sbd *sdp = sb->s_fs_info;
781 struct inode *inode;
782 int error = 0;
783
784 new->s_fs_info = sdp;
785 sdp->sd_vfs_meta = sb;
786
787 init_vfs(new, SDF_NOATIME);
788
789 /* Get the master inode */
790 inode = igrab(sdp->sd_master_dir);
791
792 new->s_root = d_alloc_root(inode);
793 if (!new->s_root) {
794 fs_err(sdp, "can't get root dentry\n");
795 error = -ENOMEM;
796 iput(inode);
 return error;
797 }
798 new->s_root->d_op = &gfs2_dops;
799
800 return error;
801}
802
803static int set_bdev_super(struct super_block *s, void *data)
804{
805 s->s_bdev = data;
806 s->s_dev = s->s_bdev->bd_dev;
807 return 0;
808}
809
810static int test_bdev_super(struct super_block *s, void *data)
811{
812 return s->s_bdev == data;
813}
814
815static struct super_block* get_gfs2_sb(const char *dev_name)
816{
817 struct kstat stat;
818 struct nameidata nd;
819 struct file_system_type *fstype;
820 struct super_block *sb = NULL, *s;
821 struct list_head *l;
822 int error;
823
824 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
825 if (error) {
826 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
827 dev_name, error);
828 goto out;
829 }
830 error = vfs_getattr(nd.mnt, nd.dentry, &stat);
 if (error)
 goto free_nd;
831
832 fstype = get_fs_type("gfs2");
833 list_for_each(l, &fstype->fs_supers) {
834 s = list_entry(l, struct super_block, s_instances);
835 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
836 (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
837 sb = s;
838 goto free_nd;
839 }
840 }
841
842 printk(KERN_WARNING "GFS2: Unrecognized block device or "
843 "mount point %s\n", dev_name);
844
845free_nd:
846 path_release(&nd);
847out:
848 return sb;
849}
850
851static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
852 const char *dev_name, void *data, struct vfsmount *mnt)
853{
854 int error = 0;
855 struct super_block *sb = NULL, *new;
856 struct gfs2_sbd *sdp;
857 char *gfs2mnt = NULL;
858
859 sb = get_gfs2_sb(dev_name);
860 if (!sb) {
861 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
862 error = -ENOENT;
863 goto error;
864 }
865 sdp = (struct gfs2_sbd*) sb->s_fs_info;
866 if (sdp->sd_vfs_meta) {
867 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
868 error = -EBUSY;
869 goto error;
870 }
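/* sget() is called under bd_mount_mutex to serialise against other
   mounts and unmounts of the same block device */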
871 mutex_lock(&sb->s_bdev->bd_mount_mutex);
872 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
873 mutex_unlock(&sb->s_bdev->bd_mount_mutex);
874 if (IS_ERR(new)) {
875 error = PTR_ERR(new);
876 goto error;
877 }
878 module_put(fs_type->owner);
879 new->s_flags = flags;
880 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
881 sb_set_blocksize(new, sb->s_blocksize);
882 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
883 if (error) {
884 up_write(&new->s_umount);
885 deactivate_super(new);
886 goto error;
887 }
888
889 new->s_flags |= MS_ACTIVE;
890
891 /* Grab a reference to the gfs2 mount point */
892 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
893 return simple_set_mnt(mnt, new);
894error:
895 if (gfs2mnt)
896 kfree(gfs2mnt);
897 return error;
898}
899
900static void gfs2_kill_sb(struct super_block *sb)
901{
902 kill_block_super(sb);
903}
904
905static void gfs2_kill_sb_meta(struct super_block *sb)
906{
907 struct gfs2_sbd *sdp = sb->s_fs_info;
908 generic_shutdown_super(sb);
909 sdp->sd_vfs_meta = NULL;
910 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
911}
912
913struct file_system_type gfs2_fs_type = {
914 .name = "gfs2",
915 .fs_flags = FS_REQUIRES_DEV,
916 .get_sb = gfs2_get_sb,
917 .kill_sb = gfs2_kill_sb,
918 .owner = THIS_MODULE,
919};
920
921struct file_system_type gfs2meta_fs_type = {
922 .name = "gfs2meta",
923 .fs_flags = FS_REQUIRES_DEV,
924 .get_sb = gfs2_get_sb_meta,
925 .kill_sb = gfs2_kill_sb_meta,
926 .owner = THIS_MODULE,
927};
928
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..7cc2c296271b
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type;
17
18#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..ef6e5ed70e94
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1151 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23#include <asm/uaccess.h>
24
25#include "gfs2.h"
26#include "incore.h"
27#include "acl.h"
28#include "bmap.h"
29#include "dir.h"
30#include "eaops.h"
31#include "eattr.h"
32#include "glock.h"
33#include "inode.h"
34#include "meta_io.h"
35#include "ops_dentry.h"
36#include "ops_inode.h"
37#include "quota.h"
38#include "rgrp.h"
39#include "trans.h"
40#include "util.h"
41
42/**
43 * gfs2_create - Create a file
44 * @dir: The directory in which to create the file
45 * @dentry: The dentry of the new file
46 * @mode: The mode of the new file
47 * @nd: intent and lookup data passed in from the VFS
48 * Returns: errno
49 */
50
51static int gfs2_create(struct inode *dir, struct dentry *dentry,
52 int mode, struct nameidata *nd)
53{
54 struct gfs2_inode *dip = GFS2_I(dir);
55 struct gfs2_sbd *sdp = GFS2_SB(dir);
56 struct gfs2_holder ghs[2];
57 struct inode *inode;
58
59 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
60
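/* Loop until the create succeeds: on -EEXIST (without O_EXCL) fall back
   to a lookup, and if the raced-with entry has vanished again by then,
   retry the create */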
61 for (;;) {
62 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
63 if (!IS_ERR(inode)) {
64 gfs2_trans_end(sdp);
65 if (dip->i_alloc.al_rgd)
66 gfs2_inplace_release(dip);
67 gfs2_quota_unlock(dip);
68 gfs2_alloc_put(dip);
69 gfs2_glock_dq_uninit_m(2, ghs);
70 mark_inode_dirty(inode);
71 break;
72 } else if (PTR_ERR(inode) != -EEXIST ||
73 (nd->intent.open.flags & O_EXCL)) {
74 gfs2_holder_uninit(ghs);
75 return PTR_ERR(inode);
76 }
77
78 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
79 if (inode) {
80 if (!IS_ERR(inode)) {
81 gfs2_holder_uninit(ghs);
82 break;
83 } else {
84 gfs2_holder_uninit(ghs);
85 return PTR_ERR(inode);
86 }
87 }
88 }
89
90 d_instantiate(dentry, inode);
91
92 return 0;
93}
94
95/**
96 * gfs2_lookup - Look up a filename in a directory and return its inode
97 * @dir: The directory inode
98 * @dentry: The dentry of the new inode
99 * @nd: passed from Linux VFS, ignored by us
100 *
101 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
102 *
103 * Returns: errno
104 */
105
106static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
107 struct nameidata *nd)
108{
109 struct inode *inode = NULL;
110
111 dentry->d_op = &gfs2_dops;
112
113 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
114 if (inode && IS_ERR(inode))
115 return ERR_PTR(PTR_ERR(inode));
116
117 if (inode)
118 return d_splice_alias(inode, dentry);
119 d_add(dentry, inode);
120
121 return NULL;
122}
123
124/**
125 * gfs2_link - Link to a file
126 * @old_dentry: The inode to link
127 * @dir: Add link to this directory
128 * @dentry: The name of the link
129 *
130 * Link the inode in "old_dentry" into the directory "dir" with the
131 * name in "dentry".
132 *
133 * Returns: errno
134 */
135
136static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
137 struct dentry *dentry)
138{
139 struct gfs2_inode *dip = GFS2_I(dir);
140 struct gfs2_sbd *sdp = GFS2_SB(dir);
141 struct inode *inode = old_dentry->d_inode;
142 struct gfs2_inode *ip = GFS2_I(inode);
143 struct gfs2_holder ghs[2];
144 int alloc_required;
145 int error;
146
147 if (S_ISDIR(ip->i_di.di_mode))
148 return -EPERM;
149
150 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
151 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
152
153 error = gfs2_glock_nq_m(2, ghs);
154 if (error)
155 goto out;
156
157 error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
158 if (error)
159 goto out_gunlock;
160
161 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
162 switch (error) {
163 case -ENOENT:
164 break;
165 case 0:
166 error = -EEXIST;
167 default:
168 goto out_gunlock;
169 }
170
171 error = -EINVAL;
172 if (!dip->i_di.di_nlink)
173 goto out_gunlock;
174 error = -EFBIG;
175 if (dip->i_di.di_entries == (u32)-1)
176 goto out_gunlock;
177 error = -EPERM;
178 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
179 goto out_gunlock;
180 error = -EINVAL;
181 if (!ip->i_di.di_nlink)
182 goto out_gunlock;
183 error = -EMLINK;
184 if (ip->i_di.di_nlink == (u32)-1)
185 goto out_gunlock;
186
187 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
188 if (error < 0)
189 goto out_gunlock;
190 error = 0;
191
192 if (alloc_required) {
193 struct gfs2_alloc *al = gfs2_alloc_get(dip);
194
195 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
196 if (error)
197 goto out_alloc;
198
199 error = gfs2_quota_check(dip, dip->i_di.di_uid,
200 dip->i_di.di_gid);
201 if (error)
202 goto out_gunlock_q;
203
204 al->al_requested = sdp->sd_max_dirres;
205
206 error = gfs2_inplace_reserve(dip);
207 if (error)
208 goto out_gunlock_q;
209
210 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
211 al->al_rgd->rd_ri.ri_length +
212 2 * RES_DINODE + RES_STATFS +
213 RES_QUOTA, 0);
214 if (error)
215 goto out_ipres;
216 } else {
217 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
218 if (error)
219 goto out_ipres;
220 }
221
222 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
223 IF2DT(ip->i_di.di_mode));
224 if (error)
225 goto out_end_trans;
226
227 error = gfs2_change_nlink(ip, +1);
228
229out_end_trans:
230 gfs2_trans_end(sdp);
231out_ipres:
232 if (alloc_required)
233 gfs2_inplace_release(dip);
234out_gunlock_q:
235 if (alloc_required)
236 gfs2_quota_unlock(dip);
237out_alloc:
238 if (alloc_required)
239 gfs2_alloc_put(dip);
240out_gunlock:
241 gfs2_glock_dq_m(2, ghs);
242out:
243 gfs2_holder_uninit(ghs);
244 gfs2_holder_uninit(ghs + 1);
245 if (!error) {
246 atomic_inc(&inode->i_count);
247 d_instantiate(dentry, inode);
248 mark_inode_dirty(inode);
249 }
250 return error;
251}
252
253/**
254 * gfs2_unlink - Unlink a file
255 * @dir: The inode of the directory containing the file to unlink
256 * @dentry: The file itself
257 *
258 * Unlink a file. Call gfs2_unlinki()
259 *
260 * Returns: errno
261 */
262
263static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
264{
265 struct gfs2_inode *dip = GFS2_I(dir);
266 struct gfs2_sbd *sdp = GFS2_SB(dir);
267 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
268 struct gfs2_holder ghs[2];
269 int error;
270
271 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
272 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
273
274 error = gfs2_glock_nq_m(2, ghs);
275 if (error)
276 goto out;
277
278 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
279 if (error)
280 goto out_gunlock;
281
282 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
283 if (error)
284 goto out_gunlock;
285
286 error = gfs2_dir_del(dip, &dentry->d_name);
287 if (error)
288 goto out_end_trans;
289
290 error = gfs2_change_nlink(ip, -1);
291
292out_end_trans:
293 gfs2_trans_end(sdp);
294out_gunlock:
295 gfs2_glock_dq_m(2, ghs);
296out:
297 gfs2_holder_uninit(ghs);
298 gfs2_holder_uninit(ghs + 1);
299 return error;
300}
301
302/**
303 * gfs2_symlink - Create a symlink
304 * @dir: The directory to create the symlink in
305 * @dentry: The dentry to put the symlink in
306 * @symname: The thing which the link points to
307 *
308 * Returns: errno
309 */
310
311static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
312 const char *symname)
313{
314 struct gfs2_inode *dip = GFS2_I(dir), *ip;
315 struct gfs2_sbd *sdp = GFS2_SB(dir);
316 struct gfs2_holder ghs[2];
317 struct inode *inode;
318 struct buffer_head *dibh;
319 int size;
320 int error;
321
322 /* Must be stuffed with a null terminator for gfs2_follow_link() */
323 size = strlen(symname);
324 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
325 return -ENAMETOOLONG;
326
327 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
328
329 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
330 if (IS_ERR(inode)) {
331 gfs2_holder_uninit(ghs);
332 return PTR_ERR(inode);
333 }
334
335 ip = ghs[1].gh_gl->gl_object;
336
337 ip->i_di.di_size = size;
338
339 error = gfs2_meta_inode_buffer(ip, &dibh);
340
341 if (!gfs2_assert_withdraw(sdp, !error)) {
342 gfs2_dinode_out(&ip->i_di, dibh->b_data);
343 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
344 size);
345 brelse(dibh);
346 }
347
348 gfs2_trans_end(sdp);
349 if (dip->i_alloc.al_rgd)
350 gfs2_inplace_release(dip);
351 gfs2_quota_unlock(dip);
352 gfs2_alloc_put(dip);
353
354 gfs2_glock_dq_uninit_m(2, ghs);
355
356 d_instantiate(dentry, inode);
357 mark_inode_dirty(inode);
358
359 return 0;
360}
361
362/**
363 * gfs2_mkdir - Make a directory
364 * @dir: The parent directory of the new one
365 * @dentry: The dentry of the new directory
366 * @mode: The mode of the new directory
367 *
368 * Returns: errno
369 */
370
371static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
372{
373 struct gfs2_inode *dip = GFS2_I(dir), *ip;
374 struct gfs2_sbd *sdp = GFS2_SB(dir);
375 struct gfs2_holder ghs[2];
376 struct inode *inode;
377 struct buffer_head *dibh;
378 int error;
379
380 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
381
382 inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
383 if (IS_ERR(inode)) {
384 gfs2_holder_uninit(ghs);
385 return PTR_ERR(inode);
386 }
387
388 ip = ghs[1].gh_gl->gl_object;
389
390 ip->i_di.di_nlink = 2;
391 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
392 ip->i_di.di_flags |= GFS2_DIF_JDATA;
393 ip->i_di.di_payload_format = GFS2_FORMAT_DE;
394 ip->i_di.di_entries = 2;
395
396 error = gfs2_meta_inode_buffer(ip, &dibh);
397
398 if (!gfs2_assert_withdraw(sdp, !error)) {
399 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
400 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
401 struct qstr str;
402
403 gfs2_str2qstr(&str, ".");
404 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
405 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
406 dent->de_inum = di->di_num; /* already GFS2 endian */
407 dent->de_type = cpu_to_be16(DT_DIR);
408 di->di_entries = cpu_to_be32(1);
409
410 gfs2_str2qstr(&str, "..");
411 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
412 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
413
414 gfs2_inum_out(&dip->i_num, &dent->de_inum);
415 dent->de_type = cpu_to_be16(DT_DIR);
416
417 gfs2_dinode_out(&ip->i_di, di);
418
419 brelse(dibh);
420 }
421
422 error = gfs2_change_nlink(dip, +1);
423 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
424
425 gfs2_trans_end(sdp);
426 if (dip->i_alloc.al_rgd)
427 gfs2_inplace_release(dip);
428 gfs2_quota_unlock(dip);
429 gfs2_alloc_put(dip);
430
431 gfs2_glock_dq_uninit_m(2, ghs);
432
433 d_instantiate(dentry, inode);
434 mark_inode_dirty(inode);
435
436 return 0;
437}
438
439/**
440 * gfs2_rmdir - Remove a directory
441 * @dir: The parent directory of the directory to be removed
442 * @dentry: The dentry of the directory to remove
443 *
444 * Remove a directory. Call gfs2_rmdiri()
445 *
446 * Returns: errno
447 */
448
449static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
450{
451 struct gfs2_inode *dip = GFS2_I(dir);
452 struct gfs2_sbd *sdp = GFS2_SB(dir);
453 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
454 struct gfs2_holder ghs[2];
455 int error;
456
457 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
458 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
459
460 error = gfs2_glock_nq_m(2, ghs);
461 if (error)
462 goto out;
463
464 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
465 if (error)
466 goto out_gunlock;
467
468 if (ip->i_di.di_entries < 2) {
469 if (gfs2_consist_inode(ip))
470 gfs2_dinode_print(&ip->i_di);
471 error = -EIO;
472 goto out_gunlock;
473 }
474 if (ip->i_di.di_entries > 2) {
475 error = -ENOTEMPTY;
476 goto out_gunlock;
477 }
478
479 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
480 if (error)
481 goto out_gunlock;
482
483 error = gfs2_rmdiri(dip, &dentry->d_name, ip);
484
485 gfs2_trans_end(sdp);
486
487out_gunlock:
488 gfs2_glock_dq_m(2, ghs);
489out:
490 gfs2_holder_uninit(ghs);
491 gfs2_holder_uninit(ghs + 1);
492 return error;
493}
494
495/**
496 * gfs2_mknod - Make a special file
497 * @dir: The directory in which the special file will reside
498 * @dentry: The dentry of the special file
499 * @mode: The mode of the special file
500 * @dev: The device specification of the special file
501 * Returns: errno
502 */
503
504static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
505 dev_t dev)
506{
507 struct gfs2_inode *dip = GFS2_I(dir), *ip;
508 struct gfs2_sbd *sdp = GFS2_SB(dir);
509 struct gfs2_holder ghs[2];
510 struct inode *inode;
511 struct buffer_head *dibh;
512 u32 major = 0, minor = 0;
513 int error;
514
515 switch (mode & S_IFMT) {
516 case S_IFBLK:
517 case S_IFCHR:
518 major = MAJOR(dev);
519 minor = MINOR(dev);
520 break;
521 case S_IFIFO:
522 case S_IFSOCK:
523 break;
524 default:
525 return -EOPNOTSUPP;
526 }
527
528 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
529
530 inode = gfs2_createi(ghs, &dentry->d_name, mode);
531 if (IS_ERR(inode)) {
532 gfs2_holder_uninit(ghs);
533 return PTR_ERR(inode);
534 }
535
536 ip = ghs[1].gh_gl->gl_object;
537
538 ip->i_di.di_major = major;
539 ip->i_di.di_minor = minor;
540
541 error = gfs2_meta_inode_buffer(ip, &dibh);
542
543 if (!gfs2_assert_withdraw(sdp, !error)) {
544 gfs2_dinode_out(&ip->i_di, dibh->b_data);
545 brelse(dibh);
546 }
547
548 gfs2_trans_end(sdp);
549 if (dip->i_alloc.al_rgd)
550 gfs2_inplace_release(dip);
551 gfs2_quota_unlock(dip);
552 gfs2_alloc_put(dip);
553
554 gfs2_glock_dq_uninit_m(2, ghs);
555
556 d_instantiate(dentry, inode);
557 mark_inode_dirty(inode);
558
559 return 0;
560}
561
562/**
563 * gfs2_rename - Rename a file
564 * @odir: Parent directory of old file name
565 * @odentry: The old dentry of the file
566 * @ndir: Parent directory of new file name
567 * @ndentry: The new dentry of the file
568 *
569 * Returns: errno
570 */
571
572static int gfs2_rename(struct inode *odir, struct dentry *odentry,
573 struct inode *ndir, struct dentry *ndentry)
574{
575 struct gfs2_inode *odip = GFS2_I(odir);
576 struct gfs2_inode *ndip = GFS2_I(ndir);
577 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
578 struct gfs2_inode *nip = NULL;
579 struct gfs2_sbd *sdp = GFS2_SB(odir);
580 struct gfs2_holder ghs[4], r_gh;
581 unsigned int num_gh;
582 int dir_rename = 0;
583 int alloc_required;
584 unsigned int x;
585 int error;
586
587 if (ndentry->d_inode) {
588 nip = GFS2_I(ndentry->d_inode);
589 if (ip == nip)
590 return 0;
591 }
592
593 /* Make sure we aren't trying to move a directory into its own subdirectory */
594
595 if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
596 dir_rename = 1;
597
598 error = gfs2_glock_nq_init(sdp->sd_rename_gl,
599 LM_ST_EXCLUSIVE, 0,
600 &r_gh);
601 if (error)
602 goto out;
603
604 error = gfs2_ok_to_move(ip, ndip);
605 if (error)
606 goto out_gunlock_r;
607 }
608
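/* Acquire the glocks in a fixed order: old dir, new dir (if different),
   the inode being moved, then the existing target inode (if any) */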
609 num_gh = 1;
610 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
611 if (odip != ndip) {
612 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
613 num_gh++;
614 }
615 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
616 num_gh++;
617
618 if (nip) {
619 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
620 num_gh++;
621 }
622
623 error = gfs2_glock_nq_m(num_gh, ghs);
624 if (error)
625 goto out_uninit;
626
627 /* Check out the old directory */
628
629 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
630 if (error)
631 goto out_gunlock;
632
633 /* Check out the new directory */
634
635 if (nip) {
636 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
637 if (error)
638 goto out_gunlock;
639
640 if (S_ISDIR(nip->i_di.di_mode)) {
641 if (nip->i_di.di_entries < 2) {
642 if (gfs2_consist_inode(nip))
643 gfs2_dinode_print(&nip->i_di);
644 error = -EIO;
645 goto out_gunlock;
646 }
647 if (nip->i_di.di_entries > 2) {
648 error = -ENOTEMPTY;
649 goto out_gunlock;
650 }
651 }
652 } else {
653 error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
654 if (error)
655 goto out_gunlock;
656
657 error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
658 switch (error) {
659 case -ENOENT:
660 error = 0;
661 break;
662 case 0:
663 error = -EEXIST;
664 default:
665 goto out_gunlock;
666 }
667
668 if (odip != ndip) {
669 if (!ndip->i_di.di_nlink) {
670 error = -EINVAL;
671 goto out_gunlock;
672 }
673 if (ndip->i_di.di_entries == (u32)-1) {
674 error = -EFBIG;
675 goto out_gunlock;
676 }
677 if (S_ISDIR(ip->i_di.di_mode) &&
678 ndip->i_di.di_nlink == (u32)-1) {
679 error = -EMLINK;
680 goto out_gunlock;
681 }
682 }
683 }
684
685 /* Check out the dir to be renamed */
686
687 if (dir_rename) {
688 error = permission(odentry->d_inode, MAY_WRITE, NULL);
689 if (error)
690 goto out_gunlock;
691 }
692
693 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
694 if (error < 0)
695 goto out_gunlock;
696 error = 0;
697
698 if (alloc_required) {
699 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
700
701 error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
702 if (error)
703 goto out_alloc;
704
705 error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
706 ndip->i_di.di_gid);
707 if (error)
708 goto out_gunlock_q;
709
710 al->al_requested = sdp->sd_max_dirres;
711
712 error = gfs2_inplace_reserve(ndip);
713 if (error)
714 goto out_gunlock_q;
715
716 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
717 al->al_rgd->rd_ri.ri_length +
718 4 * RES_DINODE + 4 * RES_LEAF +
719 RES_STATFS + RES_QUOTA, 0);
720 if (error)
721 goto out_ipreserv;
722 } else {
723 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
724 5 * RES_LEAF, 0);
725 if (error)
726 goto out_gunlock;
727 }
728
729 /* Remove the target file, if it exists */
730
731 if (nip) {
732 if (S_ISDIR(nip->i_di.di_mode))
733 error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
734 else {
735 error = gfs2_dir_del(ndip, &ndentry->d_name);
736 if (error)
737 goto out_end_trans;
738 error = gfs2_change_nlink(nip, -1);
739 }
740 if (error)
741 goto out_end_trans;
742 }
743
744 if (dir_rename) {
745 struct qstr name;
746 gfs2_str2qstr(&name, "..");
747
748 error = gfs2_change_nlink(ndip, +1);
749 if (error)
750 goto out_end_trans;
751 error = gfs2_change_nlink(odip, -1);
752 if (error)
753 goto out_end_trans;
754
755 error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
756 if (error)
757 goto out_end_trans;
758 } else {
759 struct buffer_head *dibh;
760 error = gfs2_meta_inode_buffer(ip, &dibh);
761 if (error)
762 goto out_end_trans;
763 ip->i_di.di_ctime = get_seconds();
764 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
765 gfs2_dinode_out(&ip->i_di, dibh->b_data);
766 brelse(dibh);
767 }
768
769 error = gfs2_dir_del(odip, &odentry->d_name);
770 if (error)
771 goto out_end_trans;
772
773 error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
774 IF2DT(ip->i_di.di_mode));
775 if (error)
776 goto out_end_trans;
777
778out_end_trans:
779 gfs2_trans_end(sdp);
780out_ipreserv:
781 if (alloc_required)
782 gfs2_inplace_release(ndip);
783out_gunlock_q:
784 if (alloc_required)
785 gfs2_quota_unlock(ndip);
786out_alloc:
787 if (alloc_required)
788 gfs2_alloc_put(ndip);
789out_gunlock:
790 gfs2_glock_dq_m(num_gh, ghs);
791out_uninit:
792 for (x = 0; x < num_gh; x++)
793 gfs2_holder_uninit(ghs + x);
794out_gunlock_r:
795 if (dir_rename)
796 gfs2_glock_dq_uninit(&r_gh);
797out:
798 return error;
799}
800
801/**
802 * gfs2_readlink - Read the value of a symlink
803 * @dentry: the symlink
804 * @user_buf: the user buffer to read the symlink data into
805 * @user_size: the size of the buffer
806 *
807 * Returns: errno
808 */
809
810static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
811 int user_size)
812{
813 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
814 char array[GFS2_FAST_NAME_SIZE], *buf = array;
815 unsigned int len = GFS2_FAST_NAME_SIZE;
816 int error;
817
818 error = gfs2_readlinki(ip, &buf, &len);
819 if (error)
820 return error;
821
822 if (user_size > len - 1)
823 user_size = len - 1;
824
825 if (copy_to_user(user_buf, buf, user_size))
826 error = -EFAULT;
827 else
828 error = user_size;
829
830 if (buf != array)
831 kfree(buf);
832
833 return error;
834}
835
836/**
837 * gfs2_follow_link - Follow a symbolic link
838 * @dentry: The dentry of the link
839 * @nd: Data that we pass to vfs_follow_link()
840 *
841 * This can handle symlinks of any size. It is optimised for symlinks
842 * under GFS2_FAST_NAME_SIZE.
843 *
844 * Returns: 0 on success or error code
845 */
846
847static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
848{
849 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
850 char array[GFS2_FAST_NAME_SIZE], *buf = array;
851 unsigned int len = GFS2_FAST_NAME_SIZE;
852 int error;
853
854 error = gfs2_readlinki(ip, &buf, &len);
855 if (!error) {
856 error = vfs_follow_link(nd, buf);
857 if (buf != array)
858 kfree(buf);
859 }
860
861 return ERR_PTR(error);
862}
863
864/**
865 * gfs2_permission - Check if access to an inode is permitted
866 * @inode: The inode to check
867 * @mask: The access mask (MAY_READ, MAY_WRITE, MAY_EXEC)
868 * @nd: passed from Linux VFS, ignored by us
869 *
870 * Returns: errno
871 */
872
873static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
874{
875 struct gfs2_inode *ip = GFS2_I(inode);
876 struct gfs2_holder i_gh;
877 int error;
878
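/* Fast path: if the cached inode is current (inode and glock version
   numbers match), no locking is needed for the permission check */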
879 if (ip->i_vn == ip->i_gl->gl_vn)
880 return generic_permission(inode, mask, gfs2_check_acl);
881
882 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
883 if (!error) {
884 error = generic_permission(inode, mask, gfs2_check_acl_locked);
885 gfs2_glock_dq_uninit(&i_gh);
886 }
887
888 return error;
889}
890
891static int setattr_size(struct inode *inode, struct iattr *attr)
892{
893 struct gfs2_inode *ip = GFS2_I(inode);
894 int error;
895
896 if (attr->ia_size != ip->i_di.di_size) {
897 error = vmtruncate(inode, attr->ia_size);
898 if (error)
899 return error;
900 }
901
902 error = gfs2_truncatei(ip, attr->ia_size);
903 if (error)
904 return error;
905
906 return 0;
907}
908
909static int setattr_chown(struct inode *inode, struct iattr *attr)
910{
911 struct gfs2_inode *ip = GFS2_I(inode);
912 struct gfs2_sbd *sdp = GFS2_SB(inode);
913 struct buffer_head *dibh;
914 u32 ouid, ogid, nuid, ngid;
915 int error;
916
917 ouid = ip->i_di.di_uid;
918 ogid = ip->i_di.di_gid;
919 nuid = attr->ia_uid;
920 ngid = attr->ia_gid;
921
922 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
923 ouid = nuid = NO_QUOTA_CHANGE;
924 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
925 ogid = ngid = NO_QUOTA_CHANGE;
926
927 gfs2_alloc_get(ip);
928
929 error = gfs2_quota_lock(ip, nuid, ngid);
930 if (error)
931 goto out_alloc;
932
933 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
934 error = gfs2_quota_check(ip, nuid, ngid);
935 if (error)
936 goto out_gunlock_q;
937 }
938
939 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
940 if (error)
941 goto out_gunlock_q;
942
943 error = gfs2_meta_inode_buffer(ip, &dibh);
944 if (error)
945 goto out_end_trans;
946
947 error = inode_setattr(inode, attr);
948 gfs2_assert_warn(sdp, !error);
949 gfs2_inode_attr_out(ip);
950
951 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
952 gfs2_dinode_out(&ip->i_di, dibh->b_data);
953 brelse(dibh);
954
955 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
956 gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
957 gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
958 }
959
960out_end_trans:
961 gfs2_trans_end(sdp);
962out_gunlock_q:
963 gfs2_quota_unlock(ip);
964out_alloc:
965 gfs2_alloc_put(ip);
966 return error;
967}
968
969/**
970 * gfs2_setattr - Change attributes on an inode
971 * @dentry: The dentry which is changing
972 * @attr: The structure describing the change
973 *
974 * The VFS layer wants to change one or more of an inode's attributes. Write
975 * that change out to disk.
976 *
977 * Returns: errno
978 */
979
980static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
981{
982 struct inode *inode = dentry->d_inode;
983 struct gfs2_inode *ip = GFS2_I(inode);
984 struct gfs2_holder i_gh;
985 int error;
986
987 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
988 if (error)
989 return error;
990
991 error = -EPERM;
992 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
993 goto out;
994
995 error = inode_change_ok(inode, attr);
996 if (error)
997 goto out;
998
999 if (attr->ia_valid & ATTR_SIZE)
1000 error = setattr_size(inode, attr);
1001 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1002 error = setattr_chown(inode, attr);
1003 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1004 error = gfs2_acl_chmod(ip, attr);
1005 else
1006 error = gfs2_setattr_simple(ip, attr);
1007
1008out:
1009 gfs2_glock_dq_uninit(&i_gh);
1010 if (!error)
1011 mark_inode_dirty(inode);
1012 return error;
1013}
1014
1015/**
1016 * gfs2_getattr - Read out an inode's attributes
1017 * @mnt: The vfsmount the inode is being accessed from
1018 * @dentry: The dentry to stat
1019 * @stat: The inode's stats
1020 *
1021 * Returns: errno
1022 */
1023
1024static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1025 struct kstat *stat)
1026{
1027 struct inode *inode = dentry->d_inode;
1028 struct gfs2_inode *ip = GFS2_I(inode);
1029 struct gfs2_holder gh;
1030 int error;
1031
1032 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1033 if (!error) {
1034 generic_fillattr(inode, stat);
1035 gfs2_glock_dq_uninit(&gh);
1036 }
1037
1038 return error;
1039}
1040
1041static int gfs2_setxattr(struct dentry *dentry, const char *name,
1042 const void *data, size_t size, int flags)
1043{
1044 struct inode *inode = dentry->d_inode;
1045 struct gfs2_ea_request er;
1046
1047 memset(&er, 0, sizeof(struct gfs2_ea_request));
1048 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1049 if (er.er_type == GFS2_EATYPE_UNUSED)
1050 return -EOPNOTSUPP;
1051 er.er_data = (char *)data;
1052 er.er_name_len = strlen(er.er_name);
1053 er.er_data_len = size;
1054 er.er_flags = flags;
1055
1056 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1057
1058 return gfs2_ea_set(GFS2_I(inode), &er);
1059}
1060
1061static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1062 void *data, size_t size)
1063{
1064 struct gfs2_ea_request er;
1065
1066 memset(&er, 0, sizeof(struct gfs2_ea_request));
1067 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1068 if (er.er_type == GFS2_EATYPE_UNUSED)
1069 return -EOPNOTSUPP;
1070 er.er_data = data;
1071 er.er_name_len = strlen(er.er_name);
1072 er.er_data_len = size;
1073
1074 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1075}
1076
1077static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1078{
1079 struct gfs2_ea_request er;
1080
1081 memset(&er, 0, sizeof(struct gfs2_ea_request));
1082 er.er_data = (size) ? buffer : NULL;
1083 er.er_data_len = size;
1084
1085 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
1086}
1087
1088static int gfs2_removexattr(struct dentry *dentry, const char *name)
1089{
1090 struct gfs2_ea_request er;
1091
1092 memset(&er, 0, sizeof(struct gfs2_ea_request));
1093 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1094 if (er.er_type == GFS2_EATYPE_UNUSED)
1095 return -EOPNOTSUPP;
1096 er.er_name_len = strlen(er.er_name);
1097
1098 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1099}
1100
1101struct inode_operations gfs2_file_iops = {
1102 .permission = gfs2_permission,
1103 .setattr = gfs2_setattr,
1104 .getattr = gfs2_getattr,
1105 .setxattr = gfs2_setxattr,
1106 .getxattr = gfs2_getxattr,
1107 .listxattr = gfs2_listxattr,
1108 .removexattr = gfs2_removexattr,
1109};
1110
1111struct inode_operations gfs2_dev_iops = {
1112 .permission = gfs2_permission,
1113 .setattr = gfs2_setattr,
1114 .getattr = gfs2_getattr,
1115 .setxattr = gfs2_setxattr,
1116 .getxattr = gfs2_getxattr,
1117 .listxattr = gfs2_listxattr,
1118 .removexattr = gfs2_removexattr,
1119};
1120
1121struct inode_operations gfs2_dir_iops = {
1122 .create = gfs2_create,
1123 .lookup = gfs2_lookup,
1124 .link = gfs2_link,
1125 .unlink = gfs2_unlink,
1126 .symlink = gfs2_symlink,
1127 .mkdir = gfs2_mkdir,
1128 .rmdir = gfs2_rmdir,
1129 .mknod = gfs2_mknod,
1130 .rename = gfs2_rename,
1131 .permission = gfs2_permission,
1132 .setattr = gfs2_setattr,
1133 .getattr = gfs2_getattr,
1134 .setxattr = gfs2_setxattr,
1135 .getxattr = gfs2_getxattr,
1136 .listxattr = gfs2_listxattr,
1137 .removexattr = gfs2_removexattr,
1138};
1139
1140struct inode_operations gfs2_symlink_iops = {
1141 .readlink = gfs2_readlink,
1142 .follow_link = gfs2_follow_link,
1143 .permission = gfs2_permission,
1144 .setattr = gfs2_setattr,
1145 .getattr = gfs2_getattr,
1146 .setxattr = gfs2_setxattr,
1147 .getxattr = gfs2_getxattr,
1148 .listxattr = gfs2_listxattr,
1149 .removexattr = gfs2_removexattr,
1150};
1151
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..b15acb4fd34c
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct inode_operations gfs2_file_iops;
16extern struct inode_operations gfs2_dir_iops;
17extern struct inode_operations gfs2_symlink_iops;
18extern struct inode_operations gfs2_dev_iops;
19
20#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..06f06f7773d0
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,468 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/seq_file.h>
17#include <linux/mount.h>
18#include <linux/kthread.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <linux/lm_interface.h>
23
24#include "gfs2.h"
25#include "incore.h"
26#include "glock.h"
27#include "inode.h"
28#include "lm.h"
29#include "log.h"
30#include "mount.h"
31#include "ops_super.h"
32#include "quota.h"
33#include "recovery.h"
34#include "rgrp.h"
35#include "super.h"
36#include "sys.h"
37#include "util.h"
38#include "trans.h"
39#include "dir.h"
40#include "eattr.h"
41#include "bmap.h"
42
43/**
44 * gfs2_write_inode - Make sure the inode is stable on the disk
45 * @inode: The inode
46 * @sync: synchronous write flag
47 *
48 * Returns: errno
49 */
50
51static int gfs2_write_inode(struct inode *inode, int sync)
52{
53 struct gfs2_inode *ip = GFS2_I(inode);
54
55 /* Check this is a "normal" inode */
56 if (inode->i_private) {
57 if (current->flags & PF_MEMALLOC)
58 return 0;
59 if (sync)
60 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
61 }
62
63 return 0;
64}
65
66/**
67 * gfs2_put_super - Unmount the filesystem
68 * @sb: The VFS superblock
69 *
70 */
71
72static void gfs2_put_super(struct super_block *sb)
73{
74 struct gfs2_sbd *sdp = sb->s_fs_info;
75 int error;
76
77 if (!sdp)
78 return;
79
80 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
81 return; /* Nothing to do */
82
83 /* Unfreeze the filesystem, if we need to */
84
85 mutex_lock(&sdp->sd_freeze_lock);
86 if (sdp->sd_freeze_count)
87 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
88 mutex_unlock(&sdp->sd_freeze_lock);
89
90 kthread_stop(sdp->sd_quotad_process);
91 kthread_stop(sdp->sd_logd_process);
92 kthread_stop(sdp->sd_recoverd_process);
93 while (sdp->sd_glockd_num--)
94 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
95 kthread_stop(sdp->sd_scand_process);
96
97 if (!(sb->s_flags & MS_RDONLY)) {
98 error = gfs2_make_fs_ro(sdp);
99 if (error)
100 gfs2_io_error(sdp);
101 }
102 /* At this point, we're through modifying the disk */
103
104 /* Release stuff */
105
106 iput(sdp->sd_master_dir);
107 iput(sdp->sd_jindex);
108 iput(sdp->sd_inum_inode);
109 iput(sdp->sd_statfs_inode);
110 iput(sdp->sd_rindex);
111 iput(sdp->sd_quota_inode);
112
113 gfs2_glock_put(sdp->sd_rename_gl);
114 gfs2_glock_put(sdp->sd_trans_gl);
115
116 if (!sdp->sd_args.ar_spectator) {
117 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
118 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
119 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
120 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
121 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
122 iput(sdp->sd_ir_inode);
123 iput(sdp->sd_sc_inode);
124 iput(sdp->sd_qc_inode);
125 }
126
127 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
128 gfs2_clear_rgrpd(sdp);
129 gfs2_jindex_free(sdp);
130 /* Take apart glock structures and buffer lists */
131 gfs2_gl_hash_clear(sdp, WAIT);
132 /* Unmount the locking protocol */
133 gfs2_lm_unmount(sdp);
134
135 /* At this point, we're through participating in the lockspace */
136 gfs2_sys_fs_del(sdp);
137 kfree(sdp);
138}
139
140/**
141 * gfs2_write_super - disk commit all incore transactions
142 * @sb: the filesystem
143 *
144 * This function is called every time sync(2) is called.
145 * After this exits, all dirty buffers are synced.
146 */
147
148static void gfs2_write_super(struct super_block *sb)
149{
150 gfs2_log_flush(sb->s_fs_info, NULL);
151}
152
153/**
154 * gfs2_write_super_lockfs - prevent further writes to the filesystem
155 * @sb: the VFS structure for the filesystem
156 *
157 */
158
159static void gfs2_write_super_lockfs(struct super_block *sb)
160{
161 struct gfs2_sbd *sdp = sb->s_fs_info;
162 int error;
163
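/* Keep retrying the freeze while journal recovery is in progress (-EBUSY) */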
164 for (;;) {
165 error = gfs2_freeze_fs(sdp);
166 if (!error)
167 break;
168
169 switch (error) {
170 case -EBUSY:
171 fs_err(sdp, "waiting for recovery before freeze\n");
172 break;
173
174 default:
175 fs_err(sdp, "error freezing FS: %d\n", error);
176 break;
177 }
178
179 fs_err(sdp, "retrying...\n");
180 msleep(1000);
181 }
182}
183
184/**
185 * gfs2_unlockfs - reallow writes to the filesystem
186 * @sb: the VFS structure for the filesystem
187 *
188 */
189
190static void gfs2_unlockfs(struct super_block *sb)
191{
192 gfs2_unfreeze_fs(sb->s_fs_info);
193}
194
195/**
196 * gfs2_statfs - Gather and return stats about the filesystem
197 * @dentry: a dentry within the filesystem to be stat'd
198 * @buf: the kstatfs buffer to fill in
199 *
200 * Returns: 0 on success or error code
201 */
202
203static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
204{
205 struct super_block *sb = dentry->d_inode->i_sb;
206 struct gfs2_sbd *sdp = sb->s_fs_info;
207 struct gfs2_statfs_change sc;
208 int error;
209
210 if (gfs2_tune_get(sdp, gt_statfs_slow))
211 error = gfs2_statfs_slow(sdp, &sc);
212 else
213 error = gfs2_statfs_i(sdp, &sc);
214
215 if (error)
216 return error;
217
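/* GFS2 allocates dinodes dynamically from the block pool, so every free
   block could in principle become an inode; report f_ffree accordingly */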
218 buf->f_type = GFS2_MAGIC;
219 buf->f_bsize = sdp->sd_sb.sb_bsize;
220 buf->f_blocks = sc.sc_total;
221 buf->f_bfree = sc.sc_free;
222 buf->f_bavail = sc.sc_free;
223 buf->f_files = sc.sc_dinodes + sc.sc_free;
224 buf->f_ffree = sc.sc_free;
225 buf->f_namelen = GFS2_FNAMESIZE;
226
227 return 0;
228}
229
230/**
231 * gfs2_remount_fs - called when the FS is remounted
232 * @sb: the filesystem
233 * @flags: the remount flags
234 * @data: extra data passed in (not used right now)
235 *
236 * Returns: errno
237 */
238
239static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
240{
241 struct gfs2_sbd *sdp = sb->s_fs_info;
242 int error;
243
244 error = gfs2_mount_args(sdp, data, 1);
245 if (error)
246 return error;
247
248 if (sdp->sd_args.ar_spectator)
249 *flags |= MS_RDONLY;
250 else {
251 if (*flags & MS_RDONLY) {
252 if (!(sb->s_flags & MS_RDONLY))
253 error = gfs2_make_fs_ro(sdp);
254 } else if (!(*flags & MS_RDONLY) &&
255 (sb->s_flags & MS_RDONLY)) {
256 error = gfs2_make_fs_rw(sdp);
257 }
258 }
259
260 if (*flags & (MS_NOATIME | MS_NODIRATIME))
261 set_bit(SDF_NOATIME, &sdp->sd_flags);
262 else
263 clear_bit(SDF_NOATIME, &sdp->sd_flags);
264
265 /* Don't let the VFS update atimes. GFS2 handles this itself. */
266 *flags |= MS_NOATIME | MS_NODIRATIME;
267
268 return error;
269}
270
271/**
272 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
273 * @inode: The VFS inode
274 *
275 */
276
277static void gfs2_clear_inode(struct inode *inode)
278{
279 /* This tells us it's a "real" inode and not one which only
280 * serves to contain an address space (see rgrp.c, meta_io.c)
281 * which therefore doesn't have its own glocks.
282 */
283 if (inode->i_private) {
284 struct gfs2_inode *ip = GFS2_I(inode);
285 gfs2_glock_inode_squish(inode);
286 gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
287 ip->i_gl->gl_object = NULL;
288 gfs2_glock_schedule_for_reclaim(ip->i_gl);
289 gfs2_glock_put(ip->i_gl);
290 ip->i_gl = NULL;
291 if (ip->i_iopen_gh.gh_gl)
292 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
293 }
294}
295
296/**
297 * gfs2_show_options - Show mount options for /proc/mounts
298 * @s: seq_file structure
299 * @mnt: vfsmount
300 *
301 * Returns: 0 on success or error code
302 */
303
304static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
305{
306 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
307 struct gfs2_args *args = &sdp->sd_args;
308
309 if (args->ar_lockproto[0])
310 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
311 if (args->ar_locktable[0])
312 seq_printf(s, ",locktable=%s", args->ar_locktable);
313 if (args->ar_hostdata[0])
314 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
315 if (args->ar_spectator)
316 seq_printf(s, ",spectator");
317 if (args->ar_ignore_local_fs)
318 seq_printf(s, ",ignore_local_fs");
319 if (args->ar_localflocks)
320 seq_printf(s, ",localflocks");
321 if (args->ar_localcaching)
322 seq_printf(s, ",localcaching");
323 if (args->ar_debug)
324 seq_printf(s, ",debug");
325 if (args->ar_upgrade)
326 seq_printf(s, ",upgrade");
327 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
328 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
329 if (args->ar_posix_acl)
330 seq_printf(s, ",acl");
331 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
332 char *state;
333 switch (args->ar_quota) {
334 case GFS2_QUOTA_OFF:
335 state = "off";
336 break;
337 case GFS2_QUOTA_ACCOUNT:
338 state = "account";
339 break;
340 case GFS2_QUOTA_ON:
341 state = "on";
342 break;
343 default:
344 state = "unknown";
345 break;
346 }
347 seq_printf(s, ",quota=%s", state);
348 }
349 if (args->ar_suiddir)
350 seq_printf(s, ",suiddir");
351 if (args->ar_data != GFS2_DATA_DEFAULT) {
352 char *state;
353 switch (args->ar_data) {
354 case GFS2_DATA_WRITEBACK:
355 state = "writeback";
356 break;
357 case GFS2_DATA_ORDERED:
358 state = "ordered";
359 break;
360 default:
361 state = "unknown";
362 break;
363 }
364 seq_printf(s, ",data=%s", state);
365 }
366
367 return 0;
368}
369
370/*
371 * We have to (at the moment) hold the inode's main lock to cover
372 * the gap between unlocking the shared lock on the iopen lock and
373 * taking the exclusive lock. I'd rather do a shared -> exclusive
374 * conversion on the iopen lock, but we can change that later. This
375 * is safe, just less efficient.
376 */
377static void gfs2_delete_inode(struct inode *inode)
378{
379 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
380 struct gfs2_inode *ip = GFS2_I(inode);
381 struct gfs2_holder gh;
382 int error;
383
384 if (!inode->i_private)
385 goto out;
386
387 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
388 if (unlikely(error)) {
389 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
390 goto out;
391 }
392
393 gfs2_glock_dq(&ip->i_iopen_gh);
394 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
395 error = gfs2_glock_nq(&ip->i_iopen_gh);
396 if (error)
397 goto out_uninit;
398
399 if (S_ISDIR(ip->i_di.di_mode) &&
400 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
401 error = gfs2_dir_exhash_dealloc(ip);
402 if (error)
403 goto out_unlock;
404 }
405
406 if (ip->i_di.di_eattr) {
407 error = gfs2_ea_dealloc(ip);
408 if (error)
409 goto out_unlock;
410 }
411
412 if (!gfs2_is_stuffed(ip)) {
413 error = gfs2_file_dealloc(ip);
414 if (error)
415 goto out_unlock;
416 }
417
418 error = gfs2_dinode_dealloc(ip);
419
420out_unlock:
421 gfs2_glock_dq(&ip->i_iopen_gh);
422out_uninit:
423 gfs2_holder_uninit(&ip->i_iopen_gh);
424 gfs2_glock_dq_uninit(&gh);
425 if (error)
426 fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
427out:
428 truncate_inode_pages(&inode->i_data, 0);
429 clear_inode(inode);
430}
431
432
433
434static struct inode *gfs2_alloc_inode(struct super_block *sb)
435{
436 struct gfs2_sbd *sdp = sb->s_fs_info;
437 struct gfs2_inode *ip;
438
439 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
440 if (!ip)
441 return NULL;
442 ip->i_flags = 0;
443 ip->i_gl = NULL;
444 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
445 ip->i_last_pfault = jiffies;
446 return &ip->i_inode;
447}
448
449static void gfs2_destroy_inode(struct inode *inode)
450{
451 kmem_cache_free(gfs2_inode_cachep, inode);
452}
453
454struct super_operations gfs2_super_ops = {
455 .alloc_inode = gfs2_alloc_inode,
456 .destroy_inode = gfs2_destroy_inode,
457 .write_inode = gfs2_write_inode,
458 .delete_inode = gfs2_delete_inode,
459 .put_super = gfs2_put_super,
460 .write_super = gfs2_write_super,
461 .write_super_lockfs = gfs2_write_super_lockfs,
462 .unlockfs = gfs2_unlockfs,
463 .statfs = gfs2_statfs,
464 .remount_fs = gfs2_remount_fs,
465 .clear_inode = gfs2_clear_inode,
466 .show_options = gfs2_show_options,
467};
468
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..9de73f042f78
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct super_operations gfs2_super_ops;
16
17#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..5453d2947ab3
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "ops_vm.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29#include "util.h"
30
31static void pfault_be_greedy(struct gfs2_inode *ip)
32{
33 unsigned int time;
34
35 spin_lock(&ip->i_spin);
36 time = ip->i_greedy;
37 ip->i_last_pfault = jiffies;
38 spin_unlock(&ip->i_spin);
39
40 igrab(&ip->i_inode);
41 if (gfs2_glock_be_greedy(ip->i_gl, time))
42 iput(&ip->i_inode);
43}
44
45static struct page *gfs2_private_nopage(struct vm_area_struct *area,
46 unsigned long address, int *type)
47{
48 struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
49 struct page *result;
50
51 set_bit(GIF_PAGED, &ip->i_flags);
52
53 result = filemap_nopage(area, address, type);
54
55 if (result && result != NOPAGE_OOM)
56 pfault_be_greedy(ip);
57
58 return result;
59}
60
61static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
62{
63 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
64 unsigned long index = page->index;
65 u64 lblock = index << (PAGE_CACHE_SHIFT -
66 sdp->sd_sb.sb_bsize_shift);
67 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
68 struct gfs2_alloc *al;
69 unsigned int data_blocks, ind_blocks;
70 unsigned int x;
71 int error;
72
73 al = gfs2_alloc_get(ip);
74
75 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
76 if (error)
77 goto out;
78
79 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
80 if (error)
81 goto out_gunlock_q;
82
83 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
84
85 al->al_requested = data_blocks + ind_blocks;
86
87 error = gfs2_inplace_reserve(ip);
88 if (error)
89 goto out_gunlock_q;
90
91 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
92 ind_blocks + RES_DINODE +
93 RES_STATFS + RES_QUOTA, 0);
94 if (error)
95 goto out_ipres;
96
97 if (gfs2_is_stuffed(ip)) {
98 error = gfs2_unstuff_dinode(ip, NULL);
99 if (error)
100 goto out_trans;
101 }
102
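/* Walk the page one filesystem extent at a time, allocating backing
   blocks as needed (new == 1 asks gfs2_extent_map to allocate) */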
103 for (x = 0; x < blocks; ) {
104 u64 dblock;
105 unsigned int extlen;
106 int new = 1;
107
108 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
109 if (error)
110 goto out_trans;
111
112 lblock += extlen;
113 x += extlen;
114 }
115
116 gfs2_assert_warn(sdp, al->al_alloced);
117
118out_trans:
119 gfs2_trans_end(sdp);
120out_ipres:
121 gfs2_inplace_release(ip);
122out_gunlock_q:
123 gfs2_quota_unlock(ip);
124out:
125 gfs2_alloc_put(ip);
126 return error;
127}
128
129static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
130 unsigned long address, int *type)
131{
132 struct file *file = area->vm_file;
133 struct gfs2_file *gf = file->private_data;
134 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
135 struct gfs2_holder i_gh;
136 struct page *result = NULL;
137 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
138 area->vm_pgoff;
139 int alloc_required;
140 int error;
141
142 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
143 if (error)
144 return NULL;
145
146 set_bit(GIF_PAGED, &ip->i_flags);
147 set_bit(GIF_SW_PAGED, &ip->i_flags);
148
149 error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
150 PAGE_CACHE_SIZE, &alloc_required);
151 if (error)
152 goto out;
153
154 set_bit(GFF_EXLOCK, &gf->f_flags);
155 result = filemap_nopage(area, address, type);
156 clear_bit(GFF_EXLOCK, &gf->f_flags);
157 if (!result || result == NOPAGE_OOM)
158 goto out;
159
160 if (alloc_required) {
161 error = alloc_page_backing(ip, result);
162 if (error) {
163 page_cache_release(result);
164 result = NULL;
165 goto out;
166 }
167 set_page_dirty(result);
168 }
169
170 pfault_be_greedy(ip);
171out:
172 gfs2_glock_dq_uninit(&i_gh);
173
174 return result;
175}
176
177struct vm_operations_struct gfs2_vm_ops_private = {
178 .nopage = gfs2_private_nopage,
179};
180
181struct vm_operations_struct gfs2_vm_ops_sharewrite = {
182 .nopage = gfs2_sharewrite_nopage,
183};
184
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..4ae8f43ed5e3
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13#include <linux/mm.h>
14
15extern struct vm_operations_struct gfs2_vm_ops_private;
16extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
17
18#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..c69b94a55588
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1227 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota
19 * check program to be run on node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with infinite nodes and infinite bandwidth) to twice the user's limit. (In
30 * practice, the maximum overrun you see should be much less.) A "quota_scale"
31 * number greater than one makes quota syncs more frequent and reduces the
32 * maximum overrun. Numbers less than one (but greater than zero) make quota
33 * syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/sort.h>
45#include <linux/fs.h>
46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h>
49
50#include "gfs2.h"
51#include "incore.h"
52#include "bmap.h"
53#include "glock.h"
54#include "glops.h"
55#include "log.h"
56#include "meta_io.h"
57#include "quota.h"
58#include "rgrp.h"
59#include "super.h"
60#include "trans.h"
61#include "inode.h"
62#include "ops_file.h"
63#include "ops_address.h"
64#include "util.h"
65
66#define QUOTA_USER 1
67#define QUOTA_GROUP 0
68
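/* User and group quota records are interleaved in the quota file:
   the user quota for ID n is at index 2n, the group quota at 2n + 1 */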
69static u64 qd2offset(struct gfs2_quota_data *qd)
70{
71 u64 offset;
72
73 offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
74 offset *= sizeof(struct gfs2_quota);
75
76 return offset;
77}
78
79static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
80 struct gfs2_quota_data **qdp)
81{
82 struct gfs2_quota_data *qd;
83 int error;
84
85 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
86 if (!qd)
87 return -ENOMEM;
88
89 qd->qd_count = 1;
90 qd->qd_id = id;
91 if (user)
92 set_bit(QDF_USER, &qd->qd_flags);
93 qd->qd_slot = -1;
94
95 error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
96 &gfs2_quota_glops, CREATE, &qd->qd_gl);
97 if (error)
98 goto fail;
99
100 error = gfs2_lvb_hold(qd->qd_gl);
101 gfs2_glock_put(qd->qd_gl);
102 if (error)
103 goto fail;
104
105 *qdp = qd;
106
107 return 0;
108
109fail:
110 kfree(qd);
111 return error;
112}
113
114static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
115 struct gfs2_quota_data **qdp)
116{
117 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
118 int error, found;
119
120 *qdp = NULL;
121
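/* Search the list under the spinlock; if the ID isn't there, drop the
   lock, allocate a new qd and retry. If another CPU raced the insertion,
   the spare qd is freed. */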
122 for (;;) {
123 found = 0;
124 spin_lock(&sdp->sd_quota_spin);
125 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
126 if (qd->qd_id == id &&
127 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
128 qd->qd_count++;
129 found = 1;
130 break;
131 }
132 }
133
134 if (!found)
135 qd = NULL;
136
137 if (!qd && new_qd) {
138 qd = new_qd;
139 list_add(&qd->qd_list, &sdp->sd_quota_list);
140 atomic_inc(&sdp->sd_quota_count);
141 new_qd = NULL;
142 }
143
144 spin_unlock(&sdp->sd_quota_spin);
145
146 if (qd || !create) {
147 if (new_qd) {
148 gfs2_lvb_unhold(new_qd->qd_gl);
149 kfree(new_qd);
150 }
151 *qdp = qd;
152 return 0;
153 }
154
155 error = qd_alloc(sdp, user, id, &new_qd);
156 if (error)
157 return error;
158 }
159}
160
161static void qd_hold(struct gfs2_quota_data *qd)
162{
163 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
164
165 spin_lock(&sdp->sd_quota_spin);
166 gfs2_assert(sdp, qd->qd_count);
167 qd->qd_count++;
168 spin_unlock(&sdp->sd_quota_spin);
169}
170
171static void qd_put(struct gfs2_quota_data *qd)
172{
173 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
174 spin_lock(&sdp->sd_quota_spin);
175 gfs2_assert(sdp, qd->qd_count);
176 if (!--qd->qd_count)
177 qd->qd_last_touched = jiffies;
178 spin_unlock(&sdp->sd_quota_spin);
179}
180
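/* Claim a slot in the per-node quota-change file by finding and setting
   a clear bit in the quota bitmap */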
181static int slot_get(struct gfs2_quota_data *qd)
182{
183 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
184 unsigned int c, o = 0, b;
185 unsigned char byte = 0;
186
187 spin_lock(&sdp->sd_quota_spin);
188
189 if (qd->qd_slot_count++) {
190 spin_unlock(&sdp->sd_quota_spin);
191 return 0;
192 }
193
194 for (c = 0; c < sdp->sd_quota_chunks; c++)
195 for (o = 0; o < PAGE_SIZE; o++) {
196 byte = sdp->sd_quota_bitmap[c][o];
197 if (byte != 0xFF)
198 goto found;
199 }
200
201 goto fail;
202
203found:
204 for (b = 0; b < 8; b++)
205 if (!(byte & (1 << b)))
206 break;
207 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
208
209 if (qd->qd_slot >= sdp->sd_quota_slots)
210 goto fail;
211
212 sdp->sd_quota_bitmap[c][o] |= 1 << b;
213
214 spin_unlock(&sdp->sd_quota_spin);
215
216 return 0;
217
218fail:
219 qd->qd_slot_count--;
220 spin_unlock(&sdp->sd_quota_spin);
221 return -ENOSPC;
222}
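
/*
 * Editor's note: slot_get() scans the in-core slot bitmap for the first
 * zero bit. For example, if chunk 0, byte 2 holds 0x37 (00110111b), bits
 * 0-2 are taken and bit 3 is the first free one, giving slot
 * 0 * (8 * PAGE_SIZE) + 2 * 8 + 3 == 19.
 */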
223
224static void slot_hold(struct gfs2_quota_data *qd)
225{
226 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
227
228 spin_lock(&sdp->sd_quota_spin);
229 gfs2_assert(sdp, qd->qd_slot_count);
230 qd->qd_slot_count++;
231 spin_unlock(&sdp->sd_quota_spin);
232}
233
234static void slot_put(struct gfs2_quota_data *qd)
235{
236 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
237
238 spin_lock(&sdp->sd_quota_spin);
239 gfs2_assert(sdp, qd->qd_slot_count);
240 if (!--qd->qd_slot_count) {
241 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
242 qd->qd_slot = -1;
243 }
244 spin_unlock(&sdp->sd_quota_spin);
245}
246
247static int bh_get(struct gfs2_quota_data *qd)
248{
249 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
250 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
251 unsigned int block, offset;
252 struct buffer_head *bh;
253 int error;
 254	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
255
256 mutex_lock(&sdp->sd_quota_mutex);
257
258 if (qd->qd_bh_count++) {
259 mutex_unlock(&sdp->sd_quota_mutex);
260 return 0;
261 }
262
263 block = qd->qd_slot / sdp->sd_qc_per_block;
 264	offset = qd->qd_slot % sdp->sd_qc_per_block;
265
266 error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map, 1);
267 if (error)
268 goto fail;
269 error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
270 if (error)
271 goto fail;
272 error = -EIO;
273 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
274 goto fail_brelse;
275
276 qd->qd_bh = bh;
277 qd->qd_bh_qc = (struct gfs2_quota_change *)
278 (bh->b_data + sizeof(struct gfs2_meta_header) +
279 offset * sizeof(struct gfs2_quota_change));
280
 281	mutex_unlock(&sdp->sd_quota_mutex);
282
283 return 0;
284
285fail_brelse:
286 brelse(bh);
287fail:
288 qd->qd_bh_count--;
289 mutex_unlock(&sdp->sd_quota_mutex);
290 return error;
291}
292
293static void bh_put(struct gfs2_quota_data *qd)
294{
295 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
296
297 mutex_lock(&sdp->sd_quota_mutex);
298 gfs2_assert(sdp, qd->qd_bh_count);
299 if (!--qd->qd_bh_count) {
300 brelse(qd->qd_bh);
301 qd->qd_bh = NULL;
302 qd->qd_bh_qc = NULL;
303 }
304 mutex_unlock(&sdp->sd_quota_mutex);
305}
306
307static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
308{
309 struct gfs2_quota_data *qd = NULL;
310 int error;
311 int found = 0;
312
313 *qdp = NULL;
314
315 if (sdp->sd_vfs->s_flags & MS_RDONLY)
316 return 0;
317
318 spin_lock(&sdp->sd_quota_spin);
319
320 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
321 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
322 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
323 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
324 continue;
325
326 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
327
328 set_bit(QDF_LOCKED, &qd->qd_flags);
329 gfs2_assert_warn(sdp, qd->qd_count);
330 qd->qd_count++;
331 qd->qd_change_sync = qd->qd_change;
332 gfs2_assert_warn(sdp, qd->qd_slot_count);
333 qd->qd_slot_count++;
334 found = 1;
335
336 break;
337 }
338
339 if (!found)
340 qd = NULL;
341
342 spin_unlock(&sdp->sd_quota_spin);
343
344 if (qd) {
345 gfs2_assert_warn(sdp, qd->qd_change_sync);
346 error = bh_get(qd);
347 if (error) {
348 clear_bit(QDF_LOCKED, &qd->qd_flags);
349 slot_put(qd);
350 qd_put(qd);
351 return error;
352 }
353 }
354
355 *qdp = qd;
356
357 return 0;
358}
359
360static int qd_trylock(struct gfs2_quota_data *qd)
361{
362 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
363
364 if (sdp->sd_vfs->s_flags & MS_RDONLY)
365 return 0;
366
367 spin_lock(&sdp->sd_quota_spin);
368
369 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
370 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
371 spin_unlock(&sdp->sd_quota_spin);
372 return 0;
373 }
374
375 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
376
377 set_bit(QDF_LOCKED, &qd->qd_flags);
378 gfs2_assert_warn(sdp, qd->qd_count);
379 qd->qd_count++;
380 qd->qd_change_sync = qd->qd_change;
381 gfs2_assert_warn(sdp, qd->qd_slot_count);
382 qd->qd_slot_count++;
383
384 spin_unlock(&sdp->sd_quota_spin);
385
386 gfs2_assert_warn(sdp, qd->qd_change_sync);
387 if (bh_get(qd)) {
388 clear_bit(QDF_LOCKED, &qd->qd_flags);
389 slot_put(qd);
390 qd_put(qd);
391 return 0;
392 }
393
394 return 1;
395}
396
397static void qd_unlock(struct gfs2_quota_data *qd)
398{
399 gfs2_assert_warn(qd->qd_gl->gl_sbd,
400 test_bit(QDF_LOCKED, &qd->qd_flags));
401 clear_bit(QDF_LOCKED, &qd->qd_flags);
402 bh_put(qd);
403 slot_put(qd);
404 qd_put(qd);
405}
406
407static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
408 struct gfs2_quota_data **qdp)
409{
410 int error;
411
412 error = qd_get(sdp, user, id, create, qdp);
413 if (error)
414 return error;
415
416 error = slot_get(*qdp);
417 if (error)
418 goto fail;
419
420 error = bh_get(*qdp);
421 if (error)
422 goto fail_slot;
423
424 return 0;
425
426fail_slot:
427 slot_put(*qdp);
428fail:
429 qd_put(*qdp);
430 return error;
431}
432
433static void qdsb_put(struct gfs2_quota_data *qd)
434{
435 bh_put(qd);
436 slot_put(qd);
437 qd_put(qd);
438}
439
440int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
441{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct gfs2_alloc *al = &ip->i_alloc;
444 struct gfs2_quota_data **qd = al->al_qd;
445 int error;
446
447 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
448 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
449 return -EIO;
450
451 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
452 return 0;
453
454 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
455 if (error)
456 goto out;
457 al->al_qd_num++;
458 qd++;
459
460 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
461 if (error)
462 goto out;
463 al->al_qd_num++;
464 qd++;
465
466 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
467 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
468 if (error)
469 goto out;
470 al->al_qd_num++;
471 qd++;
472 }
473
474 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
475 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
476 if (error)
477 goto out;
478 al->al_qd_num++;
479 qd++;
480 }
481
482out:
483 if (error)
484 gfs2_quota_unhold(ip);
485 return error;
486}
487
488void gfs2_quota_unhold(struct gfs2_inode *ip)
489{
490 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
491 struct gfs2_alloc *al = &ip->i_alloc;
492 unsigned int x;
493
494 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
495
496 for (x = 0; x < al->al_qd_num; x++) {
497 qdsb_put(al->al_qd[x]);
498 al->al_qd[x] = NULL;
499 }
500 al->al_qd_num = 0;
501}
502
503static int sort_qd(const void *a, const void *b)
504{
505 const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
506 const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
507
508 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
509 !test_bit(QDF_USER, &qd_b->qd_flags)) {
510 if (test_bit(QDF_USER, &qd_a->qd_flags))
511 return -1;
512 else
513 return 1;
514 }
515 if (qd_a->qd_id < qd_b->qd_id)
516 return -1;
517 if (qd_a->qd_id > qd_b->qd_id)
518 return 1;
519
520 return 0;
521}
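
/*
 * Editor's note: both do_sync() and gfs2_quota_lock() sort their arrays
 * with this comparator before acquiring the quota glocks, so every node
 * takes multiple quota locks in the same global order (user IDs first,
 * ascending, then group IDs) and ABBA deadlocks between nodes are
 * avoided.
 */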
522
523static void do_qc(struct gfs2_quota_data *qd, s64 change)
524{
525 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
526 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
527 struct gfs2_quota_change *qc = qd->qd_bh_qc;
528 s64 x;
529
530 mutex_lock(&sdp->sd_quota_mutex);
531 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
532
533 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
534 qc->qc_change = 0;
535 qc->qc_flags = 0;
536 if (test_bit(QDF_USER, &qd->qd_flags))
537 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
538 qc->qc_id = cpu_to_be32(qd->qd_id);
539 }
540
541 x = qc->qc_change;
542 x = be64_to_cpu(x) + change;
543 qc->qc_change = cpu_to_be64(x);
544
545 spin_lock(&sdp->sd_quota_spin);
546 qd->qd_change = x;
547 spin_unlock(&sdp->sd_quota_spin);
548
549 if (!x) {
550 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
551 clear_bit(QDF_CHANGE, &qd->qd_flags);
552 qc->qc_flags = 0;
553 qc->qc_id = 0;
554 slot_put(qd);
555 qd_put(qd);
556 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
557 qd_hold(qd);
558 slot_hold(qd);
559 }
560
561 mutex_unlock(&sdp->sd_quota_mutex);
562}
563
564/**
565 * gfs2_adjust_quota
566 *
567 * This function was mostly borrowed from gfs2_block_truncate_page which was
568 * in turn mostly borrowed from ext3
569 */
570static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
571 s64 change, struct gfs2_quota_data *qd)
572{
573 struct inode *inode = &ip->i_inode;
574 struct address_space *mapping = inode->i_mapping;
575 unsigned long index = loc >> PAGE_CACHE_SHIFT;
 576	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
577 unsigned blocksize, iblock, pos;
578 struct buffer_head *bh;
579 struct page *page;
580 void *kaddr;
581 __be64 *ptr;
582 s64 value;
583 int err = -EIO;
584
585 page = grab_cache_page(mapping, index);
586 if (!page)
587 return -ENOMEM;
588
589 blocksize = inode->i_sb->s_blocksize;
590 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
591
592 if (!page_has_buffers(page))
593 create_empty_buffers(page, blocksize, 0);
594
595 bh = page_buffers(page);
596 pos = blocksize;
597 while (offset >= pos) {
598 bh = bh->b_this_page;
599 iblock++;
600 pos += blocksize;
601 }
602
603 if (!buffer_mapped(bh)) {
604 gfs2_get_block(inode, iblock, bh, 1);
605 if (!buffer_mapped(bh))
606 goto unlock;
607 }
608
609 if (PageUptodate(page))
610 set_buffer_uptodate(bh);
611
612 if (!buffer_uptodate(bh)) {
613 ll_rw_block(READ_META, 1, &bh);
614 wait_on_buffer(bh);
615 if (!buffer_uptodate(bh))
616 goto unlock;
617 }
618
619 gfs2_trans_add_bh(ip->i_gl, bh, 0);
620
621 kaddr = kmap_atomic(page, KM_USER0);
622 ptr = kaddr + offset;
623 value = (s64)be64_to_cpu(*ptr) + change;
624 *ptr = cpu_to_be64(value);
625 flush_dcache_page(page);
626 kunmap_atomic(kaddr, KM_USER0);
627 err = 0;
628 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
629 qd->qd_qb.qb_value = cpu_to_be64(value);
630unlock:
631 unlock_page(page);
632 page_cache_release(page);
633 return err;
634}
635
636static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
637{
638 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
639 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
640 unsigned int data_blocks, ind_blocks;
641 struct gfs2_holder *ghs, i_gh;
642 unsigned int qx, x;
643 struct gfs2_quota_data *qd;
644 loff_t offset;
645 unsigned int nalloc = 0;
646 struct gfs2_alloc *al = NULL;
647 int error;
648
649 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
650 &data_blocks, &ind_blocks);
651
652 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
653 if (!ghs)
654 return -ENOMEM;
655
656 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
657 for (qx = 0; qx < num_qd; qx++) {
658 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
659 LM_ST_EXCLUSIVE,
660 GL_NOCACHE, &ghs[qx]);
661 if (error)
662 goto out;
663 }
664
665 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
666 if (error)
667 goto out;
668
669 for (x = 0; x < num_qd; x++) {
670 int alloc_required;
671
672 offset = qd2offset(qda[x]);
673 error = gfs2_write_alloc_required(ip, offset,
674 sizeof(struct gfs2_quota),
675 &alloc_required);
676 if (error)
677 goto out_gunlock;
678 if (alloc_required)
679 nalloc++;
680 }
681
682 if (nalloc) {
683 al = gfs2_alloc_get(ip);
684
685 al->al_requested = nalloc * (data_blocks + ind_blocks);
686
687 error = gfs2_inplace_reserve(ip);
688 if (error)
689 goto out_alloc;
690
691 error = gfs2_trans_begin(sdp,
692 al->al_rgd->rd_ri.ri_length +
693 num_qd * data_blocks +
694 nalloc * ind_blocks +
695 RES_DINODE + num_qd +
696 RES_STATFS, 0);
697 if (error)
698 goto out_ipres;
699 } else {
700 error = gfs2_trans_begin(sdp,
701 num_qd * data_blocks +
702 RES_DINODE + num_qd, 0);
703 if (error)
704 goto out_gunlock;
705 }
706
707 for (x = 0; x < num_qd; x++) {
708 qd = qda[x];
709 offset = qd2offset(qd);
710 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
711 (struct gfs2_quota_data *)
712 qd->qd_gl->gl_lvb);
713 if (error)
714 goto out_end_trans;
715
716 do_qc(qd, -qd->qd_change_sync);
717 }
718
719 error = 0;
720
721out_end_trans:
722 gfs2_trans_end(sdp);
723out_ipres:
724 if (nalloc)
725 gfs2_inplace_release(ip);
726out_alloc:
727 if (nalloc)
728 gfs2_alloc_put(ip);
729out_gunlock:
730 gfs2_glock_dq_uninit(&i_gh);
731out:
732 while (qx--)
733 gfs2_glock_dq_uninit(&ghs[qx]);
734 kfree(ghs);
735 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
736 return error;
737}
738
739static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
740 struct gfs2_holder *q_gh)
741{
742 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
743 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
744 struct gfs2_holder i_gh;
745 struct gfs2_quota q;
746 char buf[sizeof(struct gfs2_quota)];
747 struct file_ra_state ra_state;
748 int error;
749 struct gfs2_quota_lvb *qlvb;
750
751 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
752restart:
753 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
754 if (error)
755 return error;
756
757 qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
758
759 if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
760 loff_t pos;
761 gfs2_glock_dq_uninit(q_gh);
762 error = gfs2_glock_nq_init(qd->qd_gl,
763 LM_ST_EXCLUSIVE, GL_NOCACHE,
764 q_gh);
765 if (error)
766 return error;
767
768 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
769 if (error)
770 goto fail;
771
772 memset(buf, 0, sizeof(struct gfs2_quota));
773 pos = qd2offset(qd);
774 error = gfs2_internal_read(ip, &ra_state, buf,
775 &pos, sizeof(struct gfs2_quota));
776 if (error < 0)
777 goto fail_gunlock;
778
779 gfs2_glock_dq_uninit(&i_gh);
 780
782 gfs2_quota_in(&q, buf);
783 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
784 qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
785 qlvb->__pad = 0;
786 qlvb->qb_limit = cpu_to_be64(q.qu_limit);
787 qlvb->qb_warn = cpu_to_be64(q.qu_warn);
788 qlvb->qb_value = cpu_to_be64(q.qu_value);
789 qd->qd_qb = *qlvb;
790
791 if (gfs2_glock_is_blocking(qd->qd_gl)) {
792 gfs2_glock_dq_uninit(q_gh);
793 force_refresh = 0;
794 goto restart;
795 }
796 }
797
798 return 0;
799
800fail_gunlock:
801 gfs2_glock_dq_uninit(&i_gh);
802fail:
803 gfs2_glock_dq_uninit(q_gh);
804 return error;
805}
806
807int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
808{
809 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
810 struct gfs2_alloc *al = &ip->i_alloc;
811 unsigned int x;
812 int error = 0;
813
 814	error = gfs2_quota_hold(ip, uid, gid);
	if (error)
		return error;
815
816 if (capable(CAP_SYS_RESOURCE) ||
817 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
818 return 0;
819
820 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
821 sort_qd, NULL);
822
823 for (x = 0; x < al->al_qd_num; x++) {
824 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
825 if (error)
826 break;
827 }
828
829 if (!error)
830 set_bit(GIF_QD_LOCKED, &ip->i_flags);
831 else {
832 while (x--)
833 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
834 gfs2_quota_unhold(ip);
835 }
836
837 return error;
838}
839
840static int need_sync(struct gfs2_quota_data *qd)
841{
842 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
843 struct gfs2_tune *gt = &sdp->sd_tune;
844 s64 value;
845 unsigned int num, den;
846 int do_sync = 1;
847
848 if (!qd->qd_qb.qb_limit)
849 return 0;
850
851 spin_lock(&sdp->sd_quota_spin);
852 value = qd->qd_change;
853 spin_unlock(&sdp->sd_quota_spin);
854
855 spin_lock(&gt->gt_spin);
856 num = gt->gt_quota_scale_num;
857 den = gt->gt_quota_scale_den;
858 spin_unlock(&gt->gt_spin);
859
860 if (value < 0)
861 do_sync = 0;
862 else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
863 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
864 do_sync = 0;
865 else {
866 value *= gfs2_jindex_size(sdp) * num;
867 do_div(value, den);
868 value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
869 if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
870 do_sync = 0;
871 }
872
873 return do_sync;
874}
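
/*
 * Editor's worked example: with quota_scale num/den = 1/1, three journals
 * (gfs2_jindex_size() == 3), qb_value = 90, qb_limit = 100 and a local
 * change of +4, the scaled estimate is 4 * 3 * 1 / 1 + 90 = 102 >= 100,
 * so the change is synced even though this node alone has not seen the
 * limit reached.
 */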
875
876void gfs2_quota_unlock(struct gfs2_inode *ip)
877{
878 struct gfs2_alloc *al = &ip->i_alloc;
879 struct gfs2_quota_data *qda[4];
880 unsigned int count = 0;
881 unsigned int x;
882
883 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
884 goto out;
885
886 for (x = 0; x < al->al_qd_num; x++) {
887 struct gfs2_quota_data *qd;
888 int sync;
889
890 qd = al->al_qd[x];
891 sync = need_sync(qd);
892
893 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
894
895 if (sync && qd_trylock(qd))
896 qda[count++] = qd;
897 }
898
899 if (count) {
900 do_sync(count, qda);
901 for (x = 0; x < count; x++)
902 qd_unlock(qda[x]);
903 }
904
905out:
906 gfs2_quota_unhold(ip);
907}
908
909#define MAX_LINE 256
910
911static int print_message(struct gfs2_quota_data *qd, char *type)
912{
913 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
914
915 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
916 sdp->sd_fsname, type,
917 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
918 qd->qd_id);
919
920 return 0;
921}
922
923int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
924{
925 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
926 struct gfs2_alloc *al = &ip->i_alloc;
927 struct gfs2_quota_data *qd;
928 s64 value;
929 unsigned int x;
930 int error = 0;
931
932 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
933 return 0;
934
935 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
936 return 0;
937
938 for (x = 0; x < al->al_qd_num; x++) {
939 qd = al->al_qd[x];
940
941 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
942 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
943 continue;
944
945 value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
946 spin_lock(&sdp->sd_quota_spin);
947 value += qd->qd_change;
948 spin_unlock(&sdp->sd_quota_spin);
949
 950		if (be64_to_cpu(qd->qd_qb.qb_limit) &&
		    (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
951 print_message(qd, "exceeded");
952 error = -EDQUOT;
953 break;
954 } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
955 (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
956 time_after_eq(jiffies, qd->qd_last_warn +
957 gfs2_tune_get(sdp,
958 gt_quota_warn_period) * HZ)) {
959 error = print_message(qd, "warning");
960 qd->qd_last_warn = jiffies;
961 }
962 }
963
964 return error;
965}
966
967void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
968 u32 uid, u32 gid)
969{
970 struct gfs2_alloc *al = &ip->i_alloc;
971 struct gfs2_quota_data *qd;
972 unsigned int x;
973 unsigned int found = 0;
974
975 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
976 return;
977 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
978 return;
979
980 for (x = 0; x < al->al_qd_num; x++) {
981 qd = al->al_qd[x];
982
983 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
984 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
985 do_qc(qd, change);
986 found++;
987 }
988 }
989}
990
991int gfs2_quota_sync(struct gfs2_sbd *sdp)
992{
993 struct gfs2_quota_data **qda;
994 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
995 unsigned int num_qd;
996 unsigned int x;
997 int error = 0;
998
999 sdp->sd_quota_sync_gen++;
1000
1001 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1002 if (!qda)
1003 return -ENOMEM;
1004
1005 do {
1006 num_qd = 0;
1007
1008 for (;;) {
1009 error = qd_fish(sdp, qda + num_qd);
1010 if (error || !qda[num_qd])
1011 break;
1012 if (++num_qd == max_qd)
1013 break;
1014 }
1015
1016 if (num_qd) {
1017 if (!error)
1018 error = do_sync(num_qd, qda);
1019 if (!error)
1020 for (x = 0; x < num_qd; x++)
1021 qda[x]->qd_sync_gen =
1022 sdp->sd_quota_sync_gen;
1023
1024 for (x = 0; x < num_qd; x++)
1025 qd_unlock(qda[x]);
1026 }
1027 } while (!error && num_qd == max_qd);
1028
1029 kfree(qda);
1030
1031 return error;
1032}
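
/*
 * Editor's note: bumping sd_quota_sync_gen before fishing means qd_fish()
 * skips entries whose qd_sync_gen already matches the new generation, so
 * one sync pass visits each dirty ID at most once even though it works in
 * batches of at most max_qd entries.
 */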
1033
1034int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
1035{
1036 struct gfs2_quota_data *qd;
1037 struct gfs2_holder q_gh;
1038 int error;
1039
1040 error = qd_get(sdp, user, id, CREATE, &qd);
1041 if (error)
1042 return error;
1043
1044 error = do_glock(qd, FORCE, &q_gh);
1045 if (!error)
1046 gfs2_glock_dq_uninit(&q_gh);
1047
1048 qd_put(qd);
1049
1050 return error;
1051}
1052
1053int gfs2_quota_init(struct gfs2_sbd *sdp)
1054{
1055 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1056 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1057 unsigned int x, slot = 0;
1058 unsigned int found = 0;
1059 u64 dblock;
1060 u32 extlen = 0;
1061 int error;
1062
1063 if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
1064 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1065 gfs2_consist_inode(ip);
1066 return -EIO;
1067 }
1068 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1069 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1070
1071 error = -ENOMEM;
1072
1073 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1074 sizeof(unsigned char *), GFP_KERNEL);
1075 if (!sdp->sd_quota_bitmap)
1076 return error;
1077
1078 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1079 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1080 if (!sdp->sd_quota_bitmap[x])
1081 goto fail;
1082 }
1083
1084 for (x = 0; x < blocks; x++) {
1085 struct buffer_head *bh;
1086 unsigned int y;
1087
1088 if (!extlen) {
1089 int new = 0;
1090 error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
1091 if (error)
1092 goto fail;
1093 }
1094 error = -EIO;
1095 bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
1096 if (!bh)
1097 goto fail;
1098 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1099 brelse(bh);
1100 goto fail;
1101 }
1102
1103 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1104 y++, slot++) {
1105 struct gfs2_quota_change qc;
1106 struct gfs2_quota_data *qd;
1107
1108 gfs2_quota_change_in(&qc, bh->b_data +
1109 sizeof(struct gfs2_meta_header) +
1110 y * sizeof(struct gfs2_quota_change));
1111 if (!qc.qc_change)
1112 continue;
1113
1114 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1115 qc.qc_id, &qd);
1116 if (error) {
1117 brelse(bh);
1118 goto fail;
1119 }
1120
1121 set_bit(QDF_CHANGE, &qd->qd_flags);
1122 qd->qd_change = qc.qc_change;
1123 qd->qd_slot = slot;
1124 qd->qd_slot_count = 1;
1125 qd->qd_last_touched = jiffies;
1126
1127 spin_lock(&sdp->sd_quota_spin);
1128 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1129 list_add(&qd->qd_list, &sdp->sd_quota_list);
1130 atomic_inc(&sdp->sd_quota_count);
1131 spin_unlock(&sdp->sd_quota_spin);
1132
1133 found++;
1134 }
1135
1136 brelse(bh);
1137 dblock++;
1138 extlen--;
1139 }
1140
1141 if (found)
1142 fs_info(sdp, "found %u quota changes\n", found);
1143
1144 return 0;
1145
1146fail:
1147 gfs2_quota_cleanup(sdp);
1148 return error;
1149}
1150
1151void gfs2_quota_scan(struct gfs2_sbd *sdp)
1152{
1153 struct gfs2_quota_data *qd, *safe;
1154 LIST_HEAD(dead);
1155
1156 spin_lock(&sdp->sd_quota_spin);
1157 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1158 if (!qd->qd_count &&
1159 time_after_eq(jiffies, qd->qd_last_touched +
1160 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1161 list_move(&qd->qd_list, &dead);
1162 gfs2_assert_warn(sdp,
1163 atomic_read(&sdp->sd_quota_count) > 0);
1164 atomic_dec(&sdp->sd_quota_count);
1165 }
1166 }
1167 spin_unlock(&sdp->sd_quota_spin);
1168
1169 while (!list_empty(&dead)) {
1170 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1171 list_del(&qd->qd_list);
1172
1173 gfs2_assert_warn(sdp, !qd->qd_change);
1174 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1175 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1176
1177 gfs2_lvb_unhold(qd->qd_gl);
1178 kfree(qd);
1179 }
1180}
1181
1182void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1183{
1184 struct list_head *head = &sdp->sd_quota_list;
1185 struct gfs2_quota_data *qd;
1186 unsigned int x;
1187
1188 spin_lock(&sdp->sd_quota_spin);
1189 while (!list_empty(head)) {
1190 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1191
1192 if (qd->qd_count > 1 ||
1193 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1194 list_move(&qd->qd_list, head);
1195 spin_unlock(&sdp->sd_quota_spin);
1196 schedule();
1197 spin_lock(&sdp->sd_quota_spin);
1198 continue;
1199 }
1200
1201 list_del(&qd->qd_list);
1202 atomic_dec(&sdp->sd_quota_count);
1203 spin_unlock(&sdp->sd_quota_spin);
1204
1205 if (!qd->qd_count) {
1206 gfs2_assert_warn(sdp, !qd->qd_change);
1207 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1208 } else
1209 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1210 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1211
1212 gfs2_lvb_unhold(qd->qd_gl);
1213 kfree(qd);
1214
1215 spin_lock(&sdp->sd_quota_spin);
1216 }
1217 spin_unlock(&sdp->sd_quota_spin);
1218
1219 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1220
1221 if (sdp->sd_quota_bitmap) {
1222 for (x = 0; x < sdp->sd_quota_chunks; x++)
1223 kfree(sdp->sd_quota_bitmap[x]);
1224 kfree(sdp->sd_quota_bitmap);
1225 }
1226}
1227
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..a8be1417051f
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13struct gfs2_inode;
14struct gfs2_sbd;
15
16#define NO_QUOTA_CHANGE ((u32)-1)
17
18int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
19void gfs2_quota_unhold(struct gfs2_inode *ip);
20
21int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
22void gfs2_quota_unlock(struct gfs2_inode *ip);
23
24int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid);
27
28int gfs2_quota_sync(struct gfs2_sbd *sdp);
29int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30
31int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
34
35#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..0a8a4b87dcc6
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,570 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "lm.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30#include "dir.h"
31
32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
33 struct buffer_head **bh)
34{
35 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
36 struct gfs2_glock *gl = ip->i_gl;
37 int new = 0;
38 u64 dblock;
39 u32 extlen;
40 int error;
41
42 error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
43 if (error)
44 return error;
45 if (!dblock) {
46 gfs2_consist_inode(ip);
47 return -EIO;
48 }
49
50 *bh = gfs2_meta_ra(gl, dblock, extlen);
51
52 return error;
53}
54
55int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
56{
57 struct list_head *head = &sdp->sd_revoke_list;
58 struct gfs2_revoke_replay *rr;
59 int found = 0;
60
61 list_for_each_entry(rr, head, rr_list) {
62 if (rr->rr_blkno == blkno) {
63 found = 1;
64 break;
65 }
66 }
67
68 if (found) {
69 rr->rr_where = where;
70 return 0;
71 }
72
73 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
74 if (!rr)
75 return -ENOMEM;
76
77 rr->rr_blkno = blkno;
78 rr->rr_where = where;
79 list_add(&rr->rr_list, head);
80
81 return 1;
82}
83
84int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
85{
86 struct gfs2_revoke_replay *rr;
87 int wrap, a, b, revoke;
88 int found = 0;
89
90 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
91 if (rr->rr_blkno == blkno) {
92 found = 1;
93 break;
94 }
95 }
96
97 if (!found)
98 return 0;
99
100 wrap = (rr->rr_where < sdp->sd_replay_tail);
101 a = (sdp->sd_replay_tail < where);
102 b = (where < rr->rr_where);
103 revoke = (wrap) ? (a || b) : (a && b);
104
105 return revoke;
106}
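
/*
 * Editor's note: the wrap/a/b test checks whether `where` falls inside
 * the circular range (sd_replay_tail, rr_where). E.g. with jd_blocks =
 * 100, sd_replay_tail = 90 and rr_where = 10 the range wraps, so blocks
 * at 95 (a true) and 5 (b true) are still revoked, while a block at 15
 * is not.
 */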
107
108void gfs2_revoke_clean(struct gfs2_sbd *sdp)
109{
110 struct list_head *head = &sdp->sd_revoke_list;
111 struct gfs2_revoke_replay *rr;
112
113 while (!list_empty(head)) {
114 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
115 list_del(&rr->rr_list);
116 kfree(rr);
117 }
118}
119
120/**
121 * get_log_header - read the log header for a given segment
122 * @jd: the journal
123 * @blk: the block to look at
124 * @lh: the log header to return
125 *
 126 * Read the log header for a given segment in a given journal. Do a few
127 * sanity checks on it.
128 *
129 * Returns: 0 on success,
130 * 1 if the header was invalid or incomplete,
131 * errno on error
132 */
133
134static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
135 struct gfs2_log_header *head)
136{
137 struct buffer_head *bh;
138 struct gfs2_log_header lh;
139 u32 hash;
140 int error;
141
142 error = gfs2_replay_read_block(jd, blk, &bh);
143 if (error)
144 return error;
145
146 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
147 lh.lh_hash = 0;
148 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
149 gfs2_log_header_in(&lh, bh->b_data);
150
151 brelse(bh);
152
153 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
154 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
155 lh.lh_blkno != blk || lh.lh_hash != hash)
156 return 1;
157
158 *head = lh;
159
160 return 0;
161}
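
/*
 * Editor's note: lh_hash is stored as the CRC of the header computed with
 * the hash field itself zeroed, which is why the local copy is cleared
 * before gfs2_disk_hash() runs and the on-disk value is only restored
 * afterwards by gfs2_log_header_in().
 */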
162
163/**
164 * find_good_lh - find a good log header
165 * @jd: the journal
166 * @blk: the segment to start searching from
167 * @lh: the log header to fill in
168 * @forward: if true search forward in the log, else search backward
169 *
170 * Call get_log_header() to get a log header for a segment, but if the
171 * segment is bad, either scan forward or backward until we find a good one.
172 *
173 * Returns: errno
174 */
175
176static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
177 struct gfs2_log_header *head)
178{
179 unsigned int orig_blk = *blk;
180 int error;
181
182 for (;;) {
183 error = get_log_header(jd, *blk, head);
184 if (error <= 0)
185 return error;
186
187 if (++*blk == jd->jd_blocks)
188 *blk = 0;
189
190 if (*blk == orig_blk) {
191 gfs2_consist_inode(GFS2_I(jd->jd_inode));
192 return -EIO;
193 }
194 }
195}
196
197/**
198 * jhead_scan - make sure we've found the head of the log
199 * @jd: the journal
200 * @head: this is filled in with the log descriptor of the head
201 *
202 * At this point, seg and lh should be either the head of the log or just
203 * before. Scan forward until we find the head.
204 *
205 * Returns: errno
206 */
207
208static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
209{
210 unsigned int blk = head->lh_blkno;
211 struct gfs2_log_header lh;
212 int error;
213
214 for (;;) {
215 if (++blk == jd->jd_blocks)
216 blk = 0;
217
218 error = get_log_header(jd, blk, &lh);
219 if (error < 0)
220 return error;
221 if (error == 1)
222 continue;
223
224 if (lh.lh_sequence == head->lh_sequence) {
225 gfs2_consist_inode(GFS2_I(jd->jd_inode));
226 return -EIO;
227 }
228 if (lh.lh_sequence < head->lh_sequence)
229 break;
230
231 *head = lh;
232 }
233
234 return 0;
235}
236
237/**
238 * gfs2_find_jhead - find the head of a log
239 * @jd: the journal
240 * @head: the log descriptor for the head of the log is returned here
241 *
242 * Do a binary search of a journal and find the valid log entry with the
243 * highest sequence number. (i.e. the log head)
244 *
245 * Returns: errno
246 */
247
248int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
249{
250 struct gfs2_log_header lh_1, lh_m;
251 u32 blk_1, blk_2, blk_m;
252 int error;
253
254 blk_1 = 0;
255 blk_2 = jd->jd_blocks - 1;
256
257 for (;;) {
258 blk_m = (blk_1 + blk_2) / 2;
259
260 error = find_good_lh(jd, &blk_1, &lh_1);
261 if (error)
262 return error;
263
264 error = find_good_lh(jd, &blk_m, &lh_m);
265 if (error)
266 return error;
267
268 if (blk_1 == blk_m || blk_m == blk_2)
269 break;
270
271 if (lh_1.lh_sequence <= lh_m.lh_sequence)
272 blk_1 = blk_m;
273 else
274 blk_2 = blk_m;
275 }
276
277 error = jhead_scan(jd, &lh_1);
278 if (error)
279 return error;
280
281 *head = lh_1;
282
283 return error;
284}
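
/*
 * Editor's note: the bisection relies on sequence numbers increasing
 * monotonically from the log tail, so the journal behaves like a rotated
 * sorted array. blk_1 tracks the half where lh_sequence is still
 * non-decreasing, and jhead_scan() then steps forward over the few
 * blocks the bisection cannot distinguish.
 */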
285
286/**
287 * foreach_descriptor - go through the active part of the log
288 * @jd: the journal
289 * @start: the first log header in the active region
 290 * @end: the last log header (don't process the contents of this entry)
291 *
292 * Call a given function once for every log descriptor in the active
293 * portion of the log.
294 *
295 * Returns: errno
296 */
297
298static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
299 unsigned int end, int pass)
300{
301 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
302 struct buffer_head *bh;
303 struct gfs2_log_descriptor *ld;
304 int error = 0;
305 u32 length;
306 __be64 *ptr;
307 unsigned int offset = sizeof(struct gfs2_log_descriptor);
308 offset += sizeof(__be64) - 1;
309 offset &= ~(sizeof(__be64) - 1);
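	/*
	 * Editor's note: the two lines above round offset up to the next
	 * __be64 boundary, since the block-number array following the
	 * descriptor is 8-byte aligned on disk (e.g. 84 -> 88, 88 -> 88).
	 */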
310
311 while (start != end) {
312 error = gfs2_replay_read_block(jd, start, &bh);
313 if (error)
314 return error;
315 if (gfs2_meta_check(sdp, bh)) {
316 brelse(bh);
317 return -EIO;
318 }
319 ld = (struct gfs2_log_descriptor *)bh->b_data;
320 length = be32_to_cpu(ld->ld_length);
321
322 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
323 struct gfs2_log_header lh;
324 error = get_log_header(jd, start, &lh);
325 if (!error) {
326 gfs2_replay_incr_blk(sdp, &start);
327 brelse(bh);
328 continue;
329 }
330 if (error == 1) {
331 gfs2_consist_inode(GFS2_I(jd->jd_inode));
332 error = -EIO;
333 }
334 brelse(bh);
335 return error;
336 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
337 brelse(bh);
338 return -EIO;
339 }
340 ptr = (__be64 *)(bh->b_data + offset);
341 error = lops_scan_elements(jd, start, ld, ptr, pass);
342 if (error) {
343 brelse(bh);
344 return error;
345 }
346
347 while (length--)
348 gfs2_replay_incr_blk(sdp, &start);
349
350 brelse(bh);
351 }
352
353 return 0;
354}
355
356/**
357 * clean_journal - mark a dirty journal as being clean
358 * @sdp: the filesystem
359 * @jd: the journal
360 * @gl: the journal's glock
361 * @head: the head journal to start from
362 *
363 * Returns: errno
364 */
365
366static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
367{
368 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
369 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
370 unsigned int lblock;
371 struct gfs2_log_header *lh;
372 u32 hash;
373 struct buffer_head *bh;
374 int error;
 375	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
376
377 lblock = head->lh_blkno;
378 gfs2_replay_incr_blk(sdp, &lblock);
379 error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map, 1);
380 if (error)
381 return error;
382 if (!bh_map.b_blocknr) {
383 gfs2_consist_inode(ip);
384 return -EIO;
385 }
386
387 bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr);
388 lock_buffer(bh);
389 memset(bh->b_data, 0, bh->b_size);
390 set_buffer_uptodate(bh);
391 clear_buffer_dirty(bh);
392 unlock_buffer(bh);
393
394 lh = (struct gfs2_log_header *)bh->b_data;
395 memset(lh, 0, sizeof(struct gfs2_log_header));
396 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
397 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
398 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
399 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
400 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
401 lh->lh_blkno = cpu_to_be32(lblock);
402 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
403 lh->lh_hash = cpu_to_be32(hash);
404
405 set_buffer_dirty(bh);
406 if (sync_dirty_buffer(bh))
407 gfs2_io_error_bh(sdp, bh);
408 brelse(bh);
409
410 return error;
411}
412
413/**
 414 * gfs2_recover_journal - recover a given journal
415 * @jd: the struct gfs2_jdesc describing the journal
416 *
417 * Acquire the journal's lock, check to see if the journal is clean, and
418 * do recovery if necessary.
419 *
420 * Returns: errno
421 */
422
423int gfs2_recover_journal(struct gfs2_jdesc *jd)
424{
425 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
426 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
427 struct gfs2_log_header head;
428 struct gfs2_holder j_gh, ji_gh, t_gh;
429 unsigned long t;
430 int ro = 0;
431 unsigned int pass;
432 int error;
433
434 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
435 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
436 jd->jd_jid);
437
 438		/* Acquire the journal lock so we can do recovery */
439
440 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
441 LM_ST_EXCLUSIVE,
442 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
443 &j_gh);
444 switch (error) {
445 case 0:
446 break;
447
448 case GLR_TRYFAILED:
449 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
450 error = 0;
451
452 default:
453 goto fail;
 454		}
455
456 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
457 LM_FLAG_NOEXP, &ji_gh);
458 if (error)
459 goto fail_gunlock_j;
460 } else {
461 fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
462 }
463
464 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
465
466 error = gfs2_jdesc_check(jd);
467 if (error)
468 goto fail_gunlock_ji;
469
470 error = gfs2_find_jhead(jd, &head);
471 if (error)
472 goto fail_gunlock_ji;
473
474 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
475 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
476 jd->jd_jid);
477
478 t = jiffies;
479
480 /* Acquire a shared hold on the transaction lock */
481
482 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
483 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
484 GL_NOCANCEL | GL_NOCACHE, &t_gh);
485 if (error)
486 goto fail_gunlock_ji;
487
488 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
489 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
490 ro = 1;
491 } else {
492 if (sdp->sd_vfs->s_flags & MS_RDONLY)
493 ro = 1;
494 }
495
496 if (ro) {
497 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
498 jd->jd_jid);
499 error = -EROFS;
500 goto fail_gunlock_tr;
501 }
502
503 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
504
505 for (pass = 0; pass < 2; pass++) {
506 lops_before_scan(jd, &head, pass);
507 error = foreach_descriptor(jd, head.lh_tail,
508 head.lh_blkno, pass);
509 lops_after_scan(jd, error, pass);
510 if (error)
511 goto fail_gunlock_tr;
512 }
513
514 error = clean_journal(jd, &head);
515 if (error)
516 goto fail_gunlock_tr;
517
518 gfs2_glock_dq_uninit(&t_gh);
519 t = DIV_ROUND_UP(jiffies - t, HZ);
520 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
521 jd->jd_jid, t);
522 }
523
524 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
525 gfs2_glock_dq_uninit(&ji_gh);
526
527 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
528
529 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
530 gfs2_glock_dq_uninit(&j_gh);
531
532 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
533 return 0;
534
535fail_gunlock_tr:
536 gfs2_glock_dq_uninit(&t_gh);
537fail_gunlock_ji:
538 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
539 gfs2_glock_dq_uninit(&ji_gh);
540fail_gunlock_j:
541 gfs2_glock_dq_uninit(&j_gh);
542 }
543
544 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
545
546fail:
547 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
548 return error;
549}
550
551/**
552 * gfs2_check_journals - Recover any dirty journals
553 * @sdp: the filesystem
554 *
555 */
556
557void gfs2_check_journals(struct gfs2_sbd *sdp)
558{
559 struct gfs2_jdesc *jd;
560
561 for (;;) {
562 jd = gfs2_jdesc_find_dirty(sdp);
563 if (!jd)
564 break;
565
566 if (jd != sdp->sd_jdesc)
567 gfs2_recover_journal(jd);
568 }
569}
570
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..961feedf4d8b
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13#include "incore.h"
14
15static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
16{
17 if (++*blk == sdp->sd_jdesc->jd_blocks)
18 *blk = 0;
19}
20
21int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
22 struct buffer_head **bh);
23
24int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
25int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
26void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27
28int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header *head);
30int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31void gfs2_check_journals(struct gfs2_sbd *sdp);
32
33#endif /* __RECOVERY_DOT_H__ */
34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..b261385c0065
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1513 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "glops.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "super.h"
28#include "trans.h"
29#include "ops_file.h"
30#include "util.h"
31
32#define BFITNOENT ((u32)~0)
33
34/*
35 * These routines are used by the resource group routines (rgrp.c)
36 * to keep track of block allocation. Each block is represented by two
37 * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
38 *
39 * 0 = Free
40 * 1 = Used (not metadata)
41 * 2 = Unlinked (still in use) inode
42 * 3 = Used (metadata)
43 */
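
/*
 * Editor's example: with GFS2_NBBY == 4 and two bits per block, the byte
 * 0xB4 (10110100b) read from bit 0 upwards encodes four consecutive
 * blocks in states 0 (free), 1 (used data), 3 (used metadata) and
 * 2 (unlinked inode).
 */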
44
45static const char valid_change[16] = {
46 /* current */
47 /* n */ 0, 1, 1, 1,
48 /* e */ 1, 0, 0, 0,
49 /* w */ 0, 0, 0, 1,
50 1, 0, 0, 0
51};
52
53/**
54 * gfs2_setbit - Set a bit in the bitmaps
55 * @buffer: the buffer that holds the bitmaps
56 * @buflen: the length (in bytes) of the buffer
57 * @block: the block to set
58 * @new_state: the new state of the block
59 *
60 */
61
62static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
63 unsigned int buflen, u32 block,
64 unsigned char new_state)
65{
66 unsigned char *byte, *end, cur_state;
67 unsigned int bit;
68
69 byte = buffer + (block / GFS2_NBBY);
70 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
71 end = buffer + buflen;
72
73 gfs2_assert(rgd->rd_sbd, byte < end);
74
75 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
76
77 if (valid_change[new_state * 4 + cur_state]) {
78 *byte ^= cur_state << bit;
79 *byte |= new_state << bit;
80 } else
81 gfs2_consist_rgrpd(rgd);
82}
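
/*
 * Editor's note: valid_change above is indexed as new_state * 4 +
 * cur_state, so e.g. freeing a used data block (new 0, cur 1 => index 1)
 * is legal, while "freeing" an already free block (new 0, cur 0 =>
 * index 0) trips gfs2_consist_rgrpd().
 */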
83
84/**
85 * gfs2_testbit - test a bit in the bitmaps
86 * @buffer: the buffer that holds the bitmaps
87 * @buflen: the length (in bytes) of the buffer
88 * @block: the block to read
89 *
90 */
91
92static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
93 unsigned int buflen, u32 block)
94{
95 unsigned char *byte, *end, cur_state;
96 unsigned int bit;
97
98 byte = buffer + (block / GFS2_NBBY);
99 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
100 end = buffer + buflen;
101
102 gfs2_assert(rgd->rd_sbd, byte < end);
103
104 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
105
106 return cur_state;
107}
108
109/**
110 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
111 * a block in a given allocation state.
112 * @buffer: the buffer that holds the bitmaps
113 * @buflen: the length (in bytes) of the buffer
114 * @goal: start search at this block's bit-pair (within @buffer)
115 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
116 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
117 *
118 * Scope of @goal and returned block number is only within this bitmap buffer,
119 * not entire rgrp or filesystem. @buffer will be offset from the actual
120 * beginning of a bitmap block buffer, skipping any header structures.
121 *
122 * Return: the block number (bitmap buffer scope) that was found
123 */
124
125static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 unsigned int buflen, u32 goal,
127 unsigned char old_state)
128{
129 unsigned char *byte, *end, alloc;
130 u32 blk = goal;
131 unsigned int bit;
132
133 byte = buffer + (goal / GFS2_NBBY);
134 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
135 end = buffer + buflen;
136 alloc = (old_state & 1) ? 0 : 0x55;
137
138 while (byte < end) {
139 if ((*byte & 0x55) == alloc) {
140 blk += (8 - bit) >> 1;
141
142 bit = 0;
143 byte++;
144
145 continue;
146 }
147
148 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
149 return blk;
150
151 bit += GFS2_BIT_SIZE;
152 if (bit >= 8) {
153 bit = 0;
154 byte++;
155 }
156
157 blk++;
158 }
159
160 return BFITNOENT;
161}
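
/*
 * Editor's note: 0x55 (01010101b) masks the allocation bit of all four
 * bit-pairs in a byte. A byte whose masked value equals `alloc` cannot
 * contain a block in the wanted state, so the loop skips it in one step;
 * e.g. searching for free blocks sets alloc = 0x55 and hops over fully
 * allocated bytes.
 */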
162
163/**
164 * gfs2_bitcount - count the number of bits in a certain state
165 * @buffer: the buffer that holds the bitmaps
166 * @buflen: the length (in bytes) of the buffer
167 * @state: the state of the block we're looking for
168 *
169 * Returns: The number of bits
170 */
171
172static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
173 unsigned int buflen, unsigned char state)
174{
175 unsigned char *byte = buffer;
176 unsigned char *end = buffer + buflen;
177 unsigned char state1 = state << 2;
178 unsigned char state2 = state << 4;
179 unsigned char state3 = state << 6;
180 u32 count = 0;
181
182 for (; byte < end; byte++) {
183 if (((*byte) & 0x03) == state)
184 count++;
185 if (((*byte) & 0x0C) == state1)
186 count++;
187 if (((*byte) & 0x30) == state2)
188 count++;
189 if (((*byte) & 0xC0) == state3)
190 count++;
191 }
192
193 return count;
194}
195
196/**
197 * gfs2_rgrp_verify - Verify that a resource group is consistent
 198 * @rgd: the rgrp
200 *
201 */
202
203void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
204{
205 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi = NULL;
207 u32 length = rgd->rd_ri.ri_length;
208 u32 count[4], tmp;
209 int buf, x;
210
211 memset(count, 0, 4 * sizeof(u32));
212
213 /* Count # blocks in each of 4 possible allocation states */
214 for (buf = 0; buf < length; buf++) {
215 bi = rgd->rd_bits + buf;
216 for (x = 0; x < 4; x++)
217 count[x] += gfs2_bitcount(rgd,
218 bi->bi_bh->b_data +
219 bi->bi_offset,
220 bi->bi_len, x);
221 }
222
223 if (count[0] != rgd->rd_rg.rg_free) {
224 if (gfs2_consist_rgrpd(rgd))
225 fs_err(sdp, "free data mismatch: %u != %u\n",
226 count[0], rgd->rd_rg.rg_free);
227 return;
228 }
229
230 tmp = rgd->rd_ri.ri_data -
231 rgd->rd_rg.rg_free -
232 rgd->rd_rg.rg_dinodes;
233 if (count[1] + count[2] != tmp) {
234 if (gfs2_consist_rgrpd(rgd))
235 fs_err(sdp, "used data mismatch: %u != %u\n",
236 count[1], tmp);
237 return;
238 }
239
240 if (count[3] != rgd->rd_rg.rg_dinodes) {
241 if (gfs2_consist_rgrpd(rgd))
242 fs_err(sdp, "used metadata mismatch: %u != %u\n",
243 count[3], rgd->rd_rg.rg_dinodes);
244 return;
245 }
246
247 if (count[2] > count[3]) {
248 if (gfs2_consist_rgrpd(rgd))
249 fs_err(sdp, "unlinked inodes > inodes: %u\n",
250 count[2]);
251 return;
252 }
253
254}
255
256static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block)
257{
258 u64 first = ri->ri_data0;
259 u64 last = first + ri->ri_data;
260 return first <= block && block < last;
261}
262
263/**
264 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
265 * @sdp: The GFS2 superblock
 266 * @blk: The data block number
267 *
268 * Returns: The resource group, or NULL if not found
269 */
270
271struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
272{
273 struct gfs2_rgrpd *rgd;
274
275 spin_lock(&sdp->sd_rindex_spin);
276
277 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
278 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
279 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
280 spin_unlock(&sdp->sd_rindex_spin);
281 return rgd;
282 }
283 }
284
285 spin_unlock(&sdp->sd_rindex_spin);
286
287 return NULL;
288}
289
290/**
291 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
292 * @sdp: The GFS2 superblock
293 *
294 * Returns: The first rgrp in the filesystem
295 */
296
297struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
298{
299 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
300 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
301}
302
303/**
304 * gfs2_rgrpd_get_next - get the next RG
305 * @rgd: A RG
306 *
307 * Returns: The next rgrp
308 */
309
310struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
311{
312 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
313 return NULL;
314 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
315}
316
317static void clear_rgrpdi(struct gfs2_sbd *sdp)
318{
319 struct list_head *head;
320 struct gfs2_rgrpd *rgd;
321 struct gfs2_glock *gl;
322
323 spin_lock(&sdp->sd_rindex_spin);
324 sdp->sd_rindex_forward = NULL;
325 head = &sdp->sd_rindex_recent_list;
326 while (!list_empty(head)) {
327 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
328 list_del(&rgd->rd_recent);
329 }
330 spin_unlock(&sdp->sd_rindex_spin);
331
332 head = &sdp->sd_rindex_list;
333 while (!list_empty(head)) {
334 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
335 gl = rgd->rd_gl;
336
337 list_del(&rgd->rd_list);
338 list_del(&rgd->rd_list_mru);
339
340 if (gl) {
341 gl->gl_object = NULL;
342 gfs2_glock_put(gl);
343 }
344
345 kfree(rgd->rd_bits);
346 kfree(rgd);
347 }
348}
349
350void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
351{
352 mutex_lock(&sdp->sd_rindex_mutex);
353 clear_rgrpdi(sdp);
354 mutex_unlock(&sdp->sd_rindex_mutex);
355}
356
357/**
 358 * compute_bitstructs - Compute the bitmap sizes
359 * @rgd: The resource group descriptor
360 *
361 * Calculates bitmap descriptors, one for each block that contains bitmap data
362 *
363 * Returns: errno
364 */
365
366static int compute_bitstructs(struct gfs2_rgrpd *rgd)
367{
368 struct gfs2_sbd *sdp = rgd->rd_sbd;
369 struct gfs2_bitmap *bi;
370 u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
371 u32 bytes_left, bytes;
372 int x;
373
374 if (!length)
375 return -EINVAL;
376
377 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
378 if (!rgd->rd_bits)
379 return -ENOMEM;
380
381 bytes_left = rgd->rd_ri.ri_bitbytes;
382
383 for (x = 0; x < length; x++) {
384 bi = rgd->rd_bits + x;
385
386 /* small rgrp; bitmap stored completely in header block */
387 if (length == 1) {
388 bytes = bytes_left;
389 bi->bi_offset = sizeof(struct gfs2_rgrp);
390 bi->bi_start = 0;
391 bi->bi_len = bytes;
392 /* header block */
393 } else if (x == 0) {
394 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
395 bi->bi_offset = sizeof(struct gfs2_rgrp);
396 bi->bi_start = 0;
397 bi->bi_len = bytes;
398 /* last block */
399 } else if (x + 1 == length) {
400 bytes = bytes_left;
401 bi->bi_offset = sizeof(struct gfs2_meta_header);
402 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
403 bi->bi_len = bytes;
404 /* other blocks */
405 } else {
406 bytes = sdp->sd_sb.sb_bsize -
407 sizeof(struct gfs2_meta_header);
408 bi->bi_offset = sizeof(struct gfs2_meta_header);
409 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
410 bi->bi_len = bytes;
411 }
412
413 bytes_left -= bytes;
414 }
415
416 if (bytes_left) {
417 gfs2_consist_rgrpd(rgd);
418 return -EIO;
419 }
420 bi = rgd->rd_bits + (length - 1);
421 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
422 if (gfs2_consist_rgrpd(rgd)) {
423 gfs2_rindex_print(&rgd->rd_ri);
424 fs_err(sdp, "start=%u len=%u offset=%u\n",
425 bi->bi_start, bi->bi_len, bi->bi_offset);
426 }
427 return -EIO;
428 }
429
430 return 0;
431}
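
/*
 * Editor's note: the first bitmap block loses sizeof(struct gfs2_rgrp)
 * bytes to the rgrp header, middle blocks lose sizeof(struct
 * gfs2_meta_header), and the last block carries whatever of ri_bitbytes
 * remains; the final check confirms the bit-pairs cover exactly ri_data
 * blocks, at GFS2_NBBY blocks per byte.
 */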
432
433/**
434 * gfs2_ri_update - Pull in a new resource index from the disk
 435 * @ip: the rindex inode
436 *
437 * Returns: 0 on successful update, error code otherwise
438 */
439
440static int gfs2_ri_update(struct gfs2_inode *ip)
441{
442 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
443 struct inode *inode = &ip->i_inode;
444 struct gfs2_rgrpd *rgd;
445 char buf[sizeof(struct gfs2_rindex)];
446 struct file_ra_state ra_state;
447 u64 junk = ip->i_di.di_size;
448 int error;
449
450 if (do_div(junk, sizeof(struct gfs2_rindex))) {
451 gfs2_consist_inode(ip);
452 return -EIO;
453 }
454
455 clear_rgrpdi(sdp);
456
457 file_ra_state_init(&ra_state, inode->i_mapping);
458 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
459 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
460 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
461 sizeof(struct gfs2_rindex));
462 if (!error)
463 break;
464 if (error != sizeof(struct gfs2_rindex)) {
465 if (error > 0)
466 error = -EIO;
467 goto fail;
468 }
469
470 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
471 error = -ENOMEM;
472 if (!rgd)
473 goto fail;
474
475 mutex_init(&rgd->rd_mutex);
476 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
477 rgd->rd_sbd = sdp;
478
479 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
480 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
481
482 gfs2_rindex_in(&rgd->rd_ri, buf);
483 error = compute_bitstructs(rgd);
484 if (error)
485 goto fail;
486
487 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
488 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
489 if (error)
490 goto fail;
491
492 rgd->rd_gl->gl_object = rgd;
493 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
494 }
495
496 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
497 return 0;
498
499fail:
500 clear_rgrpdi(sdp);
501 return error;
502}
503
504/**
505 * gfs2_rindex_hold - Grab a lock on the rindex
506 * @sdp: The GFS2 superblock
507 * @ri_gh: the glock holder
508 *
509 * We grab a lock on the rindex inode to make sure that it doesn't
510 * change whilst we are performing an operation. We keep this lock
511 * for quite long periods of time compared to other locks. This
512 * doesn't matter, since it is shared and it is very, very rarely
513 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
514 *
515 * This makes sure that we're using the latest copy of the resource index
516 * special file, which might have been updated if someone expanded the
517 * filesystem (via gfs2_grow utility), which adds new resource groups.
518 *
519 * Returns: 0 on success, error code otherwise
520 */
521
522int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
523{
524 struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
525 struct gfs2_glock *gl = ip->i_gl;
526 int error;
527
528 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
529 if (error)
530 return error;
531
532 /* Read new copy from disk if we don't have the latest */
533 if (sdp->sd_rindex_vn != gl->gl_vn) {
534 mutex_lock(&sdp->sd_rindex_mutex);
535 if (sdp->sd_rindex_vn != gl->gl_vn) {
536 error = gfs2_ri_update(ip);
537 if (error)
538 gfs2_glock_dq_uninit(ri_gh);
539 }
540 mutex_unlock(&sdp->sd_rindex_mutex);
541 }
542
543 return error;
544}
545
546/**
547 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
548 * @rgd: the struct gfs2_rgrpd describing the RG to read in
549 *
550 * Read in all of a Resource Group's header and bitmap blocks.
 551 * Caller must eventually call gfs2_rgrp_bh_put() to free the bitmaps.
552 *
553 * Returns: errno
554 */
555
556int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
557{
558 struct gfs2_sbd *sdp = rgd->rd_sbd;
559 struct gfs2_glock *gl = rgd->rd_gl;
560 unsigned int length = rgd->rd_ri.ri_length;
561 struct gfs2_bitmap *bi;
562 unsigned int x, y;
563 int error;
564
565 mutex_lock(&rgd->rd_mutex);
566
567 spin_lock(&sdp->sd_rindex_spin);
568 if (rgd->rd_bh_count) {
569 rgd->rd_bh_count++;
570 spin_unlock(&sdp->sd_rindex_spin);
571 mutex_unlock(&rgd->rd_mutex);
572 return 0;
573 }
574 spin_unlock(&sdp->sd_rindex_spin);
575
576 for (x = 0; x < length; x++) {
577 bi = rgd->rd_bits + x;
578 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh);
579 if (error)
580 goto fail;
581 }
582
583 for (y = length; y--;) {
584 bi = rgd->rd_bits + y;
585 error = gfs2_meta_wait(sdp, bi->bi_bh);
586 if (error)
587 goto fail;
588 if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
589 GFS2_METATYPE_RG)) {
590 error = -EIO;
591 goto fail;
592 }
593 }
594
595 if (rgd->rd_rg_vn != gl->gl_vn) {
596 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
597 rgd->rd_rg_vn = gl->gl_vn;
598 }
599
600 spin_lock(&sdp->sd_rindex_spin);
601 rgd->rd_free_clone = rgd->rd_rg.rg_free;
602 rgd->rd_bh_count++;
603 spin_unlock(&sdp->sd_rindex_spin);
604
605 mutex_unlock(&rgd->rd_mutex);
606
607 return 0;
608
609fail:
610 while (x--) {
611 bi = rgd->rd_bits + x;
612 brelse(bi->bi_bh);
613 bi->bi_bh = NULL;
614 gfs2_assert_warn(sdp, !bi->bi_clone);
615 }
616 mutex_unlock(&rgd->rd_mutex);
617
618 return error;
619}
620
621void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
622{
623 struct gfs2_sbd *sdp = rgd->rd_sbd;
624
625 spin_lock(&sdp->sd_rindex_spin);
626 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
627 rgd->rd_bh_count++;
628 spin_unlock(&sdp->sd_rindex_spin);
629}
630
631/**
632 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
633 * @rgd: the struct gfs2_rgrpd whose bitmaps are being released
634 *
635 */
636
637void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
638{
639 struct gfs2_sbd *sdp = rgd->rd_sbd;
640 int x, length = rgd->rd_ri.ri_length;
641
642 spin_lock(&sdp->sd_rindex_spin);
643 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
644 if (--rgd->rd_bh_count) {
645 spin_unlock(&sdp->sd_rindex_spin);
646 return;
647 }
648
649 for (x = 0; x < length; x++) {
650 struct gfs2_bitmap *bi = rgd->rd_bits + x;
651 kfree(bi->bi_clone);
652 bi->bi_clone = NULL;
653 brelse(bi->bi_bh);
654 bi->bi_bh = NULL;
655 }
656
657 spin_unlock(&sdp->sd_rindex_spin);
658}
659
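gfs2_rgrp_bh_get()/gfs2_rgrp_bh_put() implement a simple reference-counted cache: the first getter reads the bitmap buffers, later getters only bump rd_bh_count, and the last putter releases everything. A condensed userspace sketch of that get/put discipline (simplified: the real code drops the spinlock while reading, since metadata reads may sleep):

#include <pthread.h>
#include <stdlib.h>

struct buf_cache {
        pthread_mutex_t lock;   /* stands in for sd_rindex_spin */
        unsigned int count;     /* stands in for rd_bh_count */
        void *data;             /* stands in for the bitmap buffers */
};

/* First getter loads the data; later getters only bump the count. */
static void *cache_get(struct buf_cache *bc, void *(*load)(void))
{
        pthread_mutex_lock(&bc->lock);
        if (!bc->count++)
                bc->data = load();
        pthread_mutex_unlock(&bc->lock);
        return bc->data;
}

/* Last putter frees the data. */
static void cache_put(struct buf_cache *bc)
{
        pthread_mutex_lock(&bc->lock);
        if (!--bc->count) {
                free(bc->data);
                bc->data = NULL;
        }
        pthread_mutex_unlock(&bc->lock);
}
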
660void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
661{
662 struct gfs2_sbd *sdp = rgd->rd_sbd;
663 unsigned int length = rgd->rd_ri.ri_length;
664 unsigned int x;
665
666 for (x = 0; x < length; x++) {
667 struct gfs2_bitmap *bi = rgd->rd_bits + x;
668 if (!bi->bi_clone)
669 continue;
670 memcpy(bi->bi_clone + bi->bi_offset,
671 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
672 }
673
674 spin_lock(&sdp->sd_rindex_spin);
675 rgd->rd_free_clone = rgd->rd_rg.rg_free;
676 spin_unlock(&sdp->sd_rindex_spin);
677}
678
679/**
680 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
681 * @ip: the incore GFS2 inode structure
682 *
683 * Returns: the struct gfs2_alloc
684 */
685
686struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
687{
688 struct gfs2_alloc *al = &ip->i_alloc;
689
690 /* FIXME: Should assert that the correct locks are held here... */
691 memset(al, 0, sizeof(*al));
692 return al;
693}
694
695/**
696 * try_rgrp_fit - See if a given reservation will fit in a given RG
697 * @rgd: the RG data
698 * @al: the struct gfs2_alloc structure describing the reservation
699 *
700 * If there's room for the requested blocks to be allocated from the RG
701 * (i.e. the rgrp's cloned free-block count covers @al->al_requested),
702 * this sets the @al_rgd field in @al so that subsequent allocations
703 * come from this rgrp.
704 *
705 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
706 */
707
708static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
709{
710 struct gfs2_sbd *sdp = rgd->rd_sbd;
711 int ret = 0;
712
713 spin_lock(&sdp->sd_rindex_spin);
714 if (rgd->rd_free_clone >= al->al_requested) {
715 al->al_rgd = rgd;
716 ret = 1;
717 }
718 spin_unlock(&sdp->sd_rindex_spin);
719
720 return ret;
721}
722
723/**
724 * recent_rgrp_first - get first RG from "recent" list
725 * @sdp: The GFS2 superblock
726 * @rglast: address of the rgrp used last
727 *
728 * Returns: The first rgrp in the recent list
729 */
730
731static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
732 u64 rglast)
733{
734 struct gfs2_rgrpd *rgd = NULL;
735
736 spin_lock(&sdp->sd_rindex_spin);
737
738 if (list_empty(&sdp->sd_rindex_recent_list))
739 goto out;
740
741 if (!rglast)
742 goto first;
743
744 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
745 if (rgd->rd_ri.ri_addr == rglast)
746 goto out;
747 }
748
749first:
750 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
751 rd_recent);
752out:
753 spin_unlock(&sdp->sd_rindex_spin);
754 return rgd;
755}
756
757/**
758 * recent_rgrp_next - get next RG from "recent" list
759 * @cur_rgd: current rgrp
760 * @remove: if set, remove @cur_rgd from the recent list
761 *
762 * Returns: The next rgrp in the recent list
763 */
764
765static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
766 int remove)
767{
768 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
769 struct list_head *head;
770 struct gfs2_rgrpd *rgd;
771
772 spin_lock(&sdp->sd_rindex_spin);
773
774 head = &sdp->sd_rindex_recent_list;
775
776 list_for_each_entry(rgd, head, rd_recent) {
777 if (rgd == cur_rgd) {
778 if (cur_rgd->rd_recent.next != head)
779 rgd = list_entry(cur_rgd->rd_recent.next,
780 struct gfs2_rgrpd, rd_recent);
781 else
782 rgd = NULL;
783
784 if (remove)
785 list_del(&cur_rgd->rd_recent);
786
787 goto out;
788 }
789 }
790
791 rgd = NULL;
792 if (!list_empty(head))
793 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
794
795out:
796 spin_unlock(&sdp->sd_rindex_spin);
797 return rgd;
798}
799
800/**
801 * recent_rgrp_add - add an RG to tail of "recent" list
802 * @new_rgd: The rgrp to add
803 *
804 */
805
806static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
807{
808 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
809 struct gfs2_rgrpd *rgd;
810 unsigned int count = 0;
811 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
812
813 spin_lock(&sdp->sd_rindex_spin);
814
815 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
816 if (rgd == new_rgd)
817 goto out;
818
819 if (++count >= max)
820 goto out;
821 }
822 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
823
824out:
825 spin_unlock(&sdp->sd_rindex_spin);
826}
827
828/**
829 * forward_rgrp_get - get an rgrp to try next from full list
830 * @sdp: The GFS2 superblock
831 *
832 * Returns: The rgrp to try next
833 */
834
835static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
836{
837 struct gfs2_rgrpd *rgd;
838 unsigned int journals = gfs2_jindex_size(sdp);
839 unsigned int rg = 0, x;
840
841 spin_lock(&sdp->sd_rindex_spin);
842
843 rgd = sdp->sd_rindex_forward;
844 if (!rgd) {
845 if (sdp->sd_rgrps >= journals)
846 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
847
848 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
849 x++, rgd = gfs2_rgrpd_get_next(rgd))
850 /* Do Nothing */;
851
852 sdp->sd_rindex_forward = rgd;
853 }
854
855 spin_unlock(&sdp->sd_rindex_spin);
856
857 return rgd;
858}
859
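The starting index computed in forward_rgrp_get() stripes nodes across the filesystem: journal jid of 'journals' nodes begins its full scan at rgrp index sd_rgrps * jid / journals, so nodes tend not to fight over the same rgrps. The arithmetic in isolation:

/* Spread scan starting points across nodes: node 'jid' of 'journals'
 * nodes starts at rgrp index rgrps * jid / journals (or 0 when there
 * are fewer rgrps than journals). */
static unsigned int start_rgrp(unsigned int rgrps, unsigned int jid,
                               unsigned int journals)
{
        return rgrps >= journals ? rgrps * jid / journals : 0;
}
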
860/**
861 * forward_rgrp_set - set the forward rgrp pointer
862 * @sdp: the filesystem
863 * @rgd: The new forward rgrp
864 *
865 */
866
867static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
868{
869 spin_lock(&sdp->sd_rindex_spin);
870 sdp->sd_rindex_forward = rgd;
871 spin_unlock(&sdp->sd_rindex_spin);
872}
873
874/**
875 * get_local_rgrp - Choose and lock a rgrp for allocation
876 * @ip: the inode to reserve space for
877 * The chosen and locked rgrp is recorded in the inode's al_rgd field.
878 *
879 * Try to acquire an rgrp in a way which avoids contending with others.
880 *
881 * Returns: errno
882 */
883
884static int get_local_rgrp(struct gfs2_inode *ip)
885{
886 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
887 struct gfs2_rgrpd *rgd, *begin = NULL;
888 struct gfs2_alloc *al = &ip->i_alloc;
889 int flags = LM_FLAG_TRY;
890 int skipped = 0;
891 int loops = 0;
892 int error;
893
894 /* Try recently successful rgrps */
895
896 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
897
898 while (rgd) {
899 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
900 LM_FLAG_TRY, &al->al_rgd_gh);
901 switch (error) {
902 case 0:
903 if (try_rgrp_fit(rgd, al))
904 goto out;
905 gfs2_glock_dq_uninit(&al->al_rgd_gh);
906 rgd = recent_rgrp_next(rgd, 1);
907 break;
908
909 case GLR_TRYFAILED:
910 rgd = recent_rgrp_next(rgd, 0);
911 break;
912
913 default:
914 return error;
915 }
916 }
917
918 /* Go through full list of rgrps */
919
920 begin = rgd = forward_rgrp_get(sdp);
921
922 for (;;) {
923 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
924 &al->al_rgd_gh);
925 switch (error) {
926 case 0:
927 if (try_rgrp_fit(rgd, al))
928 goto out;
929 gfs2_glock_dq_uninit(&al->al_rgd_gh);
930 break;
931
932 case GLR_TRYFAILED:
933 skipped++;
934 break;
935
936 default:
937 return error;
938 }
939
940 rgd = gfs2_rgrpd_get_next(rgd);
941 if (!rgd)
942 rgd = gfs2_rgrpd_get_first(sdp);
943
944 if (rgd == begin) {
945 if (++loops >= 2 || !skipped)
946 return -ENOSPC;
947 flags = 0;
948 }
949 }
950
951out:
952 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
953
954 if (begin) {
955 recent_rgrp_add(rgd);
956 rgd = gfs2_rgrpd_get_next(rgd);
957 if (!rgd)
958 rgd = gfs2_rgrpd_get_first(sdp);
959 forward_rgrp_set(sdp, rgd);
960 }
961
962 return 0;
963}
964
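get_local_rgrp() avoids lock contention by probing with LM_FLAG_TRY first, skipping any rgrp another node holds, and only blocking (flags = 0) on a later pass once it knows something was skipped. A userspace analogue of the two-pass try-then-block loop, with pthread_mutex_trylock standing in for the TRY lock request (names hypothetical):

#include <errno.h>
#include <pthread.h>

/* Scan a ring of resources; skip busy ones on the first pass, block
 * on the second pass rather than give up early. On success, locks[*out]
 * is held and the caller must unlock it. */
static int pick_resource(pthread_mutex_t *locks, int n,
                         int (*fits)(int idx), int *out)
{
        int pass, i;

        for (pass = 0; pass < 2; pass++) {
                for (i = 0; i < n; i++) {
                        int busy = pass == 0 ?
                                pthread_mutex_trylock(&locks[i]) :
                                pthread_mutex_lock(&locks[i]);
                        if (busy == EBUSY)
                                continue;       /* contended: try the next one */
                        if (fits(i)) {
                                *out = i;
                                return 0;
                        }
                        pthread_mutex_unlock(&locks[i]);
                }
        }
        return -ENOSPC;
}
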
965/**
966 * gfs2_inplace_reserve_i - Reserve space in the filesystem
967 * @ip: the inode to reserve space for
968 *
969 * Returns: errno
970 */
971
972int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
973{
974 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
975 struct gfs2_alloc *al = &ip->i_alloc;
976 int error;
977
978 if (gfs2_assert_warn(sdp, al->al_requested))
979 return -EINVAL;
980
981 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
982 if (error)
983 return error;
984
985 error = get_local_rgrp(ip);
986 if (error) {
987 gfs2_glock_dq_uninit(&al->al_ri_gh);
988 return error;
989 }
990
991 al->al_file = file;
992 al->al_line = line;
993
994 return 0;
995}
996
997/**
998 * gfs2_inplace_release - release an inplace reservation
999 * @ip: the inode the reservation was taken out on
1000 *
1001 * Release a reservation made by gfs2_inplace_reserve().
1002 */
1003
1004void gfs2_inplace_release(struct gfs2_inode *ip)
1005{
1006 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1007 struct gfs2_alloc *al = &ip->i_alloc;
1008
1009 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1010 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1011 "al_file = %s, al_line = %u\n",
1012 al->al_alloced, al->al_requested, al->al_file,
1013 al->al_line);
1014
1015 al->al_rgd = NULL;
1016 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1017 gfs2_glock_dq_uninit(&al->al_ri_gh);
1018}
1019
1020/**
1021 * gfs2_get_block_type - Check a block in a RG is of given type
1022 * @rgd: the resource group holding the block
1023 * @block: the block number
1024 *
1025 * Returns: The block type (GFS2_BLKST_*)
1026 */
1027
1028unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1029{
1030 struct gfs2_bitmap *bi = NULL;
1031 u32 length, rgrp_block, buf_block;
1032 unsigned int buf;
1033 unsigned char type;
1034
1035 length = rgd->rd_ri.ri_length;
1036 rgrp_block = block - rgd->rd_ri.ri_data0;
1037
1038 for (buf = 0; buf < length; buf++) {
1039 bi = rgd->rd_bits + buf;
1040 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1041 break;
1042 }
1043
1044 gfs2_assert(rgd->rd_sbd, buf < length);
1045 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
1046
1047 type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1048 bi->bi_len, buf_block);
1049
1050 return type;
1051}
1052
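gfs2_get_block_type() works because each block's allocation state occupies two bits, so one bitmap byte covers GFS2_NBBY (four) blocks; the bi_start/bi_len bookkeeping just locates which buffer holds the byte. A hedged sketch of pulling a 2-bit state out of such a bitmap (the bit ordering within the byte is an assumption here; gfs2_testbit is the authoritative version):

#include <stdint.h>

#define STATES_PER_BYTE 4       /* four 2-bit block states per byte */

/* Return the 2-bit allocation state of block 'blk' within 'bitmap'. */
static unsigned char test_state(const uint8_t *bitmap, uint32_t blk)
{
        unsigned int shift = (blk % STATES_PER_BYTE) * 2;

        return (bitmap[blk / STATES_PER_BYTE] >> shift) & 0x3;
}
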
1053/**
1054 * rgblk_search - find a block in @old_state, change allocation
1055 * state to @new_state
1056 * @rgd: the resource group descriptor
1057 * @goal: the goal block within the RG (start here to search for avail block)
1058 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1059 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1060 *
1061 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1062 * Add the found bitmap buffer to the transaction.
1063 * Set the found bits to @new_state to change block's allocation state.
1064 *
1065 * This function never fails, because we wouldn't call it unless we
1066 * know (from reservation results, etc.) that a block is available.
1067 *
1068 * Scope of @goal and returned block is just within rgrp, not the whole
1069 * filesystem.
1070 *
1071 * Returns: the block number allocated
1072 */
1073
1074static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1075 unsigned char old_state, unsigned char new_state)
1076{
1077 struct gfs2_bitmap *bi = NULL;
1078 u32 length = rgd->rd_ri.ri_length;
1079 u32 blk = 0;
1080 unsigned int buf, x;
1081
1082 /* Find bitmap block that contains bits for goal block */
1083 for (buf = 0; buf < length; buf++) {
1084 bi = rgd->rd_bits + buf;
1085 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1086 break;
1087 }
1088
1089 gfs2_assert(rgd->rd_sbd, buf < length);
1090
1091 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1092 goal -= bi->bi_start * GFS2_NBBY;
1093
1094 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1095 "x <= length", instead of "x < length", because we typically start
1096 the search in the middle of a bit block, but if we can't find an
1097 allocatable block anywhere else, we want to be able to wrap around and
1098 search in the first part of our first-searched bit block. */
1099 for (x = 0; x <= length; x++) {
1100 if (bi->bi_clone)
1101 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1102 bi->bi_len, goal, old_state);
1103 else
1104 blk = gfs2_bitfit(rgd,
1105 bi->bi_bh->b_data + bi->bi_offset,
1106 bi->bi_len, goal, old_state);
1107 if (blk != BFITNOENT)
1108 break;
1109
1110 /* Try next bitmap block (wrap back to rgrp header if at end) */
1111 buf = (buf + 1) % length;
1112 bi = rgd->rd_bits + buf;
1113 goal = 0;
1114 }
1115
1116 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
1117 blk = 0;
1118
1119 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1120 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1121 bi->bi_len, blk, new_state);
1122 if (bi->bi_clone)
1123 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1124 bi->bi_len, blk, new_state);
1125
1126 return bi->bi_start * GFS2_NBBY + blk;
1127}
1128
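The wrap-around in rgblk_search() is worth seeing in isolation: the search starts part-way into the goal block's bitmap, steps through the remaining bitmaps, and makes one extra pass so the portion of the first bitmap before the goal is covered too. A standalone sketch over plain byte arrays (one state per byte here, rather than the real 2-bit packing):

#include <stdint.h>

#define NOENT ((uint32_t)~0)

/* Find the first byte equal to 'want' at or after 'goal' in buffer
 * 'buf', wrapping once around the ring of 'nbufs' buffers of 'len'
 * bytes each. Returns a ring-wide index, or NOENT. */
static uint32_t ring_search(uint8_t *const *bufs, unsigned int nbufs,
                            uint32_t len, unsigned int buf, uint32_t goal,
                            uint8_t want)
{
        unsigned int x;

        /* nbufs + 1 passes: the extra pass re-covers the part of the
         * first-searched buffer that lies before the original goal. */
        for (x = 0; x <= nbufs; x++) {
                uint32_t i;

                for (i = goal; i < len; i++)
                        if (bufs[buf][i] == want)
                                return buf * len + i;
                buf = (buf + 1) % nbufs;
                goal = 0;
        }
        return NOENT;
}
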
1129/**
1130 * rgblk_free - Change alloc state of given block(s)
1131 * @sdp: the filesystem
1132 * @bstart: the start of a run of blocks to free
1133 * @blen: the length of the block run (all must lie within ONE RG!)
1134 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1135 *
1136 * Returns: Resource group containing the block(s)
1137 */
1138
1139static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1140 u32 blen, unsigned char new_state)
1141{
1142 struct gfs2_rgrpd *rgd;
1143 struct gfs2_bitmap *bi = NULL;
1144 u32 length, rgrp_blk, buf_blk;
1145 unsigned int buf;
1146
1147 rgd = gfs2_blk2rgrpd(sdp, bstart);
1148 if (!rgd) {
1149 if (gfs2_consist(sdp))
1150 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1151 return NULL;
1152 }
1153
1154 length = rgd->rd_ri.ri_length;
1155
1156 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1157
1158 while (blen--) {
1159 for (buf = 0; buf < length; buf++) {
1160 bi = rgd->rd_bits + buf;
1161 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1162 break;
1163 }
1164
1165 gfs2_assert(rgd->rd_sbd, buf < length);
1166
1167 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1168 rgrp_blk++;
1169
1170 if (!bi->bi_clone) {
1171 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1172 GFP_NOFS | __GFP_NOFAIL);
1173 memcpy(bi->bi_clone + bi->bi_offset,
1174 bi->bi_bh->b_data + bi->bi_offset,
1175 bi->bi_len);
1176 }
1177 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1178 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1179 bi->bi_len, buf_blk, new_state);
1180 }
1181
1182 return rgd;
1183}
1184
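The bi_clone handling in rgblk_free() is a copy-before-first-write snapshot: the bitmap is duplicated the first time a free touches it, and allocators search the clone, so blocks freed in the current transaction are not handed out again until the log commits. The core of that pattern, minus the transaction machinery:

#include <stdlib.h>
#include <string.h>

struct bitmap {
        unsigned char *live;    /* the buffer being modified */
        unsigned char *clone;   /* snapshot that allocators search */
        size_t len;
};

/* Snapshot the bitmap on the first modification only; afterwards the
 * clone keeps showing the pre-transaction state. Returns 0 or -1. */
static int bitmap_modify(struct bitmap *bm, size_t off, unsigned char val)
{
        if (!bm->clone) {
                bm->clone = malloc(bm->len);
                if (!bm->clone)
                        return -1;      /* the kernel sidesteps this with __GFP_NOFAIL */
                memcpy(bm->clone, bm->live, bm->len);
        }
        bm->live[off] = val;
        return 0;
}
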
1185/**
1186 * gfs2_alloc_data - Allocate a data block
1187 * @ip: the inode to allocate the data block for
1188 *
1189 * Returns: the allocated block
1190 */
1191
1192u64 gfs2_alloc_data(struct gfs2_inode *ip)
1193{
1194 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1195 struct gfs2_alloc *al = &ip->i_alloc;
1196 struct gfs2_rgrpd *rgd = al->al_rgd;
1197 u32 goal, blk;
1198 u64 block;
1199
1200 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1201 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1202 else
1203 goal = rgd->rd_last_alloc_data;
1204
1205 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1206 rgd->rd_last_alloc_data = blk;
1207
1208 block = rgd->rd_ri.ri_data0 + blk;
1209 ip->i_di.di_goal_data = block;
1210
1211 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1212 rgd->rd_rg.rg_free--;
1213
1214 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1215 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1216
1217 al->al_alloced++;
1218
1219 gfs2_statfs_change(sdp, 0, -1, 0);
1220 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1221
1222 spin_lock(&sdp->sd_rindex_spin);
1223 rgd->rd_free_clone--;
1224 spin_unlock(&sdp->sd_rindex_spin);
1225
1226 return block;
1227}
1228
1229/**
1230 * gfs2_alloc_meta - Allocate a metadata block
1231 * @ip: the inode to allocate the metadata block for
1232 *
1233 * Returns: the allocated block
1234 */
1235
1236u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1237{
1238 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1239 struct gfs2_alloc *al = &ip->i_alloc;
1240 struct gfs2_rgrpd *rgd = al->al_rgd;
1241 u32 goal, blk;
1242 u64 block;
1243
1244 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1245 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1246 else
1247 goal = rgd->rd_last_alloc_meta;
1248
1249 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1250 rgd->rd_last_alloc_meta = blk;
1251
1252 block = rgd->rd_ri.ri_data0 + blk;
1253 ip->i_di.di_goal_meta = block;
1254
1255 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1256 rgd->rd_rg.rg_free--;
1257
1258 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1259 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1260
1261 al->al_alloced++;
1262
1263 gfs2_statfs_change(sdp, 0, -1, 0);
1264 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1265 gfs2_trans_add_unrevoke(sdp, block);
1266
1267 spin_lock(&sdp->sd_rindex_spin);
1268 rgd->rd_free_clone--;
1269 spin_unlock(&sdp->sd_rindex_spin);
1270
1271 return block;
1272}
1273
1274/**
1275 * gfs2_alloc_di - Allocate a dinode
1276 * @dip: the directory that the inode is going in
1277 *
1278 * Returns: the block allocated
1279 */
1280
1281u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1282{
1283 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1284 struct gfs2_alloc *al = &dip->i_alloc;
1285 struct gfs2_rgrpd *rgd = al->al_rgd;
1286 u32 blk;
1287 u64 block;
1288
1289 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1290 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1291
1292 rgd->rd_last_alloc_meta = blk;
1293
1294 block = rgd->rd_ri.ri_data0 + blk;
1295
1296 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1297 rgd->rd_rg.rg_free--;
1298 rgd->rd_rg.rg_dinodes++;
1299 *generation = rgd->rd_rg.rg_igeneration++;
1300 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1301 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1302
1303 al->al_alloced++;
1304
1305 gfs2_statfs_change(sdp, 0, -1, +1);
1306 gfs2_trans_add_unrevoke(sdp, block);
1307
1308 spin_lock(&sdp->sd_rindex_spin);
1309 rgd->rd_free_clone--;
1310 spin_unlock(&sdp->sd_rindex_spin);
1311
1312 return block;
1313}
1314
1315/**
1316 * gfs2_free_data - free a contiguous run of data block(s)
1317 * @ip: the inode these blocks are being freed from
1318 * @bstart: first block of a run of contiguous blocks
1319 * @blen: the length of the block run
1320 *
1321 */
1322
1323void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1324{
1325 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1326 struct gfs2_rgrpd *rgd;
1327
1328 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1329 if (!rgd)
1330 return;
1331
1332 rgd->rd_rg.rg_free += blen;
1333
1334 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1335 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1336
1337 gfs2_trans_add_rg(rgd);
1338
1339 gfs2_statfs_change(sdp, 0, +blen, 0);
1340 gfs2_quota_change(ip, -(s64)blen,
1341 ip->i_di.di_uid, ip->i_di.di_gid);
1342}
1343
1344/**
1345 * gfs2_free_meta - free a contiguous run of metadata block(s)
1346 * @ip: the inode these blocks are being freed from
1347 * @bstart: first block of a run of contiguous blocks
1348 * @blen: the length of the block run
1349 *
1350 */
1351
1352void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1353{
1354 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1355 struct gfs2_rgrpd *rgd;
1356
1357 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1358 if (!rgd)
1359 return;
1360
1361 rgd->rd_rg.rg_free += blen;
1362
1363 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1364 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1365
1366 gfs2_trans_add_rg(rgd);
1367
1368 gfs2_statfs_change(sdp, 0, +blen, 0);
1369 gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid);
1370 gfs2_meta_wipe(ip, bstart, blen);
1371}
1372
1373void gfs2_unlink_di(struct inode *inode)
1374{
1375 struct gfs2_inode *ip = GFS2_I(inode);
1376 struct gfs2_sbd *sdp = GFS2_SB(inode);
1377 struct gfs2_rgrpd *rgd;
1378 u64 blkno = ip->i_num.no_addr;
1379
1380 rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
1381 if (!rgd)
1382 return;
1383 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1384 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1385 gfs2_trans_add_rg(rgd);
1386}
1387
1388static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1389{
1390 struct gfs2_sbd *sdp = rgd->rd_sbd;
1391 struct gfs2_rgrpd *tmp_rgd;
1392
1393 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1394 if (!tmp_rgd)
1395 return;
1396 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1397
1398 if (!rgd->rd_rg.rg_dinodes)
1399 gfs2_consist_rgrpd(rgd);
1400 rgd->rd_rg.rg_dinodes--;
1401 rgd->rd_rg.rg_free++;
1402
1403 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1404 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1405
1406 gfs2_statfs_change(sdp, 0, +1, -1);
1407 gfs2_trans_add_rg(rgd);
1408}
1409
1410
1411void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1412{
1413 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1414 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1415 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1416}
1417
1418/**
1419 * gfs2_rlist_add - add a RG to a list of RGs
1420 * @sdp: the filesystem
1421 * @rlist: the list of resource groups
1422 * @block: the block
1423 *
1424 * Figure out what RG a block belongs to and add that RG to the list
1425 *
1426 * FIXME: Don't use NOFAIL
1427 *
1428 */
1429
1430void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1431 u64 block)
1432{
1433 struct gfs2_rgrpd *rgd;
1434 struct gfs2_rgrpd **tmp;
1435 unsigned int new_space;
1436 unsigned int x;
1437
1438 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1439 return;
1440
1441 rgd = gfs2_blk2rgrpd(sdp, block);
1442 if (!rgd) {
1443 if (gfs2_consist(sdp))
1444 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1445 return;
1446 }
1447
1448 for (x = 0; x < rlist->rl_rgrps; x++)
1449 if (rlist->rl_rgd[x] == rgd)
1450 return;
1451
1452 if (rlist->rl_rgrps == rlist->rl_space) {
1453 new_space = rlist->rl_space + 10;
1454
1455 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1456 GFP_NOFS | __GFP_NOFAIL);
1457
1458 if (rlist->rl_rgd) {
1459 memcpy(tmp, rlist->rl_rgd,
1460 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1461 kfree(rlist->rl_rgd);
1462 }
1463
1464 rlist->rl_space = new_space;
1465 rlist->rl_rgd = tmp;
1466 }
1467
1468 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1469}
1470
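gfs2_rlist_add() is a dedup-then-append into an array that grows in flat steps of ten (rather than doubling, since rgrp lists stay short). The same shape in plain C, with the allocation failure handled instead of papered over with __GFP_NOFAIL:

#include <stdlib.h>
#include <string.h>

struct ptr_list {
        unsigned int n, space;
        void **slots;
};

/* Append 'p' if it is not already present, growing by 10 slots at a
 * time. Returns 0 on success, -1 on allocation failure. */
static int list_add_unique(struct ptr_list *l, void *p)
{
        unsigned int x;

        for (x = 0; x < l->n; x++)
                if (l->slots[x] == p)
                        return 0;               /* already listed */

        if (l->n == l->space) {
                unsigned int space = l->space + 10;
                void **tmp = calloc(space, sizeof(*tmp));

                if (!tmp)
                        return -1;
                if (l->slots) {
                        memcpy(tmp, l->slots, l->space * sizeof(*tmp));
                        free(l->slots);
                }
                l->space = space;
                l->slots = tmp;
        }
        l->slots[l->n++] = p;
        return 0;
}
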
1471/**
1472 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
1473 * and initialize an array of glock holders for them
1474 * @rlist: the list of resource groups
1475 * @state: the lock state to acquire the RG lock in
1476 * @flags: the modifier flags for the holder structures
1477 *
1478 * FIXME: Don't use NOFAIL
1479 *
1480 */
1481
1482void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1483 int flags)
1484{
1485 unsigned int x;
1486
1487 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1488 GFP_NOFS | __GFP_NOFAIL);
1489 for (x = 0; x < rlist->rl_rgrps; x++)
1490 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1491 state, flags,
1492 &rlist->rl_ghs[x]);
1493}
1494
1495/**
1496 * gfs2_rlist_free - free a resource group list
1497 * @list: the list of resource groups
1498 *
1499 */
1500
1501void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1502{
1503 unsigned int x;
1504
1505 kfree(rlist->rl_rgd);
1506
1507 if (rlist->rl_ghs) {
1508 for (x = 0; x < rlist->rl_rgrps; x++)
1509 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1510 kfree(rlist->rl_ghs);
1511 }
1512}
1513
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..9eedfd12bfff
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,69 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13struct gfs2_rgrpd;
14struct gfs2_sbd;
15struct gfs2_holder;
16
17void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
18
19struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
20struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
21struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
22
23void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
24int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
25
26int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
27void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
28void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
29
30void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
31
32struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
33static inline void gfs2_alloc_put(struct gfs2_inode *ip)
34{
35	return; /* So we can see where ip->i_alloc is used */
36}
37
38int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
39 char *file, unsigned int line);
40#define gfs2_inplace_reserve(ip) \
41gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
42
43void gfs2_inplace_release(struct gfs2_inode *ip);
44
45unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
46
47u64 gfs2_alloc_data(struct gfs2_inode *ip);
48u64 gfs2_alloc_meta(struct gfs2_inode *ip);
49u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
50
51void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
52void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
53void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
54void gfs2_unlink_di(struct inode *inode);
55
56struct gfs2_rgrp_list {
57 unsigned int rl_rgrps;
58 unsigned int rl_space;
59 struct gfs2_rgrpd **rl_rgd;
60 struct gfs2_holder *rl_ghs;
61};
62
63void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
64 u64 block);
65void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
66 int flags);
67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
68
69#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..6a78b1b32e25
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,976 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h>
18#include <linux/lm_interface.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "bmap.h"
23#include "dir.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "trans.h"
34#include "util.h"
35
36static const u32 gfs2_old_fs_formats[] = {
37 0
38};
39
40static const u32 gfs2_old_multihost_formats[] = {
41 0
42};
43
44/**
45 * gfs2_tune_init - Fill a gfs2_tune structure with default values
46 * @gt: tune
47 *
48 */
49
50void gfs2_tune_init(struct gfs2_tune *gt)
51{
52 spin_lock_init(&gt->gt_spin);
53
54 gt->gt_ilimit = 100;
55 gt->gt_ilimit_tries = 3;
56 gt->gt_ilimit_min = 1;
57 gt->gt_demote_secs = 300;
58 gt->gt_incore_log_blocks = 1024;
59 gt->gt_log_flush_secs = 60;
60 gt->gt_jindex_refresh_secs = 60;
61 gt->gt_scand_secs = 15;
62 gt->gt_recoverd_secs = 60;
63 gt->gt_logd_secs = 1;
64 gt->gt_quotad_secs = 5;
65 gt->gt_quota_simul_sync = 64;
66 gt->gt_quota_warn_period = 10;
67 gt->gt_quota_scale_num = 1;
68 gt->gt_quota_scale_den = 1;
69 gt->gt_quota_cache_secs = 300;
70 gt->gt_quota_quantum = 60;
71 gt->gt_atime_quantum = 3600;
72 gt->gt_new_files_jdata = 0;
73 gt->gt_new_files_directio = 0;
74 gt->gt_max_atomic_write = 4 << 20;
75 gt->gt_max_readahead = 1 << 18;
76 gt->gt_lockdump_size = 131072;
77 gt->gt_stall_secs = 600;
78 gt->gt_complain_secs = 10;
79 gt->gt_reclaim_limit = 5000;
80 gt->gt_entries_per_readdir = 32;
81 gt->gt_prefetch_secs = 10;
82 gt->gt_greedy_default = HZ / 10;
83 gt->gt_greedy_quantum = HZ / 40;
84 gt->gt_greedy_max = HZ / 4;
85 gt->gt_statfs_quantum = 30;
86 gt->gt_statfs_slow = 0;
87}
88
89/**
90 * gfs2_check_sb - Check superblock
91 * @sdp: the filesystem
92 * @sb: The superblock
93 * @silent: Don't print a message if the check fails
94 *
95 * Checks the version code of the FS is one that we understand how to
96 * read and that the sizes of the various on-disk structures have not
97 * changed.
98 */
99
100int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
101{
102 unsigned int x;
103
104 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
105 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
106 if (!silent)
107 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
108 return -EINVAL;
109 }
110
111 /* If format numbers match exactly, we're done. */
112
113 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
114 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
115 return 0;
116
117 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
118 for (x = 0; gfs2_old_fs_formats[x]; x++)
119 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
120 break;
121
122 if (!gfs2_old_fs_formats[x]) {
123 printk(KERN_WARNING
124 "GFS2: code version (%u, %u) is incompatible "
125 "with ondisk format (%u, %u)\n",
126 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
127 sb->sb_fs_format, sb->sb_multihost_format);
128 printk(KERN_WARNING
129 "GFS2: I don't know how to upgrade this FS\n");
130 return -EINVAL;
131 }
132 }
133
134 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
135 for (x = 0; gfs2_old_multihost_formats[x]; x++)
136 if (gfs2_old_multihost_formats[x] ==
137 sb->sb_multihost_format)
138 break;
139
140 if (!gfs2_old_multihost_formats[x]) {
141 printk(KERN_WARNING
142 "GFS2: code version (%u, %u) is incompatible "
143 "with ondisk format (%u, %u)\n",
144 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
145 sb->sb_fs_format, sb->sb_multihost_format);
146 printk(KERN_WARNING
147 "GFS2: I don't know how to upgrade this FS\n");
148 return -EINVAL;
149 }
150 }
151
152 if (!sdp->sd_args.ar_upgrade) {
153 printk(KERN_WARNING
154 "GFS2: code version (%u, %u) is incompatible "
155 "with ondisk format (%u, %u)\n",
156 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
157 sb->sb_fs_format, sb->sb_multihost_format);
158 printk(KERN_INFO
159 "GFS2: Use the \"upgrade\" mount option to upgrade "
160 "the FS\n");
161 printk(KERN_INFO "GFS2: See the manual for more details\n");
162 return -EINVAL;
163 }
164
165 return 0;
166}
167
168
169static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
170{
171 struct page *page = bio->bi_private;
172 if (bio->bi_size)
173 return 1;
174
175 if (!error)
176 SetPageUptodate(page);
177 else
178 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
179 unlock_page(page);
180 return 0;
181}
182
183struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
184{
185 struct page *page;
186 struct bio *bio;
187
188 page = alloc_page(GFP_KERNEL);
189 if (unlikely(!page))
190 return NULL;
191
192 ClearPageUptodate(page);
193 ClearPageDirty(page);
194 lock_page(page);
195
196 bio = bio_alloc(GFP_KERNEL, 1);
197 if (unlikely(!bio)) {
198 __free_page(page);
199 return NULL;
200 }
201
202 bio->bi_sector = sector;
203 bio->bi_bdev = sb->s_bdev;
204 bio_add_page(bio, page, PAGE_SIZE, 0);
205
206 bio->bi_end_io = end_bio_io_page;
207 bio->bi_private = page;
208 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
209 wait_on_page_locked(page);
210 bio_put(bio);
211 if (!PageUptodate(page)) {
212 __free_page(page);
213 return NULL;
214 }
215 return page;
216}
217
218/**
219 * gfs2_read_sb - Read super block
220 * @sdp: The GFS2 superblock
221 * @gl: the glock for the superblock (assumed to be held)
222 * @silent: Don't print a message if the mount fails
223 *
224 */
225
226int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
227{
228 u32 hash_blocks, ind_blocks, leaf_blocks;
229 u32 tmp_blocks;
230 unsigned int x;
231 int error;
232 struct page *page;
233 char *sb;
234
235 page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
236 if (!page) {
237 if (!silent)
238 fs_err(sdp, "can't read superblock\n");
239 return -EIO;
240 }
241 sb = kmap(page);
242 gfs2_sb_in(&sdp->sd_sb, sb);
243 kunmap(page);
244 __free_page(page);
245
246 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
247 if (error)
248 return error;
249
250 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
251 GFS2_BASIC_BLOCK_SHIFT;
252 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
253 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
254 sizeof(struct gfs2_dinode)) / sizeof(u64);
255 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
256 sizeof(struct gfs2_meta_header)) / sizeof(u64);
257 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
258 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
259 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
260 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
261 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
262 sizeof(struct gfs2_meta_header)) /
263 sizeof(struct gfs2_quota_change);
264
265	/* Compute maximum reservation required to add an entry to a directory */
266
267 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
268 sdp->sd_jbsize);
269
270 ind_blocks = 0;
271 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
272 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
273 ind_blocks += tmp_blocks;
274 }
275
276 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
277
278 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
279
280 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
281 sizeof(struct gfs2_dinode);
282 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
283 for (x = 2;; x++) {
284 u64 space, d;
285 u32 m;
286
287 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
288 d = space;
289 m = do_div(d, sdp->sd_inptrs);
290
291 if (d != sdp->sd_heightsize[x - 1] || m)
292 break;
293 sdp->sd_heightsize[x] = space;
294 }
295 sdp->sd_max_height = x;
296 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
297
298 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
299 sizeof(struct gfs2_dinode);
300 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
301 for (x = 2;; x++) {
302 u64 space, d;
303 u32 m;
304
305 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
306 d = space;
307 m = do_div(d, sdp->sd_inptrs);
308
309 if (d != sdp->sd_jheightsize[x - 1] || m)
310 break;
311 sdp->sd_jheightsize[x] = space;
312 }
313 sdp->sd_max_jheight = x;
314 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
315
316 return 0;
317}
318
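The sd_heightsize[] loops above build a table of how many bytes a metadata tree of each height can address, stopping as soon as the 64-bit multiply would overflow; the do_div()-and-compare is an overflow test, not real arithmetic. A standalone version of the same computation (the dinode header size and pointer counts below are illustrative values for a 4096-byte block):

#include <stdint.h>
#include <stdio.h>

#define MAX_HEIGHT 10

/* Fill size[h] with the bytes addressable at height h and return the
 * number of valid heights; overflow is detected by dividing back. */
static unsigned int build_heights(uint64_t *size, uint64_t bsize,
                                  uint64_t diptrs, uint64_t inptrs)
{
        unsigned int x;

        size[0] = bsize - 232;          /* space left in the dinode itself */
        size[1] = bsize * diptrs;       /* one level of direct pointers */
        for (x = 2; x < MAX_HEIGHT; x++) {
                uint64_t space = size[x - 1] * inptrs;

                if (space / inptrs != size[x - 1])
                        break;          /* multiplication overflowed */
                size[x] = space;
        }
        return x;
}

int main(void)
{
        uint64_t size[MAX_HEIGHT];
        unsigned int h = build_heights(size, 4096, 483, 509);

        printf("max height %u, top size %llu bytes\n", h,
               (unsigned long long)size[h - 1]);
        return 0;
}
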
319/**
320 * gfs2_jindex_hold - Grab a lock on the jindex
321 * @sdp: The GFS2 superblock
322 * @ji_gh: the holder for the jindex glock
323 *
324 * This is very similar to the gfs2_rindex_hold() function, except that
325 * in general we hold the jindex lock for longer periods of time and
326 * grab it far less frequently than the rgrp lock.
327 *
328 * Returns: errno
329 */
330
331int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
332{
333 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
334 struct qstr name;
335 char buf[20];
336 struct gfs2_jdesc *jd;
337 int error;
338
339 name.name = buf;
340
341 mutex_lock(&sdp->sd_jindex_mutex);
342
343 for (;;) {
344 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
345 GL_LOCAL_EXCL, ji_gh);
346 if (error)
347 break;
348
349 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
350 name.hash = gfs2_disk_hash(name.name, name.len);
351
352 error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
353 if (error == -ENOENT) {
354 error = 0;
355 break;
356 }
357
358 gfs2_glock_dq_uninit(ji_gh);
359
360 if (error)
361 break;
362
363 error = -ENOMEM;
364 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
365 if (!jd)
366 break;
367
368 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
369 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
370 if (!jd->jd_inode)
371 error = -ENOENT;
372 else
373 error = PTR_ERR(jd->jd_inode);
374 kfree(jd);
375 break;
376 }
377
378 spin_lock(&sdp->sd_jindex_spin);
379 jd->jd_jid = sdp->sd_journals++;
380 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
381 spin_unlock(&sdp->sd_jindex_spin);
382 }
383
384 mutex_unlock(&sdp->sd_jindex_mutex);
385
386 return error;
387}
388
389/**
390 * gfs2_jindex_free - Clear all the journal index information
391 * @sdp: The GFS2 superblock
392 *
393 */
394
395void gfs2_jindex_free(struct gfs2_sbd *sdp)
396{
397 struct list_head list;
398 struct gfs2_jdesc *jd;
399
400 spin_lock(&sdp->sd_jindex_spin);
401 list_add(&list, &sdp->sd_jindex_list);
402 list_del_init(&sdp->sd_jindex_list);
403 sdp->sd_journals = 0;
404 spin_unlock(&sdp->sd_jindex_spin);
405
406 while (!list_empty(&list)) {
407 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
408 list_del(&jd->jd_list);
409 iput(jd->jd_inode);
410 kfree(jd);
411 }
412}
413
414static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
415{
416 struct gfs2_jdesc *jd;
417 int found = 0;
418
419 list_for_each_entry(jd, head, jd_list) {
420 if (jd->jd_jid == jid) {
421 found = 1;
422 break;
423 }
424 }
425
426 if (!found)
427 jd = NULL;
428
429 return jd;
430}
431
432struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
433{
434 struct gfs2_jdesc *jd;
435
436 spin_lock(&sdp->sd_jindex_spin);
437 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
438 spin_unlock(&sdp->sd_jindex_spin);
439
440 return jd;
441}
442
443void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
444{
445 struct gfs2_jdesc *jd;
446
447 spin_lock(&sdp->sd_jindex_spin);
448 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
449 if (jd)
450 jd->jd_dirty = 1;
451 spin_unlock(&sdp->sd_jindex_spin);
452}
453
454struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
455{
456 struct gfs2_jdesc *jd;
457 int found = 0;
458
459 spin_lock(&sdp->sd_jindex_spin);
460
461 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
462 if (jd->jd_dirty) {
463 jd->jd_dirty = 0;
464 found = 1;
465 break;
466 }
467 }
468 spin_unlock(&sdp->sd_jindex_spin);
469
470 if (!found)
471 jd = NULL;
472
473 return jd;
474}
475
476int gfs2_jdesc_check(struct gfs2_jdesc *jd)
477{
478 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
479 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
480 int ar;
481 int error;
482
483 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
484 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
485 gfs2_consist_inode(ip);
486 return -EIO;
487 }
488 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
489
490 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
491 if (!error && ar) {
492 gfs2_consist_inode(ip);
493 error = -EIO;
494 }
495
496 return error;
497}
498
499/**
500 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
501 * @sdp: the filesystem
502 *
503 * Returns: errno
504 */
505
506int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
507{
508 struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
509 struct gfs2_glock *j_gl = ip->i_gl;
510 struct gfs2_holder t_gh;
511 struct gfs2_log_header head;
512 int error;
513
514 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
515 GL_LOCAL_EXCL, &t_gh);
516 if (error)
517 return error;
518
519 gfs2_meta_cache_flush(ip);
520 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
521
522 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
523 if (error)
524 goto fail;
525
526 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
527 gfs2_consist(sdp);
528 error = -EIO;
529 goto fail;
530 }
531
532	/* Initialize the log head */
533 sdp->sd_log_sequence = head.lh_sequence + 1;
534 gfs2_log_pointers_init(sdp, head.lh_blkno);
535
536 error = gfs2_quota_init(sdp);
537 if (error)
538 goto fail;
539
540 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
541
542 gfs2_glock_dq_uninit(&t_gh);
543
544 return 0;
545
546fail:
547 t_gh.gh_flags |= GL_NOCACHE;
548 gfs2_glock_dq_uninit(&t_gh);
549
550 return error;
551}
552
553/**
554 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
555 * @sdp: the filesystem
556 *
557 * Returns: errno
558 */
559
560int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
561{
562 struct gfs2_holder t_gh;
563 int error;
564
565 gfs2_quota_sync(sdp);
566 gfs2_statfs_sync(sdp);
567
568 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
569 GL_LOCAL_EXCL | GL_NOCACHE,
570 &t_gh);
571 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
572 return error;
573
574 gfs2_meta_syncfs(sdp);
575 gfs2_log_shutdown(sdp);
576
577 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
578
579 if (t_gh.gh_gl)
580 gfs2_glock_dq_uninit(&t_gh);
581
582 gfs2_quota_cleanup(sdp);
583
584 return error;
585}
586
587int gfs2_statfs_init(struct gfs2_sbd *sdp)
588{
589 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
590 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
591 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
592 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
593 struct buffer_head *m_bh, *l_bh;
594 struct gfs2_holder gh;
595 int error;
596
597 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
598 &gh);
599 if (error)
600 return error;
601
602 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
603 if (error)
604 goto out;
605
606 if (sdp->sd_args.ar_spectator) {
607 spin_lock(&sdp->sd_statfs_spin);
608 gfs2_statfs_change_in(m_sc, m_bh->b_data +
609 sizeof(struct gfs2_dinode));
610 spin_unlock(&sdp->sd_statfs_spin);
611 } else {
612 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
613 if (error)
614 goto out_m_bh;
615
616 spin_lock(&sdp->sd_statfs_spin);
617 gfs2_statfs_change_in(m_sc, m_bh->b_data +
618 sizeof(struct gfs2_dinode));
619 gfs2_statfs_change_in(l_sc, l_bh->b_data +
620 sizeof(struct gfs2_dinode));
621 spin_unlock(&sdp->sd_statfs_spin);
622
623 brelse(l_bh);
624 }
625
626out_m_bh:
627 brelse(m_bh);
628out:
629 gfs2_glock_dq_uninit(&gh);
630	return error;
631}
632
633void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
634 s64 dinodes)
635{
636 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
637 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
638 struct buffer_head *l_bh;
639 int error;
640
641 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
642 if (error)
643 return;
644
645 mutex_lock(&sdp->sd_statfs_mutex);
646 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
647 mutex_unlock(&sdp->sd_statfs_mutex);
648
649 spin_lock(&sdp->sd_statfs_spin);
650 l_sc->sc_total += total;
651 l_sc->sc_free += free;
652 l_sc->sc_dinodes += dinodes;
653 gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
654 spin_unlock(&sdp->sd_statfs_spin);
655
656 brelse(l_bh);
657}
658
659int gfs2_statfs_sync(struct gfs2_sbd *sdp)
660{
661 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
662 struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
663 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
664 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
665 struct gfs2_holder gh;
666 struct buffer_head *m_bh, *l_bh;
667 int error;
668
669 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
670 &gh);
671 if (error)
672 return error;
673
674 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
675 if (error)
676 goto out;
677
678 spin_lock(&sdp->sd_statfs_spin);
679 gfs2_statfs_change_in(m_sc, m_bh->b_data +
680 sizeof(struct gfs2_dinode));
681 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
682 spin_unlock(&sdp->sd_statfs_spin);
683 goto out_bh;
684 }
685 spin_unlock(&sdp->sd_statfs_spin);
686
687 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
688 if (error)
689 goto out_bh;
690
691 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
692 if (error)
693 goto out_bh2;
694
695 mutex_lock(&sdp->sd_statfs_mutex);
696 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
697 mutex_unlock(&sdp->sd_statfs_mutex);
698
699 spin_lock(&sdp->sd_statfs_spin);
700 m_sc->sc_total += l_sc->sc_total;
701 m_sc->sc_free += l_sc->sc_free;
702 m_sc->sc_dinodes += l_sc->sc_dinodes;
703 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
704 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
705 0, sizeof(struct gfs2_statfs_change));
706 spin_unlock(&sdp->sd_statfs_spin);
707
708 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
709 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
710
711 gfs2_trans_end(sdp);
712
713out_bh2:
714 brelse(l_bh);
715out_bh:
716 brelse(m_bh);
717out:
718 gfs2_glock_dq_uninit(&gh);
719 return error;
720}
721
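The statfs scheme splits the counters in two: every node appends deltas to its own change file (cheap, no cross-node contention) and only gfs2_statfs_sync() folds them into the shared master file; readers report master plus local. A single-node sketch of that delta/fold split, with a pthread mutex standing in for the glock and spinlock:

#include <pthread.h>

struct sc { long total, free, dinodes; };

struct statfs_state {
        pthread_mutex_t lock;
        struct sc master;       /* counts all nodes agree on */
        struct sc local;        /* this node's not-yet-synced deltas */
};

/* Record a local delta (frequent, cheap). */
static void sc_change(struct statfs_state *s, long total, long free,
                      long dinodes)
{
        pthread_mutex_lock(&s->lock);
        s->local.total += total;
        s->local.free += free;
        s->local.dinodes += dinodes;
        pthread_mutex_unlock(&s->lock);
}

/* Fold the local deltas into the master (rare, expensive). */
static void sc_sync(struct statfs_state *s)
{
        pthread_mutex_lock(&s->lock);
        s->master.total += s->local.total;
        s->master.free += s->local.free;
        s->master.dinodes += s->local.dinodes;
        s->local = (struct sc){0, 0, 0};
        pthread_mutex_unlock(&s->lock);
}
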
722/**
723 * gfs2_statfs_i - Do a statfs
724 * @sdp: the filesystem
725 * @sc: the statfs change structure to fill in
726 *
727 * Returns: errno
728 */
729
730int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
731{
732 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
733 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
734
735 spin_lock(&sdp->sd_statfs_spin);
736
737 *sc = *m_sc;
738 sc->sc_total += l_sc->sc_total;
739 sc->sc_free += l_sc->sc_free;
740 sc->sc_dinodes += l_sc->sc_dinodes;
741
742 spin_unlock(&sdp->sd_statfs_spin);
743
744 if (sc->sc_free < 0)
745 sc->sc_free = 0;
746 if (sc->sc_free > sc->sc_total)
747 sc->sc_free = sc->sc_total;
748 if (sc->sc_dinodes < 0)
749 sc->sc_dinodes = 0;
750
751 return 0;
752}
753
754/**
755 * statfs_slow_fill - fill in the sc for a given RG
756 * @rgd: the RG
757 * @sc: the sc structure
758 *
759 * Returns: 0
760 */
761
762static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
763 struct gfs2_statfs_change *sc)
764{
765 gfs2_rgrp_verify(rgd);
766 sc->sc_total += rgd->rd_ri.ri_data;
767 sc->sc_free += rgd->rd_rg.rg_free;
768 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
769 return 0;
770}
771
772/**
773 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
774 * @sdp: the filesystem
775 * @sc: the sc info that will be returned
776 *
777 * Any error (other than a signal) will cause this routine to fall back
778 * to the synchronous version.
779 *
780 * FIXME: This really shouldn't busy wait like this.
781 *
782 * Returns: errno
783 */
784
785int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
786{
787 struct gfs2_holder ri_gh;
788 struct gfs2_rgrpd *rgd_next;
789 struct gfs2_holder *gha, *gh;
790 unsigned int slots = 64;
791 unsigned int x;
792 int done;
793 int error = 0, err;
794
795 memset(sc, 0, sizeof(struct gfs2_statfs_change));
796 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
797 if (!gha)
798 return -ENOMEM;
799
800 error = gfs2_rindex_hold(sdp, &ri_gh);
801 if (error)
802 goto out;
803
804 rgd_next = gfs2_rgrpd_get_first(sdp);
805
806 for (;;) {
807 done = 1;
808
809 for (x = 0; x < slots; x++) {
810 gh = gha + x;
811
812 if (gh->gh_gl && gfs2_glock_poll(gh)) {
813 err = gfs2_glock_wait(gh);
814 if (err) {
815 gfs2_holder_uninit(gh);
816 error = err;
817 } else {
818 if (!error)
819 error = statfs_slow_fill(
820 gh->gh_gl->gl_object, sc);
821 gfs2_glock_dq_uninit(gh);
822 }
823 }
824
825 if (gh->gh_gl)
826 done = 0;
827 else if (rgd_next && !error) {
828 error = gfs2_glock_nq_init(rgd_next->rd_gl,
829 LM_ST_SHARED,
830 GL_ASYNC,
831 gh);
832 rgd_next = gfs2_rgrpd_get_next(rgd_next);
833 done = 0;
834 }
835
836 if (signal_pending(current))
837 error = -ERESTARTSYS;
838 }
839
840 if (done)
841 break;
842
843 yield();
844 }
845
846 gfs2_glock_dq_uninit(&ri_gh);
847
848out:
849 kfree(gha);
850 return error;
851}
852
853struct lfcc {
854 struct list_head list;
855 struct gfs2_holder gh;
856};
857
858/**
859 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
860 * journals are clean
861 * @sdp: the file system
862 * @t_gh: the holder for the transaction lock, which is acquired in
863 * the LM_ST_DEFERRED state
864 *
865 * Returns: errno
866 */
867
868static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
869 struct gfs2_holder *t_gh)
870{
871 struct gfs2_inode *ip;
872 struct gfs2_holder ji_gh;
873 struct gfs2_jdesc *jd;
874 struct lfcc *lfcc;
875 LIST_HEAD(list);
876 struct gfs2_log_header lh;
877 int error;
878
879 error = gfs2_jindex_hold(sdp, &ji_gh);
880 if (error)
881 return error;
882
883 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
884 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
885 if (!lfcc) {
886 error = -ENOMEM;
887 goto out;
888 }
889 ip = GFS2_I(jd->jd_inode);
890 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
891 if (error) {
892 kfree(lfcc);
893 goto out;
894 }
895 list_add(&lfcc->list, &list);
896 }
897
898 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
899 LM_FLAG_PRIORITY | GL_NOCACHE,
900 t_gh);
901
902 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
903 error = gfs2_jdesc_check(jd);
904 if (error)
905 break;
906 error = gfs2_find_jhead(jd, &lh);
907 if (error)
908 break;
909 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
910 error = -EBUSY;
911 break;
912 }
913 }
914
915 if (error)
916 gfs2_glock_dq_uninit(t_gh);
917
918out:
919 while (!list_empty(&list)) {
920 lfcc = list_entry(list.next, struct lfcc, list);
921 list_del(&lfcc->list);
922 gfs2_glock_dq_uninit(&lfcc->gh);
923 kfree(lfcc);
924 }
925 gfs2_glock_dq_uninit(&ji_gh);
926 return error;
927}
928
929/**
930 * gfs2_freeze_fs - freezes the file system
931 * @sdp: the file system
932 *
933 * This function flushes data and metadata for all machines by
934 * acquiring the transaction lock exclusively. All journals are
935 * ensured to be in a clean state as well.
936 *
937 * Returns: errno
938 */
939
940int gfs2_freeze_fs(struct gfs2_sbd *sdp)
941{
942 int error = 0;
943
944 mutex_lock(&sdp->sd_freeze_lock);
945
946 if (!sdp->sd_freeze_count++) {
947 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
948 if (error)
949 sdp->sd_freeze_count--;
950 }
951
952 mutex_unlock(&sdp->sd_freeze_lock);
953
954 return error;
955}
956
957/**
958 * gfs2_unfreeze_fs - unfreezes the file system
959 * @sdp: the file system
960 *
961 * This function allows the file system to proceed by unlocking
962 * the exclusively held transaction lock. Other GFS2 nodes are
963 * now free to acquire the lock shared and go on with their lives.
964 *
965 */
966
967void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
968{
969 mutex_lock(&sdp->sd_freeze_lock);
970
971 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
972 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
973
974 mutex_unlock(&sdp->sd_freeze_lock);
975}
976
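Freeze and unfreeze nest via sd_freeze_count: only the 0-to-1 transition does the expensive lock-and-verify, and only the final unfreeze releases it, so stacked freezers compose safely. The counting skeleton on its own:

#include <pthread.h>

struct freezer {
        pthread_mutex_t lock;
        unsigned int count;     /* nesting depth of freeze requests */
};

/* Only the first freeze performs the real action; nested calls just
 * bump the count. Returns 0 or the action's error. */
static int freeze(struct freezer *f, int (*do_freeze)(void))
{
        int error = 0;

        pthread_mutex_lock(&f->lock);
        if (!f->count++) {
                error = do_freeze();
                if (error)
                        f->count--;     /* roll back on failure */
        }
        pthread_mutex_unlock(&f->lock);
        return error;
}

/* Only the last unfreeze undoes the action. */
static void unfreeze(struct freezer *f, void (*do_thaw)(void))
{
        pthread_mutex_lock(&f->lock);
        if (f->count && !--f->count)
                do_thaw();
        pthread_mutex_unlock(&f->lock);
}
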
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..5bb443ae0f59
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13#include "incore.h"
14
15void gfs2_tune_init(struct gfs2_tune *gt);
16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19struct page *gfs2_read_super(struct super_block *sb, sector_t sector);
20
21static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
22{
23 unsigned int x;
24 spin_lock(&sdp->sd_jindex_spin);
25 x = sdp->sd_journals;
26 spin_unlock(&sdp->sd_jindex_spin);
27 return x;
28}
29
30int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
31void gfs2_jindex_free(struct gfs2_sbd *sdp);
32
33struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
34void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
35struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
36int gfs2_jdesc_check(struct gfs2_jdesc *jd);
37
38int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
39 struct gfs2_inode **ipp);
40
41int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
42int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
43
44int gfs2_statfs_init(struct gfs2_sbd *sdp);
45void gfs2_statfs_change(struct gfs2_sbd *sdp,
46 s64 total, s64 free, s64 dinodes);
47int gfs2_statfs_sync(struct gfs2_sbd *sdp);
48int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
49int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
50
51int gfs2_freeze_fs(struct gfs2_sbd *sdp);
52void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
53
54#endif /* __SUPER_DOT_H__ */
55
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..0e0ec988f731
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,583 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <asm/uaccess.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "lm.h"
24#include "sys.h"
25#include "super.h"
26#include "glock.h"
27#include "quota.h"
28#include "util.h"
29
30char *gfs2_sys_margs;
31spinlock_t gfs2_sys_margs_lock;
32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{
35 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
36}
37
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
39{
40 return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
41}
42
43static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
44{
45 unsigned int count;
46
47 mutex_lock(&sdp->sd_freeze_lock);
48 count = sdp->sd_freeze_count;
49 mutex_unlock(&sdp->sd_freeze_lock);
50
51 return snprintf(buf, PAGE_SIZE, "%u\n", count);
52}
53
54static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
55{
56 ssize_t ret = len;
57 int error = 0;
58 int n = simple_strtol(buf, NULL, 0);
59
60 if (!capable(CAP_SYS_ADMIN))
61 return -EACCES;
62
63 switch (n) {
64 case 0:
65 gfs2_unfreeze_fs(sdp);
66 break;
67 case 1:
68 error = gfs2_freeze_fs(sdp);
69 break;
70 default:
71 ret = -EINVAL;
72 }
73
74 if (error)
75		fs_warn(sdp, "freeze %d error %d\n", n, error);
76
77 return ret;
78}
79
80static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
81{
82 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
83 return snprintf(buf, PAGE_SIZE, "%u\n", b);
84}
85
86static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
87{
88 if (!capable(CAP_SYS_ADMIN))
89 return -EACCES;
90
91 if (simple_strtol(buf, NULL, 0) != 1)
92 return -EINVAL;
93
94 gfs2_lm_withdraw(sdp,
95 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
96 sdp->sd_fsname);
97 return len;
98}
99
100static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
101 size_t len)
102{
103 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES;
105
106 if (simple_strtol(buf, NULL, 0) != 1)
107 return -EINVAL;
108
109 gfs2_statfs_sync(sdp);
110 return len;
111}
112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len)
127{
128 if (!capable(CAP_SYS_ADMIN))
129 return -EACCES;
130
131 if (simple_strtol(buf, NULL, 0) != 1)
132 return -EINVAL;
133
134 gfs2_quota_sync(sdp);
135 return len;
136}
137
138static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
139 size_t len)
140{
141 u32 id;
142
143 if (!capable(CAP_SYS_ADMIN))
144 return -EACCES;
145
146 id = simple_strtoul(buf, NULL, 0);
147
148 gfs2_quota_refresh(sdp, 1, id);
149 return len;
150}
151
152static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
153 size_t len)
154{
155 u32 id;
156
157 if (!capable(CAP_SYS_ADMIN))
158 return -EACCES;
159
160 id = simple_strtoul(buf, NULL, 0);
161
162 gfs2_quota_refresh(sdp, 0, id);
163 return len;
164}
165
166struct gfs2_attr {
167 struct attribute attr;
168 ssize_t (*show)(struct gfs2_sbd *, char *);
169 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
170};
171
172#define GFS2_ATTR(name, mode, show, store) \
173static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
174
175GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
182GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
183GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
184
185static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr,
193 &gfs2_attr_quota_refresh_user.attr,
194 &gfs2_attr_quota_refresh_group.attr,
195 NULL,
196};
197
198static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
199 char *buf)
200{
201 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
202 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
203 return a->show ? a->show(sdp, buf) : 0;
204}
205
206static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
207 const char *buf, size_t len)
208{
209 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
210 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
211 return a->store ? a->store(sdp, buf, len) : len;
212}
213
214static struct sysfs_ops gfs2_attr_ops = {
215 .show = gfs2_attr_show,
216 .store = gfs2_attr_store,
217};
218
219static struct kobj_type gfs2_ktype = {
220 .default_attrs = gfs2_attrs,
221 .sysfs_ops = &gfs2_attr_ops,
222};
223
224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2"},
227 .ktype = &gfs2_ktype,
228};
229
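Since gfs2_kset sits under fs_subsys and each superblock's kobject is named after its lock table (see gfs2_sys_fs_add() below), these attributes appear as files under /sys/fs/gfs2/<table>/. A userspace sketch of driving the freeze attribute (editor's illustration, not part of this patch; the path and function name are made up):

/* Userspace sketch: freeze a gfs2 filesystem by writing "1" to its
 * sysfs freeze attribute, which is parsed by freeze_store() above.
 * The table name in the path is a placeholder. */
#include <fcntl.h>
#include <unistd.h>

static int gfs2_example_freeze(void)
{
	/* e.g. /sys/fs/gfs2/mycluster:myfs/freeze */
	int fd = open("/sys/fs/gfs2/mycluster:myfs/freeze", O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}
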
230/*
231 * display struct lm_lockstruct fields
232 */
233
234struct lockstruct_attr {
235 struct attribute attr;
236 ssize_t (*show)(struct gfs2_sbd *, char *);
237};
238
239#define LOCKSTRUCT_ATTR(name, fmt) \
240static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
241{ \
242 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
243} \
244static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
245
246LOCKSTRUCT_ATTR(jid, "%u\n");
247LOCKSTRUCT_ATTR(first, "%u\n");
248LOCKSTRUCT_ATTR(lvb_size, "%u\n");
249LOCKSTRUCT_ATTR(flags, "%d\n");
250
251static struct attribute *lockstruct_attrs[] = {
252 &lockstruct_attr_jid.attr,
253 &lockstruct_attr_first.attr,
254 &lockstruct_attr_lvb_size.attr,
255 &lockstruct_attr_flags.attr,
256 NULL,
257};
258
259/*
260 * display struct gfs2_args fields
261 */
262
263struct args_attr {
264 struct attribute attr;
265 ssize_t (*show)(struct gfs2_sbd *, char *);
266};
267
268#define ARGS_ATTR(name, fmt) \
269static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
270{ \
271 return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \
272} \
273static struct args_attr args_attr_##name = __ATTR_RO(name)
274
275ARGS_ATTR(lockproto, "%s\n");
276ARGS_ATTR(locktable, "%s\n");
277ARGS_ATTR(hostdata, "%s\n");
278ARGS_ATTR(spectator, "%d\n");
279ARGS_ATTR(ignore_local_fs, "%d\n");
280ARGS_ATTR(localcaching, "%d\n");
281ARGS_ATTR(localflocks, "%d\n");
282ARGS_ATTR(debug, "%d\n");
283ARGS_ATTR(upgrade, "%d\n");
284ARGS_ATTR(num_glockd, "%u\n");
285ARGS_ATTR(posix_acl, "%d\n");
286ARGS_ATTR(quota, "%u\n");
287ARGS_ATTR(suiddir, "%d\n");
288ARGS_ATTR(data, "%d\n");
289
290/* one oddball doesn't fit the macro mold */
291static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
292{
293 return snprintf(buf, PAGE_SIZE, "%d\n",
294 !!test_bit(SDF_NOATIME, &sdp->sd_flags));
295}
296static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
297
298static struct attribute *args_attrs[] = {
299 &args_attr_lockproto.attr,
300 &args_attr_locktable.attr,
301 &args_attr_hostdata.attr,
302 &args_attr_spectator.attr,
303 &args_attr_ignore_local_fs.attr,
304 &args_attr_localcaching.attr,
305 &args_attr_localflocks.attr,
306 &args_attr_debug.attr,
307 &args_attr_upgrade.attr,
308 &args_attr_num_glockd.attr,
309 &args_attr_posix_acl.attr,
310 &args_attr_quota.attr,
311 &args_attr_suiddir.attr,
312 &args_attr_data.attr,
313 &args_attr_noatime.attr,
314 NULL,
315};
316
317/*
318 * display counters from superblock
319 */
320
321struct counters_attr {
322 struct attribute attr;
323 ssize_t (*show)(struct gfs2_sbd *, char *);
324};
325
326#define COUNTERS_ATTR(name, fmt) \
327static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
328{ \
329 return snprintf(buf, PAGE_SIZE, fmt, \
330 (unsigned int)atomic_read(&sdp->sd_##name)); \
331} \
332static struct counters_attr counters_attr_##name = __ATTR_RO(name)
333
334COUNTERS_ATTR(glock_count, "%u\n");
335COUNTERS_ATTR(glock_held_count, "%u\n");
336COUNTERS_ATTR(inode_count, "%u\n");
337COUNTERS_ATTR(reclaimed, "%u\n");
338
339static struct attribute *counters_attrs[] = {
340 &counters_attr_glock_count.attr,
341 &counters_attr_glock_held_count.attr,
342 &counters_attr_inode_count.attr,
343 &counters_attr_reclaimed.attr,
344 NULL,
345};
346
347/*
348 * get and set struct gfs2_tune fields
349 */
350
351static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
352{
353 return snprintf(buf, PAGE_SIZE, "%u %u\n",
354 sdp->sd_tune.gt_quota_scale_num,
355 sdp->sd_tune.gt_quota_scale_den);
356}
357
358static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
359 size_t len)
360{
361 struct gfs2_tune *gt = &sdp->sd_tune;
362 unsigned int x, y;
363
364 if (!capable(CAP_SYS_ADMIN))
365 return -EACCES;
366
367 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
368 return -EINVAL;
369
370 spin_lock(&gt->gt_spin);
371 gt->gt_quota_scale_num = x;
372 gt->gt_quota_scale_den = y;
373 spin_unlock(&gt->gt_spin);
374 return len;
375}
376
377static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
378 int check_zero, const char *buf, size_t len)
379{
380 struct gfs2_tune *gt = &sdp->sd_tune;
381 unsigned int x;
382
383 if (!capable(CAP_SYS_ADMIN))
384 return -EACCES;
385
386 x = simple_strtoul(buf, NULL, 0);
387
388 if (check_zero && !x)
389 return -EINVAL;
390
391 spin_lock(&gt->gt_spin);
392 *field = x;
393 spin_unlock(&gt->gt_spin);
394 return len;
395}
396
397struct tune_attr {
398 struct attribute attr;
399 ssize_t (*show)(struct gfs2_sbd *, char *);
400 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
401};
402
403#define TUNE_ATTR_3(name, show, store) \
404static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
405
406#define TUNE_ATTR_2(name, store) \
407static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
408{ \
409 return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \
410} \
411TUNE_ATTR_3(name, name##_show, store)
412
413#define TUNE_ATTR(name, check_zero) \
414static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
415{ \
416 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
417} \
418TUNE_ATTR_2(name, name##_store)
419
420#define TUNE_ATTR_DAEMON(name, process) \
421static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
422{ \
423 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
424 wake_up_process(sdp->sd_##process); \
425 return r; \
426} \
427TUNE_ATTR_2(name, name##_store)
428
429TUNE_ATTR(ilimit, 0);
430TUNE_ATTR(ilimit_tries, 0);
431TUNE_ATTR(ilimit_min, 0);
432TUNE_ATTR(demote_secs, 0);
433TUNE_ATTR(incore_log_blocks, 0);
434TUNE_ATTR(log_flush_secs, 0);
435TUNE_ATTR(jindex_refresh_secs, 0);
436TUNE_ATTR(quota_warn_period, 0);
437TUNE_ATTR(quota_quantum, 0);
438TUNE_ATTR(atime_quantum, 0);
439TUNE_ATTR(max_readahead, 0);
440TUNE_ATTR(complain_secs, 0);
441TUNE_ATTR(reclaim_limit, 0);
442TUNE_ATTR(prefetch_secs, 0);
443TUNE_ATTR(statfs_slow, 0);
444TUNE_ATTR(new_files_jdata, 0);
445TUNE_ATTR(new_files_directio, 0);
446TUNE_ATTR(quota_simul_sync, 1);
447TUNE_ATTR(quota_cache_secs, 1);
448TUNE_ATTR(max_atomic_write, 1);
449TUNE_ATTR(stall_secs, 1);
450TUNE_ATTR(entries_per_readdir, 1);
451TUNE_ATTR(greedy_default, 1);
452TUNE_ATTR(greedy_quantum, 1);
453TUNE_ATTR(greedy_max, 1);
454TUNE_ATTR(statfs_quantum, 1);
455TUNE_ATTR_DAEMON(scand_secs, scand_process);
456TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
457TUNE_ATTR_DAEMON(logd_secs, logd_process);
458TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
459TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
460
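For reference, one invocation above expands mechanically through TUNE_ATTR, TUNE_ATTR_2 and TUNE_ATTR_3; the preprocessor output for TUNE_ATTR(demote_secs, 0) is roughly the following (editor's illustration, whitespace added):

static ssize_t demote_secs_store(struct gfs2_sbd *sdp, const char *buf,
				 size_t len)
{
	return tune_set(sdp, &sdp->sd_tune.gt_demote_secs, 0, buf, len);
}
static ssize_t demote_secs_show(struct gfs2_sbd *sdp, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_demote_secs);
}
static struct tune_attr tune_attr_demote_secs =
	__ATTR(demote_secs, 0644, demote_secs_show, demote_secs_store);
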
461static struct attribute *tune_attrs[] = {
462 &tune_attr_ilimit.attr,
463 &tune_attr_ilimit_tries.attr,
464 &tune_attr_ilimit_min.attr,
465 &tune_attr_demote_secs.attr,
466 &tune_attr_incore_log_blocks.attr,
467 &tune_attr_log_flush_secs.attr,
468 &tune_attr_jindex_refresh_secs.attr,
469 &tune_attr_quota_warn_period.attr,
470 &tune_attr_quota_quantum.attr,
471 &tune_attr_atime_quantum.attr,
472 &tune_attr_max_readahead.attr,
473 &tune_attr_complain_secs.attr,
474 &tune_attr_reclaim_limit.attr,
475 &tune_attr_prefetch_secs.attr,
476 &tune_attr_statfs_slow.attr,
477 &tune_attr_quota_simul_sync.attr,
478 &tune_attr_quota_cache_secs.attr,
479 &tune_attr_max_atomic_write.attr,
480 &tune_attr_stall_secs.attr,
481 &tune_attr_entries_per_readdir.attr,
482 &tune_attr_greedy_default.attr,
483 &tune_attr_greedy_quantum.attr,
484 &tune_attr_greedy_max.attr,
485 &tune_attr_statfs_quantum.attr,
486 &tune_attr_scand_secs.attr,
487 &tune_attr_recoverd_secs.attr,
488 &tune_attr_logd_secs.attr,
489 &tune_attr_quotad_secs.attr,
490 &tune_attr_quota_scale.attr,
491 &tune_attr_new_files_jdata.attr,
492 &tune_attr_new_files_directio.attr,
493 NULL,
494};
495
496static struct attribute_group lockstruct_group = {
497 .name = "lockstruct",
498 .attrs = lockstruct_attrs,
499};
500
501static struct attribute_group counters_group = {
502 .name = "counters",
503 .attrs = counters_attrs,
504};
505
506static struct attribute_group args_group = {
507 .name = "args",
508 .attrs = args_attrs,
509};
510
511static struct attribute_group tune_group = {
512 .name = "tune",
513 .attrs = tune_attrs,
514};
515
516int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
517{
518 int error;
519
520 sdp->sd_kobj.kset = &gfs2_kset;
521 sdp->sd_kobj.ktype = &gfs2_ktype;
522
523 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
524 if (error)
525 goto fail;
526
527 error = kobject_register(&sdp->sd_kobj);
528 if (error)
529 goto fail;
530
531 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
532 if (error)
533 goto fail_reg;
534
535 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
536 if (error)
537 goto fail_lockstruct;
538
539 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
540 if (error)
541 goto fail_counters;
542
543 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
544 if (error)
545 goto fail_args;
546
547 return 0;
548
549fail_args:
550 sysfs_remove_group(&sdp->sd_kobj, &args_group);
551fail_counters:
552 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
553fail_lockstruct:
554 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
555fail_reg:
556 kobject_unregister(&sdp->sd_kobj);
557fail:
558	fs_err(sdp, "error %d adding sysfs files\n", error);
559 return error;
560}
561
562void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
563{
564 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
565 sysfs_remove_group(&sdp->sd_kobj, &args_group);
566 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
567 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
568 kobject_unregister(&sdp->sd_kobj);
569}
570
571int gfs2_sys_init(void)
572{
573 gfs2_sys_margs = NULL;
574 spin_lock_init(&gfs2_sys_margs_lock);
575 return kset_register(&gfs2_kset);
576}
577
578void gfs2_sys_uninit(void)
579{
580 kfree(gfs2_sys_margs);
581 kset_unregister(&gfs2_kset);
582}
583
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..1ca8cdac5304
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13#include <linux/spinlock.h>
14struct gfs2_sbd;
15
16/* Allow args to be passed to GFS2 when using an initial ram disk */
17extern char *gfs2_sys_margs;
18extern spinlock_t gfs2_sys_margs_lock;
19
20int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
21void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
22
23int gfs2_sys_init(void);
24void gfs2_sys_uninit(void);
25
26#endif /* __SYS_DOT_H__ */
27
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..f8dabf8446bb
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17#include <linux/lm_interface.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "util.h"
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes)
30{
31 struct gfs2_trans *tr;
32 int error;
33
34 BUG_ON(current->journal_info);
35 BUG_ON(blocks == 0 && revokes == 0);
36
37 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
38 if (!tr)
39 return -ENOMEM;
40
41 tr->tr_ip = (unsigned long)__builtin_return_address(0);
42 tr->tr_blocks = blocks;
43 tr->tr_revokes = revokes;
44 tr->tr_reserved = 1;
45 if (blocks)
46 tr->tr_reserved += 6 + blocks;
47 if (revokes)
48 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
49 sizeof(u64));
50 INIT_LIST_HEAD(&tr->tr_list_buf);
51
52 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
53
54 error = gfs2_glock_nq(&tr->tr_t_gh);
55 if (error)
56 goto fail_holder_uninit;
57
58 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
59 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
60 error = -EROFS;
61 goto fail_gunlock;
62 }
63
64 error = gfs2_log_reserve(sdp, tr->tr_reserved);
65 if (error)
66 goto fail_gunlock;
67
68 current->journal_info = tr;
69
70 return 0;
71
72fail_gunlock:
73 gfs2_glock_dq(&tr->tr_t_gh);
74
75fail_holder_uninit:
76 gfs2_holder_uninit(&tr->tr_t_gh);
77 kfree(tr);
78
79 return error;
80}
81
82void gfs2_trans_end(struct gfs2_sbd *sdp)
83{
84 struct gfs2_trans *tr = current->journal_info;
85
86 BUG_ON(!tr);
87 current->journal_info = NULL;
88
89 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh);
92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 return;
95 }
96
97 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
98 fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
99 tr->tr_num_buf, tr->tr_blocks);
100 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
101 }
102 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
103 fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
104 tr->tr_num_revoke, tr->tr_revokes);
105 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
106 }
107
108 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh);
110 gfs2_holder_uninit(&tr->tr_t_gh);
111 kfree(tr);
112
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL);
115}
116
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to
125 * @bh: The buffer to add
126 * @meta: True in the case of adding metadata
127 *
128 */
129
130void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
131{
132 struct gfs2_sbd *sdp = gl->gl_sbd;
133 struct gfs2_bufdata *bd;
134
135 bd = bh->b_private;
136 if (bd)
137 gfs2_assert(sdp, bd->bd_gl == gl);
138 else {
139 gfs2_attach_bufdata(gl, bh, meta);
140 bd = bh->b_private;
141 }
142 lops_add(sdp, &bd->bd_le);
143}
144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
148 GFP_NOFS | __GFP_NOFAIL);
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
150 rv->rv_blkno = blkno;
151 lops_add(sdp, &rv->rv_le);
152}
153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
155{
156 struct gfs2_revoke *rv;
157 int found = 0;
158
159 gfs2_log_lock(sdp);
160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
162 if (rv->rv_blkno == blkno) {
163 list_del(&rv->rv_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--;
166 found = 1;
167 break;
168 }
169 }
170
171 gfs2_log_unlock(sdp);
172
173 if (found) {
174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv);
176 tr->tr_num_revoke_rm++;
177 }
178}
179
180void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
181{
182 lops_add(rgd->rd_sbd, &rgd->rd_le);
183}
184
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..23d4cbe1de5b
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#include <linux/buffer_head.h>
14struct gfs2_sbd;
15struct gfs2_rgrpd;
16struct gfs2_glock;
17
18#define RES_DINODE 1
19#define RES_INDIRECT 1
20#define RES_JDATA 1
21#define RES_DATA 1
22#define RES_LEAF 1
23#define RES_RG_BIT 2
24#define RES_EATTR 1
25#define RES_STATFS 1
26#define RES_QUOTA 2
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes);
30
31void gfs2_trans_end(struct gfs2_sbd *sdp);
32
33void gfs2_trans_add_gl(struct gfs2_glock *gl);
34void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
35void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno);
36void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
37void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
38
39#endif /* __TRANS_DOT_H__ */
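
A caller brackets journaled updates with gfs2_trans_begin() and gfs2_trans_end(), sizing the block reservation from the RES_ constants above and tagging each dirtied buffer with gfs2_trans_add_bh(). A minimal sketch (editor's illustration, not part of this patch; GFS2_SB() and the ip->i_gl / i_inode fields follow their use elsewhere in this series):

static int example_touch_dinode(struct gfs2_inode *ip,
				struct buffer_head *dibh)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);	/* metadata buffer */
	/* ... modify the on-disk dinode in dibh->b_data ... */

	gfs2_trans_end(sdp);
	return 0;
}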
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..196c604faadc
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/lm_interface.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "util.h"
25
26kmem_cache_t *gfs2_glock_cachep __read_mostly;
27kmem_cache_t *gfs2_inode_cachep __read_mostly;
28kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
29
30void gfs2_assert_i(struct gfs2_sbd *sdp)
31{
32 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
33 sdp->sd_fsname);
34}
35
36/**
37 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
38 * Returns: -1 if this call withdrew the machine,
39 * -2 if it was already withdrawn
40 */
41
42int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
43 const char *function, char *file, unsigned int line)
44{
45 int me;
46 me = gfs2_lm_withdraw(sdp,
47 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
48 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
49 sdp->sd_fsname, assertion,
50 sdp->sd_fsname, function, file, line);
51 dump_stack();
52 return (me) ? -1 : -2;
53}
54
55/**
56 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
57 * Returns: -1 if we printed something
58 * -2 if we didn't
59 */
60
61int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
62 const char *function, char *file, unsigned int line)
63{
64 if (time_before(jiffies,
65 sdp->sd_last_warning +
66 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
67 return -2;
68
69 printk(KERN_WARNING
70 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
71 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
72 sdp->sd_fsname, assertion,
73 sdp->sd_fsname, function, file, line);
74
75 if (sdp->sd_args.ar_debug)
76 BUG();
77 else
78 dump_stack();
79
80 sdp->sd_last_warning = jiffies;
81
82 return -1;
83}
84
85/**
86 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
87 * Returns: -1 if this call withdrew the machine,
88 * 0 if it was already withdrawn
89 */
90
91int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
92 char *file, unsigned int line)
93{
94 int rv;
95 rv = gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
97 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
98 sdp->sd_fsname,
99 sdp->sd_fsname, function, file, line);
100 return rv;
101}
102
103/**
104 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
105 * Returns: -1 if this call withdrew the machine,
106 * 0 if it was already withdrawn
107 */
108
109int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
110 const char *function, char *file, unsigned int line)
111{
112 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
113 int rv;
114 rv = gfs2_lm_withdraw(sdp,
115 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
116 "GFS2: fsid=%s: inode = %llu %llu\n"
117 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
118 sdp->sd_fsname,
119 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
120 (unsigned long long)ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 u16 type, u16 t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
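
A worked example of gfs2_icbit_munge()'s index arithmetic (editor's note, assuming PAGE_SIZE = 4096, i.e. 8 * PAGE_SIZE = 32768 bits per page): for bit = 40000, c = 40000 / 32768 = 1, leaving b = 7232; then o = 7232 / 8 = 904 and b = 7232 % 8 = 0, so the function flips bit 0 of bitmap[1][904].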
245
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..76a50899fe9e
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,170 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13#include "incore.h"
14
15#define fs_printk(level, fs, fmt, arg...) \
16 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
17
18#define fs_info(fs, fmt, arg...) \
19 fs_printk(KERN_INFO , fs , fmt , ## arg)
20
21#define fs_warn(fs, fmt, arg...) \
22 fs_printk(KERN_WARNING , fs , fmt , ## arg)
23
24#define fs_err(fs, fmt, arg...) \
25 fs_printk(KERN_ERR, fs , fmt , ## arg)
26
27
28void gfs2_assert_i(struct gfs2_sbd *sdp);
29
30#define gfs2_assert(sdp, assertion) \
31do { \
32 if (unlikely(!(assertion))) { \
33 gfs2_assert_i(sdp); \
34 BUG(); \
35 } \
36} while (0)
37
38
39int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
40 const char *function, char *file, unsigned int line);
41
42#define gfs2_assert_withdraw(sdp, assertion) \
43((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
44 __FUNCTION__, __FILE__, __LINE__))
45
46
47int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
48 const char *function, char *file, unsigned int line);
49
50#define gfs2_assert_warn(sdp, assertion) \
51((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
52 __FUNCTION__, __FILE__, __LINE__))
53
54
55int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
56 const char *function, char *file, unsigned int line);
57
58#define gfs2_consist(sdp) \
59gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
60
61
62int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
63 const char *function, char *file, unsigned int line);
64
65#define gfs2_consist_inode(ip) \
66gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
67
68
69int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
70 const char *function, char *file, unsigned int line);
71
72#define gfs2_consist_rgrpd(rgd) \
73gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
74
75
76int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
77 const char *type, const char *function,
78 char *file, unsigned int line);
79
80static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
81 struct buffer_head *bh,
82 const char *function,
83 char *file, unsigned int line)
84{
85 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
86 u32 magic = mh->mh_magic;
87 magic = be32_to_cpu(magic);
88 if (unlikely(magic != GFS2_MAGIC))
89 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
90 file, line);
91 return 0;
92}
93
94#define gfs2_meta_check(sdp, bh) \
95gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
96
97
98int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
99 u16 type, u16 t,
100 const char *function,
101 char *file, unsigned int line);
102
103static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
104 struct buffer_head *bh,
105 u16 type,
106 const char *function,
107 char *file, unsigned int line)
108{
109 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
110 u32 magic = mh->mh_magic;
111 u16 t = be32_to_cpu(mh->mh_type);
112 magic = be32_to_cpu(magic);
113 if (unlikely(magic != GFS2_MAGIC))
114 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
115 file, line);
116 if (unlikely(t != type))
117 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
118 file, line);
119 return 0;
120}
121
122#define gfs2_metatype_check(sdp, bh, type) \
123gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
124
125static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
126 u16 format)
127{
128 struct gfs2_meta_header *mh;
129 mh = (struct gfs2_meta_header *)bh->b_data;
130 mh->mh_type = cpu_to_be32(type);
131 mh->mh_format = cpu_to_be32(format);
132}
133
134
135int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
136 char *file, unsigned int line);
137
138#define gfs2_io_error(sdp) \
139gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)
140
141
142int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
143 const char *function, char *file, unsigned int line);
144
145#define gfs2_io_error_bh(sdp, bh) \
146gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
147
148
149extern kmem_cache_t *gfs2_glock_cachep;
150extern kmem_cache_t *gfs2_inode_cachep;
151extern kmem_cache_t *gfs2_bufdata_cachep;
152
153static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
154 unsigned int *p)
155{
156 unsigned int x;
157 spin_lock(&gt->gt_spin);
158 x = *p;
159 spin_unlock(&gt->gt_spin);
160 return x;
161}
162
163#define gfs2_tune_get(sdp, field) \
164gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
165
166void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
167 unsigned int bit, int new_value);
168
169#endif /* __UTIL_DOT_H__ */
170
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 7d564b6fc98f..ea005c0a79fd 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -46,6 +46,7 @@ header-y += coff.h
46header-y += comstats.h
47header-y += consolemap.h
48header-y += cycx_cfm.h
49header-y += dlm_device.h
50header-y += dm-ioctl.h
51header-y += dn.h
52header-y += dqblk_v1.h
@@ -104,6 +105,7 @@ header-y += ixjuser.h
105header-y += jffs2.h
106header-y += keyctl.h
107header-y += limits.h
108header-y += lock_dlm_plock.h
109header-y += magic.h
110header-y += major.h
111header-y += matroxfb.h
@@ -192,6 +194,7 @@ unifdef-y += cyclades.h
194unifdef-y += dccp.h
195unifdef-y += dirent.h
196unifdef-y += divert.h
197unifdef-y += dlm.h
198unifdef-y += elfcore.h
199unifdef-y += errno.h
200unifdef-y += errqueue.h
@@ -208,6 +211,7 @@ unifdef-y += ftape.h
211unifdef-y += gameport.h
212unifdef-y += generic_serial.h
213unifdef-y += genhd.h
214unifdef-y += gfs2_ondisk.h
215unifdef-y += hayesesp.h
216unifdef-y += hdlcdrv.h
217unifdef-y += hdlc.h
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644
index 000000000000..1b1dcb9a40bb
--- /dev/null
+++ b/include/linux/dlm.h
@@ -0,0 +1,302 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_DOT_H__
15#define __DLM_DOT_H__
16
17/*
18 * Interface to Distributed Lock Manager (DLM)
19 * routines and structures to use DLM lockspaces
20 */
21
22/*
23 * Lock Modes
24 */
25
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37
38#define DLM_RESNAME_MAXLEN 64
39
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been effected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
88 * DLM_LKF_NODLCKWT
89 * DLM_LKF_NODLCKBLK
90 *
91 * not yet implemented
92 *
93 * DLM_LKF_EXPEDITE
94 *
95 * Used only with new requests for NL mode locks. Tells the lock manager
96 * to grant the lock, ignoring other locks in convert and wait queues.
97 *
98 * DLM_LKF_NOQUEUEBAST
99 *
100 * Send blocking ASTs before returning -EAGAIN to the caller. It is only
101 * used along with the NOQUEUE flag. Blocking ASTs are not sent for failed
102 * NOQUEUE requests otherwise.
103 *
104 * DLM_LKF_HEADQUE
105 *
106 * Add a lock to the head of the convert or wait queue rather than the tail.
107 *
108 * DLM_LKF_NOORDER
109 *
110 * Disregard the standard grant order rules and grant a lock as soon as it
111 * is compatible with other granted locks.
112 *
113 * DLM_LKF_ORPHAN
114 *
115 * not yet implemented
116 *
117 * DLM_LKF_ALTPR
118 *
119 * If the requested mode cannot be granted immediately, try to grant the lock
120 * in PR mode instead. If this alternate mode is granted instead of the
121 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
122 *
123 * DLM_LKF_ALTCW
124 *
125 * The same as ALTPR, but the alternate mode is CW.
126 *
127 * DLM_LKF_FORCEUNLOCK
128 *
129 * Unlock the lock even if it is converting or waiting or has sublocks.
130 * Only really for use by the userland device.c code.
131 *
132 */
133
134#define DLM_LKF_NOQUEUE 0x00000001
135#define DLM_LKF_CANCEL 0x00000002
136#define DLM_LKF_CONVERT 0x00000004
137#define DLM_LKF_VALBLK 0x00000008
138#define DLM_LKF_QUECVT 0x00000010
139#define DLM_LKF_IVVALBLK 0x00000020
140#define DLM_LKF_CONVDEADLK 0x00000040
141#define DLM_LKF_PERSISTENT 0x00000080
142#define DLM_LKF_NODLCKWT 0x00000100
143#define DLM_LKF_NODLCKBLK 0x00000200
144#define DLM_LKF_EXPEDITE 0x00000400
145#define DLM_LKF_NOQUEUEBAST 0x00000800
146#define DLM_LKF_HEADQUE 0x00001000
147#define DLM_LKF_NOORDER 0x00002000
148#define DLM_LKF_ORPHAN 0x00004000
149#define DLM_LKF_ALTPR 0x00008000
150#define DLM_LKF_ALTCW 0x00010000
151#define DLM_LKF_FORCEUNLOCK 0x00020000
152
153/*
154 * Some return codes that are not in errno.h
155 */
156
157#define DLM_ECANCEL 0x10001
158#define DLM_EUNLOCK 0x10002
159
160typedef void dlm_lockspace_t;
161
162/*
163 * Lock status block
164 *
165 * Use this structure to specify the contents of the lock value block. For a
166 * conversion request, this structure is used to specify the lock ID of the
167 * lock. DLM writes the status of the lock request and the lock ID assigned
168 * to the request in the lock status block.
169 *
170 * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
171 * It is available when dlm_lock returns.
172 *
173 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
174 * shown for the DLM_LKF_VALBLK flag.
175 *
176 * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
177 * it was first demoted to NL to avoid conversion deadlock.
178 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
179 *
180 * sb_status: the returned status of the lock request set prior to AST
181 * execution. Possible return values:
182 *
183 * 0 if lock request was successful
184 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
185 * -ENOMEM if there is no memory to process request
186 * -EINVAL if there are invalid parameters
187 * -DLM_EUNLOCK if unlock request was successful
188 * -DLM_ECANCEL if a cancel completed successfully
189 */
190
191#define DLM_SBF_DEMOTED 0x01
192#define DLM_SBF_VALNOTVALID 0x02
193#define DLM_SBF_ALTMODE 0x04
194
195struct dlm_lksb {
196 int sb_status;
197 uint32_t sb_lkid;
198 char sb_flags;
199 char * sb_lvbptr;
200};
201
202
203#ifdef __KERNEL__
204
205#define DLM_LSFL_NODIR 0x00000001
206
207/*
208 * dlm_new_lockspace
209 *
210 * Starts a lockspace with the given name. If the named lockspace exists in
211 * the cluster, the calling node joins it.
212 */
213
214int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
215 uint32_t flags, int lvblen);
216
217/*
218 * dlm_release_lockspace
219 *
220 * Stop a lockspace.
221 */
222
223int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
224
225/*
226 * dlm_lock
227 *
228 * Make an asynchronous request to acquire or convert a lock on a named
229 * resource.
230 *
231 * lockspace: context for the request
232 * mode: the requested mode of the lock (DLM_LOCK_)
233 * lksb: lock status block for input and async return values
234 * flags: input flags (DLM_LKF_)
235 * name: name of the resource to lock, can be binary
236 * namelen: the length in bytes of the resource name (DLM_RESNAME_MAXLEN)
237 * parent: the lock ID of a parent lock or 0 if none
238 * lockast: function DLM executes when it completes processing the request
239 * astarg: argument passed to lockast and bast functions
240 * bast: function DLM executes when this lock later blocks another request
241 *
242 * Returns:
243 * 0 if request is successfully queued for processing
244 * -EINVAL if any input parameters are invalid
245 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
246 * -ENOMEM if there is no memory to process request
247 * -ENOTCONN if there is a communication error
248 *
249 * If the call to dlm_lock returns an error then the operation has failed and
250 * the AST routine will not be called. If dlm_lock returns 0 it is still
251 * possible that the lock operation will fail. The AST routine will be called
252 * when the locking is complete and the status is returned in the lksb.
253 *
254 * If the AST routines or parameters are passed to a conversion operation then
255 * they will overwrite those values that were passed to a previous dlm_lock
256 * call.
257 *
258 * AST routines should not block (at least not for long), but may make
259 * any locking calls they please.
260 */
261
262int dlm_lock(dlm_lockspace_t *lockspace,
263 int mode,
264 struct dlm_lksb *lksb,
265 uint32_t flags,
266 void *name,
267 unsigned int namelen,
268 uint32_t parent_lkid,
269 void (*lockast) (void *astarg),
270 void *astarg,
271 void (*bast) (void *astarg, int mode));
272
273/*
274 * dlm_unlock
275 *
276 * Asynchronously release a lock on a resource. The AST routine is called
277 * when the resource is successfully unlocked.
278 *
279 * lockspace: context for the request
280 * lkid: the lock ID as returned in the lksb
281 * flags: input flags (DLM_LKF_)
282 * lksb: if NULL the lksb parameter passed to last lock request is used
283 * astarg: the arg used with the completion ast for the unlock
284 *
285 * Returns:
286 * 0 if request is successfully queued for processing
287 * -EINVAL if any input parameters are invalid
288 * -ENOTEMPTY if the lock still has sublocks
289 * -EBUSY if the lock is waiting for a remote lock operation
290 * -ENOTCONN if there is a communication error
291 */
292
293int dlm_unlock(dlm_lockspace_t *lockspace,
294 uint32_t lkid,
295 uint32_t flags,
296 struct dlm_lksb *lksb,
297 void *astarg);
298
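Putting dlm_new_lockspace(), dlm_lock() and dlm_unlock() together, the asynchronous pattern described above looks roughly like this (editor's sketch, not part of this patch: the example_ names are invented, <linux/completion.h> is assumed, and the lvblen of 32 is an arbitrary choice):

static struct dlm_lksb example_lksb;

static void example_ast(void *astarg)
{
	complete((struct completion *)astarg);	/* status is now in the lksb */
}

static int example_lock_unlock(void)
{
	static char lsname[] = "example";
	static char resname[] = "example-resource";
	dlm_lockspace_t *ls;
	struct completion done;
	int error;

	error = dlm_new_lockspace(lsname, sizeof(lsname) - 1, &ls, 0, 32);
	if (error)
		return error;

	init_completion(&done);
	error = dlm_lock(ls, DLM_LOCK_EX, &example_lksb, DLM_LKF_NOQUEUE,
			 resname, sizeof(resname) - 1, 0,
			 example_ast, &done, NULL);
	if (error)
		goto out;
	wait_for_completion(&done);
	error = example_lksb.sb_status;	/* -EAGAIN if it would have blocked */
	if (error)
		goto out;

	init_completion(&done);
	error = dlm_unlock(ls, example_lksb.sb_lkid, 0, &example_lksb, &done);
	if (!error)
		wait_for_completion(&done); /* sb_status is -DLM_EUNLOCK */
out:
	dlm_release_lockspace(ls, 0);
	return error;
}
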
299#endif /* __KERNEL__ */
300
301#endif /* __DLM_DOT_H__ */
302
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644
index 000000000000..2a2dd189b9fd
--- /dev/null
+++ b/include/linux/dlm_device.h
@@ -0,0 +1,86 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/* This is the device interface for dlm; most users will use a library
15 * interface.
16 */
17
18#define DLM_USER_LVB_LEN 32
19
20/* Version of the device interface */
21#define DLM_DEVICE_VERSION_MAJOR 5
22#define DLM_DEVICE_VERSION_MINOR 0
23#define DLM_DEVICE_VERSION_PATCH 0
24
25/* struct passed to the lock write */
26struct dlm_lock_params {
27 __u8 mode;
28 __u8 namelen;
29 __u16 flags;
30 __u32 lkid;
31 __u32 parent;
32 void __user *castparam;
33 void __user *castaddr;
34 void __user *bastparam;
35 void __user *bastaddr;
36 struct dlm_lksb __user *lksb;
37 char lvb[DLM_USER_LVB_LEN];
38 char name[0];
39};
40
41struct dlm_lspace_params {
42 __u32 flags;
43 __u32 minor;
44 char name[0];
45};
46
47struct dlm_write_request {
48 __u32 version[3];
49 __u8 cmd;
50 __u8 is64bit;
51 __u8 unused[2];
52
53 union {
54 struct dlm_lock_params lock;
55 struct dlm_lspace_params lspace;
56 } i;
57};
58
59/* struct read from the "device" fd; it consists mainly of
60 userspace pointers for the library to use */
61struct dlm_lock_result {
62 __u32 length;
63 void __user * user_astaddr;
64 void __user * user_astparam;
65 struct dlm_lksb __user * user_lksb;
66 struct dlm_lksb lksb;
67 __u8 bast_mode;
68 __u8 unused[3];
69 /* Offsets may be zero if no data is present */
70 __u32 lvb_offset;
71};
72
73/* Commands passed to the device */
74#define DLM_USER_LOCK 1
75#define DLM_USER_UNLOCK 2
76#define DLM_USER_QUERY 3
77#define DLM_USER_CREATE_LOCKSPACE 4
78#define DLM_USER_REMOVE_LOCKSPACE 5
79
80/* Arbitrary length restriction */
81#define MAX_LS_NAME_LEN 64
82
83/* Lockspace flags */
84#define DLM_USER_LSFLG_AUTOFREE 1
85#define DLM_USER_LSFLG_FORCEFREE 2
86
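As a sketch of what such a library might do (editor's illustration, not part of this patch: how ls_fd is obtained is out of scope here, and the example_ name is invented), a lock request is marshalled as a struct dlm_write_request with the resource name in the trailing flexible array:

/* Userspace sketch of writing a DLM_USER_LOCK request. */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>

static int example_write_lock(int ls_fd, const char *res, int mode)
{
	size_t namelen = strlen(res);	/* must fit in the __u8 namelen */
	size_t size = sizeof(struct dlm_write_request) + namelen;
	struct dlm_write_request *req = calloc(1, size);
	ssize_t rv;

	if (!req)
		return -1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_LOCK;
	req->i.lock.mode = mode;
	req->i.lock.namelen = namelen;
	memcpy(req->i.lock.name, res, namelen);

	rv = write(ls_fd, req, size);	/* kernel checks version[] and cmd */
	free(req);
	return rv == (ssize_t)size ? 0 : -1;
}
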
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f53bf4ff1955..34406ed467c3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -250,6 +250,8 @@ extern int dir_notify_enable;
250#define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */
251#define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
252#define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
253#define FS_EXTENT_FL 0x00080000 /* Extents */
254#define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */
255#define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */
256
257#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
new file mode 100644
index 000000000000..a7ae7c177cac
--- /dev/null
+++ b/include/linux/gfs2_ondisk.h
@@ -0,0 +1,443 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_ONDISK_DOT_H__
11#define __GFS2_ONDISK_DOT_H__
12
13#define GFS2_MAGIC 0x01161970
14#define GFS2_BASIC_BLOCK 512
15#define GFS2_BASIC_BLOCK_SHIFT 9
16
17/* Lock numbers of the LM_TYPE_NONDISK type */
18
19#define GFS2_MOUNT_LOCK 0
20#define GFS2_LIVE_LOCK 1
21#define GFS2_TRANS_LOCK 2
22#define GFS2_RENAME_LOCK 3
23
24/* Format numbers for various metadata types */
25
26#define GFS2_FORMAT_NONE 0
27#define GFS2_FORMAT_SB 100
28#define GFS2_FORMAT_RG 200
29#define GFS2_FORMAT_RB 300
30#define GFS2_FORMAT_DI 400
31#define GFS2_FORMAT_IN 500
32#define GFS2_FORMAT_LF 600
33#define GFS2_FORMAT_JD 700
34#define GFS2_FORMAT_LH 800
35#define GFS2_FORMAT_LD 900
36#define GFS2_FORMAT_LB 1000
37#define GFS2_FORMAT_EA 1600
38#define GFS2_FORMAT_ED 1700
39#define GFS2_FORMAT_QC 1400
40/* These are format numbers for entities contained in files */
41#define GFS2_FORMAT_RI 1100
42#define GFS2_FORMAT_DE 1200
43#define GFS2_FORMAT_QU 1500
44/* These are part of the superblock */
45#define GFS2_FORMAT_FS 1801
46#define GFS2_FORMAT_MULTI 1900
47
48/*
49 * An on-disk inode number
50 */
51
52struct gfs2_inum {
53 __be64 no_formal_ino;
54 __be64 no_addr;
55};
56
57static inline int gfs2_inum_equal(const struct gfs2_inum *ino1,
58 const struct gfs2_inum *ino2)
59{
60 return ino1->no_formal_ino == ino2->no_formal_ino &&
61 ino1->no_addr == ino2->no_addr;
62}
63
64/*
65 * Generic metadata head structure
66 * Every in-place buffer logged in the journal must start with this.
67 */
68
69#define GFS2_METATYPE_NONE 0
70#define GFS2_METATYPE_SB 1
71#define GFS2_METATYPE_RG 2
72#define GFS2_METATYPE_RB 3
73#define GFS2_METATYPE_DI 4
74#define GFS2_METATYPE_IN 5
75#define GFS2_METATYPE_LF 6
76#define GFS2_METATYPE_JD 7
77#define GFS2_METATYPE_LH 8
78#define GFS2_METATYPE_LD 9
79#define GFS2_METATYPE_LB 12
80#define GFS2_METATYPE_EA 10
81#define GFS2_METATYPE_ED 11
82#define GFS2_METATYPE_QC 14
83
84struct gfs2_meta_header {
85 __be32 mh_magic;
86 __be32 mh_type;
87 __be64 __pad0; /* Was generation number in gfs1 */
88 __be32 mh_format;
89 __be32 __pad1; /* Was incarnation number in gfs1 */
90};
91
92/*
93 * super-block structure
94 *
95 * It's probably good if SIZEOF_SB <= GFS2_BASIC_BLOCK (512 bytes)
96 *
97 * Order is important; we need to be able to read old superblocks to do on-disk
98 * version upgrades.
99 */
100
101/* Address of superblock in GFS2 basic blocks */
102#define GFS2_SB_ADDR 128
103
104/* The lock number for the superblock (must be zero) */
105#define GFS2_SB_LOCK 0
106
107/* Requirement: GFS2_LOCKNAME_LEN % 8 == 0
108 Includes: the fencing zero at the end */
109#define GFS2_LOCKNAME_LEN 64
110
111struct gfs2_sb {
112 struct gfs2_meta_header sb_header;
113
114 __be32 sb_fs_format;
115 __be32 sb_multihost_format;
116 __u32 __pad0; /* Was superblock flags in gfs1 */
117
118 __be32 sb_bsize;
119 __be32 sb_bsize_shift;
120 __u32 __pad1; /* Was journal segment size in gfs1 */
121
122 struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
123 struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
124 struct gfs2_inum sb_root_dir;
125
126 char sb_lockproto[GFS2_LOCKNAME_LEN];
127 char sb_locktable[GFS2_LOCKNAME_LEN];
128 /* In gfs1, quota and license dinodes followed */
129};
130
131/*
132 * resource index structure
133 */
134
135struct gfs2_rindex {
136 __be64 ri_addr; /* grp block disk address */
137 __be32 ri_length; /* length of rgrp header in fs blocks */
138 __u32 __pad;
139
140 __be64 ri_data0; /* first data location */
141 __be32 ri_data; /* num of data blocks in rgrp */
142
143 __be32 ri_bitbytes; /* number of bytes in data bitmaps */
144
145 __u8 ri_reserved[64];
146};
147
148/*
149 * resource group header structure
150 */
151
152/* Number of blocks per byte in rgrp */
153#define GFS2_NBBY 4
154#define GFS2_BIT_SIZE 2
155#define GFS2_BIT_MASK 0x00000003
156
157#define GFS2_BLKST_FREE 0
158#define GFS2_BLKST_USED 1
159#define GFS2_BLKST_UNLINKED 2
160#define GFS2_BLKST_DINODE 3
161
162#define GFS2_RGF_JOURNAL 0x00000001
163#define GFS2_RGF_METAONLY 0x00000002
164#define GFS2_RGF_DATAONLY 0x00000004
165#define GFS2_RGF_NOALLOC 0x00000008
166
167struct gfs2_rgrp {
168 struct gfs2_meta_header rg_header;
169
170 __be32 rg_flags;
171 __be32 rg_free;
172 __be32 rg_dinodes;
173 __be32 __pad;
174 __be64 rg_igeneration;
175
176 __u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
177};
178
179/*
180 * quota structure
181 */
182
183struct gfs2_quota {
184 __be64 qu_limit;
185 __be64 qu_warn;
186 __be64 qu_value;
187 __u8 qu_reserved[64];
188};
189
190/*
191 * dinode structure
192 */
193
194#define GFS2_MAX_META_HEIGHT 10
195#define GFS2_DIR_MAX_DEPTH 17
196
197#define DT2IF(dt) (((dt) << 12) & S_IFMT)
198#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
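
These macros convert between directory-entry d_type values and the S_IFMT bits of di_mode. A quick arithmetic check (editor's note): DT2IF(DT_DIR) = 4 << 12 = 0040000 = S_IFDIR, and IF2DT(S_IFREG) = 0100000 >> 12 = 8 = DT_REG.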
199
200enum {
201 gfs2fl_Jdata = 0,
202 gfs2fl_ExHash = 1,
203 gfs2fl_Unused = 2,
204 gfs2fl_EaIndirect = 3,
205 gfs2fl_Directio = 4,
206 gfs2fl_Immutable = 5,
207 gfs2fl_AppendOnly = 6,
208 gfs2fl_NoAtime = 7,
209 gfs2fl_Sync = 8,
210 gfs2fl_System = 9,
211 gfs2fl_TruncInProg = 29,
212 gfs2fl_InheritDirectio = 30,
213 gfs2fl_InheritJdata = 31,
214};
215
216/* Dinode flags */
217#define GFS2_DIF_JDATA 0x00000001
218#define GFS2_DIF_EXHASH 0x00000002
219#define GFS2_DIF_UNUSED 0x00000004 /* only in gfs1 */
220#define GFS2_DIF_EA_INDIRECT 0x00000008
221#define GFS2_DIF_DIRECTIO 0x00000010
222#define GFS2_DIF_IMMUTABLE 0x00000020
223#define GFS2_DIF_APPENDONLY 0x00000040
224#define GFS2_DIF_NOATIME 0x00000080
225#define GFS2_DIF_SYNC 0x00000100
226#define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */
227#define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */
228#define GFS2_DIF_INHERIT_DIRECTIO 0x40000000
229#define GFS2_DIF_INHERIT_JDATA 0x80000000
230
231struct gfs2_dinode {
232 struct gfs2_meta_header di_header;
233
234 struct gfs2_inum di_num;
235
236 __be32 di_mode; /* mode of file */
237 __be32 di_uid; /* owner's user id */
238 __be32 di_gid; /* owner's group id */
239 __be32 di_nlink; /* number of links to this file */
240 __be64 di_size; /* number of bytes in file */
241 __be64 di_blocks; /* number of blocks in file */
242 __be64 di_atime; /* time last accessed */
243 __be64 di_mtime; /* time last modified */
244 __be64 di_ctime; /* time last changed */
245 __be32 di_major; /* device major number */
246 __be32 di_minor; /* device minor number */
247
248 /* This section varies from gfs1. Padding added to align with
249 * remainder of dinode
250 */
251 __be64 di_goal_meta; /* rgrp to alloc from next */
252 __be64 di_goal_data; /* data block goal */
253 __be64 di_generation; /* generation number for NFS */
254
255 __be32 di_flags; /* GFS2_DIF_... */
256 __be32 di_payload_format; /* GFS2_FORMAT_... */
257 __u16 __pad1; /* Was ditype in gfs1 */
258 __be16 di_height; /* height of metadata */
259 __u32 __pad2; /* Unused incarnation number from gfs1 */
260
261 /* These only apply to directories */
262 __u16 __pad3; /* Padding */
263 __be16 di_depth; /* Number of bits in the table */
264 __be32 di_entries; /* The number of entries in the directory */
265
266 struct gfs2_inum __pad4; /* Unused even in current gfs1 */
267
268 __be64 di_eattr; /* extended attribute block number */
269
270 __u8 di_reserved[56];
271};
272
273/*
274 * directory structure - many of these per directory file
275 */
276
277#define GFS2_FNAMESIZE 255
278#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
279
280struct gfs2_dirent {
281 struct gfs2_inum de_inum;
282 __be32 de_hash;
283 __be16 de_rec_len;
284 __be16 de_name_len;
285 __be16 de_type;
286 __u8 __pad[14];
287};
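
GFS2_DIRENT_SIZE gives the minimal on-disk record for a name: the fixed header plus the name, rounded up to an 8-byte boundary. A worked example, using the fact that the struct above is 40 bytes (16-byte de_inum, one __be32, three __be16s, 14 pad bytes):

	/* Record length for the 3-byte name "foo":
	 * (40 + 3 + 7) & ~7 == 48.  The on-disk de_rec_len may exceed this
	 * minimum when an entry has absorbed a deleted neighbour's space. */
	unsigned int rec_len = GFS2_DIRENT_SIZE(3);	/* == 48 */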
288
289/*
290 * Header of leaf directory nodes
291 */
292
293struct gfs2_leaf {
294 struct gfs2_meta_header lf_header;
295
296 __be16 lf_depth; /* Depth of leaf */
297 __be16 lf_entries; /* Number of dirents in leaf */
298 __be32 lf_dirent_format; /* Format of the dirents */
299 __be64 lf_next; /* Next leaf, if overflow */
300
301 __u8 lf_reserved[64];
302};
303
304/*
305 * Extended attribute header format
306 */
307
308#define GFS2_EA_MAX_NAME_LEN 255
309#define GFS2_EA_MAX_DATA_LEN 65536
310
311#define GFS2_EATYPE_UNUSED 0
312#define GFS2_EATYPE_USR 1
313#define GFS2_EATYPE_SYS 2
314#define GFS2_EATYPE_SECURITY 3
315
316#define GFS2_EATYPE_LAST 3
317#define GFS2_EATYPE_VALID(x) ((x) <= GFS2_EATYPE_LAST)
318
319#define GFS2_EAFLAG_LAST 0x01 /* last ea in block */
320
321struct gfs2_ea_header {
322 __be32 ea_rec_len;
323 __be32 ea_data_len;
324 __u8 ea_name_len; /* name is not NUL-terminated */
325 __u8 ea_type; /* GFS2_EATYPE_... */
326 __u8 ea_flags; /* GFS2_EAFLAG_... */
327 __u8 ea_num_ptrs;
328 __u32 __pad;
329};
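
Within an EA block the headers form a chain: ea_rec_len (big-endian on disk) is the offset to the next header, and GFS2_EAFLAG_LAST marks the final one. An illustrative traversal, not the in-tree eattr.c code (assumes be32_to_cpu from asm/byteorder.h):

	/* Sketch: visit each EA header in one block, starting just past
	 * the block's meta header. */
	static void ea_walk(struct gfs2_ea_header *ea)
	{
		for (;;) {
			if (ea->ea_type != GFS2_EATYPE_UNUSED) {
				/* ea_name_len bytes of name follow the
				 * header; inline data or ea_num_ptrs block
				 * pointers follow the name. */
			}
			if (ea->ea_flags & GFS2_EAFLAG_LAST)
				break;
			ea = (struct gfs2_ea_header *)((__u8 *)ea +
					be32_to_cpu(ea->ea_rec_len));
		}
	}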
330
331/*
332 * Log header structure
333 */
334
335#define GFS2_LOG_HEAD_UNMOUNT 0x00000001 /* log is clean */
336
337struct gfs2_log_header {
338 struct gfs2_meta_header lh_header;
339
340 __be64 lh_sequence; /* Sequence number of this transaction */
341 __be32 lh_flags; /* GFS2_LOG_HEAD_... */
342 __be32 lh_tail; /* Block number of log tail */
343 __be32 lh_blkno; /* Block number of this header within the journal */
344 __be32 lh_hash; /* crc32 of the log header, excluding this field */
345};
346
347/*
348 * Log type descriptor
349 */
350
351#define GFS2_LOG_DESC_METADATA 300
352/* ld_data1 is the number of metadata blocks in the descriptor.
353 ld_data2 is unused. */
354
355#define GFS2_LOG_DESC_REVOKE 301
356/* ld_data1 is the number of revoke blocks in the descriptor.
357 ld_data2 is unused. */
358
359#define GFS2_LOG_DESC_JDATA 302
360/* ld_data1 is the number of data blocks in the descriptor.
361 ld_data2 is unused. */
362
363struct gfs2_log_descriptor {
364 struct gfs2_meta_header ld_header;
365
366 __be32 ld_type; /* GFS2_LOG_DESC_... */
367 __be32 ld_length; /* Number of buffers in this chunk */
368 __be32 ld_data1; /* descriptor-specific field */
369 __be32 ld_data2; /* descriptor-specific field */
370
371 __u8 ld_reserved[32];
372};
373
374/*
375 * Inum Range
376 * Describes a range of formal inode numbers allocated to
377 * one machine for assignment to its inodes.
378 */
379
380#define GFS2_INUM_QUANTUM 1048576
381
382struct gfs2_inum_range {
383 __be64 ir_start;
384 __be64 ir_length;
385};
386
387/*
388 * Statfs change
389 * Describes a change to the pool of free and allocated
390 * blocks.
391 */
392
393struct gfs2_statfs_change {
394 __be64 sc_total;
395 __be64 sc_free;
396 __be64 sc_dinodes;
397};
398
399/*
400 * Quota change
401 * Describes an allocation change for a particular
402 * user or group.
403 */
404
405#define GFS2_QCF_USER 0x00000001
406
407struct gfs2_quota_change {
408 __be64 qc_change;
409 __be32 qc_flags; /* GFS2_QCF_... */
410 __be32 qc_id;
411};
412
413#ifdef __KERNEL__
414/* Translation functions */
415
416extern void gfs2_inum_in(struct gfs2_inum *no, const void *buf);
417extern void gfs2_inum_out(const struct gfs2_inum *no, void *buf);
418extern void gfs2_sb_in(struct gfs2_sb *sb, const void *buf);
419extern void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf);
420extern void gfs2_rindex_out(const struct gfs2_rindex *ri, void *buf);
421extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf);
422extern void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf);
423extern void gfs2_quota_in(struct gfs2_quota *qu, const void *buf);
424extern void gfs2_quota_out(const struct gfs2_quota *qu, void *buf);
425extern void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf);
426extern void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf);
427extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, const void *buf);
428extern void gfs2_ea_header_out(const struct gfs2_ea_header *ea, void *buf);
429extern void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf);
430extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf);
431extern void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf);
432extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf);
433extern void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf);
434extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf);
435
436/* Printing functions */
437
438extern void gfs2_rindex_print(const struct gfs2_rindex *ri);
439extern void gfs2_dinode_print(const struct gfs2_dinode *di);
440
441#endif /* __KERNEL__ */
442
443#endif /* __GFS2_ONDISK_DOT_H__ */
diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h
new file mode 100644
index 000000000000..1418fdc9ac02
--- /dev/null
+++ b/include/linux/lm_interface.h
@@ -0,0 +1,273 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LM_INTERFACE_DOT_H__
11#define __LM_INTERFACE_DOT_H__
12
13
14typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data);
15
16/*
17 * lm_mount() flags
18 *
19 * LM_MFLAG_SPECTATOR
20 * GFS is asking to join the filesystem's lockspace, but it doesn't want to
21 * modify the filesystem. The lock module shouldn't assign a journal to the FS
22 * mount. It shouldn't send recovery callbacks to the FS mount. If the node
23 * dies or withdraws, all locks can be wiped immediately.
24 */
25
26#define LM_MFLAG_SPECTATOR 0x00000001
27
28/*
29 * lm_lockstruct flags
30 *
31 * LM_LSFLAG_LOCAL
32 * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS
33 * can make single-node optimizations.
34 */
35
36#define LM_LSFLAG_LOCAL 0x00000001
37
38/*
39 * lm_lockname types
40 */
41
42#define LM_TYPE_RESERVED 0x00
43#define LM_TYPE_NONDISK 0x01
44#define LM_TYPE_INODE 0x02
45#define LM_TYPE_RGRP 0x03
46#define LM_TYPE_META 0x04
47#define LM_TYPE_IOPEN 0x05
48#define LM_TYPE_FLOCK 0x06
49#define LM_TYPE_PLOCK 0x07
50#define LM_TYPE_QUOTA 0x08
51#define LM_TYPE_JOURNAL 0x09
52
53/*
54 * lm_lock() states
55 *
56 * SHARED is compatible with SHARED, not with DEFERRED or EX.
57 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
58 */
59
60#define LM_ST_UNLOCKED 0
61#define LM_ST_EXCLUSIVE 1
62#define LM_ST_DEFERRED 2
63#define LM_ST_SHARED 3
64
65/*
66 * lm_lock() flags
67 *
68 * LM_FLAG_TRY
69 * Don't wait to acquire the lock if it can't be granted immediately.
70 *
71 * LM_FLAG_TRY_1CB
72 * Send one blocking callback if TRY is set and the lock is not granted.
73 *
74 * LM_FLAG_NOEXP
75 * GFS sets this flag on lock requests it makes while doing journal recovery.
76 * These special requests should not be blocked by recovery the way
77 * ordinary lock requests would be.
78 *
79 * LM_FLAG_ANY
80 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
81 * also be granted in SHARED. The preferred state is whichever is compatible
82 * with other granted locks, or the specified state if no other locks exist.
83 *
84 * LM_FLAG_PRIORITY
85 * Override fairness considerations. Suppose a lock is held in a shared state
86 * and there is a pending request for the deferred state. A shared lock
87 * request with the priority flag would be allowed to bypass the deferred
88 * request and directly join the other shared lock. A shared lock request
89 * without the priority flag might be forced to wait until the deferred
90 * request had acquired and released the lock.
91 */
92
93#define LM_FLAG_TRY 0x00000001
94#define LM_FLAG_TRY_1CB 0x00000002
95#define LM_FLAG_NOEXP 0x00000004
96#define LM_FLAG_ANY 0x00000008
97#define LM_FLAG_PRIORITY 0x00000010
98
99/*
100 * lm_lock() and lm_async_cb return flags
101 *
102 * LM_OUT_ST_MASK
103 * Masks the lower two bits of lock state in the returned value.
104 *
105 * LM_OUT_CACHEABLE
106 * The lock hasn't been released so GFS can continue to cache data for it.
107 *
108 * LM_OUT_CANCELED
109 * The lock request was canceled.
110 *
111 * LM_OUT_ASYNC
112 * The result of the request will be returned in an LM_CB_ASYNC callback.
113 */
114
115#define LM_OUT_ST_MASK 0x00000003
116#define LM_OUT_CACHEABLE 0x00000004
117#define LM_OUT_CANCELED 0x00000008
118#define LM_OUT_ASYNC 0x00000080
119#define LM_OUT_ERROR 0x00000100
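
Callers split the packed return value with LM_OUT_ST_MASK. A hedged sketch of the decode, where ls, lock, cur_state, req_state and flags all stand in for surrounding context (lm_lock's actual prototype appears in lm_lockops below):

	/* Sketch: decode lm_lock()'s packed return value. */
	unsigned int ret = ls->ls_ops->lm_lock(lock, cur_state,
					       req_state, flags);
	unsigned int granted = ret & LM_OUT_ST_MASK;	/* one of LM_ST_... */

	if (ret & LM_OUT_ASYNC) {
		/* 'granted' is not final: the real result arrives later in
		 * an LM_CB_ASYNC callback, via lm_async_cb.lc_ret. */
	}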
120
121/*
122 * lm_callback_t types
123 *
124 * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S
125 * Blocking callback, a remote node is requesting the given lock in
126 * EXCLUSIVE, DEFERRED, or SHARED.
127 *
128 * LM_CB_NEED_RECOVERY
129 * The given journal needs to be recovered.
130 *
131 * LM_CB_DROPLOCKS
132 * Reduce the number of cached locks.
133 *
134 * LM_CB_ASYNC
135 * The given lock has been granted.
136 */
137
138#define LM_CB_NEED_E 257
139#define LM_CB_NEED_D 258
140#define LM_CB_NEED_S 259
141#define LM_CB_NEED_RECOVERY 260
142#define LM_CB_DROPLOCKS 261
143#define LM_CB_ASYNC 262
144
145/*
146 * lm_recovery_done() messages
147 */
148
149#define LM_RD_GAVEUP 308
150#define LM_RD_SUCCESS 309
151
152
153struct lm_lockname {
154 u64 ln_number;
155 unsigned int ln_type;
156};
157
158#define lm_name_equal(name1, name2) \
159 (((name1)->ln_number == (name2)->ln_number) && \
160 ((name1)->ln_type == (name2)->ln_type))
161
162struct lm_async_cb {
163 struct lm_lockname lc_name;
164 int lc_ret;
165};
166
167struct lm_lockstruct;
168
169struct lm_lockops {
170 const char *lm_proto_name;
171
172 /*
173 * Mount/Unmount
174 */
175
176 int (*lm_mount) (char *table_name, char *host_data,
177 lm_callback_t cb, void *cb_data,
178 unsigned int min_lvb_size, int flags,
179 struct lm_lockstruct *lockstruct,
180 struct kobject *fskobj);
181
182 void (*lm_others_may_mount) (void *lockspace);
183
184 void (*lm_unmount) (void *lockspace);
185
186 void (*lm_withdraw) (void *lockspace);
187
188 /*
189 * Lock oriented operations
190 */
191
192 int (*lm_get_lock) (void *lockspace, struct lm_lockname *name, void **lockp);
193
194 void (*lm_put_lock) (void *lock);
195
196 unsigned int (*lm_lock) (void *lock, unsigned int cur_state,
197 unsigned int req_state, unsigned int flags);
198
199 unsigned int (*lm_unlock) (void *lock, unsigned int cur_state);
200
201 void (*lm_cancel) (void *lock);
202
203 int (*lm_hold_lvb) (void *lock, char **lvbp);
204 void (*lm_unhold_lvb) (void *lock, char *lvb);
205
206 /*
207 * Posix Lock oriented operations
208 */
209
210 int (*lm_plock_get) (void *lockspace, struct lm_lockname *name,
211 struct file *file, struct file_lock *fl);
212
213 int (*lm_plock) (void *lockspace, struct lm_lockname *name,
214 struct file *file, int cmd, struct file_lock *fl);
215
216 int (*lm_punlock) (void *lockspace, struct lm_lockname *name,
217 struct file *file, struct file_lock *fl);
218
219 /*
220 * Client oriented operations
221 */
222
223 void (*lm_recovery_done) (void *lockspace, unsigned int jid,
224 unsigned int message);
225
226 struct module *lm_owner;
227};
228
229/*
230 * lm_mount() return values
231 *
232 * ls_jid - the journal ID this node should use
233 * ls_first - this node is the first to mount the file system
234 * ls_lvb_size - size in bytes of lock value blocks
235 * ls_lockspace - lock module's context for this file system
236 * ls_ops - lock module's functions
237 * ls_flags - lock module features
238 */
239
240struct lm_lockstruct {
241 unsigned int ls_jid;
242 unsigned int ls_first;
243 unsigned int ls_lvb_size;
244 void *ls_lockspace;
245 const struct lm_lockops *ls_ops;
246 int ls_flags;
247};
248
249/*
250 * Lock module bottom interface. A lock module makes itself available to GFS
251 * with these functions.
252 */
253
254int gfs2_register_lockproto(const struct lm_lockops *proto);
255void gfs2_unregister_lockproto(const struct lm_lockops *proto);
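
A lock module fills in an lm_lockops table and registers it at module init, much as lock_nolock and lock_dlm do. A skeletal sketch with invented names (mymod_ops, "lock_mymod") and the operation bodies omitted (assumes linux/module.h):

	static const struct lm_lockops mymod_ops = {
		.lm_proto_name = "lock_mymod",	/* hypothetical protocol */
		/* .lm_mount, .lm_lock, ... filled in by a real module */
		.lm_owner = THIS_MODULE,
	};

	static int __init mymod_init(void)
	{
		/* makes "lock_mymod" selectable as a mount lockproto */
		return gfs2_register_lockproto(&mymod_ops);
	}

	static void __exit mymod_exit(void)
	{
		gfs2_unregister_lockproto(&mymod_ops);
	}

	module_init(mymod_init);
	module_exit(mymod_exit);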
256
257/*
258 * Lock module top interface. GFS calls these functions when mounting or
259 * unmounting a file system.
260 */
261
262int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
263 lm_callback_t cb, void *cb_data,
264 unsigned int min_lvb_size, int flags,
265 struct lm_lockstruct *lockstruct,
266 struct kobject *fskobj);
267
268void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct);
269
270void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct);
271
272#endif /* __LM_INTERFACE_DOT_H__ */
273
diff --git a/include/linux/lock_dlm_plock.h b/include/linux/lock_dlm_plock.h
new file mode 100644
index 000000000000..fc3415113973
--- /dev/null
+++ b/include/linux/lock_dlm_plock.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __LOCK_DLM_PLOCK_DOT_H__
10#define __LOCK_DLM_PLOCK_DOT_H__
11
12#define GDLM_PLOCK_MISC_NAME "lock_dlm_plock"
13
14#define GDLM_PLOCK_VERSION_MAJOR 1
15#define GDLM_PLOCK_VERSION_MINOR 1
16#define GDLM_PLOCK_VERSION_PATCH 0
17
18enum {
19 GDLM_PLOCK_OP_LOCK = 1,
20 GDLM_PLOCK_OP_UNLOCK,
21 GDLM_PLOCK_OP_GET,
22};
23
24struct gdlm_plock_info {
25 __u32 version[3];
26 __u8 optype;
27 __u8 ex;
28 __u8 wait;
29 __u8 pad;
30 __u32 pid;
31 __s32 nodeid;
32 __s32 rv;
33 __u32 fsid;
34 __u64 number;
35 __u64 start;
36 __u64 end;
37 __u64 owner;
38};
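
Each request to the user-space plock service travels as one gdlm_plock_info. A hedged sketch of populating a blocking, whole-file write-lock request; fsid and ino are placeholders, and the field usage is inferred from the names rather than taken from the in-tree lock_dlm code (assumes linux/fs.h for OFFSET_MAX and linux/sched.h for current):

	struct gdlm_plock_info info = {
		.version = { GDLM_PLOCK_VERSION_MAJOR,
			     GDLM_PLOCK_VERSION_MINOR,
			     GDLM_PLOCK_VERSION_PATCH },
		.optype	 = GDLM_PLOCK_OP_LOCK,
		.ex	 = 1,			/* exclusive (write) lock */
		.wait	 = 1,			/* block until granted */
		.pid	 = current->pid,	/* requesting process */
		.fsid	 = fsid,		/* placeholder: filesystem id */
		.number	 = ino,			/* placeholder: inode number */
		.start	 = 0,
		.end	 = OFFSET_MAX,		/* whole-file byte range */
	};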
39
40#endif
41
diff --git a/mm/filemap.c b/mm/filemap.c
index f789500406fe..3464b681f844 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1198,8 +1198,10 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (retval > 0)
 			*ppos = pos + retval;
 		}
-		file_accessed(filp);
-		goto out;
+		if (likely(retval != 0)) {
+			file_accessed(filp);
+			goto out;
+		}
 	}
 
 	retval = 0;
diff --git a/mm/readahead.c b/mm/readahead.c
index aa7ec424656a..1ba736ac0367 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
 	ra->prev_page = -1;
 }
+EXPORT_SYMBOL_GPL(file_ra_state_init);
 
 /*
  * Return max readahead size for this inode in number-of-pages.