-rw-r--r--  CREDITS | 6
-rw-r--r--  Documentation/filesystems/gfs2.txt | 43
-rw-r--r--  Documentation/ioctl-number.txt | 1
-rw-r--r--  MAINTAINERS | 18
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/configfs/item.c | 2
-rw-r--r--  fs/dlm/Kconfig | 29
-rw-r--r--  fs/dlm/Makefile | 21
-rw-r--r--  fs/dlm/ast.c | 167
-rw-r--r--  fs/dlm/ast.h | 26
-rw-r--r--  fs/dlm/config.c | 789
-rw-r--r--  fs/dlm/config.h | 42
-rw-r--r--  fs/dlm/debug_fs.c | 296
-rw-r--r--  fs/dlm/device.c | 1091
-rw-r--r--  fs/dlm/dir.c | 423
-rw-r--r--  fs/dlm/dir.h | 30
-rw-r--r--  fs/dlm/dlm_internal.h | 494
-rw-r--r--  fs/dlm/lock.c | 3547
-rw-r--r--  fs/dlm/lock.h | 50
-rw-r--r--  fs/dlm/lockspace.c | 678
-rw-r--r--  fs/dlm/lockspace.h | 24
-rw-r--r--  fs/dlm/lowcomms.c | 1239
-rw-r--r--  fs/dlm/lowcomms.h | 26
-rw-r--r--  fs/dlm/lvb_table.h | 18
-rw-r--r--  fs/dlm/main.c | 89
-rw-r--r--  fs/dlm/member.c | 312
-rw-r--r--  fs/dlm/member.h | 24
-rw-r--r--  fs/dlm/memory.c | 106
-rw-r--r--  fs/dlm/memory.h | 29
-rw-r--r--  fs/dlm/midcomms.c | 140
-rw-r--r--  fs/dlm/midcomms.h | 21
-rw-r--r--  fs/dlm/rcom.c | 457
-rw-r--r--  fs/dlm/rcom.h | 24
-rw-r--r--  fs/dlm/recover.c | 762
-rw-r--r--  fs/dlm/recover.h | 34
-rw-r--r--  fs/dlm/recoverd.c | 285
-rw-r--r--  fs/dlm/recoverd.h | 24
-rw-r--r--  fs/dlm/requestqueue.c | 184
-rw-r--r--  fs/dlm/requestqueue.h | 22
-rw-r--r--  fs/dlm/util.c | 161
-rw-r--r--  fs/dlm/util.h | 22
-rw-r--r--  fs/gfs2/Kconfig | 44
-rw-r--r--  fs/gfs2/Makefile | 10
-rw-r--r--  fs/gfs2/acl.c | 315
-rw-r--r--  fs/gfs2/acl.h | 37
-rw-r--r--  fs/gfs2/bmap.c | 1103
-rw-r--r--  fs/gfs2/bmap.h | 32
-rw-r--r--  fs/gfs2/daemon.c | 223
-rw-r--r--  fs/gfs2/daemon.h | 20
-rw-r--r--  fs/gfs2/dir.c | 1974
-rw-r--r--  fs/gfs2/dir.h | 73
-rw-r--r--  fs/gfs2/eaops.c | 229
-rw-r--r--  fs/gfs2/eaops.h | 31
-rw-r--r--  fs/gfs2/eattr.c | 1549
-rw-r--r--  fs/gfs2/eattr.h | 97
-rw-r--r--  fs/gfs2/format.h | 21
-rw-r--r--  fs/gfs2/gfs2.h | 31
-rw-r--r--  fs/gfs2/glock.c | 2340
-rw-r--r--  fs/gfs2/glock.h | 155
-rw-r--r--  fs/gfs2/glops.c | 491
-rw-r--r--  fs/gfs2/glops.h | 23
-rw-r--r--  fs/gfs2/incore.h | 687
-rw-r--r--  fs/gfs2/inode.c | 1820
-rw-r--r--  fs/gfs2/inode.h | 72
-rw-r--r--  fs/gfs2/lm.c | 244
-rw-r--r--  fs/gfs2/lm.h | 41
-rw-r--r--  fs/gfs2/lm_interface.h | 295
-rw-r--r--  fs/gfs2/locking.c | 191
-rw-r--r--  fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 541
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 188
-rw-r--r--  fs/gfs2/locking/dlm/main.c | 64
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 256
-rw-r--r--  fs/gfs2/locking/dlm/plock.c | 299
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 225
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 352
-rw-r--r--  fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r--  fs/gfs2/locking/nolock/main.c | 259
-rw-r--r--  fs/gfs2/log.c | 598
-rw-r--r--  fs/gfs2/log.h | 61
-rw-r--r--  fs/gfs2/lops.c | 804
-rw-r--r--  fs/gfs2/lops.h | 96
-rw-r--r--  fs/gfs2/lvb.c | 45
-rw-r--r--  fs/gfs2/lvb.h | 19
-rw-r--r--  fs/gfs2/main.c | 129
-rw-r--r--  fs/gfs2/meta_io.c | 892
-rw-r--r--  fs/gfs2/meta_io.h | 89
-rw-r--r--  fs/gfs2/mount.c | 214
-rw-r--r--  fs/gfs2/mount.h | 15
-rw-r--r--  fs/gfs2/ondisk.c | 321
-rw-r--r--  fs/gfs2/ops_address.c | 670
-rw-r--r--  fs/gfs2/ops_address.h | 17
-rw-r--r--  fs/gfs2/ops_dentry.c | 123
-rw-r--r--  fs/gfs2/ops_dentry.h | 15
-rw-r--r--  fs/gfs2/ops_export.c | 297
-rw-r--r--  fs/gfs2/ops_export.h | 15
-rw-r--r--  fs/gfs2/ops_file.c | 1000
-rw-r--r--  fs/gfs2/ops_file.h | 20
-rw-r--r--  fs/gfs2/ops_fstype.c | 901
-rw-r--r--  fs/gfs2/ops_fstype.h | 16
-rw-r--r--  fs/gfs2/ops_inode.c | 1194
-rw-r--r--  fs/gfs2/ops_inode.h | 18
-rw-r--r--  fs/gfs2/ops_super.c | 399
-rw-r--r--  fs/gfs2/ops_super.h | 15
-rw-r--r--  fs/gfs2/ops_vm.c | 195
-rw-r--r--  fs/gfs2/ops_vm.h | 16
-rw-r--r--  fs/gfs2/page.c | 280
-rw-r--r--  fs/gfs2/page.h | 23
-rw-r--r--  fs/gfs2/quota.c | 1305
-rw-r--r--  fs/gfs2/quota.h | 32
-rw-r--r--  fs/gfs2/recovery.c | 576
-rw-r--r--  fs/gfs2/recovery.h | 32
-rw-r--r--  fs/gfs2/rgrp.c | 1524
-rw-r--r--  fs/gfs2/rgrp.h | 62
-rw-r--r--  fs/gfs2/super.c | 945
-rw-r--r--  fs/gfs2/super.h | 52
-rw-r--r--  fs/gfs2/sys.c | 581
-rw-r--r--  fs/gfs2/sys.h | 24
-rw-r--r--  fs/gfs2/trans.c | 184
-rw-r--r--  fs/gfs2/trans.h | 35
-rw-r--r--  fs/gfs2/unlinked.c | 459
-rw-r--r--  fs/gfs2/unlinked.h | 25
-rw-r--r--  fs/gfs2/util.c | 245
-rw-r--r--  fs/gfs2/util.h | 169
-rw-r--r--  include/linux/dlm.h | 302
-rw-r--r--  include/linux/dlm_device.h | 83
-rw-r--r--  include/linux/fs.h | 3
-rw-r--r--  include/linux/gfs2_ondisk.h | 459
-rw-r--r--  include/linux/iflags.h | 102
-rw-r--r--  include/linux/kernel.h | 1
-rw-r--r--  include/linux/lock_dlm_plock.h | 40
-rw-r--r--  kernel/printk.c | 1
-rw-r--r--  mm/filemap.c | 1
-rw-r--r--  mm/readahead.c | 1
135 files changed, 41834 insertions, 4 deletions
diff --git a/CREDITS b/CREDITS
index 9bf714a1c7d9..3ebb93b2f7c8 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3540,11 +3540,11 @@ S: Fargo, North Dakota 58122
 S: USA
 
 N: Steven Whitehouse
-E: SteveW@ACM.org
+E: steve@chygwyn.com
 W: http://www.chygwyn.com/~steve
-D: Linux DECnet project: http://www.sucs.swan.ac.uk/~rohan/DECnet/index.html
+D: Linux DECnet project
 D: Minor debugging of other networking protocols.
-D: Misc bug fixes and filesystem development
+D: Misc bug fixes and GFS2 filesystem development
 
 N: Hans-Joachim Widmaier
 E: hjw@zvw.de
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 000000000000..593004b6bbab
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,43 @@
Global File System
------------------

http://sources.redhat.com/cluster/

GFS is a cluster file system.  It allows a cluster of computers to
simultaneously use a block device that is shared between them (with FC,
iSCSI, NBD, etc).  GFS reads and writes to the block device like a local
file system, but also uses a lock module to allow the computers to
coordinate their I/O so file system consistency is maintained.  One of
the nifty features of GFS is perfect consistency -- changes made to the
file system on one machine show up immediately on all other machines in
the cluster.

GFS uses interchangeable inter-node locking mechanisms.  Different lock
modules can plug into GFS and each file system selects the appropriate
lock module at mount time.  Lock modules include:

  lock_nolock -- allows gfs to be used as a local file system

  lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
  The dlm is found at linux/fs/dlm/

In addition to interfacing with an external locking manager, a gfs lock
module is responsible for interacting with external cluster management
systems.  Lock_dlm depends on user space cluster management systems found
at the URL above.

To use gfs as a local file system, no external clustering systems are
needed; simply:

  $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
  $ mount -t gfs2 /dev/block_device /dir

GFS2 is not on-disk compatible with previous versions of GFS.

The following man pages can be found at the URL above:
  gfs2_fsck	to repair a filesystem
  gfs2_grow	to expand a filesystem online
  gfs2_jadd	to add journals to a filesystem online
  gfs2_tool	to manipulate, examine and tune a filesystem
  gfs2_quota	to examine and change quota values in a filesystem
  mount.gfs2	to help mount(8) mount a filesystem
  mkfs.gfs2	to make a filesystem
diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt
index 171a44ebd939..93a86ac23cdd 100644
--- a/Documentation/ioctl-number.txt
+++ b/Documentation/ioctl-number.txt
@@ -124,6 +124,7 @@ Code Seq# Include File Comments
 'e'	00-1F	linux/video_encoder.h	conflict!
 'e'	00-1F	net/irda/irtty.h	conflict!
 'f'	00-1F	linux/ext2_fs.h
+'g'	00-1F	linux/gfs2_ioctl.h
 'h'	00-7F	Charon filesystem
 		<mailto:zapman@interlan.net>
 'i'	00-3F	linux/i2o.h
diff --git a/MAINTAINERS b/MAINTAINERS
index bd10b2af2223..328da34a7de2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -828,6 +828,16 @@ M: jack@suse.cz
828L: linux-kernel@vger.kernel.org 828L: linux-kernel@vger.kernel.org
829S: Maintained 829S: Maintained
830 830
831DISTRIBUTED LOCK MANAGER
832P: Patrick Caulfield
833M: pcaulfie@redhat.com
834P: David Teigland
835M: teigland@redhat.com
836L: linux-cluster@redhat.com
837W: http://sources.redhat.com/cluster/
838T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
839S: Supported
840
831DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER 841DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER
832P: Tobias Ringstrom 842P: Tobias Ringstrom
833M: tori@unhappy.mine.nu 843M: tori@unhappy.mine.nu
@@ -1078,6 +1088,14 @@ M: khc@pm.waw.pl
 W: http://www.kernel.org/pub/linux/utils/net/hdlc/
 S: Maintained
 
+GFS2 FILE SYSTEM
+P: Steven Whitehouse
+M: swhiteho@redhat.com
+L: linux-cluster@redhat.com
+W: http://sources.redhat.com/cluster/
+T: git kernel.org:/pub/scm/linux/kernel/git/steve/gfs-2.6.git
+S: Supported
+
 GIGASET ISDN DRIVERS
 P: Hansjoerg Lipp
 M: hjlipp@web.de
diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2d..563a59e5e694 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -323,6 +323,7 @@ config FS_POSIX_ACL
 	default n
 
 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
 
 config OCFS2_FS
 	tristate "OCFS2 file system support (EXPERIMENTAL)"
@@ -1824,6 +1825,7 @@ source "fs/partitions/Kconfig"
 endmenu
 
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 
 endmenu
 
diff --git a/fs/Makefile b/fs/Makefile
index 078d3d1191a5..c731d2c0f409 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
@@ -102,3 +103,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..09e78bf6e7a4
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,29 @@
1menu "Distributed Lock Manager"
2 depends on INET && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on IPV6 || IPV6=n
7 depends on IP_SCTP
8 select CONFIGFS_FS
9 help
10 A general purpose distributed lock manager for kernel or userspace
11 applications.
12
13config DLM_DEVICE
14 tristate "DLM device for userspace access"
15 depends on DLM
16 help
17 This module creates a misc device through which the dlm lockspace
18 and locking functions become available to userspace applications
19 (usually through the libdlm library).
20
21config DLM_DEBUG
22 bool "DLM debugging"
23 depends on DLM
24 help
25 Under the debugfs mount point, the name of each lockspace will
26 appear as a file in the "dlm" directory. The output is the
27 list of resource and locks the local node knows about.
28
29endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1e6232e7d8e5
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,21 @@
obj-$(CONFIG_DLM) +=		dlm.o
obj-$(CONFIG_DLM_DEVICE) +=	dlm_device.o

dlm-y :=			ast.o \
				config.o \
				dir.o \
				lock.o \
				lockspace.o \
				lowcomms.o \
				main.o \
				member.o \
				memory.o \
				midcomms.o \
				rcom.o \
				recover.o \
				recoverd.o \
				requestqueue.o \
				util.o
dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o

dlm_device-y :=			device.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..57bdf09b520a
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,167 @@
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#include "dlm_internal.h"
#include "lock.h"
#include "ast.h"

#define WAKE_ASTS  0

static struct list_head ast_queue;
static spinlock_t ast_queue_lock;
static struct task_struct *astd_task;
static unsigned long astd_wakeflags;
static struct mutex astd_running;


void dlm_del_ast(struct dlm_lkb *lkb)
{
	spin_lock(&ast_queue_lock);
	if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
		list_del(&lkb->lkb_astqueue);
	spin_unlock(&ast_queue_lock);
}

void dlm_add_ast(struct dlm_lkb *lkb, int type)
{
	spin_lock(&ast_queue_lock);
	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
		kref_get(&lkb->lkb_ref);
		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
	}
	lkb->lkb_ast_type |= type;
	spin_unlock(&ast_queue_lock);

	set_bit(WAKE_ASTS, &astd_wakeflags);
	wake_up_process(astd_task);
}

static void process_asts(void)
{
	struct dlm_ls *ls = NULL;
	struct dlm_rsb *r = NULL;
	struct dlm_lkb *lkb;
	void (*cast) (long param);
	void (*bast) (long param, int mode);
	int type = 0, found, bmode;

	for (;;) {
		found = 0;
		spin_lock(&ast_queue_lock);
		list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
			r = lkb->lkb_resource;
			ls = r->res_ls;

			if (dlm_locking_stopped(ls))
				continue;

			list_del(&lkb->lkb_astqueue);
			type = lkb->lkb_ast_type;
			lkb->lkb_ast_type = 0;
			found = 1;
			break;
		}
		spin_unlock(&ast_queue_lock);

		if (!found)
			break;

		cast = lkb->lkb_astaddr;
		bast = lkb->lkb_bastaddr;
		bmode = lkb->lkb_bastmode;

		if ((type & AST_COMP) && cast)
			cast(lkb->lkb_astparam);

		/* FIXME: Is it safe to look at lkb_grmode here
		   without doing a lock_rsb() ?
		   Look at other checks in v1 to avoid basts. */

		if ((type & AST_BAST) && bast)
			if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
				bast(lkb->lkb_astparam, bmode);

		/* this removes the reference added by dlm_add_ast
		   and may result in the lkb being freed */
		dlm_put_lkb(lkb);

		schedule();
	}
}

static inline int no_asts(void)
{
	int ret;

	spin_lock(&ast_queue_lock);
	ret = list_empty(&ast_queue);
	spin_unlock(&ast_queue_lock);
	return ret;
}

static int dlm_astd(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(WAKE_ASTS, &astd_wakeflags))
			schedule();
		set_current_state(TASK_RUNNING);

		mutex_lock(&astd_running);
		if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
			process_asts();
		mutex_unlock(&astd_running);
	}
	return 0;
}

void dlm_astd_wake(void)
{
	if (!no_asts()) {
		set_bit(WAKE_ASTS, &astd_wakeflags);
		wake_up_process(astd_task);
	}
}

int dlm_astd_start(void)
{
	struct task_struct *p;
	int error = 0;

	INIT_LIST_HEAD(&ast_queue);
	spin_lock_init(&ast_queue_lock);
	mutex_init(&astd_running);

	p = kthread_run(dlm_astd, NULL, "dlm_astd");
	if (IS_ERR(p))
		error = PTR_ERR(p);
	else
		astd_task = p;
	return error;
}

void dlm_astd_stop(void)
{
	kthread_stop(astd_task);
}

void dlm_astd_suspend(void)
{
	mutex_lock(&astd_running);
}

void dlm_astd_resume(void)
{
	mutex_unlock(&astd_running);
}
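The dlm_astd daemon above is an instance of the stock wake-flag kthread idiom: producers set a bit and wake the thread, the thread tests the bit under TASK_INTERRUPTIBLE so wake-ups cannot be lost. A minimal, self-contained sketch of just that idiom follows; all demo_* names are illustrative and are not part of the DLM.

#include <linux/kthread.h>
#include <linux/sched.h>

#define DEMO_WAKE 0

static unsigned long demo_wakeflags;
static struct task_struct *demo_task;	/* set via kthread_run(demo_thread, NULL, "demo") */

static int demo_thread(void *data)
{
	while (!kthread_should_stop()) {
		/* Mark ourselves sleeping *before* testing the flag so a
		   concurrent wake_up_process() cannot be lost. */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(DEMO_WAKE, &demo_wakeflags))
			schedule();
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(DEMO_WAKE, &demo_wakeflags))
			; /* drain the work queue here, as process_asts() does */
	}
	return 0;
}

/* Producers queue an item under their own lock, then kick the daemon: */
static void demo_kick(void)
{
	set_bit(DEMO_WAKE, &demo_wakeflags);
	wake_up_process(demo_task);
}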
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__

void dlm_add_ast(struct dlm_lkb *lkb, int type);
void dlm_del_ast(struct dlm_lkb *lkb);

void dlm_astd_wake(void);
int dlm_astd_start(void);
void dlm_astd_stop(void);
void dlm_astd_suspend(void);
void dlm_astd_resume(void);

#endif
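The suspend/resume pair declared above is just the astd_running mutex: holding it keeps dlm_astd() from delivering callbacks while lock state is being rearranged. A hedged sketch of how a recovery path might bracket its critical section (the function name is made up; the real callers live in the recovery code elsewhere in this patch):

/* Hypothetical recovery-side usage of the hooks declared above. */
#include "ast.h"

static void demo_recover_quiesce(void)
{
	dlm_astd_suspend();	/* waits for any in-progress AST pass */
	/* ... adjust lock state; no ASTs can be delivered here ... */
	dlm_astd_resume();
	dlm_astd_wake();	/* catch anything queued while suspended */
}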
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/configfs.h>
#include <net/sock.h>

#include "config.h"
#include "lowcomms.h"

/*
 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
 * /config/dlm/<cluster>/comms/<comm>/nodeid
 * /config/dlm/<cluster>/comms/<comm>/local
 * /config/dlm/<cluster>/comms/<comm>/addr
 * The <cluster> level is useless, but I haven't figured out how to avoid it.
 */

static struct config_group *space_list;
static struct config_group *comm_list;
static struct comm *local_comm;

struct clusters;
struct cluster;
struct spaces;
struct space;
struct comms;
struct comm;
struct nodes;
struct node;

static struct config_group *make_cluster(struct config_group *, const char *);
static void drop_cluster(struct config_group *, struct config_item *);
static void release_cluster(struct config_item *);
static struct config_group *make_space(struct config_group *, const char *);
static void drop_space(struct config_group *, struct config_item *);
static void release_space(struct config_item *);
static struct config_item *make_comm(struct config_group *, const char *);
static void drop_comm(struct config_group *, struct config_item *);
static void release_comm(struct config_item *);
static struct config_item *make_node(struct config_group *, const char *);
static void drop_node(struct config_group *, struct config_item *);
static void release_node(struct config_item *);

static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
			 char *buf);
static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len);
static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
			 char *buf);
static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len);

static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
static ssize_t comm_local_read(struct comm *cm, char *buf);
static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
static ssize_t node_nodeid_read(struct node *nd, char *buf);
static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
static ssize_t node_weight_read(struct node *nd, char *buf);
static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);

enum {
	COMM_ATTR_NODEID = 0,
	COMM_ATTR_LOCAL,
	COMM_ATTR_ADDR,
};

struct comm_attribute {
	struct configfs_attribute attr;
	ssize_t (*show)(struct comm *, char *);
	ssize_t (*store)(struct comm *, const char *, size_t);
};

static struct comm_attribute comm_attr_nodeid = {
	.attr   = { .ca_owner = THIS_MODULE,
		    .ca_name = "nodeid",
		    .ca_mode = S_IRUGO | S_IWUSR },
	.show   = comm_nodeid_read,
	.store  = comm_nodeid_write,
};

static struct comm_attribute comm_attr_local = {
	.attr   = { .ca_owner = THIS_MODULE,
		    .ca_name = "local",
		    .ca_mode = S_IRUGO | S_IWUSR },
	.show   = comm_local_read,
	.store  = comm_local_write,
};

static struct comm_attribute comm_attr_addr = {
	.attr   = { .ca_owner = THIS_MODULE,
		    .ca_name = "addr",
		    .ca_mode = S_IRUGO | S_IWUSR },
	.store  = comm_addr_write,
};

static struct configfs_attribute *comm_attrs[] = {
	[COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
	[COMM_ATTR_LOCAL] = &comm_attr_local.attr,
	[COMM_ATTR_ADDR] = &comm_attr_addr.attr,
	NULL,
};

enum {
	NODE_ATTR_NODEID = 0,
	NODE_ATTR_WEIGHT,
};

struct node_attribute {
	struct configfs_attribute attr;
	ssize_t (*show)(struct node *, char *);
	ssize_t (*store)(struct node *, const char *, size_t);
};

static struct node_attribute node_attr_nodeid = {
	.attr   = { .ca_owner = THIS_MODULE,
		    .ca_name = "nodeid",
		    .ca_mode = S_IRUGO | S_IWUSR },
	.show   = node_nodeid_read,
	.store  = node_nodeid_write,
};

static struct node_attribute node_attr_weight = {
	.attr   = { .ca_owner = THIS_MODULE,
		    .ca_name = "weight",
		    .ca_mode = S_IRUGO | S_IWUSR },
	.show   = node_weight_read,
	.store  = node_weight_write,
};

static struct configfs_attribute *node_attrs[] = {
	[NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
	[NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
	NULL,
};

struct clusters {
	struct configfs_subsystem subsys;
};

struct cluster {
	struct config_group group;
};

struct spaces {
	struct config_group ss_group;
};

struct space {
	struct config_group group;
	struct list_head members;
	struct mutex members_lock;
	int members_count;
};

struct comms {
	struct config_group cs_group;
};

struct comm {
	struct config_item item;
	int nodeid;
	int local;
	int addr_count;
	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};

struct nodes {
	struct config_group ns_group;
};

struct node {
	struct config_item item;
	struct list_head list; /* space->members */
	int nodeid;
	int weight;
};

static struct configfs_group_operations clusters_ops = {
	.make_group = make_cluster,
	.drop_item = drop_cluster,
};

static struct configfs_item_operations cluster_ops = {
	.release = release_cluster,
};

static struct configfs_group_operations spaces_ops = {
	.make_group = make_space,
	.drop_item = drop_space,
};

static struct configfs_item_operations space_ops = {
	.release = release_space,
};

static struct configfs_group_operations comms_ops = {
	.make_item = make_comm,
	.drop_item = drop_comm,
};

static struct configfs_item_operations comm_ops = {
	.release = release_comm,
	.show_attribute = show_comm,
	.store_attribute = store_comm,
};

static struct configfs_group_operations nodes_ops = {
	.make_item = make_node,
	.drop_item = drop_node,
};

static struct configfs_item_operations node_ops = {
	.release = release_node,
	.show_attribute = show_node,
	.store_attribute = store_node,
};

static struct config_item_type clusters_type = {
	.ct_group_ops = &clusters_ops,
	.ct_owner = THIS_MODULE,
};

static struct config_item_type cluster_type = {
	.ct_item_ops = &cluster_ops,
	.ct_owner = THIS_MODULE,
};

static struct config_item_type spaces_type = {
	.ct_group_ops = &spaces_ops,
	.ct_owner = THIS_MODULE,
};

static struct config_item_type space_type = {
	.ct_item_ops = &space_ops,
	.ct_owner = THIS_MODULE,
};

static struct config_item_type comms_type = {
	.ct_group_ops = &comms_ops,
	.ct_owner = THIS_MODULE,
};

static struct config_item_type comm_type = {
	.ct_item_ops = &comm_ops,
	.ct_attrs = comm_attrs,
	.ct_owner = THIS_MODULE,
};

static struct config_item_type nodes_type = {
	.ct_group_ops = &nodes_ops,
	.ct_owner = THIS_MODULE,
};

static struct config_item_type node_type = {
	.ct_item_ops = &node_ops,
	.ct_attrs = node_attrs,
	.ct_owner = THIS_MODULE,
};

static struct cluster *to_cluster(struct config_item *i)
{
	return i ? container_of(to_config_group(i), struct cluster, group) : NULL;
}

static struct space *to_space(struct config_item *i)
{
	return i ? container_of(to_config_group(i), struct space, group) : NULL;
}

static struct comm *to_comm(struct config_item *i)
{
	return i ? container_of(i, struct comm, item) : NULL;
}

static struct node *to_node(struct config_item *i)
{
	return i ? container_of(i, struct node, item) : NULL;
}

static struct config_group *make_cluster(struct config_group *g,
					 const char *name)
{
	struct cluster *cl = NULL;
	struct spaces *sps = NULL;
	struct comms *cms = NULL;
	void *gps = NULL;

	cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
	sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
	cms = kzalloc(sizeof(struct comms), GFP_KERNEL);

	if (!cl || !gps || !sps || !cms)
		goto fail;

	config_group_init_type_name(&cl->group, name, &cluster_type);
	config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
	config_group_init_type_name(&cms->cs_group, "comms", &comms_type);

	cl->group.default_groups = gps;
	cl->group.default_groups[0] = &sps->ss_group;
	cl->group.default_groups[1] = &cms->cs_group;
	cl->group.default_groups[2] = NULL;

	space_list = &sps->ss_group;
	comm_list = &cms->cs_group;
	return &cl->group;

 fail:
	kfree(cl);
	kfree(gps);
	kfree(sps);
	kfree(cms);
	return NULL;
}

static void drop_cluster(struct config_group *g, struct config_item *i)
{
	struct cluster *cl = to_cluster(i);
	struct config_item *tmp;
	int j;

	for (j = 0; cl->group.default_groups[j]; j++) {
		tmp = &cl->group.default_groups[j]->cg_item;
		cl->group.default_groups[j] = NULL;
		config_item_put(tmp);
	}

	space_list = NULL;
	comm_list = NULL;

	config_item_put(i);
}

static void release_cluster(struct config_item *i)
{
	struct cluster *cl = to_cluster(i);
	kfree(cl->group.default_groups);
	kfree(cl);
}

static struct config_group *make_space(struct config_group *g, const char *name)
{
	struct space *sp = NULL;
	struct nodes *nds = NULL;
	void *gps = NULL;

	sp = kzalloc(sizeof(struct space), GFP_KERNEL);
	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
	nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);

	if (!sp || !gps || !nds)
		goto fail;

	config_group_init_type_name(&sp->group, name, &space_type);
	config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);

	sp->group.default_groups = gps;
	sp->group.default_groups[0] = &nds->ns_group;
	sp->group.default_groups[1] = NULL;

	INIT_LIST_HEAD(&sp->members);
	mutex_init(&sp->members_lock);
	sp->members_count = 0;
	return &sp->group;

 fail:
	kfree(sp);
	kfree(gps);
	kfree(nds);
	return NULL;
}

static void drop_space(struct config_group *g, struct config_item *i)
{
	struct space *sp = to_space(i);
	struct config_item *tmp;
	int j;

	/* assert list_empty(&sp->members) */

	for (j = 0; sp->group.default_groups[j]; j++) {
		tmp = &sp->group.default_groups[j]->cg_item;
		sp->group.default_groups[j] = NULL;
		config_item_put(tmp);
	}

	config_item_put(i);
}

static void release_space(struct config_item *i)
{
	struct space *sp = to_space(i);
	kfree(sp->group.default_groups);
	kfree(sp);
}

static struct config_item *make_comm(struct config_group *g, const char *name)
{
	struct comm *cm;

	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
	if (!cm)
		return NULL;

	config_item_init_type_name(&cm->item, name, &comm_type);
	cm->nodeid = -1;
	cm->local = 0;
	cm->addr_count = 0;
	return &cm->item;
}

static void drop_comm(struct config_group *g, struct config_item *i)
{
	struct comm *cm = to_comm(i);
	if (local_comm == cm)
		local_comm = NULL;
	dlm_lowcomms_close(cm->nodeid);
	while (cm->addr_count--)
		kfree(cm->addr[cm->addr_count]);
	config_item_put(i);
}

static void release_comm(struct config_item *i)
{
	struct comm *cm = to_comm(i);
	kfree(cm);
}

static struct config_item *make_node(struct config_group *g, const char *name)
{
	struct space *sp = to_space(g->cg_item.ci_parent);
	struct node *nd;

	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
	if (!nd)
		return NULL;

	config_item_init_type_name(&nd->item, name, &node_type);
	nd->nodeid = -1;
	nd->weight = 1;  /* default weight of 1 if none is set */

	mutex_lock(&sp->members_lock);
	list_add(&nd->list, &sp->members);
	sp->members_count++;
	mutex_unlock(&sp->members_lock);

	return &nd->item;
}

static void drop_node(struct config_group *g, struct config_item *i)
{
	struct space *sp = to_space(g->cg_item.ci_parent);
	struct node *nd = to_node(i);

	mutex_lock(&sp->members_lock);
	list_del(&nd->list);
	sp->members_count--;
	mutex_unlock(&sp->members_lock);

	config_item_put(i);
}

static void release_node(struct config_item *i)
{
	struct node *nd = to_node(i);
	kfree(nd);
}

static struct clusters clusters_root = {
	.subsys = {
		.su_group = {
			.cg_item = {
				.ci_namebuf = "dlm",
				.ci_type = &clusters_type,
			},
		},
	},
};

int dlm_config_init(void)
{
	config_group_init(&clusters_root.subsys.su_group);
	init_MUTEX(&clusters_root.subsys.su_sem);
	return configfs_register_subsystem(&clusters_root.subsys);
}

void dlm_config_exit(void)
{
	configfs_unregister_subsystem(&clusters_root.subsys);
}

/*
 * Functions for user space to read/write attributes
 */

static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
			 char *buf)
{
	struct comm *cm = to_comm(i);
	struct comm_attribute *cma =
			container_of(a, struct comm_attribute, attr);
	return cma->show ? cma->show(cm, buf) : 0;
}

static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len)
{
	struct comm *cm = to_comm(i);
	struct comm_attribute *cma =
		container_of(a, struct comm_attribute, attr);
	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
}

static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
{
	return sprintf(buf, "%d\n", cm->nodeid);
}

static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
{
	cm->nodeid = simple_strtol(buf, NULL, 0);
	return len;
}

static ssize_t comm_local_read(struct comm *cm, char *buf)
{
	return sprintf(buf, "%d\n", cm->local);
}

static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
{
	cm->local = simple_strtol(buf, NULL, 0);
	if (cm->local && !local_comm)
		local_comm = cm;
	return len;
}

static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
{
	struct sockaddr_storage *addr;

	if (len != sizeof(struct sockaddr_storage))
		return -EINVAL;

	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
		return -ENOSPC;

	addr = kzalloc(sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	memcpy(addr, buf, len);
	cm->addr[cm->addr_count++] = addr;
	return len;
}

static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
			 char *buf)
{
	struct node *nd = to_node(i);
	struct node_attribute *nda =
			container_of(a, struct node_attribute, attr);
	return nda->show ? nda->show(nd, buf) : 0;
}

static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len)
{
	struct node *nd = to_node(i);
	struct node_attribute *nda =
		container_of(a, struct node_attribute, attr);
	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
}

static ssize_t node_nodeid_read(struct node *nd, char *buf)
{
	return sprintf(buf, "%d\n", nd->nodeid);
}

static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
{
	nd->nodeid = simple_strtol(buf, NULL, 0);
	return len;
}

static ssize_t node_weight_read(struct node *nd, char *buf)
{
	return sprintf(buf, "%d\n", nd->weight);
}

static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
{
	nd->weight = simple_strtol(buf, NULL, 0);
	return len;
}

/*
 * Functions for the dlm to get the info that's been configured
 */

static struct space *get_space(char *name)
{
	if (!space_list)
		return NULL;
	return to_space(config_group_find_obj(space_list, name));
}

static void put_space(struct space *sp)
{
	config_item_put(&sp->group.cg_item);
}

static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
{
	struct config_item *i;
	struct comm *cm = NULL;
	int found = 0;

	if (!comm_list)
		return NULL;

	down(&clusters_root.subsys.su_sem);

	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
		cm = to_comm(i);

		if (nodeid) {
			if (cm->nodeid != nodeid)
				continue;
			found = 1;
			break;
		} else {
			if (!cm->addr_count ||
			    memcmp(cm->addr[0], addr, sizeof(*addr)))
				continue;
			found = 1;
			break;
		}
	}
	up(&clusters_root.subsys.su_sem);

	if (found)
		config_item_get(i);
	else
		cm = NULL;
	return cm;
}

static void put_comm(struct comm *cm)
{
	config_item_put(&cm->item);
}

/* caller must free mem */
int dlm_nodeid_list(char *lsname, int **ids_out)
{
	struct space *sp;
	struct node *nd;
	int i = 0, rv = 0;
	int *ids;

	sp = get_space(lsname);
	if (!sp)
		return -EEXIST;

	mutex_lock(&sp->members_lock);
	if (!sp->members_count) {
		rv = 0;
		goto out;
	}

	ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
	if (!ids) {
		rv = -ENOMEM;
		goto out;
	}

	rv = sp->members_count;
	list_for_each_entry(nd, &sp->members, list)
		ids[i++] = nd->nodeid;

	if (rv != i)
		printk("bad nodeid count %d %d\n", rv, i);

	*ids_out = ids;
 out:
	mutex_unlock(&sp->members_lock);
	put_space(sp);
	return rv;
}

int dlm_node_weight(char *lsname, int nodeid)
{
	struct space *sp;
	struct node *nd;
	int w = -EEXIST;

	sp = get_space(lsname);
	if (!sp)
		goto out;

	mutex_lock(&sp->members_lock);
	list_for_each_entry(nd, &sp->members, list) {
		if (nd->nodeid != nodeid)
			continue;
		w = nd->weight;
		break;
	}
	mutex_unlock(&sp->members_lock);
	put_space(sp);
 out:
	return w;
}

int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
{
	struct comm *cm = get_comm(nodeid, NULL);
	if (!cm)
		return -EEXIST;
	if (!cm->addr_count)
		return -ENOENT;
	memcpy(addr, cm->addr[0], sizeof(*addr));
	put_comm(cm);
	return 0;
}

int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
{
	struct comm *cm = get_comm(0, addr);
	if (!cm)
		return -EEXIST;
	*nodeid = cm->nodeid;
	put_comm(cm);
	return 0;
}

int dlm_our_nodeid(void)
{
	return local_comm ? local_comm->nodeid : 0;
}

/* num 0 is first addr, num 1 is second addr */
int dlm_our_addr(struct sockaddr_storage *addr, int num)
{
	if (!local_comm)
		return -1;
	if (num + 1 > local_comm->addr_count)
		return -1;
	memcpy(addr, local_comm->addr[num], sizeof(*addr));
	return 0;
}

/* Config file defaults */
#define DEFAULT_TCP_PORT       21064
#define DEFAULT_BUFFER_SIZE     4096
#define DEFAULT_RSBTBL_SIZE      256
#define DEFAULT_LKBTBL_SIZE     1024
#define DEFAULT_DIRTBL_SIZE      512
#define DEFAULT_RECOVER_TIMER      5
#define DEFAULT_TOSS_SECS         10
#define DEFAULT_SCAN_SECS          5

struct dlm_config_info dlm_config = {
	.tcp_port = DEFAULT_TCP_PORT,
	.buffer_size = DEFAULT_BUFFER_SIZE,
	.rsbtbl_size = DEFAULT_RSBTBL_SIZE,
	.lkbtbl_size = DEFAULT_LKBTBL_SIZE,
	.dirtbl_size = DEFAULT_DIRTBL_SIZE,
	.recover_timer = DEFAULT_RECOVER_TIMER,
	.toss_secs = DEFAULT_TOSS_SECS,
	.scan_secs = DEFAULT_SCAN_SECS
};
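The query helpers at the bottom of config.c are what the rest of the dlm calls to learn the configured membership. A hedged sketch of a caller follows; "alpha" is a made-up lockspace name, and the comment about freeing is taken from the "caller must free mem" note above.

/* Illustrative caller of the config.c query helpers; dlm_nodeid_list()
 * returns the member count and hands back a kcalloc'd array the caller
 * must kfree(). */
static void demo_print_members(void)
{
	int *ids = NULL;
	int i, count;

	count = dlm_nodeid_list("alpha", &ids);
	if (count < 0)
		return;		/* -EEXIST: no such space, or -ENOMEM */

	for (i = 0; i < count; i++)
		printk("node %d weight %d\n", ids[i],
		       dlm_node_weight("alpha", ids[i]));
	kfree(ids);	/* kfree(NULL) is fine for the count == 0 case */
}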
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__

#define DLM_MAX_ADDR_COUNT 3

struct dlm_config_info {
	int tcp_port;
	int buffer_size;
	int rsbtbl_size;
	int lkbtbl_size;
	int dirtbl_size;
	int recover_timer;
	int toss_secs;
	int scan_secs;
};

extern struct dlm_config_info dlm_config;

int dlm_config_init(void);
void dlm_config_exit(void);
int dlm_node_weight(char *lsname, int nodeid);
int dlm_nodeid_list(char *lsname, int **ids_out);
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);

#endif				/* __CONFIG_DOT_H__ */
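Consumers simply read the exported dlm_config fields; the values stay at the DEFAULT_* constants from config.c unless changed before use. A minimal, hypothetical consumer (demo_listen_port() is not part of the dlm):

#include "config.h"

static int demo_listen_port(void)
{
	/* Defaults to 21064 (DEFAULT_TCP_PORT in config.c). */
	return dlm_config.tcp_port;
}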
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..49deca845dba
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,296 @@
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>

#include "dlm_internal.h"


static struct dentry *dlm_root;

struct rsb_iter {
	int entry;
	struct dlm_ls *ls;
	struct list_head *next;
	struct dlm_rsb *rsb;
};

static char *print_lockmode(int mode)
{
	switch (mode) {
	case DLM_LOCK_IV:
		return "--";
	case DLM_LOCK_NL:
		return "NL";
	case DLM_LOCK_CR:
		return "CR";
	case DLM_LOCK_CW:
		return "CW";
	case DLM_LOCK_PR:
		return "PR";
	case DLM_LOCK_PW:
		return "PW";
	case DLM_LOCK_EX:
		return "EX";
	default:
		return "??";
	}
}

static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
		       struct dlm_rsb *res)
{
	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));

	if (lkb->lkb_status == DLM_LKSTS_CONVERT
	    || lkb->lkb_status == DLM_LKSTS_WAITING)
		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));

	if (lkb->lkb_nodeid) {
		if (lkb->lkb_nodeid != res->res_nodeid)
			seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
				   lkb->lkb_remid);
		else
			seq_printf(s, " Master:     %08x", lkb->lkb_remid);
	}

	if (lkb->lkb_wait_type)
		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);

	seq_printf(s, "\n");
}

static int print_resource(struct dlm_rsb *res, struct seq_file *s)
{
	struct dlm_lkb *lkb;
	int i, lvblen = res->res_ls->ls_lvblen;

	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
	for (i = 0; i < res->res_length; i++) {
		if (isprint(res->res_name[i]))
			seq_printf(s, "%c", res->res_name[i]);
		else
			seq_printf(s, "%c", '.');
	}
	if (res->res_nodeid > 0)
		seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
			   res->res_nodeid);
	else if (res->res_nodeid == 0)
		seq_printf(s, "\" \nMaster Copy\n");
	else if (res->res_nodeid == -1)
		seq_printf(s, "\" \nLooking up master (lkid %x)\n",
			   res->res_first_lkid);
	else
		seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);

	/* Print the LVB: */
	if (res->res_lvbptr) {
		seq_printf(s, "LVB: ");
		for (i = 0; i < lvblen; i++) {
			if (i == lvblen / 2)
				seq_printf(s, "\n     ");
			seq_printf(s, "%02x ",
				   (unsigned char) res->res_lvbptr[i]);
		}
		if (rsb_flag(res, RSB_VALNOTVALID))
			seq_printf(s, " (INVALID)");
		seq_printf(s, "\n");
	}

	/* Print the locks attached to this resource */
	seq_printf(s, "Granted Queue\n");
	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
		print_lock(s, lkb, res);

	seq_printf(s, "Conversion Queue\n");
	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
		print_lock(s, lkb, res);

	seq_printf(s, "Waiting Queue\n");
	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
		print_lock(s, lkb, res);

	return 0;
}

static int rsb_iter_next(struct rsb_iter *ri)
{
	struct dlm_ls *ls = ri->ls;
	int i;

	if (!ri->next) {
 top:
		/* Find the next non-empty hash bucket */
		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
			read_lock(&ls->ls_rsbtbl[i].lock);
			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
				ri->next = ls->ls_rsbtbl[i].list.next;
				read_unlock(&ls->ls_rsbtbl[i].lock);
				break;
			}
			read_unlock(&ls->ls_rsbtbl[i].lock);
		}
		ri->entry = i;

		if (ri->entry >= ls->ls_rsbtbl_size)
			return 1;
	} else {
		i = ri->entry;
		read_lock(&ls->ls_rsbtbl[i].lock);
		ri->next = ri->next->next;
		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
			/* End of list - move to next bucket */
			ri->next = NULL;
			ri->entry++;
			read_unlock(&ls->ls_rsbtbl[i].lock);
			goto top;
		}
		read_unlock(&ls->ls_rsbtbl[i].lock);
	}
	ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);

	return 0;
}

static void rsb_iter_free(struct rsb_iter *ri)
{
	kfree(ri);
}

static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
{
	struct rsb_iter *ri;

	ri = kmalloc(sizeof *ri, GFP_KERNEL);
	if (!ri)
		return NULL;

	ri->ls = ls;
	ri->entry = 0;
	ri->next = NULL;

	if (rsb_iter_next(ri)) {
		rsb_iter_free(ri);
		return NULL;
	}

	return ri;
}

static void *seq_start(struct seq_file *file, loff_t *pos)
{
	struct rsb_iter *ri;
	loff_t n = *pos;

	ri = rsb_iter_init(file->private);
	if (!ri)
		return NULL;

	while (n--) {
		if (rsb_iter_next(ri)) {
			rsb_iter_free(ri);
			return NULL;
		}
	}

	return ri;
}

static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
{
	struct rsb_iter *ri = iter_ptr;

	(*pos)++;

	if (rsb_iter_next(ri)) {
		rsb_iter_free(ri);
		return NULL;
	}

	return ri;
}

static void seq_stop(struct seq_file *file, void *iter_ptr)
{
	/* nothing for now */
}

static int seq_show(struct seq_file *file, void *iter_ptr)
{
	struct rsb_iter *ri = iter_ptr;

	print_resource(ri->rsb, file);

	return 0;
}

static struct seq_operations dlm_seq_ops = {
	.start = seq_start,
	.next  = seq_next,
	.stop  = seq_stop,
	.show  = seq_show,
};

static int do_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int ret;

	ret = seq_open(file, &dlm_seq_ops);
	if (ret)
		return ret;

	seq = file->private_data;
	seq->private = inode->u.generic_ip;

	return 0;
}

static struct file_operations dlm_fops = {
	.owner   = THIS_MODULE,
	.open    = do_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release
};

int dlm_create_debug_file(struct dlm_ls *ls)
{
	ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
						  S_IFREG | S_IRUGO,
						  dlm_root,
						  ls,
						  &dlm_fops);
	return ls->ls_debug_dentry ? 0 : -ENOMEM;
}

void dlm_delete_debug_file(struct dlm_ls *ls)
{
	if (ls->ls_debug_dentry)
		debugfs_remove(ls->ls_debug_dentry);
}

int dlm_register_debugfs(void)
{
	dlm_root = debugfs_create_dir("dlm", NULL);
	return dlm_root ? 0 : -ENOMEM;
}

void dlm_unregister_debugfs(void)
{
	debugfs_remove(dlm_root);
}
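Note the seq_file contract implemented above: start() may be called with a nonzero position after a partial read, so seq_start() rebuilds the rsb_iter from scratch and replays rsb_iter_next() n times rather than seeking. A miniature of the same contract over a plain array, with all demo_* names illustrative:

#include <linux/kernel.h>
#include <linux/seq_file.h>

static int demo_items[] = { 1, 2, 3 };

static void *demo_start(struct seq_file *s, loff_t *pos)
{
	/* Like seq_start() above, re-derive the cursor from *pos on
	   every call; the kernel may restart iteration mid-file. */
	if (*pos >= ARRAY_SIZE(demo_items))
		return NULL;
	return &demo_items[*pos];
}

static void *demo_next(struct seq_file *s, void *v, loff_t *pos)
{
	(*pos)++;
	return (*pos < ARRAY_SIZE(demo_items)) ? &demo_items[*pos] : NULL;
}

static void demo_stop(struct seq_file *s, void *v)
{
	/* release locks or iterators here if start() took any */
}

static int demo_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", *(int *)v);
	return 0;
}

static struct seq_operations demo_seq_ops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};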
diff --git a/fs/dlm/device.c b/fs/dlm/device.c
new file mode 100644
index 000000000000..49a20d549216
--- /dev/null
+++ b/fs/dlm/device.c
@@ -0,0 +1,1091 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * device.c
16 *
17 * This is the userland interface to the DLM.
18 *
19 * The locking is done via a misc char device (find the
20 * registered minor number in /proc/misc).
21 *
22 * User code should not use this interface directly but
23 * call the library routines in libdlm.a instead.
24 *
25 */
26
27#include <linux/miscdevice.h>
28#include <linux/init.h>
29#include <linux/wait.h>
30#include <linux/module.h>
31#include <linux/file.h>
32#include <linux/fs.h>
33#include <linux/poll.h>
34#include <linux/signal.h>
35#include <linux/spinlock.h>
36#include <linux/idr.h>
37
38#include <linux/dlm.h>
39#include <linux/dlm_device.h>
40
41#include "lvb_table.h"
42
43static struct file_operations _dlm_fops;
44static const char *name_prefix="dlm";
45static struct list_head user_ls_list;
46static struct mutex user_ls_lock;
47
48/* Lock infos are stored in here indexed by lock ID */
49static DEFINE_IDR(lockinfo_idr);
50static rwlock_t lockinfo_lock;
51
52/* Flags in li_flags */
53#define LI_FLAG_COMPLETE 1
54#define LI_FLAG_FIRSTLOCK 2
55#define LI_FLAG_PERSISTENT 3
56#define LI_FLAG_ONLIST 4
57
58/* flags in ls_flags*/
59#define LS_FLAG_DELETED 1
60#define LS_FLAG_AUTOFREE 2
61
62
63#define LOCKINFO_MAGIC 0x53595324
64
65struct lock_info {
66 uint32_t li_magic;
67 uint8_t li_cmd;
68 int8_t li_grmode;
69 int8_t li_rqmode;
70 struct dlm_lksb li_lksb;
71 wait_queue_head_t li_waitq;
72 unsigned long li_flags;
73 void __user *li_castparam;
74 void __user *li_castaddr;
75 void __user *li_bastparam;
76 void __user *li_bastaddr;
77 void __user *li_pend_bastparam;
78 void __user *li_pend_bastaddr;
79 struct list_head li_ownerqueue;
80 struct file_info *li_file;
81 struct dlm_lksb __user *li_user_lksb;
82 struct completion li_firstcomp;
83};
84
85/* A queued AST no less */
86struct ast_info {
87 struct dlm_lock_result result;
88 struct list_head list;
89 uint32_t lvb_updated;
90 uint32_t progress; /* How much has been read */
91};
92
93/* One of these per userland lockspace */
94struct user_ls {
95 void *ls_lockspace;
96 atomic_t ls_refcnt;
97 long ls_flags;
98
99 /* Passed into misc_register() */
100 struct miscdevice ls_miscinfo;
101 struct list_head ls_list;
102};
103
104/* misc_device info for the control device */
105static struct miscdevice ctl_device;
106
107/*
108 * Stuff we hang off the file struct.
109 * The first two are to cope with unlocking all the
110 * locks help by a process when it dies.
111 */
112struct file_info {
113 struct list_head fi_li_list; /* List of active lock_infos */
114 spinlock_t fi_li_lock;
115 struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
116 spinlock_t fi_ast_lock;
117 wait_queue_head_t fi_wait;
118 struct user_ls *fi_ls;
119 atomic_t fi_refcnt; /* Number of users */
120 unsigned long fi_flags; /* Bit 1 means the device is open */
121};
122
123
124/* get and put ops for file_info.
125 Actually I don't really like "get" and "put", but everyone
126 else seems to use them and I can't think of anything
127 nicer at the moment */
128static void get_file_info(struct file_info *f)
129{
130 atomic_inc(&f->fi_refcnt);
131}
132
133static void put_file_info(struct file_info *f)
134{
135 if (atomic_dec_and_test(&f->fi_refcnt))
136 kfree(f);
137}
138
139static void release_lockinfo(struct lock_info *li)
140{
141 put_file_info(li->li_file);
142
143 write_lock(&lockinfo_lock);
144 idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid);
145 write_unlock(&lockinfo_lock);
146
147 if (li->li_lksb.sb_lvbptr)
148 kfree(li->li_lksb.sb_lvbptr);
149 kfree(li);
150
151 module_put(THIS_MODULE);
152}
153
154static struct lock_info *get_lockinfo(uint32_t lockid)
155{
156 struct lock_info *li;
157
158 read_lock(&lockinfo_lock);
159 li = idr_find(&lockinfo_idr, lockid);
160 read_unlock(&lockinfo_lock);
161
162 return li;
163}
164
165static int add_lockinfo(struct lock_info *li)
166{
167 int n;
168 int r;
169 int ret = -EINVAL;
170
171 write_lock(&lockinfo_lock);
172
173 if (idr_find(&lockinfo_idr, li->li_lksb.sb_lkid))
174 goto out_up;
175
176 ret = -ENOMEM;
177 r = idr_pre_get(&lockinfo_idr, GFP_KERNEL);
178 if (!r)
179 goto out_up;
180
181 r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n);
182 if (r)
183 goto out_up;
184
185 if (n != li->li_lksb.sb_lkid) {
186 idr_remove(&lockinfo_idr, n);
187 goto out_up;
188 }
189
190 ret = 0;
191
192 out_up:
193 write_unlock(&lockinfo_lock);
194
195 return ret;
196}
197
198
199static struct user_ls *__find_lockspace(int minor)
200{
201 struct user_ls *lsinfo;
202
203 list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
204 if (lsinfo->ls_miscinfo.minor == minor)
205 return lsinfo;
206 }
207 return NULL;
208}
209
210/* Find a lockspace struct given the device minor number */
211static struct user_ls *find_lockspace(int minor)
212{
213 struct user_ls *lsinfo;
214
215 mutex_lock(&user_ls_lock);
216 lsinfo = __find_lockspace(minor);
217 mutex_unlock(&user_ls_lock);
218
219 return lsinfo;
220}
221
222static void add_lockspace_to_list(struct user_ls *lsinfo)
223{
224 mutex_lock(&user_ls_lock);
225 list_add(&lsinfo->ls_list, &user_ls_list);
226 mutex_unlock(&user_ls_lock);
227}
228
229/* Register a lockspace with the DLM and create a misc
230 device for userland to access it */
231static int register_lockspace(char *name, struct user_ls **ls, int flags)
232{
233 struct user_ls *newls;
234 int status;
235 int namelen;
236
237 namelen = strlen(name)+strlen(name_prefix)+2;
238
239 newls = kzalloc(sizeof(struct user_ls), GFP_KERNEL);
240 if (!newls)
241 return -ENOMEM;
242
243 newls->ls_miscinfo.name = kzalloc(namelen, GFP_KERNEL);
244 if (!newls->ls_miscinfo.name) {
245 kfree(newls);
246 return -ENOMEM;
247 }
248
249 status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0,
250 DLM_USER_LVB_LEN);
251 if (status != 0) {
252 kfree(newls->ls_miscinfo.name);
253 kfree(newls);
254 return status;
255 }
256
257 snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s",
258 name_prefix, name);
259
260 newls->ls_miscinfo.fops = &_dlm_fops;
261 newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
262
263 status = misc_register(&newls->ls_miscinfo);
264 if (status) {
265 printk(KERN_ERR "dlm: misc register failed for %s\n", name);
266 dlm_release_lockspace(newls->ls_lockspace, 0);
267 kfree(newls->ls_miscinfo.name);
268 kfree(newls);
269 return status;
270 }
271
272 if (flags & DLM_USER_LSFLG_AUTOFREE)
273 set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags);
274
275 add_lockspace_to_list(newls);
276 *ls = newls;
277 return 0;
278}
279
280/* Called with the user_ls_lock mutex held */
281static int unregister_lockspace(struct user_ls *lsinfo, int force)
282{
283 int status;
284
285 status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
286 if (status)
287 return status;
288
289 status = misc_deregister(&lsinfo->ls_miscinfo);
290 if (status)
291 return status;
292
293 list_del(&lsinfo->ls_list);
294 set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags);
295 lsinfo->ls_lockspace = NULL;
296 if (atomic_read(&lsinfo->ls_refcnt) == 0) {
297 kfree(lsinfo->ls_miscinfo.name);
298 kfree(lsinfo);
299 }
300
301 return 0;
302}
303
304/* Add it to userland's AST queue */
305static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam,
306 int lvb_updated)
307{
308 struct ast_info *ast = kzalloc(sizeof(struct ast_info), GFP_KERNEL);
309 if (!ast)
310 return;
311
312 ast->result.user_astparam = astparam;
313 ast->result.user_astaddr = astaddr;
314 ast->result.user_lksb = li->li_user_lksb;
315 memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
316 ast->lvb_updated = lvb_updated;
317
318 spin_lock(&li->li_file->fi_ast_lock);
319 list_add_tail(&ast->list, &li->li_file->fi_ast_list);
320 spin_unlock(&li->li_file->fi_ast_lock);
321 wake_up_interruptible(&li->li_file->fi_wait);
322}
323
324static void bast_routine(void *param, int mode)
325{
326 struct lock_info *li = param;
327
328 if (li && li->li_bastaddr)
329 add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0);
330}
331
332/*
333 * This is the kernel's AST routine.
334 * All lock, unlock & query operations complete here.
335 * The only syncronous ops are those done during device close.
336 */
337static void ast_routine(void *param)
338{
339 struct lock_info *li = param;
340
341 /* Param may be NULL if a persistent lock is unlocked by someone else */
342 if (!li)
343 return;
344
345 /* If this is a succesful conversion then activate the blocking ast
346 * args from the conversion request */
347 if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
348 li->li_lksb.sb_status == 0) {
349
350 li->li_bastparam = li->li_pend_bastparam;
351 li->li_bastaddr = li->li_pend_bastaddr;
352 li->li_pend_bastaddr = NULL;
353 }
354
355 /* If it's an async request then post data to the user's AST queue. */
356 if (li->li_castaddr) {
357 int lvb_updated = 0;
358
359 /* See if the lvb has been updated */
360 if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1)
361 lvb_updated = 1;
362
363 if (li->li_lksb.sb_status == 0)
364 li->li_grmode = li->li_rqmode;
365
366 /* Only queue AST if the device is still open */
367 if (test_bit(1, &li->li_file->fi_flags))
368 add_to_astqueue(li, li->li_castaddr, li->li_castparam,
369 lvb_updated);
370
371 /* If it's a new lock operation that failed, then
372 * remove it from the owner queue and free the
373 * lock_info.
374 */
375 if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
376 li->li_lksb.sb_status != 0) {
377
378 /* Wait till dlm_lock() has finished */
379 wait_for_completion(&li->li_firstcomp);
380
381 spin_lock(&li->li_file->fi_li_lock);
382 list_del(&li->li_ownerqueue);
383 clear_bit(LI_FLAG_ONLIST, &li->li_flags);
384 spin_unlock(&li->li_file->fi_li_lock);
385 release_lockinfo(li);
386 return;
387 }
388 /* Free unlocks & queries */
389 if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
390 li->li_cmd == DLM_USER_QUERY) {
391 release_lockinfo(li);
392 }
393 } else {
394 /* Synchronous request, just wake up the caller */
395 set_bit(LI_FLAG_COMPLETE, &li->li_flags);
396 wake_up_interruptible(&li->li_waitq);
397 }
398}
399
400/*
401 * Wait for the lock op to complete and return the status.
402 */
403static int wait_for_ast(struct lock_info *li)
404{
405 /* Wait for the AST routine to complete */
406 set_task_state(current, TASK_INTERRUPTIBLE);
407 while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
408 schedule();
409
410 set_task_state(current, TASK_RUNNING);
411
412 return li->li_lksb.sb_status;
413}
414
415
416/* Open on control device */
417static int dlm_ctl_open(struct inode *inode, struct file *file)
418{
419 file->private_data = NULL;
420 return 0;
421}
422
423/* Close on control device */
424static int dlm_ctl_close(struct inode *inode, struct file *file)
425{
426 return 0;
427}
428
429/* Open on lockspace device */
430static int dlm_open(struct inode *inode, struct file *file)
431{
432 struct file_info *f;
433 struct user_ls *lsinfo;
434
435 lsinfo = find_lockspace(iminor(inode));
436 if (!lsinfo)
437 return -ENOENT;
438
439 f = kzalloc(sizeof(struct file_info), GFP_KERNEL);
440 if (!f)
441 return -ENOMEM;
442
443 atomic_inc(&lsinfo->ls_refcnt);
444 INIT_LIST_HEAD(&f->fi_li_list);
445 INIT_LIST_HEAD(&f->fi_ast_list);
446 spin_lock_init(&f->fi_li_lock);
447 spin_lock_init(&f->fi_ast_lock);
448 init_waitqueue_head(&f->fi_wait);
449 f->fi_ls = lsinfo;
450 f->fi_flags = 0;
451 get_file_info(f);
452 set_bit(1, &f->fi_flags);
453
454 file->private_data = f;
455
456 return 0;
457}
458
459/* Check the user's version matches ours */
460static int check_version(struct dlm_write_request *req)
461{
462 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
463 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
464 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
465
466 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
467 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
468 current->comm,
469 current->pid,
470 req->version[0],
471 req->version[1],
472 req->version[2],
473 DLM_DEVICE_VERSION_MAJOR,
474 DLM_DEVICE_VERSION_MINOR,
475 DLM_DEVICE_VERSION_PATCH);
476 return -EINVAL;
477 }
478 return 0;
479}
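
To make the acceptance rule above concrete (with a hypothetical kernel version of 5.1): a request stamped 5.0 or 5.1 is accepted, 5.2 is rejected because the user's minor is newer than the kernel's, and 4.x is rejected outright on the major number. Userspace therefore stamps the three version[] bytes from the DLM_DEVICE_VERSION_* constants before every write to the device.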
480
481/* Close on lockspace device */
482static int dlm_close(struct inode *inode, struct file *file)
483{
484 struct file_info *f = file->private_data;
485 struct lock_info li;
486 struct lock_info *old_li, *safe;
487 sigset_t tmpsig;
488 sigset_t allsigs;
489 struct user_ls *lsinfo;
490 DECLARE_WAITQUEUE(wq, current);
491
492 lsinfo = find_lockspace(iminor(inode));
493 if (!lsinfo)
494 return -ENOENT;
495
496 /* Mark this closed so that ASTs will not be delivered any more */
497 clear_bit(1, &f->fi_flags);
498
499 /* Block signals while we are doing this */
500 sigfillset(&allsigs);
501 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
502
503 /* We use our own lock_info struct here, so that any
504 * outstanding "real" ASTs will be delivered with the
505 * corresponding "real" params, thus freeing the lock_info
506	 * that belongs to the lock. This catches the corner case where
507 * a lock is BUSY when we try to unlock it here
508 */
509 memset(&li, 0, sizeof(li));
510 clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
511 init_waitqueue_head(&li.li_waitq);
512 add_wait_queue(&li.li_waitq, &wq);
513
514 /*
515	 * Free any outstanding locks. They are on the
516	 * list in LIFO order, so children are unlocked
517	 * before their parents.
518 */
519 list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) {
520 int status;
521 int flags = 0;
522
523 /* Don't unlock persistent locks, just mark them orphaned */
524 if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) {
525 list_del(&old_li->li_ownerqueue);
526
527 /* Update master copy */
528 /* TODO: Check locking core updates the local and
529 remote ORPHAN flags */
530 li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid;
531 status = dlm_lock(f->fi_ls->ls_lockspace,
532 old_li->li_grmode, &li.li_lksb,
533 DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
534 NULL, 0, 0, ast_routine, NULL, NULL);
535 if (status != 0)
536 printk("dlm: Error orphaning lock %x: %d\n",
537 old_li->li_lksb.sb_lkid, status);
538
539 /* But tidy our references in it */
540 release_lockinfo(old_li);
541 continue;
542 }
543
544 clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
545
546 flags = DLM_LKF_FORCEUNLOCK;
547 if (old_li->li_grmode >= DLM_LOCK_PW)
548 flags |= DLM_LKF_IVVALBLK;
549
550 status = dlm_unlock(f->fi_ls->ls_lockspace,
551 old_li->li_lksb.sb_lkid, flags,
552 &li.li_lksb, &li);
553
554 /* Must wait for it to complete as the next lock could be its
555 * parent */
556 if (status == 0)
557 wait_for_ast(&li);
558
559		/* Unlock succeeded, free the lock_info struct. */
560 if (status == 0)
561 release_lockinfo(old_li);
562 }
563
564 remove_wait_queue(&li.li_waitq, &wq);
565
566 /*
567 * If this is the last reference to the lockspace
568 * then free the struct. If it's an AUTOFREE lockspace
569 * then free the whole thing.
570 */
571 mutex_lock(&user_ls_lock);
572 if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
573
574 if (lsinfo->ls_lockspace) {
575 if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) {
576 unregister_lockspace(lsinfo, 1);
577 }
578 } else {
579 kfree(lsinfo->ls_miscinfo.name);
580 kfree(lsinfo);
581 }
582 }
583 mutex_unlock(&user_ls_lock);
584 put_file_info(f);
585
586 /* Restore signals */
587 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
588 recalc_sigpending();
589
590 return 0;
591}
592
593static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd,
594 struct dlm_lspace_params *kparams)
595{
596 int status;
597 struct user_ls *lsinfo;
598
599 if (!capable(CAP_SYS_ADMIN))
600 return -EPERM;
601
602 status = register_lockspace(kparams->name, &lsinfo, kparams->flags);
603
604 /* If it succeeded then return the minor number */
605 if (status == 0)
606 status = lsinfo->ls_miscinfo.minor;
607
608 return status;
609}
610
611static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd,
612 struct dlm_lspace_params *kparams)
613{
614 int status;
615 int force = 1;
616 struct user_ls *lsinfo;
617
618 if (!capable(CAP_SYS_ADMIN))
619 return -EPERM;
620
621 mutex_lock(&user_ls_lock);
622 lsinfo = __find_lockspace(kparams->minor);
623 if (!lsinfo) {
624 mutex_unlock(&user_ls_lock);
625 return -EINVAL;
626 }
627
628 if (kparams->flags & DLM_USER_LSFLG_FORCEFREE)
629 force = 3;
630
631 status = unregister_lockspace(lsinfo, force);
632 mutex_unlock(&user_ls_lock);
633
634 return status;
635}
636
637/* Read call, might block if no ASTs are waiting.
638 * It will only ever return one message at a time, regardless
639 * of how many are pending.
640 */
641static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count,
642 loff_t *ppos)
643{
644 struct file_info *fi = file->private_data;
645 struct ast_info *ast;
646 int data_size;
647 int offset;
648 DECLARE_WAITQUEUE(wait, current);
649
650 if (count < sizeof(struct dlm_lock_result))
651 return -EINVAL;
652
653 spin_lock(&fi->fi_ast_lock);
654 if (list_empty(&fi->fi_ast_list)) {
655
656		/* No waiting ASTs.
657		 * Return EOF if the lockspace has been deleted.
658		 */
659		if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags)) {
			spin_unlock(&fi->fi_ast_lock);
660			return 0;
		}
661
662 if (file->f_flags & O_NONBLOCK) {
663 spin_unlock(&fi->fi_ast_lock);
664 return -EAGAIN;
665 }
666
667 add_wait_queue(&fi->fi_wait, &wait);
668
669 repeat:
670 set_current_state(TASK_INTERRUPTIBLE);
671 if (list_empty(&fi->fi_ast_list) &&
672 !signal_pending(current)) {
673
674 spin_unlock(&fi->fi_ast_lock);
675 schedule();
676 spin_lock(&fi->fi_ast_lock);
677 goto repeat;
678 }
679
680 current->state = TASK_RUNNING;
681 remove_wait_queue(&fi->fi_wait, &wait);
682
683 if (signal_pending(current)) {
684 spin_unlock(&fi->fi_ast_lock);
685 return -ERESTARTSYS;
686 }
687 }
688
689 ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
690 list_del(&ast->list);
691 spin_unlock(&fi->fi_ast_lock);
692
693 /* Work out the size of the returned data */
694 data_size = sizeof(struct dlm_lock_result);
695 if (ast->lvb_updated && ast->result.lksb.sb_lvbptr)
696 data_size += DLM_USER_LVB_LEN;
697
698 offset = sizeof(struct dlm_lock_result);
699
700 /* Room for the extended data ? */
701 if (count >= data_size) {
702
703		if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) {
704			if (copy_to_user(buffer+offset,
705					ast->result.lksb.sb_lvbptr,
706					DLM_USER_LVB_LEN)) {
				kfree(ast);
707				return -EFAULT;
			}
708			ast->result.lvb_offset = offset;
709			offset += DLM_USER_LVB_LEN;
710		}
711 }
712
713 ast->result.length = data_size;
714 /* Copy the header now it has all the offsets in it */
715 if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
716 offset = -EFAULT;
717
718 /* If we only returned a header and there's more to come then put it
719 back on the list */
720 if (count < data_size) {
721 spin_lock(&fi->fi_ast_lock);
722 list_add(&ast->list, &fi->fi_ast_list);
723 spin_unlock(&fi->fi_ast_lock);
724 } else
725 kfree(ast);
726 return offset;
727}
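
A hedged sketch of the consumer side of this protocol: userspace blocks in read() (or polls fi_wait via dlm_poll below), receives one struct dlm_lock_result per call, and uses result.length and result.lvb_offset to find any appended LVB. handle_lvb() and fd are placeholders, not names from this tree.

	char buf[sizeof(struct dlm_lock_result) + DLM_USER_LVB_LEN];
	struct dlm_lock_result *res = (struct dlm_lock_result *)buf;
	ssize_t n;

	n = read(fd, buf, sizeof(buf));	/* blocks unless O_NONBLOCK */
	if (n >= (ssize_t)sizeof(*res)) {
		if (res->lvb_offset)	/* LVB appended at this offset */
			handle_lvb(buf + res->lvb_offset);
		/* res->user_astaddr/user_astparam identify the callback */
	}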
728
729static unsigned int dlm_poll(struct file *file, poll_table *wait)
730{
731 struct file_info *fi = file->private_data;
732
733 poll_wait(file, &fi->fi_wait, wait);
734
735 spin_lock(&fi->fi_ast_lock);
736 if (!list_empty(&fi->fi_ast_list)) {
737 spin_unlock(&fi->fi_ast_lock);
738 return POLLIN | POLLRDNORM;
739 }
740
741 spin_unlock(&fi->fi_ast_lock);
742 return 0;
743}
744
745static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd,
746 struct dlm_lock_params *kparams)
747{
748 struct lock_info *li;
749
750 if (!try_module_get(THIS_MODULE))
751 return NULL;
752
753 li = kzalloc(sizeof(struct lock_info), GFP_KERNEL);
754 if (li) {
755 li->li_magic = LOCKINFO_MAGIC;
756 li->li_file = fi;
757 li->li_cmd = cmd;
758 li->li_flags = 0;
759 li->li_grmode = -1;
760 li->li_rqmode = -1;
761 li->li_pend_bastparam = NULL;
762 li->li_pend_bastaddr = NULL;
763 li->li_castaddr = NULL;
764 li->li_castparam = NULL;
765 li->li_lksb.sb_lvbptr = NULL;
766 li->li_bastaddr = kparams->bastaddr;
767 li->li_bastparam = kparams->bastparam;
768
769 get_file_info(fi);
770 }
771 return li;
772}
773
774static int do_user_lock(struct file_info *fi, uint8_t cmd,
775 struct dlm_lock_params *kparams)
776{
777 struct lock_info *li;
778 int status;
779
780 /*
781 * Validate things that we need to have correct.
782 */
783 if (!kparams->castaddr)
784 return -EINVAL;
785
786 if (!kparams->lksb)
787 return -EINVAL;
788
789 /* Persistent child locks are not available yet */
790 if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
791 return -EINVAL;
792
793 /* For conversions, there should already be a lockinfo struct,
794 unless we are adopting an orphaned persistent lock */
795 if (kparams->flags & DLM_LKF_CONVERT) {
796
797 li = get_lockinfo(kparams->lkid);
798
799 /* If this is a persistent lock we will have to create a
800 lockinfo again */
801 if (!li && (kparams->flags & DLM_LKF_PERSISTENT)) {
802 li = allocate_lockinfo(fi, cmd, kparams);
803 if (!li)
804 return -ENOMEM;
805
806 li->li_lksb.sb_lkid = kparams->lkid;
807 li->li_castaddr = kparams->castaddr;
808 li->li_castparam = kparams->castparam;
809
810 /* OK, this isn't exactly a FIRSTLOCK but it is the
811 first time we've used this lockinfo, and if things
812 fail we want rid of it */
813 init_completion(&li->li_firstcomp);
814 set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
815 add_lockinfo(li);
816
817 /* TODO: do a query to get the current state ?? */
818 }
819 if (!li)
820 return -EINVAL;
821
822 if (li->li_magic != LOCKINFO_MAGIC)
823 return -EINVAL;
824
825 /* For conversions don't overwrite the current blocking AST
826 info so that:
827 a) if a blocking AST fires before the conversion is queued
828 it runs the current handler
829 b) if the conversion is cancelled, the original blocking AST
830 declaration is active
831 The pend_ info is made active when the conversion
832 completes.
833 */
834 li->li_pend_bastaddr = kparams->bastaddr;
835 li->li_pend_bastparam = kparams->bastparam;
836 } else {
837 li = allocate_lockinfo(fi, cmd, kparams);
838 if (!li)
839 return -ENOMEM;
840
841 /* Allow us to complete our work before
842 the AST routine runs. In fact we only need (and use) this
843 when the initial lock fails */
844 init_completion(&li->li_firstcomp);
845 set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
846 }
847
848 li->li_user_lksb = kparams->lksb;
849 li->li_castaddr = kparams->castaddr;
850 li->li_castparam = kparams->castparam;
851 li->li_lksb.sb_lkid = kparams->lkid;
852 li->li_rqmode = kparams->mode;
853 if (kparams->flags & DLM_LKF_PERSISTENT)
854 set_bit(LI_FLAG_PERSISTENT, &li->li_flags);
855
856 /* Copy in the value block */
857 if (kparams->flags & DLM_LKF_VALBLK) {
858 if (!li->li_lksb.sb_lvbptr) {
859 li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN,
860 GFP_KERNEL);
861 if (!li->li_lksb.sb_lvbptr) {
862 status = -ENOMEM;
863 goto out_err;
864 }
865 }
866
867 memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN);
868 }
869
870 /* Lock it ... */
871 status = dlm_lock(fi->fi_ls->ls_lockspace,
872 kparams->mode, &li->li_lksb,
873 kparams->flags,
874 kparams->name, kparams->namelen,
875 kparams->parent,
876 ast_routine,
877 li,
878 (li->li_pend_bastaddr || li->li_bastaddr) ?
879 bast_routine : NULL);
880 if (status)
881 goto out_err;
882
883 /* If it succeeded (this far) with a new lock then keep track of
884 it on the file's lockinfo list */
885 if (!status && test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
886
887 spin_lock(&fi->fi_li_lock);
888 list_add(&li->li_ownerqueue, &fi->fi_li_list);
889 set_bit(LI_FLAG_ONLIST, &li->li_flags);
890 spin_unlock(&fi->fi_li_lock);
891 if (add_lockinfo(li))
892 printk(KERN_WARNING "Add lockinfo failed\n");
893
894 complete(&li->li_firstcomp);
895 }
896
897 /* Return the lockid as the user needs it /now/ */
898 return li->li_lksb.sb_lkid;
899
900 out_err:
901 if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags))
902 release_lockinfo(li);
903 return status;
904
905}
906
907static int do_user_unlock(struct file_info *fi, uint8_t cmd,
908 struct dlm_lock_params *kparams)
909{
910 struct lock_info *li;
911 int status;
912 int convert_cancel = 0;
913
914 li = get_lockinfo(kparams->lkid);
915 if (!li) {
916 li = allocate_lockinfo(fi, cmd, kparams);
917 if (!li)
918 return -ENOMEM;
919 spin_lock(&fi->fi_li_lock);
920 list_add(&li->li_ownerqueue, &fi->fi_li_list);
921 set_bit(LI_FLAG_ONLIST, &li->li_flags);
922 spin_unlock(&fi->fi_li_lock);
923 }
924
925 if (li->li_magic != LOCKINFO_MAGIC)
926 return -EINVAL;
927
928 li->li_user_lksb = kparams->lksb;
929 li->li_castparam = kparams->castparam;
930 li->li_cmd = cmd;
931
932 /* Cancelling a conversion doesn't remove the lock...*/
933 if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1)
934 convert_cancel = 1;
935
936 /* Wait until dlm_lock() has completed */
937 if (!test_bit(LI_FLAG_ONLIST, &li->li_flags)) {
938 wait_for_completion(&li->li_firstcomp);
939 }
940
941 /* dlm_unlock() passes a 0 for castaddr which means don't overwrite
942 the existing li_castaddr as that's the completion routine for
943 unlocks. dlm_unlock_wait() specifies a new AST routine to be
944 executed when the unlock completes. */
945 if (kparams->castaddr)
946 li->li_castaddr = kparams->castaddr;
947
948 /* Use existing lksb & astparams */
949 status = dlm_unlock(fi->fi_ls->ls_lockspace,
950 kparams->lkid,
951 kparams->flags, &li->li_lksb, li);
952
953 if (!status && !convert_cancel) {
954 spin_lock(&fi->fi_li_lock);
955 list_del(&li->li_ownerqueue);
956 clear_bit(LI_FLAG_ONLIST, &li->li_flags);
957 spin_unlock(&fi->fi_li_lock);
958 }
959
960 return status;
961}
962
963/* Write call, submit a locking request */
964static ssize_t dlm_write(struct file *file, const char __user *buffer,
965 size_t count, loff_t *ppos)
966{
967 struct file_info *fi = file->private_data;
968 struct dlm_write_request *kparams;
969 sigset_t tmpsig;
970 sigset_t allsigs;
971 int status;
972
973 /* -1 because lock name is optional */
974 if (count < sizeof(struct dlm_write_request)-1)
975 return -EINVAL;
976
977 /* Has the lockspace been deleted */
978 if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
979 return -ENOENT;
980
981 kparams = kmalloc(count, GFP_KERNEL);
982 if (!kparams)
983 return -ENOMEM;
984
985 status = -EFAULT;
986 /* Get the command info */
987 if (copy_from_user(kparams, buffer, count))
988 goto out_free;
989
990 status = -EBADE;
991 if (check_version(kparams))
992 goto out_free;
993
994 /* Block signals while we are doing this */
995 sigfillset(&allsigs);
996 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
997
998 status = -EINVAL;
999 switch (kparams->cmd)
1000 {
1001 case DLM_USER_LOCK:
1002 if (!fi) goto out_sig;
1003 status = do_user_lock(fi, kparams->cmd, &kparams->i.lock);
1004 break;
1005
1006 case DLM_USER_UNLOCK:
1007 if (!fi) goto out_sig;
1008 status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock);
1009 break;
1010
1011 case DLM_USER_CREATE_LOCKSPACE:
1012 if (fi) goto out_sig;
1013 status = do_user_create_lockspace(fi, kparams->cmd,
1014 &kparams->i.lspace);
1015 break;
1016
1017 case DLM_USER_REMOVE_LOCKSPACE:
1018 if (fi) goto out_sig;
1019 status = do_user_remove_lockspace(fi, kparams->cmd,
1020 &kparams->i.lspace);
1021 break;
1022 default:
1023 printk("Unknown command passed to DLM device : %d\n",
1024 kparams->cmd);
1025 break;
1026 }
1027
1028 out_sig:
1029 /* Restore signals */
1030 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1031 recalc_sigpending();
1032
1033 out_free:
1034 kfree(kparams);
1035 if (status == 0)
1036 return count;
1037 else
1038 return status;
1039}
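
Putting the write path together, a minimal and purely illustrative lock request from userspace looks like the sketch below. Note the return convention visible above: for DLM_USER_LOCK a successful write() returns the new lock ID (do_user_lock's return value), not the byte count. my_lksb, my_ast and fd are assumed, and the name is appended after the fixed struct, as the "-1 because lock name is optional" check implies.

	char buf[sizeof(struct dlm_write_request) + 5];
	struct dlm_write_request *req = (struct dlm_write_request *)buf;

	memset(buf, 0, sizeof(buf));
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_LOCK;
	req->i.lock.mode = DLM_LOCK_EX;
	req->i.lock.lksb = &my_lksb;	/* userspace dlm_lksb */
	req->i.lock.castaddr = my_ast;	/* required by do_user_lock() */
	req->i.lock.namelen = 5;
	memcpy(req->i.lock.name, "myres", 5);

	int lkid = write(fd, buf, sizeof(buf));	/* new lock ID on success */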
1040
1041static struct file_operations _dlm_fops = {
1042 .open = dlm_open,
1043 .release = dlm_close,
1044 .read = dlm_read,
1045 .write = dlm_write,
1046 .poll = dlm_poll,
1047 .owner = THIS_MODULE,
1048};
1049
1050static struct file_operations _dlm_ctl_fops = {
1051 .open = dlm_ctl_open,
1052 .release = dlm_ctl_close,
1053 .write = dlm_write,
1054 .owner = THIS_MODULE,
1055};
1056
1057/*
1058 * Create control device
1059 */
1060static int __init dlm_device_init(void)
1061{
1062 int r;
1063
1064 INIT_LIST_HEAD(&user_ls_list);
1065 mutex_init(&user_ls_lock);
1066 rwlock_init(&lockinfo_lock);
1067
1068 ctl_device.name = "dlm-control";
1069 ctl_device.fops = &_dlm_ctl_fops;
1070 ctl_device.minor = MISC_DYNAMIC_MINOR;
1071
1072 r = misc_register(&ctl_device);
1073 if (r) {
1074 printk(KERN_ERR "dlm: misc_register failed for control dev\n");
1075 return r;
1076 }
1077
1078 return 0;
1079}
1080
1081static void __exit dlm_device_exit(void)
1082{
1083 misc_deregister(&ctl_device);
1084}
1085
1086MODULE_DESCRIPTION("Distributed Lock Manager device interface");
1087MODULE_AUTHOR("Red Hat, Inc.");
1088MODULE_LICENSE("GPL");
1089
1090module_init(dlm_device_init);
1091module_exit(dlm_device_exit);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
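
A worked example of the mapping above, assuming four nodes of equal weight (so ls_total_weight == ls_num_nodes == 4): hash 0x8c3a17f2 has upper 16 bits 0x8c3a = 35898, and 35898 % 4 = 2, so the directory node is ls_node_array[2]. With unequal weights, ls_node_array presumably carries one entry per unit of weight, which is why the fast path takes the modulus by ls_total_weight rather than ls_num_nodes.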
114
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245				/* namelen of 0xFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
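
For reference, a minimal decoder for the stream written above, mirroring the parse loop in dlm_recover_directory(): each record is a big-endian uint16_t length followed by that many name bytes; a zero length ends the block (the requester asks again starting from the last name) and 0xFFFF ends the names for this node. consume_name() is a hypothetical callback, not a function in this tree.

	static int parse_names(char *b)
	{
		uint16_t len;

		for (;;) {
			memcpy(&len, b, sizeof(uint16_t));
			len = be16_to_cpu(len);
			b += sizeof(uint16_t);

			if (len == 0xFFFF)
				return 1;	/* no more names from node */
			if (!len)
				return 0;	/* end of block, ask again */

			consume_name(b, len);	/* hypothetical */
			b += len;
		}
	}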
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..149106f2b80f
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,494 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/mutex.h>
39#include <asm/semaphore.h>
40#include <asm/uaccess.h>
41
42#include <linux/dlm.h>
43
44#define DLM_LOCKSPACE_LEN 64
45
46/* Size of the temp buffer midcomms allocates on the stack.
47 We try to make this large enough so most messages fit.
48 FIXME: should sctp make this unnecessary? */
49
50#define DLM_INBUF_LEN 148
51
52struct dlm_ls;
53struct dlm_lkb;
54struct dlm_rsb;
55struct dlm_member;
56struct dlm_lkbtable;
57struct dlm_rsbtable;
58struct dlm_dirtable;
59struct dlm_direntry;
60struct dlm_recover;
61struct dlm_header;
62struct dlm_message;
63struct dlm_rcom;
64struct dlm_mhandle;
65
66#define log_print(fmt, args...) \
67 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
68#define log_error(ls, fmt, args...) \
69 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
70
71#ifdef DLM_LOG_DEBUG
72#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
73#else
74#define log_debug(ls, fmt, args...)
75#endif
76
77#define DLM_ASSERT(x, do) \
78{ \
79 if (!(x)) \
80 { \
81 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
82 "DLM: assertion: \"%s\"\n" \
83 "DLM: time = %lu\n", \
84 __LINE__, __FILE__, #x, jiffies); \
85 {do} \
86 printk("\n"); \
87 BUG(); \
88 panic("DLM: Record message above and reboot.\n"); \
89 } \
90}
91
92
93struct dlm_direntry {
94 struct list_head list;
95 uint32_t master_nodeid;
96 uint16_t length;
97 char name[1];
98};
99
100struct dlm_dirtable {
101 struct list_head list;
102 rwlock_t lock;
103};
104
105struct dlm_rsbtable {
106 struct list_head list;
107 struct list_head toss;
108 rwlock_t lock;
109};
110
111struct dlm_lkbtable {
112 struct list_head list;
113 rwlock_t lock;
114 uint16_t counter;
115};
116
117/*
118 * Lockspace member (per node in a ls)
119 */
120
121struct dlm_member {
122 struct list_head list;
123 int nodeid;
124 int weight;
125};
126
127/*
128 * Save and manage recovery state for a lockspace.
129 */
130
131struct dlm_recover {
132 struct list_head list;
133 int *nodeids;
134 int node_count;
135 uint64_t seq;
136};
137
138/*
139 * Pass input args to second stage locking function.
140 */
141
142struct dlm_args {
143 uint32_t flags;
144 void *astaddr;
145 long astparam;
146 void *bastaddr;
147 int mode;
148 struct dlm_lksb *lksb;
149};
150
151
152/*
153 * Lock block
154 *
155 * A lock can be one of three types:
156 *
157 * local copy lock is mastered locally
158 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
159 * process copy lock is mastered on a remote node
160 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
161 * master copy master node's copy of a lock owned by remote node
162 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
163 *
164 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
165 * dlm_unlock. The dlm does not modify these or use any private flags in
166 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
167 * are sent as-is to the remote master when the lock is remote.
168 *
169 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
170 * Some internal flags are shared between the master and process nodes;
171 * these shared flags are kept in the lower two bytes. One of these
172 * flags set on the master copy will be propagated to the process copy
173 * and vice versa. Other internal flags are private to the master or process
174 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
175 *
176 * lkb_sbflags: status block flags. These flags are copied directly into
177 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
178 * ast. All defined in dlm.h with DLM_SBF_ prefix.
179 *
180 * lkb_status: the lock status indicates which rsb queue the lock is
181 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
182 *
183 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
184 * reply is needed. Only set when the lkb is on the lockspace waiters
185 * list awaiting a reply from a remote node.
186 *
187 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
188 * is a master copy, nodeid specifies the remote lock holder, when the
189 * lkb is a process copy, the nodeid specifies the lock master.
190 */
191
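
A sketch of the byte split described above: only the low 16 bits of lkb_flags are meaningful on both nodes, so when lock state crosses the wire the private high bits are masked off. The bare mask below is illustrative; it is not a named constant in this tree.

	/* shared flags travel; DLM_IFL_MSTCPY (0x00010000) etc. stay local */
	uint32_t wire_flags = lkb->lkb_flags & 0x0000FFFF;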
192/* lkb_ast_type */
193
194#define AST_COMP 1
195#define AST_BAST 2
196
197/* lkb_status */
198
199#define DLM_LKSTS_WAITING 1
200#define DLM_LKSTS_GRANTED 2
201#define DLM_LKSTS_CONVERT 3
202
203/* lkb_flags */
204
205#define DLM_IFL_MSTCPY 0x00010000
206#define DLM_IFL_RESEND 0x00020000
207
208struct dlm_lkb {
209 struct dlm_rsb *lkb_resource; /* the rsb */
210 struct kref lkb_ref;
211 int lkb_nodeid; /* copied from rsb */
212 int lkb_ownpid; /* pid of lock owner */
213 uint32_t lkb_id; /* our lock ID */
214 uint32_t lkb_remid; /* lock ID on remote partner */
215 uint32_t lkb_exflags; /* external flags from caller */
216 uint32_t lkb_sbflags; /* lksb flags */
217 uint32_t lkb_flags; /* internal flags */
218 uint32_t lkb_lvbseq; /* lvb sequence number */
219
220 int8_t lkb_status; /* granted, waiting, convert */
221 int8_t lkb_rqmode; /* requested lock mode */
222 int8_t lkb_grmode; /* granted lock mode */
223 int8_t lkb_bastmode; /* requested mode */
224 int8_t lkb_highbast; /* highest mode bast sent for */
225
226 int8_t lkb_wait_type; /* type of reply waiting for */
227 int8_t lkb_ast_type; /* type of ast queued for */
228
229 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
230 struct list_head lkb_statequeue; /* rsb g/c/w list */
231 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
232 struct list_head lkb_wait_reply; /* waiting for remote reply */
233 struct list_head lkb_astqueue; /* need ast to be sent */
234
235 char *lkb_lvbptr;
236 struct dlm_lksb *lkb_lksb; /* caller's status block */
237 void *lkb_astaddr; /* caller's ast function */
238 void *lkb_bastaddr; /* caller's bast function */
239 long lkb_astparam; /* caller's ast arg */
240};
241
242
243struct dlm_rsb {
244 struct dlm_ls *res_ls; /* the lockspace */
245 struct kref res_ref;
246 struct mutex res_mutex;
247 unsigned long res_flags;
248 int res_length; /* length of rsb name */
249 int res_nodeid;
250 uint32_t res_lvbseq;
251 uint32_t res_hash;
252 uint32_t res_bucket; /* rsbtbl */
253 unsigned long res_toss_time;
254 uint32_t res_first_lkid;
255 struct list_head res_lookup; /* lkbs waiting on first */
256 struct list_head res_hashchain; /* rsbtbl */
257 struct list_head res_grantqueue;
258 struct list_head res_convertqueue;
259 struct list_head res_waitqueue;
260
261 struct list_head res_root_list; /* used for recovery */
262 struct list_head res_recover_list; /* used for recovery */
263 int res_recover_locks_count;
264
265 char *res_lvbptr;
266 char res_name[1];
267};
268
269/* find_rsb() flags */
270
271#define R_MASTER 1 /* only return rsb if it's a master */
272#define R_CREATE 2 /* create/add rsb if not found */
273
274/* rsb_flags */
275
276enum rsb_flags {
277 RSB_MASTER_UNCERTAIN,
278 RSB_VALNOTVALID,
279 RSB_VALNOTVALID_PREV,
280 RSB_NEW_MASTER,
281 RSB_NEW_MASTER2,
282 RSB_RECOVER_CONVERT,
283 RSB_LOCKS_PURGED,
284};
285
286static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
287{
288 __set_bit(flag, &r->res_flags);
289}
290
291static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
292{
293 __clear_bit(flag, &r->res_flags);
294}
295
296static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
297{
298 return test_bit(flag, &r->res_flags);
299}
300
301
302/* dlm_header is first element of all structs sent between nodes */
303
304#define DLM_HEADER_MAJOR 0x00020000
305#define DLM_HEADER_MINOR 0x00000001
306
307#define DLM_MSG 1
308#define DLM_RCOM 2
309
310struct dlm_header {
311 uint32_t h_version;
312 uint32_t h_lockspace;
313 uint32_t h_nodeid; /* nodeid of sender */
314 uint16_t h_length;
315 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
316 uint8_t h_pad;
317};
318
319
320#define DLM_MSG_REQUEST 1
321#define DLM_MSG_CONVERT 2
322#define DLM_MSG_UNLOCK 3
323#define DLM_MSG_CANCEL 4
324#define DLM_MSG_REQUEST_REPLY 5
325#define DLM_MSG_CONVERT_REPLY 6
326#define DLM_MSG_UNLOCK_REPLY 7
327#define DLM_MSG_CANCEL_REPLY 8
328#define DLM_MSG_GRANT 9
329#define DLM_MSG_BAST 10
330#define DLM_MSG_LOOKUP 11
331#define DLM_MSG_REMOVE 12
332#define DLM_MSG_LOOKUP_REPLY 13
333
334struct dlm_message {
335 struct dlm_header m_header;
336 uint32_t m_type; /* DLM_MSG_ */
337 uint32_t m_nodeid;
338 uint32_t m_pid;
339 uint32_t m_lkid; /* lkid on sender */
340 uint32_t m_remid; /* lkid on receiver */
341 uint32_t m_parent_lkid;
342 uint32_t m_parent_remid;
343 uint32_t m_exflags;
344 uint32_t m_sbflags;
345 uint32_t m_flags;
346 uint32_t m_lvbseq;
347 uint32_t m_hash;
348 int m_status;
349 int m_grmode;
350 int m_rqmode;
351 int m_bastmode;
352 int m_asts;
353 int m_result; /* 0 or -EXXX */
354 char m_extra[0]; /* name or lvb */
355};
356
357
358#define DLM_RS_NODES 0x00000001
359#define DLM_RS_NODES_ALL 0x00000002
360#define DLM_RS_DIR 0x00000004
361#define DLM_RS_DIR_ALL 0x00000008
362#define DLM_RS_LOCKS 0x00000010
363#define DLM_RS_LOCKS_ALL 0x00000020
364#define DLM_RS_DONE 0x00000040
365#define DLM_RS_DONE_ALL 0x00000080
366
367#define DLM_RCOM_STATUS 1
368#define DLM_RCOM_NAMES 2
369#define DLM_RCOM_LOOKUP 3
370#define DLM_RCOM_LOCK 4
371#define DLM_RCOM_STATUS_REPLY 5
372#define DLM_RCOM_NAMES_REPLY 6
373#define DLM_RCOM_LOOKUP_REPLY 7
374#define DLM_RCOM_LOCK_REPLY 8
375
376struct dlm_rcom {
377 struct dlm_header rc_header;
378 uint32_t rc_type; /* DLM_RCOM_ */
379 int rc_result; /* multi-purpose */
380 uint64_t rc_id; /* match reply with request */
381 char rc_buf[0];
382};
383
384struct rcom_config {
385 uint32_t rf_lvblen;
386 uint32_t rf_lsflags;
387 uint64_t rf_unused;
388};
389
390struct rcom_lock {
391 uint32_t rl_ownpid;
392 uint32_t rl_lkid;
393 uint32_t rl_remid;
394 uint32_t rl_parent_lkid;
395 uint32_t rl_parent_remid;
396 uint32_t rl_exflags;
397 uint32_t rl_flags;
398 uint32_t rl_lvbseq;
399 int rl_result;
400 int8_t rl_rqmode;
401 int8_t rl_grmode;
402 int8_t rl_status;
403 int8_t rl_asts;
404 uint16_t rl_wait_type;
405 uint16_t rl_namelen;
406 char rl_name[DLM_RESNAME_MAXLEN];
407 char rl_lvb[0];
408};
409
410struct dlm_ls {
411 struct list_head ls_list; /* list of lockspaces */
412 uint32_t ls_global_id; /* global unique lockspace ID */
413 uint32_t ls_exflags;
414 int ls_lvblen;
415 int ls_count; /* reference count */
416 unsigned long ls_flags; /* LSFL_ */
417 struct kobject ls_kobj;
418
419 struct dlm_rsbtable *ls_rsbtbl;
420 uint32_t ls_rsbtbl_size;
421
422 struct dlm_lkbtable *ls_lkbtbl;
423 uint32_t ls_lkbtbl_size;
424
425 struct dlm_dirtable *ls_dirtbl;
426 uint32_t ls_dirtbl_size;
427
428 struct mutex ls_waiters_mutex;
429 struct list_head ls_waiters; /* lkbs needing a reply */
430
431 struct list_head ls_nodes; /* current nodes in ls */
432 struct list_head ls_nodes_gone; /* dead node list, recovery */
433 int ls_num_nodes; /* number of nodes in ls */
434 int ls_low_nodeid;
435 int ls_total_weight;
436 int *ls_node_array;
437
438 struct dlm_rsb ls_stub_rsb; /* for returning errors */
439 struct dlm_lkb ls_stub_lkb; /* for returning errors */
440 struct dlm_message ls_stub_ms; /* for faking a reply */
441
442 struct dentry *ls_debug_dentry; /* debugfs */
443
444 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
445 int ls_uevent_result;
446
447 /* recovery related */
448
449 struct timer_list ls_timer;
450 struct task_struct *ls_recoverd_task;
451 struct mutex ls_recoverd_active;
452 spinlock_t ls_recover_lock;
453 uint32_t ls_recover_status; /* DLM_RS_ */
454 uint64_t ls_recover_seq;
455 struct dlm_recover *ls_recover_args;
456 struct rw_semaphore ls_in_recovery; /* block local requests */
457 struct list_head ls_requestqueue;/* queue remote requests */
458 struct mutex ls_requestqueue_mutex;
459 char *ls_recover_buf;
460 struct list_head ls_recover_list;
461 spinlock_t ls_recover_list_lock;
462 int ls_recover_list_count;
463 wait_queue_head_t ls_wait_general;
464
465 struct list_head ls_root_list; /* root resources */
466 struct rw_semaphore ls_root_sem; /* protect root_list */
467
468 int ls_namelen;
469 char ls_name[1];
470};
471
472#define LSFL_WORK 0
473#define LSFL_RUNNING 1
474#define LSFL_RECOVERY_STOP 2
475#define LSFL_RCOM_READY 3
476#define LSFL_UEVENT_WAIT 4
477
478static inline int dlm_locking_stopped(struct dlm_ls *ls)
479{
480 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
481}
482
483static inline int dlm_recovery_stopped(struct dlm_ls *ls)
484{
485 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
486}
487
488static inline int dlm_no_directory(struct dlm_ls *ls)
489{
490 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
491}
492
493#endif /* __DLM_INTERNAL_DOT_H__ */
494
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..5f6963904107
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3547 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
58
59#include "dlm_internal.h"
60#include "memory.h"
61#include "lowcomms.h"
62#include "requestqueue.h"
63#include "util.h"
64#include "dir.h"
65#include "member.h"
66#include "lockspace.h"
67#include "ast.h"
68#include "lock.h"
69#include "rcom.h"
70#include "recover.h"
71#include "lvb_table.h"
72#include "config.h"
73
74static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
75static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
76static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
80static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_remove(struct dlm_rsb *r);
82static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
84 struct dlm_message *ms);
85static int receive_extralen(struct dlm_message *ms);
86
87/*
88 * Lock compatibility matrix - thanks Steve
89 * UN = Unlocked state. Not really a state, used as a flag
90 * PD = Padding. Used to make the matrix a nice power of two in size
91 * Other states are the same as the VMS DLM.
92 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
93 */
94
95static const int __dlm_compat_matrix[8][8] = {
96 /* UN NL CR CW PR PW EX PD */
97 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
98 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
99 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
100 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
101 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
102 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
103 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
104 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
105};
106
107/*
108 * This defines the direction of transfer of LVB data.
109 * Granted mode is the row; requested mode is the column.
110 * Usage: matrix[grmode+1][rqmode+1]
111 * 1 = LVB is returned to the caller
112 * 0 = LVB is written to the resource
113 * -1 = nothing happens to the LVB
114 */
115
116const int dlm_lvb_operations[8][8] = {
117 /* UN NL CR CW PR PW EX PD*/
118 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
119 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
120 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
121 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
122 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
123 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
124 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
125 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
126};
127EXPORT_SYMBOL_GPL(dlm_lvb_operations);
128
129#define modes_compat(gr, rq) \
130 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
131
132int dlm_modes_compat(int mode1, int mode2)
133{
134 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
135}
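
Two worked lookups against the tables above (mode values from dlm.h: NL=0, CR=1, CW=2, PR=3, PW=4, EX=5, so the index is mode+1):

	dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_CR);	/* 1: PR and CR coexist */
	dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PW);	/* 0: PW conflicts with PR */

	/* 0: a down-convert from EX writes the LVB back to the resource */
	dlm_lvb_operations[DLM_LOCK_EX + 1][DLM_LOCK_NL + 1];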
136
137/*
138 * Compatibility matrix for conversions with QUECVT set.
139 * Granted mode is the row; requested mode is the column.
140 * Usage: matrix[grmode+1][rqmode+1]
141 */
142
143static const int __quecvt_compat_matrix[8][8] = {
144 /* UN NL CR CW PR PW EX PD */
145 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
146 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
147 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
148 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
149 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
150 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
151 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
152 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
153};
154
155static void dlm_print_lkb(struct dlm_lkb *lkb)
156{
157 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
158 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
159 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
160 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
161 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
162}
163
164void dlm_print_rsb(struct dlm_rsb *r)
165{
166 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
167 r->res_nodeid, r->res_flags, r->res_first_lkid,
168 r->res_recover_locks_count, r->res_name);
169}
170
171/* Threads cannot use the lockspace while it's being recovered */
172
173static inline void lock_recovery(struct dlm_ls *ls)
174{
175 down_read(&ls->ls_in_recovery);
176}
177
178static inline void unlock_recovery(struct dlm_ls *ls)
179{
180 up_read(&ls->ls_in_recovery);
181}
182
183static inline int lock_recovery_try(struct dlm_ls *ls)
184{
185 return down_read_trylock(&ls->ls_in_recovery);
186}
187
188static inline int can_be_queued(struct dlm_lkb *lkb)
189{
190 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
191}
192
193static inline int force_blocking_asts(struct dlm_lkb *lkb)
194{
195 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
196}
197
198static inline int is_demoted(struct dlm_lkb *lkb)
199{
200 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
201}
202
203static inline int is_remote(struct dlm_rsb *r)
204{
205 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
206 return !!r->res_nodeid;
207}
208
209static inline int is_process_copy(struct dlm_lkb *lkb)
210{
211 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
212}
213
214static inline int is_master_copy(struct dlm_lkb *lkb)
215{
216 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
217 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
218 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
219}
220
221static inline int middle_conversion(struct dlm_lkb *lkb)
222{
223 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
224 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
225 return 1;
226 return 0;
227}
228
229static inline int down_conversion(struct dlm_lkb *lkb)
230{
231 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
232}
233
234static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
235{
236 if (is_master_copy(lkb))
237 return;
238
239 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
240
241 lkb->lkb_lksb->sb_status = rv;
242 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
243
244 dlm_add_ast(lkb, AST_COMP);
245}
246
247static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
248{
249 if (is_master_copy(lkb))
250 send_bast(r, lkb, rqmode);
251 else {
252 lkb->lkb_bastmode = rqmode;
253 dlm_add_ast(lkb, AST_BAST);
254 }
255}
256
257/*
258 * Basic operations on rsb's and lkb's
259 */
260
261static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
262{
263 struct dlm_rsb *r;
264
265 r = allocate_rsb(ls, len);
266 if (!r)
267 return NULL;
268
269 r->res_ls = ls;
270 r->res_length = len;
271 memcpy(r->res_name, name, len);
272 mutex_init(&r->res_mutex);
273
274 INIT_LIST_HEAD(&r->res_lookup);
275 INIT_LIST_HEAD(&r->res_grantqueue);
276 INIT_LIST_HEAD(&r->res_convertqueue);
277 INIT_LIST_HEAD(&r->res_waitqueue);
278 INIT_LIST_HEAD(&r->res_root_list);
279 INIT_LIST_HEAD(&r->res_recover_list);
280
281 return r;
282}
283
284static int search_rsb_list(struct list_head *head, char *name, int len,
285 unsigned int flags, struct dlm_rsb **r_ret)
286{
287 struct dlm_rsb *r;
288 int error = 0;
289
290 list_for_each_entry(r, head, res_hashchain) {
291 if (len == r->res_length && !memcmp(name, r->res_name, len))
292 goto found;
293 }
294 return -ENOENT;
295
296 found:
297 if (r->res_nodeid && (flags & R_MASTER))
298 error = -ENOTBLK;
299 *r_ret = r;
300 return error;
301}
302
303static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
304 unsigned int flags, struct dlm_rsb **r_ret)
305{
306 struct dlm_rsb *r;
307 int error;
308
309 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
310 if (!error) {
311 kref_get(&r->res_ref);
312 goto out;
313 }
314 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
315 if (error)
316 goto out;
317
318 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
319
320 if (dlm_no_directory(ls))
321 goto out;
322
323 if (r->res_nodeid == -1) {
324 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
325 r->res_first_lkid = 0;
326 } else if (r->res_nodeid > 0) {
327 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
328 r->res_first_lkid = 0;
329 } else {
330 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
331 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
332 }
333 out:
334 *r_ret = r;
335 return error;
336}
337
338static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
339 unsigned int flags, struct dlm_rsb **r_ret)
340{
341 int error;
342 write_lock(&ls->ls_rsbtbl[b].lock);
343 error = _search_rsb(ls, name, len, b, flags, r_ret);
344 write_unlock(&ls->ls_rsbtbl[b].lock);
345 return error;
346}
347
348/*
349 * Find rsb in rsbtbl and potentially create/add one
350 *
351 * Delaying the release of rsb's has a similar benefit to applications keeping
352 * NL locks on an rsb, but without the guarantee that the cached master value
353 * will still be valid when the rsb is reused. Apps aren't always smart enough
354 * to keep NL locks on an rsb that they may lock again shortly; this can lead
355 * to excessive master lookups and removals if we don't delay the release.
356 *
357 * Searching for an rsb means looking through both the normal list and toss
358 * list. When found on the toss list the rsb is moved to the normal list with
359 * ref count of 1; when found on normal list the ref count is incremented.
360 */
361
362static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
363 unsigned int flags, struct dlm_rsb **r_ret)
364{
365 struct dlm_rsb *r, *tmp;
366 uint32_t hash, bucket;
367 int error = 0;
368
369 if (dlm_no_directory(ls))
370 flags |= R_CREATE;
371
372 hash = jhash(name, namelen, 0);
373 bucket = hash & (ls->ls_rsbtbl_size - 1);
374
375 error = search_rsb(ls, name, namelen, bucket, flags, &r);
376 if (!error)
377 goto out;
378
379 if (error == -ENOENT && !(flags & R_CREATE))
380 goto out;
381
382 /* the rsb was found but wasn't a master copy */
383 if (error == -ENOTBLK)
384 goto out;
385
386 error = -ENOMEM;
387 r = create_rsb(ls, name, namelen);
388 if (!r)
389 goto out;
390
391 r->res_hash = hash;
392 r->res_bucket = bucket;
393 r->res_nodeid = -1;
394 kref_init(&r->res_ref);
395
396 /* With no directory, the master can be set immediately */
397 if (dlm_no_directory(ls)) {
398 int nodeid = dlm_dir_nodeid(r);
399 if (nodeid == dlm_our_nodeid())
400 nodeid = 0;
401 r->res_nodeid = nodeid;
402 }
403
404 write_lock(&ls->ls_rsbtbl[bucket].lock);
405 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
406 if (!error) {
407 write_unlock(&ls->ls_rsbtbl[bucket].lock);
408 free_rsb(r);
409 r = tmp;
410 goto out;
411 }
412 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
413 write_unlock(&ls->ls_rsbtbl[bucket].lock);
414 error = 0;
415 out:
416 *r_ret = r;
417 return error;
418}
419
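/*
 * Illustrative note (not part of the original file): receive_request()
 * below calls find_rsb() with R_MASTER but without R_CREATE, so a
 * request arriving for a resource this node doesn't master fails with
 * -ENOTBLK (or -ENOENT if no rsb exists at all) rather than creating a
 * non-master rsb as a side effect.
 */
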
420int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
421 unsigned int flags, struct dlm_rsb **r_ret)
422{
423 return find_rsb(ls, name, namelen, flags, r_ret);
424}
425
426/* This is only called to add a reference when the code already holds
427 a valid reference to the rsb, so there's no need for locking. */
428
429static inline void hold_rsb(struct dlm_rsb *r)
430{
431 kref_get(&r->res_ref);
432}
433
434void dlm_hold_rsb(struct dlm_rsb *r)
435{
436 hold_rsb(r);
437}
438
439static void toss_rsb(struct kref *kref)
440{
441 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
442 struct dlm_ls *ls = r->res_ls;
443
444 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
445 kref_init(&r->res_ref);
446 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
447 r->res_toss_time = jiffies;
448 if (r->res_lvbptr) {
449 free_lvb(r->res_lvbptr);
450 r->res_lvbptr = NULL;
451 }
452}
453
454/* When all references to the rsb are gone it's transferred to
455 the tossed list for later disposal. */
456
457static void put_rsb(struct dlm_rsb *r)
458{
459 struct dlm_ls *ls = r->res_ls;
460 uint32_t bucket = r->res_bucket;
461
462 write_lock(&ls->ls_rsbtbl[bucket].lock);
463 kref_put(&r->res_ref, toss_rsb);
464 write_unlock(&ls->ls_rsbtbl[bucket].lock);
465}
466
467void dlm_put_rsb(struct dlm_rsb *r)
468{
469 put_rsb(r);
470}
471
472/* See comment for unhold_lkb */
473
474static void unhold_rsb(struct dlm_rsb *r)
475{
476 int rv;
477 rv = kref_put(&r->res_ref, toss_rsb);
478 DLM_ASSERT(!rv, dlm_print_rsb(r););
479}
480
481static void kill_rsb(struct kref *kref)
482{
483 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
484
485 /* All work is done after the return from kref_put() so we
486 can release the write_lock before the remove and free. */
487
488 DLM_ASSERT(list_empty(&r->res_lookup),);
489 DLM_ASSERT(list_empty(&r->res_grantqueue),);
490 DLM_ASSERT(list_empty(&r->res_convertqueue),);
491 DLM_ASSERT(list_empty(&r->res_waitqueue),);
492 DLM_ASSERT(list_empty(&r->res_root_list),);
493 DLM_ASSERT(list_empty(&r->res_recover_list),);
494}
495
496/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
497 The rsb must exist as long as any lkb's for it do. */
498
499static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
500{
501 hold_rsb(r);
502 lkb->lkb_resource = r;
503}
504
505static void detach_lkb(struct dlm_lkb *lkb)
506{
507 if (lkb->lkb_resource) {
508 put_rsb(lkb->lkb_resource);
509 lkb->lkb_resource = NULL;
510 }
511}
512
513static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
514{
515 struct dlm_lkb *lkb, *tmp;
516 uint32_t lkid = 0;
517 uint16_t bucket;
518
519 lkb = allocate_lkb(ls);
520 if (!lkb)
521 return -ENOMEM;
522
523 lkb->lkb_nodeid = -1;
524 lkb->lkb_grmode = DLM_LOCK_IV;
525 kref_init(&lkb->lkb_ref);
526
527 get_random_bytes(&bucket, sizeof(bucket));
528 bucket &= (ls->ls_lkbtbl_size - 1);
529
530 write_lock(&ls->ls_lkbtbl[bucket].lock);
531
532 /* counter can roll over so we must verify lkid is not in use */
533
534 while (lkid == 0) {
535 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
536
537 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
538 lkb_idtbl_list) {
539 if (tmp->lkb_id != lkid)
540 continue;
541 lkid = 0;
542 break;
543 }
544 }
545
546 lkb->lkb_id = lkid;
547 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
548 write_unlock(&ls->ls_lkbtbl[bucket].lock);
549
550 *lkb_ret = lkb;
551 return 0;
552}
553
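/*
 * Illustrative note: the lock id packs the table bucket into its low
 * 16 bits and the per-bucket counter into its high 16 bits, so e.g.
 * bucket 0x0003 with counter 0x0002 yields lkid 0x00020003.  find_lkb()
 * below recovers the bucket with (lkid & 0xFFFF).
 */
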
554static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
555{
556 uint16_t bucket = lkid & 0xFFFF;
557 struct dlm_lkb *lkb;
558
559 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
560 if (lkb->lkb_id == lkid)
561 return lkb;
562 }
563 return NULL;
564}
565
566static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
567{
568 struct dlm_lkb *lkb;
569 uint16_t bucket = lkid & 0xFFFF;
570
571 if (bucket >= ls->ls_lkbtbl_size)
572 return -EBADSLT;
573
574 read_lock(&ls->ls_lkbtbl[bucket].lock);
575 lkb = __find_lkb(ls, lkid);
576 if (lkb)
577 kref_get(&lkb->lkb_ref);
578 read_unlock(&ls->ls_lkbtbl[bucket].lock);
579
580 *lkb_ret = lkb;
581 return lkb ? 0 : -ENOENT;
582}
583
584static void kill_lkb(struct kref *kref)
585{
586 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
587
588 /* All work is done after the return from kref_put() so we
589 can release the write_lock before the detach_lkb */
590
591 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
592}
593
594/* __put_lkb() is used when an lkb may not have an rsb attached to
595 it so we need to provide the lockspace explicitly */
596
597static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
598{
599 uint16_t bucket = lkb->lkb_id & 0xFFFF;
600
601 write_lock(&ls->ls_lkbtbl[bucket].lock);
602 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
603 list_del(&lkb->lkb_idtbl_list);
604 write_unlock(&ls->ls_lkbtbl[bucket].lock);
605
606 detach_lkb(lkb);
607
608		/* for local/process lkbs, lvbptr points to caller's lksb;
609		   only a master copy's lvb is freed here */
609 if (lkb->lkb_lvbptr && is_master_copy(lkb))
610 free_lvb(lkb->lkb_lvbptr);
611 free_lkb(lkb);
612 return 1;
613 } else {
614 write_unlock(&ls->ls_lkbtbl[bucket].lock);
615 return 0;
616 }
617}
618
619int dlm_put_lkb(struct dlm_lkb *lkb)
620{
621 struct dlm_ls *ls;
622
623 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
624 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
625
626 ls = lkb->lkb_resource->res_ls;
627 return __put_lkb(ls, lkb);
628}
629
630/* This is only called to add a reference when the code already holds
631 a valid reference to the lkb, so there's no need for locking. */
632
633static inline void hold_lkb(struct dlm_lkb *lkb)
634{
635 kref_get(&lkb->lkb_ref);
636}
637
638/* This is called when we need to remove a reference and are certain
639 it's not the last ref. e.g. del_lkb is always called between a
640 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
641 put_lkb would work fine, but would involve unnecessary locking */
642
643static inline void unhold_lkb(struct dlm_lkb *lkb)
644{
645 int rv;
646 rv = kref_put(&lkb->lkb_ref, kill_lkb);
647 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
648}
649
650static void lkb_add_ordered(struct list_head *new, struct list_head *head,
651 int mode)
652{
653 struct dlm_lkb *lkb = NULL;
654
655 list_for_each_entry(lkb, head, lkb_statequeue)
656 if (lkb->lkb_rqmode < mode)
657 break;
658
659 if (!lkb)
660 list_add_tail(new, head);
661 else
662 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
663}
664
665/* add/remove lkb to rsb's grant/convert/wait queue */
666
667static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
668{
669 kref_get(&lkb->lkb_ref);
670
671 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
672
673 lkb->lkb_status = status;
674
675 switch (status) {
676 case DLM_LKSTS_WAITING:
677 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
678 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
679 else
680 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
681 break;
682 case DLM_LKSTS_GRANTED:
683 /* convention says granted locks kept in order of grmode */
684 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
685 lkb->lkb_grmode);
686 break;
687 case DLM_LKSTS_CONVERT:
688 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
689 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
690 else
691 list_add_tail(&lkb->lkb_statequeue,
692 &r->res_convertqueue);
693 break;
694 default:
695 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
696 }
697}
698
699static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
700{
701 lkb->lkb_status = 0;
702 list_del(&lkb->lkb_statequeue);
703 unhold_lkb(lkb);
704}
705
706static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
707{
708 hold_lkb(lkb);
709 del_lkb(r, lkb);
710 add_lkb(r, lkb, sts);
711 unhold_lkb(lkb);
712}
713
714/* add/remove lkb from global waiters list of lkb's waiting for
715 a reply from a remote node */
716
717static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
718{
719 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
720
721 mutex_lock(&ls->ls_waiters_mutex);
722 if (lkb->lkb_wait_type) {
723 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
724 goto out;
725 }
726 lkb->lkb_wait_type = mstype;
727 kref_get(&lkb->lkb_ref);
728 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
729 out:
730 mutex_unlock(&ls->ls_waiters_mutex);
731}
732
733static int _remove_from_waiters(struct dlm_lkb *lkb)
734{
735 int error = 0;
736
737 if (!lkb->lkb_wait_type) {
738 log_print("remove_from_waiters error");
739 error = -EINVAL;
740 goto out;
741 }
742 lkb->lkb_wait_type = 0;
743 list_del(&lkb->lkb_wait_reply);
744 unhold_lkb(lkb);
745 out:
746 return error;
747}
748
749static int remove_from_waiters(struct dlm_lkb *lkb)
750{
751 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
752 int error;
753
754 mutex_lock(&ls->ls_waiters_mutex);
755 error = _remove_from_waiters(lkb);
756 mutex_unlock(&ls->ls_waiters_mutex);
757 return error;
758}
759
760static void dir_remove(struct dlm_rsb *r)
761{
762 int to_nodeid;
763
764 if (dlm_no_directory(r->res_ls))
765 return;
766
767 to_nodeid = dlm_dir_nodeid(r);
768 if (to_nodeid != dlm_our_nodeid())
769 send_remove(r);
770 else
771 dlm_dir_remove_entry(r->res_ls, to_nodeid,
772 r->res_name, r->res_length);
773}
774
775/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
776 found since they are in order of newest to oldest? */
777
778static int shrink_bucket(struct dlm_ls *ls, int b)
779{
780 struct dlm_rsb *r;
781 int count = 0, found;
782
783 for (;;) {
784 found = 0;
785 write_lock(&ls->ls_rsbtbl[b].lock);
786 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
787 res_hashchain) {
788 if (!time_after_eq(jiffies, r->res_toss_time +
789 dlm_config.toss_secs * HZ))
790 continue;
791 found = 1;
792 break;
793 }
794
795 if (!found) {
796 write_unlock(&ls->ls_rsbtbl[b].lock);
797 break;
798 }
799
800 if (kref_put(&r->res_ref, kill_rsb)) {
801 list_del(&r->res_hashchain);
802 write_unlock(&ls->ls_rsbtbl[b].lock);
803
804 if (is_master(r))
805 dir_remove(r);
806 free_rsb(r);
807 count++;
808 } else {
809 write_unlock(&ls->ls_rsbtbl[b].lock);
810 log_error(ls, "tossed rsb in use %s", r->res_name);
811 }
812 }
813
814 return count;
815}
816
817void dlm_scan_rsbs(struct dlm_ls *ls)
818{
819 int i;
820
821 if (dlm_locking_stopped(ls))
822 return;
823
824 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
825 shrink_bucket(ls, i);
826 cond_resched();
827 }
828}
829
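/*
 * Illustrative timing note: an rsb moved to the toss list by toss_rsb()
 * is only freed once it has sat unused for dlm_config.toss_secs seconds;
 * until then a new request for the same resource can revive it cheaply
 * via search_rsb() above, reusing its cached master value.
 */
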
830/* lkb is master or local copy */
831
832static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
833{
834 int b, len = r->res_ls->ls_lvblen;
835
836 /* b=1 lvb returned to caller
837 b=0 lvb written to rsb or invalidated
838 b=-1 do nothing */
839
840 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
841
842 if (b == 1) {
843 if (!lkb->lkb_lvbptr)
844 return;
845
846 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
847 return;
848
849 if (!r->res_lvbptr)
850 return;
851
852 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
853 lkb->lkb_lvbseq = r->res_lvbseq;
854
855 } else if (b == 0) {
856 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
857 rsb_set_flag(r, RSB_VALNOTVALID);
858 return;
859 }
860
861 if (!lkb->lkb_lvbptr)
862 return;
863
864 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
865 return;
866
867 if (!r->res_lvbptr)
868 r->res_lvbptr = allocate_lvb(r->res_ls);
869
870 if (!r->res_lvbptr)
871 return;
872
873 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
874 r->res_lvbseq++;
875 lkb->lkb_lvbseq = r->res_lvbseq;
876 rsb_clear_flag(r, RSB_VALNOTVALID);
877 }
878
879 if (rsb_flag(r, RSB_VALNOTVALID))
880 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
881}
882
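/*
 * Note on the table lookup above: dlm_lvb_operations (lvb_table.h) is
 * indexed by grmode+1 and rqmode+1 so that DLM_LOCK_IV (-1) maps to
 * row/column zero; the resulting b value then selects between copying
 * the rsb's lvb out to the caller (1), writing the caller's lvb into
 * the rsb or invalidating it (0), or doing nothing (-1), as the
 * b=1/b=0/b=-1 comment above describes.
 */
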
883static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
884{
885 if (lkb->lkb_grmode < DLM_LOCK_PW)
886 return;
887
888 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
889 rsb_set_flag(r, RSB_VALNOTVALID);
890 return;
891 }
892
893 if (!lkb->lkb_lvbptr)
894 return;
895
896 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
897 return;
898
899 if (!r->res_lvbptr)
900 r->res_lvbptr = allocate_lvb(r->res_ls);
901
902 if (!r->res_lvbptr)
903 return;
904
905 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
906 r->res_lvbseq++;
907 rsb_clear_flag(r, RSB_VALNOTVALID);
908}
909
910/* lkb is process copy (pc) */
911
912static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
913 struct dlm_message *ms)
914{
915 int b;
916
917 if (!lkb->lkb_lvbptr)
918 return;
919
920 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
921 return;
922
923 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
924 if (b == 1) {
925 int len = receive_extralen(ms);
926 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
927 lkb->lkb_lvbseq = ms->m_lvbseq;
928 }
929}
930
931/* Manipulate lkb's on rsb's convert/granted/waiting queues
932 remove_lock -- used for unlock, removes lkb from granted
933 revert_lock -- used for cancel, moves lkb from convert to granted
934 grant_lock -- used for request and convert, adds lkb to granted or
935 moves lkb from convert or waiting to granted
936
937 Each of these is used for master or local copy lkb's. There is
938 also a _pc() variation used to make the corresponding change on
939 a process copy (pc) lkb. */
940
941static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
942{
943 del_lkb(r, lkb);
944 lkb->lkb_grmode = DLM_LOCK_IV;
945 /* this unhold undoes the original ref from create_lkb()
946 so this leads to the lkb being freed */
947 unhold_lkb(lkb);
948}
949
950static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
951{
952 set_lvb_unlock(r, lkb);
953 _remove_lock(r, lkb);
954}
955
956static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
957{
958 _remove_lock(r, lkb);
959}
960
961static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
962{
963 lkb->lkb_rqmode = DLM_LOCK_IV;
964
965 switch (lkb->lkb_status) {
966 case DLM_LKSTS_CONVERT:
967 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
968 break;
969 case DLM_LKSTS_WAITING:
970 del_lkb(r, lkb);
971 lkb->lkb_grmode = DLM_LOCK_IV;
972 /* this unhold undoes the original ref from create_lkb()
973 so this leads to the lkb being freed */
974 unhold_lkb(lkb);
975 break;
976 default:
977 log_print("invalid status for revert %d", lkb->lkb_status);
978 }
979}
980
981static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
982{
983 revert_lock(r, lkb);
984}
985
986static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
987{
988 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
989 lkb->lkb_grmode = lkb->lkb_rqmode;
990 if (lkb->lkb_status)
991 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
992 else
993 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
994 }
995
996 lkb->lkb_rqmode = DLM_LOCK_IV;
997}
998
999static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1000{
1001 set_lvb_lock(r, lkb);
1002 _grant_lock(r, lkb);
1003 lkb->lkb_highbast = 0;
1004}
1005
1006static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1007 struct dlm_message *ms)
1008{
1009 set_lvb_lock_pc(r, lkb, ms);
1010 _grant_lock(r, lkb);
1011}
1012
1013/* called by grant_pending_locks() which means an async grant message must
1014 be sent to the requesting node in addition to granting the lock if the
1015 lkb belongs to a remote node. */
1016
1017static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1018{
1019 grant_lock(r, lkb);
1020 if (is_master_copy(lkb))
1021 send_grant(r, lkb);
1022 else
1023 queue_cast(r, lkb, 0);
1024}
1025
1026static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1027{
1028 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1029 lkb_statequeue);
1030 if (lkb->lkb_id == first->lkb_id)
1031 return 1;
1032
1033 return 0;
1034}
1035
1036/* Check if the given lkb conflicts with another lkb on the queue. */
1037
1038static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1039{
1040 struct dlm_lkb *this;
1041
1042 list_for_each_entry(this, head, lkb_statequeue) {
1043 if (this == lkb)
1044 continue;
1045 if (!modes_compat(this, lkb))
1046 return 1;
1047 }
1048 return 0;
1049}
1050
1051/*
1052 * "A conversion deadlock arises with a pair of lock requests in the converting
1053 * queue for one resource. The granted mode of each lock blocks the requested
1054 * mode of the other lock."
1055 *
1056 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1057 * convert queue from being granted, then demote lkb (set grmode to NL).
1058 * This second form requires that we check for conv-deadlk even when
1059 * now == 0 in _can_be_granted().
1060 *
1061 * Example:
1062 * Granted Queue: empty
1063 * Convert Queue: NL->EX (first lock)
1064 * PR->EX (second lock)
1065 *
1066 * The first lock can't be granted because of the granted mode of the second
1067 * lock and the second lock can't be granted because it's not first in the
1068 * list. We demote the granted mode of the second lock (the lkb passed to this
1069 * function).
1070 *
1071 * After the resolution, the "grant pending" function needs to go back and try
1072 * to grant locks on the convert queue again since the first lock can now be
1073 * granted.
1074 */
1075
1076static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1077{
1078 struct dlm_lkb *this, *first = NULL, *self = NULL;
1079
1080 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1081 if (!first)
1082 first = this;
1083 if (this == lkb) {
1084 self = lkb;
1085 continue;
1086 }
1087
1088 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1089 return 1;
1090 }
1091
1092 /* if lkb is on the convert queue and is preventing the first
1093 from being granted, then there's deadlock and we demote lkb.
1094 multiple converting locks may need to do this before the first
1095 converting lock can be granted. */
1096
1097 if (self && self != first) {
1098 if (!modes_compat(lkb, first) &&
1099 !queue_conflict(&rsb->res_grantqueue, first))
1100 return 1;
1101 }
1102
1103 return 0;
1104}
1105
1106/*
1107 * Return 1 if the lock can be granted, 0 otherwise.
1108 * Also detect and resolve conversion deadlocks.
1109 *
1110 * lkb is the lock to be granted
1111 *
1112 * now is 1 if the function is being called in the context of the
1113 * immediate request, it is 0 if called later, after the lock has been
1114 * queued.
1115 *
1116 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1117 */
1118
1119static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1120{
1121 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1122
1123 /*
1124 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1125 * a new request for a NL mode lock being blocked.
1126 *
1127 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1128 * request, then it would be granted. In essence, the use of this flag
1129	 * tells the Lock Manager to expedite this request by not considering
1130 * what may be in the CONVERTING or WAITING queues... As of this
1131 * writing, the EXPEDITE flag can be used only with new requests for NL
1132 * mode locks. This flag is not valid for conversion requests.
1133 *
1134 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1135 * conversion or used with a non-NL requested mode. We also know an
1136 * EXPEDITE request is always granted immediately, so now must always
1137 * be 1. The full condition to grant an expedite request: (now &&
1138 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1139 * therefore be shortened to just checking the flag.
1140 */
1141
1142 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1143 return 1;
1144
1145 /*
1146 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1147 * added to the remaining conditions.
1148 */
1149
1150 if (queue_conflict(&r->res_grantqueue, lkb))
1151 goto out;
1152
1153 /*
1154 * 6-3: By default, a conversion request is immediately granted if the
1155 * requested mode is compatible with the modes of all other granted
1156 * locks
1157 */
1158
1159 if (queue_conflict(&r->res_convertqueue, lkb))
1160 goto out;
1161
1162 /*
1163 * 6-5: But the default algorithm for deciding whether to grant or
1164 * queue conversion requests does not by itself guarantee that such
1165 * requests are serviced on a "first come first serve" basis. This, in
1166	 * turn, can lead to a phenomenon known as "indefinite postponement".
1167 *
1168 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1169 * the system service employed to request a lock conversion. This flag
1170 * forces certain conversion requests to be queued, even if they are
1171 * compatible with the granted modes of other locks on the same
1172 * resource. Thus, the use of this flag results in conversion requests
1173	 * being ordered on a "first come first serve" basis.
1174 *
1175 * DCT: This condition is all about new conversions being able to occur
1176 * "in place" while the lock remains on the granted queue (assuming
1177 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1178 * doesn't _have_ to go onto the convert queue where it's processed in
1179 * order. The "now" variable is necessary to distinguish converts
1180 * being received and processed for the first time now, because once a
1181 * convert is moved to the conversion queue the condition below applies
1182 * requiring fifo granting.
1183 */
1184
1185 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1186 return 1;
1187
1188 /*
1189 * The NOORDER flag is set to avoid the standard vms rules on grant
1190 * order.
1191 */
1192
1193 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1194 return 1;
1195
1196 /*
1197 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1198 * granted until all other conversion requests ahead of it are granted
1199 * and/or canceled.
1200 */
1201
1202 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1203 return 1;
1204
1205 /*
1206 * 6-4: By default, a new request is immediately granted only if all
1207 * three of the following conditions are satisfied when the request is
1208 * issued:
1209 * - The queue of ungranted conversion requests for the resource is
1210 * empty.
1211 * - The queue of ungranted new requests for the resource is empty.
1212 * - The mode of the new request is compatible with the most
1213 * restrictive mode of all granted locks on the resource.
1214 */
1215
1216 if (now && !conv && list_empty(&r->res_convertqueue) &&
1217 list_empty(&r->res_waitqueue))
1218 return 1;
1219
1220 /*
1221 * 6-4: Once a lock request is in the queue of ungranted new requests,
1222 * it cannot be granted until the queue of ungranted conversion
1223 * requests is empty, all ungranted new requests ahead of it are
1224 * granted and/or canceled, and it is compatible with the granted mode
1225 * of the most restrictive lock granted on the resource.
1226 */
1227
1228 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1229 first_in_list(lkb, &r->res_waitqueue))
1230 return 1;
1231
1232 out:
1233 /*
1234 * The following, enabled by CONVDEADLK, departs from VMS.
1235 */
1236
1237 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1238 conversion_deadlock_detect(r, lkb)) {
1239 lkb->lkb_grmode = DLM_LOCK_NL;
1240 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1241 }
1242
1243 return 0;
1244}
1245
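/*
 * Recap of _can_be_granted(), in the order the checks run: EXPEDITE
 * requests are granted unconditionally; any conflict with the grant or
 * convert queue falls through to the conv-deadlock handling at out:;
 * otherwise a grant requires one of: a first-time conversion without
 * QUECVT, the NOORDER flag, a queued conversion at the head of the
 * convert queue, or a new request finding both queues empty (or, once
 * queued, being first in line with no pending conversions).
 */
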
1246/*
1247 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1248 * simple way to provide a big optimization to applications that can use them.
1249 */
1250
1251static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1252{
1253 uint32_t flags = lkb->lkb_exflags;
1254 int rv;
1255 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1256
1257 rv = _can_be_granted(r, lkb, now);
1258 if (rv)
1259 goto out;
1260
1261 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1262 goto out;
1263
1264 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1265 alt = DLM_LOCK_PR;
1266 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1267 alt = DLM_LOCK_CW;
1268
1269 if (alt) {
1270 lkb->lkb_rqmode = alt;
1271 rv = _can_be_granted(r, lkb, now);
1272 if (rv)
1273 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1274 else
1275 lkb->lkb_rqmode = rqmode;
1276 }
1277 out:
1278 return rv;
1279}
1280
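/*
 * Example (illustrative): a DLM_LOCK_CW request carrying DLM_LKF_ALTPR
 * that can't be granted is retried above as DLM_LOCK_PR; if the
 * alternate mode succeeds, DLM_SBF_ALTMODE in the status block tells
 * the caller which mode it was actually granted.
 */
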
1281static int grant_pending_convert(struct dlm_rsb *r, int high)
1282{
1283 struct dlm_lkb *lkb, *s;
1284 int hi, demoted, quit, grant_restart, demote_restart;
1285
1286 quit = 0;
1287 restart:
1288 grant_restart = 0;
1289 demote_restart = 0;
1290 hi = DLM_LOCK_IV;
1291
1292 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1293 demoted = is_demoted(lkb);
1294 if (can_be_granted(r, lkb, 0)) {
1295 grant_lock_pending(r, lkb);
1296 grant_restart = 1;
1297 } else {
1298 hi = max_t(int, lkb->lkb_rqmode, hi);
1299 if (!demoted && is_demoted(lkb))
1300 demote_restart = 1;
1301 }
1302 }
1303
1304 if (grant_restart)
1305 goto restart;
1306 if (demote_restart && !quit) {
1307 quit = 1;
1308 goto restart;
1309 }
1310
1311 return max_t(int, high, hi);
1312}
1313
1314static int grant_pending_wait(struct dlm_rsb *r, int high)
1315{
1316 struct dlm_lkb *lkb, *s;
1317
1318 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1319 if (can_be_granted(r, lkb, 0))
1320 grant_lock_pending(r, lkb);
1321 else
1322 high = max_t(int, lkb->lkb_rqmode, high);
1323 }
1324
1325 return high;
1326}
1327
1328static void grant_pending_locks(struct dlm_rsb *r)
1329{
1330 struct dlm_lkb *lkb, *s;
1331 int high = DLM_LOCK_IV;
1332
1333 DLM_ASSERT(is_master(r), dlm_print_rsb(r););
1334
1335 high = grant_pending_convert(r, high);
1336 high = grant_pending_wait(r, high);
1337
1338 if (high == DLM_LOCK_IV)
1339 return;
1340
1341 /*
1342 * If there are locks left on the wait/convert queue then send blocking
1343 * ASTs to granted locks based on the largest requested mode (high)
1344 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1345 */
1346
1347 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1348 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1349 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1350 queue_bast(r, lkb, high);
1351 lkb->lkb_highbast = high;
1352 }
1353 }
1354}
1355
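/*
 * Note on lkb_highbast: it records the highest mode for which a
 * blocking ast has already been queued to a holder, so repeated passes
 * through grant_pending_locks() or send_bast_queue() don't flood a
 * granted lock with duplicate basts for the same contention; it is
 * reset in grant_lock() once the holder's lock changes.
 */
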
1356static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1357 struct dlm_lkb *lkb)
1358{
1359 struct dlm_lkb *gr;
1360
1361 list_for_each_entry(gr, head, lkb_statequeue) {
1362 if (gr->lkb_bastaddr &&
1363 gr->lkb_highbast < lkb->lkb_rqmode &&
1364 !modes_compat(gr, lkb)) {
1365 queue_bast(r, gr, lkb->lkb_rqmode);
1366 gr->lkb_highbast = lkb->lkb_rqmode;
1367 }
1368 }
1369}
1370
1371static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1372{
1373 send_bast_queue(r, &r->res_grantqueue, lkb);
1374}
1375
1376static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1377{
1378 send_bast_queue(r, &r->res_grantqueue, lkb);
1379 send_bast_queue(r, &r->res_convertqueue, lkb);
1380}
1381
1382/* set_master(r, lkb) -- set the master nodeid of a resource
1383
1384 The purpose of this function is to set the nodeid field in the given
1385 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1386 known, it can just be copied to the lkb and the function will return
1387 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1388 before it can be copied to the lkb.
1389
1390 When the rsb nodeid is being looked up remotely, the initial lkb
1391 causing the lookup is kept on the ls_waiters list waiting for the
1392 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1393 on the rsb's res_lookup list until the master is verified.
1394
1395 Return values:
1396 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1397 1: the rsb master is not available and the lkb has been placed on
1398 a wait queue
1399*/
1400
1401static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1402{
1403 struct dlm_ls *ls = r->res_ls;
1404 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1405
1406 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1407 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1408 r->res_first_lkid = lkb->lkb_id;
1409 lkb->lkb_nodeid = r->res_nodeid;
1410 return 0;
1411 }
1412
1413 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1414 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1415 return 1;
1416 }
1417
1418 if (r->res_nodeid == 0) {
1419 lkb->lkb_nodeid = 0;
1420 return 0;
1421 }
1422
1423 if (r->res_nodeid > 0) {
1424 lkb->lkb_nodeid = r->res_nodeid;
1425 return 0;
1426 }
1427
1428 DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););
1429
1430 dir_nodeid = dlm_dir_nodeid(r);
1431
1432 if (dir_nodeid != our_nodeid) {
1433 r->res_first_lkid = lkb->lkb_id;
1434 send_lookup(r, lkb);
1435 return 1;
1436 }
1437
1438 for (;;) {
1439		/* It's possible for dlm_scand to remove an old rsb for
1440		   this same resource from the toss list, for us to create
1441		   a new one and look up the master locally, and for the
1442		   entry to still exist just before dlm_scand does the
1443		   dir_remove() on the previous rsb, so retry on error. */
1444
1445 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1446 r->res_length, &ret_nodeid);
1447 if (!error)
1448 break;
1449 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1450 schedule();
1451 }
1452
1453 if (ret_nodeid == our_nodeid) {
1454 r->res_first_lkid = 0;
1455 r->res_nodeid = 0;
1456 lkb->lkb_nodeid = 0;
1457 } else {
1458 r->res_first_lkid = lkb->lkb_id;
1459 r->res_nodeid = ret_nodeid;
1460 lkb->lkb_nodeid = ret_nodeid;
1461 }
1462 return 0;
1463}
1464
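/*
 * Example of the return convention: _request_lock() below treats a
 * set_master() return of 1 as "in progress" (the lkb is parked on a
 * lookup or waiters list) and reports success to the caller, while 0
 * means lkb_nodeid is valid and the request can be sent or handled
 * locally right away.
 */
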
1465static void process_lookup_list(struct dlm_rsb *r)
1466{
1467 struct dlm_lkb *lkb, *safe;
1468
1469 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1470 list_del(&lkb->lkb_rsb_lookup);
1471 _request_lock(r, lkb);
1472 schedule();
1473 }
1474}
1475
1476/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1477
1478static void confirm_master(struct dlm_rsb *r, int error)
1479{
1480 struct dlm_lkb *lkb;
1481
1482 if (!r->res_first_lkid)
1483 return;
1484
1485 switch (error) {
1486 case 0:
1487 case -EINPROGRESS:
1488 r->res_first_lkid = 0;
1489 process_lookup_list(r);
1490 break;
1491
1492 case -EAGAIN:
1493 /* the remote master didn't queue our NOQUEUE request;
1494 make a waiting lkb the first_lkid */
1495
1496 r->res_first_lkid = 0;
1497
1498 if (!list_empty(&r->res_lookup)) {
1499 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1500 lkb_rsb_lookup);
1501 list_del(&lkb->lkb_rsb_lookup);
1502 r->res_first_lkid = lkb->lkb_id;
1503 _request_lock(r, lkb);
1504 } else
1505 r->res_nodeid = -1;
1506 break;
1507
1508 default:
1509 log_error(r->res_ls, "confirm_master unknown error %d", error);
1510 }
1511}
1512
1513static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1514 int namelen, uint32_t parent_lkid, void *ast,
1515 void *astarg, void *bast, struct dlm_args *args)
1516{
1517 int rv = -EINVAL;
1518
1519 /* check for invalid arg usage */
1520
1521 if (mode < 0 || mode > DLM_LOCK_EX)
1522 goto out;
1523
1524 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1525 goto out;
1526
1527 if (flags & DLM_LKF_CANCEL)
1528 goto out;
1529
1530 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1531 goto out;
1532
1533 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1534 goto out;
1535
1536 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1537 goto out;
1538
1539 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1540 goto out;
1541
1542 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1543 goto out;
1544
1545 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1546 goto out;
1547
1548 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1549 goto out;
1550
1551 if (!ast || !lksb)
1552 goto out;
1553
1554 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1555 goto out;
1556
1557 /* parent/child locks not yet supported */
1558 if (parent_lkid)
1559 goto out;
1560
1561 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1562 goto out;
1563
1564	/* these args will be copied to the lkb in validate_lock_args;
1565	   it cannot be done now because, when converting locks, fields in
1566	   an active lkb cannot be modified before locking the rsb */
1567
1568 args->flags = flags;
1569 args->astaddr = ast;
1570 args->astparam = (long) astarg;
1571 args->bastaddr = bast;
1572 args->mode = mode;
1573 args->lksb = lksb;
1574 rv = 0;
1575 out:
1576 return rv;
1577}
1578
1579static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1580{
1581 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1582 DLM_LKF_FORCEUNLOCK))
1583 return -EINVAL;
1584
1585 args->flags = flags;
1586 args->astparam = (long) astarg;
1587 return 0;
1588}
1589
1590static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1591 struct dlm_args *args)
1592{
1593 int rv = -EINVAL;
1594
1595 if (args->flags & DLM_LKF_CONVERT) {
1596 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1597 goto out;
1598
1599 if (args->flags & DLM_LKF_QUECVT &&
1600 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1601 goto out;
1602
1603 rv = -EBUSY;
1604 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1605 goto out;
1606
1607 if (lkb->lkb_wait_type)
1608 goto out;
1609 }
1610
1611 lkb->lkb_exflags = args->flags;
1612 lkb->lkb_sbflags = 0;
1613 lkb->lkb_astaddr = args->astaddr;
1614 lkb->lkb_astparam = args->astparam;
1615 lkb->lkb_bastaddr = args->bastaddr;
1616 lkb->lkb_rqmode = args->mode;
1617 lkb->lkb_lksb = args->lksb;
1618 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1619 lkb->lkb_ownpid = (int) current->pid;
1620 rv = 0;
1621 out:
1622 return rv;
1623}
1624
1625static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1626{
1627 int rv = -EINVAL;
1628
1629 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1630 goto out;
1631
1632 if (args->flags & DLM_LKF_FORCEUNLOCK)
1633 goto out_ok;
1634
1635 if (args->flags & DLM_LKF_CANCEL &&
1636 lkb->lkb_status == DLM_LKSTS_GRANTED)
1637 goto out;
1638
1639 if (!(args->flags & DLM_LKF_CANCEL) &&
1640 lkb->lkb_status != DLM_LKSTS_GRANTED)
1641 goto out;
1642
1643 rv = -EBUSY;
1644 if (lkb->lkb_wait_type)
1645 goto out;
1646
1647 out_ok:
1648 lkb->lkb_exflags = args->flags;
1649 lkb->lkb_sbflags = 0;
1650 lkb->lkb_astparam = args->astparam;
1651
1652 rv = 0;
1653 out:
1654 return rv;
1655}
1656
1657/*
1658 * Four stage 4 varieties:
1659 * do_request(), do_convert(), do_unlock(), do_cancel()
1660 * These are called on the master node for the given lock and
1661 * from the central locking logic.
1662 */
1663
1664static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1665{
1666 int error = 0;
1667
1668 if (can_be_granted(r, lkb, 1)) {
1669 grant_lock(r, lkb);
1670 queue_cast(r, lkb, 0);
1671 goto out;
1672 }
1673
1674 if (can_be_queued(lkb)) {
1675 error = -EINPROGRESS;
1676 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1677 send_blocking_asts(r, lkb);
1678 goto out;
1679 }
1680
1681 error = -EAGAIN;
1682 if (force_blocking_asts(lkb))
1683 send_blocking_asts_all(r, lkb);
1684 queue_cast(r, lkb, -EAGAIN);
1685
1686 out:
1687 return error;
1688}
1689
1690static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1691{
1692 int error = 0;
1693
1694 /* changing an existing lock may allow others to be granted */
1695
1696 if (can_be_granted(r, lkb, 1)) {
1697 grant_lock(r, lkb);
1698 queue_cast(r, lkb, 0);
1699 grant_pending_locks(r);
1700 goto out;
1701 }
1702
1703 if (can_be_queued(lkb)) {
1704 if (is_demoted(lkb))
1705 grant_pending_locks(r);
1706 error = -EINPROGRESS;
1707 del_lkb(r, lkb);
1708 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1709 send_blocking_asts(r, lkb);
1710 goto out;
1711 }
1712
1713 error = -EAGAIN;
1714 if (force_blocking_asts(lkb))
1715 send_blocking_asts_all(r, lkb);
1716 queue_cast(r, lkb, -EAGAIN);
1717
1718 out:
1719 return error;
1720}
1721
1722static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1723{
1724 remove_lock(r, lkb);
1725 queue_cast(r, lkb, -DLM_EUNLOCK);
1726 grant_pending_locks(r);
1727 return -DLM_EUNLOCK;
1728}
1729
1730static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1731{
1732 revert_lock(r, lkb);
1733 queue_cast(r, lkb, -DLM_ECANCEL);
1734 grant_pending_locks(r);
1735 return -DLM_ECANCEL;
1736}
1737
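/*
 * Note: the -DLM_EUNLOCK and -DLM_ECANCEL "errors" returned above are
 * deliberate; they let intermediate layers distinguish an unlock or
 * cancel completion from a real failure, and dlm_unlock() maps them
 * back to 0 before returning to the caller.
 */
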
1738/*
1739 * Four stage 3 varieties:
1740 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1741 */
1742
1743/* add a new lkb to a possibly new rsb, called by requesting process */
1744
1745static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1746{
1747 int error;
1748
1749 /* set_master: sets lkb nodeid from r */
1750
1751 error = set_master(r, lkb);
1752 if (error < 0)
1753 goto out;
1754 if (error) {
1755 error = 0;
1756 goto out;
1757 }
1758
1759 if (is_remote(r))
1760 /* receive_request() calls do_request() on remote node */
1761 error = send_request(r, lkb);
1762 else
1763 error = do_request(r, lkb);
1764 out:
1765 return error;
1766}
1767
1768/* change some property of an existing lkb, e.g. mode */
1769
1770static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1771{
1772 int error;
1773
1774 if (is_remote(r))
1775 /* receive_convert() calls do_convert() on remote node */
1776 error = send_convert(r, lkb);
1777 else
1778 error = do_convert(r, lkb);
1779
1780 return error;
1781}
1782
1783/* remove an existing lkb from the granted queue */
1784
1785static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1786{
1787 int error;
1788
1789 if (is_remote(r))
1790 /* receive_unlock() calls do_unlock() on remote node */
1791 error = send_unlock(r, lkb);
1792 else
1793 error = do_unlock(r, lkb);
1794
1795 return error;
1796}
1797
1798/* remove an existing lkb from the convert or wait queue */
1799
1800static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1801{
1802 int error;
1803
1804 if (is_remote(r))
1805 /* receive_cancel() calls do_cancel() on remote node */
1806 error = send_cancel(r, lkb);
1807 else
1808 error = do_cancel(r, lkb);
1809
1810 return error;
1811}
1812
1813/*
1814 * Four stage 2 varieties:
1815 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1816 */
1817
1818static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1819 int len, struct dlm_args *args)
1820{
1821 struct dlm_rsb *r;
1822 int error;
1823
1824 error = validate_lock_args(ls, lkb, args);
1825 if (error)
1826 goto out;
1827
1828 error = find_rsb(ls, name, len, R_CREATE, &r);
1829 if (error)
1830 goto out;
1831
1832 lock_rsb(r);
1833
1834 attach_lkb(r, lkb);
1835 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1836
1837 error = _request_lock(r, lkb);
1838
1839 unlock_rsb(r);
1840 put_rsb(r);
1841
1842 out:
1843 return error;
1844}
1845
1846static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1847 struct dlm_args *args)
1848{
1849 struct dlm_rsb *r;
1850 int error;
1851
1852 r = lkb->lkb_resource;
1853
1854 hold_rsb(r);
1855 lock_rsb(r);
1856
1857 error = validate_lock_args(ls, lkb, args);
1858 if (error)
1859 goto out;
1860
1861 error = _convert_lock(r, lkb);
1862 out:
1863 unlock_rsb(r);
1864 put_rsb(r);
1865 return error;
1866}
1867
1868static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1869 struct dlm_args *args)
1870{
1871 struct dlm_rsb *r;
1872 int error;
1873
1874 r = lkb->lkb_resource;
1875
1876 hold_rsb(r);
1877 lock_rsb(r);
1878
1879 error = validate_unlock_args(lkb, args);
1880 if (error)
1881 goto out;
1882
1883 error = _unlock_lock(r, lkb);
1884 out:
1885 unlock_rsb(r);
1886 put_rsb(r);
1887 return error;
1888}
1889
1890static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1891 struct dlm_args *args)
1892{
1893 struct dlm_rsb *r;
1894 int error;
1895
1896 r = lkb->lkb_resource;
1897
1898 hold_rsb(r);
1899 lock_rsb(r);
1900
1901 error = validate_unlock_args(lkb, args);
1902 if (error)
1903 goto out;
1904
1905 error = _cancel_lock(r, lkb);
1906 out:
1907 unlock_rsb(r);
1908 put_rsb(r);
1909 return error;
1910}
1911
1912/*
1913 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1914 */
1915
1916int dlm_lock(dlm_lockspace_t *lockspace,
1917 int mode,
1918 struct dlm_lksb *lksb,
1919 uint32_t flags,
1920 void *name,
1921 unsigned int namelen,
1922 uint32_t parent_lkid,
1923 void (*ast) (void *astarg),
1924 void *astarg,
1925 void (*bast) (void *astarg, int mode))
1926{
1927 struct dlm_ls *ls;
1928 struct dlm_lkb *lkb;
1929 struct dlm_args args;
1930 int error, convert = flags & DLM_LKF_CONVERT;
1931
1932 ls = dlm_find_lockspace_local(lockspace);
1933 if (!ls)
1934 return -EINVAL;
1935
1936 lock_recovery(ls);
1937
1938 if (convert)
1939 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1940 else
1941 error = create_lkb(ls, &lkb);
1942
1943 if (error)
1944 goto out;
1945
1946 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1947 astarg, bast, &args);
1948 if (error)
1949 goto out_put;
1950
1951 if (convert)
1952 error = convert_lock(ls, lkb, &args);
1953 else
1954 error = request_lock(ls, lkb, name, namelen, &args);
1955
1956 if (error == -EINPROGRESS)
1957 error = 0;
1958 out_put:
1959 if (convert || error)
1960 __put_lkb(ls, lkb);
1961 if (error == -EAGAIN)
1962 error = 0;
1963 out:
1964 unlock_recovery(ls);
1965 dlm_put_lockspace(ls);
1966 return error;
1967}
1968
1969int dlm_unlock(dlm_lockspace_t *lockspace,
1970 uint32_t lkid,
1971 uint32_t flags,
1972 struct dlm_lksb *lksb,
1973 void *astarg)
1974{
1975 struct dlm_ls *ls;
1976 struct dlm_lkb *lkb;
1977 struct dlm_args args;
1978 int error;
1979
1980 ls = dlm_find_lockspace_local(lockspace);
1981 if (!ls)
1982 return -EINVAL;
1983
1984 lock_recovery(ls);
1985
1986 error = find_lkb(ls, lkid, &lkb);
1987 if (error)
1988 goto out;
1989
1990 error = set_unlock_args(flags, astarg, &args);
1991 if (error)
1992 goto out_put;
1993
1994 if (flags & DLM_LKF_CANCEL)
1995 error = cancel_lock(ls, lkb, &args);
1996 else
1997 error = unlock_lock(ls, lkb, &args);
1998
1999 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2000 error = 0;
2001 out_put:
2002 dlm_put_lkb(lkb);
2003 out:
2004 unlock_recovery(ls);
2005 dlm_put_lockspace(ls);
2006 return error;
2007}
2008
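/*
 * Illustrative usage sketch, not code from this file: the names ls,
 * my_ast and "myres" below are assumptions for documentation only.  A
 * kernel caller holding a lockspace from dlm_new_lockspace() might
 * take and drop an exclusive lock roughly like this, with my_ast() a
 * completion callback that reads lksb.sb_status and wakes the caller:
 *
 *	struct dlm_lksb lksb;
 *	int error;
 *
 *	memset(&lksb, 0, sizeof(lksb));
 *	error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "myres", 5, 0,
 *			 my_ast, &lksb, NULL);
 *	(wait for my_ast, check lksb.sb_status == 0)
 *	error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);
 *	(wait for my_ast again; sb_status is -DLM_EUNLOCK on success)
 */
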
2009/*
2010 * send/receive routines for remote operations and replies
2011 *
2012 * send_args
2013 * send_common
2014 * send_request receive_request
2015 * send_convert receive_convert
2016 * send_unlock receive_unlock
2017 * send_cancel receive_cancel
2018 * send_grant receive_grant
2019 * send_bast receive_bast
2020 * send_lookup receive_lookup
2021 * send_remove receive_remove
2022 *
2023 * send_common_reply
2024 * receive_request_reply send_request_reply
2025 * receive_convert_reply send_convert_reply
2026 * receive_unlock_reply send_unlock_reply
2027 * receive_cancel_reply send_cancel_reply
2028 * receive_lookup_reply send_lookup_reply
2029 */
2030
2031static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2032 int to_nodeid, int mstype,
2033 struct dlm_message **ms_ret,
2034 struct dlm_mhandle **mh_ret)
2035{
2036 struct dlm_message *ms;
2037 struct dlm_mhandle *mh;
2038 char *mb;
2039 int mb_len = sizeof(struct dlm_message);
2040
2041 switch (mstype) {
2042 case DLM_MSG_REQUEST:
2043 case DLM_MSG_LOOKUP:
2044 case DLM_MSG_REMOVE:
2045 mb_len += r->res_length;
2046 break;
2047 case DLM_MSG_CONVERT:
2048 case DLM_MSG_UNLOCK:
2049 case DLM_MSG_REQUEST_REPLY:
2050 case DLM_MSG_CONVERT_REPLY:
2051 case DLM_MSG_GRANT:
2052 if (lkb && lkb->lkb_lvbptr)
2053 mb_len += r->res_ls->ls_lvblen;
2054 break;
2055 }
2056
2057 /* get_buffer gives us a message handle (mh) that we need to
2058 pass into lowcomms_commit and a message buffer (mb) that we
2059 write our data into */
2060
2061 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2062 if (!mh)
2063 return -ENOBUFS;
2064
2065 memset(mb, 0, mb_len);
2066
2067 ms = (struct dlm_message *) mb;
2068
2069 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2070 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2071 ms->m_header.h_nodeid = dlm_our_nodeid();
2072 ms->m_header.h_length = mb_len;
2073 ms->m_header.h_cmd = DLM_MSG;
2074
2075 ms->m_type = mstype;
2076
2077 *mh_ret = mh;
2078 *ms_ret = ms;
2079 return 0;
2080}
2081
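/*
 * Illustrative layout note: a DLM_MSG_REQUEST for a resource named
 * "myres" occupies sizeof(struct dlm_message) + 5 bytes; the name (or,
 * for other message types, the lvb) travels in the variable-length
 * m_extra[] tail, and the receive side recovers its length with
 * receive_extralen(), i.e. h_length minus the fixed struct dlm_message
 * size.
 */
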
2082/* further lowcomms enhancements or alternate implementations may make
2083 the return value from this function useful at some point */
2084
2085static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2086{
2087 dlm_message_out(ms);
2088 dlm_lowcomms_commit_buffer(mh);
2089 return 0;
2090}
2091
2092static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2093 struct dlm_message *ms)
2094{
2095 ms->m_nodeid = lkb->lkb_nodeid;
2096 ms->m_pid = lkb->lkb_ownpid;
2097 ms->m_lkid = lkb->lkb_id;
2098 ms->m_remid = lkb->lkb_remid;
2099 ms->m_exflags = lkb->lkb_exflags;
2100 ms->m_sbflags = lkb->lkb_sbflags;
2101 ms->m_flags = lkb->lkb_flags;
2102 ms->m_lvbseq = lkb->lkb_lvbseq;
2103 ms->m_status = lkb->lkb_status;
2104 ms->m_grmode = lkb->lkb_grmode;
2105 ms->m_rqmode = lkb->lkb_rqmode;
2106 ms->m_hash = r->res_hash;
2107
2108 /* m_result and m_bastmode are set from function args,
2109 not from lkb fields */
2110
2111 if (lkb->lkb_bastaddr)
2112 ms->m_asts |= AST_BAST;
2113 if (lkb->lkb_astaddr)
2114 ms->m_asts |= AST_COMP;
2115
2116 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2117 memcpy(ms->m_extra, r->res_name, r->res_length);
2118
2119 else if (lkb->lkb_lvbptr)
2120 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2121
2122}
2123
2124static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2125{
2126 struct dlm_message *ms;
2127 struct dlm_mhandle *mh;
2128 int to_nodeid, error;
2129
2130 add_to_waiters(lkb, mstype);
2131
2132 to_nodeid = r->res_nodeid;
2133
2134 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2135 if (error)
2136 goto fail;
2137
2138 send_args(r, lkb, ms);
2139
2140 error = send_message(mh, ms);
2141 if (error)
2142 goto fail;
2143 return 0;
2144
2145 fail:
2146 remove_from_waiters(lkb);
2147 return error;
2148}
2149
2150static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2151{
2152 return send_common(r, lkb, DLM_MSG_REQUEST);
2153}
2154
2155static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2156{
2157 int error;
2158
2159 error = send_common(r, lkb, DLM_MSG_CONVERT);
2160
2161 /* down conversions go without a reply from the master */
2162 if (!error && down_conversion(lkb)) {
2163 remove_from_waiters(lkb);
2164 r->res_ls->ls_stub_ms.m_result = 0;
2165 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2166 }
2167
2168 return error;
2169}
2170
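/*
 * Note on the stub reply above: a down-conversion is expected to be
 * granted in place by the master (receive_convert() suppresses its
 * reply for exactly this case), so the sender completes the convert
 * locally through ls_stub_ms instead of waiting for a network round
 * trip.
 */
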
2171/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2172 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2173 that the master is still correct. */
2174
2175static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2176{
2177 return send_common(r, lkb, DLM_MSG_UNLOCK);
2178}
2179
2180static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2181{
2182 return send_common(r, lkb, DLM_MSG_CANCEL);
2183}
2184
2185static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2186{
2187 struct dlm_message *ms;
2188 struct dlm_mhandle *mh;
2189 int to_nodeid, error;
2190
2191 to_nodeid = lkb->lkb_nodeid;
2192
2193 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2194 if (error)
2195 goto out;
2196
2197 send_args(r, lkb, ms);
2198
2199 ms->m_result = 0;
2200
2201 error = send_message(mh, ms);
2202 out:
2203 return error;
2204}
2205
2206static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2207{
2208 struct dlm_message *ms;
2209 struct dlm_mhandle *mh;
2210 int to_nodeid, error;
2211
2212 to_nodeid = lkb->lkb_nodeid;
2213
2214 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2215 if (error)
2216 goto out;
2217
2218 send_args(r, lkb, ms);
2219
2220 ms->m_bastmode = mode;
2221
2222 error = send_message(mh, ms);
2223 out:
2224 return error;
2225}
2226
2227static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2228{
2229 struct dlm_message *ms;
2230 struct dlm_mhandle *mh;
2231 int to_nodeid, error;
2232
2233 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2234
2235 to_nodeid = dlm_dir_nodeid(r);
2236
2237 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2238 if (error)
2239 goto fail;
2240
2241 send_args(r, lkb, ms);
2242
2243 error = send_message(mh, ms);
2244 if (error)
2245 goto fail;
2246 return 0;
2247
2248 fail:
2249 remove_from_waiters(lkb);
2250 return error;
2251}
2252
2253static int send_remove(struct dlm_rsb *r)
2254{
2255 struct dlm_message *ms;
2256 struct dlm_mhandle *mh;
2257 int to_nodeid, error;
2258
2259 to_nodeid = dlm_dir_nodeid(r);
2260
2261 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2262 if (error)
2263 goto out;
2264
2265 memcpy(ms->m_extra, r->res_name, r->res_length);
2266 ms->m_hash = r->res_hash;
2267
2268 error = send_message(mh, ms);
2269 out:
2270 return error;
2271}
2272
2273static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2274 int mstype, int rv)
2275{
2276 struct dlm_message *ms;
2277 struct dlm_mhandle *mh;
2278 int to_nodeid, error;
2279
2280 to_nodeid = lkb->lkb_nodeid;
2281
2282 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2283 if (error)
2284 goto out;
2285
2286 send_args(r, lkb, ms);
2287
2288 ms->m_result = rv;
2289
2290 error = send_message(mh, ms);
2291 out:
2292 return error;
2293}
2294
2295static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2296{
2297 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2298}
2299
2300static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2301{
2302 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2303}
2304
2305static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2306{
2307 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2308}
2309
2310static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2311{
2312 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2313}
2314
2315static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2316 int ret_nodeid, int rv)
2317{
2318 struct dlm_rsb *r = &ls->ls_stub_rsb;
2319 struct dlm_message *ms;
2320 struct dlm_mhandle *mh;
2321 int error, nodeid = ms_in->m_header.h_nodeid;
2322
2323 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2324 if (error)
2325 goto out;
2326
2327 ms->m_lkid = ms_in->m_lkid;
2328 ms->m_result = rv;
2329 ms->m_nodeid = ret_nodeid;
2330
2331 error = send_message(mh, ms);
2332 out:
2333 return error;
2334}
2335
2336/* which args we save from a received message depends heavily on the type
2337 of message, unlike the send side where we can safely send everything about
2338 the lkb for any type of message */
2339
2340static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2341{
2342 lkb->lkb_exflags = ms->m_exflags;
2343 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2344 (ms->m_flags & 0x0000FFFF);
2345}
2346
2347static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2348{
2349 lkb->lkb_sbflags = ms->m_sbflags;
2350 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2351 (ms->m_flags & 0x0000FFFF);
2352}
2353
2354static int receive_extralen(struct dlm_message *ms)
2355{
2356 return (ms->m_header.h_length - sizeof(struct dlm_message));
2357}
2358
2359static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2360 struct dlm_message *ms)
2361{
2362 int len;
2363
2364 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2365 if (!lkb->lkb_lvbptr)
2366 lkb->lkb_lvbptr = allocate_lvb(ls);
2367 if (!lkb->lkb_lvbptr)
2368 return -ENOMEM;
2369 len = receive_extralen(ms);
2370 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2371 }
2372 return 0;
2373}
2374
2375static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2376 struct dlm_message *ms)
2377{
2378 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2379 lkb->lkb_ownpid = ms->m_pid;
2380 lkb->lkb_remid = ms->m_lkid;
2381 lkb->lkb_grmode = DLM_LOCK_IV;
2382 lkb->lkb_rqmode = ms->m_rqmode;
2383 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2384 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2385
2386 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2387
2388 if (receive_lvb(ls, lkb, ms))
2389 return -ENOMEM;
2390
2391 return 0;
2392}
2393
2394static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2395 struct dlm_message *ms)
2396{
2397 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2398 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2399 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2400 lkb->lkb_id, lkb->lkb_remid);
2401 return -EINVAL;
2402 }
2403
2404 if (!is_master_copy(lkb))
2405 return -EINVAL;
2406
2407 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2408 return -EBUSY;
2409
2410 if (receive_lvb(ls, lkb, ms))
2411 return -ENOMEM;
2412
2413 lkb->lkb_rqmode = ms->m_rqmode;
2414 lkb->lkb_lvbseq = ms->m_lvbseq;
2415
2416 return 0;
2417}
2418
2419static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2420 struct dlm_message *ms)
2421{
2422 if (!is_master_copy(lkb))
2423 return -EINVAL;
2424 if (receive_lvb(ls, lkb, ms))
2425 return -ENOMEM;
2426 return 0;
2427}
2428
2429/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2430 uses to send a reply and that the remote end uses to process the reply. */
2431
2432static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2433{
2434 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2435 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2436 lkb->lkb_remid = ms->m_lkid;
2437}
2438
2439static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2440{
2441 struct dlm_lkb *lkb;
2442 struct dlm_rsb *r;
2443 int error, namelen;
2444
2445 error = create_lkb(ls, &lkb);
2446 if (error)
2447 goto fail;
2448
2449 receive_flags(lkb, ms);
2450 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2451 error = receive_request_args(ls, lkb, ms);
2452 if (error) {
2453 __put_lkb(ls, lkb);
2454 goto fail;
2455 }
2456
2457 namelen = receive_extralen(ms);
2458
2459 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2460 if (error) {
2461 __put_lkb(ls, lkb);
2462 goto fail;
2463 }
2464
2465 lock_rsb(r);
2466
2467 attach_lkb(r, lkb);
2468 error = do_request(r, lkb);
2469 send_request_reply(r, lkb, error);
2470
2471 unlock_rsb(r);
2472 put_rsb(r);
2473
2474 if (error == -EINPROGRESS)
2475 error = 0;
2476 if (error)
2477 dlm_put_lkb(lkb);
2478 return;
2479
2480 fail:
2481 setup_stub_lkb(ls, ms);
2482 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2483}
2484
2485static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2486{
2487 struct dlm_lkb *lkb;
2488 struct dlm_rsb *r;
2489 int error, reply = 1;
2490
2491 error = find_lkb(ls, ms->m_remid, &lkb);
2492 if (error)
2493 goto fail;
2494
2495 r = lkb->lkb_resource;
2496
2497 hold_rsb(r);
2498 lock_rsb(r);
2499
2500 receive_flags(lkb, ms);
2501 error = receive_convert_args(ls, lkb, ms);
2502 if (error)
2503 goto out;
2504 reply = !down_conversion(lkb);
2505
2506 error = do_convert(r, lkb);
2507 out:
2508 if (reply)
2509 send_convert_reply(r, lkb, error);
2510
2511 unlock_rsb(r);
2512 put_rsb(r);
2513 dlm_put_lkb(lkb);
2514 return;
2515
2516 fail:
2517 setup_stub_lkb(ls, ms);
2518 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2519}
2520
2521static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2522{
2523 struct dlm_lkb *lkb;
2524 struct dlm_rsb *r;
2525 int error;
2526
2527 error = find_lkb(ls, ms->m_remid, &lkb);
2528 if (error)
2529 goto fail;
2530
2531 r = lkb->lkb_resource;
2532
2533 hold_rsb(r);
2534 lock_rsb(r);
2535
2536 receive_flags(lkb, ms);
2537 error = receive_unlock_args(ls, lkb, ms);
2538 if (error)
2539 goto out;
2540
2541 error = do_unlock(r, lkb);
2542 out:
2543 send_unlock_reply(r, lkb, error);
2544
2545 unlock_rsb(r);
2546 put_rsb(r);
2547 dlm_put_lkb(lkb);
2548 return;
2549
2550 fail:
2551 setup_stub_lkb(ls, ms);
2552 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2553}
2554
2555static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2556{
2557 struct dlm_lkb *lkb;
2558 struct dlm_rsb *r;
2559 int error;
2560
2561 error = find_lkb(ls, ms->m_remid, &lkb);
2562 if (error)
2563 goto fail;
2564
2565 receive_flags(lkb, ms);
2566
2567 r = lkb->lkb_resource;
2568
2569 hold_rsb(r);
2570 lock_rsb(r);
2571
2572 error = do_cancel(r, lkb);
2573 send_cancel_reply(r, lkb, error);
2574
2575 unlock_rsb(r);
2576 put_rsb(r);
2577 dlm_put_lkb(lkb);
2578 return;
2579
2580 fail:
2581 setup_stub_lkb(ls, ms);
2582 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2583}
2584
2585static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2586{
2587 struct dlm_lkb *lkb;
2588 struct dlm_rsb *r;
2589 int error;
2590
2591 error = find_lkb(ls, ms->m_remid, &lkb);
2592 if (error) {
2593 log_error(ls, "receive_grant no lkb");
2594 return;
2595 }
2596 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2597
2598 r = lkb->lkb_resource;
2599
2600 hold_rsb(r);
2601 lock_rsb(r);
2602
2603 receive_flags_reply(lkb, ms);
2604 grant_lock_pc(r, lkb, ms);
2605 queue_cast(r, lkb, 0);
2606
2607 unlock_rsb(r);
2608 put_rsb(r);
2609 dlm_put_lkb(lkb);
2610}
2611
2612static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2613{
2614 struct dlm_lkb *lkb;
2615 struct dlm_rsb *r;
2616 int error;
2617
2618 error = find_lkb(ls, ms->m_remid, &lkb);
2619 if (error) {
2620 log_error(ls, "receive_bast no lkb");
2621 return;
2622 }
2623 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2624
2625 r = lkb->lkb_resource;
2626
2627 hold_rsb(r);
2628 lock_rsb(r);
2629
2630 queue_bast(r, lkb, ms->m_bastmode);
2631
2632 unlock_rsb(r);
2633 put_rsb(r);
2634 dlm_put_lkb(lkb);
2635}
2636
2637static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2638{
2639 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2640
2641 from_nodeid = ms->m_header.h_nodeid;
2642 our_nodeid = dlm_our_nodeid();
2643
2644 len = receive_extralen(ms);
2645
2646 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2647 if (dir_nodeid != our_nodeid) {
2648 log_error(ls, "lookup dir_nodeid %d from %d",
2649 dir_nodeid, from_nodeid);
2650 error = -EINVAL;
2651 ret_nodeid = -1;
2652 goto out;
2653 }
2654
2655 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2656
2657 /* Optimization: we're master so treat lookup as a request */
2658 if (!error && ret_nodeid == our_nodeid) {
2659 receive_request(ls, ms);
2660 return;
2661 }
2662 out:
2663 send_lookup_reply(ls, ms, ret_nodeid, error);
2664}
2665
2666static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2667{
2668 int len, dir_nodeid, from_nodeid;
2669
2670 from_nodeid = ms->m_header.h_nodeid;
2671
2672 len = receive_extralen(ms);
2673
2674 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2675 if (dir_nodeid != dlm_our_nodeid()) {
2676 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2677 dir_nodeid, from_nodeid);
2678 return;
2679 }
2680
2681 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2682}
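
/* Both handlers above find the directory node from ms->m_hash, the hash
   of the resource name computed once by the sender.  The sketch below
   shows only the idea of the mapping; the weighted node-array layout is
   an assumption here - the real lookup is dlm_hash2nodeid() in dir.c. */

static inline int hash2nodeid_sketch(struct dlm_ls *ls, uint32_t hash)
{
	/* one array slot per unit of node weight, built by member.c */
	return ls->ls_node_array[(hash >> 16) % ls->ls_total_weight];
}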
2683
2684static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2685{
2686 struct dlm_lkb *lkb;
2687 struct dlm_rsb *r;
2688 int error, mstype;
2689
2690 error = find_lkb(ls, ms->m_remid, &lkb);
2691 if (error) {
2692 log_error(ls, "receive_request_reply no lkb");
2693 return;
2694 }
2695 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2696
2697 mstype = lkb->lkb_wait_type;
2698 error = remove_from_waiters(lkb);
2699 if (error) {
2700 log_error(ls, "receive_request_reply not on waiters");
2701 goto out;
2702 }
2703
2704 /* this is the value returned from do_request() on the master */
2705 error = ms->m_result;
2706
2707 r = lkb->lkb_resource;
2708 hold_rsb(r);
2709 lock_rsb(r);
2710
2711	/* Optimization: the dir node was also the master, so it took our
2712	   lookup as a request and sent a request reply instead of a lookup reply */
2713 if (mstype == DLM_MSG_LOOKUP) {
2714 r->res_nodeid = ms->m_header.h_nodeid;
2715 lkb->lkb_nodeid = r->res_nodeid;
2716 }
2717
2718 switch (error) {
2719 case -EAGAIN:
2720 /* request would block (be queued) on remote master;
2721 the unhold undoes the original ref from create_lkb()
2722 so it leads to the lkb being freed */
2723 queue_cast(r, lkb, -EAGAIN);
2724 confirm_master(r, -EAGAIN);
2725 unhold_lkb(lkb);
2726 break;
2727
2728 case -EINPROGRESS:
2729 case 0:
2730 /* request was queued or granted on remote master */
2731 receive_flags_reply(lkb, ms);
2732 lkb->lkb_remid = ms->m_lkid;
2733 if (error)
2734 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2735 else {
2736 grant_lock_pc(r, lkb, ms);
2737 queue_cast(r, lkb, 0);
2738 }
2739 confirm_master(r, error);
2740 break;
2741
2742 case -ENOENT:
2743 case -ENOTBLK:
2744 /* find_rsb failed to find rsb or rsb wasn't master */
2745 r->res_nodeid = -1;
2746 lkb->lkb_nodeid = -1;
2747 _request_lock(r, lkb);
2748 break;
2749
2750 default:
2751 log_error(ls, "receive_request_reply error %d", error);
2752 }
2753
2754 unlock_rsb(r);
2755 put_rsb(r);
2756 out:
2757 dlm_put_lkb(lkb);
2758}
2759
2760static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2761 struct dlm_message *ms)
2762{
2763 int error = ms->m_result;
2764
2765 /* this is the value returned from do_convert() on the master */
2766
2767 switch (error) {
2768 case -EAGAIN:
2769 /* convert would block (be queued) on remote master */
2770 queue_cast(r, lkb, -EAGAIN);
2771 break;
2772
2773 case -EINPROGRESS:
2774 /* convert was queued on remote master */
2775 del_lkb(r, lkb);
2776 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2777 break;
2778
2779 case 0:
2780 /* convert was granted on remote master */
2781 receive_flags_reply(lkb, ms);
2782 grant_lock_pc(r, lkb, ms);
2783 queue_cast(r, lkb, 0);
2784 break;
2785
2786 default:
2787 log_error(r->res_ls, "receive_convert_reply error %d", error);
2788 }
2789}
2790
2791static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2792{
2793 struct dlm_rsb *r = lkb->lkb_resource;
2794
2795 hold_rsb(r);
2796 lock_rsb(r);
2797
2798 __receive_convert_reply(r, lkb, ms);
2799
2800 unlock_rsb(r);
2801 put_rsb(r);
2802}
2803
2804static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2805{
2806 struct dlm_lkb *lkb;
2807 int error;
2808
2809 error = find_lkb(ls, ms->m_remid, &lkb);
2810 if (error) {
2811 log_error(ls, "receive_convert_reply no lkb");
2812 return;
2813 }
2814 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2815
2816 error = remove_from_waiters(lkb);
2817 if (error) {
2818 log_error(ls, "receive_convert_reply not on waiters");
2819 goto out;
2820 }
2821
2822 _receive_convert_reply(lkb, ms);
2823 out:
2824 dlm_put_lkb(lkb);
2825}
2826
2827static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2828{
2829 struct dlm_rsb *r = lkb->lkb_resource;
2830 int error = ms->m_result;
2831
2832 hold_rsb(r);
2833 lock_rsb(r);
2834
2835 /* this is the value returned from do_unlock() on the master */
2836
2837 switch (error) {
2838 case -DLM_EUNLOCK:
2839 receive_flags_reply(lkb, ms);
2840 remove_lock_pc(r, lkb);
2841 queue_cast(r, lkb, -DLM_EUNLOCK);
2842 break;
2843 default:
2844 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2845 }
2846
2847 unlock_rsb(r);
2848 put_rsb(r);
2849}
2850
2851static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2852{
2853 struct dlm_lkb *lkb;
2854 int error;
2855
2856 error = find_lkb(ls, ms->m_remid, &lkb);
2857 if (error) {
2858 log_error(ls, "receive_unlock_reply no lkb");
2859 return;
2860 }
2861 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2862
2863 error = remove_from_waiters(lkb);
2864 if (error) {
2865 log_error(ls, "receive_unlock_reply not on waiters");
2866 goto out;
2867 }
2868
2869 _receive_unlock_reply(lkb, ms);
2870 out:
2871 dlm_put_lkb(lkb);
2872}
2873
2874static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2875{
2876 struct dlm_rsb *r = lkb->lkb_resource;
2877 int error = ms->m_result;
2878
2879 hold_rsb(r);
2880 lock_rsb(r);
2881
2882 /* this is the value returned from do_cancel() on the master */
2883
2884 switch (error) {
2885 case -DLM_ECANCEL:
2886 receive_flags_reply(lkb, ms);
2887 revert_lock_pc(r, lkb);
2888 queue_cast(r, lkb, -DLM_ECANCEL);
2889 break;
2890 default:
2891 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2892 }
2893
2894 unlock_rsb(r);
2895 put_rsb(r);
2896}
2897
2898static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2899{
2900 struct dlm_lkb *lkb;
2901 int error;
2902
2903 error = find_lkb(ls, ms->m_remid, &lkb);
2904 if (error) {
2905 log_error(ls, "receive_cancel_reply no lkb");
2906 return;
2907 }
2908 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2909
2910 error = remove_from_waiters(lkb);
2911 if (error) {
2912 log_error(ls, "receive_cancel_reply not on waiters");
2913 goto out;
2914 }
2915
2916 _receive_cancel_reply(lkb, ms);
2917 out:
2918 dlm_put_lkb(lkb);
2919}
2920
2921static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2922{
2923 struct dlm_lkb *lkb;
2924 struct dlm_rsb *r;
2925 int error, ret_nodeid;
2926
2927 error = find_lkb(ls, ms->m_lkid, &lkb);
2928 if (error) {
2929 log_error(ls, "receive_lookup_reply no lkb");
2930 return;
2931 }
2932
2933 error = remove_from_waiters(lkb);
2934 if (error) {
2935 log_error(ls, "receive_lookup_reply not on waiters");
2936 goto out;
2937 }
2938
2939 /* this is the value returned by dlm_dir_lookup on dir node
2940 FIXME: will a non-zero error ever be returned? */
2941 error = ms->m_result;
2942
2943 r = lkb->lkb_resource;
2944 hold_rsb(r);
2945 lock_rsb(r);
2946
2947 ret_nodeid = ms->m_nodeid;
2948 if (ret_nodeid == dlm_our_nodeid()) {
2949 r->res_nodeid = 0;
2950 ret_nodeid = 0;
2951 r->res_first_lkid = 0;
2952 } else {
2953 /* set_master() will copy res_nodeid to lkb_nodeid */
2954 r->res_nodeid = ret_nodeid;
2955 }
2956
2957 _request_lock(r, lkb);
2958
2959 if (!ret_nodeid)
2960 process_lookup_list(r);
2961
2962 unlock_rsb(r);
2963 put_rsb(r);
2964 out:
2965 dlm_put_lkb(lkb);
2966}
2967
2968int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
2969{
2970 struct dlm_message *ms = (struct dlm_message *) hd;
2971 struct dlm_ls *ls;
2972 int error;
2973
2974 if (!recovery)
2975 dlm_message_in(ms);
2976
2977 ls = dlm_find_lockspace_global(hd->h_lockspace);
2978 if (!ls) {
2979 log_print("drop message %d from %d for unknown lockspace %d",
2980 ms->m_type, nodeid, hd->h_lockspace);
2981 return -EINVAL;
2982 }
2983
2984 /* recovery may have just ended leaving a bunch of backed-up requests
2985 in the requestqueue; wait while dlm_recoverd clears them */
2986
2987 if (!recovery)
2988 dlm_wait_requestqueue(ls);
2989
2990 /* recovery may have just started while there were a bunch of
2991 in-flight requests -- save them in requestqueue to be processed
2992 after recovery. we can't let dlm_recvd block on the recovery
2993 lock. if dlm_recoverd is calling this function to clear the
2994 requestqueue, it needs to be interrupted (-EINTR) if another
2995 recovery operation is starting. */
2996
2997 while (1) {
2998 if (dlm_locking_stopped(ls)) {
2999 if (!recovery)
3000 dlm_add_requestqueue(ls, nodeid, hd);
3001 error = -EINTR;
3002 goto out;
3003 }
3004
3005 if (lock_recovery_try(ls))
3006 break;
3007 schedule();
3008 }
3009
3010 switch (ms->m_type) {
3011
3012 /* messages sent to a master node */
3013
3014 case DLM_MSG_REQUEST:
3015 receive_request(ls, ms);
3016 break;
3017
3018 case DLM_MSG_CONVERT:
3019 receive_convert(ls, ms);
3020 break;
3021
3022 case DLM_MSG_UNLOCK:
3023 receive_unlock(ls, ms);
3024 break;
3025
3026 case DLM_MSG_CANCEL:
3027 receive_cancel(ls, ms);
3028 break;
3029
3030 /* messages sent from a master node (replies to above) */
3031
3032 case DLM_MSG_REQUEST_REPLY:
3033 receive_request_reply(ls, ms);
3034 break;
3035
3036 case DLM_MSG_CONVERT_REPLY:
3037 receive_convert_reply(ls, ms);
3038 break;
3039
3040 case DLM_MSG_UNLOCK_REPLY:
3041 receive_unlock_reply(ls, ms);
3042 break;
3043
3044 case DLM_MSG_CANCEL_REPLY:
3045 receive_cancel_reply(ls, ms);
3046 break;
3047
3048 /* messages sent from a master node (only two types of async msg) */
3049
3050 case DLM_MSG_GRANT:
3051 receive_grant(ls, ms);
3052 break;
3053
3054 case DLM_MSG_BAST:
3055 receive_bast(ls, ms);
3056 break;
3057
3058 /* messages sent to a dir node */
3059
3060 case DLM_MSG_LOOKUP:
3061 receive_lookup(ls, ms);
3062 break;
3063
3064 case DLM_MSG_REMOVE:
3065 receive_remove(ls, ms);
3066 break;
3067
3068 /* messages sent from a dir node (remove has no reply) */
3069
3070 case DLM_MSG_LOOKUP_REPLY:
3071 receive_lookup_reply(ls, ms);
3072 break;
3073
3074 default:
3075 log_error(ls, "unknown message type %d", ms->m_type);
3076 }
3077
3078 unlock_recovery(ls);
3079 out:
3080 dlm_put_lockspace(ls);
3081 dlm_astd_wake();
3082 return 0;
3083}
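
/* For illustration: a minimal sketch of how a receive path could hand a
   complete message to dlm_receive_message().  The real caller is the
   midcomms layer; the buffer checks shown here are an assumption, not
   the fs/dlm/midcomms.c code. */

static int deliver_message_sketch(char *buf, int len, int nodeid)
{
	struct dlm_header *hd = (struct dlm_header *) buf;

	if (len < (int) sizeof(struct dlm_header))
		return -EBADMSG;	/* caller must buffer more data */

	/* recovery=0 is the normal path; byte-swapping and requestqueue
	   handling happen inside dlm_receive_message() itself */
	return dlm_receive_message(hd, nodeid, 0);
}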
3084
3085
3086/*
3087 * Recovery related
3088 */
3089
3090static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3091{
3092 if (middle_conversion(lkb)) {
3093 hold_lkb(lkb);
3094 ls->ls_stub_ms.m_result = -EINPROGRESS;
3095 _remove_from_waiters(lkb);
3096 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3097
3098 /* Same special case as in receive_rcom_lock_args() */
3099 lkb->lkb_grmode = DLM_LOCK_IV;
3100 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3101 unhold_lkb(lkb);
3102
3103 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3104 lkb->lkb_flags |= DLM_IFL_RESEND;
3105 }
3106
3107 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3108 conversions are async; there's no reply from the remote master */
3109}
3110
3111/* A waiting lkb needs recovery if the master node has failed, or
3112 the master node is changing (only when no directory is used) */
3113
3114static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3115{
3116 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3117 return 1;
3118
3119 if (!dlm_no_directory(ls))
3120 return 0;
3121
3122 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3123 return 1;
3124
3125 return 0;
3126}
3127
3128/* Recovery for locks that are waiting for replies from nodes that are now
3129 gone. We can just complete unlocks and cancels by faking a reply from the
3130 dead node. Requests and up-conversions we flag to be resent after
3131 recovery. Down-conversions can just be completed with a fake reply like
3132 unlocks. Conversions between PR and CW need special attention. */
3133
3134void dlm_recover_waiters_pre(struct dlm_ls *ls)
3135{
3136 struct dlm_lkb *lkb, *safe;
3137
3138 mutex_lock(&ls->ls_waiters_mutex);
3139
3140 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3141 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3142 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3143
3144		/* all outstanding lookups, regardless of destination, will be
3145		   resent after recovery is done */
3146
3147 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3148 lkb->lkb_flags |= DLM_IFL_RESEND;
3149 continue;
3150 }
3151
3152 if (!waiter_needs_recovery(ls, lkb))
3153 continue;
3154
3155 switch (lkb->lkb_wait_type) {
3156
3157 case DLM_MSG_REQUEST:
3158 lkb->lkb_flags |= DLM_IFL_RESEND;
3159 break;
3160
3161 case DLM_MSG_CONVERT:
3162 recover_convert_waiter(ls, lkb);
3163 break;
3164
3165 case DLM_MSG_UNLOCK:
3166 hold_lkb(lkb);
3167 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3168 _remove_from_waiters(lkb);
3169 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3170 dlm_put_lkb(lkb);
3171 break;
3172
3173 case DLM_MSG_CANCEL:
3174 hold_lkb(lkb);
3175 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3176 _remove_from_waiters(lkb);
3177 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3178 dlm_put_lkb(lkb);
3179 break;
3180
3181 default:
3182 log_error(ls, "invalid lkb wait_type %d",
3183 lkb->lkb_wait_type);
3184 }
3185 }
3186 mutex_unlock(&ls->ls_waiters_mutex);
3187}
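
/* Summary of the per-wait_type handling above:

	DLM_MSG_LOOKUP   flag DLM_IFL_RESEND; resend after recovery
	DLM_MSG_REQUEST  flag DLM_IFL_RESEND; resend after recovery
	DLM_MSG_CONVERT  middle (PR<->CW) conversions: fake an -EINPROGRESS
	                 reply, reset grmode and flag the rsb for
	                 recover_conversion; up-conversions: flag
	                 DLM_IFL_RESEND
	DLM_MSG_UNLOCK   complete locally with a faked -DLM_EUNLOCK reply
	DLM_MSG_CANCEL   complete locally with a faked -DLM_ECANCEL reply
*/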
3188
3189static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3190{
3191 struct dlm_lkb *lkb;
3192 int rv = 0;
3193
3194 mutex_lock(&ls->ls_waiters_mutex);
3195 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3196 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3197 rv = lkb->lkb_wait_type;
3198 _remove_from_waiters(lkb);
3199 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3200 break;
3201 }
3202 }
3203 mutex_unlock(&ls->ls_waiters_mutex);
3204
3205 if (!rv)
3206 lkb = NULL;
3207 *lkb_ret = lkb;
3208 return rv;
3209}
3210
3211/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3212 master or dir-node for r. Processing the lkb may result in it being placed
3213 back on waiters. */
3214
3215int dlm_recover_waiters_post(struct dlm_ls *ls)
3216{
3217 struct dlm_lkb *lkb;
3218 struct dlm_rsb *r;
3219 int error = 0, mstype;
3220
3221 while (1) {
3222 if (dlm_locking_stopped(ls)) {
3223 log_debug(ls, "recover_waiters_post aborted");
3224 error = -EINTR;
3225 break;
3226 }
3227
3228 mstype = remove_resend_waiter(ls, &lkb);
3229 if (!mstype)
3230 break;
3231
3232 r = lkb->lkb_resource;
3233
3234 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3235 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3236
3237 switch (mstype) {
3238
3239 case DLM_MSG_LOOKUP:
3240 hold_rsb(r);
3241 lock_rsb(r);
3242 _request_lock(r, lkb);
3243 if (is_master(r))
3244 confirm_master(r, 0);
3245 unlock_rsb(r);
3246 put_rsb(r);
3247 break;
3248
3249 case DLM_MSG_REQUEST:
3250 hold_rsb(r);
3251 lock_rsb(r);
3252 _request_lock(r, lkb);
3253 unlock_rsb(r);
3254 put_rsb(r);
3255 break;
3256
3257 case DLM_MSG_CONVERT:
3258 hold_rsb(r);
3259 lock_rsb(r);
3260 _convert_lock(r, lkb);
3261 unlock_rsb(r);
3262 put_rsb(r);
3263 break;
3264
3265 default:
3266 log_error(ls, "recover_waiters_post type %d", mstype);
3267 }
3268 }
3269
3270 return error;
3271}
3272
3273static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3274 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3275{
3276 struct dlm_ls *ls = r->res_ls;
3277 struct dlm_lkb *lkb, *safe;
3278
3279 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3280 if (test(ls, lkb)) {
3281 rsb_set_flag(r, RSB_LOCKS_PURGED);
3282 del_lkb(r, lkb);
3283 /* this put should free the lkb */
3284 if (!dlm_put_lkb(lkb))
3285 log_error(ls, "purged lkb not released");
3286 }
3287 }
3288}
3289
3290static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3291{
3292 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3293}
3294
3295static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3296{
3297 return is_master_copy(lkb);
3298}
3299
3300static void purge_dead_locks(struct dlm_rsb *r)
3301{
3302 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3303 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3304 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3305}
3306
3307void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3308{
3309 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3310 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3311 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3312}
3313
3314/* Get rid of locks held by nodes that are gone. */
3315
3316int dlm_purge_locks(struct dlm_ls *ls)
3317{
3318 struct dlm_rsb *r;
3319
3320 log_debug(ls, "dlm_purge_locks");
3321
3322 down_write(&ls->ls_root_sem);
3323 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3324 hold_rsb(r);
3325 lock_rsb(r);
3326 if (is_master(r))
3327 purge_dead_locks(r);
3328 unlock_rsb(r);
3329 unhold_rsb(r);
3330
3331 schedule();
3332 }
3333 up_write(&ls->ls_root_sem);
3334
3335 return 0;
3336}
3337
3338static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
3339{
3340 struct dlm_rsb *r, *r_ret = NULL;
3341
3342 read_lock(&ls->ls_rsbtbl[bucket].lock);
3343 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
3344 if (!rsb_flag(r, RSB_LOCKS_PURGED))
3345 continue;
3346 hold_rsb(r);
3347 rsb_clear_flag(r, RSB_LOCKS_PURGED);
3348 r_ret = r;
3349 break;
3350 }
3351 read_unlock(&ls->ls_rsbtbl[bucket].lock);
3352 return r_ret;
3353}
3354
3355void dlm_grant_after_purge(struct dlm_ls *ls)
3356{
3357 struct dlm_rsb *r;
3358 int i;
3359
3360 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
3361 r = find_purged_rsb(ls, i);
3362 if (!r)
3363 continue;
3364 lock_rsb(r);
3365 if (is_master(r)) {
3366 grant_pending_locks(r);
3367 confirm_master(r, 0);
3368 }
3369 unlock_rsb(r);
3370 put_rsb(r);
3371 }
3372}
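
/* purge_queue() above marks an rsb with RSB_LOCKS_PURGED whenever it
   strips dead locks from it; this function is the other half of that
   handshake, rescanning each hash bucket for flagged rsbs so that locks
   blocked behind the purged ones can finally be granted. */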
3373
3374static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3375 uint32_t remid)
3376{
3377 struct dlm_lkb *lkb;
3378
3379 list_for_each_entry(lkb, head, lkb_statequeue) {
3380 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3381 return lkb;
3382 }
3383 return NULL;
3384}
3385
3386static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3387 uint32_t remid)
3388{
3389 struct dlm_lkb *lkb;
3390
3391 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3392 if (lkb)
3393 return lkb;
3394 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3395 if (lkb)
3396 return lkb;
3397 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3398 if (lkb)
3399 return lkb;
3400 return NULL;
3401}
3402
3403static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3404 struct dlm_rsb *r, struct dlm_rcom *rc)
3405{
3406 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3407 int lvblen;
3408
3409 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3410 lkb->lkb_ownpid = rl->rl_ownpid;
3411 lkb->lkb_remid = rl->rl_lkid;
3412 lkb->lkb_exflags = rl->rl_exflags;
3413 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3414 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3415 lkb->lkb_lvbseq = rl->rl_lvbseq;
3416 lkb->lkb_rqmode = rl->rl_rqmode;
3417 lkb->lkb_grmode = rl->rl_grmode;
3418	/* don't set lkb_status because add_lkb wants to set it itself */
3419
3420 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3421 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3422
3423 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3424 lkb->lkb_lvbptr = allocate_lvb(ls);
3425 if (!lkb->lkb_lvbptr)
3426 return -ENOMEM;
3427 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3428 sizeof(struct rcom_lock);
3429 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3430 }
3431
3432 /* Conversions between PR and CW (middle modes) need special handling.
3433 The real granted mode of these converting locks cannot be determined
3434 until all locks have been rebuilt on the rsb (recover_conversion) */
3435
3436 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3437 rl->rl_status = DLM_LKSTS_CONVERT;
3438 lkb->lkb_grmode = DLM_LOCK_IV;
3439 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3440 }
3441
3442 return 0;
3443}
3444
3445/* This lkb may have been recovered in a previous aborted recovery so we need
3446 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3447 If so we just send back a standard reply. If not, we create a new lkb with
3448 the given values and send back our lkid. We send back our lkid by sending
3449 back the rcom_lock struct we got but with the remid field filled in. */
3450
3451int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3452{
3453 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3454 struct dlm_rsb *r;
3455 struct dlm_lkb *lkb;
3456 int error;
3457
3458 if (rl->rl_parent_lkid) {
3459 error = -EOPNOTSUPP;
3460 goto out;
3461 }
3462
3463 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3464 if (error)
3465 goto out;
3466
3467 lock_rsb(r);
3468
3469 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3470 if (lkb) {
3471 error = -EEXIST;
3472 goto out_remid;
3473 }
3474
3475 error = create_lkb(ls, &lkb);
3476 if (error)
3477 goto out_unlock;
3478
3479 error = receive_rcom_lock_args(ls, lkb, r, rc);
3480 if (error) {
3481 __put_lkb(ls, lkb);
3482 goto out_unlock;
3483 }
3484
3485 attach_lkb(r, lkb);
3486 add_lkb(r, lkb, rl->rl_status);
3487 error = 0;
3488
3489 out_remid:
3490 /* this is the new value returned to the lock holder for
3491 saving in its process-copy lkb */
3492 rl->rl_remid = lkb->lkb_id;
3493
3494 out_unlock:
3495 unlock_rsb(r);
3496 put_rsb(r);
3497 out:
3498 if (error)
3499 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3500 rl->rl_result = error;
3501 return error;
3502}
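
/* The round trip, in short: the old lock holder packs its process-copy
   lkb into a struct rcom_lock and sends it to the new master; the
   function above rebuilds (or finds) the master-copy lkb and echoes the
   same rcom_lock back with rl_remid and rl_result filled in; then
   dlm_recover_process_copy() below stores the returned remid. */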
3503
3504int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3505{
3506 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3507 struct dlm_rsb *r;
3508 struct dlm_lkb *lkb;
3509 int error;
3510
3511 error = find_lkb(ls, rl->rl_lkid, &lkb);
3512 if (error) {
3513 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3514 return error;
3515 }
3516
3517 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3518
3519 error = rl->rl_result;
3520
3521 r = lkb->lkb_resource;
3522 hold_rsb(r);
3523 lock_rsb(r);
3524
3525 switch (error) {
3526 case -EEXIST:
3527 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3528 /* fall through */
3529 case 0:
3530 lkb->lkb_remid = rl->rl_remid;
3531 break;
3532 default:
3533 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3534 error, lkb->lkb_id);
3535 }
3536
3537 /* an ack for dlm_recover_locks() which waits for replies from
3538 all the locks it sends to new masters */
3539 dlm_recovered_lock(r);
3540
3541 unlock_rsb(r);
3542 put_rsb(r);
3543 dlm_put_lkb(lkb);
3544
3545 return 0;
3546}
3547
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..56cdc073b1f6
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,50 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
18int dlm_modes_compat(int mode1, int mode2);
19int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
20 unsigned int flags, struct dlm_rsb **r_ret);
21void dlm_put_rsb(struct dlm_rsb *r);
22void dlm_hold_rsb(struct dlm_rsb *r);
23int dlm_put_lkb(struct dlm_lkb *lkb);
24void dlm_scan_rsbs(struct dlm_ls *ls);
25
26int dlm_purge_locks(struct dlm_ls *ls);
27void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
28void dlm_grant_after_purge(struct dlm_ls *ls);
29int dlm_recover_waiters_post(struct dlm_ls *ls);
30void dlm_recover_waiters_pre(struct dlm_ls *ls);
31int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
32int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
33
34static inline int is_master(struct dlm_rsb *r)
35{
36 return !r->res_nodeid;
37}
38
39static inline void lock_rsb(struct dlm_rsb *r)
40{
41 mutex_lock(&r->res_mutex);
42}
43
44static inline void unlock_rsb(struct dlm_rsb *r)
45{
46 mutex_unlock(&r->res_mutex);
47}
48
49#endif
50
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..9ed4b70348fb
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,678 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24#include "recover.h"
25
26#ifdef CONFIG_DLM_DEBUG
27int dlm_create_debug_file(struct dlm_ls *ls);
28void dlm_delete_debug_file(struct dlm_ls *ls);
29#else
30static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
31static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
32#endif
33
34static int ls_count;
35static struct mutex ls_lock;
36static struct list_head lslist;
37static spinlock_t lslist_lock;
38static struct task_struct * scand_task;
39
40
41static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
42{
43 ssize_t ret = len;
44 int n = simple_strtol(buf, NULL, 0);
45
46 switch (n) {
47 case 0:
48 dlm_ls_stop(ls);
49 break;
50 case 1:
51 dlm_ls_start(ls);
52 break;
53 default:
54 ret = -EINVAL;
55 }
56 return ret;
57}
58
59static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
60{
61 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
62 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
63 wake_up(&ls->ls_uevent_wait);
64 return len;
65}
66
67static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
68{
69 return sprintf(buf, "%u\n", ls->ls_global_id);
70}
71
72static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
73{
74 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
75 return len;
76}
77
78static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
79{
80 uint32_t status = dlm_recover_status(ls);
81 return sprintf(buf, "%x\n", status);
82}
83
84struct dlm_attr {
85 struct attribute attr;
86 ssize_t (*show)(struct dlm_ls *, char *);
87 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
88};
89
90static struct dlm_attr dlm_attr_control = {
91 .attr = {.name = "control", .mode = S_IWUSR},
92 .store = dlm_control_store
93};
94
95static struct dlm_attr dlm_attr_event = {
96 .attr = {.name = "event_done", .mode = S_IWUSR},
97 .store = dlm_event_store
98};
99
100static struct dlm_attr dlm_attr_id = {
101 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
102 .show = dlm_id_show,
103 .store = dlm_id_store
104};
105
106static struct dlm_attr dlm_attr_recover_status = {
107 .attr = {.name = "recover_status", .mode = S_IRUGO},
108 .show = dlm_recover_status_show
109};
110
111static struct attribute *dlm_attrs[] = {
112 &dlm_attr_control.attr,
113 &dlm_attr_event.attr,
114 &dlm_attr_id.attr,
115 &dlm_attr_recover_status.attr,
116 NULL,
117};
118
119static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
120 char *buf)
121{
122 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
123 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
124 return a->show ? a->show(ls, buf) : 0;
125}
126
127static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
128 const char *buf, size_t len)
129{
130 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
131 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
132 return a->store ? a->store(ls, buf, len) : len;
133}
134
135static struct sysfs_ops dlm_attr_ops = {
136 .show = dlm_attr_show,
137 .store = dlm_attr_store,
138};
139
140static struct kobj_type dlm_ktype = {
141 .default_attrs = dlm_attrs,
142 .sysfs_ops = &dlm_attr_ops,
143};
144
145static struct kset dlm_kset = {
146 .subsys = &kernel_subsys,
147 .kobj = {.name = "dlm",},
148 .ktype = &dlm_ktype,
149};
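
/* The kset above places each lockspace under /sys/kernel/dlm/<name>/.
   Illustrative shell usage of the attribute files, assuming a lockspace
   named "example":

	echo 0 > /sys/kernel/dlm/example/control	(dlm_ls_stop)
	echo 1 > /sys/kernel/dlm/example/control	(dlm_ls_start)
	echo 0 > /sys/kernel/dlm/example/event_done	(ack a join/leave uevent)
	cat /sys/kernel/dlm/example/id			(global lockspace id)
	cat /sys/kernel/dlm/example/recover_status	(recovery status bits)
*/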
150
151static int kobject_setup(struct dlm_ls *ls)
152{
153 char lsname[DLM_LOCKSPACE_LEN];
154 int error;
155
156 memset(lsname, 0, DLM_LOCKSPACE_LEN);
157 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
158
159 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
160 if (error)
161 return error;
162
163 ls->ls_kobj.kset = &dlm_kset;
164 ls->ls_kobj.ktype = &dlm_ktype;
165 return 0;
166}
167
168static int do_uevent(struct dlm_ls *ls, int in)
169{
170 int error;
171
172 if (in)
173 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
174 else
175 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
176
177 error = wait_event_interruptible(ls->ls_uevent_wait,
178 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
179 if (error)
180 goto out;
181
182 error = ls->ls_uevent_result;
183 out:
184 return error;
185}
186
187
188int dlm_lockspace_init(void)
189{
190 int error;
191
192 ls_count = 0;
193 mutex_init(&ls_lock);
194 INIT_LIST_HEAD(&lslist);
195 spin_lock_init(&lslist_lock);
196
197 error = kset_register(&dlm_kset);
198 if (error)
199 printk("dlm_lockspace_init: cannot register kset %d\n", error);
200 return error;
201}
202
203void dlm_lockspace_exit(void)
204{
205 kset_unregister(&dlm_kset);
206}
207
208static int dlm_scand(void *data)
209{
210 struct dlm_ls *ls;
211
212 while (!kthread_should_stop()) {
213 list_for_each_entry(ls, &lslist, ls_list)
214 dlm_scan_rsbs(ls);
215 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
216 }
217 return 0;
218}
219
220static int dlm_scand_start(void)
221{
222 struct task_struct *p;
223 int error = 0;
224
225 p = kthread_run(dlm_scand, NULL, "dlm_scand");
226 if (IS_ERR(p))
227 error = PTR_ERR(p);
228 else
229 scand_task = p;
230 return error;
231}
232
233static void dlm_scand_stop(void)
234{
235 kthread_stop(scand_task);
236}
237
238static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
239{
240 struct dlm_ls *ls;
241
242 spin_lock(&lslist_lock);
243
244 list_for_each_entry(ls, &lslist, ls_list) {
245 if (ls->ls_namelen == namelen &&
246 memcmp(ls->ls_name, name, namelen) == 0)
247 goto out;
248 }
249 ls = NULL;
250 out:
251 spin_unlock(&lslist_lock);
252 return ls;
253}
254
255struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
256{
257 struct dlm_ls *ls;
258
259 spin_lock(&lslist_lock);
260
261 list_for_each_entry(ls, &lslist, ls_list) {
262 if (ls->ls_global_id == id) {
263 ls->ls_count++;
264 goto out;
265 }
266 }
267 ls = NULL;
268 out:
269 spin_unlock(&lslist_lock);
270 return ls;
271}
272
273struct dlm_ls *dlm_find_lockspace_local(void *id)
274{
275 struct dlm_ls *ls = id;
276
277 spin_lock(&lslist_lock);
278 ls->ls_count++;
279 spin_unlock(&lslist_lock);
280 return ls;
281}
282
283void dlm_put_lockspace(struct dlm_ls *ls)
284{
285 spin_lock(&lslist_lock);
286 ls->ls_count--;
287 spin_unlock(&lslist_lock);
288}
289
290static void remove_lockspace(struct dlm_ls *ls)
291{
292 for (;;) {
293 spin_lock(&lslist_lock);
294 if (ls->ls_count == 0) {
295 list_del(&ls->ls_list);
296 spin_unlock(&lslist_lock);
297 return;
298 }
299 spin_unlock(&lslist_lock);
300 ssleep(1);
301 }
302}
303
304static int threads_start(void)
305{
306 int error;
307
308	/* Thread which processes lock requests for all lockspaces */
309 error = dlm_astd_start();
310 if (error) {
311 log_print("cannot start dlm_astd thread %d", error);
312 goto fail;
313 }
314
315 error = dlm_scand_start();
316 if (error) {
317 log_print("cannot start dlm_scand thread %d", error);
318 goto astd_fail;
319 }
320
321	/* Thread for sending/receiving messages for all lockspaces */
322 error = dlm_lowcomms_start();
323 if (error) {
324 log_print("cannot start dlm lowcomms %d", error);
325 goto scand_fail;
326 }
327
328 return 0;
329
330 scand_fail:
331 dlm_scand_stop();
332 astd_fail:
333 dlm_astd_stop();
334 fail:
335 return error;
336}
337
338static void threads_stop(void)
339{
340 dlm_scand_stop();
341 dlm_lowcomms_stop();
342 dlm_astd_stop();
343}
344
345static int new_lockspace(char *name, int namelen, void **lockspace,
346 uint32_t flags, int lvblen)
347{
348 struct dlm_ls *ls;
349 int i, size, error = -ENOMEM;
350
351 if (namelen > DLM_LOCKSPACE_LEN)
352 return -EINVAL;
353
354 if (!lvblen || (lvblen % 8))
355 return -EINVAL;
356
357 if (!try_module_get(THIS_MODULE))
358 return -EINVAL;
359
360 ls = dlm_find_lockspace_name(name, namelen);
361 if (ls) {
362 *lockspace = ls;
363 module_put(THIS_MODULE);
364 return -EEXIST;
365 }
366
367 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
368 if (!ls)
369 goto out;
370 memcpy(ls->ls_name, name, namelen);
371 ls->ls_namelen = namelen;
372 ls->ls_exflags = flags;
373 ls->ls_lvblen = lvblen;
374 ls->ls_count = 0;
375 ls->ls_flags = 0;
376
377 size = dlm_config.rsbtbl_size;
378 ls->ls_rsbtbl_size = size;
379
380 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
381 if (!ls->ls_rsbtbl)
382 goto out_lsfree;
383 for (i = 0; i < size; i++) {
384 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
385 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
386 rwlock_init(&ls->ls_rsbtbl[i].lock);
387 }
388
389 size = dlm_config.lkbtbl_size;
390 ls->ls_lkbtbl_size = size;
391
392 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
393 if (!ls->ls_lkbtbl)
394 goto out_rsbfree;
395 for (i = 0; i < size; i++) {
396 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
397 rwlock_init(&ls->ls_lkbtbl[i].lock);
398 ls->ls_lkbtbl[i].counter = 1;
399 }
400
401 size = dlm_config.dirtbl_size;
402 ls->ls_dirtbl_size = size;
403
404 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
405 if (!ls->ls_dirtbl)
406 goto out_lkbfree;
407 for (i = 0; i < size; i++) {
408 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
409 rwlock_init(&ls->ls_dirtbl[i].lock);
410 }
411
412 INIT_LIST_HEAD(&ls->ls_waiters);
413 mutex_init(&ls->ls_waiters_mutex);
414
415 INIT_LIST_HEAD(&ls->ls_nodes);
416 INIT_LIST_HEAD(&ls->ls_nodes_gone);
417 ls->ls_num_nodes = 0;
418 ls->ls_low_nodeid = 0;
419 ls->ls_total_weight = 0;
420 ls->ls_node_array = NULL;
421
422 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
423 ls->ls_stub_rsb.res_ls = ls;
424
425 ls->ls_debug_dentry = NULL;
426
427 init_waitqueue_head(&ls->ls_uevent_wait);
428 ls->ls_uevent_result = 0;
429
430 ls->ls_recoverd_task = NULL;
431 mutex_init(&ls->ls_recoverd_active);
432 spin_lock_init(&ls->ls_recover_lock);
433 ls->ls_recover_status = 0;
434 ls->ls_recover_seq = 0;
435 ls->ls_recover_args = NULL;
436 init_rwsem(&ls->ls_in_recovery);
437 INIT_LIST_HEAD(&ls->ls_requestqueue);
438 mutex_init(&ls->ls_requestqueue_mutex);
439
440 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
441 if (!ls->ls_recover_buf)
442 goto out_dirfree;
443
444 INIT_LIST_HEAD(&ls->ls_recover_list);
445 spin_lock_init(&ls->ls_recover_list_lock);
446 ls->ls_recover_list_count = 0;
447 init_waitqueue_head(&ls->ls_wait_general);
448 INIT_LIST_HEAD(&ls->ls_root_list);
449 init_rwsem(&ls->ls_root_sem);
450
451 down_write(&ls->ls_in_recovery);
452
453 error = dlm_recoverd_start(ls);
454 if (error) {
455 log_error(ls, "can't start dlm_recoverd %d", error);
456 goto out_rcomfree;
457 }
458
459 spin_lock(&lslist_lock);
460 list_add(&ls->ls_list, &lslist);
461 spin_unlock(&lslist_lock);
462
463 dlm_create_debug_file(ls);
464
465 error = kobject_setup(ls);
466 if (error)
467 goto out_del;
468
469 error = kobject_register(&ls->ls_kobj);
470 if (error)
471 goto out_del;
472
473 error = do_uevent(ls, 1);
474 if (error)
475 goto out_unreg;
476
477 *lockspace = ls;
478 return 0;
479
480 out_unreg:
481 kobject_unregister(&ls->ls_kobj);
482 out_del:
483 dlm_delete_debug_file(ls);
484 spin_lock(&lslist_lock);
485 list_del(&ls->ls_list);
486 spin_unlock(&lslist_lock);
487 dlm_recoverd_stop(ls);
488 out_rcomfree:
489 kfree(ls->ls_recover_buf);
490 out_dirfree:
491 kfree(ls->ls_dirtbl);
492 out_lkbfree:
493 kfree(ls->ls_lkbtbl);
494 out_rsbfree:
495 kfree(ls->ls_rsbtbl);
496 out_lsfree:
497 kfree(ls);
498 out:
499 module_put(THIS_MODULE);
500 return error;
501}
502
503int dlm_new_lockspace(char *name, int namelen, void **lockspace,
504 uint32_t flags, int lvblen)
505{
506 int error = 0;
507
508 mutex_lock(&ls_lock);
509 if (!ls_count)
510 error = threads_start();
511 if (error)
512 goto out;
513
514 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
515 if (!error)
516 ls_count++;
517 out:
518 mutex_unlock(&ls_lock);
519 return error;
520}
521
522/* Return 1 if the lockspace still has active remote locks,
523 * 2 if the lockspace still has active local locks.
524 */
525static int lockspace_busy(struct dlm_ls *ls)
526{
527 int i, lkb_found = 0;
528 struct dlm_lkb *lkb;
529
530	/* NOTE: We check the lkb table here rather than the resource table.
531	   This is because there may be LKBs queued as ASTs that have been
532	   unlinked from their RSBs and are pending deletion once the AST has
533	   been delivered */
534
535 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
536 read_lock(&ls->ls_lkbtbl[i].lock);
537 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
538 lkb_found = 1;
539 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
540 lkb_idtbl_list) {
541 if (!lkb->lkb_nodeid) {
542 read_unlock(&ls->ls_lkbtbl[i].lock);
543 return 2;
544 }
545 }
546 }
547 read_unlock(&ls->ls_lkbtbl[i].lock);
548 }
549 return lkb_found;
550}
551
552static int release_lockspace(struct dlm_ls *ls, int force)
553{
554 struct dlm_lkb *lkb;
555 struct dlm_rsb *rsb;
556 struct list_head *head;
557 int i;
558 int busy = lockspace_busy(ls);
559
560 if (busy > force)
561 return -EBUSY;
562
563 if (force < 3)
564 do_uevent(ls, 0);
565
566 dlm_recoverd_stop(ls);
567
568 remove_lockspace(ls);
569
570 dlm_delete_debug_file(ls);
571
572 dlm_astd_suspend();
573
574 kfree(ls->ls_recover_buf);
575
576 /*
577 * Free direntry structs.
578 */
579
580 dlm_dir_clear(ls);
581 kfree(ls->ls_dirtbl);
582
583 /*
584 * Free all lkb's on lkbtbl[] lists.
585 */
586
587 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
588 head = &ls->ls_lkbtbl[i].list;
589 while (!list_empty(head)) {
590 lkb = list_entry(head->next, struct dlm_lkb,
591 lkb_idtbl_list);
592
593 list_del(&lkb->lkb_idtbl_list);
594
595 dlm_del_ast(lkb);
596
597 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
598 free_lvb(lkb->lkb_lvbptr);
599
600 free_lkb(lkb);
601 }
602 }
603 dlm_astd_resume();
604
605 kfree(ls->ls_lkbtbl);
606
607 /*
608 * Free all rsb's on rsbtbl[] lists
609 */
610
611 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
612 head = &ls->ls_rsbtbl[i].list;
613 while (!list_empty(head)) {
614 rsb = list_entry(head->next, struct dlm_rsb,
615 res_hashchain);
616
617 list_del(&rsb->res_hashchain);
618 free_rsb(rsb);
619 }
620
621 head = &ls->ls_rsbtbl[i].toss;
622 while (!list_empty(head)) {
623 rsb = list_entry(head->next, struct dlm_rsb,
624 res_hashchain);
625 list_del(&rsb->res_hashchain);
626 free_rsb(rsb);
627 }
628 }
629
630 kfree(ls->ls_rsbtbl);
631
632 /*
633 * Free structures on any other lists
634 */
635
636 kfree(ls->ls_recover_args);
637 dlm_clear_free_entries(ls);
638 dlm_clear_members(ls);
639 dlm_clear_members_gone(ls);
640 kfree(ls->ls_node_array);
641 kobject_unregister(&ls->ls_kobj);
642 kfree(ls);
643
644 mutex_lock(&ls_lock);
645 ls_count--;
646 if (!ls_count)
647 threads_stop();
648 mutex_unlock(&ls_lock);
649
650 module_put(THIS_MODULE);
651 return 0;
652}
653
654/*
655 * Called when a system has released all its locks and is not going to use the
656 * lockspace any longer. We free everything we're managing for this lockspace.
657 * Remaining nodes will go through the recovery process as if we'd died. The
658 * lockspace must continue to function as usual, participating in recoveries,
659 * until this returns.
660 *
661 * Force has 4 possible values:
662 * 0 - don't destroy lockspace if it has any LKBs
663 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
664 * 2 - destroy lockspace regardless of LKBs
665 * 3 - destroy lockspace as part of a forced shutdown
666 */
667
668int dlm_release_lockspace(void *lockspace, int force)
669{
670 struct dlm_ls *ls;
671
672 ls = dlm_find_lockspace_local(lockspace);
673 if (!ls)
674 return -EINVAL;
675 dlm_put_lockspace(ls);
676 return release_lockspace(ls, force);
677}
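
/* Illustrative use of the lockspace API from another kernel module; a
   sketch only - the name "example" and lvblen of 32 are arbitrary, and
   error handling is trimmed.  lvblen must be a non-zero multiple of 8,
   per the check in new_lockspace(). */

static void lockspace_usage_sketch(void)
{
	void *ls;
	int error;

	error = dlm_new_lockspace("example", strlen("example"), &ls, 0, 32);
	if (error)
		return;

	/* ... acquire and release locks against ls ... */

	/* force=0: fails with -EBUSY if any LKBs remain */
	dlm_release_lockspace(ls, 0);
}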
678
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..17bd3ba863a9
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21void dlm_put_lockspace(struct dlm_ls *ls);
22
23#endif /* __LOCKSPACE_DOT_H__ */
24
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..cdd168e4bf45
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1239 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32-bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's responsibility to resolve
26 * these into IP addresses or whatever else it needs for
27 * inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *local_addr[DLM_MAX_ADDR_COUNT];
59static int local_count;
60static int local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
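
/* A worked example of the ring-buffer macros above (illustrative only;
   the real consumer is receive_from_sock() below).  The size must be a
   power of two, since the macros mask rather than divide. */

static inline void cbuf_example(void)
{
	struct cbuf cb;

	CBUF_INIT(&cb, 4096);		/* empty: base = len = 0, mask = 4095 */
	CBUF_ADD(&cb, 100);		/* producer appended 100 bytes */
	BUG_ON(CBUF_DATA(&cb) != 100);	/* offset of the next free byte */
	CBUF_EAT(&cb, 60);		/* consumer finished 60 bytes */
	BUG_ON(cb.base != 60 || cb.len != 40);
	BUG_ON(!CBUF_MAY_ADD(&cb, 4000));	/* 40 + 4000 < 4096 */
}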
130
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
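
/* The locking above follows the classic idr allocation idiom: probe
   under the read lock, retry under the write lock, preallocate with
   idr_pre_get(), insert with idr_get_new_above(), and back the entry
   out again if the id handed back is not the nodeid asked for. */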
228
229/* Linear scan over every nodeid; don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IPv6 or IPv4 sockaddr and return the address
255   length. Also pad out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!local_count)
264 return;
265
266 if (!port) {
267 if (local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = local_addr[0]->ss_family;
277 if (local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383 /* This seems to happen when we receive a connection
384 * too early: the assoc ID is not valid yet, but a real
385 * message always seems to follow, so restart the pending
386 * INITs (see receive_from_sock) */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461 /* We don't know which INIT failed, so clear the PENDING flag
462 * on all nodes. If a node's assoc_id is zero it will retry
463 * the INIT */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This allocation doesn't strictly need GFP_ATOMIC, but
505 * avoiding a sleep in the receive path should help under load.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516 msg.msg_iovlen = 1; /* may become 2 below if the buffer wraps */
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the bit of the circular buffer between the current end
532 * point (cb.base + cb.len) and the end of the buffer.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * iov[1] is the bit of the circular buffer between the start of the
541 * buffer and the start of the currently used section (cb.base)
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen,
552 len, MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(le32_to_cpu(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
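
The CBUF_* macros used throughout receive_from_sock() are defined near the top of lowcomms.c (earlier in this patch). Roughly, they implement a circular buffer over a single page and need semantics like the following sketch (the base/len/mask layout is an assumption consistent with the usage here; the size must be a power of two, which PAGE_CACHE_SIZE is):

	struct cbuf {
		unsigned base;	/* start of live data within the page */
		unsigned len;	/* number of live bytes */
		unsigned mask;	/* size - 1, for cheap wrap-around */
	};

	#define CBUF_INIT(cb, size) do { (cb)->base = (cb)->len = 0; \
					 (cb)->mask = (size) - 1; } while (0)
	/* write point: first free byte after the live data */
	#define CBUF_DATA(cb)	(((cb)->base + (cb)->len) & (cb)->mask)
	#define CBUF_ADD(cb, n)	do { (cb)->len += (n); } while (0)
	/* consume n bytes from the front, wrapping base */
	#define CBUF_EAT(cb, n)	do { (cb)->len -= (n); \
				     (cb)->base += (n); \
				     (cb)->base &= (cb)->mask; } while (0)
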
610
611/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 local_addr[local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!local_count) {
663 init_local();
664 if (!local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
703 for (i = 0; i < local_count; i++) {
704 memcpy(&localaddr, local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
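
Together these two functions give message senders a two-phase interface: reserve space in the tail page of a node's write queue, build the message in place through the returned pointer, then commit it, which queues the node for dlm_sendd once the last concurrent user of the page is done. The rcom code later in this patch uses exactly this pattern:

	struct dlm_mhandle *mh;	/* opaque; really a writequeue_entry */
	char *mb;

	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;
	memset(mb, 0, mb_len);
	/* ... fill in the dlm message at mb ... */
	dlm_lowcomms_commit_buffer(mh);
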
828
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 kmap(e->page);
938 len = e->len;
939 offset = e->offset;
940 BUG_ON(len == 0 && e->users == 0);
941 spin_unlock(&ni->writequeue_lock);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i=1; i<=max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i=1; i<=max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i=1; i<=max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066int dlm_lowcomms_close(int nodeid)
1067{
1068 struct nodeinfo *ni;
1069
1070 ni = nodeid2nodeinfo(nodeid, 0);
1071 if (!ni)
1072 return -1;
1073
1074 spin_lock(&ni->lock);
1075 if (ni->assoc_id) {
1076 ni->assoc_id = 0;
1077 /* Don't send shutdown here, sctp will just queue it
1078 till the node comes back up! */
1079 }
1080 spin_unlock(&ni->lock);
1081
1082 clean_one_writequeue(ni);
1083 clear_bit(NI_INIT_PENDING, &ni->flags);
1084 return 0;
1085}
1086
1087static int write_list_empty(void)
1088{
1089 int status;
1090
1091 spin_lock_bh(&write_nodes_lock);
1092 status = list_empty(&write_nodes);
1093 spin_unlock_bh(&write_nodes_lock);
1094
1095 return status;
1096}
1097
1098static int dlm_recvd(void *data)
1099{
1100 DECLARE_WAITQUEUE(wait, current);
1101
1102 while (!kthread_should_stop()) {
1103 int count = 0;
1104
1105 set_current_state(TASK_INTERRUPTIBLE);
1106 add_wait_queue(&lowcomms_recv_wait, &wait);
1107 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1108 schedule();
1109 remove_wait_queue(&lowcomms_recv_wait, &wait);
1110 set_current_state(TASK_RUNNING);
1111
1112 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1113 int ret;
1114
1115 do {
1116 ret = receive_from_sock();
1117
1118 /* Don't starve out everyone else */
1119 if (++count >= MAX_RX_MSG_COUNT) {
1120 schedule();
1121 count = 0;
1122 }
1123 } while (!kthread_should_stop() && ret >= 0);
1124 }
1125 schedule();
1126 }
1127
1128 return 0;
1129}
1130
1131static int dlm_sendd(void *data)
1132{
1133 DECLARE_WAITQUEUE(wait, current);
1134
1135 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1136
1137 while (!kthread_should_stop()) {
1138 set_current_state(TASK_INTERRUPTIBLE);
1139 if (write_list_empty())
1140 schedule();
1141 set_current_state(TASK_RUNNING);
1142
1143 if (sctp_con.eagain_flag) {
1144 sctp_con.eagain_flag = 0;
1145 refill_write_queue();
1146 }
1147 process_output_queue();
1148 }
1149
1150 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1151
1152 return 0;
1153}
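
Both daemons use the standard kernel sleep pattern: mark the task TASK_INTERRUPTIBLE, put it on the wait queue, re-check the wakeup condition, and only then call schedule(). A wake_up() racing with the condition check merely leaves the task runnable, so no wakeup can be lost. Condensed from dlm_recvd's loop above:

	set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue(&lowcomms_recv_wait, &wait);
	if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
		schedule();	/* a concurrent wake_up() after the test
				   just makes schedule() return quickly */
	remove_wait_queue(&lowcomms_recv_wait, &wait);
	set_current_state(TASK_RUNNING);
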
1154
1155static void daemons_stop(void)
1156{
1157 kthread_stop(recv_task);
1158 kthread_stop(send_task);
1159}
1160
1161static int daemons_start(void)
1162{
1163 struct task_struct *p;
1164 int error;
1165
1166 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1167 if (IS_ERR(p)) {
1168 error = PTR_ERR(p);
1169 log_print("can't start dlm_recvd %d", error);
1170 return error;
1171 }
1172 recv_task = p;
1173
1174 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1175 if (IS_ERR(p)) {
1176 error = PTR_ERR(p);
1177 log_print("can't start dlm_sendd %d", error);
1178 kthread_stop(recv_task);
1179 return error;
1180 }
1181 send_task = p;
1182
1183 return 0;
1184}
1185
1186/*
1187 * This is quite likely to sleep...
1188 */
1189int dlm_lowcomms_start(void)
1190{
1191 int error;
1192
1193 spin_lock_init(&write_nodes_lock);
1194 INIT_LIST_HEAD(&write_nodes);
1195 init_rwsem(&nodeinfo_lock);
1196
1197 error = init_sock();
1198 if (error)
1199 goto fail_sock;
1200 error = daemons_start();
1201 if (error)
1202 goto fail_sock;
1203 atomic_set(&accepting, 1);
1204 return 0;
1205
1206 fail_sock:
1207 close_connection();
1208 return error;
1209}
1210
1211/* Set all the activity flags to prevent any socket activity. */
1212
1213void dlm_lowcomms_stop(void)
1214{
1215 atomic_set(&accepting, 0);
1216 sctp_con.flags = 0x7;
1217 daemons_stop();
1218 clean_writequeues();
1219 close_connection();
1220 dealloc_nodeinfo();
1221 max_nodeid = 0;
1222}
1223
1224int dlm_lowcomms_init(void)
1225{
1226 init_waitqueue_head(&lowcomms_recv_wait);
1227 return 0;
1228}
1229
1230void dlm_lowcomms_exit(void)
1231{
1232 int i;
1233
1234 for (i = 0; i < local_count; i++)
1235 kfree(local_addr[i]);
1236 local_count = 0;
1237 local_nodeid = 0;
1238}
1239
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..6c04bb09cfa8
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21int dlm_lowcomms_close(int nodeid);
22void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
23void dlm_lowcomms_commit_buffer(void *mh);
24
25#endif /* __LOWCOMMS_DOT_H__ */
26
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..81bf4cb22033
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,89 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "memory.h"
18#include "lowcomms.h"
19#include "config.h"
20
21#ifdef CONFIG_DLM_DEBUG
22int dlm_register_debugfs(void);
23void dlm_unregister_debugfs(void);
24#else
25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { }
27#endif
28
29static int __init init_dlm(void)
30{
31 int error;
32
33 error = dlm_memory_init();
34 if (error)
35 goto out;
36
37 error = dlm_lockspace_init();
38 if (error)
39 goto out_mem;
40
41 error = dlm_config_init();
42 if (error)
43 goto out_lockspace;
44
45 error = dlm_register_debugfs();
46 if (error)
47 goto out_config;
48
49 error = dlm_lowcomms_init();
50 if (error)
51 goto out_debug;
52
53 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
54
55 return 0;
56
57 out_debug:
58 dlm_unregister_debugfs();
59 out_config:
60 dlm_config_exit();
61 out_lockspace:
62 dlm_lockspace_exit();
63 out_mem:
64 dlm_memory_exit();
65 out:
66 return error;
67}
68
69static void __exit exit_dlm(void)
70{
71 dlm_lowcomms_exit();
72 dlm_config_exit();
73 dlm_memory_exit();
74 dlm_lockspace_exit();
75 dlm_unregister_debugfs();
76}
77
78module_init(init_dlm);
79module_exit(exit_dlm);
80
81MODULE_DESCRIPTION("Distributed Lock Manager");
82MODULE_AUTHOR("Red Hat, Inc.");
83MODULE_LICENSE("GPL");
84
85EXPORT_SYMBOL_GPL(dlm_new_lockspace);
86EXPORT_SYMBOL_GPL(dlm_release_lockspace);
87EXPORT_SYMBOL_GPL(dlm_lock);
88EXPORT_SYMBOL_GPL(dlm_unlock);
89
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..cd0c51e724e0
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,312 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "rcom.h"
19#include "config.h"
20
21/*
22 * Following called by dlm_recoverd thread
23 */
24
25static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
26{
27 struct dlm_member *memb = NULL;
28 struct list_head *tmp;
29 struct list_head *newlist = &new->list;
30 struct list_head *head = &ls->ls_nodes;
31
32 list_for_each(tmp, head) {
33 memb = list_entry(tmp, struct dlm_member, list);
34 if (new->nodeid < memb->nodeid)
35 break;
36 }
37
38 if (!memb)
39 list_add_tail(newlist, head);
40 else {
41 /* FIXME: can use list macro here */
42 newlist->prev = tmp->prev;
43 newlist->next = tmp;
44 tmp->prev->next = newlist;
45 tmp->prev = newlist;
46 }
47}
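
As the FIXME notes, the open-coded splice is just an insert-before-position; list_add_tail(new, pos) links new immediately before pos, and after a list_for_each() that ends without breaking, tmp equals head again, so both branches collapse to one call. A behavior-preserving sketch:

	list_for_each(tmp, head) {
		memb = list_entry(tmp, struct dlm_member, list);
		if (new->nodeid < memb->nodeid)
			break;
	}
	/* inserts before 'tmp'; when the scan ran off the end,
	   tmp == head and this is a plain tail append */
	list_add_tail(newlist, tmp);
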
48
49static int dlm_add_member(struct dlm_ls *ls, int nodeid)
50{
51 struct dlm_member *memb;
52 int w;
53
54 w = dlm_node_weight(ls->ls_name, nodeid);
55 if (w < 0)
56 return w;
57
58 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
59 if (!memb)
60 return -ENOMEM;
61
62 memb->nodeid = nodeid;
63 memb->weight = w;
64 add_ordered_member(ls, memb);
65 ls->ls_num_nodes++;
66 return 0;
67}
68
69static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
70{
71 list_move(&memb->list, &ls->ls_nodes_gone);
72 ls->ls_num_nodes--;
73}
74
75static int dlm_is_member(struct dlm_ls *ls, int nodeid)
76{
77 struct dlm_member *memb;
78
79 list_for_each_entry(memb, &ls->ls_nodes, list) {
80 if (memb->nodeid == nodeid)
81 return 1;
82 }
83 return 0;
84}
85
86int dlm_is_removed(struct dlm_ls *ls, int nodeid)
87{
88 struct dlm_member *memb;
89
90 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
91 if (memb->nodeid == nodeid)
92 return 1;
93 }
94 return 0;
95}
96
97static void clear_memb_list(struct list_head *head)
98{
99 struct dlm_member *memb;
100
101 while (!list_empty(head)) {
102 memb = list_entry(head->next, struct dlm_member, list);
103 list_del(&memb->list);
104 kfree(memb);
105 }
106}
107
108void dlm_clear_members(struct dlm_ls *ls)
109{
110 clear_memb_list(&ls->ls_nodes);
111 ls->ls_num_nodes = 0;
112}
113
114void dlm_clear_members_gone(struct dlm_ls *ls)
115{
116 clear_memb_list(&ls->ls_nodes_gone);
117}
118
119static void make_member_array(struct dlm_ls *ls)
120{
121 struct dlm_member *memb;
122 int i, w, x = 0, total = 0, all_zero = 0, *array;
123
124 kfree(ls->ls_node_array);
125 ls->ls_node_array = NULL;
126
127 list_for_each_entry(memb, &ls->ls_nodes, list) {
128 if (memb->weight)
129 total += memb->weight;
130 }
131
132 /* all nodes revert to weight of 1 if all have weight 0 */
133
134 if (!total) {
135 total = ls->ls_num_nodes;
136 all_zero = 1;
137 }
138
139 ls->ls_total_weight = total;
140
141 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
142 if (!array)
143 return;
144
145 list_for_each_entry(memb, &ls->ls_nodes, list) {
146 if (!all_zero && !memb->weight)
147 continue;
148
149 if (all_zero)
150 w = 1;
151 else
152 w = memb->weight;
153
154 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
155
156 for (i = 0; i < w; i++)
157 array[x++] = memb->nodeid;
158 }
159
160 ls->ls_node_array = array;
161}
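
A worked example of the weighting: with members n1, n2, n3 carrying weights {2, 0, 1}, ls_total_weight is 3 and the array becomes { n1, n1, n3 }; zero-weight nodes get no slots, so they master no resources. With weights {0, 0, 0} the all_zero fallback gives every node exactly one slot: total = 3 and array = { n1, n2, n3 }.
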
162
163/* send a status request to all members just to establish comms connections */
164
165static void ping_members(struct dlm_ls *ls)
166{
167 struct dlm_member *memb;
168 list_for_each_entry(memb, &ls->ls_nodes, list)
169 dlm_rcom_status(ls, memb->nodeid);
170}
171
172int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
173{
174 struct dlm_member *memb, *safe;
175 int i, error, found, pos = 0, neg = 0, low = -1;
176
177 /* move departed members from ls_nodes to ls_nodes_gone */
178
179 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
180 found = 0;
181 for (i = 0; i < rv->node_count; i++) {
182 if (memb->nodeid == rv->nodeids[i]) {
183 found = 1;
184 break;
185 }
186 }
187
188 if (!found) {
189 neg++;
190 dlm_remove_member(ls, memb);
191 log_debug(ls, "remove member %d", memb->nodeid);
192 }
193 }
194
195 /* add new members to ls_nodes */
196
197 for (i = 0; i < rv->node_count; i++) {
198 if (dlm_is_member(ls, rv->nodeids[i]))
199 continue;
200 dlm_add_member(ls, rv->nodeids[i]);
201 pos++;
202 log_debug(ls, "add member %d", rv->nodeids[i]);
203 }
204
205 list_for_each_entry(memb, &ls->ls_nodes, list) {
206 if (low == -1 || memb->nodeid < low)
207 low = memb->nodeid;
208 }
209 ls->ls_low_nodeid = low;
210
211 make_member_array(ls);
212 dlm_set_recover_status(ls, DLM_RS_NODES);
213 *neg_out = neg;
214
215 ping_members(ls);
216
217 error = dlm_recover_members_wait(ls);
218 log_debug(ls, "total members %d", ls->ls_num_nodes);
219 return error;
220}
221
222/*
223 * Following called from lockspace.c
224 */
225
226int dlm_ls_stop(struct dlm_ls *ls)
227{
228 int new;
229
230 /*
231 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
232 * dlm_recovery_stopped()) and prevents any new locks from being
233 * processed (see RUNNING, dlm_locking_stopped()).
234 */
235
236 spin_lock(&ls->ls_recover_lock);
237 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
238 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
239 ls->ls_recover_seq++;
240 spin_unlock(&ls->ls_recover_lock);
241
242 /*
243 * This in_recovery lock does two things:
244 *
245 * 1) Keeps this function from returning until all threads are out
246 * of locking routines and locking is truly stopped.
247 * 2) Keeps any new requests from being processed until it's unlocked
248 * when recovery is complete.
249 */
250
251 if (new)
252 down_write(&ls->ls_in_recovery);
253
254 /*
255 * The recoverd suspend/resume makes sure that dlm_recoverd (if
256 * running) has noticed the clearing of RUNNING above and quit
257 * processing the previous recovery. This will be true for all nodes
258 * before any nodes start the new recovery.
259 */
260
261 dlm_recoverd_suspend(ls);
262 ls->ls_recover_status = 0;
263 dlm_recoverd_resume(ls);
264 return 0;
265}
266
267int dlm_ls_start(struct dlm_ls *ls)
268{
269 struct dlm_recover *rv = NULL, *rv_old;
270 int *ids = NULL;
271 int error, count;
272
273 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
274 if (!rv)
275 return -ENOMEM;
276
277 error = count = dlm_nodeid_list(ls->ls_name, &ids);
278 if (error <= 0)
279 goto fail;
280
281 spin_lock(&ls->ls_recover_lock);
282
283 /* the lockspace needs to be stopped before it can be started */
284
285 if (!dlm_locking_stopped(ls)) {
286 spin_unlock(&ls->ls_recover_lock);
287 log_error(ls, "start ignored: lockspace running");
288 error = -EINVAL;
289 goto fail;
290 }
291
292 rv->nodeids = ids;
293 rv->node_count = count;
294 rv->seq = ++ls->ls_recover_seq;
295 rv_old = ls->ls_recover_args;
296 ls->ls_recover_args = rv;
297 spin_unlock(&ls->ls_recover_lock);
298
299 if (rv_old) {
300 kfree(rv_old->nodeids);
301 kfree(rv_old);
302 }
303
304 dlm_recoverd_kick(ls);
305 return 0;
306
307 fail:
308 kfree(rv);
309 kfree(ids);
310 return error;
311}
312
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..f7cf4589fae8
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,106 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
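
The kmalloc-plus-memset pairs in this file could use kzalloc(), which member.c in this same patch already does; a behavior-equivalent sketch for the LVB allocator:

	char *allocate_lvb(struct dlm_ls *ls)
	{
		/* kzalloc = kmalloc + zeroing in one call */
		return kzalloc(ls->ls_lvblen, GFP_KERNEL);
	}
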
47
48void free_lvb(char *p)
49{
50 kfree(p);
51}
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 kmem_cache_free(lkb_cache, lkb);
88}
89
90struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
91{
92 struct dlm_direntry *de;
93
94 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
95
96 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
97 if (de)
98 memset(de, 0, sizeof(*de) + namelen);
99 return de;
100}
101
102void free_direntry(struct dlm_direntry *de)
103{
104 kfree(de);
105}
106
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take packets from the "real" comms layer,
20 * split them up into complete messages and pass them to the interested
21 * part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
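
copy_from_cb() handles reads that wrap past the end of the circular buffer: the first memcpy takes what fits between offset and limit, the second takes the remainder from the start of the buffer. For example, with limit = 4096, offset = 4090 and len = 16, the first memcpy copies 6 bytes (4090..4095) and the second copies the remaining 10 bytes from base + 0.
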
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here; any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
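
The offset &= (limit - 1) step after each consumed message is the wrap-around, and it is only correct because limit is a power of two (PAGE_CACHE_SIZE here): e.g. with limit = 4096, an offset of 4100 masks to 4, exactly 4100 mod 4096.
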
140
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..55fbe313340e
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,457 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100
101 if (nodeid == dlm_our_nodeid()) {
102 rc = (struct dlm_rcom *) ls->ls_recover_buf;
103 rc->rc_result = dlm_recover_status(ls);
104 goto out;
105 }
106
107 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
108 if (error)
109 goto out;
110
111 send_rcom(ls, mh, rc);
112
113 error = dlm_wait_function(ls, &rcom_response);
114 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
115 if (error)
116 goto out;
117
118 rc = (struct dlm_rcom *) ls->ls_recover_buf;
119
120 if (rc->rc_result == -ESRCH) {
121 /* we pretend the remote lockspace exists with 0 status */
122 log_debug(ls, "remote node %d not ready", nodeid);
123 rc->rc_result = 0;
124 } else
125 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
126 nodeid);
127 /* the caller looks at rc_result for the remote recovery status */
128 out:
129 return error;
130}
131
132static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
133{
134 struct dlm_rcom *rc;
135 struct dlm_mhandle *mh;
136 int error, nodeid = rc_in->rc_header.h_nodeid;
137
138 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
139 sizeof(struct rcom_config), &rc, &mh);
140 if (error)
141 return;
142 rc->rc_result = dlm_recover_status(ls);
143 make_config(ls, (struct rcom_config *) rc->rc_buf);
144
145 send_rcom(ls, mh, rc);
146}
147
148static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
149{
150 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
151 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
152 wake_up(&ls->ls_wait_general);
153}
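
dlm_rcom_status() and this reply handler together form a synchronous request over the asynchronous comms layer: the requester sends RCOM_STATUS and blocks in dlm_wait_function() (defined in recover.c, later in this patch) until the reply has been copied into ls_recover_buf and LSFL_RCOM_READY is set. The handshake, condensed from the code above:

	/* requester, in dlm_rcom_status() */
	send_rcom(ls, mh, rc);
	error = dlm_wait_function(ls, &rcom_response);	/* sleeps on
							   ls_wait_general */
	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
	rc = (struct dlm_rcom *) ls->ls_recover_buf;	/* reply is here */

	/* receiver, in dlm_recvd context */
	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
	set_bit(LSFL_RCOM_READY, &ls->ls_flags);
	wake_up(&ls->ls_wait_general);
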
154
155int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
156{
157 struct dlm_rcom *rc;
158 struct dlm_mhandle *mh;
159 int error = 0, len = sizeof(struct dlm_rcom);
160
161 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
162
163 if (nodeid == dlm_our_nodeid()) {
164 dlm_copy_master_names(ls, last_name, last_len,
165 ls->ls_recover_buf + len,
166 dlm_config.buffer_size - len, nodeid);
167 goto out;
168 }
169
170 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
171 if (error)
172 goto out;
173 memcpy(rc->rc_buf, last_name, last_len);
174
175 send_rcom(ls, mh, rc);
176
177 error = dlm_wait_function(ls, &rcom_response);
178 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
179 out:
180 return error;
181}
182
183static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
184{
185 struct dlm_rcom *rc;
186 struct dlm_mhandle *mh;
187 int error, inlen, outlen;
188 int nodeid = rc_in->rc_header.h_nodeid;
189 uint32_t status = dlm_recover_status(ls);
190
191 /*
192 * We can't run dlm_copy_master_names (which uses ls_nodes) while
193 * dlm_recoverd is changing ls_nodes in dlm_recover_members().
194 * That could only happen in the rare case where we get a late NAMES
195 * message from a previous instance of recovery.
196 */
197
198 if (!(status & DLM_RS_NODES)) {
199 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
200 return;
201 }
202
203 nodeid = rc_in->rc_header.h_nodeid;
204 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
205 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
206
207 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
208 if (error)
209 return;
210
211 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
212 nodeid);
213 send_rcom(ls, mh, rc);
214}
215
216static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
217{
218 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
219 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
220 wake_up(&ls->ls_wait_general);
221}
222
223int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
224{
225 struct dlm_rcom *rc;
226 struct dlm_mhandle *mh;
227 struct dlm_ls *ls = r->res_ls;
228 int error;
229
230 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
231 &rc, &mh);
232 if (error)
233 goto out;
234 memcpy(rc->rc_buf, r->res_name, r->res_length);
235 rc->rc_id = (unsigned long) r;
236
237 send_rcom(ls, mh, rc);
238 out:
239 return error;
240}
241
242static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
243{
244 struct dlm_rcom *rc;
245 struct dlm_mhandle *mh;
246 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
247 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
248
249 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
250 if (error)
251 return;
252
253 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
254 if (error)
255 ret_nodeid = error;
256 rc->rc_result = ret_nodeid;
257 rc->rc_id = rc_in->rc_id;
258
259 send_rcom(ls, mh, rc);
260}
261
262static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
263{
264 dlm_recover_master_reply(ls, rc_in);
265}
266
267static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
268 struct rcom_lock *rl)
269{
270 memset(rl, 0, sizeof(*rl));
271
272 rl->rl_ownpid = lkb->lkb_ownpid;
273 rl->rl_lkid = lkb->lkb_id;
274 rl->rl_exflags = lkb->lkb_exflags;
275 rl->rl_flags = lkb->lkb_flags;
276 rl->rl_lvbseq = lkb->lkb_lvbseq;
277 rl->rl_rqmode = lkb->lkb_rqmode;
278 rl->rl_grmode = lkb->lkb_grmode;
279 rl->rl_status = lkb->lkb_status;
280 rl->rl_wait_type = lkb->lkb_wait_type;
281
282 if (lkb->lkb_bastaddr)
283 rl->rl_asts |= AST_BAST;
284 if (lkb->lkb_astaddr)
285 rl->rl_asts |= AST_COMP;
286
287 rl->rl_namelen = r->res_length;
288 memcpy(rl->rl_name, r->res_name, r->res_length);
289
290 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
291 If so, receive_rcom_lock_args() won't take this copy. */
292
293 if (lkb->lkb_lvbptr)
294 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
295}
296
297int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
298{
299 struct dlm_ls *ls = r->res_ls;
300 struct dlm_rcom *rc;
301 struct dlm_mhandle *mh;
302 struct rcom_lock *rl;
303 int error, len = sizeof(struct rcom_lock);
304
305 if (lkb->lkb_lvbptr)
306 len += ls->ls_lvblen;
307
308 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
309 if (error)
310 goto out;
311
312 rl = (struct rcom_lock *) rc->rc_buf;
313 pack_rcom_lock(r, lkb, rl);
314 rc->rc_id = (unsigned long) r;
315
316 send_rcom(ls, mh, rc);
317 out:
318 return error;
319}
320
321static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
322{
323 struct dlm_rcom *rc;
324 struct dlm_mhandle *mh;
325 int error, nodeid = rc_in->rc_header.h_nodeid;
326
327 dlm_recover_master_copy(ls, rc_in);
328
329 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
330 sizeof(struct rcom_lock), &rc, &mh);
331 if (error)
332 return;
333
334 /* We send back the same rcom_lock struct we received, but
335 dlm_recover_master_copy() has filled in rl_remid and rl_result */
336
337 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
338 rc->rc_id = rc_in->rc_id;
339
340 send_rcom(ls, mh, rc);
341}
342
343static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
344{
345 uint32_t status = dlm_recover_status(ls);
346
347 if (!(status & DLM_RS_DIR)) {
348 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
349 rc_in->rc_header.h_nodeid);
350 return;
351 }
352
353 dlm_recover_process_copy(ls, rc_in);
354}
355
356static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
357{
358 struct dlm_rcom *rc;
359 struct dlm_mhandle *mh;
360 char *mb;
361 int mb_len = sizeof(struct dlm_rcom);
362
363 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
364 if (!mh)
365 return -ENOBUFS;
366 memset(mb, 0, mb_len);
367
368 rc = (struct dlm_rcom *) mb;
369
370 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
371 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
372 rc->rc_header.h_nodeid = dlm_our_nodeid();
373 rc->rc_header.h_length = mb_len;
374 rc->rc_header.h_cmd = DLM_RCOM;
375
376 rc->rc_type = DLM_RCOM_STATUS_REPLY;
377 rc->rc_result = -ESRCH;
378
379 dlm_rcom_out(rc);
380 dlm_lowcomms_commit_buffer(mh);
381
382 return 0;
383}
384
385/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
386 recovery-only comms are sent through here. */
387
388void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
389{
390 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
391 struct dlm_ls *ls;
392
393 dlm_rcom_in(rc);
394
395 /* If the lockspace doesn't exist then still send a status message
396 back; it's possible that it just doesn't have its global_id yet. */
397
398 ls = dlm_find_lockspace_global(hd->h_lockspace);
399 if (!ls) {
400 log_print("lockspace %x from %d not found",
401 hd->h_lockspace, nodeid);
402 send_ls_not_ready(nodeid, rc);
403 return;
404 }
405
406 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
407 log_error(ls, "ignoring recovery message %x from %d",
408 rc->rc_type, nodeid);
409 goto out;
410 }
411
412 if (nodeid != rc->rc_header.h_nodeid) {
413 log_error(ls, "bad rcom nodeid %d from %d",
414 rc->rc_header.h_nodeid, nodeid);
415 goto out;
416 }
417
418 switch (rc->rc_type) {
419 case DLM_RCOM_STATUS:
420 receive_rcom_status(ls, rc);
421 break;
422
423 case DLM_RCOM_NAMES:
424 receive_rcom_names(ls, rc);
425 break;
426
427 case DLM_RCOM_LOOKUP:
428 receive_rcom_lookup(ls, rc);
429 break;
430
431 case DLM_RCOM_LOCK:
432 receive_rcom_lock(ls, rc);
433 break;
434
435 case DLM_RCOM_STATUS_REPLY:
436 receive_rcom_status_reply(ls, rc);
437 break;
438
439 case DLM_RCOM_NAMES_REPLY:
440 receive_rcom_names_reply(ls, rc);
441 break;
442
443 case DLM_RCOM_LOOKUP_REPLY:
444 receive_rcom_lookup_reply(ls, rc);
445 break;
446
447 case DLM_RCOM_LOCK_REPLY:
448 receive_rcom_lock_reply(ls, rc);
449 break;
450
451 default:
452 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
453 }
454 out:
455 dlm_put_lockspace(ls);
456}
457
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..34876f60f298
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,762 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, it should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
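
wait_status() above depends on each DLM_RS_X_ALL flag being defined as the
corresponding DLM_RS_X flag shifted left by one, which is why the single line
`status_all = status << 1` works for every recovery phase. A minimal
standalone check of that pairing, using flag values assumed to match
dlm_internal.h (which is not part of this hunk):

    #include <assert.h>

    /* Assumed layout: every _ALL bit sits one position above its partner. */
    #define DLM_RS_NODES      0x00000001
    #define DLM_RS_NODES_ALL  0x00000002
    #define DLM_RS_DIR        0x00000004
    #define DLM_RS_DIR_ALL    0x00000008
    #define DLM_RS_LOCKS      0x00000010
    #define DLM_RS_LOCKS_ALL  0x00000020
    #define DLM_RS_DONE       0x00000040
    #define DLM_RS_DONE_ALL   0x00000080

    int main(void)
    {
        /* The invariant wait_status() relies on: X_ALL == X << 1. */
        assert(DLM_RS_NODES_ALL == DLM_RS_NODES << 1);
        assert(DLM_RS_DIR_ALL   == DLM_RS_DIR   << 1);
        assert(DLM_RS_LOCKS_ALL == DLM_RS_LOCKS << 1);
        assert(DLM_RS_DONE_ALL  == DLM_RS_DONE  << 1);
        return 0;
    }
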
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
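
The pointer-as-cookie pattern described above shows up concretely in
dlm_send_rcom_lookup()/dlm_send_rcom_lock() (rc_id = (unsigned long) r) and
in recover_list_find() below. A userspace sketch of the idea, with
hypothetical names; no table lookup or separate id allocator is needed:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for an rsb awaiting a directory reply. */
    struct rsb { const char *name; };

    /* Mirrors recover_list_find(): the wire id is just the rsb address. */
    static struct rsb *find_by_id(struct rsb **list, int n, uint64_t id)
    {
        for (int i = 0; i < n; i++)
            if (id == (unsigned long) list[i])
                return list[i];
        return NULL;
    }

    int main(void)
    {
        struct rsb a = { "alpha" }, b = { "beta" };
        struct rsb *pending[] = { &a, &b };

        uint64_t id = (unsigned long) &b;    /* what the send side puts in rc_id */
        struct rsb *r = find_by_id(pending, 2, id);
        printf("reply is for %s\n", r ? r->name : "(unknown)");
        return 0;
    }
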
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
306 * Propagate the new master nodeid to locks
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
309 */
310
311static void set_new_master(struct dlm_rsb *r, int nodeid)
312{
313 lock_rsb(r);
314 r->res_nodeid = nodeid;
315 set_master_lkbs(r);
316 rsb_set_flag(r, RSB_NEW_MASTER);
317 rsb_set_flag(r, RSB_NEW_MASTER2);
318 unlock_rsb(r);
319}
320
321/*
322 * We do async lookups on rsb's that need new masters. The rsb's
323 * waiting for a lookup reply are kept on the recover_list.
324 */
325
326static int recover_master(struct dlm_rsb *r)
327{
328 struct dlm_ls *ls = r->res_ls;
329 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
330
331 dir_nodeid = dlm_dir_nodeid(r);
332
333 if (dir_nodeid == our_nodeid) {
334 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
335 r->res_length, &ret_nodeid);
336 if (error)
337 log_error(ls, "recover dir lookup error %d", error);
338
339 if (ret_nodeid == our_nodeid)
340 ret_nodeid = 0;
341 set_new_master(r, ret_nodeid);
342 } else {
343 recover_list_add(r);
344 error = dlm_send_rcom_lookup(r, dir_nodeid);
345 }
346
347 return error;
348}
349
350/*
351 * When not using a directory, most resource names will hash to a new static
352 * master nodeid and the resource will need to be remastered.
353 */
354
355static int recover_master_static(struct dlm_rsb *r)
356{
357 int master = dlm_dir_nodeid(r);
358
359 if (master == dlm_our_nodeid())
360 master = 0;
361
362 if (r->res_nodeid != master) {
363 if (is_master(r))
364 dlm_purge_mstcpy_locks(r);
365 set_new_master(r, master);
366 return 1;
367 }
368 return 0;
369}
370
371/*
372 * Go through the local root resources and, for each rsb whose master has
373 * departed, get the new master nodeid from the directory.  The dir will
374 * assign mastery to the first node to look up the new master. That means
375 * we'll discover in this lookup if we're the new master of any rsb's.
376 *
377 * We fire off all the dir lookup requests individually and asynchronously to
378 * the correct dir node.
379 */
380
381int dlm_recover_masters(struct dlm_ls *ls)
382{
383 struct dlm_rsb *r;
384 int error = 0, count = 0;
385
386 log_debug(ls, "dlm_recover_masters");
387
388 down_read(&ls->ls_root_sem);
389 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
390 if (dlm_recovery_stopped(ls)) {
391 up_read(&ls->ls_root_sem);
392 error = -EINTR;
393 goto out;
394 }
395
396 if (dlm_no_directory(ls))
397 count += recover_master_static(r);
398 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
399 recover_master(r);
400 count++;
401 }
402
403 schedule();
404 }
405 up_read(&ls->ls_root_sem);
406
407 log_debug(ls, "dlm_recover_masters %d resources", count);
408
409 error = dlm_wait_function(ls, &recover_list_empty);
410 out:
411 if (error)
412 recover_list_clear(ls);
413 return error;
414}
415
416int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
417{
418 struct dlm_rsb *r;
419 int nodeid;
420
421 r = recover_list_find(ls, rc->rc_id);
422 if (!r) {
423 log_error(ls, "dlm_recover_master_reply no id %llx",
424 (unsigned long long)rc->rc_id);
425 goto out;
426 }
427
428 nodeid = rc->rc_result;
429 if (nodeid == dlm_our_nodeid())
430 nodeid = 0;
431
432 set_new_master(r, nodeid);
433 recover_list_del(r);
434
435 if (recover_list_empty(ls))
436 wake_up(&ls->ls_wait_general);
437 out:
438 return 0;
439}
440
441
442/* Lock recovery: rebuild the process-copy locks we hold on a
443 remastered rsb on the new rsb master.
444
445 dlm_recover_locks
446 recover_locks
447 recover_locks_queue
448 dlm_send_rcom_lock -> receive_rcom_lock
449 dlm_recover_master_copy
450 receive_rcom_lock_reply <-
451 dlm_recover_process_copy
452*/
453
454
455/*
456 * keep a count of the number of lkb's we send to the new master; when we get
457 * an equal number of replies then recovery for the rsb is done
458 */
459
460static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
461{
462 struct dlm_lkb *lkb;
463 int error = 0;
464
465 list_for_each_entry(lkb, head, lkb_statequeue) {
466 error = dlm_send_rcom_lock(r, lkb);
467 if (error)
468 break;
469 r->res_recover_locks_count++;
470 }
471
472 return error;
473}
474
475static int all_queues_empty(struct dlm_rsb *r)
476{
477 if (!list_empty(&r->res_grantqueue) ||
478 !list_empty(&r->res_convertqueue) ||
479 !list_empty(&r->res_waitqueue))
480 return 0;
481 return 1;
482}
483
484static int recover_locks(struct dlm_rsb *r)
485{
486 int error = 0;
487
488 lock_rsb(r);
489 if (all_queues_empty(r))
490 goto out;
491
492 DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););
493
494 error = recover_locks_queue(r, &r->res_grantqueue);
495 if (error)
496 goto out;
497 error = recover_locks_queue(r, &r->res_convertqueue);
498 if (error)
499 goto out;
500 error = recover_locks_queue(r, &r->res_waitqueue);
501 if (error)
502 goto out;
503
504 if (r->res_recover_locks_count)
505 recover_list_add(r);
506 else
507 rsb_clear_flag(r, RSB_NEW_MASTER);
508 out:
509 unlock_rsb(r);
510 return error;
511}
512
513int dlm_recover_locks(struct dlm_ls *ls)
514{
515 struct dlm_rsb *r;
516 int error, count = 0;
517
518 log_debug(ls, "dlm_recover_locks");
519
520 down_read(&ls->ls_root_sem);
521 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
522 if (is_master(r)) {
523 rsb_clear_flag(r, RSB_NEW_MASTER);
524 continue;
525 }
526
527 if (!rsb_flag(r, RSB_NEW_MASTER))
528 continue;
529
530 if (dlm_recovery_stopped(ls)) {
531 error = -EINTR;
532 up_read(&ls->ls_root_sem);
533 goto out;
534 }
535
536 error = recover_locks(r);
537 if (error) {
538 up_read(&ls->ls_root_sem);
539 goto out;
540 }
541
542 count += r->res_recover_locks_count;
543 }
544 up_read(&ls->ls_root_sem);
545
546 log_debug(ls, "dlm_recover_locks %d locks", count);
547
548 error = dlm_wait_function(ls, &recover_list_empty);
549 out:
550 if (error)
551 recover_list_clear(ls);
552 else
553 dlm_set_recover_status(ls, DLM_RS_LOCKS);
554 return error;
555}
556
557void dlm_recovered_lock(struct dlm_rsb *r)
558{
559 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
560
561 r->res_recover_locks_count--;
562 if (!r->res_recover_locks_count) {
563 rsb_clear_flag(r, RSB_NEW_MASTER);
564 recover_list_del(r);
565 }
566
567 if (recover_list_empty(r->res_ls))
568 wake_up(&r->res_ls->ls_wait_general);
569}
570
571/*
572 * The lvb needs to be recovered on all master rsb's. This includes setting
573 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
574 * based on the lvb's of the locks held on the rsb.
575 *
576 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
577 * was already set prior to recovery, it's not cleared, regardless of locks.
578 *
579 * The LVB contents are only considered for changing when this is a new master
580 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
581 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
582 * from the lkb with the largest lvb sequence number.
583 */
584
585static void recover_lvb(struct dlm_rsb *r)
586{
587 struct dlm_lkb *lkb, *high_lkb = NULL;
588 uint32_t high_seq = 0;
589 int lock_lvb_exists = 0;
590 int big_lock_exists = 0;
591 int lvblen = r->res_ls->ls_lvblen;
592
593 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
594 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
595 continue;
596
597 lock_lvb_exists = 1;
598
599 if (lkb->lkb_grmode > DLM_LOCK_CR) {
600 big_lock_exists = 1;
601 goto setflag;
602 }
603
604 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
605 high_lkb = lkb;
606 high_seq = lkb->lkb_lvbseq;
607 }
608 }
609
610 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
611 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
612 continue;
613
614 lock_lvb_exists = 1;
615
616 if (lkb->lkb_grmode > DLM_LOCK_CR) {
617 big_lock_exists = 1;
618 goto setflag;
619 }
620
621 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
622 high_lkb = lkb;
623 high_seq = lkb->lkb_lvbseq;
624 }
625 }
626
627 setflag:
628 if (!lock_lvb_exists)
629 goto out;
630
631 if (!big_lock_exists)
632 rsb_set_flag(r, RSB_VALNOTVALID);
633
634 /* don't mess with the lvb unless we're the new master */
635 if (!rsb_flag(r, RSB_NEW_MASTER2))
636 goto out;
637
638 if (!r->res_lvbptr) {
639 r->res_lvbptr = allocate_lvb(r->res_ls);
640 if (!r->res_lvbptr)
641 goto out;
642 }
643
644 if (big_lock_exists) {
645 r->res_lvbseq = lkb->lkb_lvbseq;
646 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
647 } else if (high_lkb) {
648 r->res_lvbseq = high_lkb->lkb_lvbseq;
649 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
650 } else {
651 r->res_lvbseq = 0;
652 memset(r->res_lvbptr, 0, lvblen);
653 }
654 out:
655 return;
656}
657
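
recover_lvb() above compares lvb sequence numbers with
`((int)lkb->lkb_lvbseq - (int)high_seq) >= 0` rather than a plain `>=`. My
reading is that the subtraction form keeps the ordering correct across
uint32_t wraparound, in the style of RFC 1982 serial-number arithmetic. A
small standalone illustration with hypothetical values (the cast-the-difference
variant below is equivalent in effect on two's-complement machines):

    #include <stdint.h>
    #include <stdio.h>

    /* 1 if a is at or after b in wraparound-safe sequence order. */
    static int seq_after_eq(uint32_t a, uint32_t b)
    {
        return (int32_t)(a - b) >= 0;
    }

    int main(void)
    {
        uint32_t old = 0xfffffffe;   /* just before the counter wraps */
        uint32_t new = 0x00000001;   /* just after it wraps */

        printf("plain compare:  %d\n", new >= old);             /* 0: misorders */
        printf("serial compare: %d\n", seq_after_eq(new, old)); /* 1: correct   */
        return 0;
    }
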
658/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
659 converting PR->CW or CW->PR need to have their lkb_grmode set. */
660
661static void recover_conversion(struct dlm_rsb *r)
662{
663 struct dlm_lkb *lkb;
664 int grmode = -1;
665
666 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
667 if (lkb->lkb_grmode == DLM_LOCK_PR ||
668 lkb->lkb_grmode == DLM_LOCK_CW) {
669 grmode = lkb->lkb_grmode;
670 break;
671 }
672 }
673
674 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
675 if (lkb->lkb_grmode != DLM_LOCK_IV)
676 continue;
677 if (grmode == -1)
678 lkb->lkb_grmode = lkb->lkb_rqmode;
679 else
680 lkb->lkb_grmode = grmode;
681 }
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 recover_lvb(r);
698 count++;
699 }
700 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
701 unlock_rsb(r);
702 }
703 up_read(&ls->ls_root_sem);
704
705 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
706}
707
708/* Create a single list of all root rsb's to be used during recovery */
709
710int dlm_create_root_list(struct dlm_ls *ls)
711{
712 struct dlm_rsb *r;
713 int i, error = 0;
714
715 down_write(&ls->ls_root_sem);
716 if (!list_empty(&ls->ls_root_list)) {
717 log_error(ls, "root list not empty");
718 error = -EINVAL;
719 goto out;
720 }
721
722 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
723 read_lock(&ls->ls_rsbtbl[i].lock);
724 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
725 list_add(&r->res_root_list, &ls->ls_root_list);
726 dlm_hold_rsb(r);
727 }
728 read_unlock(&ls->ls_rsbtbl[i].lock);
729 }
730 out:
731 up_write(&ls->ls_root_sem);
732 return error;
733}
734
735void dlm_release_root_list(struct dlm_ls *ls)
736{
737 struct dlm_rsb *r, *safe;
738
739 down_write(&ls->ls_root_sem);
740 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
741 list_del_init(&r->res_root_list);
742 dlm_put_rsb(r);
743 }
744 up_write(&ls->ls_root_sem);
745}
746
747void dlm_clear_toss_list(struct dlm_ls *ls)
748{
749 struct dlm_rsb *r, *safe;
750 int i;
751
752 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
753 write_lock(&ls->ls_rsbtbl[i].lock);
754 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
755 res_hashchain) {
756 list_del(&r->res_hashchain);
757 free_rsb(r);
758 }
759 write_unlock(&ls->ls_rsbtbl[i].lock);
760 }
761}
762
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..70103533677d
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,285 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48	log_debug(ls, "recover %llx", (unsigned long long)rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202	log_debug(ls, "recover %llx done: %u ms", (unsigned long long)rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210	log_debug(ls, "recover %llx error %d", (unsigned long long)rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237
238 while (!kthread_should_stop()) {
239 set_current_state(TASK_INTERRUPTIBLE);
240 if (!test_bit(LSFL_WORK, &ls->ls_flags))
241 schedule();
242 set_current_state(TASK_RUNNING);
243
244 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
245 do_ls_recovery(ls);
246 }
247
248 dlm_put_lockspace(ls);
249 return 0;
250}
251
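
The loop in dlm_recoverd() above follows the usual kernel idiom for sleeping
on a flag bit without losing a wakeup: the thread marks itself
TASK_INTERRUPTIBLE *before* testing LSFL_WORK. A condensed, annotated
restatement of the two sides as they appear in this file (not additional
patch content):

    /* Sleeper (dlm_recoverd): the ordering is what matters.            */
    set_current_state(TASK_INTERRUPTIBLE);    /* 1: publish "about to sleep" */
    if (!test_bit(LSFL_WORK, &ls->ls_flags))  /* 2: then check for work      */
        schedule();                           /* 3: no work -> really sleep  */
    set_current_state(TASK_RUNNING);

    /* Waker (dlm_recoverd_kick): the mirror image.                     */
    set_bit(LSFL_WORK, &ls->ls_flags);        /* 1: make the work visible    */
    wake_up_process(ls->ls_recoverd_task);    /* 2: then wake                */

If the kick fires before the sleeper's step 1, test_bit sees the flag and
schedule() is skipped; if it fires between steps 2 and 3, wake_up_process()
has already put the task back in TASK_RUNNING, so schedule() returns
promptly instead of sleeping on stale state.
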
252void dlm_recoverd_kick(struct dlm_ls *ls)
253{
254 set_bit(LSFL_WORK, &ls->ls_flags);
255 wake_up_process(ls->ls_recoverd_task);
256}
257
258int dlm_recoverd_start(struct dlm_ls *ls)
259{
260 struct task_struct *p;
261 int error = 0;
262
263 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
264 if (IS_ERR(p))
265 error = PTR_ERR(p);
266 else
267 ls->ls_recoverd_task = p;
268 return error;
269}
270
271void dlm_recoverd_stop(struct dlm_ls *ls)
272{
273 kthread_stop(ls->ls_recoverd_task);
274}
275
276void dlm_recoverd_suspend(struct dlm_ls *ls)
277{
278 mutex_lock(&ls->ls_recoverd_active);
279}
280
281void dlm_recoverd_resume(struct dlm_ls *ls)
282{
283 mutex_unlock(&ls->ls_recoverd_active);
284}
285
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
25
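
struct rq_entry uses the pre-C99 "struct hack": request[1] is a one-byte
placeholder, and dlm_add_requestqueue() below allocates
sizeof(struct rq_entry) + length bytes so the copied message overlays the
tail of the allocation (over-allocating by the one byte request[1] already
contributes). A userspace sketch of the same idiom with hypothetical names;
with C99, a flexible array member (char request[];) would avoid the spare
byte:

    #include <stdlib.h>
    #include <string.h>

    struct entry {
        int nodeid;
        char request[1];    /* struct hack: real data extends past here */
    };

    static struct entry *save_request(int nodeid, const void *msg, size_t len)
    {
        /* One byte more than strictly needed, since request[1] already
           counts toward sizeof(struct entry). */
        struct entry *e = malloc(sizeof(struct entry) + len);
        if (!e)
            return NULL;
        e->nodeid = nodeid;
        memcpy(e->request, msg, len);
        return e;
    }

    int main(void)
    {
        const char msg[] = "hello";
        struct entry *e = save_request(3, msg, sizeof(msg));
        free(e);
        return 0;
    }
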
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43		log_print("dlm_add_requestqueue: out of memory");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as dlm_recvd would have done.  At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
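
dlm_message_out()/dlm_message_in() (and the rcom pair) are in-place inverses:
the sender converts to little-endian wire order just before transmit, the
receiver converts back just after receipt, and on a little-endian host every
swap compiles to a no-op. A standalone round-trip check on one field,
assuming a glibc-style <endian.h> in place of the kernel's byteorder helpers:

    #include <endian.h>
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t m_type = 7;              /* CPU order, as the caller fills it */

        uint32_t wire = htole32(m_type);  /* what the *_out() helpers do */
        uint32_t back = le32toh(wire);    /* what the *_in() helpers do  */

        assert(back == m_type);           /* out followed by in is the identity */
        return 0;
    }
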
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..115f30d8c22e
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL
4 select FS_POSIX_ACL
5 help
6 A cluster filesystem.
7
8 Allows a cluster of computers to simultaneously use a block device
9	  that is shared between them (with FC, iSCSI, NBD, etc.).  GFS reads
10 and writes to the block device like a local filesystem, but also uses
11	  a lock module to allow the computers to coordinate their I/O so
12 filesystem consistency is maintained. One of the nifty features of
13 GFS is perfect consistency -- changes made to the filesystem on one
14 machine show up immediately on all other machines in the cluster.
15
16 To use the GFS2 filesystem, you will need to enable one or more of
17 the below locking modules. Documentation and utilities for GFS2 can
18 be found here: http://sources.redhat.com/cluster/gfs/
19
20config GFS2_FS_LOCKING_NOLOCK
21 tristate "GFS2 \"nolock\" locking module"
22 depends on GFS2_FS
23 help
24 Single node locking module for GFS2.
25
26 Use this module if you want to use GFS2 on a single node without
27 its clustering features. You can still take advantage of the
28 large file support, and upgrade to running a full cluster later on
29 if required.
30
31 If you will only be using GFS2 in cluster mode, you do not need this
32 module.
33
34config GFS2_FS_LOCKING_DLM
35 tristate "GFS2 DLM locking module"
36 depends on GFS2_FS
37 select DLM
38 help
39 Multiple node locking module for GFS2
40
41 Most users of GFS2 will require this module. It provides the locking
42 interface between GFS2 and the DLM, which is required to use GFS2
43 in a cluster environment.
44
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..9974201aa16c
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o lvb.o main.o meta_io.o \
4 mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o ops_vm.o page.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o unlinked.o util.o
7
8obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
9obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
10
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..343dbe3e87bb
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,315 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "trans.h"
29#include "util.h"
30
31#define ACL_ACCESS 1
32#define ACL_DEFAULT 0
33
34int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
35 struct gfs2_ea_request *er,
36 int *remove, mode_t *mode)
37{
38 struct posix_acl *acl;
39 int error;
40
41 error = gfs2_acl_validate_remove(ip, access);
42 if (error)
43 return error;
44
45 if (!er->er_data)
46 return -EINVAL;
47
48 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
49 if (IS_ERR(acl))
50 return PTR_ERR(acl);
51 if (!acl) {
52 *remove = 1;
53 return 0;
54 }
55
56 error = posix_acl_valid(acl);
57 if (error)
58 goto out;
59
60 if (access) {
61 error = posix_acl_equiv_mode(acl, mode);
62 if (!error)
63 *remove = 1;
64 else if (error > 0)
65 error = 0;
66 }
67
68 out:
69 posix_acl_release(acl);
70
71 return error;
72}
73
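
The `error > 0` branch in gfs2_acl_validate_set() above tracks the
posix_acl_equiv_mode() return convention, which is worth spelling out since
all three outcomes are used (my reading of the VFS ACL code; verify against
fs/posix_acl.c):

    /* ret = posix_acl_equiv_mode(acl, &mode);
     *   ret == 0 : the ACL is fully expressible as mode bits; the xattr
     *              can be dropped (*remove = 1 above)
     *   ret  > 0 : the ACL carries entries beyond the mode bits; keep it
     *              and report success (error reset to 0 above)
     *   ret  < 0 : failure; propagate the error
     */
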
74int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
75{
76 if (!ip->i_sbd->sd_args.ar_posix_acl)
77 return -EOPNOTSUPP;
78 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
79 return -EPERM;
80 if (S_ISLNK(ip->i_di.di_mode))
81 return -EOPNOTSUPP;
82 if (!access && !S_ISDIR(ip->i_di.di_mode))
83 return -EACCES;
84
85 return 0;
86}
87
88static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
89 struct gfs2_ea_location *el, char **data, unsigned int *len)
90{
91 struct gfs2_ea_request er;
92 struct gfs2_ea_location el_this;
93 int error;
94
95 if (!ip->i_di.di_eattr)
96 return 0;
97
98 memset(&er, 0, sizeof(struct gfs2_ea_request));
99 if (access) {
100 er.er_name = GFS2_POSIX_ACL_ACCESS;
101 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
102 } else {
103 er.er_name = GFS2_POSIX_ACL_DEFAULT;
104 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
105 }
106 er.er_type = GFS2_EATYPE_SYS;
107
108 if (!el)
109 el = &el_this;
110
111 error = gfs2_ea_find(ip, &er, el);
112 if (error)
113 return error;
114 if (!el->el_ea)
115 return 0;
116 if (!GFS2_EA_DATA_LEN(el->el_ea))
117 goto out;
118
119 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
120 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
121 error = -ENOMEM;
122 if (!er.er_data)
123 goto out;
124
125 error = gfs2_ea_get_copy(ip, el, er.er_data);
126 if (error)
127 goto out_kfree;
128
129 if (acl) {
130 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
131 if (IS_ERR(*acl))
132 error = PTR_ERR(*acl);
133 }
134
135 out_kfree:
136 if (error || !data)
137 kfree(er.er_data);
138 else {
139 *data = er.er_data;
140 *len = er.er_data_len;
141 }
142
143 out:
144 if (error || el == &el_this)
145 brelse(el->el_bh);
146
147 return error;
148}
149
150/**
151 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
152 * @inode: the file we want to do something to
153 * @mask: what we want to do
154 *
155 * Returns: errno
156 */
157
158int gfs2_check_acl_locked(struct inode *inode, int mask)
159{
160 struct posix_acl *acl = NULL;
161 int error;
162
163 error = acl_get(inode->u.generic_ip, ACL_ACCESS, &acl, NULL, NULL, NULL);
164 if (error)
165 return error;
166
167 if (acl) {
168 error = posix_acl_permission(inode, acl, mask);
169 posix_acl_release(acl);
170 return error;
171 }
172
173 return -EAGAIN;
174}
175
176int gfs2_check_acl(struct inode *inode, int mask)
177{
178 struct gfs2_inode *ip = inode->u.generic_ip;
179 struct gfs2_holder i_gh;
180 int error;
181
182 error = gfs2_glock_nq_init(ip->i_gl,
183 LM_ST_SHARED, LM_FLAG_ANY,
184 &i_gh);
185 if (!error) {
186 error = gfs2_check_acl_locked(inode, mask);
187 gfs2_glock_dq_uninit(&i_gh);
188 }
189
190 return error;
191}
192
193static int munge_mode(struct gfs2_inode *ip, mode_t mode)
194{
195 struct gfs2_sbd *sdp = ip->i_sbd;
196 struct buffer_head *dibh;
197 int error;
198
199 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
200 if (error)
201 return error;
202
203 error = gfs2_meta_inode_buffer(ip, &dibh);
204 if (!error) {
205 gfs2_assert_withdraw(sdp,
206 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
207 ip->i_di.di_mode = mode;
208 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
209 gfs2_dinode_out(&ip->i_di, dibh->b_data);
210 brelse(dibh);
211 }
212
213 gfs2_trans_end(sdp);
214
215 return 0;
216}
217
218int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
219{
220 struct gfs2_sbd *sdp = dip->i_sbd;
221 struct posix_acl *acl = NULL, *clone;
222 struct gfs2_ea_request er;
223 mode_t mode = ip->i_di.di_mode;
224 int error;
225
226 if (!sdp->sd_args.ar_posix_acl)
227 return 0;
228 if (S_ISLNK(ip->i_di.di_mode))
229 return 0;
230
231 memset(&er, 0, sizeof(struct gfs2_ea_request));
232 er.er_type = GFS2_EATYPE_SYS;
233
234 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
235 &er.er_data, &er.er_data_len);
236 if (error)
237 return error;
238 if (!acl) {
239 mode &= ~current->fs->umask;
240 if (mode != ip->i_di.di_mode)
241 error = munge_mode(ip, mode);
242 return error;
243 }
244
245 clone = posix_acl_clone(acl, GFP_KERNEL);
246 error = -ENOMEM;
247 if (!clone)
248 goto out;
249 posix_acl_release(acl);
250 acl = clone;
251
252 if (S_ISDIR(ip->i_di.di_mode)) {
253 er.er_name = GFS2_POSIX_ACL_DEFAULT;
254 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
255 error = gfs2_system_eaops.eo_set(ip, &er);
256 if (error)
257 goto out;
258 }
259
260 error = posix_acl_create_masq(acl, &mode);
261 if (error < 0)
262 goto out;
263 if (error > 0) {
264 er.er_name = GFS2_POSIX_ACL_ACCESS;
265 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
266 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
267 er.er_mode = mode;
268 er.er_flags = GFS2_ERF_MODE;
269 error = gfs2_system_eaops.eo_set(ip, &er);
270 if (error)
271 goto out;
272 } else
273 munge_mode(ip, mode);
274
275 out:
276 posix_acl_release(acl);
277 kfree(er.er_data);
278 return error;
279}
280
281int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
282{
283 struct posix_acl *acl = NULL, *clone;
284 struct gfs2_ea_location el;
285 char *data;
286 unsigned int len;
287 int error;
288
289 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
290 if (error)
291 return error;
292 if (!acl)
293 return gfs2_setattr_simple(ip, attr);
294
295 clone = posix_acl_clone(acl, GFP_KERNEL);
296 error = -ENOMEM;
297 if (!clone)
298 goto out;
299 posix_acl_release(acl);
300 acl = clone;
301
302 error = posix_acl_chmod_masq(acl, attr->ia_mode);
303 if (!error) {
304 posix_acl_to_xattr(acl, data, len);
305 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
306 }
307
308 out:
309 posix_acl_release(acl);
310 brelse(el.el_bh);
311 kfree(data);
312
313 return error;
314}
315
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..067105786eaa
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,37 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
14#define GFS2_POSIX_ACL_ACCESS_LEN 16
15#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
16#define GFS2_POSIX_ACL_DEFAULT_LEN 17
17
18#define GFS2_ACL_IS_ACCESS(name, len) \
19 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
20 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
21
22#define GFS2_ACL_IS_DEFAULT(name, len) \
23 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
24 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
25
26struct gfs2_ea_request;
27
28int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
29 struct gfs2_ea_request *er,
30 int *remove, mode_t *mode);
31int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
32int gfs2_check_acl_locked(struct inode *inode, int mask);
33int gfs2_check_acl(struct inode *inode, int mask);
34int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
35int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
36
37#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..41abd3f4fc73
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1103 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "page.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "trans.h"
29#include "dir.h"
30#include "util.h"
31
32/* This doesn't need to be that large as max 64 bit pointers in a 4k
33 * block is 512, so __u16 is fine for that. It saves stack space to
34 * keep it small.
35 */
36struct metapath {
37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
38};
39
40typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
41 struct buffer_head *bh, uint64_t *top,
42 uint64_t *bottom, unsigned int height,
43 void *data);
44
45struct strip_mine {
46 int sm_first;
47 unsigned int sm_height;
48};
49
50/**
51 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
52 * @ip: The GFS2 inode to unstuff
53 * @unstuffer: the routine that handles unstuffing a non-zero length file
54 * @private: private data for the unstuffer
55 *
56 * This routine unstuffs a dinode and returns it to a "normal" state such
57 * that the height can be grown in the traditional way.
58 *
59 * Returns: errno
60 */
61
62int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
63 void *private)
64{
65 struct buffer_head *bh, *dibh;
66 uint64_t block = 0;
67 int isdir = gfs2_is_dir(ip);
68 int error;
69
70 down_write(&ip->i_rw_mutex);
71
72 error = gfs2_meta_inode_buffer(ip, &dibh);
73 if (error)
74 goto out;
75
76 if (ip->i_di.di_size) {
77 /* Get a free block, fill it with the stuffed data,
78 and write it out to disk */
79
80 if (isdir) {
81 block = gfs2_alloc_meta(ip);
82
83 error = gfs2_dir_get_new_buffer(ip, block, &bh);
84 if (error)
85 goto out_brelse;
86 gfs2_buffer_copy_tail(bh,
87 sizeof(struct gfs2_meta_header),
88 dibh, sizeof(struct gfs2_dinode));
89 brelse(bh);
90 } else {
91 block = gfs2_alloc_data(ip);
92
93 error = unstuffer(ip, dibh, block, private);
94 if (error)
95 goto out_brelse;
96 }
97 }
98
99 /* Set up the pointer to the new block */
100
101 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
102
103 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
104
105 if (ip->i_di.di_size) {
106 *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) =
107 cpu_to_be64(block);
108 ip->i_di.di_blocks++;
109 }
110
111 ip->i_di.di_height = 1;
112
113 gfs2_dinode_out(&ip->i_di, dibh->b_data);
114
115 out_brelse:
116 brelse(dibh);
117
118 out:
119 up_write(&ip->i_rw_mutex);
120
121 return error;
122}
123
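/*
 * For a sense of scale: a stuffed file keeps its data in the dinode block
 * itself, so it can hold at most sb_bsize - sizeof(struct gfs2_dinode)
 * bytes (a bit under 4KiB on a 4KiB-block filesystem; figures here are
 * illustrative). do_grow() below performs exactly this check before
 * calling gfs2_unstuff_dinode().
 */
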
124/**
125 * calc_tree_height - Calculate the height of a metadata tree
126 * @ip: The GFS2 inode
127 * @size: The proposed size of the file
128 *
129 * Work out how tall a metadata tree needs to be in order to accommodate a
130 * file of a particular size. If size is less than the current size of
131 * the inode, then the current size of the inode is used instead of the
132 * supplied one.
133 *
134 * Returns: the height the tree should be
135 */
136
137static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
138{
139 struct gfs2_sbd *sdp = ip->i_sbd;
140 uint64_t *arr;
141 unsigned int max, height;
142
143 if (ip->i_di.di_size > size)
144 size = ip->i_di.di_size;
145
146 if (gfs2_is_dir(ip)) {
147 arr = sdp->sd_jheightsize;
148 max = sdp->sd_max_jheight;
149 } else {
150 arr = sdp->sd_heightsize;
151 max = sdp->sd_max_height;
152 }
153
154 for (height = 0; height < max; height++)
155 if (arr[height] >= size)
156 break;
157
158 return height;
159}
160
161/**
162 * build_height - Build a metadata tree of the requested height
163 * @inode: The inode
164 * @height: The height to build to
165 *
166 *
167 * Returns: errno
168 */
169
170static int build_height(struct inode *inode, unsigned height)
171{
172 struct gfs2_inode *ip = inode->u.generic_ip;
173 unsigned new_height = height - ip->i_di.di_height;
174 struct buffer_head *dibh;
175 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
176 int error;
177 u64 *bp;
178 u64 bn;
179 unsigned n;
180
181 if (height <= ip->i_di.di_height)
182 return 0;
183
184 error = gfs2_meta_inode_buffer(ip, &dibh);
185 if (error)
186 return error;
187
188 for(n = 0; n < new_height; n++) {
189 bn = gfs2_alloc_meta(ip);
190 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
191 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
192 }
193
194 n = 0;
195 bn = blocks[0]->b_blocknr;
196 if (new_height > 1) {
197 for(; n < new_height-1; n++) {
198 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
199 GFS2_FORMAT_IN);
200 gfs2_buffer_clear_tail(blocks[n],
201 sizeof(struct gfs2_meta_header));
202 bp = (u64 *)(blocks[n]->b_data +
203 sizeof(struct gfs2_meta_header));
204 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
205 brelse(blocks[n]);
206 blocks[n] = NULL;
207 }
208 }
209 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
210 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
211 dibh, sizeof(struct gfs2_dinode));
212 brelse(blocks[n]);
213 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
214 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
215 bp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
216 *bp = cpu_to_be64(bn);
217 ip->i_di.di_height += new_height;
218 ip->i_di.di_blocks += new_height;
219 gfs2_dinode_out(&ip->i_di, dibh->b_data);
220 brelse(dibh);
221 return error;
222}
223
224/**
225 * find_metapath - Find path through the metadata tree
226 * @ip: The inode pointer
227 * @mp: The metapath to return the result in
228 * @block: The file (logical) block to look up
229 *
230 * This routine fills in the metapath structure @mp with the path
231 * through the metadata of inode "ip" to get to block "block".
232 *
233 * Example:
234 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
235 * filesystem with a blocksize of 4096.
236 *
237 * find_metapath() would fill in the metapath as follows:
238 * mp_list[0] = 0, mp_list[1] = 48,
239 * and mp_list[2] = 165.
240 *
241 * That means that in order to get to the block containing the byte at
242 * offset 101342453, we would load the indirect block pointed to by pointer
243 * 0 in the dinode. We would then load the indirect block pointed to by
244 * pointer 48 in that indirect block. We would then load the data block
245 * pointed to by pointer 165 in that indirect block.
246 *
247 * ----------------------------------------
248 * | Dinode | |
249 * | | 4|
250 * | |0 1 2 3 4 5 9|
251 * | | 6|
252 * ----------------------------------------
253 * |
254 * |
255 * V
256 * ----------------------------------------
257 * | Indirect Block |
258 * | 5|
259 * | 4 4 4 4 4 5 5 1|
260 * |0 5 6 7 8 9 0 1 2|
261 * ----------------------------------------
262 * |
263 * |
264 * V
265 * ----------------------------------------
266 * | Indirect Block |
267 * | 1 1 1 1 1 5|
268 * | 6 6 6 6 6 1|
269 * |0 3 4 5 6 7 2|
270 * ----------------------------------------
271 * |
272 * |
273 * V
274 * ----------------------------------------
275 * | Data block containing offset |
276 * | 101342453 |
277 * | |
278 * | |
279 * ----------------------------------------
280 *
281 */
282
283static void find_metapath(struct gfs2_inode *ip, uint64_t block,
284 struct metapath *mp)
285{
286 struct gfs2_sbd *sdp = ip->i_sbd;
287 uint64_t b = block;
288 unsigned int i;
289
290 for (i = ip->i_di.di_height; i--;)
291 mp->mp_list[i] = (__u16)do_div(b, sdp->sd_inptrs);
292
293}
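
/*
 * Working the example from the comment above: with 4096-byte blocks, byte
 * offset 101342453 is logical block 101342453 >> 12 = 24741. Using the
 * example's idealized 512 pointers per indirect block (sd_inptrs is in
 * fact slightly less, since each indirect block starts with a meta
 * header), 24741 = 0 * 512 * 512 + 48 * 512 + 165, so the do_div() loop
 * leaves mp_list[0] = 0, mp_list[1] = 48 and mp_list[2] = 165 for a
 * height-3 tree.
 */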
294
295/**
296 * metapointer - Return pointer to start of metadata in a buffer
297 * @bh: The buffer
298 * @boundary: set to 1 if the returned pointer is the last one in @bh
299 * @height: The metadata height (0 = dinode)
300 * @mp: The metapath
301 * Return a pointer to the block number of the next height of the metadata
302 * tree given a buffer containing the pointer to the current height of the
303 * metadata tree.
304 */
305
306static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
307 unsigned int height, const struct metapath *mp)
308{
309 unsigned int head_size = (height > 0) ?
310 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
311 u64 *ptr;
312 *boundary = 0;
313 ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
314 if (ptr + 1 == (u64*)(bh->b_data + bh->b_size))
315 *boundary = 1;
316 return ptr;
317}
318
319/**
320 * lookup_block - Get the next metadata block in metadata tree
321 * @ip: The GFS2 inode
322 * @bh: Buffer containing the pointers to metadata blocks
323 * @height: The height of the tree (0 = dinode)
324 * @mp: The metapath
325 * @create: Non-zero if we may create a new metadata block
326 * @new: Used to indicate if we did create a new metadata block
327 * @block: the returned disk block number
328 *
329 * Given a metatree, complete to a particular height, checks to see if the next
330 * height of the tree exists. If not, and @create is set, the next height
331 * of the tree is created. The block number of the next height is returned
332 * via @block. Returns: 1 if @block is the last pointer in its buffer, else 0.
333 */
334
335static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
336 unsigned int height, struct metapath *mp, int create,
337 int *new, uint64_t *block)
338{
339 int boundary;
340 uint64_t *ptr = metapointer(bh, &boundary, height, mp);
341
342 if (*ptr) {
343 *block = be64_to_cpu(*ptr);
344 return boundary;
345 }
346
347 *block = 0;
348
349 if (!create)
350 return 0;
351
352 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
353 *block = gfs2_alloc_data(ip);
354 else
355 *block = gfs2_alloc_meta(ip);
356
357 gfs2_trans_add_bh(ip->i_gl, bh, 1);
358
359 *ptr = cpu_to_be64(*block);
360 ip->i_di.di_blocks++;
361
362 *new = 1;
363 return 0;
364}
365
366/**
367 * gfs2_block_pointers - Map a block from an inode to a disk block
368 * @inode: The inode
369 * @lblock: The logical block number
370 * @new: Value/Result argument (1 = may create/did create new blocks)
371 * @dblock: the returned disk block number
372 * @boundary: gets set if we've hit a block boundary
373 * @mp: metapath to use
374 * Find the block number on the current device which corresponds to an
375 * inode's block. If the block had to be created, "new" will be set.
376 *
377 * Returns: the last metadata buffer traversed, NULL for a hole, or an ERR_PTR
378 */
379
380static struct buffer_head *gfs2_block_pointers(struct inode *inode, u64 lblock,
381 int *new, u64 *dblock,
382 int *boundary,
383 struct metapath *mp)
384{
385 struct gfs2_inode *ip = inode->u.generic_ip;
386 struct gfs2_sbd *sdp = ip->i_sbd;
387 struct buffer_head *bh;
388 int create = *new;
389 unsigned int bsize;
390 unsigned int height;
391 unsigned int end_of_metadata;
392 unsigned int x;
393 int error = 0;
394
395 *new = 0;
396 *dblock = 0;
397
398 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
399 goto out;
400
401 bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
402
403 height = calc_tree_height(ip, (lblock + 1) * bsize);
404 if (ip->i_di.di_height < height) {
405 if (!create)
406 goto out;
407
408 error = build_height(inode, height);
409 if (error)
410 goto out;
411 }
412
413 find_metapath(ip, lblock, mp);
414 end_of_metadata = ip->i_di.di_height - 1;
415
416 error = gfs2_meta_inode_buffer(ip, &bh);
417 if (error)
418 goto out;
419
420 for (x = 0; x < end_of_metadata; x++) {
421 lookup_block(ip, bh, x, mp, create, new, dblock);
422 brelse(bh);
423 if (!*dblock)
424 goto out;
425
426 error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh);
427 if (error)
428 goto out;
429 }
430
431 *boundary = lookup_block(ip, bh, end_of_metadata, mp, create, new, dblock);
432 if (*new) {
433 struct buffer_head *dibh;
434 error = gfs2_meta_inode_buffer(ip, &dibh);
435 if (!error) {
436 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
437 gfs2_dinode_out(&ip->i_di, dibh->b_data);
438 brelse(dibh);
439 }
440 }
441 return bh;
442out:
443 return ERR_PTR(error);
444}
445
446
447static inline void bmap_lock(struct inode *inode, int create)
448{
449 struct gfs2_inode *ip = inode->u.generic_ip;
450 if (create)
451 down_write(&ip->i_rw_mutex);
452 else
453 down_read(&ip->i_rw_mutex);
454}
455
456static inline void bmap_unlock(struct inode *inode, int create)
457{
458 struct gfs2_inode *ip = inode->u.generic_ip;
459 if (create)
460 up_write(&ip->i_rw_mutex);
461 else
462 up_read(&ip->i_rw_mutex);
463}
464
465int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary)
466{
467 struct metapath mp;
468 struct buffer_head *bh;
469 int create = *new;
470
471 bmap_lock(inode, create);
472 bh = gfs2_block_pointers(inode, lblock, new, dblock, boundary, &mp);
473 bmap_unlock(inode, create);
474 if (!bh)
475 return 0;
476 if (IS_ERR(bh))
477 return PTR_ERR(bh);
478 brelse(bh);
479 return 0;
480}
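
/*
 * A minimal sketch of a lookup-only mapping (it assumes the caller already
 * holds the necessary glock on the inode):
 *
 *	int new = 0;	0 = map only; 1 would permit allocation
 *	int boundary;
 *	u64 dblock;
 *	error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
 *
 * On success, dblock == 0 indicates a hole; when allocation is permitted,
 * *new is set if a block really was allocated.
 */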
481
482int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
483{
484 struct gfs2_inode *ip = inode->u.generic_ip;
485 struct gfs2_sbd *sdp = ip->i_sbd;
486 struct metapath mp;
487 struct buffer_head *bh;
488 int boundary;
489 int create = *new;
490
491 BUG_ON(!extlen);
492 BUG_ON(!dblock);
493 BUG_ON(!new);
494
495 bmap_lock(inode, create);
496 bh = gfs2_block_pointers(inode, lblock, new, dblock, &boundary, &mp);
497 *extlen = 1;
498
499 if (bh && !IS_ERR(bh) && *dblock && !*new) {
500 u64 tmp_dblock;
501 int tmp_new;
502 unsigned int nptrs;
503 unsigned end_of_metadata = ip->i_di.di_height - 1;
504
505 nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs;
506 while (++mp.mp_list[end_of_metadata] < nptrs) {
507 lookup_block(ip, bh, end_of_metadata, &mp, 0, &tmp_new, &tmp_dblock);
508 if (*dblock + *extlen != tmp_dblock)
509 break;
510 (*extlen)++;
511 }
512 }
513 bmap_unlock(inode, create);
514 if (!bh)
515 return 0;
516 if (IS_ERR(bh))
517 return PTR_ERR(bh);
518 brelse(bh);
519 return 0;
520}
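
/*
 * Note that the loop above only extends *extlen while the pointers in the
 * current indirect block stay physically contiguous: if lblock maps to
 * disk block D and the following slots hold D+1, D+2, ..., the whole run
 * is returned in one call. The scan stops at the end of that indirect
 * block (nptrs), so a reported extent never crosses one.
 */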
521
522/**
523 * recursive_scan - recursively scan through the end of a file
524 * @ip: the inode
525 * @dibh: the dinode buffer
526 * @mp: the path through the metadata to the point to start
527 * @height: the height the recursion is at
528 * @block: the indirect block to look at
529 * @first: 1 if this is the first block
530 * @bc: the call to make for each piece of metadata
531 * @data: data opaque to this function to pass to @bc
532 *
533 * When this is first called @height and @block should be zero and
534 * @first should be 1.
535 *
536 * Returns: errno
537 */
538
539static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
540 struct metapath *mp, unsigned int height,
541 uint64_t block, int first, block_call_t bc,
542 void *data)
543{
544 struct gfs2_sbd *sdp = ip->i_sbd;
545 struct buffer_head *bh = NULL;
546 uint64_t *top, *bottom;
547 uint64_t bn;
548 int error;
549 int mh_size = sizeof(struct gfs2_meta_header);
550
551 if (!height) {
552 error = gfs2_meta_inode_buffer(ip, &bh);
553 if (error)
554 return error;
555 dibh = bh;
556
557 top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
558 mp->mp_list[0];
559 bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
560 sdp->sd_diptrs;
561 } else {
562 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
563 if (error)
564 return error;
565
566 top = (uint64_t *)(bh->b_data + mh_size) +
567 ((first) ? mp->mp_list[height] : 0);
568
569 bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs;
570 }
571
572 error = bc(ip, dibh, bh, top, bottom, height, data);
573 if (error)
574 goto out;
575
576 if (height < ip->i_di.di_height - 1)
577 for (; top < bottom; top++, first = 0) {
578 if (!*top)
579 continue;
580
581 bn = be64_to_cpu(*top);
582
583 error = recursive_scan(ip, dibh, mp, height + 1, bn,
584 first, bc, data);
585 if (error)
586 break;
587 }
588
589 out:
590 brelse(bh);
591
592 return error;
593}
594
595/**
596 * do_strip - Look for a particular layer of the file and strip it off
597 * @ip: the inode
598 * @dibh: the dinode buffer
599 * @bh: A buffer of pointers
600 * @top: The first pointer in the buffer
601 * @bottom: One more than the last pointer
602 * @height: the height this buffer is at
603 * @data: a pointer to a struct strip_mine
604 *
605 * Returns: errno
606 */
607
608static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
609 struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
610 unsigned int height, void *data)
611{
612 struct strip_mine *sm = (struct strip_mine *)data;
613 struct gfs2_sbd *sdp = ip->i_sbd;
614 struct gfs2_rgrp_list rlist;
615 uint64_t bn, bstart;
616 uint32_t blen;
617 uint64_t *p;
618 unsigned int rg_blocks = 0;
619 int metadata;
620 unsigned int revokes = 0;
621 int x;
622 int error;
623
624 if (!*top)
625 sm->sm_first = 0;
626
627 if (height != sm->sm_height)
628 return 0;
629
630 if (sm->sm_first) {
631 top++;
632 sm->sm_first = 0;
633 }
634
635 metadata = (height != ip->i_di.di_height - 1);
636 if (metadata)
637 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
638
639 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
640 if (error)
641 return error;
642
643 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
644 bstart = 0;
645 blen = 0;
646
647 for (p = top; p < bottom; p++) {
648 if (!*p)
649 continue;
650
651 bn = be64_to_cpu(*p);
652
653 if (bstart + blen == bn)
654 blen++;
655 else {
656 if (bstart)
657 gfs2_rlist_add(sdp, &rlist, bstart);
658
659 bstart = bn;
660 blen = 1;
661 }
662 }
663
664 if (bstart)
665 gfs2_rlist_add(sdp, &rlist, bstart);
666 else
667 goto out; /* Nothing to do */
668
669 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
670
671 for (x = 0; x < rlist.rl_rgrps; x++) {
672 struct gfs2_rgrpd *rgd;
673 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
674 rg_blocks += rgd->rd_ri.ri_length;
675 }
676
677 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
678 if (error)
679 goto out_rlist;
680
681 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
682 RES_INDIRECT + RES_STATFS + RES_QUOTA,
683 revokes);
684 if (error)
685 goto out_rg_gunlock;
686
687 down_write(&ip->i_rw_mutex);
688
689 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
690 gfs2_trans_add_bh(ip->i_gl, bh, 1);
691
692 bstart = 0;
693 blen = 0;
694
695 for (p = top; p < bottom; p++) {
696 if (!*p)
697 continue;
698
699 bn = be64_to_cpu(*p);
700
701 if (bstart + blen == bn)
702 blen++;
703 else {
704 if (bstart) {
705 if (metadata)
706 gfs2_free_meta(ip, bstart, blen);
707 else
708 gfs2_free_data(ip, bstart, blen);
709 }
710
711 bstart = bn;
712 blen = 1;
713 }
714
715 *p = 0;
716 if (!ip->i_di.di_blocks)
717 gfs2_consist_inode(ip);
718 ip->i_di.di_blocks--;
719 }
720 if (bstart) {
721 if (metadata)
722 gfs2_free_meta(ip, bstart, blen);
723 else
724 gfs2_free_data(ip, bstart, blen);
725 }
726
727 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
728
729 gfs2_dinode_out(&ip->i_di, dibh->b_data);
730
731 up_write(&ip->i_rw_mutex);
732
733 gfs2_trans_end(sdp);
734
735 out_rg_gunlock:
736 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
737
738 out_rlist:
739 gfs2_rlist_free(&rlist);
740
741 out:
742 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
743
744 return error;
745}
746
747/**
748 * do_grow - Make a file look bigger than it is
749 * @ip: the inode
750 * @size: the size to set the file to
751 *
752 * Called with an exclusive lock on @ip.
753 *
754 * Returns: errno
755 */
756
757static int do_grow(struct gfs2_inode *ip, uint64_t size)
758{
759 struct gfs2_sbd *sdp = ip->i_sbd;
760 struct gfs2_alloc *al;
761 struct buffer_head *dibh;
762 unsigned int h;
763 int error;
764
765 al = gfs2_alloc_get(ip);
766
767 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
768 if (error)
769 goto out;
770
771 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
772 if (error)
773 goto out_gunlock_q;
774
775 al->al_requested = sdp->sd_max_height + RES_DATA;
776
777 error = gfs2_inplace_reserve(ip);
778 if (error)
779 goto out_gunlock_q;
780
781 error = gfs2_trans_begin(sdp,
782 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
783 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
784 if (error)
785 goto out_ipres;
786
787 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
788 if (gfs2_is_stuffed(ip)) {
789 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
790 NULL);
791 if (error)
792 goto out_end_trans;
793 }
794
795 h = calc_tree_height(ip, size);
796 if (ip->i_di.di_height < h) {
797 down_write(&ip->i_rw_mutex);
798 error = build_height(ip->i_vnode, h);
799 up_write(&ip->i_rw_mutex);
800 if (error)
801 goto out_end_trans;
802 }
803 }
804
805 ip->i_di.di_size = size;
806 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
807
808 error = gfs2_meta_inode_buffer(ip, &dibh);
809 if (error)
810 goto out_end_trans;
811
812 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
813 gfs2_dinode_out(&ip->i_di, dibh->b_data);
814 brelse(dibh);
815
816 out_end_trans:
817 gfs2_trans_end(sdp);
818
819 out_ipres:
820 gfs2_inplace_release(ip);
821
822 out_gunlock_q:
823 gfs2_quota_unlock(ip);
824
825 out:
826 gfs2_alloc_put(ip);
827
828 return error;
829}
830
831static int trunc_start(struct gfs2_inode *ip, uint64_t size)
832{
833 struct gfs2_sbd *sdp = ip->i_sbd;
834 struct buffer_head *dibh;
835 int journaled = gfs2_is_jdata(ip);
836 int error;
837
838 error = gfs2_trans_begin(sdp,
839 RES_DINODE + ((journaled) ? RES_JDATA : 0), 0);
840 if (error)
841 return error;
842
843 error = gfs2_meta_inode_buffer(ip, &dibh);
844 if (error)
845 goto out;
846
847 if (gfs2_is_stuffed(ip)) {
848 ip->i_di.di_size = size;
849 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
850 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
851 gfs2_dinode_out(&ip->i_di, dibh->b_data);
852 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
853 error = 1;
854
855 } else {
856 if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1))
857 error = gfs2_block_truncate_page(ip->i_vnode->i_mapping);
858
859 if (!error) {
860 ip->i_di.di_size = size;
861 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
862 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
863 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
864 gfs2_dinode_out(&ip->i_di, dibh->b_data);
865 }
866 }
867
868 brelse(dibh);
869
870 out:
871 gfs2_trans_end(sdp);
872
873 return error;
874}
875
876static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size)
877{
878 unsigned int height = ip->i_di.di_height;
879 uint64_t lblock;
880 struct metapath mp;
881 int error;
882
883 if (!size)
884 lblock = 0;
885 else
886 lblock = (size - 1) >> ip->i_sbd->sd_sb.sb_bsize_shift;
887
888 find_metapath(ip, lblock, &mp);
889 gfs2_alloc_get(ip);
890
891 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
892 if (error)
893 goto out;
894
895 while (height--) {
896 struct strip_mine sm;
897 sm.sm_first = !!size;
898 sm.sm_height = height;
899
900 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
901 if (error)
902 break;
903 }
904
905 gfs2_quota_unhold(ip);
906
907 out:
908 gfs2_alloc_put(ip);
909 return error;
910}
911
912static int trunc_end(struct gfs2_inode *ip)
913{
914 struct gfs2_sbd *sdp = ip->i_sbd;
915 struct buffer_head *dibh;
916 int error;
917
918 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
919 if (error)
920 return error;
921
922 down_write(&ip->i_rw_mutex);
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out;
927
928 if (!ip->i_di.di_size) {
929 ip->i_di.di_height = 0;
930 ip->i_di.di_goal_meta =
931 ip->i_di.di_goal_data =
932 ip->i_num.no_addr;
933 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
934 }
935 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
936 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
937
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(&ip->i_di, dibh->b_data);
940 brelse(dibh);
941
942 out:
943 up_write(&ip->i_rw_mutex);
944
945 gfs2_trans_end(sdp);
946
947 return error;
948}
949
950/**
951 * do_shrink - make a file smaller
952 * @ip: the inode
953 * @size: the size to make the file
954 *
955 *
956 * Called with an exclusive lock on @ip.
957 *
958 * Returns: errno
959 */
960
961static int do_shrink(struct gfs2_inode *ip, uint64_t size)
962{
963 int error;
964
965 error = trunc_start(ip, size);
966 if (error < 0)
967 return error;
968 if (error > 0)
969 return 0;
970
971 error = trunc_dealloc(ip, size);
972 if (!error)
973 error = trunc_end(ip);
974
975 return error;
976}
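
/*
 * Shrinking is thus a three-phase operation: trunc_start() cuts the
 * on-disk size and sets GFS2_DIF_TRUNC_IN_PROG, trunc_dealloc() strips
 * the now-unused tree one layer per transaction, and trunc_end() clears
 * the flag. A return of > 0 from trunc_start() means the file was
 * stuffed, so there is nothing to deallocate. If the machine crashes
 * part-way through, the flag is still set and gfs2_truncatei_resume()
 * below redoes the last two phases.
 */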
977
978/**
979 * gfs2_truncatei - make a file a given size
980 * @ip: the inode
981 * @size: the size to make the file
982 *
983 *
984 * The file size can grow, shrink, or stay the same size.
985 *
986 * Returns: errno
987 */
988
989int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size)
990{
991 int error;
992
993 if (gfs2_assert_warn(ip->i_sbd, S_ISREG(ip->i_di.di_mode)))
994 return -EINVAL;
995
996 if (size > ip->i_di.di_size)
997 error = do_grow(ip, size);
998 else
999 error = do_shrink(ip, size);
1000
1001 return error;
1002}
1003
1004int gfs2_truncatei_resume(struct gfs2_inode *ip)
1005{
1006 int error;
1007 error = trunc_dealloc(ip, ip->i_di.di_size);
1008 if (!error)
1009 error = trunc_end(ip);
1010 return error;
1011}
1012
1013int gfs2_file_dealloc(struct gfs2_inode *ip)
1014{
1015 return trunc_dealloc(ip, 0);
1016}
1017
1018/**
1019 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1020 * @ip: the file
1021 * @len: the number of bytes to be written to the file
1022 * @data_blocks: returns the number of data blocks required
1023 * @ind_blocks: returns the number of indirect blocks required
1024 *
1025 */
1026
1027void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1028 unsigned int *data_blocks, unsigned int *ind_blocks)
1029{
1030 struct gfs2_sbd *sdp = ip->i_sbd;
1031 unsigned int tmp;
1032
1033 if (gfs2_is_dir(ip)) {
1034 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1035 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1036 } else {
1037 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1038 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1039 }
1040
1041 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1042 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1043 *ind_blocks += tmp;
1044 }
1045}
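
/*
 * Worked example (illustrative, 4KiB blocks so sb_bsize_shift = 12): a
 * 1MiB write to a regular file yields *data_blocks = (1048576 >> 12) + 3
 * = 259. Since 259 does not exceed sd_diptrs (the several hundred
 * pointers a 4KiB dinode holds directly), the loop adds nothing and
 * *ind_blocks stays at 3 * (sd_max_height - 1).
 */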
1046
1047/**
1048 * gfs2_write_alloc_required - figure out if a write will require an allocation
1049 * @ip: the file being written to
1050 * @offset: the offset to write to
1051 * @len: the number of bytes being written
1052 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1053 *
1054 * Returns: errno
1055 */
1056
1057int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
1058 unsigned int len, int *alloc_required)
1059{
1060 struct gfs2_sbd *sdp = ip->i_sbd;
1061 uint64_t lblock, lblock_stop, dblock;
1062 uint32_t extlen;
1063 int new = 0;
1064 int error = 0;
1065
1066 *alloc_required = 0;
1067
1068 if (!len)
1069 return 0;
1070
1071 if (gfs2_is_stuffed(ip)) {
1072 if (offset + len >
1073 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1074 *alloc_required = 1;
1075 return 0;
1076 }
1077
1078 if (gfs2_is_dir(ip)) {
1079 unsigned int bsize = sdp->sd_jbsize;
1080 lblock = offset;
1081 do_div(lblock, bsize);
1082 lblock_stop = offset + len + bsize - 1;
1083 do_div(lblock_stop, bsize);
1084 } else {
1085 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1086 lblock = offset >> shift;
1087 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1088 }
1089
1090 for (; lblock < lblock_stop; lblock += extlen) {
1091 error = gfs2_extent_map(ip->i_vnode, lblock, &new, &dblock, &extlen);
1092 if (error)
1093 return error;
1094
1095 if (!dblock) {
1096 *alloc_required = 1;
1097 return 0;
1098 }
1099 }
1100
1101 return 0;
1102}
1103
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..06ccb2d808ad
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13typedef int (*gfs2_unstuffer_t) (struct gfs2_inode * ip,
14 struct buffer_head * dibh, uint64_t block,
15 void *private);
16int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
17 void *private);
18
19int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary);
20int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
21
22int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size);
23int gfs2_truncatei_resume(struct gfs2_inode *ip);
24int gfs2_file_dealloc(struct gfs2_inode *ip);
25
26void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
27 unsigned int *data_blocks,
28 unsigned int *ind_blocks);
29int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
30 unsigned int len, int *alloc_required);
31
32#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..9e7b9f296786
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,223 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "daemon.h"
23#include "glock.h"
24#include "log.h"
25#include "quota.h"
26#include "recovery.h"
27#include "super.h"
28#include "unlinked.h"
29#include "util.h"
30
31/* This uses schedule_timeout() instead of msleep() because it allows
32 the daemons to be woken up earlier than their timeout when unmounting,
33 so the user's unmount doesn't sit there forever.
34
35 The kthread functions used to start these daemons block and flush signals. */
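
/*
 * A minimal sketch of how one of these daemons is driven (illustrative;
 * the real call sites live in the mount code, see ops_fstype.c):
 *
 *	struct task_struct *p;
 *
 *	p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
 *	if (IS_ERR(p))
 *		return PTR_ERR(p);
 *	...
 *	kthread_stop(p);	makes kthread_should_stop() return true
 */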
36
37/**
38 * gfs2_scand - Look for cached glocks and inodes to toss from memory
39 * @sdp: Pointer to GFS2 superblock
40 *
41 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
42 * See gfs2_glockd()
43 */
44
45int gfs2_scand(void *data)
46{
47 struct gfs2_sbd *sdp = data;
48 unsigned long t;
49
50 while (!kthread_should_stop()) {
51 gfs2_scand_internal(sdp);
52 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
53 schedule_timeout_interruptible(t);
54 }
55
56 return 0;
57}
58
59/**
60 * gfs2_glockd - Reclaim unused glock structures
61 * @sdp: Pointer to GFS2 superblock
62 *
63 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
64 * Number of daemons can be set by user, with num_glockd mount option.
65 */
66
67int gfs2_glockd(void *data)
68{
69 struct gfs2_sbd *sdp = data;
70
71 while (!kthread_should_stop()) {
72 while (atomic_read(&sdp->sd_reclaim_count))
73 gfs2_reclaim_glock(sdp);
74
75 wait_event_interruptible(sdp->sd_reclaim_wq,
76 (atomic_read(&sdp->sd_reclaim_count) ||
77 kthread_should_stop()));
78 }
79
80 return 0;
81}
82
83/**
84 * gfs2_recoverd - Recover dead machine's journals
85 * @sdp: Pointer to GFS2 superblock
86 *
87 */
88
89int gfs2_recoverd(void *data)
90{
91 struct gfs2_sbd *sdp = data;
92 unsigned long t;
93
94 while (!kthread_should_stop()) {
95 gfs2_check_journals(sdp);
96 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
97 schedule_timeout_interruptible(t);
98 }
99
100 return 0;
101}
102
103/**
104 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
105 * @sdp: Pointer to GFS2 superblock
106 *
107 * Also, periodically check to make sure that we're using the most recent
108 * journal index.
109 */
110
111int gfs2_logd(void *data)
112{
113 struct gfs2_sbd *sdp = data;
114 struct gfs2_holder ji_gh;
115 unsigned long t;
116
117 while (!kthread_should_stop()) {
118 /* Advance the log tail */
119
120 t = sdp->sd_log_flush_time +
121 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
122
123 gfs2_ail1_empty(sdp, DIO_ALL);
124
125 if (time_after_eq(jiffies, t)) {
126 gfs2_log_flush(sdp, NULL);
127 sdp->sd_log_flush_time = jiffies;
128 }
129
130 /* Check for latest journal index */
131
132 t = sdp->sd_jindex_refresh_time +
133 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
134
135 if (time_after_eq(jiffies, t)) {
136 if (!gfs2_jindex_hold(sdp, &ji_gh))
137 gfs2_glock_dq_uninit(&ji_gh);
138 sdp->sd_jindex_refresh_time = jiffies;
139 }
140
141 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
142 schedule_timeout_interruptible(t);
143 }
144
145 return 0;
146}
147
148/**
149 * gfs2_quotad - Write cached quota changes into the quota file
150 * @sdp: Pointer to GFS2 superblock
151 *
152 */
153
154int gfs2_quotad(void *data)
155{
156 struct gfs2_sbd *sdp = data;
157 unsigned long t;
158 int error;
159
160 while (!kthread_should_stop()) {
161 /* Update the master statfs file */
162
163 t = sdp->sd_statfs_sync_time +
164 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
165
166 if (time_after_eq(jiffies, t)) {
167 error = gfs2_statfs_sync(sdp);
168 if (error &&
169 error != -EROFS &&
170 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
171 fs_err(sdp, "quotad: (1) error=%d\n", error);
172 sdp->sd_statfs_sync_time = jiffies;
173 }
174
175 /* Update quota file */
176
177 t = sdp->sd_quota_sync_time +
178 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
179
180 if (time_after_eq(jiffies, t)) {
181 error = gfs2_quota_sync(sdp);
182 if (error &&
183 error != -EROFS &&
184 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
185 fs_err(sdp, "quotad: (2) error=%d\n", error);
186 sdp->sd_quota_sync_time = jiffies;
187 }
188
189 gfs2_quota_scan(sdp);
190
191 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
192 schedule_timeout_interruptible(t);
193 }
194
195 return 0;
196}
197
198/**
199 * gfs2_inoded - Deallocate unlinked inodes
200 * @sdp: Pointer to GFS2 superblock
201 *
202 */
203
204int gfs2_inoded(void *data)
205{
206 struct gfs2_sbd *sdp = data;
207 unsigned long t;
208 int error;
209
210 while (!kthread_should_stop()) {
211 error = gfs2_unlinked_dealloc(sdp);
212 if (error &&
213 error != -EROFS &&
214 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
215 fs_err(sdp, "inoded: error = %d\n", error);
216
217 t = gfs2_tune_get(sdp, gt_inoded_secs) * HZ;
218 schedule_timeout_interruptible(t);
219 }
220
221 return 0;
222}
223
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..aa68e7a1b0b7
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18int gfs2_inoded(void *data);
19
20#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..6918a58261e2
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1974 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 * "Extendible Hashing" by Fagin, et al in
13 * __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, i.e.
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_ino is set to zero to designate it as deleted. When allocating
28 * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29 * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
30 * dirent is allocated. Otherwise it must go through all the 'used' dirents
31 * searching for one in which the amount of total space minus the amount of
32 * used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled unless it is already at the maximum size which is hard coded into
52 * GFS2_DIR_MAX_DEPTH. Once the hash table has reached its maximum size,
53 * further overflowing leaves are chained together in a linked list.
54 */
55
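/*
 * In code form, the iteration described above looks roughly like this
 * (a sketch; "first" stands for whichever header offset applies, and the
 * real, validity-checked walk is gfs2_dirent_scan() below):
 *
 *	struct gfs2_dirent *dent = (struct gfs2_dirent *)(bh->b_data + first);
 *	while ((char *)dent + be16_to_cpu(dent->de_rec_len) <
 *	       bh->b_data + bh->b_size)
 *		dent = (struct gfs2_dirent *)((char *)dent +
 *					      be16_to_cpu(dent->de_rec_len));
 *
 * Deleted entries are skipped implicitly because their space has been
 * folded into the previous entry's de_rec_len.
 */
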
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/buffer_head.h>
60#include <linux/sort.h>
61#include <linux/gfs2_ondisk.h>
62#include <linux/crc32.h>
63#include <linux/vmalloc.h>
64
65#include "gfs2.h"
66#include "lm_interface.h"
67#include "incore.h"
68#include "dir.h"
69#include "glock.h"
70#include "inode.h"
71#include "meta_io.h"
72#include "quota.h"
73#include "rgrp.h"
74#include "trans.h"
75#include "bmap.h"
76#include "util.h"
77
78#define IS_LEAF 1 /* Hashed (leaf) directory */
79#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
80
81#if 1
82#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1)
83#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
84#else
85#define gfs2_disk_hash2offset(h) (((uint64_t)(h)))
86#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p))))
87#endif
88
89typedef int (*leaf_call_t) (struct gfs2_inode *dip,
90 uint32_t index, uint32_t len, uint64_t leaf_no,
91 void *data);
92
93
94int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
95 struct buffer_head **bhp)
96{
97 struct buffer_head *bh;
98
99 bh = gfs2_meta_new(ip->i_gl, block);
100 gfs2_trans_add_bh(ip->i_gl, bh, 1);
101 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
102 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
103 *bhp = bh;
104 return 0;
105}
106
107static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, uint64_t block,
108 struct buffer_head **bhp)
109{
110 struct buffer_head *bh;
111 int error;
112
113 error = gfs2_meta_read(ip->i_gl, block, DIO_START | DIO_WAIT, &bh);
114 if (error)
115 return error;
116 if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_JD)) {
117 brelse(bh);
118 return -EIO;
119 }
120 *bhp = bh;
121 return 0;
122}
123
124static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
125 unsigned int offset, unsigned int size)
126{
127
128 struct buffer_head *dibh;
129 int error;
130
131 error = gfs2_meta_inode_buffer(ip, &dibh);
132 if (error)
133 return error;
134
135 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
136 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
137 if (ip->i_di.di_size < offset + size)
138 ip->i_di.di_size = offset + size;
139 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
140 gfs2_dinode_out(&ip->i_di, dibh->b_data);
141
142 brelse(dibh);
143
144 return size;
145}
146
147
148
149/**
150 * gfs2_dir_write_data - Write directory information to the inode
151 * @ip: The GFS2 inode
152 * @buf: The buffer containing information to be written
153 * @offset: The file offset to start writing at
154 * @size: The amount of data to write
155 *
156 * Returns: The number of bytes actually written, or a negative error code
157 */
158static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
159 uint64_t offset, unsigned int size)
160{
161 struct gfs2_sbd *sdp = ip->i_sbd;
162 struct buffer_head *dibh;
163 uint64_t lblock, dblock;
164 uint32_t extlen = 0;
165 unsigned int o;
166 int copied = 0;
167 int error = 0;
168
169 if (!size)
170 return 0;
171
172 if (gfs2_is_stuffed(ip) &&
173 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
174 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
175 size);
176
177 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
178 return -EINVAL;
179
180 if (gfs2_is_stuffed(ip)) {
181 error = gfs2_unstuff_dinode(ip, NULL, NULL);
182 if (error)
183 return error;
184 }
185
186 lblock = offset;
187 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
188
189 while (copied < size) {
190 unsigned int amount;
191 struct buffer_head *bh;
192 int new;
193
194 amount = size - copied;
195 if (amount > sdp->sd_sb.sb_bsize - o)
196 amount = sdp->sd_sb.sb_bsize - o;
197
198 if (!extlen) {
199 new = 1;
200 error = gfs2_extent_map(ip->i_vnode, lblock, &new,
201 &dblock, &extlen);
202 if (error)
203 goto fail;
204 error = -EIO;
205 if (gfs2_assert_withdraw(sdp, dblock))
206 goto fail;
207 }
208
209 if (amount == sdp->sd_jbsize || new)
210 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
211 else
212 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
213
214 if (error)
215 goto fail;
216
217 gfs2_trans_add_bh(ip->i_gl, bh, 1);
218 memcpy(bh->b_data + o, buf, amount);
219 brelse(bh);
220
221 buf += amount;
222
223 copied += amount;
224 lblock++;
225 dblock++;
226 extlen--;
227
228 o = sizeof(struct gfs2_meta_header);
229 }
230
231out:
232 error = gfs2_meta_inode_buffer(ip, &dibh);
233 if (error)
234 return error;
235
236 if (ip->i_di.di_size < offset + copied)
237 ip->i_di.di_size = offset + copied;
238 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
239
240 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
241 gfs2_dinode_out(&ip->i_di, dibh->b_data);
242 brelse(dibh);
243
244 return copied;
245fail:
246 if (copied)
247 goto out;
248 return error;
249}
250
251static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
252 unsigned int offset, unsigned int size)
253{
254 struct buffer_head *dibh;
255 int error;
256
257 error = gfs2_meta_inode_buffer(ip, &dibh);
258 if (!error) {
259 offset += sizeof(struct gfs2_dinode);
260 memcpy(buf, dibh->b_data + offset, size);
261 brelse(dibh);
262 }
263
264 return (error) ? error : size;
265}
266
267
268/**
269 * gfs2_dir_read_data - Read data from a directory inode
270 * @ip: The GFS2 Inode
271 * @buf: The buffer to place result into
272 * @offset: File offset to begin reading from
273 * @size: Amount of data to transfer
274 *
275 * Returns: The amount of data actually copied or the error
276 */
277static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf,
278 uint64_t offset, unsigned int size)
279{
280 struct gfs2_sbd *sdp = ip->i_sbd;
281 uint64_t lblock, dblock;
282 uint32_t extlen = 0;
283 unsigned int o;
284 int copied = 0;
285 int error = 0;
286
287 if (offset >= ip->i_di.di_size)
288 return 0;
289
290 if ((offset + size) > ip->i_di.di_size)
291 size = ip->i_di.di_size - offset;
292
293 if (!size)
294 return 0;
295
296 if (gfs2_is_stuffed(ip))
297 return gfs2_dir_read_stuffed(ip, buf, (unsigned int)offset,
298 size);
299
300 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
301 return -EINVAL;
302
303 lblock = offset;
304 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
305
306 while (copied < size) {
307 unsigned int amount;
308 struct buffer_head *bh;
309 int new;
310
311 amount = size - copied;
312 if (amount > sdp->sd_sb.sb_bsize - o)
313 amount = sdp->sd_sb.sb_bsize - o;
314
315 if (!extlen) {
316 new = 0;
317 error = gfs2_extent_map(ip->i_vnode, lblock, &new,
318 &dblock, &extlen);
319 if (error)
320 goto fail;
321 }
322
323 if (extlen > 1)
324 gfs2_meta_ra(ip->i_gl, dblock, extlen);
325
326 if (dblock) {
327 if (new)
328 error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
329 else
330 error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
331 if (error)
332 goto fail;
333 dblock++;
334 extlen--;
335 } else
336 bh = NULL;
337
338 if (bh) {
339 memcpy(buf, bh->b_data + o, amount);
340 brelse(bh);
341 } else
342 memset(buf, 0, amount);
343 buf += amount;
344 copied += amount;
345 lblock++;
346 o = sizeof(struct gfs2_meta_header);
347 }
348
349 return copied;
350fail:
351 return (copied) ? copied : error;
352}
353
354typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
355 const struct qstr *name,
356 void *opaque);
357
358static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
359 const struct qstr *name, int ret)
360{
361 if (dent->de_inum.no_addr != 0 &&
362 be32_to_cpu(dent->de_hash) == name->hash &&
363 be16_to_cpu(dent->de_name_len) == name->len &&
364 memcmp((char *)(dent+1), name->name, name->len) == 0)
365 return ret;
366 return 0;
367}
368
369static int gfs2_dirent_find(const struct gfs2_dirent *dent,
370 const struct qstr *name,
371 void *opaque)
372{
373 return __gfs2_dirent_find(dent, name, 1);
374}
375
376static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
377 const struct qstr *name,
378 void *opaque)
379{
380 return __gfs2_dirent_find(dent, name, 2);
381}
382
383/*
384 * name->name holds ptr to start of block.
385 * name->len holds size of block.
386 */
387static int gfs2_dirent_last(const struct gfs2_dirent *dent,
388 const struct qstr *name,
389 void *opaque)
390{
391 const char *start = name->name;
392 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
393 if (name->len == (end - start))
394 return 1;
395 return 0;
396}
397
398static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
399 const struct qstr *name,
400 void *opaque)
401{
402 unsigned required = GFS2_DIRENT_SIZE(name->len);
403 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
404 unsigned totlen = be16_to_cpu(dent->de_rec_len);
405
406 if (!dent->de_inum.no_addr)
407 actual = GFS2_DIRENT_SIZE(0);
408 if ((totlen - actual) >= required)
409 return 1;
410 return 0;
411}
412
413struct dirent_gather {
414 const struct gfs2_dirent **pdent;
415 unsigned offset;
416};
417
418static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
419 const struct qstr *name,
420 void *opaque)
421{
422 struct dirent_gather *g = opaque;
423 if (dent->de_inum.no_addr) {
424 g->pdent[g->offset++] = dent;
425 }
426 return 0;
427}
428
429/*
430 * Other possible things to check:
431 * - Inode located within filesystem size (and on valid block)
432 * - Valid directory entry type
433 * Not sure how heavy-weight we want to make this... could also check
434 * hash is correct for example, but that would take a lot of extra time.
435 * For now the most important thing is to check that the various sizes
436 * are correct.
437 */
438static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
439 unsigned int size, unsigned int len, int first)
440{
441 const char *msg = "gfs2_dirent too small";
442 if (unlikely(size < sizeof(struct gfs2_dirent)))
443 goto error;
444 msg = "gfs2_dirent misaligned";
445 if (unlikely(offset & 0x7))
446 goto error;
447 msg = "gfs2_dirent points beyond end of block";
448 if (unlikely(offset + size > len))
449 goto error;
450 msg = "zero inode number";
451 if (unlikely(!first && !dent->de_inum.no_addr))
452 goto error;
453 msg = "name length is greater than space in dirent";
454 if (dent->de_inum.no_addr &&
455 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
456 size))
457 goto error;
458 return 0;
459error:
460 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
461 first ? "first in block" : "not first in block");
462 return -EIO;
463}
464
465static int gfs2_dirent_offset(const void *buf)
466{
467 const struct gfs2_meta_header *h = buf;
468 int offset;
469
470 BUG_ON(buf == NULL);
471
472 switch(be32_to_cpu(h->mh_type)) {
473 case GFS2_METATYPE_LF:
474 offset = sizeof(struct gfs2_leaf);
475 break;
476 case GFS2_METATYPE_DI:
477 offset = sizeof(struct gfs2_dinode);
478 break;
479 default:
480 goto wrong_type;
481 }
482 return offset;
483wrong_type:
484 printk(KERN_WARNING "gfs2_dirent_offset: wrong block type %u\n",
485 be32_to_cpu(h->mh_type));
486 return -1;
487}
488
489static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode,
490 void *buf,
491 unsigned int len, gfs2_dscan_t scan,
492 const struct qstr *name,
493 void *opaque)
494{
495 struct gfs2_dirent *dent, *prev;
496 unsigned offset;
497 unsigned size;
498 int ret = 0;
499
500 ret = gfs2_dirent_offset(buf);
501 if (ret < 0)
502 goto consist_inode;
503
504 offset = ret;
505 prev = NULL;
506 dent = (struct gfs2_dirent *)(buf + offset);
507 size = be16_to_cpu(dent->de_rec_len);
508 if (gfs2_check_dirent(dent, offset, size, len, 1))
509 goto consist_inode;
510 do {
511 ret = scan(dent, name, opaque);
512 if (ret)
513 break;
514 offset += size;
515 if (offset == len)
516 break;
517 prev = dent;
518 dent = (struct gfs2_dirent *)(buf + offset);
519 size = be16_to_cpu(dent->de_rec_len);
520 if (gfs2_check_dirent(dent, offset, size, len, 0))
521 goto consist_inode;
522 } while(1);
523
524 switch(ret) {
525 case 0:
526 return NULL;
527 case 1:
528 return dent;
529 case 2:
530 return prev ? prev : dent;
531 default:
532 BUG_ON(ret > 0);
533 return ERR_PTR(ret);
534 }
535
536consist_inode:
537 gfs2_consist_inode(inode->u.generic_ip);
538 return ERR_PTR(-EIO);
539}
540
541
542/**
543 * dirent_first - Return the first dirent
544 * @dip: the directory
545 * @bh: The buffer
546 * @dent: Pointer to list of dirents
547 *
548 * return first dirent whether bh points to leaf or stuffed dinode
549 *
550 * Returns: IS_LEAF, IS_DINODE, or -errno
551 */
552
553static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
554 struct gfs2_dirent **dent)
555{
556 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
557
558 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
559 if (gfs2_meta_check(dip->i_sbd, bh))
560 return -EIO;
561 *dent = (struct gfs2_dirent *)(bh->b_data +
562 sizeof(struct gfs2_leaf));
563 return IS_LEAF;
564 } else {
565 if (gfs2_metatype_check(dip->i_sbd, bh, GFS2_METATYPE_DI))
566 return -EIO;
567 *dent = (struct gfs2_dirent *)(bh->b_data +
568 sizeof(struct gfs2_dinode));
569 return IS_DINODE;
570 }
571}
572
573/**
574 * dirent_next - Next dirent
575 * @dip: the directory
576 * @bh: The buffer
577 * @dent: Pointer to list of dirents
578 *
579 * Returns: 0 on success, error code otherwise
580 */
581
582static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
583 struct gfs2_dirent **dent)
584{
585 struct gfs2_dirent *tmp, *cur;
586 char *bh_end;
587 uint16_t cur_rec_len;
588
589 cur = *dent;
590 bh_end = bh->b_data + bh->b_size;
591 cur_rec_len = be16_to_cpu(cur->de_rec_len);
592
593 if ((char *)cur + cur_rec_len >= bh_end) {
594 if ((char *)cur + cur_rec_len > bh_end) {
595 gfs2_consist_inode(dip);
596 return -EIO;
597 }
598 return -ENOENT;
599 }
600
601 tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len);
602
603 if ((char *)tmp + be16_to_cpu(tmp->de_rec_len) > bh_end) {
604 gfs2_consist_inode(dip);
605 return -EIO;
606 }
607
608 if (cur_rec_len == 0) {
609 gfs2_consist_inode(dip);
610 return -EIO;
611 }
612
613 /* Only the first dent could ever have de_inum.no_addr == 0 */
614 if (!tmp->de_inum.no_addr) {
615 gfs2_consist_inode(dip);
616 return -EIO;
617 }
618
619 *dent = tmp;
620
621 return 0;
622}
623
624/**
625 * dirent_del - Delete a dirent
626 * @dip: The GFS2 inode
627 * @bh: The buffer
628 * @prev: The previous dirent
629 * @cur: The current dirent
630 *
631 */
632
633static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
634 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
635{
636 uint16_t cur_rec_len, prev_rec_len;
637
638 if (!cur->de_inum.no_addr) {
639 gfs2_consist_inode(dip);
640 return;
641 }
642
643 gfs2_trans_add_bh(dip->i_gl, bh, 1);
644
645 /* If there is no prev entry, this is the first entry in the block.
646 The de_rec_len is already as big as it needs to be. Just zero
647 out the inode number and return. */
648
649 if (!prev) {
650 cur->de_inum.no_addr = 0; /* No endianness worries */
651 return;
652 }
653
654 /* Combine this dentry with the previous one. */
655
656 prev_rec_len = be16_to_cpu(prev->de_rec_len);
657 cur_rec_len = be16_to_cpu(cur->de_rec_len);
658
659 if ((char *)prev + prev_rec_len != (char *)cur)
660 gfs2_consist_inode(dip);
661 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
662 gfs2_consist_inode(dip);
663
664 prev_rec_len += cur_rec_len;
665 prev->de_rec_len = cpu_to_be16(prev_rec_len);
666}
667
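/*
 * For example (sizes illustrative): if a 40-byte record is followed by a
 * 24-byte record and the latter is deleted, the first record simply ends
 * up with de_rec_len = 64. The dead entry's bytes remain in the block but
 * are never visited again, as described at the top of this file.
 */
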
668/*
669 * Takes a dent from which to grab space as an argument. Returns the
670 * newly created dent.
671 */
672static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
673 struct gfs2_dirent *dent,
674 const struct qstr *name,
675 struct buffer_head *bh)
676{
677 struct gfs2_inode *ip = inode->u.generic_ip;
678 struct gfs2_dirent *ndent;
679 unsigned offset = 0, totlen;
680
681 if (dent->de_inum.no_addr)
682 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
683 totlen = be16_to_cpu(dent->de_rec_len);
684 BUG_ON(offset + name->len > totlen);
685 gfs2_trans_add_bh(ip->i_gl, bh, 1);
686 ndent = (struct gfs2_dirent *)((char *)dent + offset);
687 dent->de_rec_len = cpu_to_be16(offset);
688 gfs2_qstr2dirent(name, totlen - offset, ndent);
689 return ndent;
690}
691
692static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
693 struct buffer_head *bh,
694 const struct qstr *name)
695{
696 struct gfs2_dirent *dent;
697 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
698 gfs2_dirent_find_space, name, NULL);
699 if (!dent || IS_ERR(dent))
700 return dent;
701 return gfs2_init_dirent(inode, dent, name, bh);
702}
703
704static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
705 struct buffer_head **bhp)
706{
707 int error;
708
709 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp);
710 if (!error && gfs2_metatype_check(dip->i_sbd, *bhp, GFS2_METATYPE_LF))
711 error = -EIO;
712
713 return error;
714}
715
716/**
717 * get_leaf_nr - Get a leaf number associated with the index
718 * @dip: The GFS2 inode
719 * @index:
720 * @leaf_out:
721 *
722 * Returns: 0 on success, error code otherwise
723 */
724
725static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index,
726 uint64_t *leaf_out)
727{
728 uint64_t leaf_no;
729 int error;
730
731 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
732 index * sizeof(uint64_t),
733 sizeof(uint64_t));
734 if (error != sizeof(uint64_t))
735 return (error < 0) ? error : -EIO;
736
737 *leaf_out = be64_to_cpu(leaf_no);
738
739 return 0;
740}
741
742static int get_first_leaf(struct gfs2_inode *dip, uint32_t index,
743 struct buffer_head **bh_out)
744{
745 uint64_t leaf_no;
746 int error;
747
748 error = get_leaf_nr(dip, index, &leaf_no);
749 if (!error)
750 error = get_leaf(dip, leaf_no, bh_out);
751
752 return error;
753}
754
755static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
756 const struct qstr *name,
757 gfs2_dscan_t scan,
758 struct buffer_head **pbh)
759{
760 struct buffer_head *bh;
761 struct gfs2_dirent *dent;
762 struct gfs2_inode *ip = inode->u.generic_ip;
763 int error;
764
765 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
766 struct gfs2_leaf *leaf;
767 unsigned hsize = 1 << ip->i_di.di_depth;
768 unsigned index;
769 u64 ln;
770 if (hsize * sizeof(u64) != ip->i_di.di_size) {
771 gfs2_consist_inode(ip);
772 return ERR_PTR(-EIO);
773 }
774
775 index = name->hash >> (32 - ip->i_di.di_depth);
776 error = get_first_leaf(ip, index, &bh);
777 if (error)
778 return ERR_PTR(error);
779 do {
780 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
781 scan, name, NULL);
782 if (dent)
783 goto got_dent;
784 leaf = (struct gfs2_leaf *)bh->b_data;
785 ln = be64_to_cpu(leaf->lf_next);
786 brelse(bh);
787 if (!ln)
788 break;
789 error = get_leaf(ip, ln, &bh);
790 } while(!error);
791
792 return error ? ERR_PTR(error) : NULL;
793 }
794
795 error = gfs2_meta_inode_buffer(ip, &bh);
796 if (error)
797 return ERR_PTR(error);
798 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
799got_dent:
800 if (unlikely(dent == NULL || IS_ERR(dent))) {
801 brelse(bh);
802 bh = NULL;
803 }
804 *pbh = bh;
805 return dent;
806}
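
/*
 * Sketch of the exhash lookup path above, assuming a directory at
 * di_depth = 8 (a 256-slot hash table):
 *
 *	index = name->hash >> (32 - 8);	/* top 8 bits pick a slot */
 *	get_first_leaf(ip, index, &bh);	/* slot -> first leaf block */
 *	/* gfs2_dirent_scan() each leaf, following lf_next until the
 *	   scan callback matches or the chain ends */
 *
 * Stuffed directories skip all of this and scan the dinode block.
 */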
807
808static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
809{
810 struct gfs2_inode *ip = inode->u.generic_ip;
811 u64 bn = gfs2_alloc_meta(ip);
812 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
813 struct gfs2_leaf *leaf;
814 struct gfs2_dirent *dent;
815 struct qstr name = { .name = "", .len = 0, .hash = 0 };
816 if (!bh)
817 return NULL;
818 gfs2_trans_add_bh(ip->i_gl, bh, 1);
819 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
820 leaf = (struct gfs2_leaf *)bh->b_data;
821 leaf->lf_depth = cpu_to_be16(depth);
822 leaf->lf_entries = cpu_to_be16(0);
823 leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
824 leaf->lf_next = cpu_to_be64(0);
825 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
826 dent = (struct gfs2_dirent *)(leaf+1);
827 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
828 *pbh = bh;
829 return leaf;
830}
831
832/**
833 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
834 * @dip: The GFS2 inode
835 *
836 * Returns: 0 on success, error code otherwise
837 */
838
839static int dir_make_exhash(struct inode *inode)
840{
841 struct gfs2_inode *dip = inode->u.generic_ip;
842 struct gfs2_sbd *sdp = dip->i_sbd;
843 struct gfs2_dirent *dent;
844 struct qstr args;
845 struct buffer_head *bh, *dibh;
846 struct gfs2_leaf *leaf;
847 int y;
848 uint32_t x;
849 uint64_t *lp, bn;
850 int error;
851
852 error = gfs2_meta_inode_buffer(dip, &dibh);
853 if (error)
854 return error;
855
856 /* Turn over a new leaf */
857
858 leaf = new_leaf(inode, &bh, 0);
859 if (!leaf)
860 return -ENOSPC;
861 bn = bh->b_blocknr;
862
863 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
864 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
865
866 /* Copy dirents */
867
868 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
869 sizeof(struct gfs2_dinode));
870
871 /* Find last entry */
872
873 x = 0;
874 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
875 sizeof(struct gfs2_leaf);
876 args.name = bh->b_data;
877 dent = gfs2_dirent_scan(dip->i_vnode, bh->b_data, bh->b_size,
878 gfs2_dirent_last, &args, NULL);
879 if (!dent) {
880 brelse(bh);
881 brelse(dibh);
882 return -EIO;
883 }
884 if (IS_ERR(dent)) {
885 brelse(bh);
886 brelse(dibh);
887 return PTR_ERR(dent);
888 }
889
890 /* Adjust the last dirent's record length
891 (Remember that dent still points to the last entry.) */
892
893 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
894 sizeof(struct gfs2_dinode) -
895 sizeof(struct gfs2_leaf));
896
897 brelse(bh);
898
899 /* We're done with the new leaf block, now set up the new
900 hash table. */
901
902 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
903 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
904
905 lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
906
907 for (x = sdp->sd_hash_ptrs; x--; lp++)
908 *lp = cpu_to_be64(bn);
909
910 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
911 dip->i_di.di_blocks++;
912 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
913 dip->i_di.di_payload_format = 0;
914
915 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
916 dip->i_di.di_depth = y;
917
918 gfs2_dinode_out(&dip->i_di, dibh->b_data);
919
920 brelse(dibh);
921
922 return 0;
923}
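
/*
 * Worked example, assuming 4096-byte blocks: sd_hash_bsize is half a
 * block (2048 bytes), so sd_hash_ptrs = 2048 / sizeof(u64) = 256.
 * The empty for-loop above is just ilog2(): it shifts x right until
 * it reaches zero, leaving di_depth = y = 8.  All 256 slots of the
 * new table point at the single new leaf, and di_size becomes 2048,
 * the size of the hash table now stuffed in the dinode.
 */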
924
925/**
926 * dir_split_leaf - Split a leaf block into two
927 * @inode: The directory inode
928 * @name: The name whose hash selects the leaf to split
929 *
930 * Returns: 0 on success, 1 if the hash table must be doubled
931 * before the leaf can split, error code on failure
932 */
933
934static int dir_split_leaf(struct inode *inode, const struct qstr *name)
935{
936 struct gfs2_inode *dip = inode->u.generic_ip;
937 struct buffer_head *nbh, *obh, *dibh;
938 struct gfs2_leaf *nleaf, *oleaf;
939 struct gfs2_dirent *dent, *prev = NULL, *next = NULL, *new;
940 uint32_t start, len, half_len, divider;
941 uint64_t bn, *lp, leaf_no;
942 uint32_t index;
943 int x, moved = 0;
944 int error;
945
946 index = name->hash >> (32 - dip->i_di.di_depth);
947 error = get_leaf_nr(dip, index, &leaf_no);
948 if (error)
949 return error;
950
951 /* Get the old leaf block */
952 error = get_leaf(dip, leaf_no, &obh);
953 if (error)
954 return error;
955
956 oleaf = (struct gfs2_leaf *)obh->b_data;
957 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
958 brelse(obh);
959 return 1; /* can't split */
960 }
961
962 gfs2_trans_add_bh(dip->i_gl, obh, 1);
963
964 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
965 if (!nleaf) {
966 brelse(obh);
967 return -ENOSPC;
968 }
969 bn = nbh->b_blocknr;
970
971 /* Compute the start and len of leaf pointers in the hash table. */
972 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
973 half_len = len >> 1;
974 if (!half_len) {
975 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
976 gfs2_consist_inode(dip);
977 error = -EIO;
978 goto fail_brelse;
979 }
980
981 start = (index & ~(len - 1));
982
983 /* Change the pointers.
984 Don't bother distinguishing stuffed from non-stuffed.
985 This code is complicated enough already. */
986 lp = kmalloc(half_len * sizeof(uint64_t), GFP_NOFS | __GFP_NOFAIL);
987 /* Fill the array with the new leaf's block number */
988 for (x = 0; x < half_len; x++)
989 lp[x] = cpu_to_be64(bn);
990
991 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(uint64_t),
992 half_len * sizeof(uint64_t));
993 if (error != half_len * sizeof(uint64_t)) {
994 if (error >= 0)
995 error = -EIO;
996 goto fail_lpfree;
997 }
998
999 kfree(lp);
1000
1001 /* Compute the divider */
1002 divider = (start + half_len) << (32 - dip->i_di.di_depth);
1003
1004 /* Copy the entries */
1005 dirent_first(dip, obh, &dent);
1006
1007 do {
1008 next = dent;
1009 if (dirent_next(dip, obh, &next))
1010 next = NULL;
1011
1012 if (dent->de_inum.no_addr &&
1013 be32_to_cpu(dent->de_hash) < divider) {
1014 struct qstr str;
1015 str.name = (char*)(dent+1);
1016 str.len = be16_to_cpu(dent->de_name_len);
1017 str.hash = be32_to_cpu(dent->de_hash);
1018 new = gfs2_dirent_alloc(inode, nbh, &str);
1019 if (IS_ERR(new)) {
1020 error = PTR_ERR(new);
1021 break;
1022 }
1023
1024 new->de_inum = dent->de_inum; /* No endian worries */
1025 new->de_type = dent->de_type; /* No endian worries */
1026 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1027
1028 dirent_del(dip, obh, prev, dent);
1029
1030 if (!oleaf->lf_entries)
1031 gfs2_consist_inode(dip);
1032 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1033
1034 if (!prev)
1035 prev = dent;
1036
1037 moved = 1;
1038 } else {
1039 prev = dent;
1040 }
1041 dent = next;
1042 } while (dent);
1043
1044 oleaf->lf_depth = nleaf->lf_depth;
1045
1046 error = gfs2_meta_inode_buffer(dip, &dibh);
1047 if (!gfs2_assert_withdraw(dip->i_sbd, !error)) {
1048 dip->i_di.di_blocks++;
1049 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1050 brelse(dibh);
1051 }
1052
1053 brelse(obh);
1054 brelse(nbh);
1055
1056 return error;
1057
1058fail_lpfree:
1059 kfree(lp);
1060
1061fail_brelse:
1062 brelse(obh);
1063 brelse(nbh);
1064 return error;
1065}
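
/*
 * Split example (hypothetical geometry): with di_depth = 8 and an old
 * leaf at lf_depth = 7, len = 1 << (8 - 7) = 2 table slots alias the
 * leaf and half_len = 1 of them is repointed at the new leaf.  For
 * start = 8, divider = 9 << 24, so entries hashing into slot 8
 * (de_hash < divider) migrate to the new leaf while slot 9's entries
 * stay put; both leaves then carry lf_depth = 8.
 */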
1066
1067/**
1068 * dir_double_exhash - Double size of ExHash table
1069 * @dip: The GFS2 dinode
1070 *
1071 * Returns: 0 on success, error code on failure
1072 */
1073
1074static int dir_double_exhash(struct gfs2_inode *dip)
1075{
1076 struct gfs2_sbd *sdp = dip->i_sbd;
1077 struct buffer_head *dibh;
1078 uint32_t hsize;
1079 uint64_t *buf;
1080 uint64_t *from, *to;
1081 uint64_t block;
1082 int x;
1083 int error = 0;
1084
1085 hsize = 1 << dip->i_di.di_depth;
1086 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1087 gfs2_consist_inode(dip);
1088 return -EIO;
1089 }
1090
1091 /* Allocate the "from" buffer plus the double-size "to" buffer in one chunk */
1092
1093 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1094
1095 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1096 error = gfs2_dir_read_data(dip, (char *)buf,
1097 block * sdp->sd_hash_bsize,
1098 sdp->sd_hash_bsize);
1099 if (error != sdp->sd_hash_bsize) {
1100 if (error >= 0)
1101 error = -EIO;
1102 goto fail;
1103 }
1104
1105 from = buf;
1106 to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
1107
1108 for (x = sdp->sd_hash_ptrs; x--; from++) {
1109 *to++ = *from; /* No endianness worries */
1110 *to++ = *from;
1111 }
1112
1113 error = gfs2_dir_write_data(dip,
1114 (char *)buf + sdp->sd_hash_bsize,
1115 block * sdp->sd_sb.sb_bsize,
1116 sdp->sd_sb.sb_bsize);
1117 if (error != sdp->sd_sb.sb_bsize) {
1118 if (error >= 0)
1119 error = -EIO;
1120 goto fail;
1121 }
1122 }
1123
1124 kfree(buf);
1125
1126 error = gfs2_meta_inode_buffer(dip, &dibh);
1127 if (!gfs2_assert_withdraw(sdp, !error)) {
1128 dip->i_di.di_depth++;
1129 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1130 brelse(dibh);
1131 }
1132
1133 return error;
1134
1135 fail:
1136 kfree(buf);
1137
1138 return error;
1139}
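
/*
 * Doubling sketch: each slot is written out twice in order, so a
 * table [A, B, C, D] at di_depth = 2 becomes [A, A, B, B, C, C, D, D]
 * at di_depth = 3.  Leaves keep their lf_depth, so the wider table
 * simply aliases each leaf from twice as many slots until a split
 * separates them.
 */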
1140
1141/**
1142 * compare_dents - compare directory entries by hash value
1143 * @a: first dent
1144 * @b: second dent
1145 *
1146 * When comparing the hash entries of @a and @b:
1147 * gt: returns 1
1148 * lt: returns -1
1149 * eq: returns 0
1150 */
1151
1152static int compare_dents(const void *a, const void *b)
1153{
1154 struct gfs2_dirent *dent_a, *dent_b;
1155 uint32_t hash_a, hash_b;
1156 int ret = 0;
1157
1158 dent_a = *(struct gfs2_dirent **)a;
1159 hash_a = be32_to_cpu(dent_a->de_hash);
1160
1161 dent_b = *(struct gfs2_dirent **)b;
1162 hash_b = be32_to_cpu(dent_b->de_hash);
1163
1164 if (hash_a > hash_b)
1165 ret = 1;
1166 else if (hash_a < hash_b)
1167 ret = -1;
1168 else {
1169 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1170 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1171
1172 if (len_a > len_b)
1173 ret = 1;
1174 else if (len_a < len_b)
1175 ret = -1;
1176 else
1177 ret = memcmp((char *)(dent_a + 1),
1178 (char *)(dent_b + 1),
1179 len_a);
1180 }
1181
1182 return ret;
1183}
1184
1185/**
1186 * do_filldir_main - read out directory entries
1187 * @dip: The GFS2 inode
1188 * @offset: The offset in the file to read from
1189 * @opaque: opaque data to pass to filldir
1190 * @filldir: The function to pass entries to
1191 * @darr: an array of struct gfs2_dirent pointers to read
1192 * @entries: the number of entries in darr
1193 * @copied: pointer to int that's non-zero if an entry has been copied out
1194 *
1195 * Jump through some hoops to make sure that if there are hash collisions,
1196 * they are read out at the beginning of a buffer. We want to minimize
1197 * the possibility that they will fall into different readdir buffers or
1198 * that someone will want to seek to that location.
1199 *
1200 * Returns: errno, >0 on exception from filldir
1201 */
1202
1203static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset,
1204 void *opaque, gfs2_filldir_t filldir,
1205 const struct gfs2_dirent **darr, uint32_t entries,
1206 int *copied)
1207{
1208 const struct gfs2_dirent *dent, *dent_next;
1209 struct gfs2_inum inum;
1210 uint64_t off, off_next;
1211 unsigned int x, y;
1212 int run = 0;
1213 int error = 0;
1214
1215 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1216
1217 dent_next = darr[0];
1218 off_next = be32_to_cpu(dent_next->de_hash);
1219 off_next = gfs2_disk_hash2offset(off_next);
1220
1221 for (x = 0, y = 1; x < entries; x++, y++) {
1222 dent = dent_next;
1223 off = off_next;
1224
1225 if (y < entries) {
1226 dent_next = darr[y];
1227 off_next = be32_to_cpu(dent_next->de_hash);
1228 off_next = gfs2_disk_hash2offset(off_next);
1229
1230 if (off < *offset)
1231 continue;
1232 *offset = off;
1233
1234 if (off_next == off) {
1235 if (*copied && !run)
1236 return 1;
1237 run = 1;
1238 } else
1239 run = 0;
1240 } else {
1241 if (off < *offset)
1242 continue;
1243 *offset = off;
1244 }
1245
1246 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1247
1248 error = filldir(opaque, (char *)(dent + 1),
1249 be16_to_cpu(dent->de_name_len),
1250 off, &inum,
1251 be16_to_cpu(dent->de_type));
1252 if (error)
1253 return 1;
1254
1255 *copied = 1;
1256 }
1257
1258 /* Increment the *offset by one, so the next time we come into the
1259 do_filldir function, we get the next entry instead of the last one in
1260 the current leaf */
1261
1262 (*offset)++;
1263
1264 return 0;
1265}
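
/*
 * Why the run logic above matters: readdir offsets are derived from
 * entry hashes, so colliding entries share a single offset.  If a
 * colliding run were split across filldir buffers, a reader resuming
 * at that offset could see duplicates or miss entries; returning 1
 * just before a run starts pushes the whole run into the next buffer.
 */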
1266
1267static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1268 gfs2_filldir_t filldir, int *copied,
1269 unsigned *depth, u64 leaf_no)
1270{
1271 struct gfs2_inode *ip = inode->u.generic_ip;
1272 struct buffer_head *bh;
1273 struct gfs2_leaf *lf;
1274 unsigned entries = 0;
1275 unsigned leaves = 0;
1276 const struct gfs2_dirent **darr, *dent;
1277 struct dirent_gather g;
1278 struct buffer_head **larr;
1279 int leaf = 0;
1280 int error, i;
1281 u64 lfn = leaf_no;
1282
1283 do {
1284 error = get_leaf(ip, lfn, &bh);
1285 if (error)
1286 goto out;
1287 lf = (struct gfs2_leaf *)bh->b_data;
1288 if (leaves == 0)
1289 *depth = be16_to_cpu(lf->lf_depth);
1290 entries += be16_to_cpu(lf->lf_entries);
1291 leaves++;
1292 lfn = be64_to_cpu(lf->lf_next);
1293 brelse(bh);
1294 } while(lfn);
1295
1296 if (!entries)
1297 return 0;
1298
1299 error = -ENOMEM;
1300 larr = vmalloc((leaves + entries) * sizeof(void*));
1301 if (!larr)
1302 goto out;
1303 darr = (const struct gfs2_dirent **)(larr + leaves);
1304 g.pdent = darr;
1305 g.offset = 0;
1306 lfn = leaf_no;
1307
1308 do {
1309 error = get_leaf(ip, lfn, &bh);
1310 if (error)
1311 goto out_kfree;
1312 lf = (struct gfs2_leaf *)bh->b_data;
1313 lfn = be64_to_cpu(lf->lf_next);
1314 if (lf->lf_entries) {
1315 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1316 gfs2_dirent_gather, NULL, &g);
1317 error = PTR_ERR(dent);
1318 if (IS_ERR(dent)) {
1319 goto out_kfree;
1320 }
1321 error = 0;
1322 larr[leaf++] = bh;
1323 } else {
1324 brelse(bh);
1325 }
1326 } while(lfn);
1327
1328 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1329 entries, copied);
1330out_kfree:
1331 for(i = 0; i < leaf; i++)
1332 brelse(larr[i]);
1333 vfree(larr);
1334out:
1335 return error;
1336}
1337
1338/**
1339 * dir_e_read - Reads the entries from a directory into a filldir buffer
1340 * @dip: dinode pointer
1341 * @offset: the hash of the last entry read shifted to the right once
1342 * @opaque: buffer for the filldir function to fill
1343 * @filldir: points to the filldir function to use
1344 *
1345 * Returns: errno
1346 */
1347
1348static int dir_e_read(struct inode *inode, uint64_t *offset, void *opaque,
1349 gfs2_filldir_t filldir)
1350{
1351 struct gfs2_inode *dip = inode->u.generic_ip;
1352 struct gfs2_sbd *sdp = dip->i_sbd;
1353 uint32_t hsize, len = 0;
1354 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1355 uint32_t hash, index;
1356 uint64_t *lp;
1357 int copied = 0;
1358 int error = 0;
1359 unsigned depth;
1360
1361 hsize = 1 << dip->i_di.di_depth;
1362 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1363 gfs2_consist_inode(dip);
1364 return -EIO;
1365 }
1366
1367 hash = gfs2_dir_offset2hash(*offset);
1368 index = hash >> (32 - dip->i_di.di_depth);
1369
1370 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1371 if (!lp)
1372 return -ENOMEM;
1373
1374 while (index < hsize) {
1375 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1376 ht_offset = index - lp_offset;
1377
1378 if (ht_offset_cur != ht_offset) {
1379 error = gfs2_dir_read_data(dip, (char *)lp,
1380 ht_offset * sizeof(uint64_t),
1381 sdp->sd_hash_bsize);
1382 if (error != sdp->sd_hash_bsize) {
1383 if (error >= 0)
1384 error = -EIO;
1385 goto out;
1386 }
1387 ht_offset_cur = ht_offset;
1388 }
1389
1390 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1391 &copied, &depth,
1392 be64_to_cpu(lp[lp_offset]));
1393 if (error)
1394 break;
1395
1396 len = 1 << (dip->i_di.di_depth - depth);
1397 index = (index & ~(len - 1)) + len;
1398 }
1399
1400out:
1401 kfree(lp);
1402 if (error > 0)
1403 error = 0;
1404 return error;
1405}
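
/*
 * Slot-skipping sketch: after reading one leaf chain, the code rounds
 * index down to the first of the 2^(di_depth - depth) slots aliasing
 * that leaf and jumps past all of them.  E.g. with di_depth = 8 and a
 * leaf at depth 6, len = 4, so index 13 advances to 16 and no chain
 * is read (or reported) twice.
 */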
1406
1407int gfs2_dir_read(struct inode *inode, uint64_t *offset, void *opaque,
1408 gfs2_filldir_t filldir)
1409{
1410 struct gfs2_inode *dip = inode->u.generic_ip;
1411 struct dirent_gather g;
1412 const struct gfs2_dirent **darr, *dent;
1413 struct buffer_head *dibh;
1414 int copied = 0;
1415 int error;
1416
1417 if (!dip->i_di.di_entries)
1418 return 0;
1419
1420 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1421 return dir_e_read(inode, offset, opaque, filldir);
1422
1423 if (!gfs2_is_stuffed(dip)) {
1424 gfs2_consist_inode(dip);
1425 return -EIO;
1426 }
1427
1428 error = gfs2_meta_inode_buffer(dip, &dibh);
1429 if (error)
1430 return error;
1431
1432 error = -ENOMEM;
1433 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1434 GFP_KERNEL);
1435 if (darr) {
1436 g.pdent = darr;
1437 g.offset = 0;
1438 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1439 gfs2_dirent_gather, NULL, &g);
1440 if (IS_ERR(dent)) {
1441 error = PTR_ERR(dent);
1442 goto out;
1443 }
1444 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1445 dip->i_di.di_entries, &copied);
1446out:
1447 kfree(darr);
1448 }
1449
1450 if (error > 0)
1451 error = 0;
1452
1453 brelse(dibh);
1454
1455 return error;
1456}
1457
1458/**
1459 * gfs2_dir_search - Search a directory
1460 * @dir: The directory to search
1461 * @name: The name to look up
1462 * @inum and @type: filled in with what was found, if non-NULL
1463 *
1464 * This routine searches a directory for a file or another directory.
1465 * Assumes a glock is held on @dir.
1466 *
1467 * Returns: errno
1468 */
1469
1470int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1471 struct gfs2_inum *inum, unsigned int *type)
1472{
1473 struct buffer_head *bh;
1474 struct gfs2_dirent *dent;
1475
1476 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1477 if (dent) {
1478 if (IS_ERR(dent))
1479 return PTR_ERR(dent);
1480 if (inum)
1481 gfs2_inum_in(inum, (char *)&dent->de_inum);
1482 if (type)
1483 *type = be16_to_cpu(dent->de_type);
1484 brelse(bh);
1485 return 0;
1486 }
1487 return -ENOENT;
1488}
1489
1490static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1491{
1492 struct buffer_head *bh, *obh;
1493 struct gfs2_inode *ip = inode->u.generic_ip;
1494 struct gfs2_leaf *leaf, *oleaf;
1495 int error;
1496 u32 index;
1497 u64 bn;
1498
1499 index = name->hash >> (32 - ip->i_di.di_depth);
1500 error = get_first_leaf(ip, index, &obh);
1501 if (error)
1502 return error;
1503 do {
1504 oleaf = (struct gfs2_leaf *)obh->b_data;
1505 bn = be64_to_cpu(oleaf->lf_next);
1506 if (!bn)
1507 break;
1508 brelse(obh);
1509 error = get_leaf(ip, bn, &obh);
1510 if (error)
1511 return error;
1512 } while(1);
1513
1514 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1515
1516 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1517 if (!leaf) {
1518 brelse(obh);
1519 return -ENOSPC;
1520 }
1521 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1522 brelse(bh);
1523 brelse(obh);
1524
1525 error = gfs2_meta_inode_buffer(ip, &bh);
1526 if (error)
1527 return error;
1528 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1529 ip->i_di.di_blocks++;
1530 gfs2_dinode_out(&ip->i_di, bh->b_data);
1531 brelse(bh);
1532 return 0;
1533}
1534
1535/**
1536 * gfs2_dir_add - Add a new filename into a directory
1537 * @inode: The directory
1538 * @name: The new name
1539 * @inum: The inode number of the entry
1540 * @type: The type of the entry
1541 *
1542 * Returns: 0 on success, error code on failure
1543 */
1544
1545int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1546 const struct gfs2_inum *inum, unsigned type)
1547{
1548 struct gfs2_inode *ip = inode->u.generic_ip;
1549 struct buffer_head *bh;
1550 struct gfs2_dirent *dent;
1551 struct gfs2_leaf *leaf;
1552 int error;
1553
1554 while(1) {
1555 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1556 &bh);
1557 if (dent) {
1558 if (IS_ERR(dent))
1559 return PTR_ERR(dent);
1560 dent = gfs2_init_dirent(inode, dent, name, bh);
1561 gfs2_inum_out(inum, (char *)&dent->de_inum);
1562 dent->de_type = cpu_to_be16(type);
1563 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1564 leaf = (struct gfs2_leaf *)bh->b_data;
1565 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1566 }
1567 brelse(bh);
1568 error = gfs2_meta_inode_buffer(ip, &bh);
1569 if (error)
1570 break;
1571 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1572 ip->i_di.di_entries++;
1573 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1574 gfs2_dinode_out(&ip->i_di, bh->b_data);
1575 brelse(bh);
1576 error = 0;
1577 break;
1578 }
1579 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1580 error = dir_make_exhash(inode);
1581 if (error)
1582 break;
1583 continue;
1584 }
1585 error = dir_split_leaf(inode, name);
1586 if (error == 0)
1587 continue;
1588 if (error < 0)
1589 break;
1590 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1591 error = dir_double_exhash(ip);
1592 if (error)
1593 break;
1594 error = dir_split_leaf(inode, name);
1595 if (error < 0)
1596 break;
1597 if (error == 0)
1598 continue;
1599 }
1600 error = dir_new_leaf(inode, name);
1601 if (!error)
1602 continue;
1603 error = -ENOSPC;
1604 break;
1605 }
1606 return error;
1607}
1608
1609
1610/**
1611 * gfs2_dir_del - Delete a directory entry
1612 * @dip: The GFS2 inode
1613 * @name: The name of the entry to delete
1614 *
1615 * Returns: 0 on success, error code on failure
1616 */
1617
1618int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1619{
1620 struct gfs2_dirent *dent, *prev = NULL;
1621 struct buffer_head *bh;
1622 int error;
1623
1624 /* Returns _either_ the entry (if it's first in the block) or the
1625 previous entry otherwise */
1626 dent = gfs2_dirent_search(dip->i_vnode, name, gfs2_dirent_prev, &bh);
1627 if (!dent) {
1628 gfs2_consist_inode(dip);
1629 return -EIO;
1630 }
1631 if (IS_ERR(dent)) {
1632 gfs2_consist_inode(dip);
1633 return PTR_ERR(dent);
1634 }
1635 /* If not first in block, adjust pointers accordingly */
1636 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1637 prev = dent;
1638 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1639 }
1640
1641 dirent_del(dip, bh, prev, dent);
1642 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1643 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1644 u16 entries = be16_to_cpu(leaf->lf_entries);
1645 if (!entries)
1646 gfs2_consist_inode(dip);
1647 leaf->lf_entries = cpu_to_be16(--entries);
1648 }
1649 brelse(bh);
1650
1651 error = gfs2_meta_inode_buffer(dip, &bh);
1652 if (error)
1653 return error;
1654
1655 if (!dip->i_di.di_entries)
1656 gfs2_consist_inode(dip);
1657 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1658 dip->i_di.di_entries--;
1659 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1660 gfs2_dinode_out(&dip->i_di, bh->b_data);
1661 brelse(bh);
1662
1663 return error;
1664}
1665
1666/**
1667 * gfs2_dir_mvino - Change inode number of directory entry
1668 * @dip: The GFS2 inode
1669 * @filename: The name of the entry to retarget
1670 * @inum: The new inode number; @new_type is the new entry type
1671 *
1672 * This routine changes the inode number of a directory entry. It's used
1673 * by rename to change ".." when a directory is moved.
1674 * Assumes a glock is held on @dip.
1675 *
1676 * Returns: errno
1677 */
1678
1679int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1680 struct gfs2_inum *inum, unsigned int new_type)
1681{
1682 struct buffer_head *bh;
1683 struct gfs2_dirent *dent;
1684 int error;
1685
1686 dent = gfs2_dirent_search(dip->i_vnode, filename, gfs2_dirent_find, &bh);
1687 if (!dent) {
1688 gfs2_consist_inode(dip);
1689 return -EIO;
1690 }
1691 if (IS_ERR(dent))
1692 return PTR_ERR(dent);
1693
1694 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1695 gfs2_inum_out(inum, (char *)&dent->de_inum);
1696 dent->de_type = cpu_to_be16(new_type);
1697
1698 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1699 brelse(bh);
1700 error = gfs2_meta_inode_buffer(dip, &bh);
1701 if (error)
1702 return error;
1703 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1704 }
1705
1706 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1707 gfs2_dinode_out(&dip->i_di, bh->b_data);
1708 brelse(bh);
1709 return 0;
1710}
1711
1712/**
1713 * foreach_leaf - call a function for each leaf in a directory
1714 * @dip: the directory
1715 * @lc: the function to call for each leaf
1716 * @data: private data to pass to it
1717 *
1718 * Returns: errno
1719 */
1720
1721static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1722{
1723 struct gfs2_sbd *sdp = dip->i_sbd;
1724 struct buffer_head *bh;
1725 struct gfs2_leaf *leaf;
1726 uint32_t hsize, len;
1727 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1728 uint32_t index = 0;
1729 uint64_t *lp;
1730 uint64_t leaf_no;
1731 int error = 0;
1732
1733 hsize = 1 << dip->i_di.di_depth;
1734 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1735 gfs2_consist_inode(dip);
1736 return -EIO;
1737 }
1738
1739 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1740 if (!lp)
1741 return -ENOMEM;
1742
1743 while (index < hsize) {
1744 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1745 ht_offset = index - lp_offset;
1746
1747 if (ht_offset_cur != ht_offset) {
1748 error = gfs2_dir_read_data(dip, (char *)lp,
1749 ht_offset * sizeof(uint64_t),
1750 sdp->sd_hash_bsize);
1751 if (error != sdp->sd_hash_bsize) {
1752 if (error >= 0)
1753 error = -EIO;
1754 goto out;
1755 }
1756 ht_offset_cur = ht_offset;
1757 }
1758
1759 leaf_no = be64_to_cpu(lp[lp_offset]);
1760 if (leaf_no) {
1761 error = get_leaf(dip, leaf_no, &bh);
1762 if (error)
1763 goto out;
1764 leaf = (struct gfs2_leaf *)bh->b_data;
1765 brelse(bh);
1766
1767 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1768
1769 error = lc(dip, index, len, leaf_no, data);
1770 if (error)
1771 goto out;
1772
1773 index = (index & ~(len - 1)) + len;
1774 } else
1775 index++;
1776 }
1777
1778 if (index != hsize) {
1779 gfs2_consist_inode(dip);
1780 error = -EIO;
1781 }
1782
1783 out:
1784 kfree(lp);
1785
1786 return error;
1787}
1788
1789/**
1790 * leaf_dealloc - Deallocate a directory leaf
1791 * @dip: the directory
1792 * @index: the hash table offset in the directory
1793 * @len: the number of pointers to this leaf
1794 * @leaf_no: the leaf number
1795 * @data: not used
1796 *
1797 * Returns: errno
1798 */
1799
1800static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len,
1801 uint64_t leaf_no, void *data)
1802{
1803 struct gfs2_sbd *sdp = dip->i_sbd;
1804 struct gfs2_leaf *tmp_leaf;
1805 struct gfs2_rgrp_list rlist;
1806 struct buffer_head *bh, *dibh;
1807 uint64_t blk, nblk;
1808 unsigned int rg_blocks = 0, l_blocks = 0;
1809 char *ht;
1810 unsigned int x, size = len * sizeof(uint64_t);
1811 int error;
1812
1813 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1814
1815 ht = kzalloc(size, GFP_KERNEL);
1816 if (!ht)
1817 return -ENOMEM;
1818
1819 gfs2_alloc_get(dip);
1820
1821 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1822 if (error)
1823 goto out;
1824
1825 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1826 if (error)
1827 goto out_qs;
1828
1829 /* Count the number of leaves */
1830
1831 for (blk = leaf_no; blk; blk = nblk) {
1832 error = get_leaf(dip, blk, &bh);
1833 if (error)
1834 goto out_rlist;
1835 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1836 nblk = be64_to_cpu(tmp_leaf->lf_next);
1837 brelse(bh);
1838
1839 gfs2_rlist_add(sdp, &rlist, blk);
1840 l_blocks++;
1841 }
1842
1843 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1844
1845 for (x = 0; x < rlist.rl_rgrps; x++) {
1846 struct gfs2_rgrpd *rgd;
1847 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1848 rg_blocks += rgd->rd_ri.ri_length;
1849 }
1850
1851 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1852 if (error)
1853 goto out_rlist;
1854
1855 error = gfs2_trans_begin(sdp,
1856 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1857 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1858 if (error)
1859 goto out_rg_gunlock;
1860
1861 for (blk = leaf_no; blk; blk = nblk) {
1862 error = get_leaf(dip, blk, &bh);
1863 if (error)
1864 goto out_end_trans;
1865 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1866 nblk = be64_to_cpu(tmp_leaf->lf_next);
1867 brelse(bh);
1868
1869 gfs2_free_meta(dip, blk, 1);
1870
1871 if (!dip->i_di.di_blocks)
1872 gfs2_consist_inode(dip);
1873 dip->i_di.di_blocks--;
1874 }
1875
1876 error = gfs2_dir_write_data(dip, ht, index * sizeof(uint64_t), size);
1877 if (error != size) {
1878 if (error >= 0)
1879 error = -EIO;
1880 goto out_end_trans;
1881 }
1882
1883 error = gfs2_meta_inode_buffer(dip, &dibh);
1884 if (error)
1885 goto out_end_trans;
1886
1887 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1888 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1889 brelse(dibh);
1890
1891 out_end_trans:
1892 gfs2_trans_end(sdp);
1893
1894 out_rg_gunlock:
1895 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1896
1897 out_rlist:
1898 gfs2_rlist_free(&rlist);
1899 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1900
1901 out_qs:
1902 gfs2_quota_unhold(dip);
1903
1904 out:
1905 gfs2_alloc_put(dip);
1906 kfree(ht);
1907
1908 return error;
1909}
1910
1911/**
1912 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1913 * @dip: the directory
1914 *
1915 * Dealloc all on-disk directory leaves to FREEMETA state
1916 * Change on-disk inode type to "regular file"
1917 *
1918 * Returns: errno
1919 */
1920
1921int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1922{
1923 struct gfs2_sbd *sdp = dip->i_sbd;
1924 struct buffer_head *bh;
1925 int error;
1926
1927 /* Dealloc on-disk leaves to FREEMETA state */
1928 error = foreach_leaf(dip, leaf_dealloc, NULL);
1929 if (error)
1930 return error;
1931
1932 /* Make this a regular file in case we crash.
1933 (We don't want to free these blocks a second time.) */
1934
1935 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1936 if (error)
1937 return error;
1938
1939 error = gfs2_meta_inode_buffer(dip, &bh);
1940 if (!error) {
1941 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1942 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1943 cpu_to_be32(S_IFREG);
1944 brelse(bh);
1945 }
1946
1947 gfs2_trans_end(sdp);
1948
1949 return error;
1950}
1951
1952/**
1953 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
1954 * @inode: the directory being written to
1955 * @name: the filename that's going to be added
1956 *
1957 * Returns: 1 if alloc required, 0 if not, -ve on error
1958 */
1959
1960int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1961{
1962 struct gfs2_dirent *dent;
1963 struct buffer_head *bh;
1964
1965 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1966 if (!dent) {
1967 return 1;
1968 }
1969 if (IS_ERR(dent))
1970 return PTR_ERR(dent);
1971 brelse(bh);
1972 return 0;
1973}
1974
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..173403095eb2
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,73 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13/**
14 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
15 * @opaque: opaque data used by the function
16 * @name: the name of the directory entry
17 * @length: the length of the name
18 * @offset: the entry's offset in the directory
19 * @inum: the inode number the entry points to
20 * @type: the type of inode the entry points to
21 *
22 * Returns: 0 on success, 1 if buffer full
23 */
24
25typedef int (*gfs2_filldir_t) (void *opaque,
26 const char *name, unsigned int length,
27 uint64_t offset,
28 struct gfs2_inum *inum, unsigned int type);
29
30int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
31 struct gfs2_inum *inum, unsigned int *type);
32int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
33 const struct gfs2_inum *inum, unsigned int type);
34int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
35int gfs2_dir_read(struct inode *inode, uint64_t * offset, void *opaque,
36 gfs2_filldir_t filldir);
37int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
38 struct gfs2_inum *new_inum, unsigned int new_type);
39
40int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
41
42int gfs2_diradd_alloc_required(struct inode *dir,
43 const struct qstr *filename);
44int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block,
45 struct buffer_head **bhp);
46
47static inline uint32_t gfs2_disk_hash(const char *data, int len)
48{
49 return crc32_le(0xFFFFFFFF, data, len) ^ 0xFFFFFFFF;
50}
51
52
53static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
54{
55 name->name = fname;
56 name->len = strlen(fname);
57 name->hash = gfs2_disk_hash(name->name, name->len);
58}
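
/*
 * Typical lookup usage (a sketch; error handling elided, @dir is the
 * directory inode):
 *
 *	struct gfs2_inum inum;
 *	unsigned int type;
 *	struct qstr q;
 *
 *	gfs2_str2qstr(&q, "foo");
 *	error = gfs2_dir_search(dir, &q, &inum, &type);
 *
 * The hash is computed once here and reused for every leaf lookup.
 */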
59
60/* N.B. This probably ought to take inum & type as args as well */
61static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
62{
63 dent->de_inum.no_addr = cpu_to_be64(0);
64 dent->de_inum.no_formal_ino = cpu_to_be64(0);
65 dent->de_hash = cpu_to_be32(name->hash);
66 dent->de_rec_len = cpu_to_be16(reclen);
67 dent->de_name_len = cpu_to_be16(name->len);
68 dent->de_type = cpu_to_be16(0);
69 memset(dent->__pad, 0, sizeof(dent->__pad));
70 memcpy((char*)(dent+1), name->name, name->len);
71}
72
73#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..2243b44ecb07
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,229 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "util.h"
26
27/**
28 * gfs2_ea_name2type - get the type of the ea, and strip the type prefix from the name
29 * @name: ea name, possibly with a type prefix
30 *
31 * Returns: GFS2_EATYPE_XXX
32 */
33
34unsigned int gfs2_ea_name2type(const char *name, char **truncated_name)
35{
36 unsigned int type;
37
38 if (strncmp(name, "system.", 7) == 0) {
39 type = GFS2_EATYPE_SYS;
40 if (truncated_name)
41 *truncated_name = strchr(name, '.') + 1;
42 } else if (strncmp(name, "user.", 5) == 0) {
43 type = GFS2_EATYPE_USR;
44 if (truncated_name)
45 *truncated_name = strchr(name, '.') + 1;
46 } else if (strncmp(name, "security.", 9) == 0) {
47 type = GFS2_EATYPE_SECURITY;
48 if (truncated_name)
49 *truncated_name = strchr(name, '.') + 1;
50 } else {
51 type = GFS2_EATYPE_UNUSED;
52 if (truncated_name)
53 *truncated_name = NULL;
54 }
55
56 return type;
57}
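
/*
 * Examples of the mapping above: "user.frob" yields GFS2_EATYPE_USR
 * with *truncated_name pointing at "frob"; "security.selinux" yields
 * GFS2_EATYPE_SECURITY with "selinux"; a name with no recognized
 * prefix yields GFS2_EATYPE_UNUSED and a NULL truncated name, which
 * callers must reject.
 */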
58
59static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
60{
61 struct inode *inode = ip->i_vnode;
62 int error = permission(inode, MAY_READ, NULL);
63 if (error)
64 return error;
65
66 return gfs2_ea_get_i(ip, er);
67}
68
69static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
70{
71 struct inode *inode = ip->i_vnode;
72
73 if (S_ISREG(inode->i_mode) ||
74 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
75 int error = permission(inode, MAY_WRITE, NULL);
76 if (error)
77 return error;
78 } else
79 return -EPERM;
80
81 return gfs2_ea_set_i(ip, er);
82}
83
84static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
85{
86 struct inode *inode = ip->i_vnode;
87
88 if (S_ISREG(inode->i_mode) ||
89 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
90 int error = permission(inode, MAY_WRITE, NULL);
91 if (error)
92 return error;
93 } else
94 return -EPERM;
95
96 return gfs2_ea_remove_i(ip, er);
97}
98
99static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
100{
101 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
102 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
103 !capable(CAP_SYS_ADMIN))
104 return -EPERM;
105
106 if (ip->i_sbd->sd_args.ar_posix_acl == 0 &&
107 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
108 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
109 return -EOPNOTSUPP;
110
111
112
113 return gfs2_ea_get_i(ip, er);
114}
115
116static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
117{
118 int remove = 0;
119 int error;
120
121 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
122 if (!(er->er_flags & GFS2_ERF_MODE)) {
123 er->er_mode = ip->i_di.di_mode;
124 er->er_flags |= GFS2_ERF_MODE;
125 }
126 error = gfs2_acl_validate_set(ip, 1, er,
127 &remove, &er->er_mode);
128 if (error)
129 return error;
130 error = gfs2_ea_set_i(ip, er);
131 if (error)
132 return error;
133 if (remove)
134 gfs2_ea_remove_i(ip, er);
135 return 0;
136
137 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
138 error = gfs2_acl_validate_set(ip, 0, er,
139 &remove, NULL);
140 if (error)
141 return error;
142 if (!remove)
143 error = gfs2_ea_set_i(ip, er);
144 else {
145 error = gfs2_ea_remove_i(ip, er);
146 if (error == -ENODATA)
147 error = 0;
148 }
149 return error;
150 }
151
152 return -EPERM;
153}
154
155static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
156{
157 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
158 int error = gfs2_acl_validate_remove(ip, 1);
159 if (error)
160 return error;
161
162 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
163 int error = gfs2_acl_validate_remove(ip, 0);
164 if (error)
165 return error;
166
167 } else
168 return -EPERM;
169
170 return gfs2_ea_remove_i(ip, er);
171}
172
173static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
174{
175 struct inode *inode = ip->i_vnode;
176 int error = permission(inode, MAY_READ, NULL);
177 if (error)
178 return error;
179
180 return gfs2_ea_get_i(ip, er);
181}
182
183static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
184{
185 struct inode *inode = ip->i_vnode;
186 int error = permission(inode, MAY_WRITE, NULL);
187 if (error)
188 return error;
189
190 return gfs2_ea_set_i(ip, er);
191}
192
193static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
194{
195 struct inode *inode = ip->i_vnode;
196 int error = permission(inode, MAY_WRITE, NULL);
197 if (error)
198 return error;
199
200 return gfs2_ea_remove_i(ip, er);
201}
202
203static struct gfs2_eattr_operations gfs2_user_eaops = {
204 .eo_get = user_eo_get,
205 .eo_set = user_eo_set,
206 .eo_remove = user_eo_remove,
207 .eo_name = "user",
208};
209
210struct gfs2_eattr_operations gfs2_system_eaops = {
211 .eo_get = system_eo_get,
212 .eo_set = system_eo_set,
213 .eo_remove = system_eo_remove,
214 .eo_name = "system",
215};
216
217struct gfs2_eattr_operations gfs2_security_eaops = {
218 .eo_get = security_eo_get,
219 .eo_set = security_eo_set,
220 .eo_remove = security_eo_remove,
221 .eo_name = "security",
222};
223
224struct gfs2_eattr_operations *gfs2_ea_ops[] = {
225 NULL,
226 &gfs2_user_eaops,
227 &gfs2_system_eaops,
228};
229
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..965a235c96e8
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14
15struct gfs2_eattr_operations {
16 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
17 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 char *eo_name;
20};
21
22unsigned int gfs2_ea_name2type(const char *name, char **truncated_name);
23
24extern struct gfs2_eattr_operations gfs2_system_eaops;
25
26extern struct gfs2_eattr_operations gfs2_security_eaops;
27
28extern struct gfs2_eattr_operations *gfs2_ea_ops[];
29
30#endif /* __EAOPS_DOT_H__ */
31
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..346601538ac7
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1549 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "acl.h"
23#include "eaops.h"
24#include "eattr.h"
25#include "glock.h"
26#include "inode.h"
27#include "meta_io.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33/**
34 * ea_calc_size - returns the actual number of bytes the request will take up
35 * (not counting any unstuffed data blocks)
36 * @sdp:
37 * @er:
38 * @size:
39 *
40 * Returns: 1 if the EA should be stuffed
41 */
42
43static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
44 unsigned int *size)
45{
46 *size = GFS2_EAREQ_SIZE_STUFFED(er);
47 if (*size <= sdp->sd_jbsize)
48 return 1;
49
50 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
51
52 return 0;
53}
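
/*
 * Stuffed-vs-unstuffed sketch, assuming sd_jbsize is the block size
 * minus the metadata header: a request whose header, name and data
 * together fit in one journaled block is stuffed into the EA block
 * itself; anything larger switches to the unstuffed layout, where the
 * EA header holds block pointers and the data is spread over
 * DIV_ROUND_UP(er_data_len, sd_jbsize) separate data blocks.
 */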
54
55static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
56{
57 unsigned int size;
58
59 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
60 return -ERANGE;
61
62 ea_calc_size(sdp, er, &size);
63
64 /* This can only happen with 512 byte blocks */
65 if (size > sdp->sd_jbsize)
66 return -ERANGE;
67
68 return 0;
69}
70
71typedef int (*ea_call_t) (struct gfs2_inode *ip,
72 struct buffer_head *bh,
73 struct gfs2_ea_header *ea,
74 struct gfs2_ea_header *prev,
75 void *private);
76
77static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
78 ea_call_t ea_call, void *data)
79{
80 struct gfs2_ea_header *ea, *prev = NULL;
81 int error = 0;
82
83 if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_EA))
84 return -EIO;
85
86 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
87 if (!GFS2_EA_REC_LEN(ea))
88 goto fail;
89 if (!(bh->b_data <= (char *)ea &&
90 (char *)GFS2_EA2NEXT(ea) <=
91 bh->b_data + bh->b_size))
92 goto fail;
93 if (!GFS2_EATYPE_VALID(ea->ea_type))
94 goto fail;
95
96 error = ea_call(ip, bh, ea, prev, data);
97 if (error)
98 return error;
99
100 if (GFS2_EA_IS_LAST(ea)) {
101 if ((char *)GFS2_EA2NEXT(ea) !=
102 bh->b_data + bh->b_size)
103 goto fail;
104 break;
105 }
106 }
107
108 return error;
109
110 fail:
111 gfs2_consist_inode(ip);
112 return -EIO;
113}
114
115static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
116{
117 struct buffer_head *bh, *eabh;
118 uint64_t *eablk, *end;
119 int error;
120
121 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
122 DIO_START | DIO_WAIT, &bh);
123 if (error)
124 return error;
125
126 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
127 error = ea_foreach_i(ip, bh, ea_call, data);
128 goto out;
129 }
130
131 if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_IN)) {
132 error = -EIO;
133 goto out;
134 }
135
136 eablk = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header));
137 end = eablk + ip->i_sbd->sd_inptrs;
138
139 for (; eablk < end; eablk++) {
140 uint64_t bn;
141
142 if (!*eablk)
143 break;
144 bn = be64_to_cpu(*eablk);
145
146 error = gfs2_meta_read(ip->i_gl, bn, DIO_START | DIO_WAIT,
147 &eabh);
148 if (error)
149 break;
150 error = ea_foreach_i(ip, eabh, ea_call, data);
151 brelse(eabh);
152 if (error)
153 break;
154 }
155 out:
156 brelse(bh);
157
158 return error;
159}
160
161struct ea_find {
162 struct gfs2_ea_request *ef_er;
163 struct gfs2_ea_location *ef_el;
164};
165
166static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
167 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
168 void *private)
169{
170 struct ea_find *ef = private;
171 struct gfs2_ea_request *er = ef->ef_er;
172
173 if (ea->ea_type == GFS2_EATYPE_UNUSED)
174 return 0;
175
176 if (ea->ea_type == er->er_type) {
177 if (ea->ea_name_len == er->er_name_len &&
178 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
179 struct gfs2_ea_location *el = ef->ef_el;
180 get_bh(bh);
181 el->el_bh = bh;
182 el->el_ea = ea;
183 el->el_prev = prev;
184 return 1;
185 }
186 }
187
188#if 0
189 else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) &&
190 er->er_type == GFS2_EATYPE_SYS)
191 return 1;
192#endif
193
194 return 0;
195}
196
197int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
198 struct gfs2_ea_location *el)
199{
200 struct ea_find ef;
201 int error;
202
203 ef.ef_er = er;
204 ef.ef_el = el;
205
206 memset(el, 0, sizeof(struct gfs2_ea_location));
207
208 error = ea_foreach(ip, ea_find_i, &ef);
209 if (error > 0)
210 return 0;
211
212 return error;
213}
214
215/**
216 * ea_dealloc_unstuffed -
217 * @ip:
218 * @bh:
219 * @ea:
220 * @prev:
221 * @private:
222 *
223 * Take advantage of the fact that all unstuffed blocks are
224 * allocated from the same RG. Beware, though: this may not always
225 * be true.
226 *
227 * Returns: errno
228 */
229
230static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
231 struct gfs2_ea_header *ea,
232 struct gfs2_ea_header *prev, void *private)
233{
234 int *leave = private;
235 struct gfs2_sbd *sdp = ip->i_sbd;
236 struct gfs2_rgrpd *rgd;
237 struct gfs2_holder rg_gh;
238 struct buffer_head *dibh;
239 uint64_t *dataptrs, bn = 0;
240 uint64_t bstart = 0;
241 unsigned int blen = 0;
242 unsigned int blks = 0;
243 unsigned int x;
244 int error;
245
246 if (GFS2_EA_IS_STUFFED(ea))
247 return 0;
248
249 dataptrs = GFS2_EA2DATAPTRS(ea);
250 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++)
251 if (*dataptrs) {
252 blks++;
253 bn = be64_to_cpu(*dataptrs);
254 }
255 if (!blks)
256 return 0;
257
258 rgd = gfs2_blk2rgrpd(sdp, bn);
259 if (!rgd) {
260 gfs2_consist_inode(ip);
261 return -EIO;
262 }
263
264 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
265 if (error)
266 return error;
267
268 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length +
269 RES_DINODE + RES_EATTR + RES_STATFS +
270 RES_QUOTA, blks);
271 if (error)
272 goto out_gunlock;
273
274 gfs2_trans_add_bh(ip->i_gl, bh, 1);
275
276 dataptrs = GFS2_EA2DATAPTRS(ea);
277 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
278 if (!*dataptrs)
279 break;
280 bn = be64_to_cpu(*dataptrs);
281
282 if (bstart + blen == bn)
283 blen++;
284 else {
285 if (bstart)
286 gfs2_free_meta(ip, bstart, blen);
287 bstart = bn;
288 blen = 1;
289 }
290
291 *dataptrs = 0;
292 if (!ip->i_di.di_blocks)
293 gfs2_consist_inode(ip);
294 ip->i_di.di_blocks--;
295 }
296 if (bstart)
297 gfs2_free_meta(ip, bstart, blen);
298
299 if (prev && !leave) {
300 uint32_t len;
301
302 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
303 prev->ea_rec_len = cpu_to_be32(len);
304
305 if (GFS2_EA_IS_LAST(ea))
306 prev->ea_flags |= GFS2_EAFLAG_LAST;
307 } else {
308 ea->ea_type = GFS2_EATYPE_UNUSED;
309 ea->ea_num_ptrs = 0;
310 }
311
312 error = gfs2_meta_inode_buffer(ip, &dibh);
313 if (!error) {
314 ip->i_di.di_ctime = get_seconds();
315 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
316 gfs2_dinode_out(&ip->i_di, dibh->b_data);
317 brelse(dibh);
318 }
319
320 gfs2_trans_end(sdp);
321
322 out_gunlock:
323 gfs2_glock_dq_uninit(&rg_gh);
324
325 return error;
326}
327
328static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
329 struct gfs2_ea_header *ea,
330 struct gfs2_ea_header *prev, int leave)
331{
332 struct gfs2_alloc *al;
333 int error;
334
335 al = gfs2_alloc_get(ip);
336
337 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
338 if (error)
339 goto out_alloc;
340
341 error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
342 if (error)
343 goto out_quota;
344
345 error = ea_dealloc_unstuffed(ip,
346 bh, ea, prev,
347 (leave) ? &error : NULL);
348
349 gfs2_glock_dq_uninit(&al->al_ri_gh);
350
351 out_quota:
352 gfs2_quota_unhold(ip);
353
354 out_alloc:
355 gfs2_alloc_put(ip);
356
357 return error;
358}
359
360struct ea_list {
361 struct gfs2_ea_request *ei_er;
362 unsigned int ei_size;
363};
364
365static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
366 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
367 void *private)
368{
369 struct ea_list *ei = private;
370 struct gfs2_ea_request *er = ei->ei_er;
371 unsigned int ea_size = gfs2_ea_strlen(ea);
372
373 if (ea->ea_type == GFS2_EATYPE_UNUSED)
374 return 0;
375
376 if (er->er_data_len) {
377 char *prefix;
378 unsigned int l;
379 char c = 0;
380
381 if (ei->ei_size + ea_size > er->er_data_len)
382 return -ERANGE;
383
384 switch (ea->ea_type) {
385 case GFS2_EATYPE_USR:
386 prefix = "user.";
387 l = 5;
388 break;
389 case GFS2_EATYPE_SYS:
390 prefix = "system.";
391 l = 7;
392 break;
393 case GFS2_EATYPE_SECURITY:
394 prefix = "security.";
395 l = 9;
396 break;
397 default:
398 /* FIXME: Needs looking at again */
399 break;
400 }
401
402 memcpy(er->er_data + ei->ei_size, prefix, l);
403 memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
404 ea->ea_name_len);
405 memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
406 }
407
408 ei->ei_size += ea_size;
409
410 return 0;
411}
412
413/**
414 * gfs2_ea_list -
415 * @ip:
416 * @er:
417 *
418 * Returns: actual size of data on success, -errno on error
419 */
420
421int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
422{
423 struct gfs2_holder i_gh;
424 int error;
425
426 if (!er->er_data || !er->er_data_len) {
427 er->er_data = NULL;
428 er->er_data_len = 0;
429 }
430
431 error = gfs2_glock_nq_init(ip->i_gl,
432 LM_ST_SHARED, LM_FLAG_ANY,
433 &i_gh);
434 if (error)
435 return error;
436
437 if (ip->i_di.di_eattr) {
438 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
439
440 error = ea_foreach(ip, ea_list_i, &ei);
441 if (!error)
442 error = ei.ei_size;
443 }
444
445 gfs2_glock_dq_uninit(&i_gh);
446
447 return error;
448}
449
450/**
451 * ea_get_unstuffed - actually copies the unstuffed data into the
452 * request buffer
453 * @ip:
454 * @ea:
455 * @data:
456 *
457 * Returns: errno
458 */
459
460static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
461 char *data)
462{
463 struct gfs2_sbd *sdp = ip->i_sbd;
464 struct buffer_head **bh;
465 unsigned int amount = GFS2_EA_DATA_LEN(ea);
466 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
467 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
468 unsigned int x;
469 int error = 0;
470
471 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
472 if (!bh)
473 return -ENOMEM;
474
475 for (x = 0; x < nptrs; x++) {
476 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
477 DIO_START, bh + x);
478 if (error) {
479 while (x--)
480 brelse(bh[x]);
481 goto out;
482 }
483 dataptrs++;
484 }
485
486 for (x = 0; x < nptrs; x++) {
487 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
488 if (error) {
489 for (; x < nptrs; x++)
490 brelse(bh[x]);
491 goto out;
492 }
493 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
494 for (; x < nptrs; x++)
495 brelse(bh[x]);
496 error = -EIO;
497 goto out;
498 }
499
500 memcpy(data,
501 bh[x]->b_data + sizeof(struct gfs2_meta_header),
502 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
503
504 amount -= sdp->sd_jbsize;
505 data += sdp->sd_jbsize;
506
507 brelse(bh[x]);
508 }
509
510 out:
511 kfree(bh);
512
513 return error;
514}
515
516int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
517 char *data)
518{
519 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
520 memcpy(data,
521 GFS2_EA2DATA(el->el_ea),
522 GFS2_EA_DATA_LEN(el->el_ea));
523 return 0;
524 } else
525 return ea_get_unstuffed(ip, el->el_ea, data);
526}
527
528/**
529 * gfs2_ea_get_i -
530 * @ip:
531 * @er:
532 *
533 * Returns: actual size of data on success, -errno on error
534 */
535
536int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
537{
538 struct gfs2_ea_location el;
539 int error;
540
541 if (!ip->i_di.di_eattr)
542 return -ENODATA;
543
544 error = gfs2_ea_find(ip, er, &el);
545 if (error)
546 return error;
547 if (!el.el_ea)
548 return -ENODATA;
549
550 if (er->er_data_len) {
551 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
552 error = -ERANGE;
553 else
554 error = gfs2_ea_get_copy(ip, &el, er->er_data);
555 }
556 if (!error)
557 error = GFS2_EA_DATA_LEN(el.el_ea);
558
559 brelse(el.el_bh);
560
561 return error;
562}
563
564/**
565 * gfs2_ea_get -
566 * @ip:
567 * @er:
568 *
569 * Returns: actual size of data on success, -errno on error
570 */
571
572int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
573{
574 struct gfs2_holder i_gh;
575 int error;
576
577 if (!er->er_name_len ||
578 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
579 return -EINVAL;
580 if (!er->er_data || !er->er_data_len) {
581 er->er_data = NULL;
582 er->er_data_len = 0;
583 }
584
585 error = gfs2_glock_nq_init(ip->i_gl,
586 LM_ST_SHARED, LM_FLAG_ANY,
587 &i_gh);
588 if (error)
589 return error;
590
591 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
592
593 gfs2_glock_dq_uninit(&i_gh);
594
595 return error;
596}
597
598/**
599 * ea_alloc_blk - allocates a new block for extended attributes.
600 * @ip: A pointer to the inode that's getting extended attributes
601 * @bhp:
602 *
603 * Returns: errno
604 */
605
606static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
607{
608 struct gfs2_sbd *sdp = ip->i_sbd;
609 struct gfs2_ea_header *ea;
610 uint64_t block;
611
612 block = gfs2_alloc_meta(ip);
613
614 *bhp = gfs2_meta_new(ip->i_gl, block);
615 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
616 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
617 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
618
619 ea = GFS2_EA_BH2FIRST(*bhp);
620 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
621 ea->ea_type = GFS2_EATYPE_UNUSED;
622 ea->ea_flags = GFS2_EAFLAG_LAST;
623 ea->ea_num_ptrs = 0;
624
625 ip->i_di.di_blocks++;
626
627 return 0;
628}
629
630/**
631 * ea_write - writes the request info to an ea, creating new blocks if
632 * necessary
633 * @ip: inode that is being modified
634 * @ea: the location of the new ea in a block
635 * @er: the write request
636 *
637 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
638 *
639 * Returns: errno
640 */
641
642static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
643 struct gfs2_ea_request *er)
644{
645 struct gfs2_sbd *sdp = ip->i_sbd;
646
647 ea->ea_data_len = cpu_to_be32(er->er_data_len);
648 ea->ea_name_len = er->er_name_len;
649 ea->ea_type = er->er_type;
650 ea->__pad = 0;
651
652 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
653
654 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
655 ea->ea_num_ptrs = 0;
656 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
657 } else {
658 uint64_t *dataptr = GFS2_EA2DATAPTRS(ea);
659 const char *data = er->er_data;
660 unsigned int data_len = er->er_data_len;
661 unsigned int copy;
662 unsigned int x;
663
664 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
665 for (x = 0; x < ea->ea_num_ptrs; x++) {
666 struct buffer_head *bh;
667 uint64_t block;
668 int mh_size = sizeof(struct gfs2_meta_header);
669
670 block = gfs2_alloc_meta(ip);
671
672 bh = gfs2_meta_new(ip->i_gl, block);
673 gfs2_trans_add_bh(ip->i_gl, bh, 1);
674 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
675
676 ip->i_di.di_blocks++;
677
678 copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize :
679 data_len;
680 memcpy(bh->b_data + mh_size, data, copy);
681 if (copy < sdp->sd_jbsize)
682 memset(bh->b_data + mh_size + copy, 0,
683 sdp->sd_jbsize - copy);
684
685 *dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr);
686 data += copy;
687 data_len -= copy;
688
689 brelse(bh);
690 }
691
692 gfs2_assert_withdraw(sdp, !data_len);
693 }
694
695 return 0;
696}
697
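ea_write() stuffs the value in-line whenever GFS2_EAREQ_SIZE_STUFFED() fits in
one journaled block, and otherwise stores only block pointers after the name.
A worked example, assuming a 16-byte struct gfs2_ea_header and 4096-byte
blocks (so sd_jbsize is about 4072):

    /* name "foo" (3 bytes), value 100 bytes:
     *   ALIGN(16 + 3 + 100, 8) = 120 <= 4072 -> stuffed, ea_num_ptrs == 0
     * name "foo", value 10000 bytes:
     *   16 + 3 + 10000 > 4072 -> unstuffed,
     *   ea_num_ptrs = DIV_ROUND_UP(10000, 4072) = 3 data blocks
     */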
698typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
699 struct gfs2_ea_request *er,
700 void *private);
701
702static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
703 unsigned int blks,
704 ea_skeleton_call_t skeleton_call,
705 void *private)
706{
707 struct gfs2_alloc *al;
708 struct buffer_head *dibh;
709 int error;
710
711 al = gfs2_alloc_get(ip);
712
713 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
714 if (error)
715 goto out;
716
717 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
718 if (error)
719 goto out_gunlock_q;
720
721 al->al_requested = blks;
722
723 error = gfs2_inplace_reserve(ip);
724 if (error)
725 goto out_gunlock_q;
726
727 error = gfs2_trans_begin(ip->i_sbd,
728 blks + al->al_rgd->rd_ri.ri_length +
729 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
730 if (error)
731 goto out_ipres;
732
733 error = skeleton_call(ip, er, private);
734 if (error)
735 goto out_end_trans;
736
737 error = gfs2_meta_inode_buffer(ip, &dibh);
738 if (!error) {
739 if (er->er_flags & GFS2_ERF_MODE) {
740 gfs2_assert_withdraw(ip->i_sbd,
741 (ip->i_di.di_mode & S_IFMT) ==
742 (er->er_mode & S_IFMT));
743 ip->i_di.di_mode = er->er_mode;
744 }
745 ip->i_di.di_ctime = get_seconds();
746 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
747 gfs2_dinode_out(&ip->i_di, dibh->b_data);
748 brelse(dibh);
749 }
750
751 out_end_trans:
752 gfs2_trans_end(ip->i_sbd);
753
754 out_ipres:
755 gfs2_inplace_release(ip);
756
757 out_gunlock_q:
758 gfs2_quota_unlock(ip);
759
760 out:
761 gfs2_alloc_put(ip);
762
763 return error;
764}
765
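ea_alloc_skeleton() is the block-allocating transaction boilerplate shared by
every setting path below; the unwind labels mirror the setup steps exactly.
As a sketch of the nesting:

    /* gfs2_alloc_get(ip);
     *   gfs2_quota_lock() + gfs2_quota_check();   quota glocks and limits
     *     gfs2_inplace_reserve(ip);               reserve in a resource group
     *       gfs2_trans_begin(sdp, ...);           blks + rgrp bitmap + dinode
     *         skeleton_call(ip, er, private);     the actual EA work
     *       gfs2_trans_end(sdp);
     *     gfs2_inplace_release(ip);
     *   gfs2_quota_unlock(ip);
     * gfs2_alloc_put(ip);
     */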
766static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
767 void *private)
768{
769 struct buffer_head *bh;
770 int error;
771
772 error = ea_alloc_blk(ip, &bh);
773 if (error)
774 return error;
775
776 ip->i_di.di_eattr = bh->b_blocknr;
777 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
778
779 brelse(bh);
780
781 return error;
782}
783
784/**
785 * ea_init - initializes a new eattr block
786 * @ip: the inode getting its first extended attribute
787 * @er: the write request for that attribute
788 *
789 * Returns: errno
790 */
791
792static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
793{
794 unsigned int jbsize = ip->i_sbd->sd_jbsize;
795 unsigned int blks = 1;
796
797 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
798 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
799
800 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
801}
802
803static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
804{
805 uint32_t ea_size = GFS2_EA_SIZE(ea);
806 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
807 ea_size);
808 uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size;
809 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
810
811 ea->ea_rec_len = cpu_to_be32(ea_size);
812 ea->ea_flags ^= last;
813
814 new->ea_rec_len = cpu_to_be32(new_size);
815 new->ea_flags = last;
816
817 return new;
818}
819
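ea_split_ea() trims the slack off an oversized record so the tail can hold a
new EA. For example, a 64-byte record whose EA occupies only 24 bytes (its
GFS2_EA_SIZE) becomes a 24-byte record followed by a fresh 40-byte one; the
XOR clears GFS2_EAFLAG_LAST on the original and the new tail inherits it:

    /* before:  |<---------- ea_rec_len = 64, LAST ---------->|
     *          | hdr | name | data |          slack          |
     * after:   |<- rec_len = 24 ->|<----- rec_len = 40, LAST ----->|
     */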
820static void ea_set_remove_stuffed(struct gfs2_inode *ip,
821 struct gfs2_ea_location *el)
822{
823 struct gfs2_ea_header *ea = el->el_ea;
824 struct gfs2_ea_header *prev = el->el_prev;
825 uint32_t len;
826
827 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
828
829 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
830 ea->ea_type = GFS2_EATYPE_UNUSED;
831 return;
832 } else if (GFS2_EA2NEXT(prev) != ea) {
833 prev = GFS2_EA2NEXT(prev);
834 gfs2_assert_withdraw(ip->i_sbd, GFS2_EA2NEXT(prev) == ea);
835 }
836
837 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
838 prev->ea_rec_len = cpu_to_be32(len);
839
840 if (GFS2_EA_IS_LAST(ea))
841 prev->ea_flags |= GFS2_EAFLAG_LAST;
842}
843
844struct ea_set {
845 int ea_split;
846
847 struct gfs2_ea_request *es_er;
848 struct gfs2_ea_location *es_el;
849
850 struct buffer_head *es_bh;
851 struct gfs2_ea_header *es_ea;
852};
853
854static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
855 struct gfs2_ea_header *ea, struct ea_set *es)
856{
857 struct gfs2_ea_request *er = es->es_er;
858 struct buffer_head *dibh;
859 int error;
860
861 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + 2 * RES_EATTR, 0);
862 if (error)
863 return error;
864
865 gfs2_trans_add_bh(ip->i_gl, bh, 1);
866
867 if (es->ea_split)
868 ea = ea_split_ea(ea);
869
870 ea_write(ip, ea, er);
871
872 if (es->es_el)
873 ea_set_remove_stuffed(ip, es->es_el);
874
875 error = gfs2_meta_inode_buffer(ip, &dibh);
876 if (error)
877 goto out;
878
879 if (er->er_flags & GFS2_ERF_MODE) {
880 gfs2_assert_withdraw(ip->i_sbd,
881 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
882 ip->i_di.di_mode = er->er_mode;
883 }
884 ip->i_di.di_ctime = get_seconds();
885 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
886 gfs2_dinode_out(&ip->i_di, dibh->b_data);
887 brelse(dibh);
888 out:
889 gfs2_trans_end(ip->i_sbd);
890
891 return error;
892}
893
894static int ea_set_simple_alloc(struct gfs2_inode *ip,
895 struct gfs2_ea_request *er, void *private)
896{
897 struct ea_set *es = private;
898 struct gfs2_ea_header *ea = es->es_ea;
899 int error;
900
901 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
902
903 if (es->ea_split)
904 ea = ea_split_ea(ea);
905
906 error = ea_write(ip, ea, er);
907 if (error)
908 return error;
909
910 if (es->es_el)
911 ea_set_remove_stuffed(ip, es->es_el);
912
913 return 0;
914}
915
916static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
917 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
918 void *private)
919{
920 struct ea_set *es = private;
921 unsigned int size;
922 int stuffed;
923 int error;
924
925 stuffed = ea_calc_size(ip->i_sbd, es->es_er, &size);
926
927 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
928 if (GFS2_EA_REC_LEN(ea) < size)
929 return 0;
930 if (!GFS2_EA_IS_STUFFED(ea)) {
931 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
932 if (error)
933 return error;
934 }
935 es->ea_split = 0;
936 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
937 es->ea_split = 1;
938 else
939 return 0;
940
941 if (stuffed) {
942 error = ea_set_simple_noalloc(ip, bh, ea, es);
943 if (error)
944 return error;
945 } else {
946 unsigned int blks;
947
948 es->es_bh = bh;
949 es->es_ea = ea;
950 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
951 ip->i_sbd->sd_jbsize);
952
953 error = ea_alloc_skeleton(ip, es->es_er, blks,
954 ea_set_simple_alloc, es);
955 if (error)
956 return error;
957 }
958
959 return 1;
960}
961
962static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
963 void *private)
964{
965 struct gfs2_sbd *sdp = ip->i_sbd;
966 struct buffer_head *indbh, *newbh;
967 uint64_t *eablk;
968 int error;
969 int mh_size = sizeof(struct gfs2_meta_header);
970
971 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
972 uint64_t *end;
973
974 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
975 DIO_START | DIO_WAIT, &indbh);
976 if (error)
977 return error;
978
979 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
980 error = -EIO;
981 goto out;
982 }
983
984 eablk = (uint64_t *)(indbh->b_data + mh_size);
985 end = eablk + sdp->sd_inptrs;
986
987 for (; eablk < end; eablk++)
988 if (!*eablk)
989 break;
990
991 if (eablk == end) {
992 error = -ENOSPC;
993 goto out;
994 }
995
996 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
997 } else {
998 uint64_t blk;
999
1000 blk = gfs2_alloc_meta(ip);
1001
1002 indbh = gfs2_meta_new(ip->i_gl, blk);
1003 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1004 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
1005 gfs2_buffer_clear_tail(indbh, mh_size);
1006
1007 eablk = (uint64_t *)(indbh->b_data + mh_size);
1008 *eablk = cpu_to_be64(ip->i_di.di_eattr);
1009 ip->i_di.di_eattr = blk;
1010 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
1011 ip->i_di.di_blocks++;
1012
1013 eablk++;
1014 }
1015
1016 error = ea_alloc_blk(ip, &newbh);
1017 if (error)
1018 goto out;
1019
1020 *eablk = cpu_to_be64((uint64_t)newbh->b_blocknr);
1021 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
1022 brelse(newbh);
1023 if (error)
1024 goto out;
1025
1026 if (private)
1027 ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private);
1028
1029 out:
1030 brelse(indbh);
1031
1032 return error;
1033}
1034
1035static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1036 struct gfs2_ea_location *el)
1037{
1038 struct ea_set es;
1039 unsigned int blks = 2;
1040 int error;
1041
1042 memset(&es, 0, sizeof(struct ea_set));
1043 es.es_er = er;
1044 es.es_el = el;
1045
1046 error = ea_foreach(ip, ea_set_simple, &es);
1047 if (error > 0)
1048 return 0;
1049 if (error)
1050 return error;
1051
1052 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1053 blks++;
1054 if (GFS2_EAREQ_SIZE_STUFFED(er) > ip->i_sbd->sd_jbsize)
1055 blks += DIV_ROUND_UP(er->er_data_len, ip->i_sbd->sd_jbsize);
1056
1057 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1058}
1059
1060static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1061 struct gfs2_ea_location *el)
1062{
1063 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1064 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1065 gfs2_assert_withdraw(ip->i_sbd,
1066 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1067 }
1068
1069 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1070}
1071
1072int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1073{
1074 struct gfs2_ea_location el;
1075 int error;
1076
1077 if (!ip->i_di.di_eattr) {
1078 if (er->er_flags & XATTR_REPLACE)
1079 return -ENODATA;
1080 return ea_init(ip, er);
1081 }
1082
1083 error = gfs2_ea_find(ip, er, &el);
1084 if (error)
1085 return error;
1086
1087 if (el.el_ea) {
1088 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1089 brelse(el.el_bh);
1090 return -EPERM;
1091 }
1092
1093 error = -EEXIST;
1094 if (!(er->er_flags & XATTR_CREATE)) {
1095 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1096 error = ea_set_i(ip, er, &el);
1097 if (!error && unstuffed)
1098 ea_set_remove_unstuffed(ip, &el);
1099 }
1100
1101 brelse(el.el_bh);
1102 } else {
1103 error = -ENODATA;
1104 if (!(er->er_flags & XATTR_REPLACE))
1105 error = ea_set_i(ip, er, NULL);
1106 }
1107
1108 return error;
1109}
1110
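The flag handling above gives gfs2_ea_set_i() the standard setxattr(2)
semantics:

    /* attribute exists + XATTR_CREATE        -> -EEXIST
     * attribute absent + XATTR_REPLACE       -> -ENODATA
     * attribute exists on append-only inode  -> -EPERM
     * no flag                                -> create, or replace in place
     */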
1111int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1112{
1113 struct gfs2_holder i_gh;
1114 int error;
1115
1116 if (!er->er_name_len ||
1117 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1118 return -EINVAL;
1119 if (!er->er_data || !er->er_data_len) {
1120 er->er_data = NULL;
1121 er->er_data_len = 0;
1122 }
1123 error = ea_check_size(ip->i_sbd, er);
1124 if (error)
1125 return error;
1126
1127 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1128 if (error)
1129 return error;
1130
1131 if (IS_IMMUTABLE(ip->i_vnode))
1132 error = -EPERM;
1133 else
1134 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1135
1136 gfs2_glock_dq_uninit(&i_gh);
1137
1138 return error;
1139}
1140
1141static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1142{
1143 struct gfs2_ea_header *ea = el->el_ea;
1144 struct gfs2_ea_header *prev = el->el_prev;
1145 struct buffer_head *dibh;
1146 int error;
1147
1148 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
1149 if (error)
1150 return error;
1151
1152 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1153
1154 if (prev) {
1155 uint32_t len;
1156
1157 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1158 prev->ea_rec_len = cpu_to_be32(len);
1159
1160 if (GFS2_EA_IS_LAST(ea))
1161 prev->ea_flags |= GFS2_EAFLAG_LAST;
1162 } else
1163 ea->ea_type = GFS2_EATYPE_UNUSED;
1164
1165 error = gfs2_meta_inode_buffer(ip, &dibh);
1166 if (!error) {
1167 ip->i_di.di_ctime = get_seconds();
1168 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1169 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1170 brelse(dibh);
1171 }
1172
1173 gfs2_trans_end(ip->i_sbd);
1174
1175 return error;
1176}
1177
1178int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1179{
1180 struct gfs2_ea_location el;
1181 int error;
1182
1183 if (!ip->i_di.di_eattr)
1184 return -ENODATA;
1185
1186 error = gfs2_ea_find(ip, er, &el);
1187 if (error)
1188 return error;
1189 if (!el.el_ea)
1190 return -ENODATA;
1191
1192 if (GFS2_EA_IS_STUFFED(el.el_ea))
1193 error = ea_remove_stuffed(ip, &el);
1194 else
1195 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1196 0);
1197
1198 brelse(el.el_bh);
1199
1200 return error;
1201}
1202
1203/**
1204 * gfs2_ea_remove - remove an extended attribute
1205 * @ip: pointer to the inode of the target file
1206 * @er: request information
1207 *
1208 * Returns: errno
1209 */
1210
1211int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1212{
1213 struct gfs2_holder i_gh;
1214 int error;
1215
1216 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1217 return -EINVAL;
1218
1219 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1220 if (error)
1221 return error;
1222
1223 if (IS_IMMUTABLE(ip->i_vnode) || IS_APPEND(ip->i_vnode))
1224 error = -EPERM;
1225 else
1226 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1227
1228 gfs2_glock_dq_uninit(&i_gh);
1229
1230 return error;
1231}
1232
1233static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1234 struct gfs2_ea_header *ea, char *data)
1235{
1236 struct gfs2_sbd *sdp = ip->i_sbd;
1237 struct buffer_head **bh;
1238 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1239 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1240 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
1241 unsigned int x;
1242 int error;
1243
1244 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1245 if (!bh)
1246 return -ENOMEM;
1247
1248 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1249 if (error)
1250 goto out;
1251
1252 for (x = 0; x < nptrs; x++) {
1253 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
1254 DIO_START, bh + x);
1255 if (error) {
1256 while (x--)
1257 brelse(bh[x]);
1258 goto fail;
1259 }
1260 dataptrs++;
1261 }
1262
1263 for (x = 0; x < nptrs; x++) {
1264 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
1265 if (error) {
1266 for (; x < nptrs; x++)
1267 brelse(bh[x]);
1268 goto fail;
1269 }
1270 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1271 for (; x < nptrs; x++)
1272 brelse(bh[x]);
1273 error = -EIO;
1274 goto fail;
1275 }
1276
1277 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1278
1279 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header),
1280 data,
1281 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1282
1283 amount -= sdp->sd_jbsize;
1284 data += sdp->sd_jbsize;
1285
1286 brelse(bh[x]);
1287 }
1288
1289 out:
1290 kfree(bh);
1291
1292 return error;
1293
1294 fail:
1295 gfs2_trans_end(sdp);
1296 kfree(bh);
1297
1298 return error;
1299}
1300
1301int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1302 struct iattr *attr, char *data)
1303{
1304 struct buffer_head *dibh;
1305 int error;
1306
1307 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1308 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
1309 if (error)
1310 return error;
1311
1312 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1313 memcpy(GFS2_EA2DATA(el->el_ea),
1314 data,
1315 GFS2_EA_DATA_LEN(el->el_ea));
1316 } else
1317 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1318
1319 if (error)
1320 return error;
1321
1322 error = gfs2_meta_inode_buffer(ip, &dibh);
1323 if (!error) {
1324 error = inode_setattr(ip->i_vnode, attr);
1325 gfs2_assert_warn(ip->i_sbd, !error);
1326 gfs2_inode_attr_out(ip);
1327 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1328 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1329 brelse(dibh);
1330 }
1331
1332 gfs2_trans_end(ip->i_sbd);
1333
1334 return error;
1335}
1336
1337static int ea_dealloc_indirect(struct gfs2_inode *ip)
1338{
1339 struct gfs2_sbd *sdp = ip->i_sbd;
1340 struct gfs2_rgrp_list rlist;
1341 struct buffer_head *indbh, *dibh;
1342 uint64_t *eablk, *end;
1343 unsigned int rg_blocks = 0;
1344 uint64_t bstart = 0;
1345 unsigned int blen = 0;
1346 unsigned int blks = 0;
1347 unsigned int x;
1348 int error;
1349
1350 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1351
1352 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
1353 DIO_START | DIO_WAIT, &indbh);
1354 if (error)
1355 return error;
1356
1357 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1358 error = -EIO;
1359 goto out;
1360 }
1361
1362 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1363 end = eablk + sdp->sd_inptrs;
1364
1365 for (; eablk < end; eablk++) {
1366 uint64_t bn;
1367
1368 if (!*eablk)
1369 break;
1370 bn = be64_to_cpu(*eablk);
1371
1372 if (bstart + blen == bn)
1373 blen++;
1374 else {
1375 if (bstart)
1376 gfs2_rlist_add(sdp, &rlist, bstart);
1377 bstart = bn;
1378 blen = 1;
1379 }
1380 blks++;
1381 }
1382 if (bstart)
1383 gfs2_rlist_add(sdp, &rlist, bstart);
1384 else
1385 goto out;
1386
1387 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1388
1389 for (x = 0; x < rlist.rl_rgrps; x++) {
1390 struct gfs2_rgrpd *rgd;
1391 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1392 rg_blocks += rgd->rd_ri.ri_length;
1393 }
1394
1395 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1396 if (error)
1397 goto out_rlist_free;
1398
1399 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
1400 RES_INDIRECT + RES_STATFS +
1401 RES_QUOTA, blks);
1402 if (error)
1403 goto out_gunlock;
1404
1405 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1406
1407 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1408 bstart = 0;
1409 blen = 0;
1410
1411 for (; eablk < end; eablk++) {
1412 uint64_t bn;
1413
1414 if (!*eablk)
1415 break;
1416 bn = be64_to_cpu(*eablk);
1417
1418 if (bstart + blen == bn)
1419 blen++;
1420 else {
1421 if (bstart)
1422 gfs2_free_meta(ip, bstart, blen);
1423 bstart = bn;
1424 blen = 1;
1425 }
1426
1427 *eablk = 0;
1428 if (!ip->i_di.di_blocks)
1429 gfs2_consist_inode(ip);
1430 ip->i_di.di_blocks--;
1431 }
1432 if (bstart)
1433 gfs2_free_meta(ip, bstart, blen);
1434
1435 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1436
1437 error = gfs2_meta_inode_buffer(ip, &dibh);
1438 if (!error) {
1439 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1440 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1441 brelse(dibh);
1442 }
1443
1444 gfs2_trans_end(sdp);
1445
1446 out_gunlock:
1447 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1448
1449 out_rlist_free:
1450 gfs2_rlist_free(&rlist);
1451
1452 out:
1453 brelse(indbh);
1454
1455 return error;
1456}
1457
1458static int ea_dealloc_block(struct gfs2_inode *ip)
1459{
1460 struct gfs2_sbd *sdp = ip->i_sbd;
1461 struct gfs2_alloc *al = &ip->i_alloc;
1462 struct gfs2_rgrpd *rgd;
1463 struct buffer_head *dibh;
1464 int error;
1465
1466 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1467 if (!rgd) {
1468 gfs2_consist_inode(ip);
1469 return -EIO;
1470 }
1471
1472 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1473 &al->al_rgd_gh);
1474 if (error)
1475 return error;
1476
1477 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE +
1478 RES_STATFS + RES_QUOTA, 1);
1479 if (error)
1480 goto out_gunlock;
1481
1482 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1483
1484 ip->i_di.di_eattr = 0;
1485 if (!ip->i_di.di_blocks)
1486 gfs2_consist_inode(ip);
1487 ip->i_di.di_blocks--;
1488
1489 error = gfs2_meta_inode_buffer(ip, &dibh);
1490 if (!error) {
1491 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1492 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1493 brelse(dibh);
1494 }
1495
1496 gfs2_trans_end(sdp);
1497
1498 out_gunlock:
1499 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1500
1501 return error;
1502}
1503
1504/**
1505 * gfs2_ea_dealloc - deallocate the extended attribute fork
1506 * @ip: the inode
1507 *
1508 * Returns: errno
1509 */
1510
1511int gfs2_ea_dealloc(struct gfs2_inode *ip)
1512{
1513 struct gfs2_alloc *al;
1514 int error;
1515
1516 al = gfs2_alloc_get(ip);
1517
1518 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1519 if (error)
1520 goto out_alloc;
1521
1522 error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
1523 if (error)
1524 goto out_quota;
1525
1526 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1527 if (error)
1528 goto out_rindex;
1529
1530 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1531 error = ea_dealloc_indirect(ip);
1532 if (error)
1533 goto out_rindex;
1534 }
1535
1536 error = ea_dealloc_block(ip);
1537
1538 out_rindex:
1539 gfs2_glock_dq_uninit(&al->al_ri_gh);
1540
1541 out_quota:
1542 gfs2_quota_unhold(ip);
1543
1544 out_alloc:
1545 gfs2_alloc_put(ip);
1546
1547 return error;
1548}
1549
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ae199692e51d
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,97 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
13#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
14#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
15
16#define GFS2_EA_SIZE(ea) \
17ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
18 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
19 (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8)
20
21#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
22#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
23
24#define GFS2_EAREQ_SIZE_STUFFED(er) \
25ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
26
27#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
29 sizeof(uint64_t) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
30
31#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
32#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
33
34#define GFS2_EA2DATAPTRS(ea) \
35((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
36
37#define GFS2_EA2NEXT(ea) \
38((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
39
40#define GFS2_EA_BH2FIRST(bh) \
41((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
42
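Taken together, the macros above walk one on-disk EA record. A worked layout,
assuming sizeof(struct gfs2_ea_header) == 16 (its size in gfs2_ondisk.h):

    /* stuffed EA "user.foo" = "bar" (ea_name_len 3, ea_data_len 3):
     *   GFS2_EA2NAME(ea) = (char *)ea + 16        -> "foo"
     *   GFS2_EA2DATA(ea) = GFS2_EA2NAME(ea) + 3   -> "bar"
     *   GFS2_EA_SIZE(ea) = ALIGN(16 + 3 + 3, 8) = 24
     * unstuffed, the block pointers instead start at
     *   GFS2_EA2DATAPTRS(ea) = GFS2_EA2NAME(ea) + ALIGN(3, 8)
     * ea_rec_len may exceed GFS2_EA_SIZE(); ea_split_ea() in eattr.c
     * reclaims that slack as a new record.
     */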
43#define GFS2_ERF_MODE 0x80000000
44
45struct gfs2_ea_request {
46 char *er_name;
47 char *er_data;
48 unsigned int er_name_len;
49 unsigned int er_data_len;
50 unsigned int er_type; /* GFS2_EATYPE_... */
51 int er_flags;
52 mode_t er_mode;
53};
54
55struct gfs2_ea_location {
56 struct buffer_head *el_bh;
57 struct gfs2_ea_header *el_ea;
58 struct gfs2_ea_header *el_prev;
59};
60
61int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
62int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
63int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
64
65int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
66int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
68int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69
70int gfs2_ea_dealloc(struct gfs2_inode *ip);
71
72/* Exported to acl.c */
73
74int gfs2_ea_find(struct gfs2_inode *ip,
75 struct gfs2_ea_request *er,
76 struct gfs2_ea_location *el);
77int gfs2_ea_get_copy(struct gfs2_inode *ip,
78 struct gfs2_ea_location *el,
79 char *data);
80int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
81 struct iattr *attr, char *data);
82
83static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
84{
85 switch (ea->ea_type) {
86 case GFS2_EATYPE_USR:
87 return (5 + (ea->ea_name_len + 1)); /* "user." + name + '\0' */
88 case GFS2_EATYPE_SYS:
89 return (7 + (ea->ea_name_len + 1)); /* "system." + name + '\0' */
90 case GFS2_EATYPE_SECURITY:
91 return (9 + (ea->ea_name_len + 1)); /* "security." + name + '\0' */
92 default:
93 return (0);
94 }
95}
96
97#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/format.h b/fs/gfs2/format.h
new file mode 100644
index 000000000000..239f0c3553fc
--- /dev/null
+++ b/fs/gfs2/format.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __FORMAT_DOT_H__
11#define __FORMAT_DOT_H__
12
13static const uint32_t gfs2_old_fs_formats[] = {
14 0
15};
16
17static const uint32_t gfs2_old_multihost_formats[] = {
18 0
19};
20
21#endif /* __FORMAT_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..6edbd551a4c0
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..0603a6de52c9
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2340 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kref.h>
19#include <linux/kallsyms.h>
20#include <linux/gfs2_ondisk.h>
21#include <asm/uaccess.h>
22
23#include "gfs2.h"
24#include "lm_interface.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "lm.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "quota.h"
33#include "super.h"
34#include "util.h"
35
36/* Must be kept in sync with the beginning of struct gfs2_glock */
37struct glock_plug {
38 struct list_head gl_list;
39 unsigned long gl_flags;
40};
41
42struct greedy {
43 struct gfs2_holder gr_gh;
44 struct work_struct gr_work;
45};
46
47typedef void (*glock_examiner) (struct gfs2_glock * gl);
48
49static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
50static int dump_glock(struct gfs2_glock *gl);
51
52/**
53 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
54 * @actual: the current state of the lock
55 * @requested: the lock state that was requested by the caller
56 * @flags: the modifier flags passed in by the caller
57 *
58 * Returns: 1 if the locks are compatible, 0 otherwise
59 */
60
61static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
62 int flags)
63{
64 if (actual == requested)
65 return 1;
66
67 if (flags & GL_EXACT)
68 return 0;
69
70 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
71 return 1;
72
73 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
74 return 1;
75
76 return 0;
77}
78
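A few concrete cases of the rules above (GL_EXACT clear unless noted):

    /* actual EXCLUSIVE, requested SHARED              -> ok (EX covers SH)
     * actual SHARED,    requested EXCLUSIVE           -> not ok
     * actual DEFERRED,  requested SHARED, LM_FLAG_ANY -> ok (any held state)
     * actual EXCLUSIVE, requested SHARED, GL_EXACT    -> not ok
     */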
79/**
80 * gl_hash() - Turn glock number into hash bucket number
81 * @name: the lock name to hash
82 *
83 * Returns: The number of the corresponding hash bucket
84 */
85
86static unsigned int gl_hash(struct lm_lockname *name)
87{
88 unsigned int h;
89
90 h = jhash(&name->ln_number, sizeof(uint64_t), 0);
91 h = jhash(&name->ln_type, sizeof(unsigned int), h);
92 h &= GFS2_GL_HASH_MASK;
93
94 return h;
95}
96
97/**
98 * glock_free() - Perform a few checks and then release struct gfs2_glock
99 * @gl: The glock to release
100 *
101 * Also calls lock module to release its internal structure for this glock.
102 *
103 */
104
105static void glock_free(struct gfs2_glock *gl)
106{
107 struct gfs2_sbd *sdp = gl->gl_sbd;
108 struct inode *aspace = gl->gl_aspace;
109
110 gfs2_lm_put_lock(sdp, gl->gl_lock);
111
112 if (aspace)
113 gfs2_aspace_put(aspace);
114
115 kmem_cache_free(gfs2_glock_cachep, gl);
116}
117
118/**
119 * gfs2_glock_hold() - increment reference count on glock
120 * @gl: The glock to hold
121 *
122 */
123
124void gfs2_glock_hold(struct gfs2_glock *gl)
125{
126 kref_get(&gl->gl_ref);
127}
128
129/* All work is done after the return from kref_put() so we
130 can release the write_lock before the free. */
131
132static void kill_glock(struct kref *kref)
133{
134 struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref);
135 struct gfs2_sbd *sdp = gl->gl_sbd;
136
137 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
138 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
139 gfs2_assert(sdp, list_empty(&gl->gl_holders));
140 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
141 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
142 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
143}
144
145/**
146 * gfs2_glock_put() - Decrement reference count on glock
147 * @gl: The glock to put
148 *
149 */
150
151int gfs2_glock_put(struct gfs2_glock *gl)
152{
153 struct gfs2_sbd *sdp = gl->gl_sbd;
154 struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket;
155 int rv = 0;
156
157 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
158
159 write_lock(&bucket->hb_lock);
160 if (kref_put(&gl->gl_ref, kill_glock)) {
161 list_del_init(&gl->gl_list);
162 write_unlock(&bucket->hb_lock);
163 BUG_ON(spin_is_locked(&gl->gl_spin));
164 glock_free(gl);
165 rv = 1;
166 goto out;
167 }
168 write_unlock(&bucket->hb_lock);
169 out:
170 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
171 return rv;
172}
173
174/**
175 * queue_empty - check to see if a glock's queue is empty
176 * @gl: the glock
177 * @head: the head of the queue to check
178 *
179 * This function protects the list in the event that a process already
180 * has a holder on the list and is adding a second holder for itself.
181 * The glmutex lock is what generally prevents processes from working
182 * on the same glock at once, but the special case of adding a second
183 * holder for yourself ("recursive" locking) doesn't involve locking
184 * glmutex, making the spin lock necessary.
185 *
186 * Returns: 1 if the queue is empty
187 */
188
189static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
190{
191 int empty;
192 spin_lock(&gl->gl_spin);
193 empty = list_empty(head);
194 spin_unlock(&gl->gl_spin);
195 return empty;
196}
197
198/**
199 * search_bucket() - Find struct gfs2_glock by lock number
200 * @bucket: the bucket to search
201 * @name: The lock name
202 *
203 * Returns: NULL, or the struct gfs2_glock with the requested number
204 */
205
206static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket,
207 struct lm_lockname *name)
208{
209 struct gfs2_glock *gl;
210
211 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
212 if (test_bit(GLF_PLUG, &gl->gl_flags))
213 continue;
214 if (!lm_name_equal(&gl->gl_name, name))
215 continue;
216
217 kref_get(&gl->gl_ref);
218
219 return gl;
220 }
221
222 return NULL;
223}
224
225/**
226 * gfs2_glock_find() - Find glock by lock number
227 * @sdp: The GFS2 superblock
228 * @name: The lock name
229 *
230 * Returns: NULL, or the struct gfs2_glock with the requested number
231 */
232
233static struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
234 struct lm_lockname *name)
235{
236 struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
237 struct gfs2_glock *gl;
238
239 read_lock(&bucket->hb_lock);
240 gl = search_bucket(bucket, name);
241 read_unlock(&bucket->hb_lock);
242
243 return gl;
244}
245
246/**
247 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
248 * @sdp: The GFS2 superblock
249 * @number: the lock number
250 * @glops: The glock_operations to use
251 * @create: If 0, don't create the glock if it doesn't exist
252 * @glp: the glock is returned here
253 *
254 * This does not lock a glock, just finds/creates structures for one.
255 *
256 * Returns: errno
257 */
258
259int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number,
260 struct gfs2_glock_operations *glops, int create,
261 struct gfs2_glock **glp)
262{
263 struct lm_lockname name;
264 struct gfs2_glock *gl, *tmp;
265 struct gfs2_gl_hash_bucket *bucket;
266 int error;
267
268 name.ln_number = number;
269 name.ln_type = glops->go_type;
270 bucket = &sdp->sd_gl_hash[gl_hash(&name)];
271
272 read_lock(&bucket->hb_lock);
273 gl = search_bucket(bucket, &name);
274 read_unlock(&bucket->hb_lock);
275
276 if (gl || !create) {
277 *glp = gl;
278 return 0;
279 }
280
281 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
282 if (!gl)
283 return -ENOMEM;
284
285 memset(gl, 0, sizeof(struct gfs2_glock));
286
287 INIT_LIST_HEAD(&gl->gl_list);
288 gl->gl_name = name;
289 kref_init(&gl->gl_ref);
290
291 spin_lock_init(&gl->gl_spin);
292
293 gl->gl_state = LM_ST_UNLOCKED;
294 gl->gl_owner = NULL;
295 gl->gl_ip = 0;
296 INIT_LIST_HEAD(&gl->gl_holders);
297 INIT_LIST_HEAD(&gl->gl_waiters1);
298 INIT_LIST_HEAD(&gl->gl_waiters2);
299 INIT_LIST_HEAD(&gl->gl_waiters3);
300
301 gl->gl_ops = glops;
302
303 gl->gl_bucket = bucket;
304 INIT_LIST_HEAD(&gl->gl_reclaim);
305
306 gl->gl_sbd = sdp;
307
308 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
309 INIT_LIST_HEAD(&gl->gl_ail_list);
310
311 /* If this glock protects actual on-disk data or metadata blocks,
312 create a VFS inode to manage the pages/buffers holding them. */
313 if (glops == &gfs2_inode_glops ||
314 glops == &gfs2_rgrp_glops ||
315 glops == &gfs2_meta_glops) {
316 gl->gl_aspace = gfs2_aspace_get(sdp);
317 if (!gl->gl_aspace) {
318 error = -ENOMEM;
319 goto fail;
320 }
321 }
322
323 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
324 if (error)
325 goto fail_aspace;
326
327 write_lock(&bucket->hb_lock);
328 tmp = search_bucket(bucket, &name);
329 if (tmp) {
330 write_unlock(&bucket->hb_lock);
331 glock_free(gl);
332 gl = tmp;
333 } else {
334 list_add_tail(&gl->gl_list, &bucket->hb_list);
335 write_unlock(&bucket->hb_lock);
336 }
337
338 *glp = gl;
339
340 return 0;
341
342 fail_aspace:
343 if (gl->gl_aspace)
344 gfs2_aspace_put(gl->gl_aspace);
345
346 fail:
347 kmem_cache_free(gfs2_glock_cachep, gl);
348
349 return error;
350}
351
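gfs2_glock_get() uses the usual optimistic-creation idiom: search under the
read lock, allocate with no lock held, then re-search under the write lock
and free our copy if another CPU won the race. Reduced to a skeleton:

    /* read_lock;  gl = search();  read_unlock;
     * if (gl || !create) return gl;
     * new = allocate and initialize;        (may sleep; no lock held)
     * write_lock;
     * gl = search();                        (re-check for a racing insert)
     * if (gl) free(new); else insert(new);
     * write_unlock;
     */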
352/**
353 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
354 * @gl: the glock
355 * @state: the state we're requesting
356 * @flags: the modifier flags
357 * @gh: the holder structure
358 *
359 */
360
361void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
362 struct gfs2_holder *gh)
363{
364 INIT_LIST_HEAD(&gh->gh_list);
365 gh->gh_gl = gl;
366 gh->gh_ip = (unsigned long)__builtin_return_address(0);
367 gh->gh_owner = current;
368 gh->gh_state = state;
369 gh->gh_flags = flags;
370 gh->gh_error = 0;
371 gh->gh_iflags = 0;
372 init_completion(&gh->gh_wait);
373
374 if (gh->gh_state == LM_ST_EXCLUSIVE)
375 gh->gh_flags |= GL_LOCAL_EXCL;
376
377 gfs2_glock_hold(gl);
378}
379
380/**
381 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
382 * @state: the state we're requesting
383 * @flags: the modifier flags
384 * @gh: the holder structure
385 *
386 * Don't mess with the glock.
387 *
388 */
389
390void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
391{
392 gh->gh_state = state;
393 gh->gh_flags = flags;
394 if (gh->gh_state == LM_ST_EXCLUSIVE)
395 gh->gh_flags |= GL_LOCAL_EXCL;
396
397 gh->gh_iflags &= 1 << HIF_ALLOCED;
398 gh->gh_ip = (unsigned long)__builtin_return_address(0);
399}
400
401/**
402 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
403 * @gh: the holder structure
404 *
405 */
406
407void gfs2_holder_uninit(struct gfs2_holder *gh)
408{
409 gfs2_glock_put(gh->gh_gl);
410 gh->gh_gl = NULL;
411 gh->gh_ip = 0;
412}
413
414/**
415 * gfs2_holder_get - get a struct gfs2_holder structure
416 * @gl: the glock
417 * @state: the state we're requesting
418 * @flags: the modifier flags
419 * @gfp_flags: allocation flags for the holder (e.g. GFP_KERNEL | __GFP_NOFAIL)
420 *
421 * Figure out how big an impact this function has. Either:
422 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
423 * 2) Leave it like it is
424 *
425 * Returns: the holder structure, NULL on ENOMEM
426 */
427
428static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
429 unsigned int state,
430 int flags, gfp_t gfp_flags)
431{
432 struct gfs2_holder *gh;
433
434 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
435 if (!gh)
436 return NULL;
437
438 gfs2_holder_init(gl, state, flags, gh);
439 set_bit(HIF_ALLOCED, &gh->gh_iflags);
440 gh->gh_ip = (unsigned long)__builtin_return_address(0);
441 return gh;
442}
443
444/**
445 * gfs2_holder_put - get rid of a struct gfs2_holder structure
446 * @gh: the holder structure
447 *
448 */
449
450static void gfs2_holder_put(struct gfs2_holder *gh)
451{
452 gfs2_holder_uninit(gh);
453 kfree(gh);
454}
455
456/**
457 * rq_mutex - process a mutex request in the queue
458 * @gh: the glock holder
459 *
460 * Returns: 1 if the queue is blocked
461 */
462
463static int rq_mutex(struct gfs2_holder *gh)
464{
465 struct gfs2_glock *gl = gh->gh_gl;
466
467 list_del_init(&gh->gh_list);
468 /* gh->gh_error never examined. */
469 set_bit(GLF_LOCK, &gl->gl_flags);
470 complete(&gh->gh_wait);
471
472 return 1;
473}
474
475/**
476 * rq_promote - process a promote request in the queue
477 * @gh: the glock holder
478 *
479 * Acquire a new inter-node lock, or promote an existing one to a more restrictive state.
480 *
481 * Returns: 1 if the queue is blocked
482 */
483
484static int rq_promote(struct gfs2_holder *gh)
485{
486 struct gfs2_glock *gl = gh->gh_gl;
487 struct gfs2_sbd *sdp = gl->gl_sbd;
488 struct gfs2_glock_operations *glops = gl->gl_ops;
489
490 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
491 if (list_empty(&gl->gl_holders)) {
492 gl->gl_req_gh = gh;
493 set_bit(GLF_LOCK, &gl->gl_flags);
494 spin_unlock(&gl->gl_spin);
495
496 if (atomic_read(&sdp->sd_reclaim_count) >
497 gfs2_tune_get(sdp, gt_reclaim_limit) &&
498 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
499 gfs2_reclaim_glock(sdp);
500 gfs2_reclaim_glock(sdp);
501 }
502
503 glops->go_xmote_th(gl, gh->gh_state,
504 gh->gh_flags);
505
506 spin_lock(&gl->gl_spin);
507 }
508 return 1;
509 }
510
511 if (list_empty(&gl->gl_holders)) {
512 set_bit(HIF_FIRST, &gh->gh_iflags);
513 set_bit(GLF_LOCK, &gl->gl_flags);
514 } else {
515 struct gfs2_holder *next_gh;
516 if (gh->gh_flags & GL_LOCAL_EXCL)
517 return 1;
518 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
519 gh_list);
520 if (next_gh->gh_flags & GL_LOCAL_EXCL)
521 return 1;
522 }
523
524 list_move_tail(&gh->gh_list, &gl->gl_holders);
525 gh->gh_error = 0;
526 set_bit(HIF_HOLDER, &gh->gh_iflags);
527
528 complete(&gh->gh_wait);
529
530 return 0;
531}
532
533/**
534 * rq_demote - process a demote request in the queue
535 * @gh: the glock holder
536 *
537 * Returns: 1 if the queue is blocked
538 */
539
540static int rq_demote(struct gfs2_holder *gh)
541{
542 struct gfs2_glock *gl = gh->gh_gl;
543 struct gfs2_glock_operations *glops = gl->gl_ops;
544
545 if (!list_empty(&gl->gl_holders))
546 return 1;
547
548 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
549 list_del_init(&gh->gh_list);
550 gh->gh_error = 0;
551 spin_unlock(&gl->gl_spin);
552 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
553 gfs2_holder_put(gh);
554 else
555 complete(&gh->gh_wait);
556 spin_lock(&gl->gl_spin);
557 } else {
558 gl->gl_req_gh = gh;
559 set_bit(GLF_LOCK, &gl->gl_flags);
560 spin_unlock(&gl->gl_spin);
561
562 if (gh->gh_state == LM_ST_UNLOCKED ||
563 gl->gl_state != LM_ST_EXCLUSIVE)
564 glops->go_drop_th(gl);
565 else
566 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
567
568 spin_lock(&gl->gl_spin);
569 }
570
571 return 0;
572}
573
574/**
575 * rq_greedy - process a queued request to drop greedy status
576 * @gh: the glock holder
577 *
578 * Returns: 1 if the queue is blocked
579 */
580
581static int rq_greedy(struct gfs2_holder *gh)
582{
583 struct gfs2_glock *gl = gh->gh_gl;
584
585 list_del_init(&gh->gh_list);
586 /* gh->gh_error never examined. */
587 clear_bit(GLF_GREEDY, &gl->gl_flags);
588 spin_unlock(&gl->gl_spin);
589
590 gfs2_holder_uninit(gh);
591 kfree(container_of(gh, struct greedy, gr_gh));
592
593 spin_lock(&gl->gl_spin);
594
595 return 0;
596}
597
598/**
599 * run_queue - process holder structures on a glock
600 * @gl: the glock
601 *
602 */
603static void run_queue(struct gfs2_glock *gl)
604{
605 struct gfs2_holder *gh;
606 int blocked = 1;
607
608 for (;;) {
609 if (test_bit(GLF_LOCK, &gl->gl_flags))
610 break;
611
612 if (!list_empty(&gl->gl_waiters1)) {
613 gh = list_entry(gl->gl_waiters1.next,
614 struct gfs2_holder, gh_list);
615
616 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
617 blocked = rq_mutex(gh);
618 else
619 gfs2_assert_warn(gl->gl_sbd, 0);
620
621 } else if (!list_empty(&gl->gl_waiters2) &&
622 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
623 gh = list_entry(gl->gl_waiters2.next,
624 struct gfs2_holder, gh_list);
625
626 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
627 blocked = rq_demote(gh);
628 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
629 blocked = rq_greedy(gh);
630 else
631 gfs2_assert_warn(gl->gl_sbd, 0);
632
633 } else if (!list_empty(&gl->gl_waiters3)) {
634 gh = list_entry(gl->gl_waiters3.next,
635 struct gfs2_holder, gh_list);
636
637 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
638 blocked = rq_promote(gh);
639 else
640 gfs2_assert_warn(gl->gl_sbd, 0);
641
642 } else
643 break;
644
645 if (blocked)
646 break;
647 }
648}
649
650/**
651 * gfs2_glmutex_lock - acquire a local lock on a glock
652 * @gl: the glock
653 *
654 * Gives caller exclusive access to manipulate a glock structure.
655 */
656
657void gfs2_glmutex_lock(struct gfs2_glock *gl)
658{
659 struct gfs2_holder gh;
660
661 gfs2_holder_init(gl, 0, 0, &gh);
662 set_bit(HIF_MUTEX, &gh.gh_iflags);
663
664 spin_lock(&gl->gl_spin);
665 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
666 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
667 else {
668 gl->gl_owner = current;
669 gl->gl_ip = (unsigned long)__builtin_return_address(0);
670 complete(&gh.gh_wait);
671 }
672 spin_unlock(&gl->gl_spin);
673
674 wait_for_completion(&gh.gh_wait);
675 gfs2_holder_uninit(&gh);
676}
677
678/**
679 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
680 * @gl: the glock
681 *
682 * Returns: 1 if the glock is acquired
683 */
684
685static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
686{
687 int acquired = 1;
688
689 spin_lock(&gl->gl_spin);
690 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
691 acquired = 0;
692 else {
693 gl->gl_owner = current;
694 gl->gl_ip = (unsigned long)__builtin_return_address(0);
695 }
696 spin_unlock(&gl->gl_spin);
697
698 return acquired;
699}
700
701/**
702 * gfs2_glmutex_unlock - release a local lock on a glock
703 * @gl: the glock
704 *
705 */
706
707void gfs2_glmutex_unlock(struct gfs2_glock *gl)
708{
709 spin_lock(&gl->gl_spin);
710 clear_bit(GLF_LOCK, &gl->gl_flags);
711 gl->gl_owner = NULL;
712 gl->gl_ip = 0;
713 run_queue(gl);
714 BUG_ON(!spin_is_locked(&gl->gl_spin));
715 spin_unlock(&gl->gl_spin);
716}
717
718/**
719 * handle_callback - add a demote request to a lock's queue
720 * @gl: the glock
721 * @state: the state the caller wants us to change to
722 *
723 */
724
725static void handle_callback(struct gfs2_glock *gl, unsigned int state)
726{
727 struct gfs2_holder *gh, *new_gh = NULL;
728
729 restart:
730 spin_lock(&gl->gl_spin);
731
732 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
733 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
734 gl->gl_req_gh != gh) {
735 if (gh->gh_state != state)
736 gh->gh_state = LM_ST_UNLOCKED;
737 goto out;
738 }
739 }
740
741 if (new_gh) {
742 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
743 new_gh = NULL;
744 } else {
745 spin_unlock(&gl->gl_spin);
746
747 new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY,
748 GFP_KERNEL | __GFP_NOFAIL);
749 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
750 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
751
752 goto restart;
753 }
754
755 out:
756 spin_unlock(&gl->gl_spin);
757
758 if (new_gh)
759 gfs2_holder_put(new_gh);
760}
761
762/**
763 * state_change - record that the glock is now in a different state
764 * @gl: the glock
765 * @new_state: the new state
766 *
767 */
768
769static void state_change(struct gfs2_glock *gl, unsigned int new_state)
770{
771 int held1, held2;
772
773 held1 = (gl->gl_state != LM_ST_UNLOCKED);
774 held2 = (new_state != LM_ST_UNLOCKED);
775
776 if (held1 != held2) {
777 if (held2)
778 gfs2_glock_hold(gl);
779 else
780 gfs2_glock_put(gl);
781 }
782
783 gl->gl_state = new_state;
784}
785
786/**
787 * xmote_bh - Called after the lock module is done acquiring a lock
788 * @gl: The glock in question
789 * @ret: the int returned from the lock module
790 *
791 */
792
793static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
794{
795 struct gfs2_sbd *sdp = gl->gl_sbd;
796 struct gfs2_glock_operations *glops = gl->gl_ops;
797 struct gfs2_holder *gh = gl->gl_req_gh;
798 int prev_state = gl->gl_state;
799 int op_done = 1;
800
801 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
802 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
803 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
804
805 state_change(gl, ret & LM_OUT_ST_MASK);
806
807 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
808 if (glops->go_inval)
809 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
810 } else if (gl->gl_state == LM_ST_DEFERRED) {
811 /* We might not want to do this here.
812 Look at moving to the inode glops. */
813 if (glops->go_inval)
814 glops->go_inval(gl, DIO_DATA);
815 }
816
817 /* Deal with each possible exit condition */
818
819 if (!gh)
820 gl->gl_stamp = jiffies;
821
822 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
823 spin_lock(&gl->gl_spin);
824 list_del_init(&gh->gh_list);
825 gh->gh_error = -EIO;
826 spin_unlock(&gl->gl_spin);
827
828 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
829 spin_lock(&gl->gl_spin);
830 list_del_init(&gh->gh_list);
831 if (gl->gl_state == gh->gh_state ||
832 gl->gl_state == LM_ST_UNLOCKED)
833 gh->gh_error = 0;
834 else {
835 if (gfs2_assert_warn(sdp, gh->gh_flags &
836 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
837 fs_warn(sdp, "ret = 0x%.8X\n", ret);
838 gh->gh_error = GLR_TRYFAILED;
839 }
840 spin_unlock(&gl->gl_spin);
841
842 if (ret & LM_OUT_CANCELED)
843 handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
844
845 } else if (ret & LM_OUT_CANCELED) {
846 spin_lock(&gl->gl_spin);
847 list_del_init(&gh->gh_list);
848 gh->gh_error = GLR_CANCELED;
849 spin_unlock(&gl->gl_spin);
850
851 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
852 spin_lock(&gl->gl_spin);
853 list_move_tail(&gh->gh_list, &gl->gl_holders);
854 gh->gh_error = 0;
855 set_bit(HIF_HOLDER, &gh->gh_iflags);
856 spin_unlock(&gl->gl_spin);
857
858 set_bit(HIF_FIRST, &gh->gh_iflags);
859
860 op_done = 0;
861
862 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
863 spin_lock(&gl->gl_spin);
864 list_del_init(&gh->gh_list);
865 gh->gh_error = GLR_TRYFAILED;
866 spin_unlock(&gl->gl_spin);
867
868 } else {
869 if (gfs2_assert_withdraw(sdp, 0) == -1)
870 fs_err(sdp, "ret = 0x%.8X\n", ret);
871 }
872
873 if (glops->go_xmote_bh)
874 glops->go_xmote_bh(gl);
875
876 if (op_done) {
877 spin_lock(&gl->gl_spin);
878 gl->gl_req_gh = NULL;
879 gl->gl_req_bh = NULL;
880 clear_bit(GLF_LOCK, &gl->gl_flags);
881 run_queue(gl);
882 spin_unlock(&gl->gl_spin);
883 }
884
885 gfs2_glock_put(gl);
886
887 if (gh) {
888 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
889 gfs2_holder_put(gh);
890 else
891 complete(&gh->gh_wait);
892 }
893}
894
895/**
896 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
897 * @gl: The glock in question
898 * @state: the requested state
899 * @flags: modifier flags to the lock call
900 *
901 */
902
903void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
904{
905 struct gfs2_sbd *sdp = gl->gl_sbd;
906 struct gfs2_glock_operations *glops = gl->gl_ops;
907 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
908 LM_FLAG_NOEXP | LM_FLAG_ANY |
909 LM_FLAG_PRIORITY);
910 unsigned int lck_ret;
911
912 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
913 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
914 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
915 gfs2_assert_warn(sdp, state != gl->gl_state);
916
917 if (gl->gl_state == LM_ST_EXCLUSIVE) {
918 if (glops->go_sync)
919 glops->go_sync(gl,
920 DIO_METADATA | DIO_DATA | DIO_RELEASE);
921 }
922
923 gfs2_glock_hold(gl);
924 gl->gl_req_bh = xmote_bh;
925
926 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state,
927 lck_flags);
928
929 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
930 return;
931
932 if (lck_ret & LM_OUT_ASYNC)
933 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
934 else
935 xmote_bh(gl, lck_ret);
936}
937
938/**
939 * drop_bh - Called after a lock module unlock completes
940 * @gl: the glock
941 * @ret: the return status
942 *
943 * Wakes up the process waiting on the struct gfs2_holder (if any) and
944 * drops the reference on the glock that the top half took out
945 *
946 */
947
948static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
949{
950 struct gfs2_sbd *sdp = gl->gl_sbd;
951 struct gfs2_glock_operations *glops = gl->gl_ops;
952 struct gfs2_holder *gh = gl->gl_req_gh;
953
954 clear_bit(GLF_PREFETCH, &gl->gl_flags);
955
956 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
957 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
958 gfs2_assert_warn(sdp, !ret);
959
960 state_change(gl, LM_ST_UNLOCKED);
961
962 if (glops->go_inval)
963 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
964
965 if (gh) {
966 spin_lock(&gl->gl_spin);
967 list_del_init(&gh->gh_list);
968 gh->gh_error = 0;
969 spin_unlock(&gl->gl_spin);
970 }
971
972 if (glops->go_drop_bh)
973 glops->go_drop_bh(gl);
974
975 spin_lock(&gl->gl_spin);
976 gl->gl_req_gh = NULL;
977 gl->gl_req_bh = NULL;
978 clear_bit(GLF_LOCK, &gl->gl_flags);
979 run_queue(gl);
980 spin_unlock(&gl->gl_spin);
981
982 gfs2_glock_put(gl);
983
984 if (gh) {
985 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
986 gfs2_holder_put(gh);
987 else
988 complete(&gh->gh_wait);
989 }
990}
991
992/**
993 * gfs2_glock_drop_th - call into the lock module to unlock a lock
994 * @gl: the glock
995 *
996 */
997
998void gfs2_glock_drop_th(struct gfs2_glock *gl)
999{
1000 struct gfs2_sbd *sdp = gl->gl_sbd;
1001 struct gfs2_glock_operations *glops = gl->gl_ops;
1002 unsigned int ret;
1003
1004 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1005 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1006 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1007
1008 if (gl->gl_state == LM_ST_EXCLUSIVE) {
1009 if (glops->go_sync)
1010 glops->go_sync(gl,
1011 DIO_METADATA | DIO_DATA | DIO_RELEASE);
1012 }
1013
1014 gfs2_glock_hold(gl);
1015 gl->gl_req_bh = drop_bh;
1016
1017 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1018
1019 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1020 return;
1021
1022 if (!ret)
1023 drop_bh(gl, ret);
1024 else
1025 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1026}
1027
1028/**
1029 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1030 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1031 *
1032 * Don't cancel GL_NOCANCEL requests.
1033 */
1034
1035static void do_cancels(struct gfs2_holder *gh)
1036{
1037 struct gfs2_glock *gl = gh->gh_gl;
1038
1039 spin_lock(&gl->gl_spin);
1040
1041 while (gl->gl_req_gh != gh &&
1042 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1043 !list_empty(&gh->gh_list)) {
1044 if (gl->gl_req_bh &&
1045 !(gl->gl_req_gh &&
1046 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1047 spin_unlock(&gl->gl_spin);
1048 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1049 msleep(100);
1050 spin_lock(&gl->gl_spin);
1051 } else {
1052 spin_unlock(&gl->gl_spin);
1053 msleep(100);
1054 spin_lock(&gl->gl_spin);
1055 }
1056 }
1057
1058 spin_unlock(&gl->gl_spin);
1059}
1060
1061/**
1062 * glock_wait_internal - wait on a glock acquisition
1063 * @gh: the glock holder
1064 *
1065 * Returns: 0 on success
1066 */
1067
1068static int glock_wait_internal(struct gfs2_holder *gh)
1069{
1070 struct gfs2_glock *gl = gh->gh_gl;
1071 struct gfs2_sbd *sdp = gl->gl_sbd;
1072 struct gfs2_glock_operations *glops = gl->gl_ops;
1073
1074 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1075 return -EIO;
1076
1077 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1078 spin_lock(&gl->gl_spin);
1079 if (gl->gl_req_gh != gh &&
1080 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1081 !list_empty(&gh->gh_list)) {
1082 list_del_init(&gh->gh_list);
1083 gh->gh_error = GLR_TRYFAILED;
1084 run_queue(gl);
1085 spin_unlock(&gl->gl_spin);
1086 return gh->gh_error;
1087 }
1088 spin_unlock(&gl->gl_spin);
1089 }
1090
1091 if (gh->gh_flags & LM_FLAG_PRIORITY)
1092 do_cancels(gh);
1093
1094 wait_for_completion(&gh->gh_wait);
1095
1096 if (gh->gh_error)
1097 return gh->gh_error;
1098
1099 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1100 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state,
1101 gh->gh_state,
1102 gh->gh_flags));
1103
1104 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1105 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1106
1107 if (glops->go_lock) {
1108 gh->gh_error = glops->go_lock(gh);
1109 if (gh->gh_error) {
1110 spin_lock(&gl->gl_spin);
1111 list_del_init(&gh->gh_list);
1112 spin_unlock(&gl->gl_spin);
1113 }
1114 }
1115
1116 spin_lock(&gl->gl_spin);
1117 gl->gl_req_gh = NULL;
1118 gl->gl_req_bh = NULL;
1119 clear_bit(GLF_LOCK, &gl->gl_flags);
1120 run_queue(gl);
1121 spin_unlock(&gl->gl_spin);
1122 }
1123
1124 return gh->gh_error;
1125}
1126
1127static inline struct gfs2_holder *
1128find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1129{
1130 struct gfs2_holder *gh;
1131
1132 list_for_each_entry(gh, head, gh_list) {
1133 if (gh->gh_owner == owner)
1134 return gh;
1135 }
1136
1137 return NULL;
1138}
1139
1140/**
1141 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1142 * @gh: the holder structure to add
1143 *
1144 */
1145
1146static void add_to_queue(struct gfs2_holder *gh)
1147{
1148 struct gfs2_glock *gl = gh->gh_gl;
1149 struct gfs2_holder *existing;
1150
1151 BUG_ON(!gh->gh_owner);
1152
1153 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1154 if (existing) {
1155 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1156 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1157 BUG();
1158 }
1159
1160 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1161 if (existing) {
1162 print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
1163 print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
1164 BUG();
1165 }
1166
1167 if (gh->gh_flags & LM_FLAG_PRIORITY)
1168 list_add(&gh->gh_list, &gl->gl_waiters3);
1169 else
1170 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1171}
1172
1173/**
1174 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1175 * @gh: the holder structure
1176 *
1177 * if (gh->gh_flags & GL_ASYNC), this never returns an error
1178 *
1179 * Returns: 0, GLR_TRYFAILED, or errno on failure
1180 */
1181
1182int gfs2_glock_nq(struct gfs2_holder *gh)
1183{
1184 struct gfs2_glock *gl = gh->gh_gl;
1185 struct gfs2_sbd *sdp = gl->gl_sbd;
1186 int error = 0;
1187
1188restart:
1189 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1190 set_bit(HIF_ABORTED, &gh->gh_iflags);
1191 return -EIO;
1192 }
1193
1194 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1195
1196 spin_lock(&gl->gl_spin);
1197 add_to_queue(gh);
1198 run_queue(gl);
1199 spin_unlock(&gl->gl_spin);
1200
1201 if (!(gh->gh_flags & GL_ASYNC)) {
1202 error = glock_wait_internal(gh);
1203 if (error == GLR_CANCELED) {
1204 msleep(100);
1205 goto restart;
1206 }
1207 }
1208
1209 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1210
1211 if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
1212 dump_glock(gl);
1213
1214 return error;
1215}
1216
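In practice gfs2_glock_nq() is mostly reached through gfs2_glock_nq_init(),
the holder-init-plus-enqueue wrapper seen throughout eattr.c above. The
canonical synchronous pairing:

    struct gfs2_holder gh;
    int error;

    error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
    if (error)
        return error;
    /* ... access the inode under the shared glock ... */
    gfs2_glock_dq_uninit(&gh);  /* dequeue and drop the holder's reference */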
1217/**
1218 * gfs2_glock_poll - poll to see if an async request has been completed
1219 * @gh: the holder
1220 *
1221 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1222 */
1223
1224int gfs2_glock_poll(struct gfs2_holder *gh)
1225{
1226 struct gfs2_glock *gl = gh->gh_gl;
1227 int ready = 0;
1228
1229 spin_lock(&gl->gl_spin);
1230
1231 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1232 ready = 1;
1233 else if (list_empty(&gh->gh_list)) {
1234 if (gh->gh_error == GLR_CANCELED) {
1235 spin_unlock(&gl->gl_spin);
1236 msleep(100);
1237 if (gfs2_glock_nq(gh))
1238 return 1;
1239 return 0;
1240 } else
1241 ready = 1;
1242 }
1243
1244 spin_unlock(&gl->gl_spin);
1245
1246 return ready;
1247}
1248
1249/**
1250 * gfs2_glock_wait - wait for a lock acquisition that was started with GL_ASYNC
1251 * @gh: the holder structure
1252 *
1253 * Returns: 0, GLR_TRYFAILED, or errno on failure
1254 */
1255
1256int gfs2_glock_wait(struct gfs2_holder *gh)
1257{
1258 int error;
1259
1260 error = glock_wait_internal(gh);
1261 if (error == GLR_CANCELED) {
1262 msleep(100);
1263 gh->gh_flags &= ~GL_ASYNC;
1264 error = gfs2_glock_nq(gh);
1265 }
1266
1267 return error;
1268}
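
/*
 * A minimal illustrative sketch (not part of the original patch) of the
 * asynchronous pattern served by gfs2_glock_poll() and gfs2_glock_wait():
 * GL_ASYNC makes gfs2_glock_nq() queue the request and return at once,
 * gfs2_glock_poll() checks for completion without sleeping, and
 * gfs2_glock_wait() collects the final status (retrying internally on
 * GLR_CANCELED).  The busy-wait below is only for illustration; the
 * "example_" name is hypothetical.
 */
static int example_async_acquire(struct gfs2_glock *gl)
{
	struct gfs2_holder gh;
	int error;

	gfs2_holder_init(gl, LM_ST_EXCLUSIVE, GL_ASYNC, &gh);
	error = gfs2_glock_nq(&gh);	/* only queues the request */
	if (error)
		goto out;		/* e.g. -EIO after a withdraw */

	while (!gfs2_glock_poll(&gh))
		cond_resched();		/* other useful work would go here */

	error = gfs2_glock_wait(&gh);
	if (!error) {
		/* ... work under the exclusive glock ... */
		gfs2_glock_dq(&gh);
	}
out:
	gfs2_holder_uninit(&gh);
	return error;
}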
1269
1270/**
1271 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1272 * @gh: the glock holder
1273 *
1274 */
1275
1276void gfs2_glock_dq(struct gfs2_holder *gh)
1277{
1278 struct gfs2_glock *gl = gh->gh_gl;
1279 struct gfs2_glock_operations *glops = gl->gl_ops;
1280
1281 if (gh->gh_flags & GL_SYNC)
1282 set_bit(GLF_SYNC, &gl->gl_flags);
1283
1284 if (gh->gh_flags & GL_NOCACHE)
1285 handle_callback(gl, LM_ST_UNLOCKED);
1286
1287 gfs2_glmutex_lock(gl);
1288
1289 spin_lock(&gl->gl_spin);
1290 list_del_init(&gh->gh_list);
1291
1292 if (list_empty(&gl->gl_holders)) {
1293 spin_unlock(&gl->gl_spin);
1294
1295 if (glops->go_unlock)
1296 glops->go_unlock(gh);
1297
1298 if (test_bit(GLF_SYNC, &gl->gl_flags)) {
1299 if (glops->go_sync)
1300 glops->go_sync(gl, DIO_METADATA | DIO_DATA);
1301 }
1302
1303 gl->gl_stamp = jiffies;
1304
1305 spin_lock(&gl->gl_spin);
1306 }
1307
1308 clear_bit(GLF_LOCK, &gl->gl_flags);
1309 run_queue(gl);
1310 spin_unlock(&gl->gl_spin);
1311}
1312
1313/**
1314 * gfs2_glock_prefetch - Try to prefetch a glock
1315 * @gl: the glock
1316 * @state: the state to prefetch in
1317 * @flags: flags passed to go_xmote_th()
1318 *
1319 */
1320
1321static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
1322 int flags)
1323{
1324 struct gfs2_glock_operations *glops = gl->gl_ops;
1325
1326 spin_lock(&gl->gl_spin);
1327
1328 if (test_bit(GLF_LOCK, &gl->gl_flags) ||
1329 !list_empty(&gl->gl_holders) ||
1330 !list_empty(&gl->gl_waiters1) ||
1331 !list_empty(&gl->gl_waiters2) ||
1332 !list_empty(&gl->gl_waiters3) ||
1333 relaxed_state_ok(gl->gl_state, state, flags)) {
1334 spin_unlock(&gl->gl_spin);
1335 return;
1336 }
1337
1338 set_bit(GLF_PREFETCH, &gl->gl_flags);
1339 set_bit(GLF_LOCK, &gl->gl_flags);
1340 spin_unlock(&gl->gl_spin);
1341
1342 glops->go_xmote_th(gl, state, flags);
1343}
1344
1345static void greedy_work(void *data)
1346{
1347 struct greedy *gr = data;
1348 struct gfs2_holder *gh = &gr->gr_gh;
1349 struct gfs2_glock *gl = gh->gh_gl;
1350 struct gfs2_glock_operations *glops = gl->gl_ops;
1351
1352 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1353
1354 if (glops->go_greedy)
1355 glops->go_greedy(gl);
1356
1357 spin_lock(&gl->gl_spin);
1358
1359 if (list_empty(&gl->gl_waiters2)) {
1360 clear_bit(GLF_GREEDY, &gl->gl_flags);
1361 spin_unlock(&gl->gl_spin);
1362 gfs2_holder_uninit(gh);
1363 kfree(gr);
1364 } else {
1365 gfs2_glock_hold(gl);
1366 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1367 run_queue(gl);
1368 spin_unlock(&gl->gl_spin);
1369 gfs2_glock_put(gl);
1370 }
1371}
1372
1373/**
1374 * gfs2_glock_be_greedy - schedule delayed go_greedy processing for a glock
1375 * @gl: the glock
1376 * @time: the delay, in jiffies, before greedy_work() runs
1377 *
1378 * Returns: 0 if go_greedy will be called, 1 otherwise
1379 */
1380
1381int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1382{
1383 struct greedy *gr;
1384 struct gfs2_holder *gh;
1385
1386 if (!time ||
1387 gl->gl_sbd->sd_args.ar_localcaching ||
1388 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1389 return 1;
1390
1391 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1392 if (!gr) {
1393 clear_bit(GLF_GREEDY, &gl->gl_flags);
1394 return 1;
1395 }
1396 gh = &gr->gr_gh;
1397
1398 gfs2_holder_init(gl, 0, 0, gh);
1399 set_bit(HIF_GREEDY, &gh->gh_iflags);
1400 INIT_WORK(&gr->gr_work, greedy_work, gr);
1401
1402 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1403 schedule_delayed_work(&gr->gr_work, time);
1404
1405 return 0;
1406}
1407
1408/**
1409 * gfs2_glock_dq_uninit - dequeue a holder from a glock and uninitialize it
1410 * @gh: the holder structure
1411 *
1412 */
1413
1414void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1415{
1416 gfs2_glock_dq(gh);
1417 gfs2_holder_uninit(gh);
1418}
1419
1420/**
1421 * gfs2_glock_nq_num - acquire a glock based on lock number
1422 * @sdp: the filesystem
1423 * @number: the lock number
1424 * @glops: the glock operations for the type of glock
1425 * @state: the state to acquire the glock in
1426 * @flags: modifier flags for the acquisition
1427 * @gh: the struct gfs2_holder
1428 *
1429 * Returns: errno
1430 */
1431
1432int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number,
1433 struct gfs2_glock_operations *glops, unsigned int state,
1434 int flags, struct gfs2_holder *gh)
1435{
1436 struct gfs2_glock *gl;
1437 int error;
1438
1439 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1440 if (!error) {
1441 error = gfs2_glock_nq_init(gl, state, flags, gh);
1442 gfs2_glock_put(gl);
1443 }
1444
1445 return error;
1446}
1447
1448/**
1449 * glock_compare - Compare two struct gfs2_glock structures for sorting
1450 * @arg_a: the first structure
1451 * @arg_b: the second structure
1452 *
1453 */
1454
1455static int glock_compare(const void *arg_a, const void *arg_b)
1456{
1457 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1458 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1459 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1460 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1461 int ret = 0;
1462
1463 if (a->ln_number > b->ln_number)
1464 ret = 1;
1465 else if (a->ln_number < b->ln_number)
1466 ret = -1;
1467 else {
1468 if (gh_a->gh_state == LM_ST_SHARED &&
1469 gh_b->gh_state == LM_ST_EXCLUSIVE)
1470 ret = 1;
1471 else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
1472 (gh_b->gh_flags & GL_LOCAL_EXCL))
1473 ret = 1;
1474 }
1475
1476 return ret;
1477}
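
/*
 * Editorial note on the ordering above: holders sort primarily by lock
 * number; for equal numbers an EXCLUSIVE request sorts ahead of a SHARED
 * one, and a GL_LOCAL_EXCL holder ahead of a plain one.  Because every
 * caller of nq_m_sync() acquires its glocks in this same global order,
 * two tasks can never wait for each other's locks in opposite orders,
 * which is what makes the multi-lock path deadlock-free.
 */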
1478
1479/**
1480 * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
1481 * @num_gh: the number of structures
1482 * @ghs: an array of struct gfs2_holder structures
1483 *
1484 * Returns: 0 on success (all glocks acquired),
1485 * errno on failure (no glocks acquired)
1486 */
1487
1488static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1489 struct gfs2_holder **p)
1490{
1491 unsigned int x;
1492 int error = 0;
1493
1494 for (x = 0; x < num_gh; x++)
1495 p[x] = &ghs[x];
1496
1497 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1498
1499 for (x = 0; x < num_gh; x++) {
1500 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1501
1502 error = gfs2_glock_nq(p[x]);
1503 if (error) {
1504 while (x--)
1505 gfs2_glock_dq(p[x]);
1506 break;
1507 }
1508 }
1509
1510 return error;
1511}
1512
1513/**
1514 * gfs2_glock_nq_m - acquire multiple glocks
1515 * @num_gh: the number of structures
1516 * @ghs: an array of struct gfs2_holder structures
1517 *
1518 * XXX: Figure out how big an impact this function has. Either:
1519 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1520 * 2) Forget async stuff and just call nq_m_sync()
1521 * 3) Leave it like it is
1522 *
1523 * Returns: 0 on success (all glocks acquired),
1524 * errno on failure (no glocks acquired)
1525 */
1526
1527int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1528{
1529 int *e;
1530 unsigned int x;
1531 int borked = 0, serious = 0;
1532 int error = 0;
1533
1534 if (!num_gh)
1535 return 0;
1536
1537 if (num_gh == 1) {
1538 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1539 return gfs2_glock_nq(ghs);
1540 }
1541
1542 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1543 if (!e)
1544 return -ENOMEM;
1545
1546 for (x = 0; x < num_gh; x++) {
1547 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1548 error = gfs2_glock_nq(&ghs[x]);
1549 if (error) {
1550 borked = 1;
1551 serious = error;
1552 num_gh = x;
1553 break;
1554 }
1555 }
1556
1557 for (x = 0; x < num_gh; x++) {
1558 error = e[x] = glock_wait_internal(&ghs[x]);
1559 if (error) {
1560 borked = 1;
1561 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1562 serious = error;
1563 }
1564 }
1565
1566 if (!borked) {
1567 kfree(e);
1568 return 0;
1569 }
1570
1571 for (x = 0; x < num_gh; x++)
1572 if (!e[x])
1573 gfs2_glock_dq(&ghs[x]);
1574
1575 if (serious)
1576 error = serious;
1577 else {
1578 for (x = 0; x < num_gh; x++)
1579 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1580 &ghs[x]);
1581 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1582 }
1583
1584 kfree(e);
1585
1586 return error;
1587}
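
/*
 * Editorial summary of the strategy above: gfs2_glock_nq_m() first fires
 * off all requests asynchronously with LM_FLAG_TRY, hoping they all
 * succeed without ever sleeping on one lock while holding another.  Only
 * if some try-request fails (and no hard error occurred) does it back
 * everything out and fall through to nq_m_sync(), which takes the locks
 * one at a time in the sorted, deadlock-free order.
 */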
1588
1589/**
1590 * gfs2_glock_dq_m - release multiple glocks
1591 * @num_gh: the number of structures
1592 * @ghs: an array of struct gfs2_holder structures
1593 *
1594 */
1595
1596void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1597{
1598 unsigned int x;
1599
1600 for (x = 0; x < num_gh; x++)
1601 gfs2_glock_dq(&ghs[x]);
1602}
1603
1604/**
1605 * gfs2_glock_dq_uninit_m - release multiple glocks
1606 * @num_gh: the number of structures
1607 * @ghs: an array of struct gfs2_holder structures
1608 *
1609 */
1610
1611void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1612{
1613 unsigned int x;
1614
1615 for (x = 0; x < num_gh; x++)
1616 gfs2_glock_dq_uninit(&ghs[x]);
1617}
1618
1619/**
1620 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1621 * @sdp: the filesystem
1622 * @number: the lock number
1623 * @glops: the glock operations for the type of glock
1624 * @state: the state to acquire the glock in
1625 * @flags: modifier flags for the acquisition
1626 *
1627 * Returns nothing; prefetch failures are silently ignored.
1628 */
1629
1630void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
1631 struct gfs2_glock_operations *glops,
1632 unsigned int state, int flags)
1633{
1634 struct gfs2_glock *gl;
1635 int error;
1636
1637 if (atomic_read(&sdp->sd_reclaim_count) <
1638 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1639 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1640 if (!error) {
1641 gfs2_glock_prefetch(gl, state, flags);
1642 gfs2_glock_put(gl);
1643 }
1644 }
1645}
1646
1647/**
1648 * gfs2_lvb_hold - attach an LVB to a glock
1649 * @gl: The glock in question
1650 *
1651 */
1652
1653int gfs2_lvb_hold(struct gfs2_glock *gl)
1654{
1655 int error;
1656
1657 gfs2_glmutex_lock(gl);
1658
1659 if (!atomic_read(&gl->gl_lvb_count)) {
1660 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1661 if (error) {
1662 gfs2_glmutex_unlock(gl);
1663 return error;
1664 }
1665 gfs2_glock_hold(gl);
1666 }
1667 atomic_inc(&gl->gl_lvb_count);
1668
1669 gfs2_glmutex_unlock(gl);
1670
1671 return 0;
1672}
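
/*
 * A minimal illustrative sketch (not part of the original patch) of LVB
 * usage: gfs2_lvb_hold() pins the lock value block so that gl->gl_lvb
 * stays valid until the matching gfs2_lvb_unhold().  The "example_" name
 * and the caller-supplied length are hypothetical; len must not exceed
 * the LVB size.
 */
static int example_peek_lvb(struct gfs2_glock *gl, char *buf, unsigned int len)
{
	int error = gfs2_lvb_hold(gl);
	if (error)
		return error;
	memcpy(buf, gl->gl_lvb, len);
	gfs2_lvb_unhold(gl);
	return 0;
}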
1673
1674/**
1675 * gfs2_lvb_unhold - detach an LVB from a glock
1676 * @gl: The glock in question
1677 *
1678 */
1679
1680void gfs2_lvb_unhold(struct gfs2_glock *gl)
1681{
1682 gfs2_glock_hold(gl);
1683 gfs2_glmutex_lock(gl);
1684
1685 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1686 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1687 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1688 gl->gl_lvb = NULL;
1689 gfs2_glock_put(gl);
1690 }
1691
1692 gfs2_glmutex_unlock(gl);
1693 gfs2_glock_put(gl);
1694}
1695
1696#if 0
1697void gfs2_lvb_sync(struct gfs2_glock *gl)
1698{
1699 gfs2_glmutex_lock(gl);
1700
1701 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count));
1702 if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl)))
1703 gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1704
1705 gfs2_glmutex_unlock(gl);
1706}
1707#endif /* 0 */
1708
1709static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1710 unsigned int state)
1711{
1712 struct gfs2_glock *gl;
1713
1714 gl = gfs2_glock_find(sdp, name);
1715 if (!gl)
1716 return;
1717
1718 if (gl->gl_ops->go_callback)
1719 gl->gl_ops->go_callback(gl, state);
1720 handle_callback(gl, state);
1721
1722 spin_lock(&gl->gl_spin);
1723 run_queue(gl);
1724 spin_unlock(&gl->gl_spin);
1725
1726 gfs2_glock_put(gl);
1727}
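
/*
 * Editorial note: blocking_cb() runs in the lock module's callback
 * context.  It gives the glock type a chance to react via go_callback(),
 * records the remote node's demand with handle_callback(), and kicks
 * run_queue() so that the state machine demotes the lock once the local
 * holders have drained.
 */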
1728
1729/**
1730 * gfs2_glock_cb - Callback used by locking module
1731 * @fsdata: Pointer to the superblock
1732 * @type: Type of callback
1733 * @data: Type dependent data pointer
1734 *
1735 * Called by the locking module when it wants to tell us something.
1736 * Either we need to drop a lock, one of our ASYNC requests completed, or
1737 * a journal from another client needs to be recovered.
1738 */
1739
1740void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
1741{
1742 struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata;
1743
1744 switch (type) {
1745 case LM_CB_NEED_E:
1746 blocking_cb(sdp, data, LM_ST_UNLOCKED);
1747 return;
1748
1749 case LM_CB_NEED_D:
1750 blocking_cb(sdp, data, LM_ST_DEFERRED);
1751 return;
1752
1753 case LM_CB_NEED_S:
1754 blocking_cb(sdp, data, LM_ST_SHARED);
1755 return;
1756
1757 case LM_CB_ASYNC: {
1758 struct lm_async_cb *async = data;
1759 struct gfs2_glock *gl;
1760
1761 gl = gfs2_glock_find(sdp, &async->lc_name);
1762 if (gfs2_assert_warn(sdp, gl))
1763 return;
1764 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1765 gl->gl_req_bh(gl, async->lc_ret);
1766 gfs2_glock_put(gl);
1767 return;
1768 }
1769
1770 case LM_CB_NEED_RECOVERY:
1771 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1772 if (sdp->sd_recoverd_process)
1773 wake_up_process(sdp->sd_recoverd_process);
1774 return;
1775
1776 case LM_CB_DROPLOCKS:
1777 gfs2_gl_hash_clear(sdp, NO_WAIT);
1778 gfs2_quota_scan(sdp);
1779 return;
1780
1781 default:
1782 gfs2_assert_warn(sdp, 0);
1783 return;
1784 }
1785}
1786
1787/**
1788 * gfs2_try_toss_inode - try to remove a particular inode struct from cache
1789 * @sdp: the filesystem
1790 * @inum: the inode number
1791 *
1792 */
1793
1794void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum)
1795{
1796 struct gfs2_glock *gl;
1797 struct gfs2_inode *ip;
1798 int error;
1799
1800 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops,
1801 NO_CREATE, &gl);
1802 if (error || !gl)
1803 return;
1804
1805 if (!gfs2_glmutex_trylock(gl))
1806 goto out;
1807
1808 ip = gl->gl_object;
1809 if (!ip)
1810 goto out_unlock;
1811
1812 if (atomic_read(&ip->i_count))
1813 goto out_unlock;
1814
1815 gfs2_inode_destroy(ip, 1);
1816
1817 out_unlock:
1818 gfs2_glmutex_unlock(gl);
1819
1820 out:
1821 gfs2_glock_put(gl);
1822}
1823
1824/**
1825 * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an
1826 * iopen glock from memory
1827 * @io_gl: the iopen glock
1828 * @state: the state into which the glock should be put
1829 *
1830 */
1831
1832void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state)
1833{
1834 struct gfs2_glock *i_gl;
1835
1836 if (state != LM_ST_UNLOCKED)
1837 return;
1838
1839 spin_lock(&io_gl->gl_spin);
1840 i_gl = io_gl->gl_object;
1841 if (i_gl) {
1842 gfs2_glock_hold(i_gl);
1843 spin_unlock(&io_gl->gl_spin);
1844 } else {
1845 spin_unlock(&io_gl->gl_spin);
1846 return;
1847 }
1848
1849 if (gfs2_glmutex_trylock(i_gl)) {
1850 struct gfs2_inode *ip = i_gl->gl_object;
1851 if (ip) {
1852 gfs2_try_toss_vnode(ip);
1853 gfs2_glmutex_unlock(i_gl);
1854 gfs2_glock_schedule_for_reclaim(i_gl);
1855 goto out;
1856 }
1857 gfs2_glmutex_unlock(i_gl);
1858 }
1859
1860 out:
1861 gfs2_glock_put(i_gl);
1862}
1863
1864/**
1865 * demote_ok - Check to see if it's ok to unlock a glock
1866 * @gl: the glock
1867 *
1868 * Returns: 1 if it's ok
1869 */
1870
1871static int demote_ok(struct gfs2_glock *gl)
1872{
1873 struct gfs2_sbd *sdp = gl->gl_sbd;
1874 struct gfs2_glock_operations *glops = gl->gl_ops;
1875 int demote = 1;
1876
1877 if (test_bit(GLF_STICKY, &gl->gl_flags))
1878 demote = 0;
1879 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
1880 demote = time_after_eq(jiffies,
1881 gl->gl_stamp +
1882 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
1883 else if (glops->go_demote_ok)
1884 demote = glops->go_demote_ok(gl);
1885
1886 return demote;
1887}
1888
1889/**
1890 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1891 * @gl: the glock
1892 *
1893 */
1894
1895void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1896{
1897 struct gfs2_sbd *sdp = gl->gl_sbd;
1898
1899 spin_lock(&sdp->sd_reclaim_lock);
1900 if (list_empty(&gl->gl_reclaim)) {
1901 gfs2_glock_hold(gl);
1902 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1903 atomic_inc(&sdp->sd_reclaim_count);
1904 }
1905 spin_unlock(&sdp->sd_reclaim_lock);
1906
1907 wake_up(&sdp->sd_reclaim_wq);
1908}
1909
1910/**
1911 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1912 * @sdp: the filesystem
1913 *
1914 * Called from the gfs2_glockd() glock-reclaim daemon, or when promoting
1915 * a different glock after noticing that a lot of glocks sit on the
1916 * reclaim list.
1917 *
1918 */
1919
1920void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
1921{
1922 struct gfs2_glock *gl;
1923
1924 spin_lock(&sdp->sd_reclaim_lock);
1925 if (list_empty(&sdp->sd_reclaim_list)) {
1926 spin_unlock(&sdp->sd_reclaim_lock);
1927 return;
1928 }
1929 gl = list_entry(sdp->sd_reclaim_list.next,
1930 struct gfs2_glock, gl_reclaim);
1931 list_del_init(&gl->gl_reclaim);
1932 spin_unlock(&sdp->sd_reclaim_lock);
1933
1934 atomic_dec(&sdp->sd_reclaim_count);
1935 atomic_inc(&sdp->sd_reclaimed);
1936
1937 if (gfs2_glmutex_trylock(gl)) {
1938 if (gl->gl_ops == &gfs2_inode_glops) {
1939 struct gfs2_inode *ip = gl->gl_object;
1940 if (ip && !atomic_read(&ip->i_count))
1941 gfs2_inode_destroy(ip, 1);
1942 }
1943 if (queue_empty(gl, &gl->gl_holders) &&
1944 gl->gl_state != LM_ST_UNLOCKED &&
1945 demote_ok(gl))
1946 handle_callback(gl, LM_ST_UNLOCKED);
1947 gfs2_glmutex_unlock(gl);
1948 }
1949
1950 gfs2_glock_put(gl);
1951}
1952
1953/**
1954 * examine_bucket - Call a function for each glock in a hash bucket
1955 * @examiner: the function
1956 * @sdp: the filesystem
1957 * @bucket: the bucket
1958 *
1959 * Returns: 1 if the bucket has entries
1960 */
1961
1962static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
1963 struct gfs2_gl_hash_bucket *bucket)
1964{
1965 struct glock_plug plug;
1966 struct list_head *tmp;
1967 struct gfs2_glock *gl;
1968 int entries;
1969
1970 /* Add "plug" to end of bucket list, work back up list from there */
1971 memset(&plug.gl_flags, 0, sizeof(unsigned long));
1972 set_bit(GLF_PLUG, &plug.gl_flags);
1973
1974 write_lock(&bucket->hb_lock);
1975 list_add(&plug.gl_list, &bucket->hb_list);
1976 write_unlock(&bucket->hb_lock);
1977
1978 for (;;) {
1979 write_lock(&bucket->hb_lock);
1980
1981 for (;;) {
1982 tmp = plug.gl_list.next;
1983
1984 if (tmp == &bucket->hb_list) {
1985 list_del(&plug.gl_list);
1986 entries = !list_empty(&bucket->hb_list);
1987 write_unlock(&bucket->hb_lock);
1988 return entries;
1989 }
1990 gl = list_entry(tmp, struct gfs2_glock, gl_list);
1991
1992 /* Move plug up list */
1993 list_move(&plug.gl_list, &gl->gl_list);
1994
1995 if (test_bit(GLF_PLUG, &gl->gl_flags))
1996 continue;
1997
1998			/* examiner() must call gfs2_glock_put() to drop this ref */
1999 gfs2_glock_hold(gl);
2000
2001 break;
2002 }
2003
2004 write_unlock(&bucket->hb_lock);
2005
2006 examiner(gl);
2007 }
2008}
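
/*
 * Editorial note on the "plug" technique above: a dummy glock marked
 * GLF_PLUG is inserted into the bucket as a movable cursor.  The bucket
 * lock can then be dropped while examiner() runs (it may sleep or free
 * glocks), and the walk resumes from the plug's current position instead
 * of from a list pointer that may have gone stale in the meantime.
 */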
2009
2010/**
2011 * scan_glock - look at a glock and see if we can reclaim it
2012 * @gl: the glock to look at
2013 *
2014 */
2015
2016static void scan_glock(struct gfs2_glock *gl)
2017{
2018 if (gfs2_glmutex_trylock(gl)) {
2019 if (gl->gl_ops == &gfs2_inode_glops) {
2020 struct gfs2_inode *ip = gl->gl_object;
2021 if (ip && !atomic_read(&ip->i_count))
2022 goto out_schedule;
2023 }
2024 if (queue_empty(gl, &gl->gl_holders) &&
2025 gl->gl_state != LM_ST_UNLOCKED &&
2026 demote_ok(gl))
2027 goto out_schedule;
2028
2029 gfs2_glmutex_unlock(gl);
2030 }
2031
2032 gfs2_glock_put(gl);
2033
2034 return;
2035
2036 out_schedule:
2037 gfs2_glmutex_unlock(gl);
2038 gfs2_glock_schedule_for_reclaim(gl);
2039 gfs2_glock_put(gl);
2040}
2041
2042/**
2043 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
2044 * @sdp: the filesystem
2045 *
2046 */
2047
2048void gfs2_scand_internal(struct gfs2_sbd *sdp)
2049{
2050 unsigned int x;
2051
2052 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2053 examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
2054 cond_resched();
2055 }
2056}
2057
2058/**
2059 * clear_glock - look at a glock and see if we can free it from the glock cache
2060 * @gl: the glock to look at
2061 *
2062 */
2063
2064static void clear_glock(struct gfs2_glock *gl)
2065{
2066 struct gfs2_sbd *sdp = gl->gl_sbd;
2067 int released;
2068
2069 spin_lock(&sdp->sd_reclaim_lock);
2070 if (!list_empty(&gl->gl_reclaim)) {
2071 list_del_init(&gl->gl_reclaim);
2072 atomic_dec(&sdp->sd_reclaim_count);
2073 spin_unlock(&sdp->sd_reclaim_lock);
2074 released = gfs2_glock_put(gl);
2075 gfs2_assert(sdp, !released);
2076 } else {
2077 spin_unlock(&sdp->sd_reclaim_lock);
2078 }
2079
2080 if (gfs2_glmutex_trylock(gl)) {
2081 if (gl->gl_ops == &gfs2_inode_glops) {
2082 struct gfs2_inode *ip = gl->gl_object;
2083 if (ip && !atomic_read(&ip->i_count))
2084 gfs2_inode_destroy(ip, 1);
2085 }
2086 if (queue_empty(gl, &gl->gl_holders) &&
2087 gl->gl_state != LM_ST_UNLOCKED)
2088 handle_callback(gl, LM_ST_UNLOCKED);
2089
2090 gfs2_glmutex_unlock(gl);
2091 }
2092
2093 gfs2_glock_put(gl);
2094}
2095
2096/**
2097 * gfs2_gl_hash_clear - Empty out the glock hash table
2098 * @sdp: the filesystem
2099 * @wait: wait until it's all gone
2100 *
2101 * Called when unmounting the filesystem, or when inter-node lock manager
2102 * requests DROPLOCKS because it is running out of capacity.
2103 */
2104
2105void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
2106{
2107 unsigned long t;
2108 unsigned int x;
2109 int cont;
2110
2111 t = jiffies;
2112
2113 for (;;) {
2114 cont = 0;
2115
2116 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
2117 if (examine_bucket(clear_glock, sdp,
2118 &sdp->sd_gl_hash[x]))
2119 cont = 1;
2120
2121 if (!wait || !cont)
2122 break;
2123
2124 if (time_after_eq(jiffies,
2125 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2126 fs_warn(sdp, "Unmount seems to be stalled. "
2127 "Dumping lock state...\n");
2128 gfs2_dump_lockstate(sdp);
2129 t = jiffies;
2130 }
2131
2132		/* invalidate_inodes() requires that the sb inodes list
2133		   not change, but an async completion callback for an
2134		   unlock can occur which does glock_put(), which
2135		   can call iput(), which will change the sb inodes list.
2136		   invalidate_inodes_mutex prevents glock_put() calls during
2137		   an invalidate_inodes() */
2138
2139 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
2140 invalidate_inodes(sdp->sd_vfs);
2141 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
2142 msleep(10);
2143 }
2144}
2145
2146/*
2147 * Diagnostic routines to help debug distributed deadlock
2148 */
2149
2150/**
2151 * dump_holder - print information about a glock holder
2152 * @str: a string naming the type of holder
2153 * @gh: the glock holder
2154 *
2155 * Returns: 0 on success, -ENOBUFS when we run out of space
2156 */
2157
2158static int dump_holder(char *str, struct gfs2_holder *gh)
2159{
2160 unsigned int x;
2161 int error = -ENOBUFS;
2162
2163 printk(KERN_INFO " %s\n", str);
2164 printk(KERN_INFO " owner = %ld\n",
2165 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2166 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2167 printk(KERN_INFO " gh_flags =");
2168 for (x = 0; x < 32; x++)
2169 if (gh->gh_flags & (1 << x))
2170 printk(" %u", x);
2171 printk(" \n");
2172 printk(KERN_INFO " error = %d\n", gh->gh_error);
2173 printk(KERN_INFO " gh_iflags =");
2174 for (x = 0; x < 32; x++)
2175 if (test_bit(x, &gh->gh_iflags))
2176 printk(" %u", x);
2177 printk(" \n");
2178 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2179
2180 error = 0;
2181
2182 return error;
2183}
2184
2185/**
2186 * dump_inode - print information about an inode
2187 * @ip: the inode
2188 *
2189 * Returns: 0 on success, -ENOBUFS when we run out of space
2190 */
2191
2192static int dump_inode(struct gfs2_inode *ip)
2193{
2194 unsigned int x;
2195 int error = -ENOBUFS;
2196
2197 printk(KERN_INFO " Inode:\n");
2198 printk(KERN_INFO " num = %llu %llu\n",
2199 (unsigned long long)ip->i_num.no_formal_ino,
2200 (unsigned long long)ip->i_num.no_addr);
2201 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2202 printk(KERN_INFO " i_count = %d\n", atomic_read(&ip->i_count));
2203 printk(KERN_INFO " i_flags =");
2204 for (x = 0; x < 32; x++)
2205 if (test_bit(x, &ip->i_flags))
2206 printk(" %u", x);
2207 printk(" \n");
2208 printk(KERN_INFO " vnode = %s\n", (ip->i_vnode) ? "yes" : "no");
2209
2210 error = 0;
2211
2212 return error;
2213}
2214
2215/**
2216 * dump_glock - print information about a glock
2217 * @gl: the glock
2218 * Must be called without gl->gl_spin held; it takes that spinlock itself.
2219 *
2220 * Returns: 0 on success, -ENOBUFS when we run out of space
2221 */
2222
2223static int dump_glock(struct gfs2_glock *gl)
2224{
2225 struct gfs2_holder *gh;
2226 unsigned int x;
2227 int error = -ENOBUFS;
2228
2229 spin_lock(&gl->gl_spin);
2230
2231 printk(KERN_INFO "Glock (%u, %llu)\n", gl->gl_name.ln_type,
2232 (unsigned long long)gl->gl_name.ln_number);
2233 printk(KERN_INFO " gl_flags =");
2234 for (x = 0; x < 32; x++)
2235 if (test_bit(x, &gl->gl_flags))
2236 printk(" %u", x);
2237 printk(" \n");
2238 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount));
2239 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2240	printk(KERN_INFO "  gl_owner = %s\n", gl->gl_owner ? gl->gl_owner->comm : "none");
2241 print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
2242 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2243 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2244 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2245 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2246 printk(KERN_INFO " le = %s\n",
2247 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2248 printk(KERN_INFO " reclaim = %s\n",
2249 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2250 if (gl->gl_aspace)
2251 printk(KERN_INFO " aspace = %lu\n",
2252 gl->gl_aspace->i_mapping->nrpages);
2253 else
2254 printk(KERN_INFO " aspace = no\n");
2255 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2256 if (gl->gl_req_gh) {
2257 error = dump_holder("Request", gl->gl_req_gh);
2258 if (error)
2259 goto out;
2260 }
2261 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2262 error = dump_holder("Holder", gh);
2263 if (error)
2264 goto out;
2265 }
2266 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2267 error = dump_holder("Waiter1", gh);
2268 if (error)
2269 goto out;
2270 }
2271 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2272 error = dump_holder("Waiter2", gh);
2273 if (error)
2274 goto out;
2275 }
2276 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2277 error = dump_holder("Waiter3", gh);
2278 if (error)
2279 goto out;
2280 }
2281 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2282 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2283 list_empty(&gl->gl_holders)) {
2284 error = dump_inode(gl->gl_object);
2285 if (error)
2286 goto out;
2287 } else {
2288 error = -ENOBUFS;
2289 printk(KERN_INFO " Inode: busy\n");
2290 }
2291 }
2292
2293 error = 0;
2294
2295 out:
2296 spin_unlock(&gl->gl_spin);
2297
2298 return error;
2299}
2300
2301/**
2302 * gfs2_dump_lockstate - print out the current lockstate
2303 * @sdp: the filesystem
2304 *
2305 * Dumps the state of every glock to the console.  Returns the first
2306 * error from dump_glock(), or 0.
2307 *
2308 */
2309
2310static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2311{
2312 struct gfs2_gl_hash_bucket *bucket;
2313 struct gfs2_glock *gl;
2314 unsigned int x;
2315 int error = 0;
2316
2317 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2318 bucket = &sdp->sd_gl_hash[x];
2319
2320 read_lock(&bucket->hb_lock);
2321
2322 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
2323 if (test_bit(GLF_PLUG, &gl->gl_flags))
2324 continue;
2325
2326 error = dump_glock(gl);
2327 if (error)
2328 break;
2329 }
2330
2331 read_unlock(&bucket->hb_lock);
2332
2333 if (error)
2334 break;
2335 }
2336
2337
2338 return error;
2339}
2340
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2e0a2ba92aa0
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,155 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13/* Flags for lock requests; used in gfs2_holder gh_flags field.
14 From lm_interface.h:
15#define LM_FLAG_TRY 0x00000001
16#define LM_FLAG_TRY_1CB 0x00000002
17#define LM_FLAG_NOEXP 0x00000004
18#define LM_FLAG_ANY 0x00000008
19#define LM_FLAG_PRIORITY 0x00000010 */
20
21#define GL_LOCAL_EXCL 0x00000020
22#define GL_ASYNC 0x00000040
23#define GL_EXACT 0x00000080
24#define GL_SKIP 0x00000100
25#define GL_ATIME 0x00000200
26#define GL_NOCACHE 0x00000400
27#define GL_SYNC 0x00000800
28#define GL_NOCANCEL 0x00001000
29#define GL_AOP 0x00004000
30#define GL_DUMP 0x00008000
31
32#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14
34
35static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{
37 struct gfs2_holder *gh;
38 int locked = 0;
39
40 /* Look in glock's list of holders for one with current task as owner */
41 spin_lock(&gl->gl_spin);
42 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
43 if (gh->gh_owner == current) {
44 locked = 1;
45 break;
46 }
47 }
48 spin_unlock(&gl->gl_spin);
49
50 return locked;
51}
52
53static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
54{
55 return (gl->gl_state == LM_ST_EXCLUSIVE);
56}
57
58static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
59{
60 return (gl->gl_state == LM_ST_DEFERRED);
61}
62
63static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
64{
65 return (gl->gl_state == LM_ST_SHARED);
66}
67
68static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
69{
70 int ret;
71 spin_lock(&gl->gl_spin);
72 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
73 spin_unlock(&gl->gl_spin);
74 return ret;
75}
76
77int gfs2_glock_get(struct gfs2_sbd *sdp,
78 uint64_t number, struct gfs2_glock_operations *glops,
79 int create, struct gfs2_glock **glp);
80void gfs2_glock_hold(struct gfs2_glock *gl);
81int gfs2_glock_put(struct gfs2_glock *gl);
82void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
83 struct gfs2_holder *gh);
84void gfs2_holder_reinit(unsigned int state, unsigned flags,
85 struct gfs2_holder *gh);
86void gfs2_holder_uninit(struct gfs2_holder *gh);
87
88void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
89void gfs2_glock_drop_th(struct gfs2_glock *gl);
90
91void gfs2_glmutex_lock(struct gfs2_glock *gl);
92void gfs2_glmutex_unlock(struct gfs2_glock *gl);
93
94int gfs2_glock_nq(struct gfs2_holder *gh);
95int gfs2_glock_poll(struct gfs2_holder *gh);
96int gfs2_glock_wait(struct gfs2_holder *gh);
97void gfs2_glock_dq(struct gfs2_holder *gh);
98
99int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
100
101void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
102int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
103 uint64_t number, struct gfs2_glock_operations *glops,
104 unsigned int state, int flags, struct gfs2_holder *gh);
105
106int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
107void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
108void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
109
110void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
111 struct gfs2_glock_operations *glops,
112 unsigned int state, int flags);
113
114/**
115 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
116 * @gl: the glock
117 * @state: the state we're requesting
118 * @flags: the modifier flags
119 * @gh: the holder structure
120 *
121 * Returns: 0, GLR_*, or errno
122 */
123
124static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
125 unsigned int state, int flags,
126 struct gfs2_holder *gh)
127{
128 int error;
129
130 gfs2_holder_init(gl, state, flags, gh);
131
132 error = gfs2_glock_nq(gh);
133 if (error)
134 gfs2_holder_uninit(gh);
135
136 return error;
137}
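
/*
 * A minimal illustrative sketch (not part of the original patch) of the
 * one-shot pairing most callers use: gfs2_glock_nq_init() to initialize
 * and enqueue in a single step, gfs2_glock_dq_uninit() to undo both.
 * The "example_" name is hypothetical.
 */
static inline int example_with_glock_held(struct gfs2_glock *gl)
{
	struct gfs2_holder gh;
	int error;

	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
	if (error)
		return error;
	/* ... critical section under the shared glock ... */
	gfs2_glock_dq_uninit(&gh);
	return 0;
}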
138
139/* Lock Value Block functions */
140
141int gfs2_lvb_hold(struct gfs2_glock *gl);
142void gfs2_lvb_unhold(struct gfs2_glock *gl);
143
144void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data);
145
146void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum);
147void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state);
148
149void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
150void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
151
152void gfs2_scand_internal(struct gfs2_sbd *sdp);
153void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
154
155#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..e262f22f744e
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,491 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "bmap.h"
21#include "glock.h"
22#include "glops.h"
23#include "inode.h"
24#include "log.h"
25#include "meta_io.h"
26#include "page.h"
27#include "recovery.h"
28#include "rgrp.h"
29#include "util.h"
30
31/**
32 * meta_go_sync - sync out the metadata for this glock
33 * @gl: the glock
34 * @flags: DIO_*
35 *
36 * Called when demoting or unlocking an EX glock. We must flush
37 * to disk all dirty buffers/pages relating to this glock, and must not
38 * return to the caller to demote/unlock the glock until I/O is complete.
39 */
40
41static void meta_go_sync(struct gfs2_glock *gl, int flags)
42{
43 if (!(flags & DIO_METADATA))
44 return;
45
46 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
47 gfs2_log_flush(gl->gl_sbd, gl);
48 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
49 if (flags & DIO_RELEASE)
50 gfs2_ail_empty_gl(gl);
51 }
52
53 clear_bit(GLF_SYNC, &gl->gl_flags);
54}
55
56/**
57 * meta_go_inval - invalidate the metadata for this glock
58 * @gl: the glock
59 * @flags:
60 *
61 */
62
63static void meta_go_inval(struct gfs2_glock *gl, int flags)
64{
65 if (!(flags & DIO_METADATA))
66 return;
67
68 gfs2_meta_inval(gl);
69 gl->gl_vn++;
70}
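
/*
 * Editorial note: bumping gl_vn is the cache-invalidation signal for
 * everything guarded by this glock.  Holders keep a private copy of the
 * version number (e.g. ip->i_vn) and compare it against gl_vn when they
 * re-acquire the lock -- see inode_go_lock() below -- refreshing their
 * cached state only when the two differ.
 */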
71
72/**
73 * meta_go_demote_ok - Check to see if it's ok to unlock a glock
74 * @gl: the glock
75 *
76 * Returns: 1 if we have no cached data; ok to demote meta glock
77 */
78
79static int meta_go_demote_ok(struct gfs2_glock *gl)
80{
81 return !gl->gl_aspace->i_mapping->nrpages;
82}
83
84/**
85 * inode_go_xmote_th - promote/demote a glock
86 * @gl: the glock
87 * @state: the requested state
88 * @flags:
89 *
90 */
91
92static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
93 int flags)
94{
95 if (gl->gl_state != LM_ST_UNLOCKED)
96 gfs2_pte_inval(gl);
97 gfs2_glock_xmote_th(gl, state, flags);
98}
99
100/**
101 * inode_go_xmote_bh - After promoting/demoting a glock
102 * @gl: the glock
103 *
104 */
105
106static void inode_go_xmote_bh(struct gfs2_glock *gl)
107{
108 struct gfs2_holder *gh = gl->gl_req_gh;
109 struct buffer_head *bh;
110 int error;
111
112 if (gl->gl_state != LM_ST_UNLOCKED &&
113 (!gh || !(gh->gh_flags & GL_SKIP))) {
114 error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START,
115 &bh);
116 if (!error)
117 brelse(bh);
118 }
119}
120
121/**
122 * inode_go_drop_th - unlock a glock
123 * @gl: the glock
124 *
125 * Invoked from rq_demote().
126 * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for
127 * too long) is being purged from our node's glock cache; either way, drop it.
128 */
129
130static void inode_go_drop_th(struct gfs2_glock *gl)
131{
132 gfs2_pte_inval(gl);
133 gfs2_glock_drop_th(gl);
134}
135
136/**
137 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
138 * @gl: the glock protecting the inode
139 * @flags:
140 *
141 */
142
143static void inode_go_sync(struct gfs2_glock *gl, int flags)
144{
145 int meta = (flags & DIO_METADATA);
146 int data = (flags & DIO_DATA);
147
148 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
149 if (meta && data) {
150 gfs2_page_sync(gl, flags | DIO_START);
151 gfs2_log_flush(gl->gl_sbd, gl);
152 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
153 gfs2_page_sync(gl, flags | DIO_WAIT);
154 clear_bit(GLF_DIRTY, &gl->gl_flags);
155 } else if (meta) {
156 gfs2_log_flush(gl->gl_sbd, gl);
157 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
158 } else if (data)
159 gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT);
160 if (flags & DIO_RELEASE)
161 gfs2_ail_empty_gl(gl);
162 }
163
164 clear_bit(GLF_SYNC, &gl->gl_flags);
165}
166
167/**
168 * inode_go_inval - prepare an inode glock to be released
169 * @gl: the glock
170 * @flags:
171 *
172 */
173
174static void inode_go_inval(struct gfs2_glock *gl, int flags)
175{
176 int meta = (flags & DIO_METADATA);
177 int data = (flags & DIO_DATA);
178
179 if (meta) {
180 gfs2_meta_inval(gl);
181 gl->gl_vn++;
182 }
183 if (data)
184 gfs2_page_inval(gl);
185}
186
187/**
188 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
189 * @gl: the glock
190 *
191 * Returns: 1 if it's ok
192 */
193
194static int inode_go_demote_ok(struct gfs2_glock *gl)
195{
196 struct gfs2_sbd *sdp = gl->gl_sbd;
197 int demote = 0;
198
199 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
200 demote = 1;
201 else if (!sdp->sd_args.ar_localcaching &&
202 time_after_eq(jiffies, gl->gl_stamp +
203 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
204 demote = 1;
205
206 return demote;
207}
208
209/**
210 * inode_go_lock - operation done after an inode lock is locked by a process
211 * @gl: the glock
212 * @flags:
213 *
214 * Returns: errno
215 */
216
217static int inode_go_lock(struct gfs2_holder *gh)
218{
219 struct gfs2_glock *gl = gh->gh_gl;
220 struct gfs2_inode *ip = gl->gl_object;
221 int error = 0;
222
223 if (!ip)
224 return 0;
225
226 if (ip->i_vn != gl->gl_vn) {
227 error = gfs2_inode_refresh(ip);
228 if (error)
229 return error;
230 gfs2_inode_attr_in(ip);
231 }
232
233 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
234 (gl->gl_state == LM_ST_EXCLUSIVE) &&
235 (gh->gh_flags & GL_LOCAL_EXCL))
236 error = gfs2_truncatei_resume(ip);
237
238 return error;
239}
240
241/**
242 * inode_go_unlock - operation done before an inode lock is unlocked by a
243 * process
244 * @gl: the glock
245 * @flags:
246 *
247 */
248
249static void inode_go_unlock(struct gfs2_holder *gh)
250{
251 struct gfs2_glock *gl = gh->gh_gl;
252 struct gfs2_inode *ip = gl->gl_object;
253
254 if (ip && test_bit(GLF_DIRTY, &gl->gl_flags))
255 gfs2_inode_attr_in(ip);
256
257 if (ip)
258 gfs2_meta_cache_flush(ip);
259}
260
261/**
262 * inode_greedy - adjust the inode's greedy-hold interval after a page fault
263 * @gl: the glock
264 *
265 */
266
267static void inode_greedy(struct gfs2_glock *gl)
268{
269 struct gfs2_sbd *sdp = gl->gl_sbd;
270 struct gfs2_inode *ip = gl->gl_object;
271 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
272 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
273 unsigned int new_time;
274
275 spin_lock(&ip->i_spin);
276
277 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
278 new_time = ip->i_greedy + quantum;
279 if (new_time > max)
280 new_time = max;
281 } else {
282 new_time = ip->i_greedy - quantum;
283 if (!new_time || new_time > max)
284 new_time = 1;
285 }
286
287 ip->i_greedy = new_time;
288
289 spin_unlock(&ip->i_spin);
290
291 gfs2_inode_put(ip);
292}
293
294/**
295 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
296 * @gl: the glock
297 *
298 * Returns: 1 if it's ok
299 */
300
301static int rgrp_go_demote_ok(struct gfs2_glock *gl)
302{
303 return !gl->gl_aspace->i_mapping->nrpages;
304}
305
306/**
307 * rgrp_go_lock - operation done after an rgrp lock is locked by
308 * the first holder on this node.
309 * @gl: the glock
310 * @flags:
311 *
312 * Returns: errno
313 */
314
315static int rgrp_go_lock(struct gfs2_holder *gh)
316{
317 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
318}
319
320/**
321 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
322 * the last holder on this node.
323 * @gl: the glock
324 * @flags:
325 *
326 */
327
328static void rgrp_go_unlock(struct gfs2_holder *gh)
329{
330 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
331}
332
333/**
334 * trans_go_xmote_th - promote/demote the transaction glock
335 * @gl: the glock
336 * @state: the requested state
337 * @flags:
338 *
339 */
340
341static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
342 int flags)
343{
344 struct gfs2_sbd *sdp = gl->gl_sbd;
345
346 if (gl->gl_state != LM_ST_UNLOCKED &&
347 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
348 gfs2_meta_syncfs(sdp);
349 gfs2_log_shutdown(sdp);
350 }
351
352 gfs2_glock_xmote_th(gl, state, flags);
353}
354
355/**
356 * trans_go_xmote_bh - After promoting/demoting the transaction glock
357 * @gl: the glock
358 *
359 */
360
361static void trans_go_xmote_bh(struct gfs2_glock *gl)
362{
363 struct gfs2_sbd *sdp = gl->gl_sbd;
364 struct gfs2_inode *ip = sdp->sd_jdesc->jd_inode->u.generic_ip;
365 struct gfs2_glock *j_gl = ip->i_gl;
366 struct gfs2_log_header head;
367 int error;
368
369 if (gl->gl_state != LM_ST_UNLOCKED &&
370 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
371 gfs2_meta_cache_flush(sdp->sd_jdesc->jd_inode->u.generic_ip);
372 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
373
374 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
375 if (error)
376 gfs2_consist(sdp);
377 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
378 gfs2_consist(sdp);
379
380 /* Initialize some head of the log stuff */
381 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
382 sdp->sd_log_sequence = head.lh_sequence + 1;
383 gfs2_log_pointers_init(sdp, head.lh_blkno);
384 }
385 }
386}
387
388/**
389 * trans_go_drop_th - unlock the transaction glock
390 * @gl: the glock
391 *
392 * We want to sync the device even with localcaching. Remember
393 * that localcaching journal replay only marks buffers dirty.
394 */
395
396static void trans_go_drop_th(struct gfs2_glock *gl)
397{
398 struct gfs2_sbd *sdp = gl->gl_sbd;
399
400 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
401 gfs2_meta_syncfs(sdp);
402 gfs2_log_shutdown(sdp);
403 }
404
405 gfs2_glock_drop_th(gl);
406}
407
408/**
409 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
410 * @gl: the glock
411 *
412 * Returns: 1 if it's ok
413 */
414
415static int quota_go_demote_ok(struct gfs2_glock *gl)
416{
417 return !atomic_read(&gl->gl_lvb_count);
418}
419
420struct gfs2_glock_operations gfs2_meta_glops = {
421 .go_xmote_th = gfs2_glock_xmote_th,
422 .go_drop_th = gfs2_glock_drop_th,
423 .go_sync = meta_go_sync,
424 .go_inval = meta_go_inval,
425 .go_demote_ok = meta_go_demote_ok,
426 .go_type = LM_TYPE_META
427};
428
429struct gfs2_glock_operations gfs2_inode_glops = {
430 .go_xmote_th = inode_go_xmote_th,
431 .go_xmote_bh = inode_go_xmote_bh,
432 .go_drop_th = inode_go_drop_th,
433 .go_sync = inode_go_sync,
434 .go_inval = inode_go_inval,
435 .go_demote_ok = inode_go_demote_ok,
436 .go_lock = inode_go_lock,
437 .go_unlock = inode_go_unlock,
438 .go_greedy = inode_greedy,
439 .go_type = LM_TYPE_INODE
440};
441
442struct gfs2_glock_operations gfs2_rgrp_glops = {
443 .go_xmote_th = gfs2_glock_xmote_th,
444 .go_drop_th = gfs2_glock_drop_th,
445 .go_sync = meta_go_sync,
446 .go_inval = meta_go_inval,
447 .go_demote_ok = rgrp_go_demote_ok,
448 .go_lock = rgrp_go_lock,
449 .go_unlock = rgrp_go_unlock,
450 .go_type = LM_TYPE_RGRP
451};
452
453struct gfs2_glock_operations gfs2_trans_glops = {
454 .go_xmote_th = trans_go_xmote_th,
455 .go_xmote_bh = trans_go_xmote_bh,
456 .go_drop_th = trans_go_drop_th,
457 .go_type = LM_TYPE_NONDISK
458};
459
460struct gfs2_glock_operations gfs2_iopen_glops = {
461 .go_xmote_th = gfs2_glock_xmote_th,
462 .go_drop_th = gfs2_glock_drop_th,
463 .go_callback = gfs2_iopen_go_callback,
464 .go_type = LM_TYPE_IOPEN
465};
466
467struct gfs2_glock_operations gfs2_flock_glops = {
468 .go_xmote_th = gfs2_glock_xmote_th,
469 .go_drop_th = gfs2_glock_drop_th,
470 .go_type = LM_TYPE_FLOCK
471};
472
473struct gfs2_glock_operations gfs2_nondisk_glops = {
474 .go_xmote_th = gfs2_glock_xmote_th,
475 .go_drop_th = gfs2_glock_drop_th,
476 .go_type = LM_TYPE_NONDISK
477};
478
479struct gfs2_glock_operations gfs2_quota_glops = {
480 .go_xmote_th = gfs2_glock_xmote_th,
481 .go_drop_th = gfs2_glock_drop_th,
482 .go_demote_ok = quota_go_demote_ok,
483 .go_type = LM_TYPE_QUOTA
484};
485
486struct gfs2_glock_operations gfs2_journal_glops = {
487 .go_xmote_th = gfs2_glock_xmote_th,
488 .go_drop_th = gfs2_glock_drop_th,
489 .go_type = LM_TYPE_JOURNAL
490};
491
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..5c1e9491024f
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13extern struct gfs2_glock_operations gfs2_meta_glops;
14extern struct gfs2_glock_operations gfs2_inode_glops;
15extern struct gfs2_glock_operations gfs2_rgrp_glops;
16extern struct gfs2_glock_operations gfs2_trans_glops;
17extern struct gfs2_glock_operations gfs2_iopen_glops;
18extern struct gfs2_glock_operations gfs2_flock_glops;
19extern struct gfs2_glock_operations gfs2_nondisk_glops;
20extern struct gfs2_glock_operations gfs2_quota_glops;
21extern struct gfs2_glock_operations gfs2_journal_glops;
22
23#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..92091d006a02
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,687 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#define DIO_FORCE 0x00000001
14#define DIO_CLEAN 0x00000002
15#define DIO_DIRTY 0x00000004
16#define DIO_START 0x00000008
17#define DIO_WAIT 0x00000010
18#define DIO_METADATA 0x00000020
19#define DIO_DATA 0x00000040
20#define DIO_RELEASE 0x00000080
21#define DIO_ALL 0x00000100
22
23struct gfs2_log_operations;
24struct gfs2_log_element;
25struct gfs2_bitmap;
26struct gfs2_rgrpd;
27struct gfs2_bufdata;
28struct gfs2_glock_operations;
29struct gfs2_holder;
30struct gfs2_glock;
31struct gfs2_alloc;
32struct gfs2_inode;
33struct gfs2_file;
34struct gfs2_revoke;
35struct gfs2_revoke_replay;
36struct gfs2_unlinked;
37struct gfs2_quota_data;
38struct gfs2_log_buf;
39struct gfs2_trans;
40struct gfs2_ail;
41struct gfs2_jdesc;
42struct gfs2_args;
43struct gfs2_tune;
44struct gfs2_gl_hash_bucket;
45struct gfs2_sbd;
46
47typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
48
49/*
50 * Structure of operations that are associated with each
51 * type of element in the log.
52 */
53
54struct gfs2_log_operations {
55 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
56 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
57 void (*lo_before_commit) (struct gfs2_sbd *sdp);
58 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
59 void (*lo_before_scan) (struct gfs2_jdesc *jd,
60 struct gfs2_log_header *head, int pass);
61 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
62 struct gfs2_log_descriptor *ld, __be64 *ptr,
63 int pass);
64 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
65 const char *lo_name;
66};
67
68struct gfs2_log_element {
69 struct list_head le_list;
70 const struct gfs2_log_operations *le_ops;
71};
72
73struct gfs2_bitmap {
74 struct buffer_head *bi_bh;
75 char *bi_clone;
76 uint32_t bi_offset;
77 uint32_t bi_start;
78 uint32_t bi_len;
79};
80
81struct gfs2_rgrpd {
82 struct list_head rd_list; /* Link with superblock */
83 struct list_head rd_list_mru;
84 struct list_head rd_recent; /* Recently used rgrps */
85 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
86 struct gfs2_rindex rd_ri;
87 struct gfs2_rgrp rd_rg;
88 uint64_t rd_rg_vn;
89 struct gfs2_bitmap *rd_bits;
90 unsigned int rd_bh_count;
91 struct mutex rd_mutex;
92 uint32_t rd_free_clone;
93 struct gfs2_log_element rd_le;
94 uint32_t rd_last_alloc_data;
95 uint32_t rd_last_alloc_meta;
96 struct gfs2_sbd *rd_sbd;
97};
98
99enum gfs2_state_bits {
100 BH_Pinned = BH_PrivateStart,
101 BH_Escaped = BH_PrivateStart + 1,
102};
103
104BUFFER_FNS(Pinned, pinned)
105TAS_BUFFER_FNS(Pinned, pinned)
106BUFFER_FNS(Escaped, escaped)
107TAS_BUFFER_FNS(Escaped, escaped)
108
109struct gfs2_bufdata {
110 struct buffer_head *bd_bh;
111 struct gfs2_glock *bd_gl;
112
113 struct list_head bd_list_tr;
114 struct gfs2_log_element bd_le;
115
116 struct gfs2_ail *bd_ail;
117 struct list_head bd_ail_st_list;
118 struct list_head bd_ail_gl_list;
119};
120
121struct gfs2_glock_operations {
122 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
123 int flags);
124 void (*go_xmote_bh) (struct gfs2_glock * gl);
125 void (*go_drop_th) (struct gfs2_glock * gl);
126 void (*go_drop_bh) (struct gfs2_glock * gl);
127 void (*go_sync) (struct gfs2_glock * gl, int flags);
128 void (*go_inval) (struct gfs2_glock * gl, int flags);
129 int (*go_demote_ok) (struct gfs2_glock * gl);
130 int (*go_lock) (struct gfs2_holder * gh);
131 void (*go_unlock) (struct gfs2_holder * gh);
132 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
133 void (*go_greedy) (struct gfs2_glock * gl);
134 int go_type;
135};
136
137enum {
138 /* Actions */
139 HIF_MUTEX = 0,
140 HIF_PROMOTE = 1,
141 HIF_DEMOTE = 2,
142 HIF_GREEDY = 3,
143
144 /* States */
145 HIF_ALLOCED = 4,
146 HIF_DEALLOC = 5,
147 HIF_HOLDER = 6,
148 HIF_FIRST = 7,
149 HIF_ABORTED = 9,
150};
151
152struct gfs2_holder {
153 struct list_head gh_list;
154
155 struct gfs2_glock *gh_gl;
156 struct task_struct *gh_owner;
157 unsigned int gh_state;
158 unsigned gh_flags;
159
160 int gh_error;
161 unsigned long gh_iflags;
162 struct completion gh_wait;
163 unsigned long gh_ip;
164};
165
166enum {
167 GLF_PLUG = 0,
168 GLF_LOCK = 1,
169 GLF_STICKY = 2,
170 GLF_PREFETCH = 3,
171 GLF_SYNC = 4,
172 GLF_DIRTY = 5,
173 GLF_SKIP_WAITERS2 = 6,
174 GLF_GREEDY = 7,
175};
176
177struct gfs2_glock {
178 struct list_head gl_list;
179 unsigned long gl_flags; /* GLF_... */
180 struct lm_lockname gl_name;
181 struct kref gl_ref;
182
183 spinlock_t gl_spin;
184
185 unsigned int gl_state;
186 struct task_struct *gl_owner;
187 unsigned long gl_ip;
188 struct list_head gl_holders;
189 struct list_head gl_waiters1; /* HIF_MUTEX */
190 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
191 struct list_head gl_waiters3; /* HIF_PROMOTE */
192
193 struct gfs2_glock_operations *gl_ops;
194
195 struct gfs2_holder *gl_req_gh;
196 gfs2_glop_bh_t gl_req_bh;
197
198 lm_lock_t *gl_lock;
199 char *gl_lvb;
200 atomic_t gl_lvb_count;
201
202 uint64_t gl_vn;
203 unsigned long gl_stamp;
204 void *gl_object;
205
206 struct gfs2_gl_hash_bucket *gl_bucket;
207 struct list_head gl_reclaim;
208
209 struct gfs2_sbd *gl_sbd;
210
211 struct inode *gl_aspace;
212 struct gfs2_log_element gl_le;
213 struct list_head gl_ail_list;
214 atomic_t gl_ail_count;
215};
216
217struct gfs2_alloc {
218 /* Quota stuff */
219
220 unsigned int al_qd_num;
221 struct gfs2_quota_data *al_qd[4];
222 struct gfs2_holder al_qd_ghs[4];
223
224 /* Filled in by the caller to gfs2_inplace_reserve() */
225
226 uint32_t al_requested;
227
228 /* Filled in by gfs2_inplace_reserve() */
229
230 char *al_file;
231 unsigned int al_line;
232 struct gfs2_holder al_ri_gh;
233 struct gfs2_holder al_rgd_gh;
234 struct gfs2_rgrpd *al_rgd;
235
236 /* Filled in by gfs2_alloc_*() */
237
238 uint32_t al_alloced;
239};
240
241enum {
242 GIF_MIN_INIT = 0,
243 GIF_QD_LOCKED = 1,
244 GIF_PAGED = 2,
245 GIF_SW_PAGED = 3,
246};
247
248struct gfs2_inode {
249 struct inode i_inode;
250 struct gfs2_inum i_num;
251
252 atomic_t i_count;
253 unsigned long i_flags; /* GIF_... */
254
255 uint64_t i_vn;
256 struct gfs2_dinode i_di;
257
258 struct gfs2_glock *i_gl;
259 struct gfs2_sbd *i_sbd;
260 struct inode *i_vnode;
261
262 struct gfs2_holder i_iopen_gh;
263 struct gfs2_holder i_gh; /* for prepare/commit_write only */
264 struct gfs2_alloc i_alloc;
265 uint64_t i_last_rg_alloc;
266
267 spinlock_t i_spin;
268 struct rw_semaphore i_rw_mutex;
269
270 unsigned int i_greedy;
271 unsigned long i_last_pfault;
272
273 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
274};
275
276static inline struct gfs2_inode *GFS2_I(struct inode *inode)
277{
278 return container_of(inode, struct gfs2_inode, i_inode);
279}
280
281enum {
282 GFF_DID_DIRECT_ALLOC = 0,
283};
284
285struct gfs2_file {
286 unsigned long f_flags; /* GFF_... */
287 struct mutex f_fl_mutex;
288 struct gfs2_holder f_fl_gh;
289};
290
291struct gfs2_revoke {
292 struct gfs2_log_element rv_le;
293 uint64_t rv_blkno;
294};
295
296struct gfs2_revoke_replay {
297 struct list_head rr_list;
298 uint64_t rr_blkno;
299 unsigned int rr_where;
300};
301
302enum {
303 ULF_LOCKED = 0,
304};
305
306struct gfs2_unlinked {
307 struct list_head ul_list;
308 unsigned int ul_count;
309 struct gfs2_unlinked_tag ul_ut;
310 unsigned long ul_flags; /* ULF_... */
311 unsigned int ul_slot;
312};
313
314enum {
315 QDF_USER = 0,
316 QDF_CHANGE = 1,
317 QDF_LOCKED = 2,
318};
319
320struct gfs2_quota_lvb {
321 uint32_t qb_magic;
322 uint32_t __pad;
323 uint64_t qb_limit; /* Hard limit of # blocks to alloc */
324 uint64_t qb_warn; /* Warn user when alloc is above this # */
325 int64_t qb_value; /* Current # blocks allocated */
326};
327
328struct gfs2_quota_data {
329 struct list_head qd_list;
330 unsigned int qd_count;
331
332 uint32_t qd_id;
333 unsigned long qd_flags; /* QDF_... */
334
335 int64_t qd_change;
336 int64_t qd_change_sync;
337
338 unsigned int qd_slot;
339 unsigned int qd_slot_count;
340
341 struct buffer_head *qd_bh;
342 struct gfs2_quota_change *qd_bh_qc;
343 unsigned int qd_bh_count;
344
345 struct gfs2_glock *qd_gl;
346 struct gfs2_quota_lvb qd_qb;
347
348 uint64_t qd_sync_gen;
349 unsigned long qd_last_warn;
350 unsigned long qd_last_touched;
351};
352
353struct gfs2_log_buf {
354 struct list_head lb_list;
355 struct buffer_head *lb_bh;
356 struct buffer_head *lb_real;
357};
358
359struct gfs2_trans {
360 unsigned long tr_ip;
361
362 unsigned int tr_blocks;
363 unsigned int tr_revokes;
364 unsigned int tr_reserved;
365
366 struct gfs2_holder tr_t_gh;
367
368 int tr_touched;
369
370 unsigned int tr_num_buf;
371 unsigned int tr_num_buf_new;
372 unsigned int tr_num_buf_rm;
373 struct list_head tr_list_buf;
374
375 unsigned int tr_num_revoke;
376 unsigned int tr_num_revoke_rm;
377};
378
379struct gfs2_ail {
380 struct list_head ai_list;
381
382 unsigned int ai_first;
383 struct list_head ai_ail1_list;
384 struct list_head ai_ail2_list;
385
386 uint64_t ai_sync_gen;
387};
388
389struct gfs2_jdesc {
390 struct list_head jd_list;
391
392 struct inode *jd_inode;
393 unsigned int jd_jid;
394 int jd_dirty;
395
396 unsigned int jd_blocks;
397};
398
399#define GFS2_GLOCKD_DEFAULT 1
400#define GFS2_GLOCKD_MAX 16
401
402#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
403#define GFS2_QUOTA_OFF 0
404#define GFS2_QUOTA_ACCOUNT 1
405#define GFS2_QUOTA_ON 2
406
407#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
408#define GFS2_DATA_WRITEBACK 1
409#define GFS2_DATA_ORDERED 2
410
411struct gfs2_args {
412 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
413 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
414 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
415 int ar_spectator; /* Don't get a journal because we're always RO */
416 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
417 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
418 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
419 int ar_debug; /* Oops on errors instead of trying to be graceful */
420 int ar_upgrade; /* Upgrade ondisk/multihost format */
421 unsigned int ar_num_glockd; /* Number of glockd threads */
422 int ar_posix_acl; /* Enable posix acls */
423 int ar_quota; /* off/account/on */
424 int ar_suiddir; /* suiddir support */
425 int ar_data; /* ordered/writeback */
426};
427
428struct gfs2_tune {
429 spinlock_t gt_spin;
430
431 unsigned int gt_ilimit;
432 unsigned int gt_ilimit_tries;
433 unsigned int gt_ilimit_min;
434 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
435 unsigned int gt_incore_log_blocks;
436 unsigned int gt_log_flush_secs;
437 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
438
439 unsigned int gt_scand_secs;
440 unsigned int gt_recoverd_secs;
441 unsigned int gt_logd_secs;
442 unsigned int gt_quotad_secs;
443 unsigned int gt_inoded_secs;
444
445 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
446 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
447 unsigned int gt_quota_scale_num; /* Numerator */
448 unsigned int gt_quota_scale_den; /* Denominator */
449 unsigned int gt_quota_cache_secs;
450 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
451 unsigned int gt_atime_quantum; /* Min secs between atime updates */
452 unsigned int gt_new_files_jdata;
453 unsigned int gt_new_files_directio;
454 unsigned int gt_max_atomic_write; /* Split big writes into this size */
455 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
456 unsigned int gt_lockdump_size;
457	unsigned int gt_stall_secs; /* Secs after which a stalled lock is reported */
458 unsigned int gt_complain_secs;
459 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
460 unsigned int gt_entries_per_readdir;
461 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
462 unsigned int gt_greedy_default;
463 unsigned int gt_greedy_quantum;
464 unsigned int gt_greedy_max;
465 unsigned int gt_statfs_quantum;
466 unsigned int gt_statfs_slow;
467};
468
469struct gfs2_gl_hash_bucket {
470 rwlock_t hb_lock;
471 struct list_head hb_list;
472};
473
474enum {
475 SDF_JOURNAL_CHECKED = 0,
476 SDF_JOURNAL_LIVE = 1,
477 SDF_SHUTDOWN = 2,
478 SDF_NOATIME = 3,
479};
480
481#define GFS2_GL_HASH_SHIFT 13
482#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
483#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
484#define GFS2_FSNAME_LEN 256
485
486struct gfs2_sbd {
487 struct super_block *sd_vfs;
488 struct kobject sd_kobj;
489 unsigned long sd_flags; /* SDF_... */
490 struct gfs2_sb sd_sb;
491
492 /* Constants computed on mount */
493
494 uint32_t sd_fsb2bb;
495 uint32_t sd_fsb2bb_shift;
496 uint32_t sd_diptrs; /* Number of pointers in a dinode */
497	uint32_t sd_inptrs;	/* Number of pointers in an indirect block */
498 uint32_t sd_jbsize; /* Size of a journaled data block */
499 uint32_t sd_hash_bsize; /* sizeof(exhash block) */
500 uint32_t sd_hash_bsize_shift;
501 uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
502 uint32_t sd_ut_per_block;
503 uint32_t sd_qc_per_block;
504 uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */
505 uint32_t sd_max_height; /* Max height of a file's metadata tree */
506 uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT];
507 uint32_t sd_max_jheight; /* Max height of journaled file's meta tree */
508 uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT];
509
510 struct gfs2_args sd_args; /* Mount arguments */
511 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
512
513 /* Lock Stuff */
514
515 struct lm_lockstruct sd_lockstruct;
516 struct gfs2_gl_hash_bucket sd_gl_hash[GFS2_GL_HASH_SIZE];
517 struct list_head sd_reclaim_list;
518 spinlock_t sd_reclaim_lock;
519 wait_queue_head_t sd_reclaim_wq;
520 atomic_t sd_reclaim_count;
521 struct gfs2_holder sd_live_gh;
522 struct gfs2_glock *sd_rename_gl;
523 struct gfs2_glock *sd_trans_gl;
524 struct mutex sd_invalidate_inodes_mutex;
525
526 /* Inode Stuff */
527
528 struct inode *sd_master_dir;
529 struct inode *sd_jindex;
530 struct inode *sd_inum_inode;
531 struct inode *sd_statfs_inode;
532 struct inode *sd_ir_inode;
533 struct inode *sd_sc_inode;
534 struct inode *sd_ut_inode;
535 struct inode *sd_qc_inode;
536 struct inode *sd_rindex;
537 struct inode *sd_quota_inode;
538
539 /* Inum stuff */
540
541 struct mutex sd_inum_mutex;
542
543 /* StatFS stuff */
544
545 spinlock_t sd_statfs_spin;
546 struct mutex sd_statfs_mutex;
547 struct gfs2_statfs_change sd_statfs_master;
548 struct gfs2_statfs_change sd_statfs_local;
549 unsigned long sd_statfs_sync_time;
550
551 /* Resource group stuff */
552
553 uint64_t sd_rindex_vn;
554 spinlock_t sd_rindex_spin;
555 struct mutex sd_rindex_mutex;
556 struct list_head sd_rindex_list;
557 struct list_head sd_rindex_mru_list;
558 struct list_head sd_rindex_recent_list;
559 struct gfs2_rgrpd *sd_rindex_forward;
560 unsigned int sd_rgrps;
561
562 /* Journal index stuff */
563
564 struct list_head sd_jindex_list;
565 spinlock_t sd_jindex_spin;
566 struct mutex sd_jindex_mutex;
567 unsigned int sd_journals;
568 unsigned long sd_jindex_refresh_time;
569
570 struct gfs2_jdesc *sd_jdesc;
571 struct gfs2_holder sd_journal_gh;
572 struct gfs2_holder sd_jinode_gh;
573
574 struct gfs2_holder sd_ir_gh;
575 struct gfs2_holder sd_sc_gh;
576 struct gfs2_holder sd_ut_gh;
577 struct gfs2_holder sd_qc_gh;
578
579 /* Daemon stuff */
580
581 struct task_struct *sd_scand_process;
582 struct task_struct *sd_recoverd_process;
583 struct task_struct *sd_logd_process;
584 struct task_struct *sd_quotad_process;
585 struct task_struct *sd_inoded_process;
586 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
587 unsigned int sd_glockd_num;
588
589 /* Unlinked inode stuff */
590
591 struct list_head sd_unlinked_list;
592 atomic_t sd_unlinked_count;
593 spinlock_t sd_unlinked_spin;
594 struct mutex sd_unlinked_mutex;
595
596 unsigned int sd_unlinked_slots;
597 unsigned int sd_unlinked_chunks;
598 unsigned char **sd_unlinked_bitmap;
599
600 /* Quota stuff */
601
602 struct list_head sd_quota_list;
603 atomic_t sd_quota_count;
604 spinlock_t sd_quota_spin;
605 struct mutex sd_quota_mutex;
606
607 unsigned int sd_quota_slots;
608 unsigned int sd_quota_chunks;
609 unsigned char **sd_quota_bitmap;
610
611 uint64_t sd_quota_sync_gen;
612 unsigned long sd_quota_sync_time;
613
614 /* Log stuff */
615
616 spinlock_t sd_log_lock;
617
618 unsigned int sd_log_blks_reserved;
619 unsigned int sd_log_commited_buf;
620 unsigned int sd_log_commited_revoke;
621
622 unsigned int sd_log_num_gl;
623 unsigned int sd_log_num_buf;
624 unsigned int sd_log_num_revoke;
625 unsigned int sd_log_num_rg;
626 unsigned int sd_log_num_databuf;
627 unsigned int sd_log_num_jdata;
628 unsigned int sd_log_num_hdrs;
629
630 struct list_head sd_log_le_gl;
631 struct list_head sd_log_le_buf;
632 struct list_head sd_log_le_revoke;
633 struct list_head sd_log_le_rg;
634 struct list_head sd_log_le_databuf;
635
636 unsigned int sd_log_blks_free;
637 struct mutex sd_log_reserve_mutex;
638
639 uint64_t sd_log_sequence;
640 unsigned int sd_log_head;
641 unsigned int sd_log_tail;
642 int sd_log_idle;
643
644 unsigned long sd_log_flush_time;
645 struct rw_semaphore sd_log_flush_lock;
646 struct list_head sd_log_flush_list;
647
648 unsigned int sd_log_flush_head;
649 uint64_t sd_log_flush_wrapped;
650
651 struct list_head sd_ail1_list;
652 struct list_head sd_ail2_list;
653 uint64_t sd_ail_sync_gen;
654
655 /* Replay stuff */
656
657 struct list_head sd_revoke_list;
658 unsigned int sd_replay_tail;
659
660 unsigned int sd_found_blocks;
661 unsigned int sd_found_revokes;
662 unsigned int sd_replayed_blocks;
663
664 /* For quiescing the filesystem */
665
666 struct gfs2_holder sd_freeze_gh;
667 struct mutex sd_freeze_lock;
668 unsigned int sd_freeze_count;
669
670 /* Counters */
671
672 atomic_t sd_glock_count;
673 atomic_t sd_glock_held_count;
674 atomic_t sd_inode_count;
675 atomic_t sd_reclaimed;
676
677 char sd_fsname[GFS2_FSNAME_LEN];
678 char sd_table_name[GFS2_FSNAME_LEN];
679 char sd_proto_name[GFS2_FSNAME_LEN];
680
681 /* Debugging crud */
682
683 unsigned long sd_last_warning;
684};
685
686#endif /* __INCORE_DOT_H__ */
687
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..c2c7d2b63a57
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1820 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "acl.h"
24#include "bmap.h"
25#include "dir.h"
26#include "eattr.h"
27#include "glock.h"
28#include "glops.h"
29#include "inode.h"
30#include "log.h"
31#include "meta_io.h"
32#include "ops_address.h"
33#include "ops_file.h"
34#include "ops_inode.h"
35#include "quota.h"
36#include "rgrp.h"
37#include "trans.h"
38#include "unlinked.h"
39#include "util.h"
40
41/**
42 * inode_attr_in - Copy attributes from the dinode into the VFS inode
43 * @ip: The GFS2 inode (with embedded disk inode data)
44 * @inode: The Linux VFS inode
45 *
46 */
47
48static void inode_attr_in(struct gfs2_inode *ip, struct inode *inode)
49{
50 inode->i_ino = ip->i_num.no_formal_ino;
51
52 switch (ip->i_di.di_mode & S_IFMT) {
53 case S_IFBLK:
54 case S_IFCHR:
55 inode->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
56 break;
57 default:
58 inode->i_rdev = 0;
59 break;
60	}
61
62 inode->i_mode = ip->i_di.di_mode;
63 inode->i_nlink = ip->i_di.di_nlink;
64 inode->i_uid = ip->i_di.di_uid;
65 inode->i_gid = ip->i_di.di_gid;
66 i_size_write(inode, ip->i_di.di_size);
67 inode->i_atime.tv_sec = ip->i_di.di_atime;
68 inode->i_mtime.tv_sec = ip->i_di.di_mtime;
69 inode->i_ctime.tv_sec = ip->i_di.di_ctime;
70 inode->i_atime.tv_nsec = 0;
71 inode->i_mtime.tv_nsec = 0;
72 inode->i_ctime.tv_nsec = 0;
73 inode->i_blksize = PAGE_SIZE;
74 inode->i_blocks = ip->i_di.di_blocks <<
75 (ip->i_sbd->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
76
77 if (ip->i_di.di_flags & GFS2_DIF_IMMUTABLE)
78 inode->i_flags |= S_IMMUTABLE;
79 else
80 inode->i_flags &= ~S_IMMUTABLE;
81
82 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY)
83 inode->i_flags |= S_APPEND;
84 else
85 inode->i_flags &= ~S_APPEND;
86}
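
The i_blocks assignment above is easy to misread: di_blocks counts filesystem blocks, while the VFS expects 512-byte sectors, so the shift converts between the two. A minimal userspace sketch of the same arithmetic, assuming a 4 KiB block size (the sb_bsize_shift value of 12 is an assumption for illustration, not taken from this patch):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const unsigned int sb_bsize_shift = 12;	/* assumed 4096-byte fs blocks */
		const unsigned int basic_shift = 9;	/* GFS2_BASIC_BLOCK_SHIFT: 512-byte sectors */
		uint64_t di_blocks = 10;		/* blocks owned by the dinode */

		/* Same shift as inode_attr_in(): each fs block is 2^(12-9) = 8 sectors */
		uint64_t i_blocks = di_blocks << (sb_bsize_shift - basic_shift);
		printf("%llu fs blocks -> %llu 512-byte sectors\n",
		       (unsigned long long)di_blocks, (unsigned long long)i_blocks);
		return 0;
	}
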
87
88/**
89 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
90 * @ip: The GFS2 inode (with embedded disk inode data)
91 *
92 */
93
94void gfs2_inode_attr_in(struct gfs2_inode *ip)
95{
96 struct inode *inode;
97
98 inode = gfs2_ip2v_lookup(ip);
99 if (inode) {
100 inode_attr_in(ip, inode);
101 iput(inode);
102 }
103}
104
105/**
106 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
107 * @ip: The GFS2 inode
108 *
109 * Only copy out the attributes that we want the VFS layer
110 * to be able to modify.
111 */
112
113void gfs2_inode_attr_out(struct gfs2_inode *ip)
114{
115 struct inode *inode = ip->i_vnode;
116
117 gfs2_assert_withdraw(ip->i_sbd,
118 (ip->i_di.di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
119 ip->i_di.di_mode = inode->i_mode;
120 ip->i_di.di_uid = inode->i_uid;
121 ip->i_di.di_gid = inode->i_gid;
122 ip->i_di.di_atime = inode->i_atime.tv_sec;
123 ip->i_di.di_mtime = inode->i_mtime.tv_sec;
124 ip->i_di.di_ctime = inode->i_ctime.tv_sec;
125}
126
127/**
128 * gfs2_ip2v_lookup - Get the struct inode for a struct gfs2_inode
129 * @ip: the struct gfs2_inode to get the struct inode for
130 *
131 * Returns: A VFS inode, or NULL if none
132 */
133
134struct inode *gfs2_ip2v_lookup(struct gfs2_inode *ip)
135{
136 struct inode *inode = NULL;
137
138 gfs2_assert_warn(ip->i_sbd, test_bit(GIF_MIN_INIT, &ip->i_flags));
139
140 spin_lock(&ip->i_spin);
141 if (ip->i_vnode)
142 inode = igrab(ip->i_vnode);
143 spin_unlock(&ip->i_spin);
144
145 return inode;
146}
147
148/**
149 * gfs2_ip2v - Get/Create a struct inode for a struct gfs2_inode
150 * @ip: the struct gfs2_inode to get the struct inode for
151 *
152 * Returns: A VFS inode, or NULL on allocation failure
153 */
154
155struct inode *gfs2_ip2v(struct gfs2_inode *ip)
156{
157 struct inode *inode, *tmp;
158
159 inode = gfs2_ip2v_lookup(ip);
160 if (inode)
161 return inode;
162
163 tmp = new_inode(ip->i_sbd->sd_vfs);
164 if (!tmp)
165 return NULL;
166
167 inode_attr_in(ip, tmp);
168
169 if (S_ISREG(ip->i_di.di_mode)) {
170 tmp->i_op = &gfs2_file_iops;
171 tmp->i_fop = &gfs2_file_fops;
172 tmp->i_mapping->a_ops = &gfs2_file_aops;
173 } else if (S_ISDIR(ip->i_di.di_mode)) {
174 tmp->i_op = &gfs2_dir_iops;
175 tmp->i_fop = &gfs2_dir_fops;
176 } else if (S_ISLNK(ip->i_di.di_mode)) {
177 tmp->i_op = &gfs2_symlink_iops;
178 } else {
179 tmp->i_op = &gfs2_dev_iops;
180 init_special_inode(tmp, tmp->i_mode, tmp->i_rdev);
181 }
182
183 tmp->u.generic_ip = NULL;
184
185 for (;;) {
186 spin_lock(&ip->i_spin);
187 if (!ip->i_vnode)
188 break;
189 inode = igrab(ip->i_vnode);
190 spin_unlock(&ip->i_spin);
191
192 if (inode) {
193 iput(tmp);
194 return inode;
195 }
196 yield();
197 }
198
199 inode = tmp;
200
201 gfs2_inode_hold(ip);
202 ip->i_vnode = inode;
203 inode->u.generic_ip = ip;
204
205 spin_unlock(&ip->i_spin);
206
207 insert_inode_hash(inode);
208
209 return inode;
210}
211
212static int iget_test(struct inode *inode, void *opaque)
213{
214 struct gfs2_inode *ip = inode->u.generic_ip;
215 struct gfs2_inum *inum = (struct gfs2_inum *)opaque;
216
217 if (ip && ip->i_num.no_addr == inum->no_addr)
218 return 1;
219
220 return 0;
221}
222
223struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
224{
225 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
226 iget_test, inum);
227}
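
gfs2_iget() keys the inode hash on no_formal_ino but lets iget_test() disambiguate hash collisions by comparing no_addr. A userspace analogue of that hash-plus-test pattern (the types and the linear table are illustrative only; ilookup5() itself walks the kernel's inode hash):

	#include <stdint.h>
	#include <stddef.h>

	struct inum { uint64_t no_formal_ino, no_addr; };
	struct node { unsigned long hashval; struct inum num; };

	/* Match on the cheap hash key first, then run the caller's test
	 * to rule out collisions, as ilookup5() does with iget_test(). */
	static struct node *lookup5(struct node *tbl, size_t n, unsigned long hashval,
				    int (*test)(struct node *, void *), void *opaque)
	{
		for (size_t i = 0; i < n; i++)
			if (tbl[i].hashval == hashval && test(&tbl[i], opaque))
				return &tbl[i];
		return NULL;
	}

	static int test_addr(struct node *node, void *opaque)
	{
		struct inum *inum = opaque;
		return node->num.no_addr == inum->no_addr;
	}
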
228
229void gfs2_inode_min_init(struct gfs2_inode *ip, unsigned int type)
230{
231 if (!test_and_set_bit(GIF_MIN_INIT, &ip->i_flags)) {
232 ip->i_di.di_nlink = 1;
233 ip->i_di.di_mode = DT2IF(type);
234 }
235}
236
237/**
238 * gfs2_inode_refresh - Refresh the incore copy of the dinode
239 * @ip: The GFS2 inode
240 *
241 * Returns: errno
242 */
243
244int gfs2_inode_refresh(struct gfs2_inode *ip)
245{
246 struct buffer_head *dibh;
247 int error;
248
249 error = gfs2_meta_inode_buffer(ip, &dibh);
250 if (error)
251 return error;
252
253 if (gfs2_metatype_check(ip->i_sbd, dibh, GFS2_METATYPE_DI)) {
254 brelse(dibh);
255 return -EIO;
256 }
257
258 gfs2_dinode_in(&ip->i_di, dibh->b_data);
259 set_bit(GIF_MIN_INIT, &ip->i_flags);
260
261 brelse(dibh);
262
263 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
264 if (gfs2_consist_inode(ip))
265 gfs2_dinode_print(&ip->i_di);
266 return -EIO;
267 }
268 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
269 return -ESTALE;
270
271 ip->i_vn = ip->i_gl->gl_vn;
272
273 return 0;
274}
275
276/**
277 * inode_create - create a struct gfs2_inode
278 * @i_gl: The glock covering the inode
279 * @inum: The inode number
280 * @io_gl: the iopen glock to acquire/hold (using holder in new gfs2_inode)
281 * @io_state: the state the iopen glock should be acquired in
282 * @ipp: pointer to put the returned inode in
 * @need_lock: if set, acquire and hold the iopen glock
283 *
284 * Returns: errno
285 */
286
287static int inode_create(struct gfs2_glock *i_gl, const struct gfs2_inum *inum,
288 struct gfs2_glock *io_gl, unsigned int io_state,
289 struct gfs2_inode **ipp, int need_lock)
290{
291 struct gfs2_sbd *sdp = i_gl->gl_sbd;
292 struct gfs2_inode *ip;
293 int error = 0;
294
295 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
296 if (!ip)
297 return -ENOMEM;
298 memset(ip, 0, sizeof(struct gfs2_inode));
299 ip->i_num = *inum;
300 atomic_set(&ip->i_count, 1);
301 ip->i_vn = i_gl->gl_vn - 1;
302 ip->i_gl = i_gl;
303 ip->i_sbd = sdp;
304 spin_lock_init(&ip->i_spin);
305 init_rwsem(&ip->i_rw_mutex);
306 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
307
308 if (need_lock) {
309 error = gfs2_glock_nq_init(io_gl,
310 io_state, GL_LOCAL_EXCL | GL_EXACT,
311 &ip->i_iopen_gh);
312 if (error)
313 goto fail;
314
315 spin_lock(&io_gl->gl_spin);
316 gfs2_glock_hold(i_gl);
317 io_gl->gl_object = i_gl;
318 spin_unlock(&io_gl->gl_spin);
319 }
320
321 gfs2_glock_hold(i_gl);
322 i_gl->gl_object = ip;
323 atomic_inc(&sdp->sd_inode_count);
324 *ipp = ip;
325 return 0;
326
327fail:
328 gfs2_meta_cache_flush(ip);
329 kmem_cache_free(gfs2_inode_cachep, ip);
330 *ipp = NULL;
331 return error;
332}
333
334/**
335 * gfs2_inode_get - Create or get a reference on an inode
336 * @i_gl: The glock covering the inode
337 * @inum: The inode number
338 * @create: if set, create a new gfs2_inode when one isn't already attached
339 * @ipp: pointer to put the returned inode in
340 *
341 * Returns: errno
342 */
343
344int gfs2_inode_get(struct gfs2_glock *i_gl, const struct gfs2_inum *inum,
345 int create, struct gfs2_inode **ipp)
346{
347 struct gfs2_sbd *sdp = i_gl->gl_sbd;
348 struct gfs2_glock *io_gl;
349 int error = 0;
350
351 gfs2_glmutex_lock(i_gl);
352
353 *ipp = i_gl->gl_object;
354 if (*ipp) {
355 error = -ESTALE;
356 if ((*ipp)->i_num.no_formal_ino != inum->no_formal_ino)
357 goto out;
358 atomic_inc(&(*ipp)->i_count);
359 error = 0;
360 goto out;
361 }
362
363 if (!create)
364 goto out;
365
366 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops,
367 CREATE, &io_gl);
368 if (!error) {
369 error = inode_create(i_gl, inum, io_gl, LM_ST_SHARED, ipp, 1);
370 gfs2_glock_put(io_gl);
371 }
372
373 out:
374 gfs2_glmutex_unlock(i_gl);
375
376 return error;
377}
378
379void gfs2_inode_hold(struct gfs2_inode *ip)
380{
381 gfs2_assert(ip->i_sbd, atomic_read(&ip->i_count) > 0);
382 atomic_inc(&ip->i_count);
383}
384
385void gfs2_inode_put(struct gfs2_inode *ip)
386{
387 gfs2_assert(ip->i_sbd, atomic_read(&ip->i_count) > 0);
388 atomic_dec(&ip->i_count);
389}
390
391void gfs2_inode_destroy(struct gfs2_inode *ip, int unlock)
392{
393 struct gfs2_sbd *sdp = ip->i_sbd;
394 struct gfs2_glock *i_gl = ip->i_gl;
395
396 gfs2_assert_warn(sdp, !atomic_read(&ip->i_count));
397 if (unlock) {
398 struct gfs2_glock *io_gl = ip->i_iopen_gh.gh_gl;
399 gfs2_assert(sdp, io_gl->gl_object == i_gl);
400
401 spin_lock(&io_gl->gl_spin);
402 io_gl->gl_object = NULL;
403 spin_unlock(&io_gl->gl_spin);
404 gfs2_glock_put(i_gl);
405
406 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
407 }
408
409 gfs2_meta_cache_flush(ip);
410 kmem_cache_free(gfs2_inode_cachep, ip);
411
412 i_gl->gl_object = NULL;
413 gfs2_glock_put(i_gl);
414
415 atomic_dec(&sdp->sd_inode_count);
416}
417
418static int dinode_dealloc(struct gfs2_inode *ip, struct gfs2_unlinked *ul)
419{
420 struct gfs2_sbd *sdp = ip->i_sbd;
421 struct gfs2_alloc *al;
422 struct gfs2_rgrpd *rgd;
423 int error;
424
425 if (ip->i_di.di_blocks != 1) {
426 if (gfs2_consist_inode(ip))
427 gfs2_dinode_print(&ip->i_di);
428 return -EIO;
429 }
430
431 al = gfs2_alloc_get(ip);
432
433 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
434 if (error)
435 goto out;
436
437 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
438 if (error)
439 goto out_qs;
440
441 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
442 if (!rgd) {
443 gfs2_consist_inode(ip);
444 error = -EIO;
445 goto out_rindex_relse;
446 }
447
448 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
449 &al->al_rgd_gh);
450 if (error)
451 goto out_rindex_relse;
452
453 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_UNLINKED +
454 RES_STATFS + RES_QUOTA, 1);
455 if (error)
456 goto out_rg_gunlock;
457
458 gfs2_trans_add_gl(ip->i_gl);
459
460 gfs2_free_di(rgd, ip);
461
462 error = gfs2_unlinked_ondisk_rm(sdp, ul);
463
464 gfs2_trans_end(sdp);
465 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
466
467 out_rg_gunlock:
468 gfs2_glock_dq_uninit(&al->al_rgd_gh);
469
470 out_rindex_relse:
471 gfs2_glock_dq_uninit(&al->al_ri_gh);
472
473 out_qs:
474 gfs2_quota_unhold(ip);
475
476 out:
477 gfs2_alloc_put(ip);
478
479 return error;
480}
481
482/**
483 * inode_dealloc - Deallocate all on-disk blocks for an inode (dinode)
484 * @sdp: the filesystem
485 * @inum: the inode number to deallocate
486 * @io_gh: a holder for the iopen glock for this inode
487 *
488 * N.B. When we enter this we already hold the iopen glock, and getting
489 * the glock for the inode means that we are grabbing the locks in the
490 * "wrong" order, so we may only do a try-lock operation and fail if we
491 * don't get the lock. That's OK, since a failure means someone else is
492 * still using the inode and thus we shouldn't be deallocating it
493 * anyway.
494 *
495 * Returns: errno
496 */
497
498static int inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul,
499 struct gfs2_holder *io_gh)
500{
501 struct gfs2_inode *ip;
502 struct gfs2_holder i_gh;
503 int error;
504
505 error = gfs2_glock_nq_num(sdp, ul->ul_ut.ut_inum.no_addr,
506 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
507 LM_FLAG_TRY_1CB|GL_DUMP, &i_gh);
508 switch(error) {
509 case 0:
510 break;
511 case GLR_TRYFAILED:
512 return 1; /* or back off and relock in different order? */
513 default:
514 return error;
515 }
516
517 gfs2_assert_warn(sdp, !i_gh.gh_gl->gl_object);
518 error = inode_create(i_gh.gh_gl, &ul->ul_ut.ut_inum, io_gh->gh_gl,
519 LM_ST_EXCLUSIVE, &ip, 0);
520
521 if (error)
522 goto out;
523
524 error = gfs2_inode_refresh(ip);
525 if (error)
526 goto out_iput;
527
528 if (ip->i_di.di_nlink) {
529 if (gfs2_consist_inode(ip))
530 gfs2_dinode_print(&ip->i_di);
531 error = -EIO;
532 goto out_iput;
533 }
534
535 if (S_ISDIR(ip->i_di.di_mode) &&
536 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
537 error = gfs2_dir_exhash_dealloc(ip);
538 if (error)
539 goto out_iput;
540 }
541
542 if (ip->i_di.di_eattr) {
543 error = gfs2_ea_dealloc(ip);
544 if (error)
545 goto out_iput;
546 }
547
548 if (!gfs2_is_stuffed(ip)) {
549 error = gfs2_file_dealloc(ip);
550 if (error)
551 goto out_iput;
552 }
553
554 error = dinode_dealloc(ip, ul);
555 if (error)
556 goto out_iput;
557
558out_iput:
559 gfs2_glmutex_lock(i_gh.gh_gl);
560 gfs2_inode_put(ip);
561 gfs2_inode_destroy(ip, 0);
562 gfs2_glmutex_unlock(i_gh.gh_gl);
563
564out:
565 gfs2_glock_dq_uninit(&i_gh);
566
567 return error;
568}
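
The N.B. comment above is the heart of this function: the iopen glock is already held, so taking the inode glock inverts the normal ordering, and the only safe move is a try-lock that treats failure as "inode still in use". The same pattern, reduced to a generic pthreads sketch (names hypothetical):

	#include <pthread.h>

	/* Normal order is A then B. If B is already held and A is needed,
	 * only a trylock on A is safe; failure means back off, not block. */
	static int locked_out_of_order(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		pthread_mutex_lock(b);			/* already held in the real code */
		if (pthread_mutex_trylock(a) != 0) {	/* mirrors LM_FLAG_TRY_1CB */
			/* Someone else holds A: the inode is in use,
			 * so it must not be deallocated anyway. */
			pthread_mutex_unlock(b);
			return 1;			/* "busy", like GLR_TRYFAILED */
		}
		/* ... do the work under both locks ... */
		pthread_mutex_unlock(a);
		pthread_mutex_unlock(b);
		return 0;
	}
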
569
570/**
571 * try_inode_dealloc - Try to deallocate an inode and all its blocks
572 * @sdp: the filesystem
 * @ul: the on-disk unlinked record for the inode to deallocate
573 *
574 * Returns: 0 on success, -errno on error, 1 on busy (inode open)
575 */
576
577static int try_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
578{
579 int error = 0;
580 struct gfs2_holder iogh;
581
582 gfs2_try_toss_inode(sdp, &ul->ul_ut.ut_inum);
583 error = gfs2_glock_nq_num(sdp, ul->ul_ut.ut_inum.no_addr,
584 &gfs2_iopen_glops, LM_ST_EXCLUSIVE,
585 LM_FLAG_TRY_1CB, &iogh);
586 switch (error) {
587 case 0:
588 break;
589 case GLR_TRYFAILED:
590 return 1;
591 default:
592 return error;
593 }
594
595 error = inode_dealloc(sdp, ul, &iogh);
596 gfs2_glock_dq_uninit(&iogh);
597
598 return error;
599}
600
601static int inode_dealloc_uninit(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
602{
603 struct gfs2_rgrpd *rgd;
604 struct gfs2_holder ri_gh, rgd_gh;
605 int error;
606
607 error = gfs2_rindex_hold(sdp, &ri_gh);
608 if (error)
609 return error;
610
611 rgd = gfs2_blk2rgrpd(sdp, ul->ul_ut.ut_inum.no_addr);
612 if (!rgd) {
613 gfs2_consist(sdp);
614 error = -EIO;
615 goto out;
616 }
617
618 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
619 if (error)
620 goto out;
621
622 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_UNLINKED + RES_STATFS, 0);
623 if (error)
624 goto out_gunlock;
625
626 gfs2_free_uninit_di(rgd, ul->ul_ut.ut_inum.no_addr);
627 gfs2_unlinked_ondisk_rm(sdp, ul);
628
629 gfs2_trans_end(sdp);
630
631 out_gunlock:
632 gfs2_glock_dq_uninit(&rgd_gh);
633 out:
634 gfs2_glock_dq_uninit(&ri_gh);
635
636 return error;
637}
638
639int gfs2_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
640{
641 if (ul->ul_ut.ut_flags & GFS2_UTF_UNINIT)
642 return inode_dealloc_uninit(sdp, ul);
643 else
644 return try_inode_dealloc(sdp, ul);
645}
646
647/**
648 * gfs2_change_nlink - Change nlink count on inode
649 * @ip: The GFS2 inode
650 * @diff: The change in the nlink count required
651 *
652 * Returns: errno
653 */
654
655int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
656{
657 struct buffer_head *dibh;
658 uint32_t nlink;
659 int error;
660
661 nlink = ip->i_di.di_nlink + diff;
662
663 /* If we are reducing the nlink count, but the new value ends up being
664 bigger than the old one, we must have underflowed. */
665 if (diff < 0 && nlink > ip->i_di.di_nlink) {
666 if (gfs2_consist_inode(ip))
667 gfs2_dinode_print(&ip->i_di);
668 return -EIO;
669 }
670
671 error = gfs2_meta_inode_buffer(ip, &dibh);
672 if (error)
673 return error;
674
675 ip->i_di.di_nlink = nlink;
676 ip->i_di.di_ctime = get_seconds();
677
678 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
679 gfs2_dinode_out(&ip->i_di, dibh->b_data);
680 brelse(dibh);
681
682 return 0;
683}
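
The underflow test above leans on unsigned wraparound: di_nlink is a uint32_t, so decrementing it past zero produces a huge value, and "diff < 0 && nlink > old" catches exactly that. A standalone demonstration of the check:

	#include <stdint.h>
	#include <assert.h>

	static int change_nlink_checked(uint32_t *di_nlink, int diff)
	{
		uint32_t nlink = *di_nlink + diff;

		/* Decrementing, but the result grew: we wrapped below zero */
		if (diff < 0 && nlink > *di_nlink)
			return -1;	/* gfs2_change_nlink() returns -EIO here */
		*di_nlink = nlink;
		return 0;
	}

	int main(void)
	{
		uint32_t n = 0;
		assert(change_nlink_checked(&n, -1) == -1);	/* underflow caught */
		n = 2;
		assert(change_nlink_checked(&n, -1) == 0 && n == 1);
		return 0;
	}
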
684
685struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
686{
687 struct qstr qstr;
688 gfs2_str2qstr(&qstr, name);
689 return gfs2_lookupi(dip, &qstr, 1, NULL);
690}
691
693/**
694 * gfs2_lookupi - Look up a filename in a directory and return its inode
695 * @dir: The directory to search in
696 * @name: The name of the inode to look for
697 * @is_root: If 1, ignore the caller's permissions
698 * @nd: The VFS nameidata, passed through (may be NULL)
699 *
700 * There will always be a vnode (Linux VFS inode) for @dir unless
701 * @is_root is true.
702 *
703 * Returns: the inode if found, NULL if it doesn't exist, or an ERR_PTR on error
704 */
705
706struct inode *gfs2_lookupi(struct inode *dir, struct qstr *name, int is_root,
707 struct nameidata *nd)
709{
710 struct super_block *sb = dir->i_sb;
711 struct gfs2_inode *ipp;
712 struct gfs2_inode *dip = dir->u.generic_ip;
713 struct gfs2_sbd *sdp = dip->i_sbd;
714 struct gfs2_holder d_gh;
715 struct gfs2_inum inum;
716 unsigned int type;
717 struct gfs2_glock *gl;
718 int error = 0;
719 struct inode *inode = NULL;
720
721 if (!name->len || name->len > GFS2_FNAMESIZE)
722 return ERR_PTR(-ENAMETOOLONG);
723
724 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
725 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
726 dir == sb->s_root->d_inode)) {
727 igrab(dir);
728 return dir;
729 }
730
731 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
732 if (error)
733 return ERR_PTR(error);
734
735 if (!is_root) {
736 error = gfs2_repermission(dir, MAY_EXEC, NULL);
737 if (error)
738 goto out;
739 }
740
741 error = gfs2_dir_search(dir, name, &inum, &type);
742 if (error)
743 goto out;
744
745 error = gfs2_glock_get(sdp, inum.no_addr, &gfs2_inode_glops,
746 CREATE, &gl);
747 if (error)
748 goto out;
749
750 error = gfs2_inode_get(gl, &inum, CREATE, &ipp);
751 if (!error)
752 gfs2_inode_min_init(ipp, type);
753
754 gfs2_glock_put(gl);
755
756out:
757 gfs2_glock_dq_uninit(&d_gh);
758 if (error == -ENOENT)
759 return NULL;
760 if (error == 0) {
761 inode = gfs2_ip2v(ipp);
762 gfs2_inode_put(ipp);
763 if (!inode)
764 return ERR_PTR(-ENOMEM);
765 return inode;
766 }
767 return ERR_PTR(error);
768}
769
770static int pick_formal_ino_1(struct gfs2_sbd *sdp, uint64_t *formal_ino)
771{
772 struct gfs2_inode *ip = sdp->sd_ir_inode->u.generic_ip;
773 struct buffer_head *bh;
774 struct gfs2_inum_range ir;
775 int error;
776
777 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
778 if (error)
779 return error;
780 mutex_lock(&sdp->sd_inum_mutex);
781
782 error = gfs2_meta_inode_buffer(ip, &bh);
783 if (error) {
784 mutex_unlock(&sdp->sd_inum_mutex);
785 gfs2_trans_end(sdp);
786 return error;
787 }
788
789 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
790
791 if (ir.ir_length) {
792 *formal_ino = ir.ir_start++;
793 ir.ir_length--;
794 gfs2_trans_add_bh(ip->i_gl, bh, 1);
795 gfs2_inum_range_out(&ir,
796 bh->b_data + sizeof(struct gfs2_dinode));
797 brelse(bh);
798 mutex_unlock(&sdp->sd_inum_mutex);
799 gfs2_trans_end(sdp);
800 return 0;
801 }
802
803 brelse(bh);
804
805 mutex_unlock(&sdp->sd_inum_mutex);
806 gfs2_trans_end(sdp);
807
808 return 1;
809}
810
811static int pick_formal_ino_2(struct gfs2_sbd *sdp, uint64_t *formal_ino)
812{
813 struct gfs2_inode *ip = sdp->sd_ir_inode->u.generic_ip;
814 struct gfs2_inode *m_ip = sdp->sd_inum_inode->u.generic_ip;
815 struct gfs2_holder gh;
816 struct buffer_head *bh;
817 struct gfs2_inum_range ir;
818 int error;
819
820 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
821 if (error)
822 return error;
823
824 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
825 if (error)
826 goto out;
827 mutex_lock(&sdp->sd_inum_mutex);
828
829 error = gfs2_meta_inode_buffer(ip, &bh);
830 if (error)
831 goto out_end_trans;
832
833 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
834
835 if (!ir.ir_length) {
836 struct buffer_head *m_bh;
837 uint64_t x, y;
838
839 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
840 if (error)
841 goto out_brelse;
842
843 x = *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode));
844 x = y = be64_to_cpu(x);
845 ir.ir_start = x;
846 ir.ir_length = GFS2_INUM_QUANTUM;
847 x += GFS2_INUM_QUANTUM;
848 if (x < y)
849 gfs2_consist_inode(m_ip);
850 x = cpu_to_be64(x);
851 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
852 *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
853
854 brelse(m_bh);
855 }
856
857 *formal_ino = ir.ir_start++;
858 ir.ir_length--;
859
860 gfs2_trans_add_bh(ip->i_gl, bh, 1);
861 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
862
863 out_brelse:
864 brelse(bh);
865
866 out_end_trans:
867 mutex_unlock(&sdp->sd_inum_mutex);
868 gfs2_trans_end(sdp);
869
870 out:
871 gfs2_glock_dq_uninit(&gh);
872
873 return error;
874}
875
876static int pick_formal_ino(struct gfs2_sbd *sdp, uint64_t *inum)
877{
878 int error;
879
880 error = pick_formal_ino_1(sdp, inum);
881 if (error <= 0)
882 return error;
883
884 error = pick_formal_ino_2(sdp, inum);
885
886 return error;
887}
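
pick_formal_ino() is a two-level allocator: pick_formal_ino_1() hands out numbers from a per-node range under only a local transaction, and when that range runs dry, pick_formal_ino_2() refills it with GFS2_INUM_QUANTUM numbers from the cluster-wide counter under the exclusive inum glock. A userspace sketch of the scheme (the quantum value and the locking are elided assumptions):

	#include <stdint.h>

	#define INUM_QUANTUM	65536	/* illustrative refill size */

	struct inum_range { uint64_t start, length; };	/* cheap per-node range */
	static uint64_t global_next;			/* expensive shared counter */

	static uint64_t pick_formal_ino(struct inum_range *ir)
	{
		if (!ir->length) {			/* slow path: refill */
			ir->start = global_next;	/* locked in the real code */
			ir->length = INUM_QUANTUM;
			global_next += INUM_QUANTUM;
		}
		ir->length--;				/* fast path: consume */
		return ir->start++;
	}
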
888
889/**
890 * create_ok - OK to create a new on-disk inode here?
891 * @dip: Directory in which dinode is to be created
892 * @name: Name of new dinode
893 * @mode: The mode of the dinode to be created
894 *
895 * Returns: errno
896 */
897
898static int create_ok(struct gfs2_inode *dip, struct qstr *name,
899 unsigned int mode)
900{
901 int error;
902
903 error = gfs2_repermission(dip->i_vnode, MAY_WRITE | MAY_EXEC, NULL);
904 if (error)
905 return error;
906
907 /* Don't create entries in an unlinked directory */
908 if (!dip->i_di.di_nlink)
909 return -EPERM;
910
911 error = gfs2_dir_search(dip->i_vnode, name, NULL, NULL);
912 switch (error) {
913 case -ENOENT:
914 error = 0;
915 break;
916 case 0:
917 return -EEXIST;
918 default:
919 return error;
920 }
921
922 if (dip->i_di.di_entries == (uint32_t)-1)
923 return -EFBIG;
924 if (S_ISDIR(mode) && dip->i_di.di_nlink == (uint32_t)-1)
925 return -EMLINK;
926
927 return 0;
928}
929
930static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
931 unsigned int *uid, unsigned int *gid)
932{
933 if (dip->i_sbd->sd_args.ar_suiddir &&
934 (dip->i_di.di_mode & S_ISUID) &&
935 dip->i_di.di_uid) {
936 if (S_ISDIR(*mode))
937 *mode |= S_ISUID;
938 else if (dip->i_di.di_uid != current->fsuid)
939 *mode &= ~07111;
940 *uid = dip->i_di.di_uid;
941 } else
942 *uid = current->fsuid;
943
944 if (dip->i_di.di_mode & S_ISGID) {
945 if (S_ISDIR(*mode))
946 *mode |= S_ISGID;
947 *gid = dip->i_di.di_gid;
948 } else
949 *gid = current->fsgid;
950}
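
The ~07111 mask above fires when someone other than the owner creates a non-directory in a setuid directory under the suiddir mount option: it strips the setuid, setgid and sticky bits plus all execute bits while the new file inherits the directory's uid. A worked example of the mode arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned int mode = 04755;	/* setuid, rwxr-xr-x */

		/* Same mask as munge_mode_uid_gid(): drop setuid/setgid/
		 * sticky and every execute bit */
		mode &= ~07111;
		printf("%04o\n", mode);		/* prints 0644 */
		return 0;
	}
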
951
952static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_unlinked *ul)
953{
954 struct gfs2_sbd *sdp = dip->i_sbd;
955 int error;
956
957 gfs2_alloc_get(dip);
958
959 dip->i_alloc.al_requested = RES_DINODE;
960 error = gfs2_inplace_reserve(dip);
961 if (error)
962 goto out;
963
964 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_UNLINKED +
965 RES_STATFS, 0);
966 if (error)
967 goto out_ipreserv;
968
969 ul->ul_ut.ut_inum.no_addr = gfs2_alloc_di(dip);
970
971 ul->ul_ut.ut_flags = GFS2_UTF_UNINIT;
972 error = gfs2_unlinked_ondisk_add(sdp, ul);
973
974 gfs2_trans_end(sdp);
975
976 out_ipreserv:
977 gfs2_inplace_release(dip);
978
979 out:
980 gfs2_alloc_put(dip);
981
982 return error;
983}
984
985/**
986 * init_dinode - Fill in a new dinode structure
987 * @dip: the directory this inode is being created in
988 * @gl: The glock covering the new inode
989 * @inum: the inode number
990 * @mode: the file permissions
991 * @uid: The uid to assign to the new dinode
992 * @gid: The gid to assign to the new dinode
993 *
994 */
995
996static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
997 struct gfs2_inum *inum, unsigned int mode,
998 unsigned int uid, unsigned int gid)
999{
1000 struct gfs2_sbd *sdp = dip->i_sbd;
1001 struct gfs2_dinode *di;
1002 struct buffer_head *dibh;
1003
1004 dibh = gfs2_meta_new(gl, inum->no_addr);
1005 gfs2_trans_add_bh(gl, dibh, 1);
1006 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
1007 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1008 di = (struct gfs2_dinode *)dibh->b_data;
1009
1010 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
1011 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
1012 di->di_mode = cpu_to_be32(mode);
1013 di->di_uid = cpu_to_be32(uid);
1014 di->di_gid = cpu_to_be32(gid);
1015 di->di_nlink = cpu_to_be32(0);
1016 di->di_size = cpu_to_be64(0);
1017 di->di_blocks = cpu_to_be64(1);
1018 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
1019 di->di_major = di->di_minor = cpu_to_be32(0);
1020 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
1021 di->__pad[0] = di->__pad[1] = 0;
1022 di->di_flags = cpu_to_be32(0);
1023
1024 if (S_ISREG(mode)) {
1025 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
1026 gfs2_tune_get(sdp, gt_new_files_jdata))
1027 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
1028 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
1029 gfs2_tune_get(sdp, gt_new_files_directio))
1030 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
1031 } else if (S_ISDIR(mode)) {
1032 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
1033 GFS2_DIF_INHERIT_DIRECTIO);
1034 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
1035 GFS2_DIF_INHERIT_JDATA);
1036 }
1037
1038 di->__pad1 = 0;
1039 di->di_height = cpu_to_be32(0);
1040 di->__pad2 = 0;
1041 di->__pad3 = 0;
1042 di->di_depth = cpu_to_be16(0);
1043 di->di_entries = cpu_to_be32(0);
1044 memset(&di->__pad4, 0, sizeof(di->__pad4));
1045 di->di_eattr = cpu_to_be64(0);
1046 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
1047
1048 brelse(dibh);
1049}
1050
1051static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
1052 unsigned int mode, struct gfs2_unlinked *ul)
1053{
1054 struct gfs2_sbd *sdp = dip->i_sbd;
1055 unsigned int uid, gid;
1056 int error;
1057
1058 munge_mode_uid_gid(dip, &mode, &uid, &gid);
1059 gfs2_alloc_get(dip);
1060
1061 error = gfs2_quota_lock(dip, uid, gid);
1062 if (error)
1063 goto out;
1064
1065 error = gfs2_quota_check(dip, uid, gid);
1066 if (error)
1067 goto out_quota;
1068
1069 error = gfs2_trans_begin(sdp, RES_DINODE + RES_UNLINKED + RES_QUOTA, 0);
1070 if (error)
1071 goto out_quota;
1072
1073 ul->ul_ut.ut_flags = 0;
1074 error = gfs2_unlinked_ondisk_munge(sdp, ul);
1075 init_dinode(dip, gl, &ul->ul_ut.ut_inum, mode, uid, gid);
1076 gfs2_quota_change(dip, +1, uid, gid);
1077 gfs2_trans_end(sdp);
1078
1079 out_quota:
1080 gfs2_quota_unlock(dip);
1081
1082 out:
1083 gfs2_alloc_put(dip);
1084 return error;
1085}
1086
1087static int link_dinode(struct gfs2_inode *dip, struct qstr *name,
1088 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
1089{
1090 struct gfs2_sbd *sdp = dip->i_sbd;
1091 struct gfs2_alloc *al;
1092 int alloc_required;
1093 struct buffer_head *dibh;
1094 int error;
1095
1096 al = gfs2_alloc_get(dip);
1097
1098 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1099 if (error)
1100 goto fail;
1101
1102 error = alloc_required = gfs2_diradd_alloc_required(dip->i_vnode, name);
1103 if (alloc_required < 0)
1104 goto fail;
1105 if (alloc_required) {
1106 error = gfs2_quota_check(dip, dip->i_di.di_uid,
1107 dip->i_di.di_gid);
1108 if (error)
1109 goto fail_quota_locks;
1110
1111 al->al_requested = sdp->sd_max_dirres;
1112
1113 error = gfs2_inplace_reserve(dip);
1114 if (error)
1115 goto fail_quota_locks;
1116
1117 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
1118 al->al_rgd->rd_ri.ri_length +
1119 2 * RES_DINODE + RES_UNLINKED +
1120 RES_STATFS + RES_QUOTA, 0);
1121 if (error)
1122 goto fail_ipreserv;
1123 } else {
1124 error = gfs2_trans_begin(sdp,
1125 RES_LEAF +
1126 2 * RES_DINODE +
1127 RES_UNLINKED, 0);
1128 if (error)
1129 goto fail_quota_locks;
1130 }
1131
1132 error = gfs2_dir_add(dip->i_vnode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
1133 if (error)
1134 goto fail_end_trans;
1135
1136 error = gfs2_meta_inode_buffer(ip, &dibh);
1137 if (error)
1138 goto fail_end_trans;
1139 ip->i_di.di_nlink = 1;
1140 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1141 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1142 brelse(dibh);
1143
1144 error = gfs2_unlinked_ondisk_rm(sdp, ul);
1145 if (error)
1146 goto fail_end_trans;
1147
1148 return 0;
1149
1150fail_end_trans:
1151 gfs2_trans_end(sdp);
1152
1153fail_ipreserv:
1154 if (dip->i_alloc.al_rgd)
1155 gfs2_inplace_release(dip);
1156
1157fail_quota_locks:
1158 gfs2_quota_unlock(dip);
1159
1160fail:
1161 gfs2_alloc_put(dip);
1162 return error;
1163}
1164
1165/**
1166 * gfs2_createi - Create a new inode
1167 * @ghs: An array of two holders
1168 * @name: The name of the new file
1169 * @mode: the permissions on the new inode
1170 *
1171 * @ghs[0] is an initialized holder for the directory
1172 * @ghs[1] is the holder for the inode lock
1173 *
1174 * If the return value is not an error, the glocks on both the directory and
1175 * the new file are held. A transaction has been started and an inplace
1176 * reservation is held, as well.
1177 *
1178 * Returns: the new inode on success, or an ERR_PTR on failure
1179 */
1180
1181struct inode *gfs2_createi(struct gfs2_holder *ghs, struct qstr *name,
1182 unsigned int mode)
1183{
1184 struct inode *inode;
1185 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
1186 struct gfs2_sbd *sdp = dip->i_sbd;
1187 struct gfs2_unlinked *ul;
1188 struct gfs2_inode *ip;
1189 int error;
1190
1191 if (!name->len || name->len > GFS2_FNAMESIZE)
1192 return ERR_PTR(-ENAMETOOLONG);
1193
1194 error = gfs2_unlinked_get(sdp, &ul);
1195 if (error)
1196 return ERR_PTR(error);
1197
1198 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
1199 error = gfs2_glock_nq(ghs);
1200 if (error)
1201 goto fail;
1202
1203 error = create_ok(dip, name, mode);
1204 if (error)
1205 goto fail_gunlock;
1206
1207 error = pick_formal_ino(sdp, &ul->ul_ut.ut_inum.no_formal_ino);
1208 if (error)
1209 goto fail_gunlock;
1210
1211 error = alloc_dinode(dip, ul);
1212 if (error)
1213 goto fail_gunlock;
1214
1215 if (ul->ul_ut.ut_inum.no_addr < dip->i_num.no_addr) {
1216 gfs2_glock_dq(ghs);
1217
1218 error = gfs2_glock_nq_num(sdp, ul->ul_ut.ut_inum.no_addr,
1219 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
1220 GL_SKIP, ghs + 1);
1221 if (error) {
1222 gfs2_unlinked_put(sdp, ul);
1223 return ERR_PTR(error);
1224 }
1225
1226 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
1227 error = gfs2_glock_nq(ghs);
1228 if (error) {
1229 gfs2_glock_dq_uninit(ghs + 1);
1230 gfs2_unlinked_put(sdp, ul);
1231 return ERR_PTR(error);
1232 }
1233
1234 error = create_ok(dip, name, mode);
1235 if (error)
1236 goto fail_gunlock2;
1237 } else {
1238 error = gfs2_glock_nq_num(sdp, ul->ul_ut.ut_inum.no_addr,
1239 &gfs2_inode_glops, LM_ST_EXCLUSIVE,
1240 GL_SKIP, ghs + 1);
1241 if (error)
1242 goto fail_gunlock;
1243 }
1244
1245 error = make_dinode(dip, ghs[1].gh_gl, mode, ul);
1246 if (error)
1247 goto fail_gunlock2;
1248
1249 error = gfs2_inode_get(ghs[1].gh_gl, &ul->ul_ut.ut_inum, CREATE, &ip);
1250 if (error)
1251 goto fail_gunlock2;
1252
1253 error = gfs2_inode_refresh(ip);
1254 if (error)
1255 goto fail_iput;
1256
1257 error = gfs2_acl_create(dip, ip);
1258 if (error)
1259 goto fail_iput;
1260
1261 error = link_dinode(dip, name, ip, ul);
1262 if (error)
1263 goto fail_iput;
1264
1265 gfs2_unlinked_put(sdp, ul);
1266
1267 inode = gfs2_ip2v(ip);
1268 gfs2_inode_put(ip);
1269 if (!inode)
1270 return ERR_PTR(-ENOMEM);
1271 return inode;
1272
1273fail_iput:
1274 gfs2_inode_put(ip);
1275
1276fail_gunlock2:
1277 gfs2_glock_dq_uninit(ghs + 1);
1278
1279fail_gunlock:
1280 gfs2_glock_dq(ghs);
1281
1282fail:
1283 gfs2_unlinked_put(sdp, ul);
1284 return ERR_PTR(error);
1285}
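
As the header comment says, callers pass a two-element holder array and, on success, come back holding glocks on both the directory and the new inode. A hedged sketch of the calling convention using the helpers from this patch (error handling trimmed; the wrapper itself is hypothetical):

	static struct inode *create_example(struct gfs2_inode *dip, struct qstr *name)
	{
		struct gfs2_holder ghs[2];	/* [0] directory, [1] new inode */
		struct inode *inode;

		gfs2_holder_init(dip->i_gl, 0, 0, ghs);
		inode = gfs2_createi(ghs, name, S_IFREG | 0644);
		if (IS_ERR(inode)) {
			gfs2_holder_uninit(ghs);
			return inode;
		}
		/* Both glocks are held here; release them when done */
		gfs2_glock_dq_uninit(ghs);
		gfs2_glock_dq_uninit(ghs + 1);
		return inode;
	}
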
1286
1287/**
1288 * gfs2_unlinki - Unlink a file
1289 * @dip: The inode of the directory
1290 * @name: The name of the file to be unlinked
1291 * @ip: The inode of the file to be removed
 * @ul: the unlinked record logged if the inode's link count drops to zero
1292 *
1293 * Assumes Glocks on both dip and ip are held.
1294 *
1295 * Returns: errno
1296 */
1297
1298int gfs2_unlinki(struct gfs2_inode *dip, struct qstr *name,
1299 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
1300{
1301 struct gfs2_sbd *sdp = dip->i_sbd;
1302 int error;
1303
1304 error = gfs2_dir_del(dip, name);
1305 if (error)
1306 return error;
1307
1308 error = gfs2_change_nlink(ip, -1);
1309 if (error)
1310 return error;
1311
1312 /* If this inode is being unlinked from the directory structure,
1313 we need to mark that in the log so that it isn't lost during
1314 a crash. */
1315
1316 if (!ip->i_di.di_nlink) {
1317 ul->ul_ut.ut_inum = ip->i_num;
1318 error = gfs2_unlinked_ondisk_add(sdp, ul);
1319 if (!error)
1320 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
1321 }
1322
1323 return error;
1324}
1325
1326/**
1327 * gfs2_rmdiri - Remove a directory
1328 * @dip: The parent directory of the directory to be removed
1329 * @name: The name of the directory to be removed
1330 * @ip: The GFS2 inode of the directory to be removed
 * @ul: the unlinked record logged for the now-unlinked directory
1331 *
1332 * Assumes Glocks on dip and ip are held
1333 *
1334 * Returns: errno
1335 */
1336
1337int gfs2_rmdiri(struct gfs2_inode *dip, struct qstr *name,
1338 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
1339{
1340 struct gfs2_sbd *sdp = dip->i_sbd;
1341 struct qstr dotname;
1342 int error;
1343
1344 if (ip->i_di.di_entries != 2) {
1345 if (gfs2_consist_inode(ip))
1346 gfs2_dinode_print(&ip->i_di);
1347 return -EIO;
1348 }
1349
1350 error = gfs2_dir_del(dip, name);
1351 if (error)
1352 return error;
1353
1354 error = gfs2_change_nlink(dip, -1);
1355 if (error)
1356 return error;
1357
1358 gfs2_str2qstr(&dotname, ".");
1359 error = gfs2_dir_del(ip, &dotname);
1360 if (error)
1361 return error;
1362
1363 dotname.len = 2;
1364 dotname.name = "..";
1365 dotname.hash = gfs2_disk_hash(dotname.name, dotname.len);
1366 error = gfs2_dir_del(ip, &dotname);
1367 if (error)
1368 return error;
1369
1370 error = gfs2_change_nlink(ip, -2);
1371 if (error)
1372 return error;
1373
1374 /* This inode is being unlinked from the directory structure and
1375 we need to mark that in the log so that it isn't lost during
1376 a crash. */
1377
1378 ul->ul_ut.ut_inum = ip->i_num;
1379 error = gfs2_unlinked_ondisk_add(sdp, ul);
1380 if (!error)
1381 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
1382
1383 return error;
1384}
1385
1386/**
1387 * gfs2_unlink_ok - check to see that an inode is still in a directory
1388 * @dip: the directory
1389 * @name: the name of the file
1390 * @ip: the inode
1391 *
1392 * Assumes that the lock on (at least) @dip is held.
1393 *
1394 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1395 */
1396
1397int gfs2_unlink_ok(struct gfs2_inode *dip, struct qstr *name,
1398 struct gfs2_inode *ip)
1399{
1400 struct gfs2_inum inum;
1401 unsigned int type;
1402 int error;
1403
1404 if (IS_IMMUTABLE(ip->i_vnode) || IS_APPEND(ip->i_vnode))
1405 return -EPERM;
1406
1407 if ((dip->i_di.di_mode & S_ISVTX) &&
1408 dip->i_di.di_uid != current->fsuid &&
1409 ip->i_di.di_uid != current->fsuid &&
1410 !capable(CAP_FOWNER))
1411 return -EPERM;
1412
1413 if (IS_APPEND(dip->i_vnode))
1414 return -EPERM;
1415
1416 error = gfs2_repermission(dip->i_vnode, MAY_WRITE | MAY_EXEC, NULL);
1417 if (error)
1418 return error;
1419
1420 error = gfs2_dir_search(dip->i_vnode, name, &inum, &type);
1421 if (error)
1422 return error;
1423
1424 if (!gfs2_inum_equal(&inum, &ip->i_num))
1425 return -ENOENT;
1426
1427 if (IF2DT(ip->i_di.di_mode) != type) {
1428 gfs2_consist_inode(dip);
1429 return -EIO;
1430 }
1431
1432 return 0;
1433}
1434
1435/**
1436 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1437 * @this: move this
1438 * @to: to here
1439 *
1440 * Follow @to back to the root and make sure we don't encounter @this
1441 * Assumes we already hold the rename lock.
1442 *
1443 * Returns: errno
1444 */
1445
1446int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1447{
1448 struct inode *dir = to->i_vnode;
1449 struct super_block *sb = dir->i_sb;
1450 struct inode *tmp;
1451 struct qstr dotdot;
1452 int error = 0;
1453
1454 gfs2_str2qstr(&dotdot, "..");
1455
1456 igrab(dir);
1457
1458 for (;;) {
1459 if (dir == this->i_vnode) {
1460 error = -EINVAL;
1461 break;
1462 }
1463 if (dir == sb->s_root->d_inode) {
1464 error = 0;
1465 break;
1466 }
1467
1468 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1469 if (IS_ERR(tmp)) {
1470 error = PTR_ERR(tmp);
1471 break;
1472 }
1473
1474 iput(dir);
1475 dir = tmp;
1476 }
1477
1478 iput(dir);
1479
1480 return error;
1481}
1482
1483/**
1484 * gfs2_readlinki - return the contents of a symlink
1485 * @ip: the symlink's inode
1486 * @buf: a pointer to the buffer to be filled
1487 * @len: a pointer to the length of @buf
1488 *
1489 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1490 * to be freed by the caller.
1491 *
1492 * Returns: errno
1493 */
1494
1495int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1496{
1497 struct gfs2_holder i_gh;
1498 struct buffer_head *dibh;
1499 unsigned int x;
1500 int error;
1501
1502 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1503 error = gfs2_glock_nq_atime(&i_gh);
1504 if (error) {
1505 gfs2_holder_uninit(&i_gh);
1506 return error;
1507 }
1508
1509 if (!ip->i_di.di_size) {
1510 gfs2_consist_inode(ip);
1511 error = -EIO;
1512 goto out;
1513 }
1514
1515 error = gfs2_meta_inode_buffer(ip, &dibh);
1516 if (error)
1517 goto out;
1518
1519 x = ip->i_di.di_size + 1;
1520 if (x > *len) {
1521 *buf = kmalloc(x, GFP_KERNEL);
1522 if (!*buf) {
1523 error = -ENOMEM;
1524 goto out_brelse;
1525 }
1526 }
1527
1528 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1529 *len = x;
1530
1531 out_brelse:
1532 brelse(dibh);
1533
1534 out:
1535 gfs2_glock_dq_uninit(&i_gh);
1536
1537 return error;
1538}
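
The buffer contract here is easy to get wrong: the caller passes in a buffer and its length, and only when the link is larger does *buf get swapped for a kmalloc()ed replacement that the caller must free. A sketch of a conforming caller (the wrapper is hypothetical; this is the shape the symlink ops need):

	static int readlink_example(struct gfs2_inode *ip)
	{
		char array[64], *buf = array;
		unsigned int len = sizeof(array);
		int error;

		error = gfs2_readlinki(ip, &buf, &len);
		if (error)
			return error;

		/* ... use the first len bytes of buf ... */

		if (buf != array)	/* gfs2_readlinki() allocated a bigger one */
			kfree(buf);
		return 0;
	}
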
1539
1540/**
1541 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1542 * conditionally update the inode's atime
1543 * @gh: the holder to acquire
1544 *
1545 * Tests the atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap,
1546 * and updates it if the difference between the current time and the inode's
1547 * current atime is greater than an interval specified at mount time.
1548 *
1549 * Returns: errno
1550 */
1551
1552int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1553{
1554 struct gfs2_glock *gl = gh->gh_gl;
1555 struct gfs2_sbd *sdp = gl->gl_sbd;
1556 struct gfs2_inode *ip = gl->gl_object;
1557 int64_t curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1558 unsigned int state;
1559 int flags;
1560 int error;
1561
1562 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1563 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1564 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1565 return -EINVAL;
1566
1567 state = gh->gh_state;
1568 flags = gh->gh_flags;
1569
1570 error = gfs2_glock_nq(gh);
1571 if (error)
1572 return error;
1573
1574 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1575 (sdp->sd_vfs->s_flags & MS_RDONLY))
1576 return 0;
1577
1578 curtime = get_seconds();
1579 if (curtime - ip->i_di.di_atime >= quantum) {
1580 gfs2_glock_dq(gh);
1581 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1582 gh);
1583 error = gfs2_glock_nq(gh);
1584 if (error)
1585 return error;
1586
1587 /* Verify that atime hasn't been updated while we were
1588 trying to get exclusive lock. */
1589
1590 curtime = get_seconds();
1591 if (curtime - ip->i_di.di_atime >= quantum) {
1592 struct buffer_head *dibh;
1593
1594 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1595 if (error == -EROFS)
1596 return 0;
1597 if (error)
1598 goto fail;
1599
1600 error = gfs2_meta_inode_buffer(ip, &dibh);
1601 if (error)
1602 goto fail_end_trans;
1603
1604 ip->i_di.di_atime = curtime;
1605
1606 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1607 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1608 brelse(dibh);
1609
1610 gfs2_trans_end(sdp);
1611 }
1612
1613 /* If someone else has asked for the glock,
1614 unlock and let them have it. Then reacquire
1615 in the original state. */
1616 if (gfs2_glock_is_blocking(gl)) {
1617 gfs2_glock_dq(gh);
1618 gfs2_holder_reinit(state, flags, gh);
1619 return gfs2_glock_nq(gh);
1620 }
1621 }
1622
1623 return 0;
1624
1625 fail_end_trans:
1626 gfs2_trans_end(sdp);
1627
1628 fail:
1629 gfs2_glock_dq(gh);
1630
1631 return error;
1632}
1633
1634/**
1635 * glock_compare_atime - Compare two struct gfs2_glock structures for sort
1636 * @arg_a: the first structure
1637 * @arg_b: the second structure
1638 *
1639 * Returns: 1 if A > B
1640 * -1 if A < B
1641 * 0 if A = B
1642 */
1643
1644static int glock_compare_atime(const void *arg_a, const void *arg_b)
1645{
1646 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1647 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1648 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1649 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1650 int ret = 0;
1651
1652 if (a->ln_number > b->ln_number)
1653 ret = 1;
1654 else if (a->ln_number < b->ln_number)
1655 ret = -1;
1656 else {
1657 if (gh_a->gh_state == LM_ST_SHARED &&
1658 gh_b->gh_state == LM_ST_EXCLUSIVE)
1659 ret = 1;
1660 else if (gh_a->gh_state == LM_ST_SHARED &&
1661 (gh_b->gh_flags & GL_ATIME))
1662 ret = 1;
1663 }
1664
1665 return ret;
1666}
1667
1668/**
1669 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1670 * atime update
1671 * @num_gh: the number of structures
1672 * @ghs: an array of struct gfs2_holder structures
1673 *
1674 * Returns: 0 on success (all glocks acquired),
1675 * errno on failure (no glocks acquired)
1676 */
1677
1678int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1679{
1680 struct gfs2_holder **p;
1681 unsigned int x;
1682 int error = 0;
1683
1684 if (!num_gh)
1685 return 0;
1686
1687 if (num_gh == 1) {
1688 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1689 if (ghs->gh_flags & GL_ATIME)
1690 error = gfs2_glock_nq_atime(ghs);
1691 else
1692 error = gfs2_glock_nq(ghs);
1693 return error;
1694 }
1695
1696 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1697 if (!p)
1698 return -ENOMEM;
1699
1700 for (x = 0; x < num_gh; x++)
1701 p[x] = &ghs[x];
1702
1703	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime, NULL);
1704
1705 for (x = 0; x < num_gh; x++) {
1706 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1707
1708 if (p[x]->gh_flags & GL_ATIME)
1709 error = gfs2_glock_nq_atime(p[x]);
1710 else
1711 error = gfs2_glock_nq(p[x]);
1712
1713 if (error) {
1714 while (x--)
1715 gfs2_glock_dq(p[x]);
1716 break;
1717 }
1718 }
1719
1720 kfree(p);
1721
1722 return error;
1723}
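
The sort above is the classic deadlock-avoidance move: impose a global order on the locks (here, by ln_number) and always acquire them in that order, backing everything out on the first failure. A generic pthreads sketch of the same idea (the address-based order stands in for ln_number):

	#include <pthread.h>
	#include <stdint.h>
	#include <stdlib.h>

	static int cmp_lock(const void *a, const void *b)
	{
		const uintptr_t x = (uintptr_t)*(pthread_mutex_t * const *)a;
		const uintptr_t y = (uintptr_t)*(pthread_mutex_t * const *)b;
		return (x > y) - (x < y);	/* global order, like ln_number */
	}

	static void lock_many(pthread_mutex_t **locks, unsigned int n)
	{
		qsort(locks, n, sizeof(*locks), cmp_lock);
		for (unsigned int i = 0; i < n; i++)
			pthread_mutex_lock(locks[i]);	/* always in sorted order */
	}
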
1724
1725/**
1726 * gfs2_try_toss_vnode - See if we can toss a vnode from memory
1727 * @ip: the inode
1728 *
1729 * Prunes the dcache and drops the VFS inode so @ip can go away.
1730 */
1731
1732void gfs2_try_toss_vnode(struct gfs2_inode *ip)
1733{
1734 struct inode *inode;
1735
1736 inode = gfs2_ip2v_lookup(ip);
1737 if (!inode)
1738 return;
1739
1740 d_prune_aliases(inode);
1741
1742 if (S_ISDIR(ip->i_di.di_mode)) {
1743 struct list_head *head = &inode->i_dentry;
1744 struct dentry *d = NULL;
1745
1746 spin_lock(&dcache_lock);
1747 if (list_empty(head))
1748 spin_unlock(&dcache_lock);
1749 else {
1750 d = list_entry(head->next, struct dentry, d_alias);
1751 dget_locked(d);
1752 spin_unlock(&dcache_lock);
1753
1754 if (have_submounts(d))
1755 dput(d);
1756 else {
1757 shrink_dcache_parent(d);
1758 dput(d);
1759 d_prune_aliases(inode);
1760 }
1761 }
1762 }
1763
1764 inode->i_nlink = 0;
1765 iput(inode);
1766}
1767
1769static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1771{
1772 struct buffer_head *dibh;
1773 int error;
1774
1775 error = gfs2_meta_inode_buffer(ip, &dibh);
1776 if (!error) {
1777 error = inode_setattr(ip->i_vnode, attr);
1778 gfs2_assert_warn(ip->i_sbd, !error);
1779 gfs2_inode_attr_out(ip);
1780
1781 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1782 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1783 brelse(dibh);
1784 }
1785 return error;
1786}
1787
1788/**
1789 * gfs2_setattr_simple - change attributes on an inode within a transaction
1790 * @ip: the inode
1791 * @attr: the attributes to set
1792 *
1793 * Called with a reference on the vnode.
1794 *
1795 * Returns: errno
1796 */
1797
1798int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1799{
1800 int error;
1801
1802 if (current->journal_info)
1803 return __gfs2_setattr_simple(ip, attr);
1804
1805 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE, 0);
1806 if (error)
1807 return error;
1808
1809 error = __gfs2_setattr_simple(ip, attr);
1810
1811 gfs2_trans_end(ip->i_sbd);
1812
1813 return error;
1814}
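
The current->journal_info test is how nested callers are handled: a transaction is started (and ended) here only when the caller hasn't already opened one. The idiom reduced to a sketch (run_in_trans() and do_work() are hypothetical; gfs2_trans_begin()/gfs2_trans_end() are from this patch):

	static int run_in_trans(struct gfs2_sbd *sdp, int (*do_work)(void *), void *arg)
	{
		int error;

		if (current->journal_info)	/* already inside a transaction */
			return do_work(arg);

		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
		if (error)
			return error;
		error = do_work(arg);
		gfs2_trans_end(sdp);
		return error;
	}
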
1815
1816int gfs2_repermission(struct inode *inode, int mask, struct nameidata *nd)
1817{
1818 return permission(inode, mask, nd);
1819}
1820
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..5ef21317b2f6
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,72 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_ip2v_lookup(struct gfs2_inode *ip);
31struct inode *gfs2_ip2v(struct gfs2_inode *ip);
32struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum);
33
34void gfs2_inode_min_init(struct gfs2_inode *ip, unsigned int type);
35int gfs2_inode_refresh(struct gfs2_inode *ip);
36
37int gfs2_inode_get(struct gfs2_glock *i_gl,
38 const struct gfs2_inum *inum, int create,
39 struct gfs2_inode **ipp);
40void gfs2_inode_hold(struct gfs2_inode *ip);
41void gfs2_inode_put(struct gfs2_inode *ip);
42void gfs2_inode_destroy(struct gfs2_inode *ip, int unlock);
43
44int gfs2_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
45
46int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
47struct inode *gfs2_lookupi(struct inode *dir, struct qstr *name, int is_root,
48 struct nameidata *nd);
49struct inode *gfs2_createi(struct gfs2_holder *ghs, struct qstr *name,
50 unsigned int mode);
51int gfs2_unlinki(struct gfs2_inode *dip, struct qstr *name,
52 struct gfs2_inode *ip, struct gfs2_unlinked *ul);
53int gfs2_rmdiri(struct gfs2_inode *dip, struct qstr *name,
54 struct gfs2_inode *ip, struct gfs2_unlinked *ul);
55int gfs2_unlink_ok(struct gfs2_inode *dip, struct qstr *name,
56 struct gfs2_inode *ip);
57int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
58int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
59
60int gfs2_glock_nq_atime(struct gfs2_holder *gh);
61int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
62
63void gfs2_try_toss_vnode(struct gfs2_inode *ip);
64
65int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
66
67int gfs2_repermission(struct inode *inode, int mask, struct nameidata *nd);
68
69struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
70
71#endif /* __INODE_DOT_H__ */
72
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..f45c0ffd1c35
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,244 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "lm.h"
23#include "super.h"
24#include "util.h"
25#include "lvb.h"
26
27/**
28 * gfs2_lm_mount - mount a locking protocol
29 * @sdp: the filesystem
30 * @args: mount arguments
31 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
32 *
33 * Returns: errno
34 */
35
36int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
37{
38 char *proto = sdp->sd_proto_name;
39 char *table = sdp->sd_table_name;
40 int flags = 0;
41 int error;
42
43 if (sdp->sd_args.ar_spectator)
44 flags |= LM_MFLAG_SPECTATOR;
45
46 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
47
48 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
49 gfs2_glock_cb, sdp,
50 GFS2_MIN_LVB_SIZE, flags,
51 &sdp->sd_lockstruct, &sdp->sd_kobj);
52 if (error) {
53 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
54 proto, table, sdp->sd_args.ar_hostdata);
55 goto out;
56 }
57
58 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
59 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
60 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
61 GFS2_MIN_LVB_SIZE)) {
62 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
63 error = -EINVAL; goto out;
64 }
65
66 if (sdp->sd_args.ar_spectator)
67 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
68 else
69 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
70 sdp->sd_lockstruct.ls_jid);
71
72 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
73
74 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
75 !sdp->sd_args.ar_ignore_local_fs) {
76 sdp->sd_args.ar_localflocks = 1;
77 sdp->sd_args.ar_localcaching = 1;
78 }
79
80 out:
81 return error;
82}
83
84void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
85{
86 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
87 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
88 sdp->sd_lockstruct.ls_lockspace);
89}
90
91void gfs2_lm_unmount(struct gfs2_sbd *sdp)
92{
93 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
94 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
95}
96
97int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
98{
99 va_list args;
100
101 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
102 return 0;
103
104 va_start(args, fmt);
105 vprintk(fmt, args);
106 va_end(args);
107
108 fs_err(sdp, "about to withdraw from the cluster\n");
109 BUG_ON(sdp->sd_args.ar_debug);
110
111
112 fs_err(sdp, "waiting for outstanding I/O\n");
113
114	/* FIXME: suspend dm device so outstanding bios complete
115	   and all further I/O requests fail */
116
117 fs_err(sdp, "telling LM to withdraw\n");
118 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
119 fs_err(sdp, "withdrawn\n");
120 dump_stack();
121
122 return -1;
123}
124
125int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
126 lm_lock_t **lockp)
127{
128 int error;
129 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
130 error = -EIO;
131 else
132 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
133 sdp->sd_lockstruct.ls_lockspace, name, lockp);
134 return error;
135}
136
137void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock)
138{
139 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
140 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
141}
142
143unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
144 unsigned int cur_state, unsigned int req_state,
145 unsigned int flags)
146{
147 int ret;
148 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
149 ret = 0;
150 else
151 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
152 cur_state,
153 req_state, flags);
154 return ret;
155}
156
157unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
158 unsigned int cur_state)
159{
160 int ret;
161 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
162 ret = 0;
163 else
164 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
165 return ret;
166}
167
168void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock)
169{
170 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
171 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
172}
173
174int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp)
175{
176 int error;
177 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
178 error = -EIO;
179 else
180 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
181 return error;
182}
183
184void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
185{
186 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
187 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
188}
189
190#if 0
191void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
192{
193 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
194 sdp->sd_lockstruct.ls_ops->lm_sync_lvb(lock, lvb);
195}
196#endif /* 0 */
197
198int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
203 error = -EIO;
204 else
205 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
206 sdp->sd_lockstruct.ls_lockspace,
207 name, file, fl);
208 return error;
209}
210
211int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
212 struct file *file, int cmd, struct file_lock *fl)
213{
214 int error;
215 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
216 error = -EIO;
217 else
218 error = sdp->sd_lockstruct.ls_ops->lm_plock(
219 sdp->sd_lockstruct.ls_lockspace,
220 name, file, cmd, fl);
221 return error;
222}
223
224int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
225 struct file *file, struct file_lock *fl)
226{
227 int error;
228 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
229 error = -EIO;
230 else
231 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
232 sdp->sd_lockstruct.ls_lockspace,
233 name, file, fl);
234 return error;
235}
236
237void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
238 unsigned int message)
239{
240 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
241 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
242 sdp->sd_lockstruct.ls_lockspace, jid, message);
243}
244
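
Every gfs2_lm_* wrapper above follows the same shutdown-guard idiom: test SDF_SHUTDOWN once, then either fail fast (-EIO where an error can be returned, a no-op or 0 elsewhere) or dispatch through ls_ops, so that no lock traffic reaches the lock module after a withdraw. The idiom distilled into one sketch, with lm_frob() as a hypothetical ls_ops method used only for illustration:

	/* Sketch of the guard idiom shared by the wrappers above. */
	int gfs2_lm_frob(struct gfs2_sbd *sdp, lm_lock_t *lock)
	{
		if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
			return -EIO;	/* withdrawn: don't call the module */
		return sdp->sd_lockstruct.ls_ops->lm_frob(lock);
	}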
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..e821101d19c0
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
14void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
16int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
17__attribute__ ((format(printf, 2, 3)));
18int gfs2_lm_get_lock(struct gfs2_sbd *sdp,
19 struct lm_lockname *name, lm_lock_t **lockp);
20void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock);
21unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
22 unsigned int cur_state, unsigned int req_state,
23 unsigned int flags);
24unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
25 unsigned int cur_state);
26void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock);
27int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp);
28void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
29int gfs2_lm_plock_get(struct gfs2_sbd *sdp,
30 struct lm_lockname *name,
31 struct file *file, struct file_lock *fl);
32int gfs2_lm_plock(struct gfs2_sbd *sdp,
33 struct lm_lockname *name,
34 struct file *file, int cmd, struct file_lock *fl);
35int gfs2_lm_punlock(struct gfs2_sbd *sdp,
36 struct lm_lockname *name,
37 struct file *file, struct file_lock *fl);
38void gfs2_lm_recovery_done(struct gfs2_sbd *sdp,
39 unsigned int jid, unsigned int message);
40
41#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/lm_interface.h b/fs/gfs2/lm_interface.h
new file mode 100644
index 000000000000..9d34bf3df103
--- /dev/null
+++ b/fs/gfs2/lm_interface.h
@@ -0,0 +1,295 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_INTERFACE_DOT_H__
11#define __LM_INTERFACE_DOT_H__
12
13/*
14 * Opaque handles represent the lock module's lockspace structure, the lock
15 * module's lock structures, and GFS's file system (superblock) structure.
16 */
17
18typedef void lm_lockspace_t;
19typedef void lm_lock_t;
20typedef void lm_fsdata_t;
21
22typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
23 void *data);
24
25/*
26 * lm_mount() flags
27 *
28 * LM_MFLAG_SPECTATOR
29 * GFS is asking to join the filesystem's lockspace, but it doesn't want to
30 * modify the filesystem. The lock module shouldn't assign a journal to the FS
31 * mount. It shouldn't send recovery callbacks to the FS mount. If the node
32 * dies or withdraws, all locks can be wiped immediately.
33 */
34
35#define LM_MFLAG_SPECTATOR 0x00000001
36
37/*
38 * lm_lockstruct flags
39 *
40 * LM_LSFLAG_LOCAL
41 * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS
42 * can make single-node optimizations.
43 */
44
45#define LM_LSFLAG_LOCAL 0x00000001
46
47/*
48 * lm_lockname types
49 */
50
51#define LM_TYPE_RESERVED 0x00
52#define LM_TYPE_NONDISK 0x01
53#define LM_TYPE_INODE 0x02
54#define LM_TYPE_RGRP 0x03
55#define LM_TYPE_META 0x04
56#define LM_TYPE_IOPEN 0x05
57#define LM_TYPE_FLOCK 0x06
58#define LM_TYPE_PLOCK 0x07
59#define LM_TYPE_QUOTA 0x08
60#define LM_TYPE_JOURNAL 0x09
61
62/*
63 * lm_lock() states
64 *
65 * SHARED is compatible with SHARED, not with DEFERRED or EX.
66 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
67 */
68
69#define LM_ST_UNLOCKED 0
70#define LM_ST_EXCLUSIVE 1
71#define LM_ST_DEFERRED 2
72#define LM_ST_SHARED 3
73
74/*
75 * lm_lock() flags
76 *
77 * LM_FLAG_TRY
78 * Don't wait to acquire the lock if it can't be granted immediately.
79 *
80 * LM_FLAG_TRY_1CB
81 * Send one blocking callback if TRY is set and the lock is not granted.
82 *
83 * LM_FLAG_NOEXP
84 * GFS sets this flag on lock requests it makes while doing journal recovery.
85 * These special requests should not be blocked by the recovery the way
86 * ordinary locks would be.
87 *
88 * LM_FLAG_ANY
89 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
90 * also be granted in SHARED. The preferred state is whichever is compatible
91 * with other granted locks, or the specified state if no other locks exist.
92 *
93 * LM_FLAG_PRIORITY
94 * Override fairness considerations. Suppose a lock is held in a shared state
95 * and there is a pending request for the deferred state. A shared lock
96 * request with the priority flag would be allowed to bypass the deferred
97 * request and directly join the other shared lock. A shared lock request
98 * without the priority flag might be forced to wait until the deferred
99 * request had acquired and released the lock.
100 */
101
102#define LM_FLAG_TRY 0x00000001
103#define LM_FLAG_TRY_1CB 0x00000002
104#define LM_FLAG_NOEXP 0x00000004
105#define LM_FLAG_ANY 0x00000008
106#define LM_FLAG_PRIORITY 0x00000010
107
108/*
109 * lm_lock() and lm_async_cb return flags
110 *
111 * LM_OUT_ST_MASK
112 * Masks the lower two bits of lock state in the returned value.
113 *
114 * LM_OUT_CACHEABLE
115 * The lock hasn't been released so GFS can continue to cache data for it.
116 *
117 * LM_OUT_CANCELED
118 * The lock request was canceled.
119 *
120 * LM_OUT_ASYNC
121 * The result of the request will be returned in an LM_CB_ASYNC callback.
122 */
123
124#define LM_OUT_ST_MASK 0x00000003
125#define LM_OUT_CACHEABLE 0x00000004
126#define LM_OUT_CANCELED 0x00000008
127#define LM_OUT_ASYNC 0x00000080
128#define LM_OUT_ERROR 0x00000100
129
130/*
131 * lm_callback_t types
132 *
133 * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S
134 * Blocking callback, a remote node is requesting the given lock in
135 * EXCLUSIVE, DEFERRED, or SHARED.
136 *
137 * LM_CB_NEED_RECOVERY
138 * The given journal needs to be recovered.
139 *
140 * LM_CB_DROPLOCKS
141 * Reduce the number of cached locks.
142 *
143 * LM_CB_ASYNC
144 * The given lock has been granted.
145 */
146
147#define LM_CB_NEED_E 257
148#define LM_CB_NEED_D 258
149#define LM_CB_NEED_S 259
150#define LM_CB_NEED_RECOVERY 260
151#define LM_CB_DROPLOCKS 261
152#define LM_CB_ASYNC 262
153
154/*
155 * lm_recovery_done() messages
156 */
157
158#define LM_RD_GAVEUP 308
159#define LM_RD_SUCCESS 309
160
161
162struct lm_lockname {
163 uint64_t ln_number;
164 unsigned int ln_type;
165};
166
167#define lm_name_equal(name1, name2) \
168 (((name1)->ln_number == (name2)->ln_number) && \
169        ((name1)->ln_type == (name2)->ln_type))
170
171struct lm_async_cb {
172 struct lm_lockname lc_name;
173 int lc_ret;
174};
175
176struct lm_lockstruct;
177
178struct lm_lockops {
179 char lm_proto_name[256];
180
181 /*
182 * Mount/Unmount
183 */
184
185 int (*lm_mount) (char *table_name, char *host_data,
186 lm_callback_t cb, lm_fsdata_t *fsdata,
187 unsigned int min_lvb_size, int flags,
188 struct lm_lockstruct *lockstruct,
189 struct kobject *fskobj);
190
191 void (*lm_others_may_mount) (lm_lockspace_t *lockspace);
192
193 void (*lm_unmount) (lm_lockspace_t *lockspace);
194
195 void (*lm_withdraw) (lm_lockspace_t *lockspace);
196
197 /*
198 * Lock oriented operations
199 */
200
201 int (*lm_get_lock) (lm_lockspace_t *lockspace,
202 struct lm_lockname *name, lm_lock_t **lockp);
203
204 void (*lm_put_lock) (lm_lock_t *lock);
205
206 unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
207 unsigned int req_state, unsigned int flags);
208
209 unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);
210
211 void (*lm_cancel) (lm_lock_t *lock);
212
213 int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
214 void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
215 void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);
216
217 /*
218 * Posix Lock oriented operations
219 */
220
221 int (*lm_plock_get) (lm_lockspace_t *lockspace,
222 struct lm_lockname *name,
223 struct file *file, struct file_lock *fl);
224
225 int (*lm_plock) (lm_lockspace_t *lockspace,
226 struct lm_lockname *name,
227 struct file *file, int cmd, struct file_lock *fl);
228
229 int (*lm_punlock) (lm_lockspace_t *lockspace,
230 struct lm_lockname *name,
231 struct file *file, struct file_lock *fl);
232
233 /*
234 * Client oriented operations
235 */
236
237 void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
238 unsigned int message);
239
240 struct module *lm_owner;
241};
242
243/*
244 * lm_mount() return values
245 *
246 * ls_jid - the journal ID this node should use
247 * ls_first - this node is the first to mount the file system
248 * ls_lvb_size - size in bytes of lock value blocks
249 * ls_lockspace - lock module's context for this file system
250 * ls_ops - lock module's functions
251 * ls_flags - lock module features
252 */
253
254struct lm_lockstruct {
255 unsigned int ls_jid;
256 unsigned int ls_first;
257 unsigned int ls_lvb_size;
258 lm_lockspace_t *ls_lockspace;
259 struct lm_lockops *ls_ops;
260 int ls_flags;
261};
262
263void __init gfs2_init_lmh(void);
264
265/*
266 * Lock module bottom interface. A lock module makes itself available to GFS
267 * with these functions.
268 *
269 * For the time being, we copy the gfs1 lock module bottom interface so the
270 * same lock modules can be used with both gfs1 and gfs2 (it won't be possible
271 * to load both gfs1 and gfs2 at once.) Eventually the lock modules will fork
272 * for gfs1/gfs2 and this API can change to the gfs2_ prefix.
273 */
274
275int gfs_register_lockproto(struct lm_lockops *proto);
276
277void gfs_unregister_lockproto(struct lm_lockops *proto);
278
279/*
280 * Lock module top interface. GFS calls these functions when mounting or
281 * unmounting a file system.
282 */
283
284int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
285 lm_callback_t cb, lm_fsdata_t *fsdata,
286 unsigned int min_lvb_size, int flags,
287 struct lm_lockstruct *lockstruct,
288 struct kobject *fskobj);
289
290void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct);
291
292void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct);
293
294#endif /* __LM_INTERFACE_DOT_H__ */
295
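
To make the bottom interface concrete: a lock module fills in an lm_lockops and registers it, after which GFS can select it by lm_proto_name at mount time. A skeleton loosely modeled on the in-tree lock_nolock module (see fs/gfs2/locking/nolock/main.c); the nolock_* bodies are elided assumptions, and methods that GFS calls unconditionally (e.g. lm_others_may_mount) are omitted for brevity:

	/* Registration sketch; nolock_* bodies are assumed, not shown. */
	static struct lm_lockops nolock_ops = {
		.lm_proto_name = "lock_nolock",
		.lm_mount = nolock_mount,	/* sets LM_LSFLAG_LOCAL in
						   lockstruct->ls_flags */
		.lm_unmount = nolock_unmount,
		.lm_get_lock = nolock_get_lock,
		.lm_put_lock = nolock_put_lock,
		.lm_lock = nolock_lock,		/* always grants req_state */
		.lm_unlock = nolock_unlock,
		.lm_owner = THIS_MODULE,
	};

	static int __init init_nolock(void)
	{
		return gfs_register_lockproto(&nolock_ops);
	}

	static void __exit exit_nolock(void)
	{
		gfs_unregister_lockproto(&nolock_ops);
	}

	module_init(init_nolock);
	module_exit(exit_nolock);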
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..183192836e98
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19
20#include "lm_interface.h"
21
22struct lmh_wrapper {
23 struct list_head lw_list;
24 struct lm_lockops *lw_ops;
25};
26
27/* List of registered low-level locking protocols. A file system selects one
28 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
29
30static struct list_head lmh_list;
31static struct mutex lmh_lock;
32
33/**
34 * gfs_register_lockproto - Register a low-level locking protocol
35 * @proto: the protocol definition
36 *
37 * Returns: 0 on success, -EXXX on failure
38 */
39
40int gfs_register_lockproto(struct lm_lockops *proto)
41{
42 struct lmh_wrapper *lw;
43
44 mutex_lock(&lmh_lock);
45
46 list_for_each_entry(lw, &lmh_list, lw_list) {
47 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
48 mutex_unlock(&lmh_lock);
49 printk(KERN_INFO "GFS2: protocol %s already exists\n",
50 proto->lm_proto_name);
51 return -EEXIST;
52 }
53 }
54
55 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
56 if (!lw) {
57 mutex_unlock(&lmh_lock);
58 return -ENOMEM;
59 }
60
61 lw->lw_ops = proto;
62 list_add(&lw->lw_list, &lmh_list);
63
64 mutex_unlock(&lmh_lock);
65
66 return 0;
67}
68
69/**
70 * gfs_unregister_lockproto - Unregister a low-level locking protocol
71 * @proto: the protocol definition
72 *
73 */
74
75void gfs_unregister_lockproto(struct lm_lockops *proto)
76{
77 struct lmh_wrapper *lw;
78
79 mutex_lock(&lmh_lock);
80
81 list_for_each_entry(lw, &lmh_list, lw_list) {
82 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
83 list_del(&lw->lw_list);
84 mutex_unlock(&lmh_lock);
85 kfree(lw);
86 return;
87 }
88 }
89
90 mutex_unlock(&lmh_lock);
91
92 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
93 proto->lm_proto_name);
94}
95
96/**
97 * gfs2_mount_lockproto - Mount a lock protocol
98 * @proto_name: the name of the protocol
99 * @table_name: the name of the lock space
100 * @host_data: data specific to this host
101 * @cb: the callback to the code using the lock module
102 * @fsdata: data to pass back with the callback
103 * @min_lvb_size: the minimum LVB size that the caller can deal with
104 * @flags: LM_MFLAG_*
105 * @lockstruct: a structure returned describing the mount
106 *
107 * Returns: 0 on success, -EXXX on failure
108 */
109
110int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
111 lm_callback_t cb, lm_fsdata_t *fsdata,
112 unsigned int min_lvb_size, int flags,
113 struct lm_lockstruct *lockstruct,
114 struct kobject *fskobj)
115{
116 struct lmh_wrapper *lw = NULL;
117 int try = 0;
118 int error, found;
119
120 retry:
121 mutex_lock(&lmh_lock);
122
123 found = 0;
124 list_for_each_entry(lw, &lmh_list, lw_list) {
125 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
126 found = 1;
127 break;
128 }
129 }
130
131 if (!found) {
132 if (!try && capable(CAP_SYS_MODULE)) {
133 try = 1;
134 mutex_unlock(&lmh_lock);
135 request_module(proto_name);
136 goto retry;
137 }
138 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
139 error = -ENOENT;
140 goto out;
141 }
142
143 if (!try_module_get(lw->lw_ops->lm_owner)) {
144 try = 0;
145 mutex_unlock(&lmh_lock);
146 msleep(1000);
147 goto retry;
148 }
149
150 error = lw->lw_ops->lm_mount(table_name, host_data, cb, fsdata,
151 min_lvb_size, flags, lockstruct, fskobj);
152 if (error)
153 module_put(lw->lw_ops->lm_owner);
154 out:
155 mutex_unlock(&lmh_lock);
156 return error;
157}
158
159void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
160{
161 mutex_lock(&lmh_lock);
162 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
163 if (lockstruct->ls_ops->lm_owner)
164 module_put(lockstruct->ls_ops->lm_owner);
165 mutex_unlock(&lmh_lock);
166}
167
168/**
169 * gfs2_withdraw_lockproto - abnormally unmount a lock module
170 * @lockstruct: the lockstruct passed into mount
171 *
172 */
173
174void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
175{
176 mutex_lock(&lmh_lock);
177 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
178 if (lockstruct->ls_ops->lm_owner)
179 module_put(lockstruct->ls_ops->lm_owner);
180 mutex_unlock(&lmh_lock);
181}
182
183void __init gfs2_init_lmh(void)
184{
185 mutex_init(&lmh_lock);
186 INIT_LIST_HEAD(&lmh_list);
187}
188
189EXPORT_SYMBOL_GPL(gfs_register_lockproto);
190EXPORT_SYMBOL_GPL(gfs_unregister_lockproto);
191
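
From the filesystem side, gfs2_lm_mount() (lm.c above) drives this registry with a call of roughly the following shape. The literal strings are illustrative only, using the "cluster:fsname" table format and "key=val:key=val" host_data format that lock_dlm parses (see mount.c below); sdp is the mounting filesystem's gfs2_sbd:

	/* Illustrative invocation; real code takes these from sdp->sd_args
	 * and stores the result in sdp->sd_lockstruct. */
	struct lm_lockstruct ls;
	int error;

	error = gfs2_mount_lockproto("lock_dlm", "alpha:myfs",
				     "jid=0:id=1:first=1", /* host_data */
				     gfs2_glock_cb, sdp,
				     GFS2_MIN_LVB_SIZE, 0, &ls,
				     &sdp->sd_kobj);
	/* If no protocol named "lock_dlm" is registered yet, the mount
	 * path calls request_module() once (given CAP_SYS_MODULE) and
	 * retries the lookup. */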
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..a9733ff80371
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..e74f1215672f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,541 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
26static inline void gdlm_ast(void *astarg)
27{
28 queue_complete(astarg);
29}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type,
39 (unsigned long long)lp->lockname.ln_number);
40 return;
41 }
42
43 spin_lock(&ls->async_lock);
44 if (!lp->bast_mode) {
45 list_add_tail(&lp->blist, &ls->blocking);
46 lp->bast_mode = mode;
47 } else if (lp->bast_mode < mode)
48 lp->bast_mode = mode;
49 spin_unlock(&ls->async_lock);
50 wake_up(&ls->thread_wait);
51}
52
53void gdlm_queue_delayed(struct gdlm_lock *lp)
54{
55 struct gdlm_ls *ls = lp->ls;
56
57 spin_lock(&ls->async_lock);
58 list_add_tail(&lp->delay_list, &ls->delayed);
59 spin_unlock(&ls->async_lock);
60}
61
62/* convert gfs lock-state to dlm lock-mode */
63
64static int16_t make_mode(int16_t lmstate)
65{
66 switch (lmstate) {
67 case LM_ST_UNLOCKED:
68 return DLM_LOCK_NL;
69 case LM_ST_EXCLUSIVE:
70 return DLM_LOCK_EX;
71 case LM_ST_DEFERRED:
72 return DLM_LOCK_CW;
73 case LM_ST_SHARED:
74 return DLM_LOCK_PR;
75 }
76 gdlm_assert(0, "unknown LM state %d", lmstate);
77 return -1;
78}
79
80/* convert dlm lock-mode to gfs lock-state */
81
82int16_t gdlm_make_lmstate(int16_t dlmmode)
83{
84 switch (dlmmode) {
85 case DLM_LOCK_IV:
86 case DLM_LOCK_NL:
87 return LM_ST_UNLOCKED;
88 case DLM_LOCK_EX:
89 return LM_ST_EXCLUSIVE;
90 case DLM_LOCK_CW:
91 return LM_ST_DEFERRED;
92 case DLM_LOCK_PR:
93 return LM_ST_SHARED;
94 }
95 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
96 return -1;
97}
98
99/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
100 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
101
102static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
103{
104 int16_t cur = make_mode(cur_state);
105 if (lp->cur != DLM_LOCK_IV)
106 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
107}
108
109static inline unsigned int make_flags(struct gdlm_lock *lp,
110 unsigned int gfs_flags,
111 int16_t cur, int16_t req)
112{
113 unsigned int lkf = 0;
114
115 if (gfs_flags & LM_FLAG_TRY)
116 lkf |= DLM_LKF_NOQUEUE;
117
118 if (gfs_flags & LM_FLAG_TRY_1CB) {
119 lkf |= DLM_LKF_NOQUEUE;
120 lkf |= DLM_LKF_NOQUEUEBAST;
121 }
122
123 if (gfs_flags & LM_FLAG_PRIORITY) {
124 lkf |= DLM_LKF_NOORDER;
125 lkf |= DLM_LKF_HEADQUE;
126 }
127
128 if (gfs_flags & LM_FLAG_ANY) {
129 if (req == DLM_LOCK_PR)
130 lkf |= DLM_LKF_ALTCW;
131 else if (req == DLM_LOCK_CW)
132 lkf |= DLM_LKF_ALTPR;
133 }
134
135 if (lp->lksb.sb_lkid != 0) {
136 lkf |= DLM_LKF_CONVERT;
137
138 /* Conversion deadlock avoidance by DLM */
139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK;
144 }
145
146 if (lp->lvb)
147 lkf |= DLM_LKF_VALBLK;
148
149 return lkf;
150}
151
152/* make_strname - convert GFS lock numbers to a string */
153
154static inline void make_strname(struct lm_lockname *lockname,
155 struct gdlm_strname *str)
156{
157 sprintf(str->name, "%8x%16llx", lockname->ln_type,
158 (unsigned long long)lockname->ln_number);
159 str->namelen = GDLM_STRNAME_BYTES;
160}
161
162static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
163 struct gdlm_lock **lpp)
164{
165 struct gdlm_lock *lp;
166
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
168 if (!lp)
169 return -ENOMEM;
170
171 lp->lockname = *name;
172 lp->ls = ls;
173 lp->cur = DLM_LOCK_IV;
174 lp->lvb = NULL;
175 lp->hold_null = NULL;
176 init_completion(&lp->ast_wait);
177 INIT_LIST_HEAD(&lp->clist);
178 INIT_LIST_HEAD(&lp->blist);
179 INIT_LIST_HEAD(&lp->delay_list);
180
181 spin_lock(&ls->async_lock);
182 list_add(&lp->all_list, &ls->all_locks);
183 ls->all_locks_count++;
184 spin_unlock(&ls->async_lock);
185
186 *lpp = lp;
187 return 0;
188}
189
190void gdlm_delete_lp(struct gdlm_lock *lp)
191{
192 struct gdlm_ls *ls = lp->ls;
193
194 spin_lock(&ls->async_lock);
195 if (!list_empty(&lp->clist))
196 list_del_init(&lp->clist);
197 if (!list_empty(&lp->blist))
198 list_del_init(&lp->blist);
199 if (!list_empty(&lp->delay_list))
200 list_del_init(&lp->delay_list);
201 gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
202 (unsigned long long)lp->lockname.ln_number);
203 list_del_init(&lp->all_list);
204 ls->all_locks_count--;
205 spin_unlock(&ls->async_lock);
206
207 kfree(lp);
208}
209
210int gdlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
211 lm_lock_t **lockp)
212{
213 struct gdlm_lock *lp;
214 int error;
215
216 error = gdlm_create_lp((struct gdlm_ls *) lockspace, name, &lp);
217
218 *lockp = (lm_lock_t *) lp;
219 return error;
220}
221
222void gdlm_put_lock(lm_lock_t *lock)
223{
224 gdlm_delete_lp((struct gdlm_lock *) lock);
225}
226
227unsigned int gdlm_do_lock(struct gdlm_lock *lp)
228{
229 struct gdlm_ls *ls = lp->ls;
230 struct gdlm_strname str;
231 int error, bast = 1;
232
233 /*
234	 * When recovery is in progress, delay lock requests to be submitted
235 * once recovery is done. Requests for recovery (NOEXP) and unlocks
236 * can pass.
237 */
238
239 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
240 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
241 gdlm_queue_delayed(lp);
242 return LM_OUT_ASYNC;
243 }
244
245 /*
246 * Submit the actual lock request.
247 */
248
249 if (test_bit(LFL_NOBAST, &lp->flags))
250 bast = 0;
251
252 make_strname(&lp->lockname, &str);
253
254 set_bit(LFL_ACTIVE, &lp->flags);
255
256 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
257 (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
258 lp->cur, lp->req, lp->lkf);
259
260 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
261 str.name, str.namelen, 0, gdlm_ast, lp,
262 bast ? gdlm_bast : NULL);
263
264 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
265 lp->lksb.sb_status = -EAGAIN;
266 queue_complete(lp);
267 error = 0;
268 }
269
270 if (error) {
271 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
272 "flags=%lx", ls->fsname, lp->lockname.ln_type,
273 (unsigned long long)lp->lockname.ln_number, error,
274 lp->cur, lp->req, lp->lkf, lp->flags);
275 return LM_OUT_ERROR;
276 }
277 return LM_OUT_ASYNC;
278}
279
280static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
281{
282 struct gdlm_ls *ls = lp->ls;
283 unsigned int lkf = 0;
284 int error;
285
286 set_bit(LFL_DLM_UNLOCK, &lp->flags);
287 set_bit(LFL_ACTIVE, &lp->flags);
288
289 if (lp->lvb)
290 lkf = DLM_LKF_VALBLK;
291
292 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
293 (unsigned long long)lp->lockname.ln_number,
294 lp->lksb.sb_lkid, lp->cur, lkf);
295
296 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
297
298 if (error) {
299 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
300 "flags=%lx", ls->fsname, lp->lockname.ln_type,
301 (unsigned long long)lp->lockname.ln_number, error,
302 lp->cur, lp->req, lp->lkf, lp->flags);
303 return LM_OUT_ERROR;
304 }
305 return LM_OUT_ASYNC;
306}
307
308unsigned int gdlm_lock(lm_lock_t *lock, unsigned int cur_state,
309 unsigned int req_state, unsigned int flags)
310{
311 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
312
313 clear_bit(LFL_DLM_CANCEL, &lp->flags);
314 if (flags & LM_FLAG_NOEXP)
315 set_bit(LFL_NOBLOCK, &lp->flags);
316
317 check_cur_state(lp, cur_state);
318 lp->req = make_mode(req_state);
319 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
320
321 return gdlm_do_lock(lp);
322}
323
324unsigned int gdlm_unlock(lm_lock_t *lock, unsigned int cur_state)
325{
326 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
327
328 clear_bit(LFL_DLM_CANCEL, &lp->flags);
329 if (lp->cur == DLM_LOCK_IV)
330 return 0;
331 return gdlm_do_unlock(lp);
332}
333
334void gdlm_cancel(lm_lock_t *lock)
335{
336 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
337 struct gdlm_ls *ls = lp->ls;
338 int error, delay_list = 0;
339
340 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
341 return;
342
343 log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
344 (unsigned long long)lp->lockname.ln_number, lp->flags);
345
346 spin_lock(&ls->async_lock);
347 if (!list_empty(&lp->delay_list)) {
348 list_del_init(&lp->delay_list);
349 delay_list = 1;
350 }
351 spin_unlock(&ls->async_lock);
352
353 if (delay_list) {
354 set_bit(LFL_CANCEL, &lp->flags);
355 set_bit(LFL_ACTIVE, &lp->flags);
356 queue_complete(lp);
357 return;
358 }
359
360 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
361 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
362 log_info("gdlm_cancel skip %x,%llx flags %lx",
363 lp->lockname.ln_type,
364 (unsigned long long)lp->lockname.ln_number, lp->flags);
365 return;
366 }
367
368 /* the lock is blocked in the dlm */
369
370 set_bit(LFL_DLM_CANCEL, &lp->flags);
371 set_bit(LFL_ACTIVE, &lp->flags);
372
373 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
374 NULL, lp);
375
376 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
377 lp->lockname.ln_type,
378 (unsigned long long)lp->lockname.ln_number, lp->flags);
379
380 if (error == -EBUSY)
381 clear_bit(LFL_DLM_CANCEL, &lp->flags);
382}
383
384static int gdlm_add_lvb(struct gdlm_lock *lp)
385{
386 char *lvb;
387
388 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
389 if (!lvb)
390 return -ENOMEM;
391
392 lp->lksb.sb_lvbptr = lvb;
393 lp->lvb = lvb;
394 return 0;
395}
396
397static void gdlm_del_lvb(struct gdlm_lock *lp)
398{
399 kfree(lp->lvb);
400 lp->lvb = NULL;
401 lp->lksb.sb_lvbptr = NULL;
402}
403
404/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
405 the completion) because gfs won't call hold_lvb() during a callback (from
406 the context of a lock_dlm thread). */
407
408static int hold_null_lock(struct gdlm_lock *lp)
409{
410 struct gdlm_lock *lpn = NULL;
411 int error;
412
413 if (lp->hold_null) {
414 printk(KERN_INFO "lock_dlm: lvb already held\n");
415 return 0;
416 }
417
418 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
419 if (error)
420 goto out;
421
422 lpn->lksb.sb_lvbptr = junk_lvb;
423 lpn->lvb = junk_lvb;
424
425 lpn->req = DLM_LOCK_NL;
426 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
427 set_bit(LFL_NOBAST, &lpn->flags);
428 set_bit(LFL_INLOCK, &lpn->flags);
429
430 init_completion(&lpn->ast_wait);
431 gdlm_do_lock(lpn);
432 wait_for_completion(&lpn->ast_wait);
433	error = lpn->lksb.sb_status;
434 if (error) {
435 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
436 error);
437 gdlm_delete_lp(lpn);
438 lpn = NULL;
439 }
440 out:
441 lp->hold_null = lpn;
442 return error;
443}
444
445/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
446 the completion) because gfs may call unhold_lvb() during a callback (from
447 the context of a lock_dlm thread) which could cause a deadlock since the
448 other lock_dlm thread could be engaged in recovery. */
449
450static void unhold_null_lock(struct gdlm_lock *lp)
451{
452 struct gdlm_lock *lpn = lp->hold_null;
453
454 gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
455 (unsigned long long)lp->lockname.ln_number);
456 lpn->lksb.sb_lvbptr = NULL;
457 lpn->lvb = NULL;
458 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
459 gdlm_do_unlock(lpn);
460 lp->hold_null = NULL;
461}
462
463/* Acquire a NL lock because gfs requires the value block to remain
464   intact on the resource while the lvb is "held", even if GFS holds no
465   other locks on the resource. */
466
467int gdlm_hold_lvb(lm_lock_t *lock, char **lvbp)
468{
469 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
470 int error;
471
472 error = gdlm_add_lvb(lp);
473 if (error)
474 return error;
475
476 *lvbp = lp->lvb;
477
478 error = hold_null_lock(lp);
479 if (error)
480 gdlm_del_lvb(lp);
481
482 return error;
483}
484
485void gdlm_unhold_lvb(lm_lock_t *lock, char *lvb)
486{
487 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
488
489 unhold_null_lock(lp);
490 gdlm_del_lvb(lp);
491}
492
493void gdlm_sync_lvb(lm_lock_t *lock, char *lvb)
494{
495 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
496
497 if (lp->cur != DLM_LOCK_EX)
498 return;
499
500 init_completion(&lp->ast_wait);
501 set_bit(LFL_SYNC_LVB, &lp->flags);
502
503 lp->req = DLM_LOCK_EX;
504 lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
505
506 gdlm_do_lock(lp);
507 wait_for_completion(&lp->ast_wait);
508}
509
510void gdlm_submit_delayed(struct gdlm_ls *ls)
511{
512 struct gdlm_lock *lp, *safe;
513
514 spin_lock(&ls->async_lock);
515 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
516 list_del_init(&lp->delay_list);
517 list_add_tail(&lp->delay_list, &ls->submit);
518 }
519 spin_unlock(&ls->async_lock);
520 wake_up(&ls->thread_wait);
521}
522
523int gdlm_release_all_locks(struct gdlm_ls *ls)
524{
525 struct gdlm_lock *lp, *safe;
526 int count = 0;
527
528 spin_lock(&ls->async_lock);
529 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
530 list_del_init(&lp->all_list);
531
532 if (lp->lvb && lp->lvb != junk_lvb)
533 kfree(lp->lvb);
534 kfree(lp);
535 count++;
536 }
537 spin_unlock(&ls->async_lock);
538
539 return count;
540}
541
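
The resource naming deserves a worked example: make_strname() prints the 32-bit lock type and 64-bit lock number with "%8x%16llx", so every dlm resource name is exactly 24 space-padded hex characters. For a hypothetical inode lock:

	/* Worked example; the values are hypothetical. */
	struct lm_lockname name = {
		.ln_type = LM_TYPE_INODE,	/* 0x2 */
		.ln_number = 0x1234,
	};
	struct gdlm_strname str;

	make_strname(&name, &str);
	/* str.name is "       2            1234": 8 characters from
	 * "%8x" for the type, 16 from "%16llx" for the number;
	 * str.namelen is GDLM_STRNAME_BYTES (24). */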
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..530c2f542584
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,188 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include "../../lm_interface.h"
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 24
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45   human-readable (to make debugging simpler). */
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 uint32_t id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 lm_fsdata_t *fsdata;
71 int recover_jid;
72 int recover_jid_done;
73 int recover_jid_status;
74 spinlock_t async_lock;
75 struct list_head complete;
76 struct list_head blocking;
77 struct list_head delayed;
78 struct list_head submit;
79 struct list_head all_locks;
80 uint32_t all_locks_count;
81 wait_queue_head_t wait_control;
82 struct task_struct *thread1;
83 struct task_struct *thread2;
84 wait_queue_head_t thread_wait;
85 unsigned long drop_time;
86 int drop_locks_count;
87 int drop_locks_period;
88};
89
90enum {
91 LFL_NOBLOCK = 0,
92 LFL_NOCACHE = 1,
93 LFL_DLM_UNLOCK = 2,
94 LFL_DLM_CANCEL = 3,
95 LFL_SYNC_LVB = 4,
96 LFL_FORCE_PROMOTE = 5,
97 LFL_REREQUEST = 6,
98 LFL_ACTIVE = 7,
99 LFL_INLOCK = 8,
100 LFL_CANCEL = 9,
101 LFL_NOBAST = 10,
102 LFL_HEADQUE = 11,
103 LFL_UNLOCK_DELETE = 12,
104};
105
106struct gdlm_lock {
107 struct gdlm_ls *ls;
108 struct lm_lockname lockname;
109 char *lvb;
110 struct dlm_lksb lksb;
111
112 int16_t cur;
113 int16_t req;
114 int16_t prev_req;
115 uint32_t lkf; /* dlm flags DLM_LKF_ */
116 unsigned long flags; /* lock_dlm flags LFL_ */
117
118 int bast_mode; /* protected by async_lock */
119 struct completion ast_wait;
120
121 struct list_head clist; /* complete */
122 struct list_head blist; /* blocking */
123 struct list_head delay_list; /* delayed */
124 struct list_head all_list; /* all locks for the fs */
125 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
126};
127
128#define gdlm_assert(assertion, fmt, args...) \
129do { \
130 if (unlikely(!(assertion))) { \
131 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
132 "lock_dlm: " fmt "\n", \
133 #assertion, ##args); \
134 BUG(); \
135 } \
136} while (0)
137
138#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
139#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
140#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
141#ifdef LOCK_DLM_LOG_DEBUG
142#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
143#else
144#define log_debug(fmt, arg...)
145#endif
146
147/* sysfs.c */
148
149int gdlm_sysfs_init(void);
150void gdlm_sysfs_exit(void);
151int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
152void gdlm_kobject_release(struct gdlm_ls *);
153
154/* thread.c */
155
156int gdlm_init_threads(struct gdlm_ls *);
157void gdlm_release_threads(struct gdlm_ls *);
158
159/* lock.c */
160
161int16_t gdlm_make_lmstate(int16_t);
162void gdlm_queue_delayed(struct gdlm_lock *);
163void gdlm_submit_delayed(struct gdlm_ls *);
164int gdlm_release_all_locks(struct gdlm_ls *);
165void gdlm_delete_lp(struct gdlm_lock *);
166unsigned int gdlm_do_lock(struct gdlm_lock *);
167
168int gdlm_get_lock(lm_lockspace_t *, struct lm_lockname *, lm_lock_t **);
169void gdlm_put_lock(lm_lock_t *);
170unsigned int gdlm_lock(lm_lock_t *, unsigned int, unsigned int, unsigned int);
171unsigned int gdlm_unlock(lm_lock_t *, unsigned int);
172void gdlm_cancel(lm_lock_t *);
173int gdlm_hold_lvb(lm_lock_t *, char **);
174void gdlm_unhold_lvb(lm_lock_t *, char *);
175void gdlm_sync_lvb(lm_lock_t *, char *);
176
177/* plock.c */
178
179int gdlm_plock_init(void);
180void gdlm_plock_exit(void);
181int gdlm_plock(lm_lockspace_t *, struct lm_lockname *, struct file *, int,
182 struct file_lock *);
183int gdlm_plock_get(lm_lockspace_t *, struct lm_lockname *, struct file *,
184 struct file_lock *);
185int gdlm_punlock(lm_lockspace_t *, struct lm_lockname *, struct file *,
186 struct file_lock *);
187#endif
188
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..89728c91665f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19static int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51static void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..3caeafc02a1b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,256 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, lm_fsdata_t *fsdata,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->fsdata = fsdata;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strstr(buf, ":");
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%u", nodir);
111
112 } else {
113			log_error("unknown option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
121
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, lm_fsdata_t *fsdata,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, fsdata, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = gdlm_kobject_setup(ls, fskobj);
147 if (error)
148 goto out_thread;
149
150 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
151 &ls->dlm_lockspace,
152 nodir ? DLM_LSFL_NODIR : 0,
153 GDLM_LVB_SIZE);
154 if (error) {
155 log_error("dlm_new_lockspace error %d", error);
156 goto out_kobj;
157 }
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167 out_kobj:
168 gdlm_kobject_release(ls);
169 out_thread:
170 gdlm_release_threads(ls);
171 out_free:
172 kfree(ls);
173 out:
174 return error;
175}
176
177static void gdlm_unmount(lm_lockspace_t *lockspace)
178{
179 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197 out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
205 ls->recover_jid_done = jid;
206 ls->recover_jid_status = message;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
208}
209
210static void gdlm_others_may_mount(lm_lockspace_t *lockspace)
211{
212 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
213 ls->first_done = 1;
214 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
215}
216
217/* Userspace gets the offline uevent, blocks new gfs locks on
218 other mounters, and lets us know (sets WITHDRAW flag). Then,
219 userspace leaves the mount group while we leave the lockspace. */
220
221static void gdlm_withdraw(lm_lockspace_t *lockspace)
222{
223 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
224
225 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
226
227 wait_event_interruptible(ls->wait_control,
228 test_bit(DFL_WITHDRAW, &ls->flags));
229
230 dlm_release_lockspace(ls->dlm_lockspace, 2);
231 gdlm_release_threads(ls);
232 gdlm_release_all_locks(ls);
233 gdlm_kobject_release(ls);
234}
235
236struct lm_lockops gdlm_ops = {
237 .lm_proto_name = "lock_dlm",
238 .lm_mount = gdlm_mount,
239 .lm_others_may_mount = gdlm_others_may_mount,
240 .lm_unmount = gdlm_unmount,
241 .lm_withdraw = gdlm_withdraw,
242 .lm_get_lock = gdlm_get_lock,
243 .lm_put_lock = gdlm_put_lock,
244 .lm_lock = gdlm_lock,
245 .lm_unlock = gdlm_unlock,
246 .lm_plock = gdlm_plock,
247 .lm_punlock = gdlm_punlock,
248 .lm_plock_get = gdlm_plock_get,
249 .lm_cancel = gdlm_cancel,
250 .lm_hold_lvb = gdlm_hold_lvb,
251 .lm_unhold_lvb = gdlm_unhold_lvb,
252 .lm_sync_lvb = gdlm_sync_lvb,
253 .lm_recovery_done = gdlm_recovery_done,
254 .lm_owner = THIS_MODULE,
255};
256
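
Taken together, init_gdlm() and make_args() turn the two mount strings into lockspace parameters. A worked example of the formats they expect, with illustrative values:

	/* table_name "alpha:myfs" splits at the first ':' into
	 *	ls->clustername = "alpha", ls->fsname = "myfs"
	 * host_data "jid=2:id=7:first=0:nodir=1" is parsed by
	 * make_args() into
	 *	ls->jid = 2, ls->id = 7, ls->first = 0, nodir = 1
	 * and nodir = 1 makes gdlm_mount() pass DLM_LSFL_NODIR to
	 * dlm_new_lockspace(). */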
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..6adfb2d4fd8c
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,299 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = fl->fl_pid;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80
81 send_op(op);
82 wait_event(recv_wq, (op->done != 0));
83
84 spin_lock(&ops_lock);
85 if (!list_empty(&op->list)) {
86 printk(KERN_INFO "plock op on list\n");
87 list_del(&op->list);
88 }
89 spin_unlock(&ops_lock);
90
91 rv = op->info.rv;
92
93 if (!rv) {
94 if (posix_lock_file_wait(file, fl) < 0)
95 log_error("gdlm_plock: vfs lock error %x,%llx",
96 name->ln_type,
97 (unsigned long long)name->ln_number);
98 }
99
100 kfree(op);
101 return rv;
102}
103
104int gdlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
105 struct file *file, struct file_lock *fl)
106{
107 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
108 struct plock_op *op;
109 int rv;
110
111 op = kzalloc(sizeof(*op), GFP_KERNEL);
112 if (!op)
113 return -ENOMEM;
114
115 if (posix_lock_file_wait(file, fl) < 0)
116 log_error("gdlm_punlock: vfs unlock error %x,%llx",
117 name->ln_type, (unsigned long long)name->ln_number);
118
119 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
120 op->info.pid = fl->fl_pid;
121 op->info.fsid = ls->id;
122 op->info.number = name->ln_number;
123 op->info.start = fl->fl_start;
124 op->info.end = fl->fl_end;
125
126 send_op(op);
127 wait_event(recv_wq, (op->done != 0));
128
129 spin_lock(&ops_lock);
130 if (!list_empty(&op->list)) {
131 printk(KERN_INFO "punlock op on list\n");
132 list_del(&op->list);
133 }
134 spin_unlock(&ops_lock);
135
136 rv = op->info.rv;
137
138 kfree(op);
139 return rv;
140}
141
142int gdlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
143 struct file *file, struct file_lock *fl)
144{
145 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
146 struct plock_op *op;
147 int rv;
148
149 op = kzalloc(sizeof(*op), GFP_KERNEL);
150 if (!op)
151 return -ENOMEM;
152
153 op->info.optype = GDLM_PLOCK_OP_GET;
154 op->info.pid = fl->fl_pid;
155 op->info.ex = (fl->fl_type == F_WRLCK);
156 op->info.fsid = ls->id;
157 op->info.number = name->ln_number;
158 op->info.start = fl->fl_start;
159 op->info.end = fl->fl_end;
160
161 send_op(op);
162 wait_event(recv_wq, (op->done != 0));
163
164 spin_lock(&ops_lock);
165 if (!list_empty(&op->list)) {
166 printk(KERN_INFO "plock_get op on list\n");
167 list_del(&op->list);
168 }
169 spin_unlock(&ops_lock);
170
171 rv = op->info.rv;
172
173 if (rv == 0)
174 fl->fl_type = F_UNLCK;
175 else if (rv > 0) {
176 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
177 fl->fl_pid = op->info.pid;
178 fl->fl_start = op->info.start;
179 fl->fl_end = op->info.end;
180 }
181
182 kfree(op);
183 return rv;
184}
185
186/* a read copies out one plock request from the send list */
187static ssize_t dev_read(struct file *file, char __user *u, size_t count,
188 loff_t *ppos)
189{
190 struct gdlm_plock_info info;
191 struct plock_op *op = NULL;
192
193 if (count < sizeof(info))
194 return -EINVAL;
195
196 spin_lock(&ops_lock);
197 if (!list_empty(&send_list)) {
198 op = list_entry(send_list.next, struct plock_op, list);
199 list_move(&op->list, &recv_list);
200 memcpy(&info, &op->info, sizeof(info));
201 }
202 spin_unlock(&ops_lock);
203
204 if (!op)
205 return -EAGAIN;
206
207 if (copy_to_user(u, &info, sizeof(info)))
208 return -EFAULT;
209 return sizeof(info);
210}
211
212/* a write copies in one plock result that should match a plock_op
213 on the recv list */
214static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
215 loff_t *ppos)
216{
217 struct gdlm_plock_info info;
218 struct plock_op *op;
219 int found = 0;
220
221 if (count != sizeof(info))
222 return -EINVAL;
223
224 if (copy_from_user(&info, u, sizeof(info)))
225 return -EFAULT;
226
227 if (check_version(&info))
228 return -EINVAL;
229
230 spin_lock(&ops_lock);
231 list_for_each_entry(op, &recv_list, list) {
232 if (op->info.fsid == info.fsid &&
233 op->info.number == info.number) {
234 list_del_init(&op->list);
235 found = 1;
236 op->done = 1;
237 memcpy(&op->info, &info, sizeof(info));
238 break;
239 }
240 }
241 spin_unlock(&ops_lock);
242
243 if (found)
244 wake_up(&recv_wq);
245 else
246 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
247 (unsigned long long)info.number);
248 return count;
249}
250
251static unsigned int dev_poll(struct file *file, poll_table *wait)
252{
253 poll_wait(file, &send_wq, wait);
254
255 spin_lock(&ops_lock);
256 if (!list_empty(&send_list)) {
257 spin_unlock(&ops_lock);
258 return POLLIN | POLLRDNORM;
259 }
260 spin_unlock(&ops_lock);
261 return 0;
262}
263
264static struct file_operations dev_fops = {
265 .read = dev_read,
266 .write = dev_write,
267 .poll = dev_poll,
268 .owner = THIS_MODULE
269};
270
271static struct miscdevice plock_dev_misc = {
272 .minor = MISC_DYNAMIC_MINOR,
273 .name = GDLM_PLOCK_MISC_NAME,
274 .fops = &dev_fops
275};
276
277int gdlm_plock_init(void)
278{
279 int rv;
280
281 spin_lock_init(&ops_lock);
282 INIT_LIST_HEAD(&send_list);
283 INIT_LIST_HEAD(&recv_list);
284 init_waitqueue_head(&send_wq);
285 init_waitqueue_head(&recv_wq);
286
287 rv = misc_register(&plock_dev_misc);
288 if (rv)
289 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
290 rv);
291 return rv;
292}
293
294void gdlm_plock_exit(void)
295{
296 if (misc_deregister(&plock_dev_misc) < 0)
297 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
298}
299
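Taken together, dev_read()/dev_write() implement a one-request-at-a-time RPC to a userspace daemon: read() hands out the next pending gdlm_plock_info, and write() posts the result back, matched on (fsid, number). A minimal sketch of the daemon side follows; the device path, header name, and resolver are assumptions, only the fixed-size info round trip comes from the code above.

/* Sketch of the userspace plock daemon loop. The device path,
 * "gdlm_plock.h", and resolve_plock() are assumptions; only the
 * read/resolve/write cycle is taken from dev_read()/dev_write(). */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>
#include "gdlm_plock.h"		/* assumed userspace copy of the ABI */

static int resolve_plock(struct gdlm_plock_info *info)
{
	/* A real daemon would coordinate with the other nodes here. */
	return 0;	/* grant */
}

static void plock_loop(int fd)
{
	struct gdlm_plock_info info;
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	for (;;) {
		poll(&pfd, 1, -1);
		if (read(fd, &info, sizeof(info)) != sizeof(info))
			continue;		/* raced: -EAGAIN */
		info.rv = resolve_plock(&info);
		write(fd, &info, sizeof(info));	/* wakes the waiter */
	}
}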
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..0d8bd0806dba
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,225 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else
46 ret = -EINVAL;
47 return ret;
48}
49
50static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
51{
52 ssize_t ret;
53 int val = 0;
54
55 if (test_bit(DFL_WITHDRAW, &ls->flags))
56 val = 1;
57 ret = sprintf(buf, "%d\n", val);
58 return ret;
59}
60
61static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
62{
63 ssize_t ret = len;
64 int val;
65
66 val = simple_strtol(buf, NULL, 0);
67
68 if (val == 1)
69 set_bit(DFL_WITHDRAW, &ls->flags);
70 else
71 ret = -EINVAL;
72 wake_up(&ls->wait_control);
73 return ret;
74}
75
76static ssize_t id_show(struct gdlm_ls *ls, char *buf)
77{
78 return sprintf(buf, "%u\n", ls->id);
79}
80
81static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
82{
83 return sprintf(buf, "%d\n", ls->jid);
84}
85
86static ssize_t first_show(struct gdlm_ls *ls, char *buf)
87{
88 return sprintf(buf, "%d\n", ls->first);
89}
90
91static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
92{
93 return sprintf(buf, "%d\n", ls->first_done);
94}
95
96static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
97{
98 return sprintf(buf, "%d\n", ls->recover_jid);
99}
100
101static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
102{
103 ls->recover_jid = simple_strtol(buf, NULL, 0);
104 ls->fscb(ls->fsdata, LM_CB_NEED_RECOVERY, &ls->recover_jid);
105 return len;
106}
107
108static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
109{
110 return sprintf(buf, "%d\n", ls->recover_jid_done);
111}
112
113static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
114{
115 return sprintf(buf, "%d\n", ls->recover_jid_status);
116}
117
118struct gdlm_attr {
119 struct attribute attr;
120 ssize_t (*show)(struct gdlm_ls *, char *);
121 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
122};
123
124#define GDLM_ATTR(_name,_mode,_show,_store) \
125static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
126
127GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
128GDLM_ATTR(block, 0644, block_show, block_store);
129GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
130GDLM_ATTR(id, 0444, id_show, NULL);
131GDLM_ATTR(jid, 0444, jid_show, NULL);
132GDLM_ATTR(first, 0444, first_show, NULL);
133GDLM_ATTR(first_done, 0444, first_done_show, NULL);
134GDLM_ATTR(recover, 0644, recover_show, recover_store);
135GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
136GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
137
138static struct attribute *gdlm_attrs[] = {
139 &gdlm_attr_proto_name.attr,
140 &gdlm_attr_block.attr,
141 &gdlm_attr_withdraw.attr,
142 &gdlm_attr_id.attr,
143 &gdlm_attr_jid.attr,
144 &gdlm_attr_first.attr,
145 &gdlm_attr_first_done.attr,
146 &gdlm_attr_recover.attr,
147 &gdlm_attr_recover_done.attr,
148 &gdlm_attr_recover_status.attr,
149 NULL,
150};
151
152static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
153 char *buf)
154{
155 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
156 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
157 return a->show ? a->show(ls, buf) : 0;
158}
159
160static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
161 const char *buf, size_t len)
162{
163 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
164 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
165 return a->store ? a->store(ls, buf, len) : len;
166}
167
168static struct sysfs_ops gdlm_attr_ops = {
169 .show = gdlm_attr_show,
170 .store = gdlm_attr_store,
171};
172
173static struct kobj_type gdlm_ktype = {
174 .default_attrs = gdlm_attrs,
175 .sysfs_ops = &gdlm_attr_ops,
176};
177
178static struct kset gdlm_kset = {
179 .subsys = &kernel_subsys,
180 .kobj = {.name = "lock_dlm",},
181 .ktype = &gdlm_ktype,
182};
183
184int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
185{
186 int error;
187
188 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
189 if (error) {
190 log_error("can't set kobj name %d", error);
191 return error;
192 }
193
194 ls->kobj.kset = &gdlm_kset;
195 ls->kobj.ktype = &gdlm_ktype;
196 ls->kobj.parent = fskobj;
197
198 error = kobject_register(&ls->kobj);
199 if (error)
200 log_error("can't register kobj %d", error);
201
202 return error;
203}
204
205void gdlm_kobject_release(struct gdlm_ls *ls)
206{
207 kobject_unregister(&ls->kobj);
208}
209
210int gdlm_sysfs_init(void)
211{
212 int error;
213
214 error = kset_register(&gdlm_kset);
215 if (error)
216 printk("lock_dlm: cannot register kset %d\n", error);
217
218 return error;
219}
220
221void gdlm_sysfs_exit(void)
222{
223 kset_unregister(&gdlm_kset);
224}
225
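These attributes are the control surface a userspace cluster manager drives during recovery: write 1 to block to quiesce lock traffic, write a journal id to recover to fire LM_CB_NEED_RECOVERY, poll recover_done/recover_status for the outcome, then clear block. A sketch of the driving side; the directory path is an assumption (each lockspace kobject is named "lock_module" and parented on the filesystem's kobject, so the prefix depends on where that kobject is registered).

/* Sketch: driving the recovery attributes from userspace.
 * SYSFS_DIR is an assumed location of the per-fs lock_module dir;
 * the attribute names come from the GDLM_ATTR() table above. */
#include <stdio.h>

#define SYSFS_DIR "/sys/fs/gfs2/mycluster:myfs/lock_module"

static int write_attr(const char *attr, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", SYSFS_DIR, attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

/* Block locks, ask the fs to recover journal 2, then resume. */
static void recover_jid2(void)
{
	write_attr("block", "1");
	write_attr("recover", "2");
	/* ... poll recover_done until it reads back 2 ... */
	write_attr("block", "0");
}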
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..3e2edcc2dbf6
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,352 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41 gdlm_assert(0, "unknown bast mode %u", bast_mode);
42 }
43
44 ls->fscb(ls->fsdata, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 int16_t prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type, lp->lockname.ln_number,
58 lp->flags);
59
60 lp->req = lp->cur;
61 acb.lc_ret |= LM_OUT_CANCELED;
62 if (lp->cur == DLM_LOCK_IV)
63 lp->lksb.sb_lkid = 0;
64 goto out;
65 }
66
67 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
68 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
69 log_info("unlock sb_status %d %x,%llx flags %lx",
70 lp->lksb.sb_status, lp->lockname.ln_type,
71 lp->lockname.ln_number, lp->flags);
72 return;
73 }
74
75 lp->cur = DLM_LOCK_IV;
76 lp->req = DLM_LOCK_IV;
77 lp->lksb.sb_lkid = 0;
78
79 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
80 gdlm_delete_lp(lp);
81 return;
82 }
83 goto out;
84 }
85
86 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
87 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
88
89 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
90 if (lp->req == DLM_LOCK_PR)
91 lp->req = DLM_LOCK_CW;
92 else if (lp->req == DLM_LOCK_CW)
93 lp->req = DLM_LOCK_PR;
94 }
95
96 /*
97 * A canceled lock request. The lock was just taken off the delayed
98 * list and was never even submitted to dlm.
99 */
100
101 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
102 log_info("complete internal cancel %x,%llx",
103 lp->lockname.ln_type, lp->lockname.ln_number);
104 lp->req = lp->cur;
105 acb.lc_ret |= LM_OUT_CANCELED;
106 goto out;
107 }
108
109 /*
110 * An error occurred.
111 */
112
113 if (lp->lksb.sb_status) {
114 /* a "normal" error */
115 if ((lp->lksb.sb_status == -EAGAIN) &&
116 (lp->lkf & DLM_LKF_NOQUEUE)) {
117 lp->req = lp->cur;
118 if (lp->cur == DLM_LOCK_IV)
119 lp->lksb.sb_lkid = 0;
120 goto out;
121 }
122
123 /* this could only happen with cancels I think */
124 log_info("ast sb_status %d %x,%llx flags %lx",
125 lp->lksb.sb_status, lp->lockname.ln_type,
126 lp->lockname.ln_number, lp->flags);
127 return;
128 }
129
130 /*
131 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
132 */
133
134 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
135 complete(&lp->ast_wait);
136 return;
137 }
138
139 /*
140 * A lock has been demoted to NL because it initially completed during
141 * BLOCK_LOCKS. Now it must be requested in the originally requested
142 * mode.
143 */
144
145 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
146 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
147 lp->lockname.ln_type, lp->lockname.ln_number);
148 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
149 lp->lockname.ln_type, lp->lockname.ln_number);
150
151 lp->cur = DLM_LOCK_NL;
152 lp->req = lp->prev_req;
153 lp->prev_req = DLM_LOCK_IV;
154 lp->lkf &= ~DLM_LKF_CONVDEADLK;
155
156 set_bit(LFL_NOCACHE, &lp->flags);
157
158 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
159 !test_bit(LFL_NOBLOCK, &lp->flags))
160 gdlm_queue_delayed(lp);
161 else
162 queue_submit(lp);
163 return;
164 }
165
166 /*
167 * A request is granted during dlm recovery. It may be granted
168 * because the locks of a failed node were cleared. In that case,
169 * there may be inconsistent data beneath this lock and we must wait
170 * for recovery to complete to use it. When gfs recovery is done this
171 * granted lock will be converted to NL and then reacquired in this
172 * granted state.
173 */
174
175 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
176 !test_bit(LFL_NOBLOCK, &lp->flags) &&
177 lp->req != DLM_LOCK_NL) {
178
179 lp->cur = lp->req;
180 lp->prev_req = lp->req;
181 lp->req = DLM_LOCK_NL;
182 lp->lkf |= DLM_LKF_CONVERT;
183 lp->lkf &= ~DLM_LKF_CONVDEADLK;
184
185 log_debug("rereq %x,%llx id %x %d,%d",
186 lp->lockname.ln_type, lp->lockname.ln_number,
187 lp->lksb.sb_lkid, lp->cur, lp->req);
188
189 set_bit(LFL_REREQUEST, &lp->flags);
190 queue_submit(lp);
191 return;
192 }
193
194 /*
195 * DLM demoted the lock to NL before it was granted so GFS must be
196 * told it cannot cache data for this lock.
197 */
198
199 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
200 set_bit(LFL_NOCACHE, &lp->flags);
201
202 out:
203 /*
204 * This is an internal lock_dlm lock
205 */
206
207 if (test_bit(LFL_INLOCK, &lp->flags)) {
208 clear_bit(LFL_NOBLOCK, &lp->flags);
209 lp->cur = lp->req;
210 complete(&lp->ast_wait);
211 return;
212 }
213
214 /*
215 * Normal completion of a lock request. Tell GFS it now has the lock.
216 */
217
218 clear_bit(LFL_NOBLOCK, &lp->flags);
219 lp->cur = lp->req;
220
221 acb.lc_name = lp->lockname;
222 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
223
224 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
225 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
226 acb.lc_ret |= LM_OUT_CACHEABLE;
227
228 ls->fscb(ls->fsdata, LM_CB_ASYNC, &acb);
229}
230
231static inline int no_work(struct gdlm_ls *ls, int blocking)
232{
233 int ret;
234
235 spin_lock(&ls->async_lock);
236 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
237 if (ret && blocking)
238 ret = list_empty(&ls->blocking);
239 spin_unlock(&ls->async_lock);
240
241 return ret;
242}
243
244static inline int check_drop(struct gdlm_ls *ls)
245{
246 if (!ls->drop_locks_count)
247 return 0;
248
249 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
250 ls->drop_time = jiffies;
251 if (ls->all_locks_count >= ls->drop_locks_count)
252 return 1;
253 }
254 return 0;
255}
256
257static int gdlm_thread(void *data)
258{
259 struct gdlm_ls *ls = (struct gdlm_ls *) data;
260 struct gdlm_lock *lp = NULL;
261 int blist = 0;
262 uint8_t complete, blocking, submit, drop;
263 DECLARE_WAITQUEUE(wait, current);
264
265 /* Only thread1 is allowed to do blocking callbacks since gfs
266 may wait for a completion callback within a blocking cb. */
267
268 if (current == ls->thread1)
269 blist = 1;
270
271 while (!kthread_should_stop()) {
272 set_current_state(TASK_INTERRUPTIBLE);
273 add_wait_queue(&ls->thread_wait, &wait);
274 if (no_work(ls, blist))
275 schedule();
276 remove_wait_queue(&ls->thread_wait, &wait);
277 set_current_state(TASK_RUNNING);
278
279 complete = blocking = submit = drop = 0;
280
281 spin_lock(&ls->async_lock);
282
283 if (blist && !list_empty(&ls->blocking)) {
284 lp = list_entry(ls->blocking.next, struct gdlm_lock,
285 blist);
286 list_del_init(&lp->blist);
287 blocking = lp->bast_mode;
288 lp->bast_mode = 0;
289 } else if (!list_empty(&ls->complete)) {
290 lp = list_entry(ls->complete.next, struct gdlm_lock,
291 clist);
292 list_del_init(&lp->clist);
293 complete = 1;
294 } else if (!list_empty(&ls->submit)) {
295 lp = list_entry(ls->submit.next, struct gdlm_lock,
296 delay_list);
297 list_del_init(&lp->delay_list);
298 submit = 1;
299 }
300
301 drop = check_drop(ls);
302 spin_unlock(&ls->async_lock);
303
304 if (complete)
305 process_complete(lp);
306
307 else if (blocking)
308 process_blocking(lp, blocking);
309
310 else if (submit)
311 gdlm_do_lock(lp);
312
313 if (drop)
314 ls->fscb(ls->fsdata, LM_CB_DROPLOCKS, NULL);
315
316 schedule();
317 }
318
319 return 0;
320}
321
322int gdlm_init_threads(struct gdlm_ls *ls)
323{
324 struct task_struct *p;
325 int error;
326
327 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
328 if (IS_ERR(p)) {
329 error = PTR_ERR(p);
330 log_error("can't start lock_dlm1 thread %d", error);
331 return error;
332 }
333 ls->thread1 = p;
334
335 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
336 if (IS_ERR(p)) {
337 error = PTR_ERR(p);
338 log_error("can't start lock_dlm2 thread %d", error);
339 kthread_stop(ls->thread1);
340 return error;
341 }
342 ls->thread2 = p;
343
344 return 0;
345}
346
347void gdlm_release_threads(struct gdlm_ls *ls)
348{
349 kthread_stop(ls->thread1);
350 kthread_stop(ls->thread2);
351}
352
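The consumer loop above drains three lists that something else must fill: the DLM completion (ast) and blocking (bast) callbacks. Those live in lock.c, outside this hunk, so the following is only a sketch of the producer side, written to be consistent with the list and wait-queue usage in gdlm_thread().

/* Sketch of the ast/bast producers assumed by gdlm_thread(). */
static void gdlm_ast_sketch(void *astarg)
{
	struct gdlm_lock *lp = astarg;
	struct gdlm_ls *ls = lp->ls;

	spin_lock(&ls->async_lock);
	list_add_tail(&lp->clist, &ls->complete);  /* -> process_complete() */
	spin_unlock(&ls->async_lock);
	wake_up(&ls->thread_wait);
}

static void gdlm_bast_sketch(void *astarg, int mode)
{
	struct gdlm_lock *lp = astarg;
	struct gdlm_ls *ls = lp->ls;

	spin_lock(&ls->async_lock);
	if (!lp->bast_mode) {
		lp->bast_mode = mode;	/* consumed only by thread1 */
		list_add_tail(&lp->blist, &ls->blocking);
	}
	spin_unlock(&ls->async_lock);
	wake_up(&ls->thread_wait);
}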
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..cdadf956c831
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..97ffac5cdefb
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,259 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17
18#include "../../lm_interface.h"
19
20struct nolock_lockspace {
21 unsigned int nl_lvb_size;
22};
23
24static struct lm_lockops nolock_ops;
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, lm_fsdata_t *fsdata,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj)
31{
32 char *c;
33 unsigned int jid;
34 struct nolock_lockspace *nl;
35
36 c = strstr(host_data, "jid=");
37 if (!c)
38 jid = 0;
39 else {
40 c += 4;
41 sscanf(c, "%u", &jid);
42 }
43
44 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
45 if (!nl)
46 return -ENOMEM;
47
48 nl->nl_lvb_size = min_lvb_size;
49
50 lockstruct->ls_jid = jid;
51 lockstruct->ls_first = 1;
52 lockstruct->ls_lvb_size = min_lvb_size;
53 lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
54 lockstruct->ls_ops = &nolock_ops;
55 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
56
57 return 0;
58}
59
60static void nolock_others_may_mount(lm_lockspace_t *lockspace)
61{
62}
63
64static void nolock_unmount(lm_lockspace_t *lockspace)
65{
66 struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
67 kfree(nl);
68}
69
70static void nolock_withdraw(lm_lockspace_t *lockspace)
71{
72}
73
74/**
75 * nolock_get_lock - get a lm_lock_t given a description of the lock
76 * @lockspace: the lockspace the lock lives in
77 * @name: the name of the lock
78 * @lockp: return the lm_lock_t here
79 *
80 * Returns: 0 on success, -EXXX on failure
81 */
82
83static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
84 lm_lock_t **lockp)
85{
86 *lockp = (lm_lock_t *)lockspace;
87 return 0;
88}
89
90/**
91 * nolock_put_lock - get rid of a lock structure
92 * @lock: the lock to throw away
93 *
94 */
95
96static void nolock_put_lock(lm_lock_t *lock)
97{
98}
99
100/**
101 * nolock_lock - acquire a lock
102 * @lock: the lock to manipulate
103 * @cur_state: the current state
104 * @req_state: the requested state
105 * @flags: modifier flags
106 *
107 * Returns: A bitmap of LM_OUT_*
108 */
109
110static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state,
111 unsigned int req_state, unsigned int flags)
112{
113 return req_state | LM_OUT_CACHEABLE;
114}
115
116/**
117 * nolock_unlock - unlock a lock
118 * @lock: the lock to manipulate
119 * @cur_state: the current state
120 *
121 * Returns: 0
122 */
123
124static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
125{
126 return 0;
127}
128
129static void nolock_cancel(lm_lock_t *lock)
130{
131}
132
133/**
134 * nolock_hold_lvb - hold on to a lock value block
135 * @lock: the lock the LVB is associated with
136 * @lvbp: return the lm_lvb_t here
137 *
138 * Returns: 0 on success, -EXXX on failure
139 */
140
141static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
142{
143 struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
144 int error = 0;
145
146 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
147 if (!*lvbp)
148 error = -ENOMEM;
149
150 return error;
151}
152
153/**
154 * nolock_unhold_lvb - release a LVB
155 * @lock: the lock the LVB is associated with
156 * @lvb: the lock value block
157 *
158 */
159
160static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
161{
162 kfree(lvb);
163}
164
165/**
166 * nolock_sync_lvb - sync out the value of a lvb
167 * @lock: the lock the LVB is associated with
168 * @lvb: the lock value block
169 *
170 */
171
172static void nolock_sync_lvb(lm_lock_t *lock, char *lvb)
173{
174}
175
176static int nolock_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
177 struct file *file, struct file_lock *fl)
178{
179 struct file_lock tmp;
180 int ret;
181
182 ret = posix_test_lock(file, fl, &tmp);
183 fl->fl_type = F_UNLCK;
184 if (ret)
185 memcpy(fl, &tmp, sizeof(struct file_lock));
186
187 return 0;
188}
189
190static int nolock_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error;
194 error = posix_lock_file_wait(file, fl);
195 return error;
196}
197
198static int nolock_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 error = posix_lock_file_wait(file, fl);
203 return error;
204}
205
206static void nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
207 unsigned int message)
208{
209}
210
211static struct lm_lockops nolock_ops = {
212 .lm_proto_name = "lock_nolock",
213 .lm_mount = nolock_mount,
214 .lm_others_may_mount = nolock_others_may_mount,
215 .lm_unmount = nolock_unmount,
216 .lm_withdraw = nolock_withdraw,
217 .lm_get_lock = nolock_get_lock,
218 .lm_put_lock = nolock_put_lock,
219 .lm_lock = nolock_lock,
220 .lm_unlock = nolock_unlock,
221 .lm_cancel = nolock_cancel,
222 .lm_hold_lvb = nolock_hold_lvb,
223 .lm_unhold_lvb = nolock_unhold_lvb,
224 .lm_sync_lvb = nolock_sync_lvb,
225 .lm_plock_get = nolock_plock_get,
226 .lm_plock = nolock_plock,
227 .lm_punlock = nolock_punlock,
228 .lm_recovery_done = nolock_recovery_done,
229 .lm_owner = THIS_MODULE,
230};
231
232static int __init init_nolock(void)
233{
234 int error;
235
236 error = gfs_register_lockproto(&nolock_ops);
237 if (error) {
238 printk(KERN_WARNING
239 "lock_nolock: can't register protocol: %d\n", error);
240 return error;
241 }
242
243 printk(KERN_INFO
244 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
245 return 0;
246}
247
248static void __exit exit_nolock(void)
249{
250 gfs_unregister_lockproto(&nolock_ops);
251}
252
253module_init(init_nolock);
254module_exit(exit_nolock);
255
256MODULE_DESCRIPTION("GFS Nolock Locking Module");
257MODULE_AUTHOR("Red Hat, Inc.");
258MODULE_LICENSE("GPL");
259
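Because there is no remote node to conflict with, every request under lock_nolock is granted synchronously in exactly the requested state and flagged cacheable, which is what makes it suitable for single-node mounts. An illustration of the lm_lock contract (nolock_lock is static, so this only shows the op's behavior, not an external call site):

/* Illustration of the lm_lock contract under lock_nolock: */
unsigned int ret = nolock_lock(lock, LM_ST_UNLOCKED, LM_ST_EXCLUSIVE, 0);
/* ret == (LM_ST_EXCLUSIVE | LM_OUT_CACHEABLE): granted immediately,
 * and GFS2 may cache data under the lock. */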
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..2a8b4b71dd1f
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,598 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "util.h"
27#include "dir.h"
28
29#define PULL 1
30
31/**
32 * gfs2_struct2blk - compute the number of log blocks needed for structures
33 * @sdp: the filesystem
34 * @nstruct: the number of structures
35 * @ssize: the size of the structures
36 *
37 * Compute the number of log descriptor blocks needed to hold a certain number
38 * of structures of a certain size.
39 *
40 * Returns: the number of blocks needed (minimum is always 1)
41 */
42
43unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
44 unsigned int ssize)
45{
46 unsigned int blks;
47 unsigned int first, second;
48
49 blks = 1;
50 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) /
51 ssize;
52
53 if (nstruct > first) {
54 second = (sdp->sd_sb.sb_bsize -
55 sizeof(struct gfs2_meta_header)) / ssize;
56 blks += DIV_ROUND_UP(nstruct - first, second);
57 }
58
59 return blks;
60}
61
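A worked example, assuming the usual on-disk sizes (72 bytes for struct gfs2_log_descriptor, 24 bytes for struct gfs2_meta_header), 4096-byte blocks, and 8-byte revoke entries:

	first  = (4096 - 72) / 8 = 503   entries in the descriptor block
	second = (4096 - 24) / 8 = 509   entries per continuation block

	gfs2_struct2blk(sdp, 1000, 8) = 1 + DIV_ROUND_UP(1000 - 503, 509) = 2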
62void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
63{
64 struct list_head *head = &sdp->sd_ail1_list;
65 uint64_t sync_gen;
66 struct list_head *first, *tmp;
67 struct gfs2_ail *first_ai, *ai;
68
69 gfs2_log_lock(sdp);
70 if (list_empty(head)) {
71 gfs2_log_unlock(sdp);
72 return;
73 }
74 sync_gen = sdp->sd_ail_sync_gen++;
75
76 first = head->prev;
77 first_ai = list_entry(first, struct gfs2_ail, ai_list);
78 first_ai->ai_sync_gen = sync_gen;
79 gfs2_ail1_start_one(sdp, first_ai);
80
81 if (flags & DIO_ALL)
82 first = NULL;
83
84 for (;;) {
85 if (first && (head->prev != first ||
86 gfs2_ail1_empty_one(sdp, first_ai, 0)))
87 break;
88
89 for (tmp = head->prev; tmp != head; tmp = tmp->prev) {
90 ai = list_entry(tmp, struct gfs2_ail, ai_list);
91 if (ai->ai_sync_gen >= sync_gen)
92 continue;
93 ai->ai_sync_gen = sync_gen;
94 gfs2_ail1_start_one(sdp, ai);
95 break;
96 }
97
98 if (tmp == head)
99 break;
100 }
101
102 gfs2_log_unlock(sdp);
103}
104
105int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
106{
107 struct gfs2_ail *ai, *s;
108 int ret;
109
110 gfs2_log_lock(sdp);
111
112 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
113 if (gfs2_ail1_empty_one(sdp, ai, flags))
114 list_move(&ai->ai_list, &sdp->sd_ail2_list);
115 else if (!(flags & DIO_ALL))
116 break;
117 }
118
119 ret = list_empty(&sdp->sd_ail1_list);
120
121 gfs2_log_unlock(sdp);
122
123 return ret;
124}
125
126static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
127{
128 struct gfs2_ail *ai, *safe;
129 unsigned int old_tail = sdp->sd_log_tail;
130 int wrap = (new_tail < old_tail);
131 int a, b, rm;
132
133 gfs2_log_lock(sdp);
134
135 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
136 a = (old_tail <= ai->ai_first);
137 b = (ai->ai_first < new_tail);
138 rm = (wrap) ? (a || b) : (a && b);
139 if (!rm)
140 continue;
141
142 gfs2_ail2_empty_one(sdp, ai);
143 list_del(&ai->ai_list);
144 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
145 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
146 kfree(ai);
147 }
148
149 gfs2_log_unlock(sdp);
150}
151
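The wrap test treats the journal as a circle. As a concrete case with an 8192-block journal: if old_tail = 8000 and new_tail = 50, the tail has wrapped, so any region with ai_first >= 8000 or ai_first < 50 satisfies (a || b) and is reclaimed; without wrap, say old_tail = 100 and new_tail = 500, only regions with 100 <= ai_first < 500 (a && b) are reclaimed.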
152/**
153 * gfs2_log_reserve - Make a log reservation
154 * @sdp: The GFS2 superblock
155 * @blks: The number of blocks to reserve
156 *
157 * Returns: errno
158 */
159
160int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
161{
162 unsigned int try = 0;
163
164 if (gfs2_assert_warn(sdp, blks) ||
165 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
166 return -EINVAL;
167
168 mutex_lock(&sdp->sd_log_reserve_mutex);
169 gfs2_log_lock(sdp);
170 while(sdp->sd_log_blks_free <= blks) {
171 gfs2_log_unlock(sdp);
172 gfs2_ail1_empty(sdp, 0);
173 gfs2_log_flush(sdp, NULL);
174
175 if (try++)
176 gfs2_ail1_start(sdp, 0);
177 gfs2_log_lock(sdp);
178 }
179 sdp->sd_log_blks_free -= blks;
180 /* printk(KERN_INFO "reserved %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
181 gfs2_log_unlock(sdp);
182 mutex_unlock(&sdp->sd_log_reserve_mutex);
183
184 down_read(&sdp->sd_log_flush_lock);
185
186 return 0;
187}
188
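Note the locking asymmetry here: gfs2_log_reserve() returns with sd_log_flush_lock held for read, and the matching up_read() is performed later by either gfs2_log_release() or gfs2_log_commit(). A sketch of the intended calling pattern; the real glue lives in the transaction code, so the pairing below is inferred from the lock operations in this file.

/* Sketch: a transaction's interaction with the log, as inferred
 * from the down_read()/up_read() calls in this file. */
error = gfs2_log_reserve(sdp, blks);	/* takes sd_log_flush_lock (read) */
if (error)
	return error;
/* ... add buffers/revokes to the transaction; no log flush can
 * run concurrently while the read lock is held ... */
gfs2_log_commit(sdp, tr);		/* commits and drops the read lock */
/* error paths that never commit instead call: */
gfs2_log_release(sdp, blks);		/* refunds blocks, drops read lock */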
189/**
190 * gfs2_log_release - Release a given number of log blocks
191 * @sdp: The GFS2 superblock
192 * @blks: The number of blocks
193 *
194 */
195
196void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
197{
198
199 gfs2_log_lock(sdp);
200 sdp->sd_log_blks_free += blks;
201 /* printk(KERN_INFO "released %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
202 gfs2_assert_withdraw(sdp,
203 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
204 gfs2_log_unlock(sdp);
205 up_read(&sdp->sd_log_flush_lock);
206}
207
208static uint64_t log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
209{
210 int new = 0;
211 uint64_t dbn;
212 int error;
213 int bdy;
214
215 error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, &new, &dbn, &bdy);
216 gfs2_assert_withdraw(sdp, !error && dbn);
217
218 return dbn;
219}
220
221/**
222 * log_distance - Compute distance between two journal blocks
223 * @sdp: The GFS2 superblock
224 * @newer: The most recent journal block of the pair
225 * @older: The older journal block of the pair
226 *
227 * Compute the distance (in the journal direction) between two
228 * blocks in the journal
229 *
230 * Returns: the distance in blocks
231 */
232
233static inline unsigned int log_distance(struct gfs2_sbd *sdp,
234 unsigned int newer,
235 unsigned int older)
236{
237 int dist;
238
239 dist = newer - older;
240 if (dist < 0)
241 dist += sdp->sd_jdesc->jd_blocks;
242
243 return dist;
244}
245
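For example, with an 8192-block journal, log_distance(sdp, 10, 8100) computes 10 - 8100 = -8090, which the wrap correction turns into -8090 + 8192 = 102: the newer block sits 102 blocks past the older one in the journal's direction of travel.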
246static unsigned int current_tail(struct gfs2_sbd *sdp)
247{
248 struct gfs2_ail *ai;
249 unsigned int tail;
250
251 gfs2_log_lock(sdp);
252
253 if (list_empty(&sdp->sd_ail1_list))
254 tail = sdp->sd_log_head;
255 else {
256 ai = list_entry(sdp->sd_ail1_list.prev,
257 struct gfs2_ail, ai_list);
258 tail = ai->ai_first;
259 }
260
261 gfs2_log_unlock(sdp);
262
263 return tail;
264}
265
266static inline void log_incr_head(struct gfs2_sbd *sdp)
267{
268 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
269 gfs2_assert_withdraw(sdp,
270 sdp->sd_log_flush_head == sdp->sd_log_head);
271
272 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
273 sdp->sd_log_flush_head = 0;
274 sdp->sd_log_flush_wrapped = 1;
275 }
276}
277
278/**
279 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
280 * @sdp: The GFS2 superblock
281 *
282 * Returns: the buffer_head
283 */
284
285struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
286{
287 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
288 struct gfs2_log_buf *lb;
289 struct buffer_head *bh;
290
291 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
292 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
293
294 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
295 lock_buffer(bh);
296 memset(bh->b_data, 0, bh->b_size);
297 set_buffer_uptodate(bh);
298 clear_buffer_dirty(bh);
299 unlock_buffer(bh);
300
301 log_incr_head(sdp);
302
303 return bh;
304}
305
306/**
307 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
308 * @sdp: the filesystem
309 * @real: the buffer head whose data is to be written to the log
310 *
311 * Returns: the fake buffer_head aliasing @real's data
312 */
313
314struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
315 struct buffer_head *real)
316{
317 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
318 struct gfs2_log_buf *lb;
319 struct buffer_head *bh;
320
321 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
322 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
323 lb->lb_real = real;
324
325 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
326 atomic_set(&bh->b_count, 1);
327 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
328 set_bh_page(bh, real->b_page, bh_offset(real));
329 bh->b_blocknr = blkno;
330 bh->b_size = sdp->sd_sb.sb_bsize;
331 bh->b_bdev = sdp->sd_vfs->s_bdev;
332
333 log_incr_head(sdp);
334
335 return bh;
336}
337
338static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
339{
340 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
341
342 ail2_empty(sdp, new_tail);
343
344 gfs2_log_lock(sdp);
345 sdp->sd_log_blks_free += dist - ((pull) ? 1 : 0);
346 /* printk(KERN_INFO "pull tail refunding %u blocks (%u left) pull=%d\n", dist - ((pull) ? 1 : 0), sdp->sd_log_blks_free, pull); */
347 gfs2_assert_withdraw(sdp,
348 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
349 gfs2_log_unlock(sdp);
350
351 sdp->sd_log_tail = new_tail;
352}
353
354/**
355 * log_write_header - Get and initialize a journal header buffer
356 * @sdp: The GFS2 superblock
357 *
358 * Returns: the initialized log buffer descriptor
359 */
360
361static void log_write_header(struct gfs2_sbd *sdp, uint32_t flags, int pull)
362{
363 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
364 struct buffer_head *bh;
365 struct gfs2_log_header *lh;
366 unsigned int tail;
367 uint32_t hash;
368
369 /* printk(KERN_INFO "log write header start (flags=%08x, pull=%d)\n", flags, pull); */
370
371 bh = sb_getblk(sdp->sd_vfs, blkno);
372 lock_buffer(bh);
373 memset(bh->b_data, 0, bh->b_size);
374 set_buffer_uptodate(bh);
375 clear_buffer_dirty(bh);
376 unlock_buffer(bh);
377
378 gfs2_ail1_empty(sdp, 0);
379 tail = current_tail(sdp);
380
381 lh = (struct gfs2_log_header *)bh->b_data;
382 memset(lh, 0, sizeof(struct gfs2_log_header));
383 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
384 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
385 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
386 lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
387 lh->lh_flags = cpu_to_be32(flags);
388 lh->lh_tail = cpu_to_be32(tail);
389 lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
390 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
391 lh->lh_hash = cpu_to_be32(hash);
392
393 set_buffer_dirty(bh);
394 if (sync_dirty_buffer(bh))
395 gfs2_io_error_bh(sdp, bh);
396 brelse(bh);
397
398 if (sdp->sd_log_tail != tail)
399 log_pull_tail(sdp, tail, pull);
400 else
401 gfs2_assert_withdraw(sdp, !pull);
402
403 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
404 log_incr_head(sdp);
405
406 /* printk(KERN_INFO "log write header out\n"); */
407}
408
409static void log_flush_commit(struct gfs2_sbd *sdp)
410{
411 struct list_head *head = &sdp->sd_log_flush_list;
412 struct gfs2_log_buf *lb;
413 struct buffer_head *bh;
414#if 0
415 unsigned int d;
416
417 d = log_distance(sdp, sdp->sd_log_flush_head, sdp->sd_log_head);
418
419 gfs2_assert_withdraw(sdp, d + 1 == sdp->sd_log_blks_reserved);
420#endif
421
422 while (!list_empty(head)) {
423 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
424 list_del(&lb->lb_list);
425 bh = lb->lb_bh;
426
427 wait_on_buffer(bh);
428 if (!buffer_uptodate(bh))
429 gfs2_io_error_bh(sdp, bh);
430 if (lb->lb_real) {
431 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
432 schedule();
433 free_buffer_head(bh);
434 } else
435 brelse(bh);
436 kfree(lb);
437 }
438
439 log_write_header(sdp, 0, 0);
440}
441
442/**
443 * gfs2_log_flush - flush incore transaction(s)
444 * @sdp: the filesystem
445 * @gl: The glock structure to flush. If NULL, flush the whole incore log
446 *
447 */
448
449void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
450{
451 struct gfs2_ail *ai;
452
453 down_write(&sdp->sd_log_flush_lock);
454
455 if (gl) {
456 gfs2_log_lock(sdp);
457 if (list_empty(&gl->gl_le.le_list)) {
458 gfs2_log_unlock(sdp);
459 up_write(&sdp->sd_log_flush_lock);
460 return;
461 }
462 gfs2_log_unlock(sdp);
463 }
464
465 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
466 INIT_LIST_HEAD(&ai->ai_ail1_list);
467 INIT_LIST_HEAD(&ai->ai_ail2_list);
468
469 gfs2_assert_withdraw(sdp,
470 sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
471 gfs2_assert_withdraw(sdp,
472 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
473
474 sdp->sd_log_flush_head = sdp->sd_log_head;
475 sdp->sd_log_flush_wrapped = 0;
476 ai->ai_first = sdp->sd_log_flush_head;
477
478 lops_before_commit(sdp);
479 if (!list_empty(&sdp->sd_log_flush_list))
480 log_flush_commit(sdp);
481 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
482 log_write_header(sdp, 0, PULL);
483 lops_after_commit(sdp, ai);
484 sdp->sd_log_head = sdp->sd_log_flush_head;
485
486 /* printk(KERN_INFO "sd_log_num_hdrs %u\n", sdp->sd_log_num_hdrs); */
487 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
488
489 sdp->sd_log_blks_reserved =
490 sdp->sd_log_commited_buf =
491 sdp->sd_log_num_hdrs =
492 sdp->sd_log_commited_revoke = 0;
493
494 gfs2_log_lock(sdp);
495 if (!list_empty(&ai->ai_ail1_list)) {
496 list_add(&ai->ai_list, &sdp->sd_ail1_list);
497 ai = NULL;
498 }
499 gfs2_log_unlock(sdp);
500
501 sdp->sd_vfs->s_dirt = 0;
502 up_write(&sdp->sd_log_flush_lock);
503
504 kfree(ai);
505}
506
507static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
508{
509 unsigned int reserved = 1;
510 unsigned int old;
511
512 gfs2_log_lock(sdp);
513
514 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
515 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
516 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
517 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
518
519 if (sdp->sd_log_commited_buf)
520 reserved += sdp->sd_log_commited_buf;
521 if (sdp->sd_log_commited_revoke)
522 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
523 sizeof(uint64_t));
524
525 old = sdp->sd_log_blks_free;
526 sdp->sd_log_blks_free += tr->tr_reserved -
527 (reserved - sdp->sd_log_blks_reserved);
528
529 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
530 gfs2_assert_withdraw(sdp,
531 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
532 sdp->sd_log_num_hdrs);
533
534 sdp->sd_log_blks_reserved = reserved;
535
536 gfs2_log_unlock(sdp);
537}
538
539/**
540 * gfs2_log_commit - Commit a transaction to the log
541 * @sdp: the filesystem
542 * @tr: the transaction
543 *
545 */
546
547void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
548{
549 log_refund(sdp, tr);
550 lops_incore_commit(sdp, tr);
551
552 sdp->sd_vfs->s_dirt = 1;
553 up_read(&sdp->sd_log_flush_lock);
554
555 gfs2_log_lock(sdp);
556 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
557 gfs2_log_unlock(sdp);
558 gfs2_log_flush(sdp, NULL);
559 } else
560 gfs2_log_unlock(sdp);
561}
562
563/**
564 * gfs2_log_shutdown - write a shutdown header into a journal
565 * @sdp: the filesystem
566 *
567 */
568
569void gfs2_log_shutdown(struct gfs2_sbd *sdp)
570{
571 down_write(&sdp->sd_log_flush_lock);
572
573 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
574 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
575 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
576 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
577 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
578 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
579 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
580 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
581 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
582
583 sdp->sd_log_flush_head = sdp->sd_log_head;
584 sdp->sd_log_flush_wrapped = 0;
585
586 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
587
588 /* printk(KERN_INFO "sd_log_blks_free %u, sd_jdesc->jd_blocks %u\n", sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); */
589 gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
590 gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
591 gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
592
593 sdp->sd_log_head = sdp->sd_log_flush_head;
594 sdp->sd_log_tail = sdp->sd_log_head;
595
596 up_write(&sdp->sd_log_flush_lock);
597}
598
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..8cfd0f1d29f8
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,61 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13/**
14 * gfs2_log_lock - acquire the right to mess with the log manager
15 * @sdp: the filesystem
16 *
17 */
18
19static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
20{
21 spin_lock(&sdp->sd_log_lock);
22}
23
24/**
25 * gfs2_log_unlock - release the right to mess with the log manager
26 * @sdp: the filesystem
27 *
28 */
29
30static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
31{
32 spin_unlock(&sdp->sd_log_lock);
33}
34
35static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
36 unsigned int value)
37{
38 if (++value == sdp->sd_jdesc->jd_blocks)
39 value = 0;
41 sdp->sd_log_head = sdp->sd_log_tail = value;
42}
43
44unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
45 unsigned int ssize);
46
47void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
48int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
49
50int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
51void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
52
53struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
54struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
55 struct buffer_head *real);
56void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
57void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
58
59void gfs2_log_shutdown(struct gfs2_sbd *sdp);
60
61#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..e4c75a74df5b
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,804 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "glock.h"
21#include "log.h"
22#include "lops.h"
23#include "meta_io.h"
24#include "recovery.h"
25#include "rgrp.h"
26#include "trans.h"
27#include "util.h"
28
29static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
30{
31 struct gfs2_glock *gl;
32 struct gfs2_trans *tr = current->journal_info;
33
34 tr->tr_touched = 1;
35
36 if (!list_empty(&le->le_list))
37 return;
38
39 gl = container_of(le, struct gfs2_glock, gl_le);
40 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
41 return;
42 gfs2_glock_hold(gl);
43 set_bit(GLF_DIRTY, &gl->gl_flags);
44
45 gfs2_log_lock(sdp);
46 sdp->sd_log_num_gl++;
47 list_add(&le->le_list, &sdp->sd_log_le_gl);
48 gfs2_log_unlock(sdp);
49}
50
51static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 struct list_head *head = &sdp->sd_log_le_gl;
54 struct gfs2_glock *gl;
55
56 while (!list_empty(head)) {
57 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
58 list_del_init(&gl->gl_le.le_list);
59 sdp->sd_log_num_gl--;
60
61 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
62 gfs2_glock_put(gl);
63 }
64 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
65}
66
67static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
68{
69 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
70 struct gfs2_trans *tr;
71
72 if (!list_empty(&bd->bd_list_tr))
73 return;
74
75 tr = current->journal_info;
76 tr->tr_touched = 1;
77 tr->tr_num_buf++;
78 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
79
80 if (!list_empty(&le->le_list))
81 return;
82
83 gfs2_trans_add_gl(bd->bd_gl);
84
85 gfs2_meta_check(sdp, bd->bd_bh);
86 gfs2_pin(sdp, bd->bd_bh);
87
88 gfs2_log_lock(sdp);
89 sdp->sd_log_num_buf++;
90 list_add(&le->le_list, &sdp->sd_log_le_buf);
91 gfs2_log_unlock(sdp);
92
93 tr->tr_num_buf_new++;
94}
95
96static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
97{
98 struct list_head *head = &tr->tr_list_buf;
99 struct gfs2_bufdata *bd;
100
101 while (!list_empty(head)) {
102 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
103 list_del_init(&bd->bd_list_tr);
104 tr->tr_num_buf--;
105 }
106 gfs2_assert_warn(sdp, !tr->tr_num_buf);
107}
108
109static void buf_lo_before_commit(struct gfs2_sbd *sdp)
110{
111 struct buffer_head *bh;
112 struct gfs2_log_descriptor *ld;
113 struct gfs2_bufdata *bd1 = NULL, *bd2;
114 unsigned int total = sdp->sd_log_num_buf;
115 unsigned int offset = sizeof(struct gfs2_log_descriptor);
116 unsigned int limit;
117 unsigned int num;
118 unsigned n;
119 __be64 *ptr;
120
121 offset += (sizeof(__be64) - 1);
122 offset &= ~(sizeof(__be64) - 1);
123 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
124 /* for 4k blocks, limit = 503 */
125
126 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
127 while(total) {
128 num = total;
129 if (total > limit)
130 num = limit;
131 bh = gfs2_log_get_buf(sdp);
132 sdp->sd_log_num_hdrs++;
133 ld = (struct gfs2_log_descriptor *)bh->b_data;
134 ptr = (__be64 *)(bh->b_data + offset);
135 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
136 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
137 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
138 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
139 ld->ld_length = cpu_to_be32(num + 1);
140 ld->ld_data1 = cpu_to_be32(num);
141 ld->ld_data2 = cpu_to_be32(0);
142 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
143
144 n = 0;
145 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
146 bd_le.le_list) {
147 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
148 if (++n >= num)
149 break;
150 }
151
152 set_buffer_dirty(bh);
153 ll_rw_block(WRITE, 1, &bh);
154
155 n = 0;
156 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
157 bd_le.le_list) {
158 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
159 set_buffer_dirty(bh);
160 ll_rw_block(WRITE, 1, &bh);
161 if (++n >= num)
162 break;
163 }
164
165 total -= num;
166 }
167}
168
169static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
170{
171 struct list_head *head = &sdp->sd_log_le_buf;
172 struct gfs2_bufdata *bd;
173
174 while (!list_empty(head)) {
175 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
176 list_del_init(&bd->bd_le.le_list);
177 sdp->sd_log_num_buf--;
178
179 gfs2_unpin(sdp, bd->bd_bh, ai);
180 }
181 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
182}
183
184static void buf_lo_before_scan(struct gfs2_jdesc *jd,
185 struct gfs2_log_header *head, int pass)
186{
187 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
188 struct gfs2_sbd *sdp = ip->i_sbd;
189
190 if (pass != 0)
191 return;
192
193 sdp->sd_found_blocks = 0;
194 sdp->sd_replayed_blocks = 0;
195}
196
197static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
198 struct gfs2_log_descriptor *ld, __be64 *ptr,
199 int pass)
200{
201 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
202 struct gfs2_sbd *sdp = ip->i_sbd;
203 struct gfs2_glock *gl = ip->i_gl;
204 unsigned int blks = be32_to_cpu(ld->ld_data1);
205 struct buffer_head *bh_log, *bh_ip;
206 uint64_t blkno;
207 int error = 0;
208
209 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
210 return 0;
211
212 gfs2_replay_incr_blk(sdp, &start);
213
214 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
215 blkno = be64_to_cpu(*ptr++);
216
217 sdp->sd_found_blocks++;
218
219 if (gfs2_revoke_check(sdp, blkno, start))
220 continue;
221
222 error = gfs2_replay_read_block(jd, start, &bh_log);
223 if (error)
224 return error;
225
226 bh_ip = gfs2_meta_new(gl, blkno);
227 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
228
229 if (gfs2_meta_check(sdp, bh_ip))
230 error = -EIO;
231 else
232 mark_buffer_dirty(bh_ip);
233
234 brelse(bh_log);
235 brelse(bh_ip);
236
237 if (error)
238 break;
239
240 sdp->sd_replayed_blocks++;
241 }
242
243 return error;
244}
245
246static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
247{
248 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
249 struct gfs2_sbd *sdp = ip->i_sbd;
250
251 if (error) {
252 gfs2_meta_sync(ip->i_gl,
253 DIO_START | DIO_WAIT);
254 return;
255 }
256 if (pass != 1)
257 return;
258
259 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
260
261 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
262 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
263}
264
265static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
266{
267 struct gfs2_trans *tr;
268
269 tr = current->journal_info;
270 tr->tr_touched = 1;
271 tr->tr_num_revoke++;
272
273 gfs2_log_lock(sdp);
274 sdp->sd_log_num_revoke++;
275 list_add(&le->le_list, &sdp->sd_log_le_revoke);
276 gfs2_log_unlock(sdp);
277}
278
279static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
280{
281 struct gfs2_log_descriptor *ld;
282 struct gfs2_meta_header *mh;
283 struct buffer_head *bh;
284 unsigned int offset;
285 struct list_head *head = &sdp->sd_log_le_revoke;
286 struct gfs2_revoke *rv;
287
288 if (!sdp->sd_log_num_revoke)
289 return;
290
291 bh = gfs2_log_get_buf(sdp);
292 ld = (struct gfs2_log_descriptor *)bh->b_data;
293 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
294 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
295 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
296 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
297 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
298 sizeof(uint64_t)));
299 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
300 ld->ld_data2 = cpu_to_be32(0);
301 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
302 offset = sizeof(struct gfs2_log_descriptor);
303
304 while (!list_empty(head)) {
305 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
306 list_del_init(&rv->rv_le.le_list);
307 sdp->sd_log_num_revoke--;
308
309 if (offset + sizeof(uint64_t) > sdp->sd_sb.sb_bsize) {
310 set_buffer_dirty(bh);
311 ll_rw_block(WRITE, 1, &bh);
312
313 bh = gfs2_log_get_buf(sdp);
314 mh = (struct gfs2_meta_header *)bh->b_data;
315 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
316 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
317 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
318 offset = sizeof(struct gfs2_meta_header);
319 }
320
321 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
322 kfree(rv);
323
324 offset += sizeof(uint64_t);
325 }
326 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
327
328 set_buffer_dirty(bh);
329 ll_rw_block(WRITE, 1, &bh);
330}
331
332static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
333 struct gfs2_log_header *head, int pass)
334{
335 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
336 struct gfs2_sbd *sdp = ip->i_sbd;
337
338 if (pass != 0)
339 return;
340
341 sdp->sd_found_revokes = 0;
342 sdp->sd_replay_tail = head->lh_tail;
343}
344
345static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
346 struct gfs2_log_descriptor *ld, __be64 *ptr,
347 int pass)
348{
349 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
350 struct gfs2_sbd *sdp = ip->i_sbd;
351 unsigned int blks = be32_to_cpu(ld->ld_length);
352 unsigned int revokes = be32_to_cpu(ld->ld_data1);
353 struct buffer_head *bh;
354 unsigned int offset;
355 uint64_t blkno;
356 int first = 1;
357 int error;
358
359 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
360 return 0;
361
362 offset = sizeof(struct gfs2_log_descriptor);
363
364 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
365 error = gfs2_replay_read_block(jd, start, &bh);
366 if (error)
367 return error;
368
369 if (!first)
370 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
371
372 while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) {
373 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
374
375 error = gfs2_revoke_add(sdp, blkno, start);
376 if (error < 0)
377 return error;
378 else if (error)
379 sdp->sd_found_revokes++;
380
381 if (!--revokes)
382 break;
383 offset += sizeof(uint64_t);
384 }
385
386 brelse(bh);
387 offset = sizeof(struct gfs2_meta_header);
388 first = 0;
389 }
390
391 return 0;
392}
393
394static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
395{
396 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
397 struct gfs2_sbd *sdp = ip->i_sbd;
398
399 if (error) {
400 gfs2_revoke_clean(sdp);
401 return;
402 }
403 if (pass != 1)
404 return;
405
406 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
407 jd->jd_jid, sdp->sd_found_revokes);
408
409 gfs2_revoke_clean(sdp);
410}
411
412static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
413{
414 struct gfs2_rgrpd *rgd;
415 struct gfs2_trans *tr = current->journal_info;
416
417 tr->tr_touched = 1;
418
419 if (!list_empty(&le->le_list))
420 return;
421
422 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
423 gfs2_rgrp_bh_hold(rgd);
424
425 gfs2_log_lock(sdp);
426 sdp->sd_log_num_rg++;
427 list_add(&le->le_list, &sdp->sd_log_le_rg);
428 gfs2_log_unlock(sdp);
429}
430
431static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
432{
433 struct list_head *head = &sdp->sd_log_le_rg;
434 struct gfs2_rgrpd *rgd;
435
436 while (!list_empty(head)) {
437 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
438 list_del_init(&rgd->rd_le.le_list);
439 sdp->sd_log_num_rg--;
440
441 gfs2_rgrp_repolish_clones(rgd);
442 gfs2_rgrp_bh_put(rgd);
443 }
444 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
445}
446
447/**
448 * databuf_lo_add - Add a databuf to the transaction.
449 *
450 * This is used in two distinct cases:
451 * i) In ordered write mode
452 * We put the data buffer on a list so that we can ensure that it is
453 * synced to disk at the right time
454 * ii) In journaled data mode
455 * We need to journal the data block in the same way as metadata in
456 * the functions above. The difference is that here we have a tag
457 * which is two __be64's being the block number (as per meta data)
458 * and a flag which says whether the data block needs escaping or
459 * not. This means we need a new log entry for each 251 or so data
460 * blocks, which isn't an enormous overhead but twice as much as
461 * for normal metadata blocks.
462 */
463static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
464{
465 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
466 struct gfs2_trans *tr = current->journal_info;
467 struct address_space *mapping = bd->bd_bh->b_page->mapping;
468 struct gfs2_inode *ip = mapping->host->u.generic_ip;
469
470 tr->tr_touched = 1;
471 if (!list_empty(&bd->bd_list_tr) &&
472 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
473 tr->tr_num_buf++;
474 gfs2_trans_add_gl(bd->bd_gl);
475 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
476 gfs2_pin(sdp, bd->bd_bh);
477 tr->tr_num_buf_new++;
478 }
479 gfs2_log_lock(sdp);
480 if (!list_empty(&le->le_list)) {
481 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
482 sdp->sd_log_num_jdata++;
483 sdp->sd_log_num_databuf++;
484 list_add(&le->le_list, &sdp->sd_log_le_databuf);
485 }
486 gfs2_log_unlock(sdp);
487}
488
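Each journaled-data entry in the descriptor payload is therefore a pair of big-endian 64-bit words, 16 bytes per block, which is where the "251 or so" per 4k descriptor in the comment above comes from. Shown as a struct purely for illustration (no such struct exists in the code; the log stores two raw __be64s):

/* Illustration only: on-disk layout of one jdata log tag. */
struct jdata_tag_sketch {
	__be64 blkno;	/* disk address of the journaled data block */
	__be64 esc;	/* non-zero: the block began with GFS2_MAGIC and
			 * its first word was zeroed ("escaped") in the log */
};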
489static int gfs2_check_magic(struct buffer_head *bh)
490{
491 struct page *page = bh->b_page;
492 void *kaddr;
493 __be32 *ptr;
494 int rv = 0;
495
496 kaddr = kmap_atomic(page, KM_USER0);
497 ptr = kaddr + bh_offset(bh);
498 if (*ptr == cpu_to_be32(GFS2_MAGIC))
499 rv = 1;
500 kunmap_atomic(page, KM_USER0);
501
502 return rv;
503}
504
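Escaping exists because replay recognizes journal metadata by a GFS2_MAGIC word at the start of a block: a data block that happens to begin with that value would be misread as a descriptor, so its first word is zeroed in the log copy and the tag's second word records the fact. The replay side (in the databuf scan path, past the end of this hunk) presumably restores it; a sketch:

/* Sketch of the replay-side unescape: if the tag's esc word is
 * set, the data block's first word is put back to GFS2_MAGIC. */
if (esc) {
	__be32 *eptr = (__be32 *)bh_ip->b_data;
	*eptr = cpu_to_be32(GFS2_MAGIC);
}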
505/**
506 * databuf_lo_before_commit - Scan the data buffers, writing as we go
507 *
508 * Here we scan through the lists of buffers and make the assumption
509 * that any buffer that's been pinned is being journaled, and that
510 * any unpinned buffer is an ordered write data buffer and therefore
511 * will be written back rather than journaled.
512 */
513static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
514{
515 LIST_HEAD(started);
516 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
517 struct buffer_head *bh = NULL;
518 unsigned int offset = sizeof(struct gfs2_log_descriptor);
519 struct gfs2_log_descriptor *ld;
520 unsigned int limit;
521 unsigned int total_dbuf = sdp->sd_log_num_databuf;
522 unsigned int total_jdata = sdp->sd_log_num_jdata;
523 unsigned int num, n;
524 __be64 *ptr = NULL;
525
526 offset += (2*sizeof(__be64) - 1);
527 offset &= ~(2*sizeof(__be64) - 1);
528 limit = (sdp->sd_sb.sb_bsize - offset)/(2 * sizeof(__be64));
529
530 /*
531 * Start writing ordered buffers, write journaled buffers
532 * into the log along with a header
533 */
534 gfs2_log_lock(sdp);
535 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
536 bd_le.le_list);
537 while(total_dbuf) {
538 num = total_jdata;
539 if (num > limit)
540 num = limit;
541 n = 0;
542 list_for_each_entry_safe_continue(bd1, bdt,
543 &sdp->sd_log_le_databuf,
544 bd_le.le_list) {
545 /* An ordered write buffer */
546 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
547 list_move(&bd1->bd_le.le_list, &started);
548 if (bd1 == bd2) {
549 bd2 = NULL;
550 bd2 = list_prepare_entry(bd2,
551 &sdp->sd_log_le_databuf,
552 bd_le.le_list);
553 }
554 total_dbuf--;
555 if (bd1->bd_bh) {
556 get_bh(bd1->bd_bh);
557 if (buffer_dirty(bd1->bd_bh)) {
558 gfs2_log_unlock(sdp);
559 wait_on_buffer(bd1->bd_bh);
560 ll_rw_block(WRITE, 1,
561 &bd1->bd_bh);
562 gfs2_log_lock(sdp);
563 }
564 brelse(bd1->bd_bh);
565 continue;
566 }
567 continue;
568 } else if (bd1->bd_bh) { /* A journaled buffer */
569 int magic;
570 gfs2_log_unlock(sdp);
571 if (!bh) {
572 bh = gfs2_log_get_buf(sdp);
573 sdp->sd_log_num_hdrs++;
574 ld = (struct gfs2_log_descriptor *)
575 bh->b_data;
576 ptr = (__be64 *)(bh->b_data + offset);
577 ld->ld_header.mh_magic =
578 cpu_to_be32(GFS2_MAGIC);
579 ld->ld_header.mh_type =
580 cpu_to_be32(GFS2_METATYPE_LD);
581 ld->ld_header.mh_format =
582 cpu_to_be32(GFS2_FORMAT_LD);
583 ld->ld_type =
584 cpu_to_be32(GFS2_LOG_DESC_JDATA);
585 ld->ld_length = cpu_to_be32(num + 1);
586 ld->ld_data1 = cpu_to_be32(num);
587 ld->ld_data2 = cpu_to_be32(0);
588 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
589 }
590 magic = gfs2_check_magic(bd1->bd_bh);
591 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
592 *ptr++ = cpu_to_be64((__u64)magic);
593 clear_buffer_escaped(bd1->bd_bh);
594 if (unlikely(magic != 0))
595 set_buffer_escaped(bd1->bd_bh);
596 gfs2_log_lock(sdp);
597 if (++n >= num)
598 break;
599 }
600 }
601 gfs2_log_unlock(sdp);
602 if (bh) {
603 set_buffer_dirty(bh);
604 ll_rw_block(WRITE, 1, &bh);
605 bh = NULL;
606 }
607 n = 0;
608 gfs2_log_lock(sdp);
609 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
610 bd_le.le_list) {
611 if (!bd2->bd_bh)
612 continue;
613 /* copy buffer if it needs escaping */
614 gfs2_log_unlock(sdp);
615 if (unlikely(buffer_escaped(bd2->bd_bh))) {
616 void *kaddr;
617 struct page *page = bd2->bd_bh->b_page;
618 bh = gfs2_log_get_buf(sdp);
619 kaddr = kmap_atomic(page, KM_USER0);
620 memcpy(bh->b_data,
621 kaddr + bh_offset(bd2->bd_bh),
622 sdp->sd_sb.sb_bsize);
623 kunmap_atomic(page, KM_USER0);
624 *(__be32 *)bh->b_data = 0;
625 } else {
626 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
627 }
628 set_buffer_dirty(bh);
629 ll_rw_block(WRITE, 1, &bh);
630 gfs2_log_lock(sdp);
631 if (++n >= num)
632 break;
633 }
634 bh = NULL;
635 total_dbuf -= num;
636 total_jdata -= num;
637 }
638 gfs2_log_unlock(sdp);
639
640 /* Wait on all ordered buffers */
641 while (!list_empty(&started)) {
642 gfs2_log_lock(sdp);
643 bd1 = list_entry(started.next, struct gfs2_bufdata,
644 bd_le.le_list);
645 list_del(&bd1->bd_le.le_list);
646 sdp->sd_log_num_databuf--;
647
648 bh = bd1->bd_bh;
649 if (bh) {
650 bh->b_private = NULL;
651 gfs2_log_unlock(sdp);
652 wait_on_buffer(bh);
653 brelse(bh);
654 } else
655 gfs2_log_unlock(sdp);
656
657 kfree(bd1);
658 }
659
660 /* We've removed all the ordered write bufs here, so only jdata left */
661 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
662}
663
664static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
665 struct gfs2_log_descriptor *ld,
666 __be64 *ptr, int pass)
667{
668 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
669 struct gfs2_sbd *sdp = ip->i_sbd;
670 struct gfs2_glock *gl = ip->i_gl;
671 unsigned int blks = be32_to_cpu(ld->ld_data1);
672 struct buffer_head *bh_log, *bh_ip;
673 uint64_t blkno;
674 uint64_t esc;
675 int error = 0;
676
677 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
678 return 0;
679
680 gfs2_replay_incr_blk(sdp, &start);
681 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
682 blkno = be64_to_cpu(*ptr++);
683 esc = be64_to_cpu(*ptr++);
684
685 sdp->sd_found_blocks++;
686
687 if (gfs2_revoke_check(sdp, blkno, start))
688 continue;
689
690 error = gfs2_replay_read_block(jd, start, &bh_log);
691 if (error)
692 return error;
693
694 bh_ip = gfs2_meta_new(gl, blkno);
695 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
696
697 /* Unescape */
698 if (esc) {
699 __be32 *eptr = (__be32 *)bh_ip->b_data;
700 *eptr = cpu_to_be32(GFS2_MAGIC);
701 }
702 mark_buffer_dirty(bh_ip);
703
704 brelse(bh_log);
705 brelse(bh_ip);
706 if (error)
707 break;
708
709 sdp->sd_replayed_blocks++;
710 }
711
712 return error;
713}
714
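/*
 * The escape mechanism shared by databuf_lo_before_commit() and
 * databuf_lo_scan_elements() exists because a journaled data block
 * that happens to begin with GFS2_MAGIC could be mistaken for
 * metadata when the log is scanned. A self-contained sketch of the
 * round trip, using an illustrative magic value and a 16-byte
 * "block": the write side zeroes the leading word and sets the esc
 * tag; replay restores the word when the tag is set.
 */
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <arpa/inet.h>	/* htonl() stands in for cpu_to_be32() */

#define TOY_MAGIC 0x01161970u

int main(void)
{
	unsigned char block[16] = {0};
	uint32_t magic = htonl(TOY_MAGIC);
	uint64_t esc;

	memcpy(block, &magic, sizeof(magic));	/* data mimics metadata */

	/* Write side: detect the collision, zero the word, flag the tag */
	esc = (memcmp(block, &magic, sizeof(magic)) == 0);
	if (esc)
		memset(block, 0, sizeof(uint32_t));

	/* Replay side: the esc tag tells us to put the magic back */
	if (esc)
		memcpy(block, &magic, sizeof(magic));

	assert(memcmp(block, &magic, sizeof(magic)) == 0);
	return 0;
}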
715/* FIXME: sort out accounting for log blocks etc. */
716
717static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
718{
719 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
720 struct gfs2_sbd *sdp = ip->i_sbd;
721
722 if (error) {
723 gfs2_meta_sync(ip->i_gl,
724 DIO_START | DIO_WAIT);
725 return;
726 }
727 if (pass != 1)
728 return;
729
730 /* data sync? */
731 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
732
733 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
734 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
735}
736
737static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
738{
739 struct list_head *head = &sdp->sd_log_le_databuf;
740 struct gfs2_bufdata *bd;
741
742 while (!list_empty(head)) {
743 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
744 list_del(&bd->bd_le.le_list);
745 sdp->sd_log_num_databuf--;
746 sdp->sd_log_num_jdata--;
747 gfs2_unpin(sdp, bd->bd_bh, ai);
748 }
749 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
750 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
751}
752
753
754const struct gfs2_log_operations gfs2_glock_lops = {
755 .lo_add = glock_lo_add,
756 .lo_after_commit = glock_lo_after_commit,
757 .lo_name = "glock"
758};
759
760const struct gfs2_log_operations gfs2_buf_lops = {
761 .lo_add = buf_lo_add,
762 .lo_incore_commit = buf_lo_incore_commit,
763 .lo_before_commit = buf_lo_before_commit,
764 .lo_after_commit = buf_lo_after_commit,
765 .lo_before_scan = buf_lo_before_scan,
766 .lo_scan_elements = buf_lo_scan_elements,
767 .lo_after_scan = buf_lo_after_scan,
768 .lo_name = "buf"
769};
770
771const struct gfs2_log_operations gfs2_revoke_lops = {
772 .lo_add = revoke_lo_add,
773 .lo_before_commit = revoke_lo_before_commit,
774 .lo_before_scan = revoke_lo_before_scan,
775 .lo_scan_elements = revoke_lo_scan_elements,
776 .lo_after_scan = revoke_lo_after_scan,
777 .lo_name = "revoke"
778};
779
780const struct gfs2_log_operations gfs2_rg_lops = {
781 .lo_add = rg_lo_add,
782 .lo_after_commit = rg_lo_after_commit,
783 .lo_name = "rg"
784};
785
786const struct gfs2_log_operations gfs2_databuf_lops = {
787 .lo_add = databuf_lo_add,
788 .lo_incore_commit = buf_lo_incore_commit,
789 .lo_before_commit = databuf_lo_before_commit,
790 .lo_after_commit = databuf_lo_after_commit,
791 .lo_scan_elements = databuf_lo_scan_elements,
792 .lo_after_scan = databuf_lo_after_scan,
793 .lo_name = "databuf"
794};
795
796const struct gfs2_log_operations *gfs2_log_ops[] = {
797 &gfs2_glock_lops,
798 &gfs2_buf_lops,
799 &gfs2_revoke_lops,
800 &gfs2_rg_lops,
801 &gfs2_databuf_lops,
802 NULL
803};
804
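/*
 * All five tables above feed the dispatchers in lops.h: a
 * NULL-terminated array of operation structs whose hooks are
 * individually optional, so every dispatcher tests a hook pointer
 * before calling through it. A minimal userspace sketch of the
 * pattern (the names here are illustrative, not GFS2's):
 */
#include <stdio.h>

struct toy_log_ops {
	void (*lo_before_commit)(void);
	const char *lo_name;
};

static void buf_before_commit(void)
{
	printf("buf hook ran\n");
}

static const struct toy_log_ops toy_buf_lops = {
	buf_before_commit, "buf"
};
static const struct toy_log_ops toy_glock_lops = {
	NULL, "glock"			/* no before_commit hook */
};

static const struct toy_log_ops *toy_log_ops[] = {
	&toy_glock_lops,
	&toy_buf_lops,
	NULL
};

int main(void)
{
	int x;

	/* Same shape as lops_before_commit(): walk until the NULL
	 * sentinel, skipping entries that lack this hook */
	for (x = 0; toy_log_ops[x]; x++)
		if (toy_log_ops[x]->lo_before_commit)
			toy_log_ops[x]->lo_before_commit();
	return 0;
}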
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..8a1029d3d389
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,96 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13extern const struct gfs2_log_operations gfs2_glock_lops;
14extern const struct gfs2_log_operations gfs2_buf_lops;
15extern const struct gfs2_log_operations gfs2_revoke_lops;
16extern const struct gfs2_log_operations gfs2_rg_lops;
17extern const struct gfs2_log_operations gfs2_databuf_lops;
18
19extern const struct gfs2_log_operations *gfs2_log_ops[];
20
21static inline void lops_init_le(struct gfs2_log_element *le,
22 const struct gfs2_log_operations *lops)
23{
24 INIT_LIST_HEAD(&le->le_list);
25 le->le_ops = lops;
26}
27
28static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
29{
30 if (le->le_ops->lo_add)
31 le->le_ops->lo_add(sdp, le);
32}
33
34static inline void lops_incore_commit(struct gfs2_sbd *sdp,
35 struct gfs2_trans *tr)
36{
37 int x;
38 for (x = 0; gfs2_log_ops[x]; x++)
39 if (gfs2_log_ops[x]->lo_incore_commit)
40 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
41}
42
43static inline void lops_before_commit(struct gfs2_sbd *sdp)
44{
45 int x;
46 for (x = 0; gfs2_log_ops[x]; x++)
47 if (gfs2_log_ops[x]->lo_before_commit)
48 gfs2_log_ops[x]->lo_before_commit(sdp);
49}
50
51static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 int x;
54 for (x = 0; gfs2_log_ops[x]; x++)
55 if (gfs2_log_ops[x]->lo_after_commit)
56 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
57}
58
59static inline void lops_before_scan(struct gfs2_jdesc *jd,
60 struct gfs2_log_header *head,
61 unsigned int pass)
62{
63 int x;
64 for (x = 0; gfs2_log_ops[x]; x++)
65 if (gfs2_log_ops[x]->lo_before_scan)
66 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
67}
68
69static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
70 struct gfs2_log_descriptor *ld,
71 __be64 *ptr,
72 unsigned int pass)
73{
74 int x, error;
75 for (x = 0; gfs2_log_ops[x]; x++)
76 if (gfs2_log_ops[x]->lo_scan_elements) {
77 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
78 ld, ptr, pass);
79 if (error)
80 return error;
81 }
82
83 return 0;
84}
85
86static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
87 unsigned int pass)
88{
89 int x;
90 for (x = 0; gfs2_log_ops[x]; x++)
91 if (gfs2_log_ops[x]->lo_after_scan)
92 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
93}
94
95#endif /* __LOPS_DOT_H__ */
96
diff --git a/fs/gfs2/lvb.c b/fs/gfs2/lvb.c
new file mode 100644
index 000000000000..e88e9cce14e7
--- /dev/null
+++ b/fs/gfs2/lvb.c
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "lvb.h"
21
22#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
23 struct->member);
24
25void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb)
26{
27 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
28
29 qb->qb_magic = be32_to_cpu(str->qb_magic);
30 qb->qb_limit = be64_to_cpu(str->qb_limit);
31 qb->qb_warn = be64_to_cpu(str->qb_warn);
32 qb->qb_value = be64_to_cpu(str->qb_value);
33}
34
35void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb)
36{
37 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
38
39 str->qb_magic = cpu_to_be32(qb->qb_magic);
40 str->qb_limit = cpu_to_be64(qb->qb_limit);
41 str->qb_warn = cpu_to_be64(qb->qb_warn);
42 str->qb_value = cpu_to_be64(qb->qb_value);
43}
44
45
diff --git a/fs/gfs2/lvb.h b/fs/gfs2/lvb.h
new file mode 100644
index 000000000000..1b1a8b75219a
--- /dev/null
+++ b/fs/gfs2/lvb.h
@@ -0,0 +1,19 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LVB_DOT_H__
11#define __LVB_DOT_H__
12
13#define GFS2_MIN_LVB_SIZE 32
14
15void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb);
16void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb);
17
18#endif /* __LVB_DOT_H__ */
19
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..b24d0b40d965
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,129 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "ops_fstype.h"
23#include "sys.h"
24#include "util.h"
25
26static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
27{
28 struct gfs2_inode *ip = foo;
29 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
30 SLAB_CTOR_CONSTRUCTOR) {
31 inode_init_once(&ip->i_inode);
32 atomic_set(&ip->i_count, 0);
33 ip->i_vnode = &ip->i_inode;
34 spin_lock_init(&ip->i_spin);
35 init_rwsem(&ip->i_rw_mutex);
36 memset(ip->i_cache, 0, sizeof(ip->i_cache));
37 }
38}
39
40/**
41 * init_gfs2_fs - Register GFS2 as a filesystem
42 *
43 * Returns: 0 on success, error code on failure
44 */
45
46static int __init init_gfs2_fs(void)
47{
48 int error;
49
50 gfs2_init_lmh();
51
52 error = gfs2_sys_init();
53 if (error)
54 return error;
55
56 error = -ENOMEM;
57
58 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
59 sizeof(struct gfs2_glock),
60 0, 0, NULL, NULL);
61 if (!gfs2_glock_cachep)
62 goto fail;
63
64 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
65 sizeof(struct gfs2_inode),
66 0, (SLAB_RECLAIM_ACCOUNT|
67 SLAB_PANIC|SLAB_MEM_SPREAD),
68 gfs2_init_inode_once, NULL);
69 if (!gfs2_inode_cachep)
70 goto fail;
71
72 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
73 sizeof(struct gfs2_bufdata),
74 0, 0, NULL, NULL);
75 if (!gfs2_bufdata_cachep)
76 goto fail;
77
78 error = register_filesystem(&gfs2_fs_type);
79 if (error)
80 goto fail;
81
82 error = register_filesystem(&gfs2meta_fs_type);
83 if (error)
84 goto fail_unregister;
85
86 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
87
88 return 0;
89
90fail_unregister:
91 unregister_filesystem(&gfs2_fs_type);
92fail:
93 if (gfs2_bufdata_cachep)
94 kmem_cache_destroy(gfs2_bufdata_cachep);
95
96 if (gfs2_inode_cachep)
97 kmem_cache_destroy(gfs2_inode_cachep);
98
99 if (gfs2_glock_cachep)
100 kmem_cache_destroy(gfs2_glock_cachep);
101
102 gfs2_sys_uninit();
103 return error;
104}
105
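/*
 * The failure path above unwinds with a single label and relies on
 * each teardown call being safe when its resource was never created
 * (the cache pointers start out NULL). A compact userspace sketch of
 * the same style, with malloc/free standing in for the slab calls:
 */
#include <stdlib.h>

static char *res_a, *res_b;	/* NULL until successfully allocated */

static int toy_setup(void)
{
	res_a = malloc(64);
	if (!res_a)
		goto fail;

	res_b = malloc(64);
	if (!res_b)
		goto fail;

	return 0;
fail:
	/* free(NULL) is a no-op, so one label can release everything
	 * regardless of how far we got */
	free(res_b);
	free(res_a);
	res_a = res_b = NULL;
	return -1;
}

int main(void)
{
	if (toy_setup())
		return 1;
	free(res_b);
	free(res_a);
	return 0;
}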
106/**
107 * exit_gfs2_fs - Unregister the file system
108 *
109 */
110
111static void __exit exit_gfs2_fs(void)
112{
113 unregister_filesystem(&gfs2_fs_type);
114 unregister_filesystem(&gfs2meta_fs_type);
115
116 kmem_cache_destroy(gfs2_bufdata_cachep);
117 kmem_cache_destroy(gfs2_inode_cachep);
118 kmem_cache_destroy(gfs2_glock_cachep);
119
120 gfs2_sys_uninit();
121}
122
123MODULE_DESCRIPTION("Global File System");
124MODULE_AUTHOR("Red Hat, Inc.");
125MODULE_LICENSE("GPL");
126
127module_init(init_gfs2_fs);
128module_exit(exit_gfs2_fs);
129
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..c78517225f61
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,892 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21
22#include "gfs2.h"
23#include "lm_interface.h"
24#include "incore.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "log.h"
29#include "lops.h"
30#include "meta_io.h"
31#include "rgrp.h"
32#include "trans.h"
33#include "util.h"
34
35#define buffer_busy(bh) \
36((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
37#define buffer_in_io(bh) \
38((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
39
40static int aspace_get_block(struct inode *inode, sector_t lblock,
41 struct buffer_head *bh_result, int create)
42{
43 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
44 return -EOPNOTSUPP;
45}
46
47static int gfs2_aspace_writepage(struct page *page,
48 struct writeback_control *wbc)
49{
50 return block_write_full_page(page, aspace_get_block, wbc);
51}
52
53/**
54 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
55 * @bh: the buffer we're stuck on
56 *
57 */
58
59static void stuck_releasepage(struct buffer_head *bh)
60{
61 struct inode *inode = bh->b_page->mapping->host;
62 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
63 struct gfs2_bufdata *bd = bh->b_private;
64 struct gfs2_glock *gl;
65
66 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
67 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
68 (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
69 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
70 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
71
72 if (!bd)
73 return;
74
75 gl = bd->bd_gl;
76
77 fs_warn(sdp, "gl = (%u, %llu)\n",
78 gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
79
80 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
81 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
82 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
83
84 if (gl->gl_ops == &gfs2_inode_glops) {
85 struct gfs2_inode *ip = gl->gl_object;
86 unsigned int x;
87
88 if (!ip)
89 return;
90
91 fs_warn(sdp, "ip = %llu %llu\n",
92 (unsigned long long)ip->i_num.no_formal_ino,
93 (unsigned long long)ip->i_num.no_addr);
94 fs_warn(sdp, "ip->i_count = %d, ip->i_vnode = %s\n",
95 atomic_read(&ip->i_count),
96 (ip->i_vnode) ? "!NULL" : "NULL");
97
98 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
99 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
100 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
101 }
102}
103
104/**
105 * gfs2_aspace_releasepage - free the metadata associated with a page
106 * @page: the page that's being released
107 * @gfp_mask: passed from Linux VFS, ignored by us
108 *
109 * Call try_to_free_buffers() if the buffers in this page can be
110 * released.
111 *
112 * Returns: 0
113 */
114
115static int gfs2_aspace_releasepage(struct page *page, gfp_t gfp_mask)
116{
117 struct inode *aspace = page->mapping->host;
118 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
119 struct buffer_head *bh, *head;
120 struct gfs2_bufdata *bd;
121 unsigned long t;
122
123 if (!page_has_buffers(page))
124 goto out;
125
126 head = bh = page_buffers(page);
127 do {
128 t = jiffies;
129
130 while (atomic_read(&bh->b_count)) {
131 if (atomic_read(&aspace->i_writecount)) {
132 if (time_after_eq(jiffies, t +
133 gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
134 stuck_releasepage(bh);
135 t = jiffies;
136 }
137
138 yield();
139 continue;
140 }
141
142 return 0;
143 }
144
145 gfs2_assert_warn(sdp, !buffer_pinned(bh));
146
147 bd = bh->b_private;
148 if (bd) {
149 gfs2_assert_warn(sdp, bd->bd_bh == bh);
150 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
151 gfs2_assert_warn(sdp, list_empty(&bd->bd_le.le_list));
152 gfs2_assert_warn(sdp, !bd->bd_ail);
153 kmem_cache_free(gfs2_bufdata_cachep, bd);
154 bh->b_private = NULL;
155 }
156
157 bh = bh->b_this_page;
158 }
159 while (bh != head);
160
161 out:
162 return try_to_free_buffers(page);
163}
164
165static struct address_space_operations aspace_aops = {
166 .writepage = gfs2_aspace_writepage,
167 .releasepage = gfs2_aspace_releasepage,
168};
169
170/**
171 * gfs2_aspace_get - Create and initialize a struct inode structure
172 * @sdp: the filesystem the aspace is in
173 *
174 * Right now a struct inode is just a struct inode. Maybe Linux
175 * will supply a more lightweight address space construct (that works)
176 * in the future.
177 *
178 * Make sure pages/buffers in this aspace aren't in high memory.
179 *
180 * Returns: the aspace
181 */
182
183struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
184{
185 struct inode *aspace;
186
187 aspace = new_inode(sdp->sd_vfs);
188 if (aspace) {
189 mapping_set_gfp_mask(aspace->i_mapping, GFP_KERNEL);
190 aspace->i_mapping->a_ops = &aspace_aops;
191 aspace->i_size = ~0ULL;
192 aspace->u.generic_ip = NULL;
193 insert_inode_hash(aspace);
194 }
195 return aspace;
196}
197
198void gfs2_aspace_put(struct inode *aspace)
199{
200 remove_inode_hash(aspace);
201 iput(aspace);
202}
203
204/**
205 * gfs2_ail1_start_one - Start I/O on a part of the AIL
206 * @sdp: the filesystem
208 * @ai: the AIL transaction to start I/O on
208 *
209 */
210
211void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
212{
213 struct gfs2_bufdata *bd, *s;
214 struct buffer_head *bh;
215 int retry;
216
217 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
218
219 do {
220 retry = 0;
221
222 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
223 bd_ail_st_list) {
224 bh = bd->bd_bh;
225
226 gfs2_assert(sdp, bd->bd_ail == ai);
227
228 if (!buffer_busy(bh)) {
229 if (!buffer_uptodate(bh)) {
230 gfs2_log_unlock(sdp);
231 gfs2_io_error_bh(sdp, bh);
232 gfs2_log_lock(sdp);
233 }
234 list_move(&bd->bd_ail_st_list,
235 &ai->ai_ail2_list);
236 continue;
237 }
238
239 if (!buffer_dirty(bh))
240 continue;
241
242 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
243
244 gfs2_log_unlock(sdp);
245 wait_on_buffer(bh);
246 ll_rw_block(WRITE, 1, &bh);
247 gfs2_log_lock(sdp);
248
249 retry = 1;
250 break;
251 }
252 } while (retry);
253}
254
255/**
256 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
257 * @sdp: the filesystem
258 * @ai: the AIL entry
259 *
260 */
261
262int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
263{
264 struct gfs2_bufdata *bd, *s;
265 struct buffer_head *bh;
266
267 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
268 bd_ail_st_list) {
269 bh = bd->bd_bh;
270
271 gfs2_assert(sdp, bd->bd_ail == ai);
272
273 if (buffer_busy(bh)) {
274 if (flags & DIO_ALL)
275 continue;
276 else
277 break;
278 }
279
280 if (!buffer_uptodate(bh))
281 gfs2_io_error_bh(sdp, bh);
282
283 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
284 }
285
286 return list_empty(&ai->ai_ail1_list);
287}
288
289/**
290 * gfs2_ail2_empty_one - Remove all the buffers on an AIL2 list
291 * @sdp: the filesystem
292 * @ai: the AIL entry
293 *
294 */
295
296void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
297{
298 struct list_head *head = &ai->ai_ail2_list;
299 struct gfs2_bufdata *bd;
300
301 while (!list_empty(head)) {
302 bd = list_entry(head->prev, struct gfs2_bufdata,
303 bd_ail_st_list);
304 gfs2_assert(sdp, bd->bd_ail == ai);
305 bd->bd_ail = NULL;
306 list_del(&bd->bd_ail_st_list);
307 list_del(&bd->bd_ail_gl_list);
308 atomic_dec(&bd->bd_gl->gl_ail_count);
309 brelse(bd->bd_bh);
310 }
311}
312
313/**
314 * gfs2_ail_empty_gl - remove all buffers for a given lock from the AIL
315 * @gl: the glock
316 *
317 * None of the buffers should be dirty, locked, or pinned.
318 */
319
320void gfs2_ail_empty_gl(struct gfs2_glock *gl)
321{
322 struct gfs2_sbd *sdp = gl->gl_sbd;
323 unsigned int blocks;
324 struct list_head *head = &gl->gl_ail_list;
325 struct gfs2_bufdata *bd;
326 struct buffer_head *bh;
327 uint64_t blkno;
328 int error;
329
330 blocks = atomic_read(&gl->gl_ail_count);
331 if (!blocks)
332 return;
333
334 error = gfs2_trans_begin(sdp, 0, blocks);
335 if (gfs2_assert_withdraw(sdp, !error))
336 return;
337
338 gfs2_log_lock(sdp);
339 while (!list_empty(head)) {
340 bd = list_entry(head->next, struct gfs2_bufdata,
341 bd_ail_gl_list);
342 bh = bd->bd_bh;
343 blkno = bh->b_blocknr;
344 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
345
346 bd->bd_ail = NULL;
347 list_del(&bd->bd_ail_st_list);
348 list_del(&bd->bd_ail_gl_list);
349 atomic_dec(&gl->gl_ail_count);
350 brelse(bh);
351 gfs2_log_unlock(sdp);
352
353 gfs2_trans_add_revoke(sdp, blkno);
354
355 gfs2_log_lock(sdp);
356 }
357 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
358 gfs2_log_unlock(sdp);
359
360 gfs2_trans_end(sdp);
361 gfs2_log_flush(sdp, NULL);
362}
363
364/**
365 * gfs2_meta_inval - Invalidate all buffers associated with a glock
366 * @gl: the glock
367 *
368 */
369
370void gfs2_meta_inval(struct gfs2_glock *gl)
371{
372 struct gfs2_sbd *sdp = gl->gl_sbd;
373 struct inode *aspace = gl->gl_aspace;
374 struct address_space *mapping = gl->gl_aspace->i_mapping;
375
376 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
377
378 atomic_inc(&aspace->i_writecount);
379 truncate_inode_pages(mapping, 0);
380 atomic_dec(&aspace->i_writecount);
381
382 gfs2_assert_withdraw(sdp, !mapping->nrpages);
383}
384
385/**
386 * gfs2_meta_sync - Sync all buffers associated with a glock
387 * @gl: The glock
388 * @flags: DIO_START | DIO_WAIT
389 *
390 */
391
392void gfs2_meta_sync(struct gfs2_glock *gl, int flags)
393{
394 struct address_space *mapping = gl->gl_aspace->i_mapping;
395 int error = 0;
396
397 if (flags & DIO_START)
398 filemap_fdatawrite(mapping);
399 if (!error && (flags & DIO_WAIT))
400 error = filemap_fdatawait(mapping);
401
402 if (error)
403 gfs2_io_error(gl->gl_sbd);
404}
405
406/**
407 * getbuf - Get a buffer with a given address space
408 * @sdp: the filesystem
409 * @aspace: the address space
410 * @blkno: the block number (filesystem scope)
411 * @create: 1 if the buffer should be created
412 *
413 * Returns: the buffer
414 */
415
416static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
417 uint64_t blkno, int create)
418{
419 struct page *page;
420 struct buffer_head *bh;
421 unsigned int shift;
422 unsigned long index;
423 unsigned int bufnum;
424
425 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
426 index = blkno >> shift; /* convert block to page */
427 bufnum = blkno - (index << shift); /* block buf index within page */
428
429 if (create) {
430 for (;;) {
431 page = grab_cache_page(aspace->i_mapping, index);
432 if (page)
433 break;
434 yield();
435 }
436 } else {
437 page = find_lock_page(aspace->i_mapping, index);
438 if (!page)
439 return NULL;
440 }
441
442 if (!page_has_buffers(page))
443 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
444
445 /* Locate header for our buffer within our page */
446 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
447 /* Do nothing */;
448 get_bh(bh);
449
450 if (!buffer_mapped(bh))
451 map_bh(bh, sdp->sd_vfs, blkno);
452
453 unlock_page(page);
454 mark_page_accessed(page);
455 page_cache_release(page);
456
457 return bh;
458}
459
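/*
 * A userspace check of getbuf()'s block-to-page arithmetic, assuming
 * 4096-byte pages (PAGE_CACHE_SHIFT == 12) and 1024-byte filesystem
 * blocks (sb_bsize_shift == 10); both shifts are assumptions chosen
 * for illustration:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int shift = 12 - 10;	/* 1 << shift blocks per page */
	uint64_t blkno = 4099;
	unsigned long index = blkno >> shift;		/* page index */
	unsigned int bufnum = blkno - (index << shift);	/* buffer in page */

	/* Four 1K blocks per 4K page: block 4099 lands in page 1024
	 * as its fourth buffer (bufnum 3) */
	printf("page %lu, buffer %u of %u\n", index, bufnum, 1u << shift);
	return 0;
}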
460static void meta_prep_new(struct buffer_head *bh)
461{
462 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
463
464 lock_buffer(bh);
465 clear_buffer_dirty(bh);
466 set_buffer_uptodate(bh);
467 unlock_buffer(bh);
468
469 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
470}
471
472/**
473 * gfs2_meta_new - Get a block
474 * @gl: The glock associated with this block
475 * @blkno: The block number
476 *
477 * Returns: The buffer
478 */
479
480struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno)
481{
482 struct buffer_head *bh;
483 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
484 meta_prep_new(bh);
485 return bh;
486}
487
488/**
489 * gfs2_meta_read - Read a block from disk
490 * @gl: The glock covering the block
491 * @blkno: The block number
492 * @flags: flags to gfs2_meta_reread()
493 * @bhp: the place where the buffer is returned (NULL on failure)
494 *
495 * Returns: errno
496 */
497
498int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, int flags,
499 struct buffer_head **bhp)
500{
501 int error;
502
503 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
504 error = gfs2_meta_reread(gl->gl_sbd, *bhp, flags);
505 if (error)
506 brelse(*bhp);
507
508 return error;
509}
510
511/**
512 * gfs2_meta_reread - Reread a block from disk
513 * @sdp: the filesystem
514 * @bh: The block to read
515 * @flags: Flags that control the read
516 *
517 * Returns: errno
518 */
519
520int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags)
521{
522 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
523 return -EIO;
524
525 if (flags & DIO_FORCE)
526 clear_buffer_uptodate(bh);
527
528 if ((flags & DIO_START) && !buffer_uptodate(bh))
529 ll_rw_block(READ, 1, &bh);
530
531 if (flags & DIO_WAIT) {
532 wait_on_buffer(bh);
533
534 if (!buffer_uptodate(bh)) {
535 struct gfs2_trans *tr = current->journal_info;
536 if (tr && tr->tr_touched)
537 gfs2_io_error_bh(sdp, bh);
538 return -EIO;
539 }
540 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
541 return -EIO;
542 }
543
544 return 0;
545}
546
547/**
548 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
549 * @gl: the glock the buffer belongs to
550 * @bh: The buffer to be attached to
551 * @meta: Flag to indicate whether it's metadata or not
552 */
553
554void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
555 int meta)
556{
557 struct gfs2_bufdata *bd;
558
559 if (meta)
560 lock_page(bh->b_page);
561
562 if (bh->b_private) {
563 if (meta)
564 unlock_page(bh->b_page);
565 return;
566 }
567
568 bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
569 memset(bd, 0, sizeof(struct gfs2_bufdata));
570
571 bd->bd_bh = bh;
572 bd->bd_gl = gl;
573
574 INIT_LIST_HEAD(&bd->bd_list_tr);
575 if (meta) {
576 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
577 } else {
578 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
579 get_bh(bh);
580 }
581 bh->b_private = bd;
582
583 if (meta)
584 unlock_page(bh->b_page);
585}
586
587/**
588 * gfs2_pin - Pin a buffer in memory
589 * @sdp: the filesystem the buffer belongs to
590 * @bh: The buffer to be pinned
591 *
592 */
593
594void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
595{
596 struct gfs2_bufdata *bd = bh->b_private;
597
598 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
599
600 if (test_set_buffer_pinned(bh))
601 gfs2_assert_withdraw(sdp, 0);
602
603 wait_on_buffer(bh);
604
605 /* If this buffer is in the AIL and it has already been written
606 to its in-place disk block, remove it from the AIL. */
607
608 gfs2_log_lock(sdp);
609 if (bd->bd_ail && !buffer_in_io(bh))
610 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
611 gfs2_log_unlock(sdp);
612
613 clear_buffer_dirty(bh);
614 wait_on_buffer(bh);
615
616 if (!buffer_uptodate(bh))
617 gfs2_io_error_bh(sdp, bh);
618
619 get_bh(bh);
620}
621
622/**
623 * gfs2_unpin - Unpin a buffer
624 * @sdp: the filesystem the buffer belongs to
625 * @bh: The buffer to unpin
626 * @ai:
627 *
628 */
629
630void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
631 struct gfs2_ail *ai)
632{
633 struct gfs2_bufdata *bd = bh->b_private;
634
635 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
636
637 if (!buffer_pinned(bh))
638 gfs2_assert_withdraw(sdp, 0);
639
640 mark_buffer_dirty(bh);
641 clear_buffer_pinned(bh);
642
643 gfs2_log_lock(sdp);
644 if (bd->bd_ail) {
645 list_del(&bd->bd_ail_st_list);
646 brelse(bh);
647 } else {
648 struct gfs2_glock *gl = bd->bd_gl;
649 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
650 atomic_inc(&gl->gl_ail_count);
651 }
652 bd->bd_ail = ai;
653 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
654 gfs2_log_unlock(sdp);
655}
656
657/**
658 * gfs2_meta_wipe - clear an inode's buffers so they are no longer dirty or pinned
659 * @ip: the inode who owns the buffers
660 * @bstart: the first buffer in the run
661 * @blen: the number of buffers in the run
662 *
663 */
664
665void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
666{
667 struct gfs2_sbd *sdp = ip->i_sbd;
668 struct inode *aspace = ip->i_gl->gl_aspace;
669 struct buffer_head *bh;
670
671 while (blen) {
672 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
673 if (bh) {
674 struct gfs2_bufdata *bd = bh->b_private;
675
676 if (test_clear_buffer_pinned(bh)) {
677 struct gfs2_trans *tr = current->journal_info;
678 gfs2_log_lock(sdp);
679 list_del_init(&bd->bd_le.le_list);
680 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
681 sdp->sd_log_num_buf--;
682 gfs2_log_unlock(sdp);
683 tr->tr_num_buf_rm++;
684 brelse(bh);
685 }
686 if (bd) {
687 gfs2_log_lock(sdp);
688 if (bd->bd_ail) {
689 uint64_t blkno = bh->b_blocknr;
690 bd->bd_ail = NULL;
691 list_del(&bd->bd_ail_st_list);
692 list_del(&bd->bd_ail_gl_list);
693 atomic_dec(&bd->bd_gl->gl_ail_count);
694 brelse(bh);
695 gfs2_log_unlock(sdp);
696 gfs2_trans_add_revoke(sdp, blkno);
697 } else
698 gfs2_log_unlock(sdp);
699 }
700
701 lock_buffer(bh);
702 clear_buffer_dirty(bh);
703 clear_buffer_uptodate(bh);
704 unlock_buffer(bh);
705
706 brelse(bh);
707 }
708
709 bstart++;
710 blen--;
711 }
712}
713
714/**
715 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
716 * @ip: The GFS2 inode
717 *
718 * This releases buffers that are in the most-recently-used array of
719 * blocks used for indirect block addressing for this inode.
720 */
721
722void gfs2_meta_cache_flush(struct gfs2_inode *ip)
723{
724 struct buffer_head **bh_slot;
725 unsigned int x;
726
727 spin_lock(&ip->i_spin);
728
729 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
730 bh_slot = &ip->i_cache[x];
731 if (!*bh_slot)
732 break;
733 brelse(*bh_slot);
734 *bh_slot = NULL;
735 }
736
737 spin_unlock(&ip->i_spin);
738}
739
740/**
741 * gfs2_meta_indirect_buffer - Get a metadata buffer
742 * @ip: The GFS2 inode
743 * @height: The level of this buf in the metadata (indir addr) tree (if any)
744 * @num: The block number (device relative) of the buffer
745 * @new: Non-zero if we may create a new buffer
746 * @bhp: the buffer is returned here
747 *
748 * Try to use the gfs2_inode's MRU metadata tree cache.
749 *
750 * Returns: errno
751 */
752
753int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
754 int new, struct buffer_head **bhp)
755{
756 struct buffer_head *bh, **bh_slot = ip->i_cache + height;
757 int error;
758
759 spin_lock(&ip->i_spin);
760 bh = *bh_slot;
761 if (bh) {
762 if (bh->b_blocknr == num)
763 get_bh(bh);
764 else
765 bh = NULL;
766 }
767 spin_unlock(&ip->i_spin);
768
769 if (bh) {
770 if (new)
771 meta_prep_new(bh);
772 else {
773 error = gfs2_meta_reread(ip->i_sbd, bh,
774 DIO_START | DIO_WAIT);
775 if (error) {
776 brelse(bh);
777 return error;
778 }
779 }
780 } else {
781 if (new)
782 bh = gfs2_meta_new(ip->i_gl, num);
783 else {
784 error = gfs2_meta_read(ip->i_gl, num,
785 DIO_START | DIO_WAIT, &bh);
786 if (error)
787 return error;
788 }
789
790 spin_lock(&ip->i_spin);
791 if (*bh_slot != bh) {
792 brelse(*bh_slot);
793 *bh_slot = bh;
794 get_bh(bh);
795 }
796 spin_unlock(&ip->i_spin);
797 }
798
799 if (new) {
800 if (gfs2_assert_warn(ip->i_sbd, height)) {
801 brelse(bh);
802 return -EIO;
803 }
804 gfs2_trans_add_bh(ip->i_gl, bh, 1);
805 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
806 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
807
808 } else if (gfs2_metatype_check(ip->i_sbd, bh,
809 (height) ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)) {
810 brelse(bh);
811 return -EIO;
812 }
813
814 *bhp = bh;
815
816 return 0;
817}
818
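/*
 * A minimal sketch of the one-slot-per-height cache consulted above:
 * each level of the metadata tree remembers its most recently used
 * buffer, a hit is recognized purely by block number, and a miss
 * replaces the slot. Plain malloc'd objects stand in for
 * buffer_heads and their reference counting:
 */
#include <stdlib.h>
#include <stdint.h>

#define TOY_MAX_HEIGHT 10

struct toy_buf {
	uint64_t blocknr;
};

static struct toy_buf *toy_cache[TOY_MAX_HEIGHT];

static struct toy_buf *toy_lookup(unsigned int height, uint64_t num)
{
	struct toy_buf *bh = toy_cache[height];

	if (bh && bh->blocknr == num)
		return bh;		/* hit: reuse the cached buffer */

	bh = malloc(sizeof(*bh));	/* miss: "read" the block anew */
	if (!bh)
		return NULL;
	bh->blocknr = num;

	free(toy_cache[height]);	/* evict whatever was cached */
	toy_cache[height] = bh;
	return bh;
}

int main(void)
{
	struct toy_buf *a = toy_lookup(1, 42);
	struct toy_buf *b = toy_lookup(1, 42);	/* same slot, same buffer */

	return (a && a == b) ? 0 : 1;
}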
819/**
820 * gfs2_meta_ra - start readahead on an extent of a file
821 * @gl: the glock the blocks belong to
822 * @dblock: the starting disk block
823 * @extlen: the number of blocks in the extent
824 *
825 */
826
827void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen)
828{
829 struct gfs2_sbd *sdp = gl->gl_sbd;
830 struct inode *aspace = gl->gl_aspace;
831 struct buffer_head *first_bh, *bh;
832 uint32_t max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
833 sdp->sd_sb.sb_bsize_shift;
834 int error;
835
836 if (!extlen || !max_ra)
837 return;
838 if (extlen > max_ra)
839 extlen = max_ra;
840
841 first_bh = getbuf(sdp, aspace, dblock, CREATE);
842
843 if (buffer_uptodate(first_bh))
844 goto out;
845 if (!buffer_locked(first_bh)) {
846 error = gfs2_meta_reread(sdp, first_bh, DIO_START);
847 if (error)
848 goto out;
849 }
850
851 dblock++;
852 extlen--;
853
854 while (extlen) {
855 bh = getbuf(sdp, aspace, dblock, CREATE);
856
857 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
858 error = gfs2_meta_reread(sdp, bh, DIO_START);
859 brelse(bh);
860 if (error)
861 goto out;
862 } else
863 brelse(bh);
864
865 dblock++;
866 extlen--;
867
868 if (buffer_uptodate(first_bh))
869 break;
870 }
871
872 out:
873 brelse(first_bh);
874}
875
876/**
877 * gfs2_meta_syncfs - sync all the buffers in a filesystem
878 * @sdp: the filesystem
879 *
880 */
881
882void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
883{
884 gfs2_log_flush(sdp, NULL);
885 for (;;) {
886 gfs2_ail1_start(sdp, DIO_ALL);
887 if (gfs2_ail1_empty(sdp, DIO_ALL))
888 break;
889 msleep(10);
890 }
891}
892
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..23c6a596fd9e
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,89 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13static inline void gfs2_buffer_clear(struct buffer_head *bh)
14{
15 memset(bh->b_data, 0, bh->b_size);
16}
17
18static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
19{
20 memset(bh->b_data + head, 0, bh->b_size - head);
21}
22
23static inline void gfs2_buffer_clear_ends(struct buffer_head *bh, int offset,
24 int amount, int journaled)
25{
26 int z_off1 = (journaled) ? sizeof(struct gfs2_meta_header) : 0;
27 int z_len1 = offset - z_off1;
28 int z_off2 = offset + amount;
29 int z_len2 = (bh)->b_size - z_off2;
30
31 if (z_len1)
32 memset(bh->b_data + z_off1, 0, z_len1);
33
34 if (z_len2)
35 memset(bh->b_data + z_off2, 0, z_len2);
36}
37
38static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
39 int to_head,
40 struct buffer_head *from_bh,
41 int from_head)
42{
43 memcpy(to_bh->b_data + to_head,
44 from_bh->b_data + from_head,
45 from_bh->b_size - from_head);
46 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
47 0,
48 from_head - to_head);
49}
50
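/*
 * A quick check of the two ranges gfs2_buffer_clear_ends() zeroes:
 * everything outside [offset, offset + amount) is cleared, except
 * that in the journaled case the metadata header at the front is
 * preserved. The 4096-byte buffer and 24-byte header below are
 * assumptions for illustration:
 */
#include <stdio.h>

int main(void)
{
	int b_size = 4096, offset = 100, amount = 200, journaled = 1;
	int hdr = 24;			/* assumed meta header size */
	int z_off1 = journaled ? hdr : 0;
	int z_len1 = offset - z_off1;	/* zeroed span before the data */
	int z_off2 = offset + amount;
	int z_len2 = b_size - z_off2;	/* zeroed span after the data */

	/* Prints "zero [24,100) and [300,4096)" for these values */
	printf("zero [%d,%d) and [%d,%d)\n",
	       z_off1, z_off1 + z_len1, z_off2, z_off2 + z_len2);
	return 0;
}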
51struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
52void gfs2_aspace_put(struct inode *aspace);
53
54void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
55int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags);
56void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
57void gfs2_ail_empty_gl(struct gfs2_glock *gl);
58
59void gfs2_meta_inval(struct gfs2_glock *gl);
60void gfs2_meta_sync(struct gfs2_glock *gl, int flags);
61
62struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno);
63int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno,
64 int flags, struct buffer_head **bhp);
65int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags);
66
67void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
68 int meta);
69void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
70void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
71 struct gfs2_ail *ai);
72
73void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
74
75void gfs2_meta_cache_flush(struct gfs2_inode *ip);
76int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
77 int new, struct buffer_head **bhp);
78
79static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
80 struct buffer_head **bhp)
81{
82 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
83}
84
85void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen);
86void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
87
88#endif /* __DIO_DOT_H__ */
89
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..0d4b230785af
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16
17#include "gfs2.h"
18#include "lm_interface.h"
19#include "incore.h"
20#include "mount.h"
21#include "sys.h"
22#include "util.h"
23
24/**
25 * gfs2_mount_args - Parse mount options
26 * @sdp: the filesystem
27 * @data_arg: the mount options string
28 * @remount: non-zero if called for a remount
29 * Returns: errno
30 */
31
32int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
33{
34 struct gfs2_args *args = &sdp->sd_args;
35 char *data = data_arg;
36 char *options, *o, *v;
37 int error = 0;
38
39 if (!remount) {
40 /* If someone preloaded options, use those instead */
41 spin_lock(&gfs2_sys_margs_lock);
42 if (gfs2_sys_margs) {
43 data = gfs2_sys_margs;
44 gfs2_sys_margs = NULL;
45 }
46 spin_unlock(&gfs2_sys_margs_lock);
47
48 /* Set some defaults */
49 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
50 args->ar_quota = GFS2_QUOTA_DEFAULT;
51 args->ar_data = GFS2_DATA_DEFAULT;
52 }
53
54 /* Split the options into tokens with the "," character and
55 process them */
56
57 for (options = data; (o = strsep(&options, ",")); ) {
58 if (!*o)
59 continue;
60
61 v = strchr(o, '=');
62 if (v)
63 *v++ = 0;
64
65 if (!strcmp(o, "lockproto")) {
66 if (!v)
67 goto need_value;
68 if (remount && strcmp(v, args->ar_lockproto))
69 goto cant_remount;
70 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
71 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
72 }
73
74 else if (!strcmp(o, "locktable")) {
75 if (!v)
76 goto need_value;
77 if (remount && strcmp(v, args->ar_locktable))
78 goto cant_remount;
79 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
80 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
81 }
82
83 else if (!strcmp(o, "hostdata")) {
84 if (!v)
85 goto need_value;
86 if (remount && strcmp(v, args->ar_hostdata))
87 goto cant_remount;
88 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
89 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
90 }
91
92 else if (!strcmp(o, "spectator")) {
93 if (remount && !args->ar_spectator)
94 goto cant_remount;
95 args->ar_spectator = 1;
96 sdp->sd_vfs->s_flags |= MS_RDONLY;
97 }
98
99 else if (!strcmp(o, "ignore_local_fs")) {
100 if (remount && !args->ar_ignore_local_fs)
101 goto cant_remount;
102 args->ar_ignore_local_fs = 1;
103 }
104
105 else if (!strcmp(o, "localflocks")) {
106 if (remount && !args->ar_localflocks)
107 goto cant_remount;
108 args->ar_localflocks = 1;
109 }
110
111 else if (!strcmp(o, "localcaching")) {
112 if (remount && !args->ar_localcaching)
113 goto cant_remount;
114 args->ar_localcaching = 1;
115 }
116
117 else if (!strcmp(o, "debug"))
118 args->ar_debug = 1;
119
120 else if (!strcmp(o, "nodebug"))
121 args->ar_debug = 0;
122
123 else if (!strcmp(o, "upgrade")) {
124 if (remount && !args->ar_upgrade)
125 goto cant_remount;
126 args->ar_upgrade = 1;
127 }
128
129 else if (!strcmp(o, "num_glockd")) {
130 unsigned int x;
131 if (!v)
132 goto need_value;
133 sscanf(v, "%u", &x);
134 if (remount && x != args->ar_num_glockd)
135 goto cant_remount;
136 if (!x || x > GFS2_GLOCKD_MAX) {
137 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
138 GFS2_GLOCKD_MAX, x);
139 error = -EINVAL;
140 break;
141 }
142 args->ar_num_glockd = x;
143 }
144
145 else if (!strcmp(o, "acl")) {
146 args->ar_posix_acl = 1;
147 sdp->sd_vfs->s_flags |= MS_POSIXACL;
148 }
149
150 else if (!strcmp(o, "noacl")) {
151 args->ar_posix_acl = 0;
152 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
153 }
154
155 else if (!strcmp(o, "quota")) {
156 if (!v)
157 goto need_value;
158 if (!strcmp(v, "off"))
159 args->ar_quota = GFS2_QUOTA_OFF;
160 else if (!strcmp(v, "account"))
161 args->ar_quota = GFS2_QUOTA_ACCOUNT;
162 else if (!strcmp(v, "on"))
163 args->ar_quota = GFS2_QUOTA_ON;
164 else {
165 fs_info(sdp, "invalid value for quota\n");
166 error = -EINVAL;
167 break;
168 }
169 }
170
171 else if (!strcmp(o, "suiddir"))
172 args->ar_suiddir = 1;
173
174 else if (!strcmp(o, "nosuiddir"))
175 args->ar_suiddir = 0;
176
177 else if (!strcmp(o, "data")) {
178 if (!v)
179 goto need_value;
180 if (!strcmp(v, "writeback"))
181 args->ar_data = GFS2_DATA_WRITEBACK;
182 else if (!strcmp(v, "ordered"))
183 args->ar_data = GFS2_DATA_ORDERED;
184 else {
185 fs_info(sdp, "invalid value for data\n");
186 error = -EINVAL;
187 break;
188 }
189 }
190
191 else {
192 fs_info(sdp, "unknown option: %s\n", o);
193 error = -EINVAL;
194 break;
195 }
196 }
197
198 if (error)
199 fs_info(sdp, "invalid mount option(s)\n");
200
201 if (data != data_arg)
202 kfree(data);
203
204 return error;
205
206 need_value:
207 fs_info(sdp, "need value for option %s\n", o);
208 return -EINVAL;
209
210 cant_remount:
211 fs_info(sdp, "can't remount with option %s\n", o);
212 return -EINVAL;
213}
214
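/*
 * The option loop above is a plain strsep()/strchr() tokenizer:
 * split on ',', then split each token in place on the first '='. A
 * standalone sketch of the same calls on a writable copy (strsep()
 * modifies its argument, which is why a string literal won't do):
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[] = "lockproto=lock_dlm,debug,quota=on";
	char *options = data, *o, *v;

	while ((o = strsep(&options, ","))) {
		if (!*o)
			continue;	/* skip empty tokens, as above */

		v = strchr(o, '=');
		if (v)
			*v++ = '\0';	/* split "name=value" in place */

		printf("option '%s' value '%s'\n", o, v ? v : "(none)");
	}
	return 0;
}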
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..2eb14722144f
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
14
15#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..be5c86e5787e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,321 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15
16#include "gfs2.h"
17#include <linux/gfs2_ondisk.h>
18
19#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
20 struct->member);
21
22/*
23 * gfs2_xxx_in - read in an xxx struct
24 * first arg: the cpu-order structure
25 * buf: the disk-order buffer
26 *
27 * gfs2_xxx_out - write out an xxx struct
28 * first arg: the cpu-order structure
29 * buf: the disk-order buffer
30 *
31 * gfs2_xxx_print - print out an xxx struct
32 * first arg: the cpu-order structure
33 */
34
35void gfs2_inum_in(struct gfs2_inum *no, char *buf)
36{
37 struct gfs2_inum *str = (struct gfs2_inum *)buf;
38
39 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
40 no->no_addr = be64_to_cpu(str->no_addr);
41}
42
43void gfs2_inum_out(const struct gfs2_inum *no, char *buf)
44{
45 struct gfs2_inum *str = (struct gfs2_inum *)buf;
46
47 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
48 str->no_addr = cpu_to_be64(no->no_addr);
49}
50
51static void gfs2_inum_print(struct gfs2_inum *no)
52{
53 printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
54 printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
55}
56
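/*
 * Every _in/_out pair in this file follows the same pattern: one C
 * struct describes both the on-disk (big-endian) and in-core layout,
 * so conversion is a field-by-field byte swap over the raw buffer. A
 * userspace sketch of the idea for a two-field struct, with glibc's
 * htobe64()/be64toh() standing in for cpu_to_be64()/be64_to_cpu():
 */
#include <stdint.h>
#include <assert.h>
#include <endian.h>

struct toy_inum {
	uint64_t no_formal_ino;
	uint64_t no_addr;
};

static void toy_inum_out(const struct toy_inum *no, char *buf)
{
	struct toy_inum *str = (struct toy_inum *)buf;

	str->no_formal_ino = htobe64(no->no_formal_ino);
	str->no_addr = htobe64(no->no_addr);
}

static void toy_inum_in(struct toy_inum *no, const char *buf)
{
	const struct toy_inum *str = (const struct toy_inum *)buf;

	no->no_formal_ino = be64toh(str->no_formal_ino);
	no->no_addr = be64toh(str->no_addr);
}

int main(void)
{
	struct toy_inum a = { 7, 42 }, b;
	char disk[sizeof(struct toy_inum)];

	toy_inum_out(&a, disk);	/* cpu order -> disk order */
	toy_inum_in(&b, disk);	/* disk order -> cpu order */
	assert(b.no_formal_ino == 7 && b.no_addr == 42);
	return 0;
}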
57static void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf)
58{
59 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
60
61 mh->mh_magic = be32_to_cpu(str->mh_magic);
62 mh->mh_type = be32_to_cpu(str->mh_type);
63 mh->mh_format = be32_to_cpu(str->mh_format);
64}
65
66static void gfs2_meta_header_out(struct gfs2_meta_header *mh, char *buf)
67{
68 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
69
70 str->mh_magic = cpu_to_be32(mh->mh_magic);
71 str->mh_type = cpu_to_be32(mh->mh_type);
72 str->mh_format = cpu_to_be32(mh->mh_format);
73}
74
75static void gfs2_meta_header_print(struct gfs2_meta_header *mh)
76{
77 pv(mh, mh_magic, "0x%.8X");
78 pv(mh, mh_type, "%u");
79 pv(mh, mh_format, "%u");
80}
81
82void gfs2_sb_in(struct gfs2_sb *sb, char *buf)
83{
84 struct gfs2_sb *str = (struct gfs2_sb *)buf;
85
86 gfs2_meta_header_in(&sb->sb_header, buf);
87
88 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
89 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
90 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
91 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
92
93 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
94 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
95
96 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
97 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
98}
99
100void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf)
101{
102 struct gfs2_rindex *str = (struct gfs2_rindex *)buf;
103
104 ri->ri_addr = be64_to_cpu(str->ri_addr);
105 ri->ri_length = be32_to_cpu(str->ri_length);
106 ri->ri_data0 = be64_to_cpu(str->ri_data0);
107 ri->ri_data = be32_to_cpu(str->ri_data);
108 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
109
110}
111
112void gfs2_rindex_print(struct gfs2_rindex *ri)
113{
114 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
115 pv(ri, ri_length, "%u");
116
117 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
118 pv(ri, ri_data, "%u");
119
120 pv(ri, ri_bitbytes, "%u");
121}
122
123void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf)
124{
125 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
126
127 gfs2_meta_header_in(&rg->rg_header, buf);
128 rg->rg_flags = be32_to_cpu(str->rg_flags);
129 rg->rg_free = be32_to_cpu(str->rg_free);
130 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
131}
132
133void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf)
134{
135 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
136
137 gfs2_meta_header_out(&rg->rg_header, buf);
138 str->rg_flags = cpu_to_be32(rg->rg_flags);
139 str->rg_free = cpu_to_be32(rg->rg_free);
140 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
141
142 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
143}
144
145void gfs2_quota_in(struct gfs2_quota *qu, char *buf)
146{
147 struct gfs2_quota *str = (struct gfs2_quota *)buf;
148
149 qu->qu_limit = be64_to_cpu(str->qu_limit);
150 qu->qu_warn = be64_to_cpu(str->qu_warn);
151 qu->qu_value = be64_to_cpu(str->qu_value);
152}
153
154void gfs2_dinode_in(struct gfs2_dinode *di, char *buf)
155{
156 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
157
158 gfs2_meta_header_in(&di->di_header, buf);
159 gfs2_inum_in(&di->di_num, (char *)&str->di_num);
160
161 di->di_mode = be32_to_cpu(str->di_mode);
162 di->di_uid = be32_to_cpu(str->di_uid);
163 di->di_gid = be32_to_cpu(str->di_gid);
164 di->di_nlink = be32_to_cpu(str->di_nlink);
165 di->di_size = be64_to_cpu(str->di_size);
166 di->di_blocks = be64_to_cpu(str->di_blocks);
167 di->di_atime = be64_to_cpu(str->di_atime);
168 di->di_mtime = be64_to_cpu(str->di_mtime);
169 di->di_ctime = be64_to_cpu(str->di_ctime);
170 di->di_major = be32_to_cpu(str->di_major);
171 di->di_minor = be32_to_cpu(str->di_minor);
172
173 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
174 di->di_goal_data = be64_to_cpu(str->di_goal_data);
175
176 di->di_flags = be32_to_cpu(str->di_flags);
177 di->di_payload_format = be32_to_cpu(str->di_payload_format);
178 di->di_height = be16_to_cpu(str->di_height);
179
180 di->di_depth = be16_to_cpu(str->di_depth);
181 di->di_entries = be32_to_cpu(str->di_entries);
182
183 di->di_eattr = be64_to_cpu(str->di_eattr);
184
185}
186
187void gfs2_dinode_out(struct gfs2_dinode *di, char *buf)
188{
189 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
190
191 gfs2_meta_header_out(&di->di_header, buf);
192 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
193
194 str->di_mode = cpu_to_be32(di->di_mode);
195 str->di_uid = cpu_to_be32(di->di_uid);
196 str->di_gid = cpu_to_be32(di->di_gid);
197 str->di_nlink = cpu_to_be32(di->di_nlink);
198 str->di_size = cpu_to_be64(di->di_size);
199 str->di_blocks = cpu_to_be64(di->di_blocks);
200 str->di_atime = cpu_to_be64(di->di_atime);
201 str->di_mtime = cpu_to_be64(di->di_mtime);
202 str->di_ctime = cpu_to_be64(di->di_ctime);
203 str->di_major = cpu_to_be32(di->di_major);
204 str->di_minor = cpu_to_be32(di->di_minor);
205
206 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
207 str->di_goal_data = cpu_to_be64(di->di_goal_data);
208
209 str->di_flags = cpu_to_be32(di->di_flags);
210 str->di_payload_format = cpu_to_be32(di->di_payload_format);
211 str->di_height = cpu_to_be16(di->di_height);
212
213 str->di_depth = cpu_to_be16(di->di_depth);
214 str->di_entries = cpu_to_be32(di->di_entries);
215
216 str->di_eattr = cpu_to_be64(di->di_eattr);
217
218}
219
220void gfs2_dinode_print(struct gfs2_dinode *di)
221{
222 gfs2_meta_header_print(&di->di_header);
223 gfs2_inum_print(&di->di_num);
224
225 pv(di, di_mode, "0%o");
226 pv(di, di_uid, "%u");
227 pv(di, di_gid, "%u");
228 pv(di, di_nlink, "%u");
229 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
230 printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
231 printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
232 printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
233 printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
234 pv(di, di_major, "%u");
235 pv(di, di_minor, "%u");
236
237 printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
238 printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
239
240 pv(di, di_flags, "0x%.8X");
241 pv(di, di_payload_format, "%u");
242 pv(di, di_height, "%u");
243
244 pv(di, di_depth, "%u");
245 pv(di, di_entries, "%u");
246
247 printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
248}
249
250void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf)
251{
252 struct gfs2_log_header *str = (struct gfs2_log_header *)buf;
253
254 gfs2_meta_header_in(&lh->lh_header, buf);
255 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
256 lh->lh_flags = be32_to_cpu(str->lh_flags);
257 lh->lh_tail = be32_to_cpu(str->lh_tail);
258 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
259 lh->lh_hash = be32_to_cpu(str->lh_hash);
260}
261
262void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf)
263{
264 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
265
266 ir->ir_start = be64_to_cpu(str->ir_start);
267 ir->ir_length = be64_to_cpu(str->ir_length);
268}
269
270void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf)
271{
272 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
273
274 str->ir_start = cpu_to_be64(ir->ir_start);
275 str->ir_length = cpu_to_be64(ir->ir_length);
276}
277
278void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf)
279{
280 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
281
282 sc->sc_total = be64_to_cpu(str->sc_total);
283 sc->sc_free = be64_to_cpu(str->sc_free);
284 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
285}
286
287void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf)
288{
289 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
290
291 str->sc_total = cpu_to_be64(sc->sc_total);
292 str->sc_free = cpu_to_be64(sc->sc_free);
293 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
294}
295
296void gfs2_unlinked_tag_in(struct gfs2_unlinked_tag *ut, char *buf)
297{
298 struct gfs2_unlinked_tag *str = (struct gfs2_unlinked_tag *)buf;
299
300 gfs2_inum_in(&ut->ut_inum, buf);
301 ut->ut_flags = be32_to_cpu(str->ut_flags);
302}
303
304void gfs2_unlinked_tag_out(struct gfs2_unlinked_tag *ut, char *buf)
305{
306 struct gfs2_unlinked_tag *str = (struct gfs2_unlinked_tag *)buf;
307
308 gfs2_inum_out(&ut->ut_inum, buf);
309 str->ut_flags = cpu_to_be32(ut->ut_flags);
310 str->__pad = 0;
311}
312
313void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf)
314{
315 struct gfs2_quota_change *str = (struct gfs2_quota_change *)buf;
316
317 qc->qc_change = be64_to_cpu(str->qc_change);
318 qc->qc_flags = be32_to_cpu(str->qc_flags);
319 qc->qc_id = be32_to_cpu(str->qc_id);
320}
321
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..16d3ebd32092
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,670 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/pagevec.h>
17#include <linux/mpage.h>
18#include <linux/fs.h>
19#include <linux/gfs2_ondisk.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "page.h"
31#include "quota.h"
32#include "trans.h"
33#include "rgrp.h"
34#include "ops_file.h"
35#include "util.h"
36
37/**
38 * gfs2_get_block - Fills in a buffer head with details about a block
39 * @inode: The inode
40 * @lblock: The block number to look up
41 * @bh_result: The buffer head to return the result in
42 * @create: Non-zero if we may add a block to the file
43 *
44 * Returns: errno
45 */
46
47int gfs2_get_block(struct inode *inode, sector_t lblock,
48 struct buffer_head *bh_result, int create)
49{
50 int new = create;
51 uint64_t dblock;
52 int error;
53 int boundary;
54
55 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
56 if (error)
57 return error;
58
59 if (!dblock)
60 return 0;
61
62 map_bh(bh_result, inode->i_sb, dblock);
63 if (new)
64 set_buffer_new(bh_result);
65 if (boundary)
66 set_buffer_boundary(bh_result);
67
68 return 0;
69}
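
A minimal sketch (hypothetical demo_lookup(), not part of the patch) of how a get_block_t callback such as gfs2_get_block() is consumed: the caller passes an unmapped buffer_head and inspects its state afterwards.

	static int demo_lookup(struct inode *inode, sector_t lblock)
	{
		struct buffer_head bh = { .b_state = 0, .b_size = 512 };
		int error;

		error = gfs2_get_block(inode, lblock, &bh, 0);	/* no allocation */
		if (error)
			return error;
		if (!buffer_mapped(&bh))
			return 0;				/* hole */
		printk(KERN_INFO "lblock %llu -> dblock %llu%s\n",
		       (unsigned long long)lblock,
		       (unsigned long long)bh.b_blocknr,
		       buffer_boundary(&bh) ? " (boundary)" : "");
		return 0;
	}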
70
71/**
72 * get_block_noalloc - Fills in a buffer head with details about a block
73 * @inode: The inode
74 * @lblock: The block number to look up
75 * @bh_result: The buffer head to return the result in
76 * @create: As above, but this variant never allocates blocks
77 *
78 * Returns: errno
79 */
80
81static int get_block_noalloc(struct inode *inode, sector_t lblock,
82 struct buffer_head *bh_result, int create)
83{
84 struct gfs2_inode *ip = inode->u.generic_ip;
85 int new = 0;
86 uint64_t dblock;
87 int error;
88 int boundary;
89
90 error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary);
91 if (error)
92 return error;
93
94 if (dblock)
95 map_bh(bh_result, inode->i_sb, dblock);
96 else if (gfs2_assert_withdraw(ip->i_sbd, !create))
97 error = -EIO;
98 if (boundary)
99 set_buffer_boundary(bh_result);
100
101 return error;
102}
103
104/**
105 * gfs2_writepage - Write complete page
106 * @page: Page to write
107 *
108 * Returns: errno
109 *
110 * Some of this is copied from block_write_full_page() although we still
111 * call it to do most of the work.
112 */
113
114static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
115{
116 struct inode *inode = page->mapping->host;
117 struct gfs2_inode *ip = page->mapping->host->u.generic_ip;
118 struct gfs2_sbd *sdp = ip->i_sbd;
119 loff_t i_size = i_size_read(inode);
120 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
121 unsigned offset;
122 int error;
123 int done_trans = 0;
124
125 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
126 unlock_page(page);
127 return -EIO;
128 }
129 if (current->journal_info)
130 goto out_ignore;
131
132 /* Is the page fully outside i_size? (truncate in progress) */
133 offset = i_size & (PAGE_CACHE_SIZE-1);
134 if (page->index > end_index || (page->index == end_index && !offset)) {
135 page->mapping->a_ops->invalidatepage(page, 0);
136 unlock_page(page);
137 return 0; /* don't care */
138 }
139
140 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
141 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
142 if (error)
143 goto out_ignore;
144 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
145 done_trans = 1;
146 }
147 error = block_write_full_page(page, get_block_noalloc, wbc);
148 if (done_trans)
149 gfs2_trans_end(sdp);
150 gfs2_meta_cache_flush(ip);
151 return error;
152
153out_ignore:
154 redirty_page_for_writepage(wbc, page);
155 unlock_page(page);
156 return 0;
157}
158
159static int zero_readpage(struct page *page)
160{
161 void *kaddr;
162
163 kaddr = kmap_atomic(page, KM_USER0);
164 memset(kaddr, 0, PAGE_CACHE_SIZE);
165 kunmap_atomic(page, KM_USER0);
166
167 SetPageUptodate(page);
168
169 return 0;
170}
171
172/**
173 * stuffed_readpage - Fill in a Linux page with stuffed file data
174 * @ip: the inode
175 * @page: the page
176 *
177 * Returns: errno
178 */
179
180static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
181{
182 struct buffer_head *dibh;
183 void *kaddr;
184 int error;
185
186 /* Only the first page of a stuffed file might contain data */
187 if (unlikely(page->index))
188 return zero_readpage(page);
189
190 error = gfs2_meta_inode_buffer(ip, &dibh);
191 if (error)
192 return error;
193
194 kaddr = kmap_atomic(page, KM_USER0);
195 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
196 ip->i_di.di_size);
197 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
198 kunmap_atomic(page, KM_USER0);
199
200 brelse(dibh);
201
202 SetPageUptodate(page);
203
204 return 0;
205}
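
A "stuffed" inode keeps its data in the dinode block itself, directly after the on-disk header, which is why the copy above starts at sizeof(struct gfs2_dinode). The capacity bound (hypothetical helper; gfs2_prepare_write() below applies the same test inline):

	static unsigned int demo_max_stuffed_size(const struct gfs2_sbd *sdp)
	{
		/* bytes left in one fs block after the dinode header */
		return sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
	}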
206
207
208/**
209 * gfs2_readpage - readpage with locking
210 * @file: The file to read a page for. N.B. This points at the internal
211 * file sentinel, not a real file, if we are reading an internal file.
212 * @page: The page to read
213 *
214 * Returns: errno
215 */
216
217static int gfs2_readpage(struct file *file, struct page *page)
218{
219 struct gfs2_inode *ip = page->mapping->host->u.generic_ip;
220 struct gfs2_sbd *sdp = ip->i_sbd;
221 struct gfs2_holder gh;
222 int error;
223
224 if (likely(file != &gfs2_internal_file_sentinal)) {
225 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
226 error = gfs2_glock_nq_m_atime(1, &gh);
227 if (unlikely(error))
228 goto out_unlock;
229 }
230
231 if (gfs2_is_stuffed(ip)) {
232 error = stuffed_readpage(ip, page);
233 unlock_page(page);
234 } else
235 error = mpage_readpage(page, gfs2_get_block);
236
237 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
238 error = -EIO;
239
240 if (file != &gfs2_internal_file_sentinal) {
241 gfs2_glock_dq_m(1, &gh);
242 gfs2_holder_uninit(&gh);
243 }
244out:
245 return error;
246out_unlock:
247 unlock_page(page);
248 if (file != &gfs2_internal_file_sentinal)
249 gfs2_holder_uninit(&gh);
250 goto out;
251}
252
253#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
254
255/**
256 * gfs2_readpages - Read a bunch of pages at once
257 *
258 * Some notes:
259 * 1. This is only for readahead, so we can simply ignore anything
260 *    which is slightly inconvenient (such as locking conflicts between
261 *    the page lock and the glock) and return having done no I/O. It's
262 *    obviously not something we'd want to do on too regular a basis.
263 *    Any I/O we ignore at this time will be done via readpage later.
264 * 2. We have to handle stuffed files here too.
265 * 3. mpage_readpages() does most of the heavy lifting in the common case.
266 * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
267 * 5. We use LM_FLAG_TRY_1CB here, effectively giving us lock-ahead as
268 *    well as read-ahead (a sketch of this pattern follows the function).
269 */
270static int gfs2_readpages(struct file *file, struct address_space *mapping,
271 struct list_head *pages, unsigned nr_pages)
272{
273 struct inode *inode = mapping->host;
274 struct gfs2_inode *ip = inode->u.generic_ip;
275 struct gfs2_sbd *sdp = ip->i_sbd;
276 struct gfs2_holder gh;
277 unsigned page_idx;
278 int ret;
279
280 if (likely(file != &gfs2_internal_file_sentinal)) {
281 gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
282 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
283 ret = gfs2_glock_nq_m_atime(1, &gh);
284 if (ret == GLR_TRYFAILED)
285 goto out_noerror;
286 if (unlikely(ret))
287 goto out_unlock;
288 }
289
290 if (gfs2_is_stuffed(ip)) {
291 struct pagevec lru_pvec;
292 pagevec_init(&lru_pvec, 0);
293 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
294 struct page *page = list_to_page(pages);
295 list_del(&page->lru);
296 if (!add_to_page_cache(page, mapping,
297 page->index, GFP_KERNEL)) {
298 ret = stuffed_readpage(ip, page);
299 unlock_page(page);
300 if (!pagevec_add(&lru_pvec, page))
301 __pagevec_lru_add(&lru_pvec);
302 }
303 page_cache_release(page);
304 }
305 pagevec_lru_add(&lru_pvec);
306 ret = 0;
307 } else {
308 /* What we really want to do .... */
309 ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
310 }
311
312 if (likely(file != &gfs2_internal_file_sentinal)) {
313 gfs2_glock_dq_m(1, &gh);
314 gfs2_holder_uninit(&gh);
315 }
316out:
317 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
318 ret = -EIO;
319 return ret;
320out_noerror:
321 ret = 0;
322out_unlock:
323 /* unlock all pages, we can't do any I/O right now */
324 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
325 struct page *page = list_to_page(pages);
326 list_del(&page->lru);
327 unlock_page(page);
328 page_cache_release(page);
329 }
330 if (likely(file != &gfs2_internal_file_sentinal))
331 gfs2_holder_uninit(&gh);
332 goto out;
333}
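
A sketch of the try-lock pattern from note 5 above (demo_try_shared() is hypothetical; the other names are from this patch): readahead must never block on a contended glock, so a failed TRY request is simply dropped and the pages are left for ->readpage() to fill later.

	static int demo_try_shared(struct gfs2_inode *ip, struct gfs2_holder *gh)
	{
		int ret;

		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY_1CB, gh);
		ret = gfs2_glock_nq_m_atime(1, gh);
		if (ret == GLR_TRYFAILED) {
			gfs2_holder_uninit(gh);
			return -EWOULDBLOCK;	/* caller skips the I/O */
		}
		return ret;			/* 0 on success */
	}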
334
335/**
336 * gfs2_prepare_write - Prepare to write a page to a file
337 * @file: The file to write to
338 * @page: The page which is to be prepared for writing
339 * @from: From (byte range within page)
340 * @to: To (byte range within page)
341 *
342 * Returns: errno
343 */
344
345static int gfs2_prepare_write(struct file *file, struct page *page,
346 unsigned from, unsigned to)
347{
348 struct gfs2_inode *ip = page->mapping->host->u.generic_ip;
349 struct gfs2_sbd *sdp = ip->i_sbd;
350 unsigned int data_blocks, ind_blocks, rblocks;
351 int alloc_required;
352 int error = 0;
353 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
354 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
355 struct gfs2_alloc *al;
356
357 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
358 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
359 if (error)
360 goto out_uninit;
361
362 gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
363
364 error = gfs2_write_alloc_required(ip, pos, to - from, &alloc_required);
365 if (error)
366 goto out_unlock;
367
368
369 if (alloc_required) {
370 al = gfs2_alloc_get(ip);
371
372 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
373 if (error)
374 goto out_alloc_put;
375
376 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
377 if (error)
378 goto out_qunlock;
379
380 al->al_requested = data_blocks + ind_blocks;
381 error = gfs2_inplace_reserve(ip);
382 if (error)
383 goto out_qunlock;
384 }
385
386 rblocks = RES_DINODE + ind_blocks;
387 if (gfs2_is_jdata(ip))
388 rblocks += data_blocks ? data_blocks : 1;
389 if (ind_blocks || data_blocks)
390 rblocks += RES_STATFS + RES_QUOTA;
391
392 error = gfs2_trans_begin(sdp, rblocks, 0);
393 if (error)
394 goto out;
395
396 if (gfs2_is_stuffed(ip)) {
397 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
398 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
399 page);
400 if (error == 0)
401 goto prepare_write;
402 } else if (!PageUptodate(page))
403 error = stuffed_readpage(ip, page);
404 goto out;
405 }
406
407prepare_write:
408 error = block_prepare_write(page, from, to, gfs2_get_block);
409
410out:
411 if (error) {
412 gfs2_trans_end(sdp);
413 if (alloc_required) {
414 gfs2_inplace_release(ip);
415out_qunlock:
416 gfs2_quota_unlock(ip);
417out_alloc_put:
418 gfs2_alloc_put(ip);
419 }
420out_unlock:
421 gfs2_glock_dq_m(1, &ip->i_gh);
422out_uninit:
423 gfs2_holder_uninit(&ip->i_gh);
424 }
425
426 return error;
427}
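
The transaction reservation above, restated as a standalone calculation (hypothetical helper): one block for the dinode, the indirect blocks, the data blocks only when they are journaled, and statfs/quota records whenever anything is allocated.

	static unsigned int demo_rblocks(int jdata, unsigned int data_blocks,
					 unsigned int ind_blocks)
	{
		unsigned int rblocks = RES_DINODE + ind_blocks;

		if (jdata)			/* journaled data is logged too */
			rblocks += data_blocks ? data_blocks : 1;
		if (ind_blocks || data_blocks)	/* allocation touches statfs/quota */
			rblocks += RES_STATFS + RES_QUOTA;
		return rblocks;
	}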
428
429/**
430 * gfs2_commit_write - Commit write to a file
431 * @file: The file to write to
432 * @page: The page containing the data
433 * @from: From (byte range within page)
434 * @to: To (byte range within page)
435 *
436 * Returns: errno
437 */
438
439static int gfs2_commit_write(struct file *file, struct page *page,
440 unsigned from, unsigned to)
441{
442 struct inode *inode = page->mapping->host;
443 struct gfs2_inode *ip = inode->u.generic_ip;
444 struct gfs2_sbd *sdp = ip->i_sbd;
445 int error = -EOPNOTSUPP;
446 struct buffer_head *dibh;
447 struct gfs2_alloc *al = &ip->i_alloc;
448
449 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
450 goto fail_nounlock;
451
452 error = gfs2_meta_inode_buffer(ip, &dibh);
453 if (error)
454 goto fail_endtrans;
455
456 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
457
458 if (gfs2_is_stuffed(ip)) {
459 uint64_t file_size;
460 void *kaddr;
461
462 file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
463
464 kaddr = kmap_atomic(page, KM_USER0);
465 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
466 (char *)kaddr + from, to - from);
467 kunmap_atomic(page, KM_USER0);
468
469 SetPageUptodate(page);
470
471 if (inode->i_size < file_size)
472 i_size_write(inode, file_size);
473 } else {
474 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
475 gfs2_is_jdata(ip))
476 gfs2_page_add_databufs(ip, page, from, to);
477 error = generic_commit_write(file, page, from, to);
478 if (error)
479 goto fail;
480 }
481
482 if (ip->i_di.di_size < inode->i_size)
483 ip->i_di.di_size = inode->i_size;
484
485 gfs2_dinode_out(&ip->i_di, dibh->b_data);
486 brelse(dibh);
487 gfs2_trans_end(sdp);
488 if (al->al_requested) {
489 gfs2_inplace_release(ip);
490 gfs2_quota_unlock(ip);
491 gfs2_alloc_put(ip);
492 }
493 gfs2_glock_dq_m(1, &ip->i_gh);
494 gfs2_holder_uninit(&ip->i_gh);
495 return 0;
496
497fail:
498 brelse(dibh);
499fail_endtrans:
500 gfs2_trans_end(sdp);
501 if (al->al_requested) {
502 gfs2_inplace_release(ip);
503 gfs2_quota_unlock(ip);
504 gfs2_alloc_put(ip);
505 }
506 gfs2_glock_dq_m(1, &ip->i_gh);
507 gfs2_holder_uninit(&ip->i_gh);
508fail_nounlock:
509 ClearPageUptodate(page);
510 return error;
511}
512
513/**
514 * gfs2_bmap - Block map function
515 * @mapping: Address space info
516 * @lblock: The block to map
517 *
518 * Returns: The disk address for the block or 0 on hole or error
519 */
520
521static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
522{
523 struct gfs2_inode *ip = mapping->host->u.generic_ip;
524 struct gfs2_holder i_gh;
525 sector_t dblock = 0;
526 int error;
527
528 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
529 if (error)
530 return 0;
531
532 if (!gfs2_is_stuffed(ip))
533 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
534
535 gfs2_glock_dq_uninit(&i_gh);
536
537 return dblock;
538}
539
540static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
541{
542 struct gfs2_bufdata *bd;
543
544 gfs2_log_lock(sdp);
545 bd = bh->b_private;
546 if (bd) {
547 bd->bd_bh = NULL;
548 bh->b_private = NULL;
549 gfs2_log_unlock(sdp);
550 brelse(bh);
551 } else
552 gfs2_log_unlock(sdp);
553
554 lock_buffer(bh);
555 clear_buffer_dirty(bh);
556 bh->b_bdev = NULL;
557 clear_buffer_mapped(bh);
558 clear_buffer_req(bh);
559 clear_buffer_new(bh);
560 clear_buffer_delay(bh);
561 unlock_buffer(bh);
562}
563
564static void gfs2_invalidatepage(struct page *page, unsigned long offset)
565{
566 struct gfs2_sbd *sdp = page->mapping->host->i_sb->s_fs_info;
567 struct buffer_head *head, *bh, *next;
568 unsigned int curr_off = 0;
569
570 BUG_ON(!PageLocked(page));
571 if (!page_has_buffers(page))
572 return;
573
574 bh = head = page_buffers(page);
575 do {
576 unsigned int next_off = curr_off + bh->b_size;
577 next = bh->b_this_page;
578
579 if (offset <= curr_off)
580 discard_buffer(sdp, bh);
581
582 curr_off = next_off;
583 bh = next;
584 } while (bh != head);
585
586 if (!offset)
587 try_to_release_page(page, 0);
588
589 return;
590}
591
592static ssize_t gfs2_direct_IO_write(struct kiocb *iocb, const struct iovec *iov,
593 loff_t offset, unsigned long nr_segs)
594{
595 struct file *file = iocb->ki_filp;
596 struct inode *inode = file->f_mapping->host;
597 struct gfs2_inode *ip = inode->u.generic_ip;
598 struct gfs2_holder gh;
599 int rv;
600
601 /*
602 * Shared lock, even though it's a write, since we do no allocation
603 * on this path. All we need to change is the atime.
604 */
605 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
606 rv = gfs2_glock_nq_m_atime(1, &gh);
607 if (rv)
608 goto out;
609
610 /*
611 * Should we return an error here? I can't see that O_DIRECT for
612 * a journaled file makes any sense. For now we'll silently fall
613 * back to buffered I/O, likewise we do the same for stuffed
614 * files since they are (a) small and (b) unaligned.
615 */
616 if (gfs2_is_jdata(ip))
617 goto out;
618
619 if (gfs2_is_stuffed(ip))
620 goto out;
621
622 rv = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
623 iov, offset, nr_segs, gfs2_get_block,
624 NULL, DIO_OWN_LOCKING);
625out:
626 gfs2_glock_dq_m(1, &gh);
627 gfs2_holder_uninit(&gh);
628
629 return rv;
630}
631
632/**
633 * gfs2_direct_IO
634 *
635 * This is called with a shared lock already held for the read path.
636 * Currently, no locks are held when the write path is called.
637 */
638static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
639 const struct iovec *iov, loff_t offset,
640 unsigned long nr_segs)
641{
642 struct file *file = iocb->ki_filp;
643 struct inode *inode = file->f_mapping->host;
644 struct gfs2_inode *ip = inode->u.generic_ip;
645 struct gfs2_sbd *sdp = ip->i_sbd;
646
647 if (rw == WRITE)
648 return gfs2_direct_IO_write(iocb, iov, offset, nr_segs);
649
650 if (gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)) ||
651 gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
652 return -EINVAL;
653
654 return __blockdev_direct_IO(READ, iocb, inode, inode->i_sb->s_bdev, iov,
655 offset, nr_segs, gfs2_get_block, NULL,
656 DIO_OWN_LOCKING);
657}
658
659struct address_space_operations gfs2_file_aops = {
660 .writepage = gfs2_writepage,
661 .readpage = gfs2_readpage,
662 .readpages = gfs2_readpages,
663 .sync_page = block_sync_page,
664 .prepare_write = gfs2_prepare_write,
665 .commit_write = gfs2_commit_write,
666 .bmap = gfs2_bmap,
667 .invalidatepage = gfs2_invalidatepage,
668 .direct_IO = gfs2_direct_IO,
669};
670
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..b88adddaffb2
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13extern struct address_space_operations gfs2_file_aops;
14extern int gfs2_get_block(struct inode *inode, sector_t lblock,
15 struct buffer_head *bh_result, int create);
16
17#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..fef415e2068e
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "ops_dentry.h"
25#include "util.h"
26
27/**
28 * gfs2_drevalidate - Check directory lookup consistency
29 * @dentry: the mapping to check
30 * @nd:
31 *
32 * Check to make sure the lookup necessary to arrive at this inode from its
33 * parent is still good.
34 *
35 * Returns: 1 if the dentry is ok, 0 if it isn't
36 */
37
38static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
39{
40 struct dentry *parent = dget_parent(dentry);
41 struct gfs2_sbd *sdp = parent->d_inode->i_sb->s_fs_info;
42 struct gfs2_inode *dip = parent->d_inode->u.generic_ip;
43 struct inode *inode = dentry->d_inode;
44 struct gfs2_holder d_gh;
45 struct gfs2_inode *ip;
46 struct gfs2_inum inum;
47 unsigned int type;
48 int error;
49
50 if (inode && is_bad_inode(inode))
51 goto invalid;
52
53 if (sdp->sd_args.ar_localcaching)
54 goto valid;
55
56 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
57 if (error)
58 goto fail;
59
60 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
61 switch (error) {
62 case 0:
63 if (!inode)
64 goto invalid_gunlock;
65 break;
66 case -ENOENT:
67 if (!inode)
68 goto valid_gunlock;
69 goto invalid_gunlock;
70 default:
71 goto fail_gunlock;
72 }
73
74 ip = inode->u.generic_ip;
75
76 if (!gfs2_inum_equal(&ip->i_num, &inum))
77 goto invalid_gunlock;
78
79 if (IF2DT(ip->i_di.di_mode) != type) {
80 gfs2_consist_inode(dip);
81 goto fail_gunlock;
82 }
83
84 valid_gunlock:
85 gfs2_glock_dq_uninit(&d_gh);
86
87 valid:
88 dput(parent);
89 return 1;
90
91 invalid_gunlock:
92 gfs2_glock_dq_uninit(&d_gh);
93
94 invalid:
95 if (inode && S_ISDIR(inode->i_mode)) {
96 if (have_submounts(dentry))
97 goto valid;
98 shrink_dcache_parent(dentry);
99 }
100 d_drop(dentry);
101
102 dput(parent);
103 return 0;
104
105 fail_gunlock:
106 gfs2_glock_dq_uninit(&d_gh);
107
108 fail:
109 dput(parent);
110 return 0;
111}
112
113static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
114{
115 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0;
117}
118
119struct dentry_operations gfs2_dops = {
120 .d_revalidate = gfs2_drevalidate,
121 .d_hash = gfs2_dhash,
122};
123
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..1b6e75c0a4a7
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13extern struct dentry_operations gfs2_dops;
14
15#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..a376ead7d0cd
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,297 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "dir.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "ops_export.h"
26#include "rgrp.h"
27#include "util.h"
28
29static struct dentry *gfs2_decode_fh(struct super_block *sb,
30 __u32 *fh,
31 int fh_len,
32 int fh_type,
33 int (*acceptable)(void *context,
34 struct dentry *dentry),
35 void *context)
36{
37 struct gfs2_inum this, parent;
38
39 if (fh_type != fh_len)
40 return NULL;
41
42 memset(&parent, 0, sizeof(struct gfs2_inum));
43
44 switch (fh_type) {
45 case 8:
46 parent.no_formal_ino = ((uint64_t)be32_to_cpu(fh[4])) << 32;
47 parent.no_formal_ino |= be32_to_cpu(fh[5]);
48 parent.no_addr = ((uint64_t)be32_to_cpu(fh[6])) << 32;
49 parent.no_addr |= be32_to_cpu(fh[7]); /* fall through */
50 case 4:
51 this.no_formal_ino = ((uint64_t)be32_to_cpu(fh[0])) << 32;
52 this.no_formal_ino |= be32_to_cpu(fh[1]);
53 this.no_addr = ((uint64_t)be32_to_cpu(fh[2])) << 32;
54 this.no_addr |= be32_to_cpu(fh[3]);
55 break;
56 default:
57 return NULL;
58 }
59
60 return gfs2_export_ops.find_exported_dentry(sb, &this, &parent,
61 acceptable, context);
62}
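
The file handle layout that the decode/encode pair agree on: each gfs2_inum is packed as four big-endian 32-bit words, high halves first. A demo packing helper (hypothetical; the inverse of the unpacking above, and what gfs2_encode_fh below does word by word):

	static void demo_pack_inum(const struct gfs2_inum *inum, __u32 *fh)
	{
		fh[0] = cpu_to_be32(inum->no_formal_ino >> 32);
		fh[1] = cpu_to_be32(inum->no_formal_ino & 0xFFFFFFFF);
		fh[2] = cpu_to_be32(inum->no_addr >> 32);
		fh[3] = cpu_to_be32(inum->no_addr & 0xFFFFFFFF);
	}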
63
64static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
65 int connectable)
66{
67 struct inode *inode = dentry->d_inode;
68 struct super_block *sb = inode->i_sb;
69 struct gfs2_inode *ip = inode->u.generic_ip;
70
71 if (*len < 4 || (connectable && *len < 8))
72 return 255;
73
74 fh[0] = ip->i_num.no_formal_ino >> 32;
75 fh[0] = cpu_to_be32(fh[0]);
76 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
77 fh[1] = cpu_to_be32(fh[1]);
78 fh[2] = ip->i_num.no_addr >> 32;
79 fh[2] = cpu_to_be32(fh[2]);
80 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
81 fh[3] = cpu_to_be32(fh[3]);
82 *len = 4;
83
84 if (!connectable || inode == sb->s_root->d_inode)
85 return *len;
86
87 spin_lock(&dentry->d_lock);
88 inode = dentry->d_parent->d_inode;
89 ip = inode->u.generic_ip;
90 gfs2_inode_hold(ip);
91 spin_unlock(&dentry->d_lock);
92
93 fh[4] = ip->i_num.no_formal_ino >> 32;
94 fh[4] = cpu_to_be32(fh[4]);
95 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
96 fh[5] = cpu_to_be32(fh[5]);
97 fh[6] = ip->i_num.no_addr >> 32;
98 fh[6] = cpu_to_be32(fh[6]);
99 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
100 fh[7] = cpu_to_be32(fh[7]);
101 *len = 8;
102
103 gfs2_inode_put(ip);
104
105 return *len;
106}
107
108struct get_name_filldir {
109 struct gfs2_inum inum;
110 char *name;
111};
112
113static int get_name_filldir(void *opaque, const char *name, unsigned int length,
114 uint64_t offset, struct gfs2_inum *inum,
115 unsigned int type)
116{
117 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
118
119 if (!gfs2_inum_equal(inum, &gnfd->inum))
120 return 0;
121
122 memcpy(gnfd->name, name, length);
123 gnfd->name[length] = 0;
124
125 return 1;
126}
127
128static int gfs2_get_name(struct dentry *parent, char *name,
129 struct dentry *child)
130{
131 struct inode *dir = parent->d_inode;
132 struct inode *inode = child->d_inode;
133 struct gfs2_inode *dip, *ip;
134 struct get_name_filldir gnfd;
135 struct gfs2_holder gh;
136 uint64_t offset = 0;
137 int error;
138
139 if (!dir)
140 return -EINVAL;
141
142 if (!S_ISDIR(dir->i_mode) || !inode)
143 return -EINVAL;
144
145 dip = dir->u.generic_ip;
146 ip = inode->u.generic_ip;
147
148 *name = 0;
149 gnfd.inum = ip->i_num;
150 gnfd.name = name;
151
152 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
153 if (error)
154 return error;
155
156 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
157
158 gfs2_glock_dq_uninit(&gh);
159
160 if (!error && !*name)
161 error = -ENOENT;
162
163 return error;
164}
165
166static struct dentry *gfs2_get_parent(struct dentry *child)
167{
168 struct qstr dotdot;
169 struct inode *inode;
170 struct dentry *dentry;
171
172 gfs2_str2qstr(&dotdot, "..");
173 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
174
175 if (!inode)
176 return ERR_PTR(-ENOENT);
177 if (IS_ERR(inode))
178 return ERR_PTR(PTR_ERR(inode));
179
180 dentry = d_alloc_anon(inode);
181 if (!dentry) {
182 iput(inode);
183 return ERR_PTR(-ENOMEM);
184 }
185
186 return dentry;
187}
188
189static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_p)
190{
191 struct gfs2_sbd *sdp = sb->s_fs_info;
192 struct gfs2_inum *inum = (struct gfs2_inum *)inum_p;
193 struct gfs2_holder i_gh, ri_gh, rgd_gh;
194 struct gfs2_rgrpd *rgd;
195 struct gfs2_inode *ip;
196 struct inode *inode;
197 struct dentry *dentry;
198 int error;
199
200 /* System files are rejected below, at out_ip */
201
202 inode = gfs2_iget(sb, inum);
203 if (inode) {
204 ip = inode->u.generic_ip;
205 if (ip->i_num.no_formal_ino != inum->no_formal_ino) {
206 iput(inode);
207 return ERR_PTR(-ESTALE);
208 }
209 goto out_inode;
210 }
211
212 error = gfs2_glock_nq_num(sdp,
213 inum->no_addr, &gfs2_inode_glops,
214 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
215 &i_gh);
216 if (error)
217 return ERR_PTR(error);
218
219 error = gfs2_inode_get(i_gh.gh_gl, inum, NO_CREATE, &ip);
220 if (error)
221 goto fail;
222 if (ip)
223 goto out_ip;
224
225 error = gfs2_rindex_hold(sdp, &ri_gh);
226 if (error)
227 goto fail;
228
229 error = -EINVAL;
230 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
231 if (!rgd)
232 goto fail_rindex;
233
234 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
235 if (error)
236 goto fail_rindex;
237
238 error = -ESTALE;
239 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
240 goto fail_rgd;
241
242 gfs2_glock_dq_uninit(&rgd_gh);
243 gfs2_glock_dq_uninit(&ri_gh);
244
245 error = gfs2_inode_get(i_gh.gh_gl, inum, CREATE, &ip);
246 if (error)
247 goto fail;
248
249 error = gfs2_inode_refresh(ip);
250 if (error) {
251 gfs2_inode_put(ip);
252 goto fail;
253 }
254
255 out_ip:
256 error = -EIO;
257 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) {
258 gfs2_inode_put(ip);
259 goto fail;
260 }
261
262 gfs2_glock_dq_uninit(&i_gh);
263
264 inode = gfs2_ip2v(ip);
265 gfs2_inode_put(ip);
266
267 if (!inode)
268 return ERR_PTR(-ENOMEM);
269
270 out_inode:
271 dentry = d_alloc_anon(inode);
272 if (!dentry) {
273 iput(inode);
274 return ERR_PTR(-ENOMEM);
275 }
276
277 return dentry;
278
279 fail_rgd:
280 gfs2_glock_dq_uninit(&rgd_gh);
281
282 fail_rindex:
283 gfs2_glock_dq_uninit(&ri_gh);
284
285 fail:
286 gfs2_glock_dq_uninit(&i_gh);
287 return ERR_PTR(error);
288}
289
290struct export_operations gfs2_export_ops = {
291 .decode_fh = gfs2_decode_fh,
292 .encode_fh = gfs2_encode_fh,
293 .get_name = gfs2_get_name,
294 .get_parent = gfs2_get_parent,
295 .get_dentry = gfs2_get_dentry,
296};
297
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..88d58e57f518
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13extern struct export_operations gfs2_export_ops;
14
15#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..1e8f602c1e50
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,1000 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/iflags.h>
25#include <asm/uaccess.h>
26
27#include "gfs2.h"
28#include "lm_interface.h"
29#include "incore.h"
30#include "bmap.h"
31#include "dir.h"
32#include "glock.h"
33#include "glops.h"
34#include "inode.h"
35#include "lm.h"
36#include "log.h"
37#include "meta_io.h"
38#include "ops_file.h"
39#include "ops_vm.h"
40#include "quota.h"
41#include "rgrp.h"
42#include "trans.h"
43#include "util.h"
44#include "eaops.h"
45
46/* "bad" is for NFS support */
47struct filldir_bad_entry {
48 char *fbe_name;
49 unsigned int fbe_length;
50 uint64_t fbe_offset;
51 struct gfs2_inum fbe_inum;
52 unsigned int fbe_type;
53};
54
55struct filldir_bad {
56 struct gfs2_sbd *fdb_sbd;
57
58 struct filldir_bad_entry *fdb_entry;
59 unsigned int fdb_entry_num;
60 unsigned int fdb_entry_off;
61
62 char *fdb_name;
63 unsigned int fdb_name_size;
64 unsigned int fdb_name_off;
65};
66
67/* For regular, non-NFS */
68struct filldir_reg {
69 struct gfs2_sbd *fdr_sbd;
70 int fdr_prefetch;
71
72 filldir_t fdr_filldir;
73 void *fdr_opaque;
74};
75
76/*
77 * Most fields left uninitialised to catch anybody who tries to
78 * use them. f_flags set to prevent file_accessed() from touching
79 * any other part of this. Its use is purely as a flag so that we
80 * know (in readpage()) whether or not to do locking.
81 */
82struct file gfs2_internal_file_sentinal = {
83 .f_flags = O_NOATIME|O_RDONLY,
84};
85
86static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
87 unsigned long offset, unsigned long size)
88{
89 char *kaddr;
90 unsigned long count = desc->count;
91
92 if (size > count)
93 size = count;
94
95 kaddr = kmap(page);
96 memcpy(desc->arg.buf, kaddr + offset, size);
97 kunmap(page);
98
99 desc->count = count - size;
100 desc->written += size;
101 desc->arg.buf += size;
102 return size;
103}
104
105int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
106 char *buf, loff_t *pos, unsigned size)
107{
108 struct inode *inode = ip->i_vnode;
109 read_descriptor_t desc;
110 desc.written = 0;
111 desc.arg.buf = buf;
112 desc.count = size;
113 desc.error = 0;
114 do_generic_mapping_read(inode->i_mapping, ra_state,
115 &gfs2_internal_file_sentinal, pos, &desc,
116 gfs2_read_actor);
117 return desc.written ? desc.written : desc.error;
118}
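
Example use of gfs2_internal_read() (hypothetical demo; file_ra_state_init() is the stock VFS readahead initialiser): reading the head of an internal inode that has no struct file of its own.

	static int demo_read_head(struct gfs2_inode *ip, char *buf, unsigned int n)
	{
		struct file_ra_state ra_state;
		loff_t pos = 0;

		file_ra_state_init(&ra_state, ip->i_vnode->i_mapping);
		return gfs2_internal_read(ip, &ra_state, buf, &pos, n);
	}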
119
120/**
121 * gfs2_llseek - seek to a location in a file
122 * @file: the file
123 * @offset: the offset
124 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
125 *
126 * SEEK_END requires the glock for the file because it references the
127 * file's size.
128 *
129 * Returns: The new offset, or errno
130 */
131
132static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
133{
134 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
135 struct gfs2_holder i_gh;
136 loff_t error;
137
138 if (origin == 2) {
139 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
140 &i_gh);
141 if (!error) {
142 error = remote_llseek(file, offset, origin);
143 gfs2_glock_dq_uninit(&i_gh);
144 }
145 } else
146 error = remote_llseek(file, offset, origin);
147
148 return error;
149}
150
151
152static ssize_t gfs2_direct_IO_read(struct kiocb *iocb, const struct iovec *iov,
153 loff_t offset, unsigned long nr_segs)
154{
155 struct file *file = iocb->ki_filp;
156 struct address_space *mapping = file->f_mapping;
157 ssize_t retval;
158
159 retval = filemap_write_and_wait(mapping);
160 if (retval == 0) {
161 retval = mapping->a_ops->direct_IO(READ, iocb, iov, offset,
162 nr_segs);
163 }
164 return retval;
165}
166
167/**
168 * __gfs2_file_aio_read - The main GFS2 read function
169 *
170 * N.B. This is almost, but not quite the same as __generic_file_aio_read(),
171 * the important subtle difference being that inode->i_size isn't valid
172 * unless we are holding a lock, and we do this _only_ on the O_DIRECT
173 * path since otherwise locking is done entirely at the page cache
174 * layer.
175 */
176static ssize_t __gfs2_file_aio_read(struct kiocb *iocb,
177 const struct iovec *iov,
178 unsigned long nr_segs, loff_t *ppos)
179{
180 struct file *filp = iocb->ki_filp;
181 struct gfs2_inode *ip = filp->f_mapping->host->u.generic_ip;
182 struct gfs2_holder gh;
183 ssize_t retval;
184 unsigned long seg;
185 size_t count;
186
187 count = 0;
188 for (seg = 0; seg < nr_segs; seg++) {
189 const struct iovec *iv = &iov[seg];
190
191 /*
192 * If any segment has a negative length, or the cumulative
193 * length ever wraps negative then return -EINVAL.
194 */
195 count += iv->iov_len;
196 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
197 return -EINVAL;
198 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
199 continue;
200 if (seg == 0)
201 return -EFAULT;
202 nr_segs = seg;
203 count -= iv->iov_len; /* This segment is no good */
204 break;
205 }
206
207 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
208 if (filp->f_flags & O_DIRECT) {
209 loff_t pos = *ppos, size;
210 struct address_space *mapping;
211 struct inode *inode;
212
213 mapping = filp->f_mapping;
214 inode = mapping->host;
215 retval = 0;
216 if (!count)
217 goto out; /* skip atime */
218
219 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
220 retval = gfs2_glock_nq_m_atime(1, &gh);
221 if (retval)
222 goto out;
223 if (gfs2_is_stuffed(ip)) {
224 gfs2_glock_dq_m(1, &gh);
225 gfs2_holder_uninit(&gh);
226 goto fallback_to_normal;
227 }
228 size = i_size_read(inode);
229 if (pos < size) {
230 retval = gfs2_direct_IO_read(iocb, iov, pos, nr_segs);
231 if (retval > 0 && !is_sync_kiocb(iocb))
232 retval = -EIOCBQUEUED;
233 if (retval > 0)
234 *ppos = pos + retval;
235 }
236 file_accessed(filp);
237 gfs2_glock_dq_m(1, &gh);
238 gfs2_holder_uninit(&gh);
239 goto out;
240 }
241
242fallback_to_normal:
243 retval = 0;
244 if (count) {
245 for (seg = 0; seg < nr_segs; seg++) {
246 read_descriptor_t desc;
247
248 desc.written = 0;
249 desc.arg.buf = iov[seg].iov_base;
250 desc.count = iov[seg].iov_len;
251 if (desc.count == 0)
252 continue;
253 desc.error = 0;
254 do_generic_file_read(filp, ppos, &desc, file_read_actor);
255 retval += desc.written;
256 if (desc.error) {
257 retval = retval ?: desc.error;
258 break;
259 }
260 }
261 }
262out:
263 return retval;
264}
265
266/**
267 * gfs2_read - Read bytes from a file
268 * @file: The file to read from
269 * @buf: The buffer to copy into
270 * @size: The amount of data requested
271 * @offset: The current file offset
272 *
273 * Outputs: Offset - updated according to number of bytes read
274 *
275 * Returns: The number of bytes read, errno on failure
276 */
277
278static ssize_t gfs2_read(struct file *filp, char __user *buf, size_t size,
279 loff_t *offset)
280{
281 struct iovec local_iov = { .iov_base = buf, .iov_len = size };
282 struct kiocb kiocb;
283 ssize_t ret;
284
285 init_sync_kiocb(&kiocb, filp);
286 ret = __gfs2_file_aio_read(&kiocb, &local_iov, 1, offset);
287 if (-EIOCBQUEUED == ret)
288 ret = wait_on_sync_kiocb(&kiocb);
289 return ret;
290}
291
292static ssize_t gfs2_file_readv(struct file *filp, const struct iovec *iov,
293 unsigned long nr_segs, loff_t *ppos)
294{
295 struct kiocb kiocb;
296 ssize_t ret;
297
298 init_sync_kiocb(&kiocb, filp);
299 ret = __gfs2_file_aio_read(&kiocb, iov, nr_segs, ppos);
300 if (-EIOCBQUEUED == ret)
301 ret = wait_on_sync_kiocb(&kiocb);
302 return ret;
303}
304
305static ssize_t gfs2_file_aio_read(struct kiocb *iocb, char __user *buf,
306 size_t count, loff_t pos)
307{
308 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
309
310 BUG_ON(iocb->ki_pos != pos);
311 return __gfs2_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
312}
313
314
315/**
316 * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read()
317 * @opaque: opaque data used by the function
318 * @name: the name of the directory entry
319 * @length: the length of the name
320 * @offset: the entry's offset in the directory
321 * @inum: the inode number the entry points to
322 * @type: the type of inode the entry points to
323 *
324 * Returns: 0 on success, 1 if buffer full
325 */
326
327static int filldir_reg_func(void *opaque, const char *name, unsigned int length,
328 uint64_t offset, struct gfs2_inum *inum,
329 unsigned int type)
330{
331 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
332 struct gfs2_sbd *sdp = fdr->fdr_sbd;
333 int error;
334
335 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
336 inum->no_formal_ino, type);
337 if (error)
338 return 1;
339
340 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
341 gfs2_glock_prefetch_num(sdp,
342 inum->no_addr, &gfs2_inode_glops,
343 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
344 gfs2_glock_prefetch_num(sdp,
345 inum->no_addr, &gfs2_iopen_glops,
346 LM_ST_SHARED, LM_FLAG_TRY);
347 }
348
349 return 0;
350}
351
352/**
353 * readdir_reg - Read directory entries from a directory
354 * @file: The directory to read from
355 * @dirent: Buffer for dirents
356 * @filldir: Function used to do the copying
357 *
358 * Returns: errno
359 */
360
361static int readdir_reg(struct file *file, void *dirent, filldir_t filldir)
362{
363 struct inode *dir = file->f_mapping->host;
364 struct gfs2_inode *dip = dir->u.generic_ip;
365 struct filldir_reg fdr;
366 struct gfs2_holder d_gh;
367 uint64_t offset = file->f_pos;
368 int error;
369
370 fdr.fdr_sbd = dip->i_sbd;
371 fdr.fdr_prefetch = 1;
372 fdr.fdr_filldir = filldir;
373 fdr.fdr_opaque = dirent;
374
375 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
376 error = gfs2_glock_nq_atime(&d_gh);
377 if (error) {
378 gfs2_holder_uninit(&d_gh);
379 return error;
380 }
381
382 error = gfs2_dir_read(dir, &offset, &fdr, filldir_reg_func);
383
384 gfs2_glock_dq_uninit(&d_gh);
385
386 file->f_pos = offset;
387
388 return error;
389}
390
391/**
392 * filldir_bad_func - Report a directory entry to the caller of gfs2_dir_read()
393 * @opaque: opaque data used by the function
394 * @name: the name of the directory entry
395 * @length: the length of the name
396 * @offset: the entry's offset in the directory
397 * @inum: the inode number the entry points to
398 * @type: the type of inode the entry points to
399 *
400 * For supporting NFS.
401 *
402 * Returns: 0 on success, 1 if buffer full
403 */
404
405static int filldir_bad_func(void *opaque, const char *name, unsigned int length,
406 uint64_t offset, struct gfs2_inum *inum,
407 unsigned int type)
408{
409 struct filldir_bad *fdb = (struct filldir_bad *)opaque;
410 struct gfs2_sbd *sdp = fdb->fdb_sbd;
411 struct filldir_bad_entry *fbe;
412
413 if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
414 fdb->fdb_name_off + length > fdb->fdb_name_size)
415 return 1;
416
417 fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
418 fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
419 memcpy(fbe->fbe_name, name, length);
420 fbe->fbe_length = length;
421 fbe->fbe_offset = offset;
422 fbe->fbe_inum = *inum;
423 fbe->fbe_type = type;
424
425 fdb->fdb_entry_off++;
426 fdb->fdb_name_off += length;
427
428 if (!(length == 1 && *name == '.')) {
429 gfs2_glock_prefetch_num(sdp,
430 inum->no_addr, &gfs2_inode_glops,
431 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
432 gfs2_glock_prefetch_num(sdp,
433 inum->no_addr, &gfs2_iopen_glops,
434 LM_ST_SHARED, LM_FLAG_TRY);
435 }
436
437 return 0;
438}
439
440/**
441 * readdir_bad - Read directory entries from a directory
442 * @file: The directory to read from
443 * @dirent: Buffer for dirents
444 * @filldir: Function used to do the copying
445 *
446 * For supporting NFS.
447 *
448 * Returns: errno
449 */
450
451static int readdir_bad(struct file *file, void *dirent, filldir_t filldir)
452{
453 struct inode *dir = file->f_mapping->host;
454 struct gfs2_inode *dip = dir->u.generic_ip;
455 struct gfs2_sbd *sdp = dip->i_sbd;
456 struct filldir_reg fdr;
457 unsigned int entries, size;
458 struct filldir_bad *fdb;
459 struct gfs2_holder d_gh;
460 uint64_t offset = file->f_pos;
461 unsigned int x;
462 struct filldir_bad_entry *fbe;
463 int error;
464
465 entries = gfs2_tune_get(sdp, gt_entries_per_readdir);
466 size = sizeof(struct filldir_bad) +
467 entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE);
468
469 fdb = kzalloc(size, GFP_KERNEL);
470 if (!fdb)
471 return -ENOMEM;
472
473 fdb->fdb_sbd = sdp;
474 fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
475 fdb->fdb_entry_num = entries;
476 fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
477 entries * sizeof(struct filldir_bad_entry);
478 fdb->fdb_name_size = entries * GFS2_FAST_NAME_SIZE;
479
480 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
481 error = gfs2_glock_nq_atime(&d_gh);
482 if (error) {
483 gfs2_holder_uninit(&d_gh);
484 goto out;
485 }
486
487 error = gfs2_dir_read(dir, &offset, fdb, filldir_bad_func);
488
489 gfs2_glock_dq_uninit(&d_gh);
490
491 fdr.fdr_sbd = sdp;
492 fdr.fdr_prefetch = 0;
493 fdr.fdr_filldir = filldir;
494 fdr.fdr_opaque = dirent;
495
496 for (x = 0; x < fdb->fdb_entry_off; x++) {
497 fbe = &fdb->fdb_entry[x];
498
499 error = filldir_reg_func(&fdr,
500 fbe->fbe_name, fbe->fbe_length,
501 fbe->fbe_offset,
502 &fbe->fbe_inum, fbe->fbe_type);
503 if (error) {
504 file->f_pos = fbe->fbe_offset;
505 error = 0;
506 goto out;
507 }
508 }
509
510 file->f_pos = offset;
511
512 out:
513 kfree(fdb);
514
515 return error;
516}
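
The single allocation readdir_bad() carves up, spelled out (the size helper is hypothetical; readdir_bad() computes this inline): a header, then the fixed-size entry array, then one shared name buffer that the entries' fbe_name pointers index into.

	/* [ struct filldir_bad | entries x filldir_bad_entry | name bytes ] */
	static size_t demo_fdb_size(unsigned int entries)
	{
		return sizeof(struct filldir_bad) +
		       entries * (sizeof(struct filldir_bad_entry) +
				  GFS2_FAST_NAME_SIZE);
	}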
517
518/**
519 * gfs2_readdir - Read directory entries from a directory
520 * @file: The directory to read from
521 * @dirent: Buffer for dirents
522 * @filldir: Function used to do the copying
523 *
524 * Returns: errno
525 */
526
527static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
528{
529 int error;
530
531 if (strcmp(current->comm, "nfsd") != 0)
532 error = readdir_reg(file, dirent, filldir);
533 else
534 error = readdir_bad(file, dirent, filldir);
535
536 return error;
537}
538
539static const u32 iflags_to_gfs2[32] = {
540 [iflag_Sync] = GFS2_DIF_SYNC,
541 [iflag_Immutable] = GFS2_DIF_IMMUTABLE,
542 [iflag_Append] = GFS2_DIF_APPENDONLY,
543 [iflag_NoAtime] = GFS2_DIF_NOATIME,
544 [iflag_Index] = GFS2_DIF_EXHASH,
545 [iflag_JournalData] = GFS2_DIF_JDATA,
546 [iflag_DirectIO] = GFS2_DIF_DIRECTIO,
547};
548
549static const u32 gfs2_to_iflags[32] = {
550 [gfs2fl_Sync] = IFLAG_SYNC,
551 [gfs2fl_Immutable] = IFLAG_IMMUTABLE,
552 [gfs2fl_AppendOnly] = IFLAG_APPEND,
553 [gfs2fl_NoAtime] = IFLAG_NOATIME,
554 [gfs2fl_ExHash] = IFLAG_INDEX,
555 [gfs2fl_Jdata] = IFLAG_JOURNAL_DATA,
556 [gfs2fl_Directio] = IFLAG_DIRECTIO,
557 [gfs2fl_InheritDirectio] = IFLAG_DIRECTIO,
558 [gfs2fl_InheritJdata] = IFLAG_JOURNAL_DATA,
559};
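
iflags_cvt() is provided by <linux/iflags.h> (included above); conceptually it walks the set bits of the input and ORs together the table entry for each one, roughly as in this sketch (assumed behaviour, not the header's actual code):

	static u32 demo_iflags_cvt(const u32 *table, u32 val)
	{
		u32 res = 0;
		int i;

		for (i = 0; i < 32; i++)
			if (val & (1 << i))
				res |= table[i];
		return res;
	}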
560
561static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
562{
563 struct inode *inode = filp->f_dentry->d_inode;
564 struct gfs2_inode *ip = inode->u.generic_ip;
565 struct gfs2_holder gh;
566 int error;
567 u32 iflags;
568
569 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
570 error = gfs2_glock_nq_m_atime(1, &gh);
571 if (error)
572 return error;
573
574 iflags = iflags_cvt(gfs2_to_iflags, ip->i_di.di_flags);
575 if (put_user(iflags, ptr))
576 error = -EFAULT;
577
578 gfs2_glock_dq_m(1, &gh);
579 gfs2_holder_uninit(&gh);
580 return error;
581}
582
583/* Flags that can be set by user space */
584#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
585 GFS2_DIF_DIRECTIO| \
586 GFS2_DIF_IMMUTABLE| \
587 GFS2_DIF_APPENDONLY| \
588 GFS2_DIF_NOATIME| \
589 GFS2_DIF_SYNC| \
590 GFS2_DIF_SYSTEM| \
591 GFS2_DIF_INHERIT_DIRECTIO| \
592 GFS2_DIF_INHERIT_JDATA)
593
594/**
595 * do_gfs2_set_flags - set flags on an inode
596 * @filp: The file whose inode's flags are to be changed
597 * @reqflags: The flags to set
598 * @mask: Indicates which flags are valid
599 *
600 */
601static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
602{
603 struct inode *inode = filp->f_dentry->d_inode;
604 struct gfs2_inode *ip = inode->u.generic_ip;
605 struct gfs2_sbd *sdp = ip->i_sbd;
606 struct buffer_head *bh;
607 struct gfs2_holder gh;
608 int error;
609 u32 new_flags, flags;
610
611 /* gfs2_glock_nq_init() initialises the holder itself and, on
612 failure, uninits it, so no separate init/uninit is needed */
613 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
614 if (error)
615 return error;
616 
617
618 flags = ip->i_di.di_flags;
619 new_flags = (flags & ~mask) | (reqflags & mask);
620 if ((new_flags ^ flags) == 0)
621 goto out;
622
623 if (S_ISDIR(inode->i_mode)) {
624 if ((new_flags ^ flags) & GFS2_DIF_JDATA)
625 new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
626 if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
627 new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
628 }
629
630 error = -EINVAL;
631 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
632 goto out;
633
634 error = -EPERM;
635 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
636 goto out;
637 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
638 goto out;
639 if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
640 !capable(CAP_LINUX_IMMUTABLE))
641 goto out;
642 if (!IS_IMMUTABLE(inode)) {
643 error = gfs2_repermission(inode, MAY_WRITE, NULL);
644 if (error)
645 goto out;
646 }
647
648 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
649 if (error)
650 goto out;
651 error = gfs2_meta_inode_buffer(ip, &bh);
652 if (error)
653 goto out_trans_end;
654 gfs2_trans_add_bh(ip->i_gl, bh, 1);
655 ip->i_di.di_flags = new_flags;
656 gfs2_dinode_out(&ip->i_di, bh->b_data);
657 brelse(bh);
658out_trans_end:
659 gfs2_trans_end(sdp);
660out:
661 gfs2_glock_dq_uninit(&gh);
662 return error;
663}
664
665static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
666{
667 u32 iflags, gfsflags;
668 if (get_user(iflags, ptr))
669 return -EFAULT;
670 gfsflags = iflags_cvt(iflags_to_gfs2, iflags);
671 return do_gfs2_set_flags(filp, gfsflags, ~0);
672}
673
674static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
675{
676 switch(cmd) {
677 case IFLAGS_GET_IOC:
678 return gfs2_get_flags(filp, (u32 __user *)arg);
679 case IFLAGS_SET_IOC:
680 return gfs2_set_flags(filp, (u32 __user *)arg);
681 }
682 return -ENOTTY;
683}
684
685
686/**
687 * gfs2_mmap - set up a memory mapping of a file
688 * @file: The file to map
689 * @vma: The VMA which described the mapping
690 *
691 * Returns: 0 or error code
692 */
693
694static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
695{
696 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
697 struct gfs2_holder i_gh;
698 int error;
699
700 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
701 error = gfs2_glock_nq_atime(&i_gh);
702 if (error) {
703 gfs2_holder_uninit(&i_gh);
704 return error;
705 }
706
707 /* This is VM_MAYWRITE instead of VM_WRITE because a call
708 to mprotect() can turn on VM_WRITE later. */
709
710 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
711 (VM_MAYSHARE | VM_MAYWRITE))
712 vma->vm_ops = &gfs2_vm_ops_sharewrite;
713 else
714 vma->vm_ops = &gfs2_vm_ops_private;
715
716 gfs2_glock_dq_uninit(&i_gh);
717
718 return error;
719}
720
721/**
722 * gfs2_open - open a file
723 * @inode: the inode to open
724 * @file: the struct file for this opening
725 *
726 * Returns: errno
727 */
728
729static int gfs2_open(struct inode *inode, struct file *file)
730{
731 struct gfs2_inode *ip = inode->u.generic_ip;
732 struct gfs2_holder i_gh;
733 struct gfs2_file *fp;
734 int error;
735
736 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
737 if (!fp)
738 return -ENOMEM;
739
740 mutex_init(&fp->f_fl_mutex);
741
742 gfs2_assert_warn(ip->i_sbd, !file->private_data);
743 file->private_data = fp;
744
745 if (S_ISREG(ip->i_di.di_mode)) {
746 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
747 &i_gh);
748 if (error)
749 goto fail;
750
751 if (!(file->f_flags & O_LARGEFILE) &&
752 ip->i_di.di_size > MAX_NON_LFS) {
753 error = -EFBIG;
754 goto fail_gunlock;
755 }
756
757 /* Listen to the Direct I/O flag */
758
759 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
760 file->f_flags |= O_DIRECT;
761
762 gfs2_glock_dq_uninit(&i_gh);
763 }
764
765 return 0;
766
767 fail_gunlock:
768 gfs2_glock_dq_uninit(&i_gh);
769
770 fail:
771 file->private_data = NULL;
772 kfree(fp);
773
774 return error;
775}
776
777/**
778 * gfs2_close - called to close a struct file
779 * @inode: the inode the struct file belongs to
780 * @file: the struct file being closed
781 *
782 * Returns: errno
783 */
784
785static int gfs2_close(struct inode *inode, struct file *file)
786{
787 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
788 struct gfs2_file *fp;
789
790 fp = file->private_data;
791 file->private_data = NULL;
792
793 if (gfs2_assert_warn(sdp, fp))
794 return -EIO;
795
796 kfree(fp);
797
798 return 0;
799}
800
801/**
802 * gfs2_fsync - sync the dirty data for a file (across the cluster)
803 * @file: the file that points to the dentry (we ignore this)
804 * @dentry: the dentry that points to the inode to sync
805 *
806 * Returns: errno
807 */
808
809static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
810{
811 struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
812
813 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
814
815 return 0;
816}
817
818/**
819 * gfs2_lock - acquire/release a posix lock on a file
820 * @file: the file pointer
821 * @cmd: either modify or retrieve lock state, possibly wait
822 * @fl: type and range of lock
823 *
824 * Returns: errno
825 */
826
827static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
828{
829 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
830 struct gfs2_sbd *sdp = ip->i_sbd;
831 struct lm_lockname name =
832 { .ln_number = ip->i_num.no_addr,
833 .ln_type = LM_TYPE_PLOCK };
834
835 if (!(fl->fl_flags & FL_POSIX))
836 return -ENOLCK;
837 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
838 return -ENOLCK;
839
840 if (sdp->sd_args.ar_localflocks) {
841 if (IS_GETLK(cmd)) {
842 struct file_lock tmp;
843 int ret;
844 ret = posix_test_lock(file, fl, &tmp);
845 fl->fl_type = F_UNLCK;
846 if (ret)
847 memcpy(fl, &tmp, sizeof(struct file_lock));
848 return 0;
849 } else {
850 return posix_lock_file_wait(file, fl);
851 }
852 }
853
854 if (IS_GETLK(cmd))
855 return gfs2_lm_plock_get(sdp, &name, file, fl);
856 else if (fl->fl_type == F_UNLCK)
857 return gfs2_lm_punlock(sdp, &name, file, fl);
858 else
859 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
860}
861
862/**
863 * gfs2_sendfile - Send bytes to a file or socket
864 * @in_file: The file to read from
865 * @offset: The beginning file offset
866 * @count: The amount of data
867 * @actor: Routine that copies the data to @target (e.g. a socket)
868 *
869 * Outputs: offset - updated according to number of bytes read
870 *
871 * Returns: The number of bytes sent, errno on failure
872 */
873
874static ssize_t gfs2_sendfile(struct file *in_file, loff_t *offset, size_t count,
875 read_actor_t actor, void *target)
876{
877 return generic_file_sendfile(in_file, offset, count, actor, target);
878}
879
880static int do_flock(struct file *file, int cmd, struct file_lock *fl)
881{
882 struct gfs2_file *fp = file->private_data;
883 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
884 struct gfs2_inode *ip = file->f_dentry->d_inode->u.generic_ip;
885 struct gfs2_glock *gl;
886 unsigned int state;
887 int flags;
888 int error = 0;
889
890 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
891 flags = ((IS_SETLKW(cmd)) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
892
893 mutex_lock(&fp->f_fl_mutex);
894
895 gl = fl_gh->gh_gl;
896 if (gl) {
897 if (fl_gh->gh_state == state)
898 goto out;
899 gfs2_glock_hold(gl);
900 flock_lock_file_wait(file,
901 &(struct file_lock){.fl_type = F_UNLCK});
902 gfs2_glock_dq_uninit(fl_gh);
903 } else {
904 error = gfs2_glock_get(ip->i_sbd,
905 ip->i_num.no_addr, &gfs2_flock_glops,
906 CREATE, &gl);
907 if (error)
908 goto out;
909 }
910
911 gfs2_holder_init(gl, state, flags, fl_gh);
912 gfs2_glock_put(gl);
913
914 error = gfs2_glock_nq(fl_gh);
915 if (error) {
916 gfs2_holder_uninit(fl_gh);
917 if (error == GLR_TRYFAILED)
918 error = -EAGAIN;
919 } else {
920 error = flock_lock_file_wait(file, fl);
921 gfs2_assert_warn(ip->i_sbd, !error);
922 }
923
924 out:
925 mutex_unlock(&fp->f_fl_mutex);
926
927 return error;
928}
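
The flock-to-glock translation used by do_flock(), isolated (hypothetical helper): the lock type selects the glock state, a non-blocking SETLK adds LM_FLAG_TRY, and GL_EXACT|GL_NOCACHE keep the glock state exact and uncached.

	static void demo_flock_to_glock(int cmd, int type, unsigned int *state,
					int *flags)
	{
		*state = (type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
		*flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
	}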
929
930static void do_unflock(struct file *file, struct file_lock *fl)
931{
932 struct gfs2_file *fp = file->private_data;
933 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
934
935 mutex_lock(&fp->f_fl_mutex);
936 flock_lock_file_wait(file, fl);
937 if (fl_gh->gh_gl)
938 gfs2_glock_dq_uninit(fl_gh);
939 mutex_unlock(&fp->f_fl_mutex);
940}
941
942/**
943 * gfs2_flock - acquire/release a flock lock on a file
944 * @file: the file pointer
945 * @cmd: either modify or retrieve lock state, possibly wait
946 * @fl: type and range of lock
947 *
948 * Returns: errno
949 */
950
951static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
952{
953 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
954 struct gfs2_sbd *sdp = ip->i_sbd;
955
956 if (!(fl->fl_flags & FL_FLOCK))
957 return -ENOLCK;
958 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
959 return -ENOLCK;
960
961 if (sdp->sd_args.ar_localflocks)
962 return flock_lock_file_wait(file, fl);
963
964 if (fl->fl_type == F_UNLCK) {
965 do_unflock(file, fl);
966 return 0;
967 } else
968 return do_flock(file, cmd, fl);
969}

struct file_operations gfs2_file_fops = {
	.llseek = gfs2_llseek,
	.read = gfs2_read,
	.readv = gfs2_file_readv,
	.aio_read = gfs2_file_aio_read,
	.write = generic_file_write,
	.writev = generic_file_writev,
	.aio_write = generic_file_aio_write,
	.unlocked_ioctl = gfs2_ioctl,
	.mmap = gfs2_mmap,
	.open = gfs2_open,
	.release = gfs2_close,
	.fsync = gfs2_fsync,
	.lock = gfs2_lock,
	.sendfile = gfs2_sendfile,
	.flock = gfs2_flock,
	.splice_read = generic_file_splice_read,
	.splice_write = generic_file_splice_write,
};

struct file_operations gfs2_dir_fops = {
	.readdir = gfs2_readdir,
	.unlocked_ioctl = gfs2_ioctl,
	.open = gfs2_open,
	.release = gfs2_close,
	.fsync = gfs2_fsync,
	.lock = gfs2_lock,
	.flock = gfs2_flock,
};

diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..a2edce38f5cb
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,20 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __OPS_FILE_DOT_H__
#define __OPS_FILE_DOT_H__

extern struct file gfs2_internal_file_sentinal;
extern int gfs2_internal_read(struct gfs2_inode *ip,
			      struct file_ra_state *ra_state,
			      char *buf, loff_t *pos, unsigned size);

extern struct file_operations gfs2_file_fops;
extern struct file_operations gfs2_dir_fops;

#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..a45982045509
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,901 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/gfs2_ondisk.h>

#include "gfs2.h"
#include "lm_interface.h"
#include "incore.h"
#include "daemon.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
#include "lm.h"
#include "mount.h"
#include "ops_export.h"
#include "ops_fstype.h"
#include "ops_super.h"
#include "recovery.h"
#include "rgrp.h"
#include "super.h"
#include "unlinked.h"
#include "sys.h"
#include "util.h"

#define DO 0
#define UNDO 1
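
/*
 * Each init_*() helper below can be called twice: once with DO to set
 * things up and once with UNDO to tear down, in reverse order, whatever
 * a previous DO call built.  fill_super() relies on this to unwind
 * cleanly when a later initialization step fails.
 */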

extern struct dentry_operations gfs2_dops;

static struct gfs2_sbd *init_sbd(struct super_block *sb)
{
	struct gfs2_sbd *sdp;
	unsigned int x;

	sdp = vmalloc(sizeof(struct gfs2_sbd));
	if (!sdp)
		return NULL;

	memset(sdp, 0, sizeof(struct gfs2_sbd));

	sb->s_fs_info = sdp;
	sdp->sd_vfs = sb;

	gfs2_tune_init(&sdp->sd_tune);

	for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
		sdp->sd_gl_hash[x].hb_lock = RW_LOCK_UNLOCKED;
		INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
	}
	INIT_LIST_HEAD(&sdp->sd_reclaim_list);
	spin_lock_init(&sdp->sd_reclaim_lock);
	init_waitqueue_head(&sdp->sd_reclaim_wq);
	mutex_init(&sdp->sd_invalidate_inodes_mutex);

	mutex_init(&sdp->sd_inum_mutex);
	spin_lock_init(&sdp->sd_statfs_spin);
	mutex_init(&sdp->sd_statfs_mutex);

	spin_lock_init(&sdp->sd_rindex_spin);
	mutex_init(&sdp->sd_rindex_mutex);
	INIT_LIST_HEAD(&sdp->sd_rindex_list);
	INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
	INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);

	INIT_LIST_HEAD(&sdp->sd_jindex_list);
	spin_lock_init(&sdp->sd_jindex_spin);
	mutex_init(&sdp->sd_jindex_mutex);

	INIT_LIST_HEAD(&sdp->sd_unlinked_list);
	spin_lock_init(&sdp->sd_unlinked_spin);
	mutex_init(&sdp->sd_unlinked_mutex);

	INIT_LIST_HEAD(&sdp->sd_quota_list);
	spin_lock_init(&sdp->sd_quota_spin);
	mutex_init(&sdp->sd_quota_mutex);

	spin_lock_init(&sdp->sd_log_lock);

	INIT_LIST_HEAD(&sdp->sd_log_le_gl);
	INIT_LIST_HEAD(&sdp->sd_log_le_buf);
	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
	INIT_LIST_HEAD(&sdp->sd_log_le_rg);
	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);

	mutex_init(&sdp->sd_log_reserve_mutex);
	INIT_LIST_HEAD(&sdp->sd_ail1_list);
	INIT_LIST_HEAD(&sdp->sd_ail2_list);

	init_rwsem(&sdp->sd_log_flush_lock);
	INIT_LIST_HEAD(&sdp->sd_log_flush_list);

	INIT_LIST_HEAD(&sdp->sd_revoke_list);

	mutex_init(&sdp->sd_freeze_lock);

	return sdp;
}

static void init_vfs(struct super_block *sb, unsigned noatime)
{
	struct gfs2_sbd *sdp = sb->s_fs_info;

	sb->s_magic = GFS2_MAGIC;
	sb->s_op = &gfs2_super_ops;
	sb->s_export_op = &gfs2_export_ops;
	sb->s_maxbytes = MAX_LFS_FILESIZE;

	if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
		set_bit(noatime, &sdp->sd_flags);

	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
}

static int init_names(struct gfs2_sbd *sdp, int silent)
{
	struct gfs2_sb *sb = NULL;
	char *proto, *table;
	int error = 0;

	proto = sdp->sd_args.ar_lockproto;
	table = sdp->sd_args.ar_locktable;

	/* Try to autodetect */

	if (!proto[0] || !table[0]) {
		struct buffer_head *bh;
		bh = sb_getblk(sdp->sd_vfs,
			       GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
		lock_buffer(bh);
		clear_buffer_uptodate(bh);
		clear_buffer_dirty(bh);
		unlock_buffer(bh);
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);

		if (!buffer_uptodate(bh)) {
			brelse(bh);
			return -EIO;
		}

		sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL);
		if (!sb) {
			brelse(bh);
			return -ENOMEM;
		}
		gfs2_sb_in(sb, bh->b_data);
		brelse(bh);

		error = gfs2_check_sb(sdp, sb, silent);
		if (error)
			goto out;

		if (!proto[0])
			proto = sb->sb_lockproto;
		if (!table[0])
			table = sb->sb_locktable;
	}

	if (!table[0])
		table = sdp->sd_vfs->s_id;

	snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
	snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);

 out:
	kfree(sb);

	return error;
}

static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
			int undo)
{
	struct task_struct *p;
	int error = 0;

	if (undo)
		goto fail_trans;

	p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
	if (IS_ERR(p)) {
		error = PTR_ERR(p);
		fs_err(sdp, "can't start scand thread: %d\n", error);
		return error;
	}
	sdp->sd_scand_process = p;

	for (sdp->sd_glockd_num = 0;
	     sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
	     sdp->sd_glockd_num++) {
		p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
		if (IS_ERR(p)) {
			error = PTR_ERR(p);
			fs_err(sdp, "can't start glockd thread: %d\n", error);
			goto fail;
		}
		sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
	}

	error = gfs2_glock_nq_num(sdp,
				  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
				  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
				  mount_gh);
	if (error) {
		fs_err(sdp, "can't acquire mount glock: %d\n", error);
		goto fail;
	}

	error = gfs2_glock_nq_num(sdp,
				  GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
				  LM_ST_SHARED,
				  LM_FLAG_NOEXP | GL_EXACT,
				  &sdp->sd_live_gh);
	if (error) {
		fs_err(sdp, "can't acquire live glock: %d\n", error);
		goto fail_mount;
	}

	error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
			       CREATE, &sdp->sd_rename_gl);
	if (error) {
		fs_err(sdp, "can't create rename glock: %d\n", error);
		goto fail_live;
	}

	error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
			       CREATE, &sdp->sd_trans_gl);
	if (error) {
		fs_err(sdp, "can't create transaction glock: %d\n", error);
		goto fail_rename;
	}
	set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);

	return 0;

 fail_trans:
	gfs2_glock_put(sdp->sd_trans_gl);

 fail_rename:
	gfs2_glock_put(sdp->sd_rename_gl);

 fail_live:
	gfs2_glock_dq_uninit(&sdp->sd_live_gh);

 fail_mount:
	gfs2_glock_dq_uninit(mount_gh);

 fail:
	while (sdp->sd_glockd_num--)
		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);

	kthread_stop(sdp->sd_scand_process);

	return error;
}

static struct inode *gfs2_lookup_root(struct gfs2_sbd *sdp,
				      const struct gfs2_inum *inum)
{
	int error;
	struct gfs2_glock *gl;
	struct gfs2_inode *ip;
	struct inode *inode;

	error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops,
			       CREATE, &gl);
	if (!error) {
		error = gfs2_inode_get(gl, inum, CREATE, &ip);
		if (!error) {
			gfs2_inode_min_init(ip, DT_DIR);
			inode = gfs2_ip2v(ip);
			gfs2_inode_put(ip);
			gfs2_glock_put(gl);
			return inode;
		}
		gfs2_glock_put(gl);
	}
	return ERR_PTR(error);
}

static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
{
	struct super_block *sb = sdp->sd_vfs;
	struct gfs2_holder sb_gh;
	struct gfs2_inum *inum;
	struct inode *inode;
	int error = 0;

	if (undo)
		return 0;

	error = gfs2_glock_nq_num(sdp,
				  GFS2_SB_LOCK, &gfs2_meta_glops,
				  LM_ST_SHARED, 0, &sb_gh);
	if (error) {
		fs_err(sdp, "can't acquire superblock glock: %d\n", error);
		return error;
	}

	error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
	if (error) {
		fs_err(sdp, "can't read superblock: %d\n", error);
		goto out;
	}

	/* Set up the buffer cache and SB for real */
	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
		error = -EINVAL;
		fs_err(sdp, "FS block size (%u) is too small for device "
		       "block size (%u)\n",
		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
		goto out;
	}
	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
		error = -EINVAL;
		fs_err(sdp, "FS block size (%u) is too big for machine "
		       "page size (%u)\n",
		       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
		goto out;
	}

	/* Get rid of buffers from the original block size */
	sb_gh.gh_gl->gl_ops->go_inval(sb_gh.gh_gl, DIO_METADATA | DIO_DATA);
	sb_gh.gh_gl->gl_aspace->i_blkbits = sdp->sd_sb.sb_bsize_shift;

	sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);

	/* Get the root inode */
	inum = &sdp->sd_sb.sb_root_dir;
	if (sb->s_type == &gfs2meta_fs_type)
		inum = &sdp->sd_sb.sb_master_dir;
	inode = gfs2_lookup_root(sdp, inum);
	if (IS_ERR(inode)) {
		error = PTR_ERR(inode);
		fs_err(sdp, "can't read in root inode: %d\n", error);
		goto out;
	}

	sb->s_root = d_alloc_root(inode);
	if (!sb->s_root) {
		fs_err(sdp, "can't get root dentry\n");
		error = -ENOMEM;
		iput(inode);
		goto out;
	}
	sb->s_root->d_op = &gfs2_dops;
out:
	gfs2_glock_dq_uninit(&sb_gh);
	return error;
}

static int init_journal(struct gfs2_sbd *sdp, int undo)
{
	struct gfs2_holder ji_gh;
	struct task_struct *p;
	struct gfs2_inode *ip;
	int jindex = 1;
	int error = 0;

	if (undo) {
		jindex = 0;
		goto fail_recoverd;
	}

	sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
	if (IS_ERR(sdp->sd_jindex)) {
		error = PTR_ERR(sdp->sd_jindex);
		fs_err(sdp, "can't lookup journal index: %d\n", error);
		return error;
	}
	ip = sdp->sd_jindex->u.generic_ip;
	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);

	/* Load in the journal index special file */

	error = gfs2_jindex_hold(sdp, &ji_gh);
	if (error) {
		fs_err(sdp, "can't read journal index: %d\n", error);
		goto fail;
	}

	error = -EINVAL;
	if (!gfs2_jindex_size(sdp)) {
		fs_err(sdp, "no journals!\n");
		goto fail_jindex;
	}

	if (sdp->sd_args.ar_spectator) {
		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
	} else {
		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
			fs_err(sdp, "can't mount journal #%u\n",
			       sdp->sd_lockstruct.ls_jid);
			fs_err(sdp, "there are only %u journals (0 - %u)\n",
			       gfs2_jindex_size(sdp),
			       gfs2_jindex_size(sdp) - 1);
			goto fail_jindex;
		}
		sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);

		error = gfs2_glock_nq_num(sdp,
					  sdp->sd_lockstruct.ls_jid,
					  &gfs2_journal_glops,
					  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
					  &sdp->sd_journal_gh);
		if (error) {
			fs_err(sdp, "can't acquire journal glock: %d\n", error);
			goto fail_jindex;
		}

		ip = sdp->sd_jdesc->jd_inode->u.generic_ip;
		error = gfs2_glock_nq_init(ip->i_gl,
					   LM_ST_SHARED,
					   LM_FLAG_NOEXP | GL_EXACT,
					   &sdp->sd_jinode_gh);
		if (error) {
			fs_err(sdp, "can't acquire journal inode glock: %d\n",
			       error);
			goto fail_journal_gh;
		}

		error = gfs2_jdesc_check(sdp->sd_jdesc);
		if (error) {
			fs_err(sdp, "my journal (%u) is bad: %d\n",
			       sdp->sd_jdesc->jd_jid, error);
			goto fail_jinode_gh;
		}
		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
	}

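	/* The first node to mount (ls_first) recovers every journal so the
	   filesystem is known clean before other nodes may mount; later
	   mounters replay only their own journal. */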
	if (sdp->sd_lockstruct.ls_first) {
		unsigned int x;
		for (x = 0; x < sdp->sd_journals; x++) {
			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
			if (error) {
				fs_err(sdp, "error recovering journal %u: %d\n",
				       x, error);
				goto fail_jinode_gh;
			}
		}

		gfs2_lm_others_may_mount(sdp);
	} else if (!sdp->sd_args.ar_spectator) {
		error = gfs2_recover_journal(sdp->sd_jdesc);
		if (error) {
			fs_err(sdp, "error recovering my journal: %d\n", error);
			goto fail_jinode_gh;
		}
	}

	set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
	gfs2_glock_dq_uninit(&ji_gh);
	jindex = 0;

	p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
	if (IS_ERR(p)) {
		error = PTR_ERR(p);
		fs_err(sdp, "can't start recoverd thread: %d\n", error);
		goto fail_jinode_gh;
	}
	sdp->sd_recoverd_process = p;

	return 0;

 fail_recoverd:
	kthread_stop(sdp->sd_recoverd_process);

 fail_jinode_gh:
	if (!sdp->sd_args.ar_spectator)
		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);

 fail_journal_gh:
	if (!sdp->sd_args.ar_spectator)
		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);

 fail_jindex:
	gfs2_jindex_free(sdp);
	if (jindex)
		gfs2_glock_dq_uninit(&ji_gh);

 fail:
	iput(sdp->sd_jindex);

	return error;
}

static int init_inodes(struct gfs2_sbd *sdp, int undo)
{
	int error = 0;
	struct gfs2_inode *ip;
	struct inode *inode;

	if (undo)
		goto fail_qinode;

	inode = gfs2_lookup_root(sdp, &sdp->sd_sb.sb_master_dir);
	if (IS_ERR(inode)) {
		error = PTR_ERR(inode);
		fs_err(sdp, "can't read in master directory: %d\n", error);
		goto fail;
	}
	sdp->sd_master_dir = inode;

	error = init_journal(sdp, undo);
	if (error)
		goto fail_master;

	/* Read in the master inode number inode */
	sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
	if (IS_ERR(sdp->sd_inum_inode)) {
		error = PTR_ERR(sdp->sd_inum_inode);
		fs_err(sdp, "can't read in inum inode: %d\n", error);
		goto fail_journal;
	}

	/* Read in the master statfs inode */
	sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
	if (IS_ERR(sdp->sd_statfs_inode)) {
		error = PTR_ERR(sdp->sd_statfs_inode);
		fs_err(sdp, "can't read in statfs inode: %d\n", error);
		goto fail_inum;
	}

	/* Read in the resource index inode */
	sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
	if (IS_ERR(sdp->sd_rindex)) {
		error = PTR_ERR(sdp->sd_rindex);
		fs_err(sdp, "can't get resource index inode: %d\n", error);
		goto fail_statfs;
	}
	ip = sdp->sd_rindex->u.generic_ip;
	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
	sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;

	/* Read in the quota inode */
	sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
	if (IS_ERR(sdp->sd_quota_inode)) {
		error = PTR_ERR(sdp->sd_quota_inode);
		fs_err(sdp, "can't get quota file inode: %d\n", error);
		goto fail_rindex;
	}
	return 0;

fail_qinode:
	iput(sdp->sd_quota_inode);

fail_rindex:
	gfs2_clear_rgrpd(sdp);
	iput(sdp->sd_rindex);

fail_statfs:
	iput(sdp->sd_statfs_inode);

fail_inum:
	iput(sdp->sd_inum_inode);

fail_journal:
	init_journal(sdp, UNDO);

fail_master:
	iput(sdp->sd_master_dir);

fail:
	return error;
}

static int init_per_node(struct gfs2_sbd *sdp, int undo)
{
	struct inode *pn = NULL;
	char buf[30];
	int error = 0;
	struct gfs2_inode *ip;

	if (sdp->sd_args.ar_spectator)
		return 0;

	if (undo)
		goto fail_qc_gh;

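	/* Each node keeps its own copies of these special files in the
	   per_node directory, suffixed with its journal id (jd_jid). */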
	pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
	if (IS_ERR(pn)) {
		error = PTR_ERR(pn);
		fs_err(sdp, "can't find per_node directory: %d\n", error);
		return error;
	}

	sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
	sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
	if (IS_ERR(sdp->sd_ir_inode)) {
		error = PTR_ERR(sdp->sd_ir_inode);
		fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
		goto fail;
	}

	sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
	sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
	if (IS_ERR(sdp->sd_sc_inode)) {
		error = PTR_ERR(sdp->sd_sc_inode);
		fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
		goto fail_ir_i;
	}

	sprintf(buf, "unlinked_tag%u", sdp->sd_jdesc->jd_jid);
	sdp->sd_ut_inode = gfs2_lookup_simple(pn, buf);
	if (IS_ERR(sdp->sd_ut_inode)) {
		error = PTR_ERR(sdp->sd_ut_inode);
		fs_err(sdp, "can't find local \"ut\" file: %d\n", error);
		goto fail_sc_i;
	}

	sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
	sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
	if (IS_ERR(sdp->sd_qc_inode)) {
		error = PTR_ERR(sdp->sd_qc_inode);
		fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
		goto fail_ut_i;
	}

	iput(pn);
	pn = NULL;

	ip = sdp->sd_ir_inode->u.generic_ip;
	error = gfs2_glock_nq_init(ip->i_gl,
				   LM_ST_EXCLUSIVE, 0,
				   &sdp->sd_ir_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
		goto fail_qc_i;
	}

	ip = sdp->sd_sc_inode->u.generic_ip;
	error = gfs2_glock_nq_init(ip->i_gl,
				   LM_ST_EXCLUSIVE, 0,
				   &sdp->sd_sc_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
		goto fail_ir_gh;
	}

	ip = sdp->sd_ut_inode->u.generic_ip;
	error = gfs2_glock_nq_init(ip->i_gl,
				   LM_ST_EXCLUSIVE, 0,
				   &sdp->sd_ut_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"ut\" file: %d\n", error);
		goto fail_sc_gh;
	}

	ip = sdp->sd_qc_inode->u.generic_ip;
	error = gfs2_glock_nq_init(ip->i_gl,
				   LM_ST_EXCLUSIVE, 0,
				   &sdp->sd_qc_gh);
	if (error) {
		fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
		goto fail_ut_gh;
	}

	return 0;

 fail_qc_gh:
	gfs2_glock_dq_uninit(&sdp->sd_qc_gh);

 fail_ut_gh:
	gfs2_glock_dq_uninit(&sdp->sd_ut_gh);

 fail_sc_gh:
	gfs2_glock_dq_uninit(&sdp->sd_sc_gh);

 fail_ir_gh:
	gfs2_glock_dq_uninit(&sdp->sd_ir_gh);

 fail_qc_i:
	iput(sdp->sd_qc_inode);

 fail_ut_i:
	iput(sdp->sd_ut_inode);

 fail_sc_i:
	iput(sdp->sd_sc_inode);

 fail_ir_i:
	iput(sdp->sd_ir_inode);

 fail:
	if (pn)
		iput(pn);
	return error;
}

static int init_threads(struct gfs2_sbd *sdp, int undo)
{
	struct task_struct *p;
	int error = 0;

	if (undo)
		goto fail_inoded;

	sdp->sd_log_flush_time = jiffies;
	sdp->sd_jindex_refresh_time = jiffies;

	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
	if (IS_ERR(p)) {
		error = PTR_ERR(p);
		fs_err(sdp, "can't start logd thread: %d\n", error);
		return error;
	}
	sdp->sd_logd_process = p;

	sdp->sd_statfs_sync_time = jiffies;
	sdp->sd_quota_sync_time = jiffies;

	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
	if (IS_ERR(p)) {
		error = PTR_ERR(p);
		fs_err(sdp, "can't start quotad thread: %d\n", error);
		goto fail;
	}
	sdp->sd_quotad_process = p;

	p = kthread_run(gfs2_inoded, sdp, "gfs2_inoded");
	if (IS_ERR(p)) {
		error = PTR_ERR(p);
		fs_err(sdp, "can't start inoded thread: %d\n", error);
		goto fail_quotad;
	}
	sdp->sd_inoded_process = p;

	return 0;

 fail_inoded:
	kthread_stop(sdp->sd_inoded_process);

 fail_quotad:
	kthread_stop(sdp->sd_quotad_process);

 fail:
	kthread_stop(sdp->sd_logd_process);

	return error;
}

/**
 * fill_super - Read in superblock
 * @sb: The VFS superblock
 * @data: Mount options
 * @silent: Don't complain if it's not a GFS2 filesystem
 *
 * Returns: errno
 */

static int fill_super(struct super_block *sb, void *data, int silent)
{
	struct gfs2_sbd *sdp;
	struct gfs2_holder mount_gh;
	int error;

	sdp = init_sbd(sb);
	if (!sdp) {
		printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
		return -ENOMEM;
	}

	error = gfs2_mount_args(sdp, (char *)data, 0);
	if (error) {
		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
		goto fail;
	}

	init_vfs(sb, SDF_NOATIME);

	/* Set up the buffer cache and fill in some fake block size values
	   to allow us to read in the on-disk superblock. */
	sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
	sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
			       GFS2_BASIC_BLOCK_SHIFT;
	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;

	error = init_names(sdp, silent);
	if (error)
		goto fail;

	error = gfs2_sys_fs_add(sdp);
	if (error)
		goto fail;

	error = gfs2_lm_mount(sdp, silent);
	if (error)
		goto fail_sys;

	error = init_locking(sdp, &mount_gh, DO);
	if (error)
		goto fail_lm;

	error = init_sb(sdp, silent, DO);
	if (error)
		goto fail_locking;

	error = init_inodes(sdp, DO);
	if (error)
		goto fail_sb;

	error = init_per_node(sdp, DO);
	if (error)
		goto fail_inodes;

	error = gfs2_statfs_init(sdp);
	if (error) {
		fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
		goto fail_per_node;
	}

	error = init_threads(sdp, DO);
	if (error)
		goto fail_per_node;

	if (!(sb->s_flags & MS_RDONLY)) {
		error = gfs2_make_fs_rw(sdp);
		if (error) {
			fs_err(sdp, "can't make FS RW: %d\n", error);
			goto fail_threads;
		}
	}

	gfs2_glock_dq_uninit(&mount_gh);

	return 0;

 fail_threads:
	init_threads(sdp, UNDO);

 fail_per_node:
	init_per_node(sdp, UNDO);

 fail_inodes:
	init_inodes(sdp, UNDO);

 fail_sb:
	init_sb(sdp, 0, UNDO);

 fail_locking:
	init_locking(sdp, &mount_gh, UNDO);

 fail_lm:
	gfs2_gl_hash_clear(sdp, WAIT);
	gfs2_lm_unmount(sdp);
	while (invalidate_inodes(sb))
		yield();

 fail_sys:
	gfs2_sys_fs_del(sdp);

 fail:
	vfree(sdp);
	sb->s_fs_info = NULL;

	return error;
}

static struct super_block *gfs2_get_sb(struct file_system_type *fs_type,
				       int flags, const char *dev_name,
				       void *data)
{
	return get_sb_bdev(fs_type, flags, dev_name, data, fill_super);
}

static void gfs2_kill_sb(struct super_block *sb)
{
	kill_block_super(sb);
}

struct file_system_type gfs2_fs_type = {
	.name = "gfs2",
	.fs_flags = FS_REQUIRES_DEV,
	.get_sb = gfs2_get_sb,
	.kill_sb = gfs2_kill_sb,
	.owner = THIS_MODULE,
};

struct file_system_type gfs2meta_fs_type = {
	.name = "gfs2meta",
	.fs_flags = FS_REQUIRES_DEV,
	.get_sb = gfs2_get_sb,
	.kill_sb = gfs2_kill_sb,
	.owner = THIS_MODULE,
};

diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..622f5760d6b2
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,16 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __OPS_FSTYPE_DOT_H__
#define __OPS_FSTYPE_DOT_H__

extern struct file_system_type gfs2_fs_type;
extern struct file_system_type gfs2meta_fs_type;

#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..0c06f92368f2
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1194 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/namei.h>
#include <linux/utsname.h>
#include <linux/mm.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <asm/uaccess.h>

#include "gfs2.h"
#include "lm_interface.h"
#include "incore.h"
#include "acl.h"
#include "bmap.h"
#include "dir.h"
#include "eaops.h"
#include "eattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
#include "ops_dentry.h"
#include "ops_inode.h"
#include "page.h"
#include "quota.h"
#include "rgrp.h"
#include "trans.h"
#include "unlinked.h"
#include "util.h"

/**
 * gfs2_create - Create a file
 * @dir: The directory in which to create the file
 * @dentry: The dentry of the new file
 * @mode: The mode of the new file
 * @nd: passed from the VFS; used to check for O_EXCL opens
 *
 * Returns: errno
 */

static int gfs2_create(struct inode *dir, struct dentry *dentry,
		       int mode, struct nameidata *nd)
{
	struct gfs2_inode *dip = dir->u.generic_ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_holder ghs[2];
	struct inode *inode;
	int new = 1;

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

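	/* Retry loop: another node can create or remove the name between
	   our gfs2_createi() and gfs2_lookupi() calls, so keep trying
	   until one of them succeeds definitively. */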
	for (;;) {
		inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
		if (!IS_ERR(inode)) {
			gfs2_trans_end(sdp);
			if (dip->i_alloc.al_rgd)
				gfs2_inplace_release(dip);
			gfs2_quota_unlock(dip);
			gfs2_alloc_put(dip);
			gfs2_glock_dq_uninit_m(2, ghs);
			break;
		} else if (PTR_ERR(inode) != -EEXIST ||
			   (nd->intent.open.flags & O_EXCL)) {
			gfs2_holder_uninit(ghs);
			return PTR_ERR(inode);
		}

		inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
		if (inode) {
			if (!IS_ERR(inode)) {
				new = 0;
				gfs2_holder_uninit(ghs);
				break;
			} else {
				gfs2_holder_uninit(ghs);
				return PTR_ERR(inode);
			}
		}
	}

	d_instantiate(dentry, inode);
	if (new)
		mark_inode_dirty(inode);

	return 0;
}

/**
 * gfs2_lookup - Look up a filename in a directory and return its inode
 * @dir: The directory inode
 * @dentry: The dentry of the new inode
 * @nd: passed from Linux VFS, ignored by us
 *
 * Called by the VFS layer.  Lock dir and call gfs2_lookupi()
 *
 * Returns: errno
 */

static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct inode *inode = NULL;

	dentry->d_op = &gfs2_dops;

	inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
	if (inode && IS_ERR(inode))
		return ERR_PTR(PTR_ERR(inode));

	if (inode)
		return d_splice_alias(inode, dentry);
	d_add(dentry, inode);

	return NULL;
}

/**
 * gfs2_link - Link to a file
 * @old_dentry: The inode to link
 * @dir: Add link to this directory
 * @dentry: The name of the link
 *
 * Link the inode in "old_dentry" into the directory "dir" with the
 * name in "dentry".
 *
 * Returns: errno
 */

static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct gfs2_inode *dip = dir->u.generic_ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct inode *inode = old_dentry->d_inode;
	struct gfs2_inode *ip = inode->u.generic_ip;
	struct gfs2_holder ghs[2];
	int alloc_required;
	int error;

	if (S_ISDIR(ip->i_di.di_mode))
		return -EPERM;

	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);

	error = gfs2_glock_nq_m(2, ghs);
	if (error)
		goto out;

	error = gfs2_repermission(dir, MAY_WRITE | MAY_EXEC, NULL);
	if (error)
		goto out_gunlock;

	error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
	switch (error) {
	case -ENOENT:
		break;
	case 0:
		error = -EEXIST;
	default:
		goto out_gunlock;
	}

	error = -EINVAL;
	if (!dip->i_di.di_nlink)
		goto out_gunlock;
	error = -EFBIG;
	if (dip->i_di.di_entries == (uint32_t)-1)
		goto out_gunlock;
	error = -EPERM;
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
		goto out_gunlock;
	error = -EINVAL;
	if (!ip->i_di.di_nlink)
		goto out_gunlock;
	error = -EMLINK;
	if (ip->i_di.di_nlink == (uint32_t)-1)
		goto out_gunlock;

	alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
	if (error < 0)
		goto out_gunlock;
	error = 0;

	if (alloc_required) {
		struct gfs2_alloc *al = gfs2_alloc_get(dip);

		error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
		if (error)
			goto out_alloc;

		error = gfs2_quota_check(dip, dip->i_di.di_uid,
					 dip->i_di.di_gid);
		if (error)
			goto out_gunlock_q;

		al->al_requested = sdp->sd_max_dirres;

		error = gfs2_inplace_reserve(dip);
		if (error)
			goto out_gunlock_q;

		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
					 al->al_rgd->rd_ri.ri_length +
					 2 * RES_DINODE + RES_STATFS +
					 RES_QUOTA, 0);
		if (error)
			goto out_ipres;
	} else {
		error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
		if (error)
			goto out_ipres;
	}

	error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
			     IF2DT(ip->i_di.di_mode));
	if (error)
		goto out_end_trans;

	error = gfs2_change_nlink(ip, +1);

 out_end_trans:
	gfs2_trans_end(sdp);

 out_ipres:
	if (alloc_required)
		gfs2_inplace_release(dip);

 out_gunlock_q:
	if (alloc_required)
		gfs2_quota_unlock(dip);

 out_alloc:
	if (alloc_required)
		gfs2_alloc_put(dip);

 out_gunlock:
	gfs2_glock_dq_m(2, ghs);

 out:
	gfs2_holder_uninit(ghs);
	gfs2_holder_uninit(ghs + 1);

	if (!error) {
		atomic_inc(&inode->i_count);
		d_instantiate(dentry, inode);
		mark_inode_dirty(inode);
	}

	return error;
}

/**
 * gfs2_unlink - Unlink a file
 * @dir: The inode of the directory containing the file to unlink
 * @dentry: The file itself
 *
 * Unlink a file.  Call gfs2_unlinki()
 *
 * Returns: errno
 */

static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
{
	struct gfs2_inode *dip = dir->u.generic_ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
	struct gfs2_unlinked *ul;
	struct gfs2_holder ghs[2];
	int error;

	error = gfs2_unlinked_get(sdp, &ul);
	if (error)
		return error;

	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);

	error = gfs2_glock_nq_m(2, ghs);
	if (error)
		goto out;

	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
	if (error)
		goto out_gunlock;

	error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF +
				 RES_UNLINKED, 0);
	if (error)
		goto out_gunlock;

	error = gfs2_unlinki(dip, &dentry->d_name, ip, ul);

	gfs2_trans_end(sdp);

 out_gunlock:
	gfs2_glock_dq_m(2, ghs);

 out:
	gfs2_holder_uninit(ghs);
	gfs2_holder_uninit(ghs + 1);

	gfs2_unlinked_put(sdp, ul);

	return error;
}

/**
 * gfs2_symlink - Create a symlink
 * @dir: The directory to create the symlink in
 * @dentry: The dentry to put the symlink in
 * @symname: The thing which the link points to
 *
 * Returns: errno
 */

static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
			const char *symname)
{
	struct gfs2_inode *dip = dir->u.generic_ip, *ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_holder ghs[2];
	struct inode *inode;
	struct buffer_head *dibh;
	int size;
	int error;

	/* Must be stuffed with a null terminator for gfs2_follow_link() */
	size = strlen(symname);
	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
		return -ENAMETOOLONG;

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return PTR_ERR(inode);
	}

	ip = ghs[1].gh_gl->gl_object;

	ip->i_di.di_size = size;

	error = gfs2_meta_inode_buffer(ip, &dibh);

	if (!gfs2_assert_withdraw(sdp, !error)) {
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
		       size);
		brelse(dibh);
	}

	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);

	gfs2_glock_dq_uninit_m(2, ghs);

	d_instantiate(dentry, inode);
	mark_inode_dirty(inode);

	return 0;
}

/**
 * gfs2_mkdir - Make a directory
 * @dir: The parent directory of the new one
 * @dentry: The dentry of the new directory
 * @mode: The mode of the new directory
 *
 * Returns: errno
 */

static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct gfs2_inode *dip = dir->u.generic_ip, *ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_holder ghs[2];
	struct inode *inode;
	struct buffer_head *dibh;
	int error;

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return PTR_ERR(inode);
	}

	ip = ghs[1].gh_gl->gl_object;

	ip->i_di.di_nlink = 2;
	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
	ip->i_di.di_flags |= GFS2_DIF_JDATA;
	ip->i_di.di_payload_format = GFS2_FORMAT_DE;
	ip->i_di.di_entries = 2;

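	/* The new directory starts out "stuffed": its "." and ".."
	   entries are written directly into the dinode block itself. */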
	error = gfs2_meta_inode_buffer(ip, &dibh);

	if (!gfs2_assert_withdraw(sdp, !error)) {
		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di + 1);
		struct qstr str;

		gfs2_str2qstr(&str, ".");
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
		dent->de_inum = di->di_num; /* already GFS2 endian */
		dent->de_type = DT_DIR;
		di->di_entries = cpu_to_be32(1);

		gfs2_str2qstr(&str, "..");
		dent = (struct gfs2_dirent *)((char *)dent +
					      GFS2_DIRENT_SIZE(1));
		gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) -
				 sizeof(struct gfs2_dinode), dent);

		gfs2_inum_out(&dip->i_num, (char *)&dent->de_inum);
		dent->de_type = DT_DIR;

		gfs2_dinode_out(&ip->i_di, (char *)di);

		brelse(dibh);
	}

	error = gfs2_change_nlink(dip, +1);
	gfs2_assert_withdraw(sdp, !error); /* dip already pinned */

	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);

	gfs2_glock_dq_uninit_m(2, ghs);

	d_instantiate(dentry, inode);
	mark_inode_dirty(inode);

	return 0;
}

/**
 * gfs2_rmdir - Remove a directory
 * @dir: The parent directory of the directory to be removed
 * @dentry: The dentry of the directory to remove
 *
 * Remove a directory.  Call gfs2_rmdiri()
 *
 * Returns: errno
 */

static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct gfs2_inode *dip = dir->u.generic_ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
	struct gfs2_unlinked *ul;
	struct gfs2_holder ghs[2];
	int error;

	error = gfs2_unlinked_get(sdp, &ul);
	if (error)
		return error;

	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);

	error = gfs2_glock_nq_m(2, ghs);
	if (error)
		goto out;

	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
	if (error)
		goto out_gunlock;

	if (ip->i_di.di_entries < 2) {
		if (gfs2_consist_inode(ip))
			gfs2_dinode_print(&ip->i_di);
		error = -EIO;
		goto out_gunlock;
	}
	if (ip->i_di.di_entries > 2) {
		error = -ENOTEMPTY;
		goto out_gunlock;
	}

	error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF +
				 RES_UNLINKED, 0);
	if (error)
		goto out_gunlock;

	error = gfs2_rmdiri(dip, &dentry->d_name, ip, ul);

	gfs2_trans_end(sdp);

 out_gunlock:
	gfs2_glock_dq_m(2, ghs);

 out:
	gfs2_holder_uninit(ghs);
	gfs2_holder_uninit(ghs + 1);

	gfs2_unlinked_put(sdp, ul);

	return error;
}

/**
 * gfs2_mknod - Make a special file
 * @dir: The directory in which the special file will reside
 * @dentry: The dentry of the special file
 * @mode: The mode of the special file
 * @dev: The device specification of the special file
 *
 * Returns: errno
 */

static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
		      dev_t dev)
{
	struct gfs2_inode *dip = dir->u.generic_ip, *ip;
	struct gfs2_sbd *sdp = dip->i_sbd;
	struct gfs2_holder ghs[2];
	struct inode *inode;
	struct buffer_head *dibh;
	uint32_t major = 0, minor = 0;
	int error;

	switch (mode & S_IFMT) {
	case S_IFBLK:
	case S_IFCHR:
		major = MAJOR(dev);
		minor = MINOR(dev);
		break;
	case S_IFIFO:
	case S_IFSOCK:
		break;
	default:
		return -EOPNOTSUPP;
	}

	gfs2_holder_init(dip->i_gl, 0, 0, ghs);

	inode = gfs2_createi(ghs, &dentry->d_name, mode);
	if (IS_ERR(inode)) {
		gfs2_holder_uninit(ghs);
		return PTR_ERR(inode);
	}

	ip = ghs[1].gh_gl->gl_object;

	ip->i_di.di_major = major;
	ip->i_di.di_minor = minor;

	error = gfs2_meta_inode_buffer(ip, &dibh);

	if (!gfs2_assert_withdraw(sdp, !error)) {
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	gfs2_trans_end(sdp);
	if (dip->i_alloc.al_rgd)
		gfs2_inplace_release(dip);
	gfs2_quota_unlock(dip);
	gfs2_alloc_put(dip);

	gfs2_glock_dq_uninit_m(2, ghs);

	d_instantiate(dentry, inode);
	mark_inode_dirty(inode);

	return 0;
}

/**
 * gfs2_rename - Rename a file
 * @odir: Parent directory of old file name
 * @odentry: The old dentry of the file
 * @ndir: Parent directory of new file name
 * @ndentry: The new dentry of the file
 *
 * Returns: errno
 */

static int gfs2_rename(struct inode *odir, struct dentry *odentry,
		       struct inode *ndir, struct dentry *ndentry)
{
	struct gfs2_inode *odip = odir->u.generic_ip;
	struct gfs2_inode *ndip = ndir->u.generic_ip;
	struct gfs2_inode *ip = odentry->d_inode->u.generic_ip;
	struct gfs2_inode *nip = NULL;
	struct gfs2_sbd *sdp = odip->i_sbd;
	struct gfs2_unlinked *ul;
	struct gfs2_holder ghs[4], r_gh;
	unsigned int num_gh;
	int dir_rename = 0;
	int alloc_required;
	unsigned int x;
	int error;

	if (ndentry->d_inode) {
		nip = ndentry->d_inode->u.generic_ip;
		if (ip == nip)
			return 0;
	}

	error = gfs2_unlinked_get(sdp, &ul);
	if (error)
		return error;

	/* Make sure we aren't trying to move a directory into its own
	   subdirectory */

	if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
		dir_rename = 1;

		error = gfs2_glock_nq_init(sdp->sd_rename_gl,
					   LM_ST_EXCLUSIVE, 0,
					   &r_gh);
		if (error)
			goto out;

		error = gfs2_ok_to_move(ip, ndip);
		if (error)
			goto out_gunlock_r;
	}

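	/* Acquire all of the inode glocks in a single gfs2_glock_nq_m()
	   call so that two concurrent renames taking the same locks in
	   different orders cannot deadlock. */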
	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
	gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
	num_gh = 3;

	if (nip)
		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);

	error = gfs2_glock_nq_m(num_gh, ghs);
	if (error)
		goto out_uninit;

	/* Check out the old directory */

	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
	if (error)
		goto out_gunlock;

	/* Check out the new directory */

	if (nip) {
		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
		if (error)
			goto out_gunlock;

		if (S_ISDIR(nip->i_di.di_mode)) {
			if (nip->i_di.di_entries < 2) {
				if (gfs2_consist_inode(nip))
					gfs2_dinode_print(&nip->i_di);
				error = -EIO;
				goto out_gunlock;
			}
			if (nip->i_di.di_entries > 2) {
				error = -ENOTEMPTY;
				goto out_gunlock;
			}
		}
	} else {
		error = gfs2_repermission(ndir, MAY_WRITE | MAY_EXEC, NULL);
		if (error)
			goto out_gunlock;

		error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
		switch (error) {
		case -ENOENT:
			error = 0;
			break;
		case 0:
			error = -EEXIST;
		default:
			goto out_gunlock;
		}

		if (odip != ndip) {
			if (!ndip->i_di.di_nlink) {
				error = -EINVAL;
				goto out_gunlock;
			}
			if (ndip->i_di.di_entries == (uint32_t)-1) {
				error = -EFBIG;
				goto out_gunlock;
			}
			if (S_ISDIR(ip->i_di.di_mode) &&
			    ndip->i_di.di_nlink == (uint32_t)-1) {
				error = -EMLINK;
				goto out_gunlock;
			}
		}
	}

	/* Check out the dir to be renamed */

	if (dir_rename) {
		error = gfs2_repermission(odentry->d_inode, MAY_WRITE, NULL);
		if (error)
			goto out_gunlock;
	}

	alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
	if (error < 0)
		goto out_gunlock;
	error = 0;

	if (alloc_required) {
		struct gfs2_alloc *al = gfs2_alloc_get(ndip);

		error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
		if (error)
			goto out_alloc;

		error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
					 ndip->i_di.di_gid);
		if (error)
			goto out_gunlock_q;

		al->al_requested = sdp->sd_max_dirres;

		error = gfs2_inplace_reserve(ndip);
		if (error)
			goto out_gunlock_q;

		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
					 al->al_rgd->rd_ri.ri_length +
					 4 * RES_DINODE + 4 * RES_LEAF +
					 RES_UNLINKED + RES_STATFS +
					 RES_QUOTA, 0);
		if (error)
			goto out_ipreserv;
	} else {
		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
					 5 * RES_LEAF +
					 RES_UNLINKED, 0);
		if (error)
			goto out_gunlock;
	}

	/* Remove the target file, if it exists */

	if (nip) {
		if (S_ISDIR(nip->i_di.di_mode))
			error = gfs2_rmdiri(ndip, &ndentry->d_name, nip, ul);
		else
			error = gfs2_unlinki(ndip, &ndentry->d_name, nip, ul);
		if (error)
			goto out_end_trans;
	}

	if (dir_rename) {
		struct qstr name;
		gfs2_str2qstr(&name, "..");

		error = gfs2_change_nlink(ndip, +1);
		if (error)
			goto out_end_trans;
		error = gfs2_change_nlink(odip, -1);
		if (error)
			goto out_end_trans;

		error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
		if (error)
			goto out_end_trans;
	} else {
		struct buffer_head *dibh;
		error = gfs2_meta_inode_buffer(ip, &dibh);
		if (error)
			goto out_end_trans;
		ip->i_di.di_ctime = get_seconds();
		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
		gfs2_dinode_out(&ip->i_di, dibh->b_data);
		brelse(dibh);
	}

	error = gfs2_dir_del(odip, &odentry->d_name);
	if (error)
		goto out_end_trans;

	error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
			     IF2DT(ip->i_di.di_mode));
	if (error)
		goto out_end_trans;

 out_end_trans:
	gfs2_trans_end(sdp);

 out_ipreserv:
	if (alloc_required)
		gfs2_inplace_release(ndip);

 out_gunlock_q:
	if (alloc_required)
		gfs2_quota_unlock(ndip);

 out_alloc:
	if (alloc_required)
		gfs2_alloc_put(ndip);

 out_gunlock:
	gfs2_glock_dq_m(num_gh, ghs);

 out_uninit:
	for (x = 0; x < num_gh; x++)
		gfs2_holder_uninit(ghs + x);

 out_gunlock_r:
	if (dir_rename)
		gfs2_glock_dq_uninit(&r_gh);

 out:
	gfs2_unlinked_put(sdp, ul);

	return error;
}

/**
 * gfs2_readlink - Read the value of a symlink
 * @dentry: the symlink
 * @user_buf: the buffer to read the symlink data into
 * @user_size: the size of the buffer
 *
 * Returns: errno
 */

static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
			 int user_size)
{
	struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
	char array[GFS2_FAST_NAME_SIZE], *buf = array;
	unsigned int len = GFS2_FAST_NAME_SIZE;
	int error;

	error = gfs2_readlinki(ip, &buf, &len);
	if (error)
		return error;

	if (user_size > len - 1)
		user_size = len - 1;

	if (copy_to_user(user_buf, buf, user_size))
		error = -EFAULT;
	else
		error = user_size;

	if (buf != array)
		kfree(buf);

	return error;
}

/**
 * gfs2_follow_link - Follow a symbolic link
 * @dentry: The dentry of the link
 * @nd: Data that we pass to vfs_follow_link()
 *
 * This can handle symlinks of any size.  It is optimised for symlinks
 * under GFS2_FAST_NAME_SIZE.
 *
 * Returns: 0 on success or error code
 */

static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
	char array[GFS2_FAST_NAME_SIZE], *buf = array;
	unsigned int len = GFS2_FAST_NAME_SIZE;
	int error;

	error = gfs2_readlinki(ip, &buf, &len);
	if (!error) {
		error = vfs_follow_link(nd, buf);
		if (buf != array)
			kfree(buf);
	}

	return ERR_PTR(error);
}

/**
 * gfs2_permission - Check if the caller may perform an operation on an inode
 * @inode: The inode being checked
 * @mask: The permissions (MAY_*) being requested
 * @nd: passed from Linux VFS, ignored by us
 *
 * Returns: errno
 */

static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
{
	struct gfs2_inode *ip = inode->u.generic_ip;
	struct gfs2_holder i_gh;
	int error;

	if (ip->i_vn == ip->i_gl->gl_vn)
		return generic_permission(inode, mask, gfs2_check_acl);

	error = gfs2_glock_nq_init(ip->i_gl,
				   LM_ST_SHARED, LM_FLAG_ANY,
				   &i_gh);
	if (!error) {
		error = generic_permission(inode, mask, gfs2_check_acl_locked);
		gfs2_glock_dq_uninit(&i_gh);
	}

	return error;
}

static int setattr_size(struct inode *inode, struct iattr *attr)
{
	struct gfs2_inode *ip = inode->u.generic_ip;
	int error;

	if (attr->ia_size != ip->i_di.di_size) {
		error = vmtruncate(inode, attr->ia_size);
		if (error)
			return error;
	}

	return gfs2_truncatei(ip, attr->ia_size);
}

static int setattr_chown(struct inode *inode, struct iattr *attr)
{
	struct gfs2_inode *ip = inode->u.generic_ip;
	struct gfs2_sbd *sdp = ip->i_sbd;
	struct buffer_head *dibh;
	uint32_t ouid, ogid, nuid, ngid;
	int error;

	ouid = ip->i_di.di_uid;
	ogid = ip->i_di.di_gid;
	nuid = attr->ia_uid;
	ngid = attr->ia_gid;

	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
		ouid = nuid = NO_QUOTA_CHANGE;
	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
		ogid = ngid = NO_QUOTA_CHANGE;

	gfs2_alloc_get(ip);

	error = gfs2_quota_lock(ip, nuid, ngid);
	if (error)
		goto out_alloc;

	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
		error = gfs2_quota_check(ip, nuid, ngid);
		if (error)
			goto out_gunlock_q;
	}

	error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
	if (error)
		goto out_gunlock_q;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out_end_trans;

	error = inode_setattr(inode, attr);
	gfs2_assert_warn(sdp, !error);
	gfs2_inode_attr_out(ip);

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
	gfs2_dinode_out(&ip->i_di, dibh->b_data);
	brelse(dibh);

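	/* Move the inode's block count from the old owner's quota to
	   the new owner's */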
	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
		gfs2_quota_change(ip, -ip->i_di.di_blocks,
				  ouid, ogid);
		gfs2_quota_change(ip, ip->i_di.di_blocks,
				  nuid, ngid);
	}

 out_end_trans:
	gfs2_trans_end(sdp);

 out_gunlock_q:
	gfs2_quota_unlock(ip);

 out_alloc:
	gfs2_alloc_put(ip);

	return error;
}

/**
 * gfs2_setattr - Change attributes on an inode
 * @dentry: The dentry which is changing
 * @attr: The structure describing the change
 *
 * The VFS layer wants to change one or more of an inode's attributes.  Write
 * that change out to disk.
 *
 * Returns: errno
 */

static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct gfs2_inode *ip = inode->u.generic_ip;
	struct gfs2_holder i_gh;
	int error;

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
	if (error)
		return error;

	error = -EPERM;
	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
		goto out;

	error = inode_change_ok(inode, attr);
	if (error)
		goto out;

	if (attr->ia_valid & ATTR_SIZE)
		error = setattr_size(inode, attr);
	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
		error = setattr_chown(inode, attr);
	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
		error = gfs2_acl_chmod(ip, attr);
	else
		error = gfs2_setattr_simple(ip, attr);

 out:
	gfs2_glock_dq_uninit(&i_gh);

	if (!error)
		mark_inode_dirty(inode);

	return error;
}

/**
 * gfs2_getattr - Read out an inode's attributes
 * @mnt: The vfsmount the inode is being accessed through
 * @dentry: The dentry to stat
 * @stat: The inode's stats
 *
 * Returns: errno
 */

static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
			struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	struct gfs2_inode *ip = inode->u.generic_ip;
	struct gfs2_holder gh;
	int error;

	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
	if (!error) {
		generic_fillattr(inode, stat);
		gfs2_glock_dq_uninit(&gh);
	}

	return error;
}

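/*
 * The xattr entry points below translate a VFS attribute name such as
 * "user.foo" into a GFS2 extended-attribute request: gfs2_ea_name2type()
 * strips the namespace prefix and returns its type, or GFS2_EATYPE_UNUSED
 * for a namespace GFS2 does not handle.
 */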
static int gfs2_setxattr(struct dentry *dentry, const char *name,
			 const void *data, size_t size, int flags)
{
	struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
	struct gfs2_ea_request er;

	memset(&er, 0, sizeof(struct gfs2_ea_request));
	er.er_type = gfs2_ea_name2type(name, &er.er_name);
	if (er.er_type == GFS2_EATYPE_UNUSED)
		return -EOPNOTSUPP;
	er.er_data = (char *)data;
	er.er_name_len = strlen(er.er_name);
	er.er_data_len = size;
	er.er_flags = flags;

	gfs2_assert_warn(ip->i_sbd, !(er.er_flags & GFS2_ERF_MODE));

	return gfs2_ea_set(ip, &er);
}

static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
			     void *data, size_t size)
{
	struct gfs2_ea_request er;

	memset(&er, 0, sizeof(struct gfs2_ea_request));
	er.er_type = gfs2_ea_name2type(name, &er.er_name);
	if (er.er_type == GFS2_EATYPE_UNUSED)
		return -EOPNOTSUPP;
	er.er_data = data;
	er.er_name_len = strlen(er.er_name);
	er.er_data_len = size;

	return gfs2_ea_get(dentry->d_inode->u.generic_ip, &er);
}

static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
	struct gfs2_ea_request er;

	memset(&er, 0, sizeof(struct gfs2_ea_request));
	er.er_data = (size) ? buffer : NULL;
	er.er_data_len = size;

	return gfs2_ea_list(dentry->d_inode->u.generic_ip, &er);
}

static int gfs2_removexattr(struct dentry *dentry, const char *name)
{
	struct gfs2_ea_request er;

	memset(&er, 0, sizeof(struct gfs2_ea_request));
	er.er_type = gfs2_ea_name2type(name, &er.er_name);
	if (er.er_type == GFS2_EATYPE_UNUSED)
		return -EOPNOTSUPP;
	er.er_name_len = strlen(er.er_name);

	return gfs2_ea_remove(dentry->d_inode->u.generic_ip, &er);
}

struct inode_operations gfs2_file_iops = {
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};

struct inode_operations gfs2_dev_iops = {
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};

struct inode_operations gfs2_dir_iops = {
	.create = gfs2_create,
	.lookup = gfs2_lookup,
	.link = gfs2_link,
	.unlink = gfs2_unlink,
	.symlink = gfs2_symlink,
	.mkdir = gfs2_mkdir,
	.rmdir = gfs2_rmdir,
	.mknod = gfs2_mknod,
	.rename = gfs2_rename,
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};

struct inode_operations gfs2_symlink_iops = {
	.readlink = gfs2_readlink,
	.follow_link = gfs2_follow_link,
	.permission = gfs2_permission,
	.setattr = gfs2_setattr,
	.getattr = gfs2_getattr,
	.setxattr = gfs2_setxattr,
	.getxattr = gfs2_getxattr,
	.listxattr = gfs2_listxattr,
	.removexattr = gfs2_removexattr,
};

diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..930aaae91377
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,18 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#ifndef __OPS_INODE_DOT_H__
#define __OPS_INODE_DOT_H__

extern struct inode_operations gfs2_file_iops;
extern struct inode_operations gfs2_dir_iops;
extern struct inode_operations gfs2_symlink_iops;
extern struct inode_operations gfs2_dev_iops;

#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..1c17acc946f9
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,399 @@
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v.2.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/statfs.h>
#include <linux/vmalloc.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/gfs2_ondisk.h>

#include "gfs2.h"
#include "lm_interface.h"
#include "incore.h"
#include "glock.h"
#include "inode.h"
#include "lm.h"
#include "log.h"
#include "mount.h"
#include "ops_super.h"
#include "page.h"
#include "quota.h"
#include "recovery.h"
#include "rgrp.h"
#include "super.h"
#include "sys.h"
#include "util.h"

/**
 * gfs2_write_inode - Make sure the inode is stable on the disk
 * @inode: The inode
 * @sync: synchronous write flag
 *
 * Returns: errno
 */

static int gfs2_write_inode(struct inode *inode, int sync)
{
	struct gfs2_inode *ip = inode->u.generic_ip;

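	/* Don't flush the log from memory-reclaim context; re-entering
	   the filesystem here could deadlock */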
	if (current->flags & PF_MEMALLOC)
		return 0;
	if (ip && sync)
		gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);

	return 0;
}
59
60/**
61 * gfs2_put_super - Unmount the filesystem
62 * @sb: The VFS superblock
63 *
64 */
65
66static void gfs2_put_super(struct super_block *sb)
67{
68 struct gfs2_sbd *sdp = sb->s_fs_info;
69 int error;
70
71 if (!sdp)
72 return;
73
74 /* Unfreeze the filesystem, if we need to */
75
76 mutex_lock(&sdp->sd_freeze_lock);
77 if (sdp->sd_freeze_count)
78 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
79 mutex_unlock(&sdp->sd_freeze_lock);
80
81 kthread_stop(sdp->sd_inoded_process);
82 kthread_stop(sdp->sd_quotad_process);
83 kthread_stop(sdp->sd_logd_process);
84 kthread_stop(sdp->sd_recoverd_process);
85 while (sdp->sd_glockd_num--)
86 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
87 kthread_stop(sdp->sd_scand_process);
88
89 if (!(sb->s_flags & MS_RDONLY)) {
90 error = gfs2_make_fs_ro(sdp);
91 if (error)
92 gfs2_io_error(sdp);
93 }
94 /* At this point, we're through modifying the disk */
95
96 /* Release stuff */
97
98 iput(sdp->sd_master_dir);
99 iput(sdp->sd_jindex);
100 iput(sdp->sd_inum_inode);
101 iput(sdp->sd_statfs_inode);
102 iput(sdp->sd_rindex);
103 iput(sdp->sd_quota_inode);
104
105 gfs2_glock_put(sdp->sd_rename_gl);
106 gfs2_glock_put(sdp->sd_trans_gl);
107
108 if (!sdp->sd_args.ar_spectator) {
109 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
110 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
111 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
112 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
113 gfs2_glock_dq_uninit(&sdp->sd_ut_gh);
114 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
115 iput(sdp->sd_ir_inode);
116 iput(sdp->sd_sc_inode);
117 iput(sdp->sd_ut_inode);
118 iput(sdp->sd_qc_inode);
119 }
120
121 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
122 gfs2_clear_rgrpd(sdp);
123 gfs2_jindex_free(sdp);
124 /* Take apart glock structures and buffer lists */
125 gfs2_gl_hash_clear(sdp, WAIT);
126 /* Unmount the locking protocol */
127 gfs2_lm_unmount(sdp);
128
129 /* At this point, we're through participating in the lockspace */
130 gfs2_sys_fs_del(sdp);
131 vfree(sdp);
132 sb->s_fs_info = NULL;
133}
134
135/**
136 * gfs2_write_super - commit all incore transactions to disk
137 * @sb: the filesystem
138 *
139 * This function is called every time sync(2) is called.
140 * After this exits, all dirty buffers are synced.
141 */
142
143static void gfs2_write_super(struct super_block *sb)
144{
145 struct gfs2_sbd *sdp = sb->s_fs_info;
146 gfs2_log_flush(sdp, NULL);
147}
148
149/**
150 * gfs2_write_super_lockfs - prevent further writes to the filesystem
151 * @sb: the VFS structure for the filesystem
152 *
153 */
154
155static void gfs2_write_super_lockfs(struct super_block *sb)
156{
157 struct gfs2_sbd *sdp = sb->s_fs_info;
158 int error;
159
160 for (;;) {
161 error = gfs2_freeze_fs(sdp);
162 if (!error)
163 break;
164
165 switch (error) {
166 case -EBUSY:
167 fs_err(sdp, "waiting for recovery before freeze\n");
168 break;
169
170 default:
171 fs_err(sdp, "error freezing FS: %d\n", error);
172 break;
173 }
174
175 fs_err(sdp, "retrying...\n");
176 msleep(1000);
177 }
178}
179
180/**
181 * gfs2_unlockfs - reallow writes to the filesystem
182 * @sb: the VFS structure for the filesystem
183 *
184 */
185
186static void gfs2_unlockfs(struct super_block *sb)
187{
188 struct gfs2_sbd *sdp = sb->s_fs_info;
189 gfs2_unfreeze_fs(sdp);
190}
191
192/**
193 * gfs2_statfs - Gather and return stats about the filesystem
194 * @sb: The superblock
195 * @statfsbuf: The buffer
196 *
197 * Returns: 0 on success or error code
198 */
199
200static int gfs2_statfs(struct super_block *sb, struct kstatfs *buf)
201{
202 struct gfs2_sbd *sdp = sb->s_fs_info;
203 struct gfs2_statfs_change sc;
204 int error;
205
206 if (gfs2_tune_get(sdp, gt_statfs_slow))
207 error = gfs2_statfs_slow(sdp, &sc);
208 else
209 error = gfs2_statfs_i(sdp, &sc);
210
211 if (error)
212 return error;
213
214 memset(buf, 0, sizeof(struct kstatfs));
215
216 buf->f_type = GFS2_MAGIC;
217 buf->f_bsize = sdp->sd_sb.sb_bsize;
218 buf->f_blocks = sc.sc_total;
219 buf->f_bfree = sc.sc_free;
220 buf->f_bavail = sc.sc_free;
221 buf->f_files = sc.sc_dinodes + sc.sc_free;
222 buf->f_ffree = sc.sc_free;
223 buf->f_namelen = GFS2_FNAMESIZE;
224
225 return 0;
226}
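
As a hedged aside (not part of the patch itself): these fields are exactly what a plain statfs(2) caller sees, so a minimal userspace probe for GFS2 looks roughly like the sketch below. GFS2_MAGIC is 0x01161970 per gfs2_ondisk.h; the program and its output format are invented for illustration.

/* Userspace sketch: identify a GFS2 mount and echo its statfs fields. */
#include <stdio.h>
#include <sys/vfs.h>

#define GFS2_MAGIC 0x01161970	/* matches buf->f_type set above */

int main(int argc, char **argv)
{
	struct statfs st;

	if (argc < 2 || statfs(argv[1], &st) != 0) {
		perror("statfs");
		return 1;
	}
	if (st.f_type == GFS2_MAGIC)
		printf("%s: gfs2, bsize=%ld, %ld/%ld blocks free\n",
		       argv[1], (long)st.f_bsize,
		       (long)st.f_bfree, (long)st.f_blocks);
	return 0;
}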
227
228/**
229 * gfs2_remount_fs - called when the FS is remounted
230 * @sb: the filesystem
231 * @flags: the remount flags
232 * @data: extra data passed in (not used right now)
233 *
234 * Returns: errno
235 */
236
237static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
238{
239 struct gfs2_sbd *sdp = sb->s_fs_info;
240 int error;
241
242 error = gfs2_mount_args(sdp, data, 1);
243 if (error)
244 return error;
245
246 if (sdp->sd_args.ar_spectator)
247 *flags |= MS_RDONLY;
248 else {
249 if (*flags & MS_RDONLY) {
250 if (!(sb->s_flags & MS_RDONLY))
251 error = gfs2_make_fs_ro(sdp);
252 } else if (!(*flags & MS_RDONLY) &&
253 (sb->s_flags & MS_RDONLY)) {
254 error = gfs2_make_fs_rw(sdp);
255 }
256 }
257
258 if (*flags & (MS_NOATIME | MS_NODIRATIME))
259 set_bit(SDF_NOATIME, &sdp->sd_flags);
260 else
261 clear_bit(SDF_NOATIME, &sdp->sd_flags);
262
263 /* Don't let the VFS update atimes. GFS2 handles this itself. */
264 *flags |= MS_NOATIME | MS_NODIRATIME;
265
266 return error;
267}
268
269/**
270 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
271 * @inode: The VFS inode
272 *
273 */
274
275static void gfs2_clear_inode(struct inode *inode)
276{
277 struct gfs2_inode *ip = inode->u.generic_ip;
278
279 if (ip) {
280 spin_lock(&ip->i_spin);
281 ip->i_vnode = NULL;
282 inode->u.generic_ip = NULL;
283 spin_unlock(&ip->i_spin);
284
285 gfs2_glock_schedule_for_reclaim(ip->i_gl);
286 gfs2_inode_put(ip);
287 }
288}
289
290/**
291 * gfs2_show_options - Show mount options for /proc/mounts
292 * @s: seq_file structure
293 * @mnt: vfsmount
294 *
295 * Returns: 0 on success or error code
296 */
297
298static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
299{
300 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
301 struct gfs2_args *args = &sdp->sd_args;
302
303 if (args->ar_lockproto[0])
304 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
305 if (args->ar_locktable[0])
306 seq_printf(s, ",locktable=%s", args->ar_locktable);
307 if (args->ar_hostdata[0])
308 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
309 if (args->ar_spectator)
310 seq_printf(s, ",spectator");
311 if (args->ar_ignore_local_fs)
312 seq_printf(s, ",ignore_local_fs");
313 if (args->ar_localflocks)
314 seq_printf(s, ",localflocks");
315 if (args->ar_localcaching)
316 seq_printf(s, ",localcaching");
317 if (args->ar_debug)
318 seq_printf(s, ",debug");
319 if (args->ar_upgrade)
320 seq_printf(s, ",upgrade");
321 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
322 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
323 if (args->ar_posix_acl)
324 seq_printf(s, ",acl");
325 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
326 char *state;
327 switch (args->ar_quota) {
328 case GFS2_QUOTA_OFF:
329 state = "off";
330 break;
331 case GFS2_QUOTA_ACCOUNT:
332 state = "account";
333 break;
334 case GFS2_QUOTA_ON:
335 state = "on";
336 break;
337 default:
338 state = "unknown";
339 break;
340 }
341 seq_printf(s, ",quota=%s", state);
342 }
343 if (args->ar_suiddir)
344 seq_printf(s, ",suiddir");
345 if (args->ar_data != GFS2_DATA_DEFAULT) {
346 char *state;
347 switch (args->ar_data) {
348 case GFS2_DATA_WRITEBACK:
349 state = "writeback";
350 break;
351 case GFS2_DATA_ORDERED:
352 state = "ordered";
353 break;
354 default:
355 state = "unknown";
356 break;
357 }
358 seq_printf(s, ",data=%s", state);
359 }
360
361 return 0;
362}
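
For illustration only (an invented mount, not output produced by this patch), the options emitted by this hook appear after the generic flags in /proc/mounts, along the lines of:

	/dev/sda1 /mnt/gfs2 gfs2 rw,lockproto=lock_dlm,locktable=mycluster:gfs0,acl,quota=on 0 0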
363
364static struct inode *gfs2_alloc_inode(struct super_block *sb)
365{
366 struct gfs2_sbd *sdp = sb->s_fs_info;
367 struct gfs2_inode *ip;
368
369 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
370 if (ip) {
371 ip->i_flags = 0;
372 ip->i_gl = NULL;
373 ip->i_sbd = sdp;
374 ip->i_vnode = &ip->i_inode;
375 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
376 ip->i_last_pfault = jiffies;
377 }
378	return ip ? &ip->i_inode : NULL;
379}
380
381static void gfs2_destroy_inode(struct inode *inode)
382{
383 kmem_cache_free(gfs2_inode_cachep, inode);
384}
385
386struct super_operations gfs2_super_ops = {
387 .alloc_inode = gfs2_alloc_inode,
388 .destroy_inode = gfs2_destroy_inode,
389 .write_inode = gfs2_write_inode,
390 .put_super = gfs2_put_super,
391 .write_super = gfs2_write_super,
392 .write_super_lockfs = gfs2_write_super_lockfs,
393 .unlockfs = gfs2_unlockfs,
394 .statfs = gfs2_statfs,
395 .remount_fs = gfs2_remount_fs,
396 .clear_inode = gfs2_clear_inode,
397 .show_options = gfs2_show_options,
398};
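
To place this table in context, here is a hedged sketch of the hookup made at mount time; the real code is GFS2's fill_super in ops_fstype.c (elsewhere in this patch), and everything below is illustrative.

/* Hypothetical sketch: installing the operations table on the superblock. */
static int example_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_op = &gfs2_super_ops;	/* VFS now routes callbacks here */
	/* ... read the on-disk superblock, start daemons, etc. ... */
	return 0;
}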
399
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..a15ccc276113
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13extern struct super_operations gfs2_super_ops;
14
15#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..263c1fb7bbaf
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,195 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "ops_vm.h"
26#include "page.h"
27#include "quota.h"
28#include "rgrp.h"
29#include "trans.h"
30#include "util.h"
31
32static void pfault_be_greedy(struct gfs2_inode *ip)
33{
34 unsigned int time;
35
36 spin_lock(&ip->i_spin);
37 time = ip->i_greedy;
38 ip->i_last_pfault = jiffies;
39 spin_unlock(&ip->i_spin);
40
41 gfs2_inode_hold(ip);
42 if (gfs2_glock_be_greedy(ip->i_gl, time))
43 gfs2_inode_put(ip);
44}
45
46static struct page *gfs2_private_nopage(struct vm_area_struct *area,
47 unsigned long address, int *type)
48{
49 struct gfs2_inode *ip = area->vm_file->f_mapping->host->u.generic_ip;
50 struct gfs2_holder i_gh;
51 struct page *result;
52 int error;
53
54 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
55 if (error)
56 return NULL;
57
58 set_bit(GIF_PAGED, &ip->i_flags);
59
60 result = filemap_nopage(area, address, type);
61
62 if (result && result != NOPAGE_OOM)
63 pfault_be_greedy(ip);
64
65 gfs2_glock_dq_uninit(&i_gh);
66
67 return result;
68}
69
70static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
71{
72 struct gfs2_sbd *sdp = ip->i_sbd;
73 unsigned long index = page->index;
74 uint64_t lblock = index << (PAGE_CACHE_SHIFT -
75 sdp->sd_sb.sb_bsize_shift);
76 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
77 struct gfs2_alloc *al;
78 unsigned int data_blocks, ind_blocks;
79 unsigned int x;
80 int error;
81
82 al = gfs2_alloc_get(ip);
83
84 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
85 if (error)
86 goto out;
87
88 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
89 if (error)
90 goto out_gunlock_q;
91
92 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
93
94 al->al_requested = data_blocks + ind_blocks;
95
96 error = gfs2_inplace_reserve(ip);
97 if (error)
98 goto out_gunlock_q;
99
100 error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
101 ind_blocks + RES_DINODE +
102 RES_STATFS + RES_QUOTA, 0);
103 if (error)
104 goto out_ipres;
105
106 if (gfs2_is_stuffed(ip)) {
107 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page, NULL);
108 if (error)
109 goto out_trans;
110 }
111
112 for (x = 0; x < blocks; ) {
113 uint64_t dblock;
114 unsigned int extlen;
115 int new = 1;
116
117 error = gfs2_extent_map(ip->i_vnode, lblock, &new, &dblock, &extlen);
118 if (error)
119 goto out_trans;
120
121 lblock += extlen;
122 x += extlen;
123 }
124
125 gfs2_assert_warn(sdp, al->al_alloced);
126
127 out_trans:
128 gfs2_trans_end(sdp);
129
130 out_ipres:
131 gfs2_inplace_release(ip);
132
133 out_gunlock_q:
134 gfs2_quota_unlock(ip);
135
136 out:
137 gfs2_alloc_put(ip);
138
139 return error;
140}
141
142static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
143 unsigned long address, int *type)
144{
145 struct gfs2_inode *ip = area->vm_file->f_mapping->host->u.generic_ip;
146 struct gfs2_holder i_gh;
147 struct page *result = NULL;
148 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
149 area->vm_pgoff;
150 int alloc_required;
151 int error;
152
153 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
154 if (error)
155 return NULL;
156
157 set_bit(GIF_PAGED, &ip->i_flags);
158 set_bit(GIF_SW_PAGED, &ip->i_flags);
159
160 error = gfs2_write_alloc_required(ip,
161 (uint64_t)index << PAGE_CACHE_SHIFT,
162 PAGE_CACHE_SIZE, &alloc_required);
163 if (error)
164 goto out;
165
166 result = filemap_nopage(area, address, type);
167 if (!result || result == NOPAGE_OOM)
168 goto out;
169
170 if (alloc_required) {
171 error = alloc_page_backing(ip, result);
172 if (error) {
173 page_cache_release(result);
174 result = NULL;
175 goto out;
176 }
177 set_page_dirty(result);
178 }
179
180 pfault_be_greedy(ip);
181
182 out:
183 gfs2_glock_dq_uninit(&i_gh);
184
185 return result;
186}
187
188struct vm_operations_struct gfs2_vm_ops_private = {
189 .nopage = gfs2_private_nopage,
190};
191
192struct vm_operations_struct gfs2_vm_ops_sharewrite = {
193 .nopage = gfs2_sharewrite_nopage,
194};
195
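
The two tables differ only in their nopage handler: private and read-only shared mappings never allocate blocks on fault, while writable shared mappings must be able to back a faulted page with real blocks. A hedged sketch of the selection the mmap path is expected to make (cf. gfs2_mmap in ops_file.c, elsewhere in this patch):

/* Sketch: pick the vm_operations table for a new mapping. VM_MAYWRITE
 * rather than VM_WRITE, since mprotect() can turn on writing later. */
static void example_pick_vm_ops(struct vm_area_struct *vma)
{
	if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
	    (VM_MAYSHARE | VM_MAYWRITE))
		vma->vm_ops = &gfs2_vm_ops_sharewrite;
	else
		vma->vm_ops = &gfs2_vm_ops_private;
}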
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..077cffcd4085
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13extern struct vm_operations_struct gfs2_vm_ops_private;
14extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
15
16#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/page.c b/fs/gfs2/page.c
new file mode 100644
index 000000000000..cd93644c7d70
--- /dev/null
+++ b/fs/gfs2/page.c
@@ -0,0 +1,280 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/mm.h>
17#include <linux/gfs2_ondisk.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "inode.h"
24#include "page.h"
25#include "trans.h"
26#include "ops_address.h"
27#include "util.h"
28
29/**
30 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
31 * @gl: the glock
32 *
33 */
34
35void gfs2_pte_inval(struct gfs2_glock *gl)
36{
37 struct gfs2_inode *ip;
38 struct inode *inode;
39
40 ip = gl->gl_object;
41 if (!ip || !S_ISREG(ip->i_di.di_mode))
42 return;
43
44 if (!test_bit(GIF_PAGED, &ip->i_flags))
45 return;
46
47 inode = gfs2_ip2v_lookup(ip);
48 if (inode) {
49 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
50 iput(inode);
51
52 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
53 set_bit(GLF_DIRTY, &gl->gl_flags);
54 }
55
56 clear_bit(GIF_SW_PAGED, &ip->i_flags);
57}
58
59/**
60 * gfs2_page_inval - Invalidate all pages associated with a glock
61 * @gl: the glock
62 *
63 */
64
65void gfs2_page_inval(struct gfs2_glock *gl)
66{
67 struct gfs2_inode *ip;
68 struct inode *inode;
69
70 ip = gl->gl_object;
71 if (!ip || !S_ISREG(ip->i_di.di_mode))
72 return;
73
74 inode = gfs2_ip2v_lookup(ip);
75 if (inode) {
76 struct address_space *mapping = inode->i_mapping;
77
78 truncate_inode_pages(mapping, 0);
79 gfs2_assert_withdraw(ip->i_sbd, !mapping->nrpages);
80
81 iput(inode);
82 }
83
84 clear_bit(GIF_PAGED, &ip->i_flags);
85}
86
87/**
88 * gfs2_page_sync - Sync the data pages (not metadata) associated with a glock
89 * @gl: the glock
90 * @flags: DIO_START | DIO_WAIT
91 *
92 * Syncs data (not metadata) for a regular file.
93 * No-op for all other types.
94 */
95
96void gfs2_page_sync(struct gfs2_glock *gl, int flags)
97{
98 struct gfs2_inode *ip;
99 struct inode *inode;
100
101 ip = gl->gl_object;
102 if (!ip || !S_ISREG(ip->i_di.di_mode))
103 return;
104
105 inode = gfs2_ip2v_lookup(ip);
106 if (inode) {
107 struct address_space *mapping = inode->i_mapping;
108 int error = 0;
109
110 if (flags & DIO_START)
111 filemap_fdatawrite(mapping);
112 if (!error && (flags & DIO_WAIT))
113 error = filemap_fdatawait(mapping);
114
115 /* Put back any errors cleared by filemap_fdatawait()
116 so they can be caught by someone who can pass them
117 up to user space. */
118
119 if (error == -ENOSPC)
120 set_bit(AS_ENOSPC, &mapping->flags);
121 else if (error)
122 set_bit(AS_EIO, &mapping->flags);
123
124 iput(inode);
125 }
126}
127
128/**
129 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
130 * @ip: the inode
131 * @dibh: the dinode buffer
132 * @block: the block number that was allocated
133 * @private: any locked page held by the caller process
134 *
135 * Returns: errno
136 */
137
138int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
139 uint64_t block, void *private)
140{
141 struct gfs2_sbd *sdp = ip->i_sbd;
142 struct inode *inode = ip->i_vnode;
143 struct page *page = (struct page *)private;
144 struct buffer_head *bh;
145 int release = 0;
146
147 if (!page || page->index) {
148 page = grab_cache_page(inode->i_mapping, 0);
149 if (!page)
150 return -ENOMEM;
151 release = 1;
152 }
153
154 if (!PageUptodate(page)) {
155 void *kaddr = kmap(page);
156
157 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
158 ip->i_di.di_size);
159 memset(kaddr + ip->i_di.di_size, 0,
160 PAGE_CACHE_SIZE - ip->i_di.di_size);
161 kunmap(page);
162
163 SetPageUptodate(page);
164 }
165
166 if (!page_has_buffers(page))
167 create_empty_buffers(page, 1 << inode->i_blkbits,
168 (1 << BH_Uptodate));
169
170 bh = page_buffers(page);
171
172 if (!buffer_mapped(bh))
173 map_bh(bh, inode->i_sb, block);
174
175 set_buffer_uptodate(bh);
176 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip))
177 gfs2_trans_add_bh(ip->i_gl, bh, 0);
178 mark_buffer_dirty(bh);
179
180 if (release) {
181 unlock_page(page);
182 page_cache_release(page);
183 }
184
185 return 0;
186}
187
188/**
189 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
190 *
191 * This is partly borrowed from ext3.
192 */
193int gfs2_block_truncate_page(struct address_space *mapping)
194{
195 struct inode *inode = mapping->host;
196 struct gfs2_inode *ip = inode->u.generic_ip;
197 struct gfs2_sbd *sdp = ip->i_sbd;
198 loff_t from = inode->i_size;
199 unsigned long index = from >> PAGE_CACHE_SHIFT;
200 unsigned offset = from & (PAGE_CACHE_SIZE-1);
201 unsigned blocksize, iblock, length, pos;
202 struct buffer_head *bh;
203 struct page *page;
204 void *kaddr;
205 int err;
206
207 page = grab_cache_page(mapping, index);
208 if (!page)
209 return 0;
210
211 blocksize = inode->i_sb->s_blocksize;
212 length = blocksize - (offset & (blocksize - 1));
213 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
214
215 if (!page_has_buffers(page))
216 create_empty_buffers(page, blocksize, 0);
217
218 /* Find the buffer that contains "offset" */
219 bh = page_buffers(page);
220 pos = blocksize;
221 while (offset >= pos) {
222 bh = bh->b_this_page;
223 iblock++;
224 pos += blocksize;
225 }
226
227 err = 0;
228
229 if (!buffer_mapped(bh)) {
230 gfs2_get_block(inode, iblock, bh, 0);
231 /* unmapped? It's a hole - nothing to do */
232 if (!buffer_mapped(bh))
233 goto unlock;
234 }
235
236 /* Ok, it's mapped. Make sure it's up-to-date */
237 if (PageUptodate(page))
238 set_buffer_uptodate(bh);
239
240 if (!buffer_uptodate(bh)) {
241 err = -EIO;
242 ll_rw_block(READ, 1, &bh);
243 wait_on_buffer(bh);
244 /* Uhhuh. Read error. Complain and punt. */
245 if (!buffer_uptodate(bh))
246 goto unlock;
247 }
248
249 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
250 gfs2_trans_add_bh(ip->i_gl, bh, 0);
251
252 kaddr = kmap_atomic(page, KM_USER0);
253 memset(kaddr + offset, 0, length);
254 flush_dcache_page(page);
255 kunmap_atomic(kaddr, KM_USER0);
256
257unlock:
258 unlock_page(page);
259 page_cache_release(page);
260 return err;
261}
262
263void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
264 unsigned int from, unsigned int to)
265{
266 struct buffer_head *head = page_buffers(page);
267 unsigned int bsize = head->b_size;
268 struct buffer_head *bh;
269 unsigned int start, end;
270
271 for (bh = head, start = 0;
272 bh != head || !start;
273 bh = bh->b_this_page, start = end) {
274 end = start + bsize;
275 if (end <= from || start >= to)
276 continue;
277 gfs2_trans_add_bh(ip->i_gl, bh, 0);
278 }
279}
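
A worked example of the overlap test above, with invented geometry:

/* 1024-byte buffers on a 4096-byte page, from=1000, to=3000:
 * buffer spans are [0,1024), [1024,2048), [2048,3072), [3072,4096).
 * The first three overlap [from,to) and are added to the transaction;
 * the last fails the "end <= from || start >= to" test and is skipped. */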
280
diff --git a/fs/gfs2/page.h b/fs/gfs2/page.h
new file mode 100644
index 000000000000..2c853a90ac04
--- /dev/null
+++ b/fs/gfs2/page.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __PAGE_DOT_H__
11#define __PAGE_DOT_H__
12
13void gfs2_pte_inval(struct gfs2_glock *gl);
14void gfs2_page_inval(struct gfs2_glock *gl);
15void gfs2_page_sync(struct gfs2_glock *gl, int flags);
16
17int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
18 uint64_t block, void *private);
19int gfs2_block_truncate_page(struct address_space *mapping);
20void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
21 unsigned int from, unsigned int to);
22
23#endif /* __PAGE_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..f752b0184690
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1305 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally to each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota check
19 * program to be run on node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale", controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with an infinite number of nodes with infinite bandwidth) to twice the user's limit. (In
30 * practice, the maximum overrun you see should be much less.) A "quota_scale"
31 * number greater than one makes quota syncs more frequent and reduces the
32 * maximum overrun. Numbers less than one (but greater than zero) make quota
33 * syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
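
That scaling rule, restated as a sketch (it mirrors need_sync() later in this file; the inputs are the tunables described above, and the helper name is invented):

/* Sketch of the quota_scale decision, mirroring need_sync() below.
 * value: pending local change; qb_value, limit: cached from the LVB. */
static int example_should_sync(int64_t value, int64_t qb_value,
			       int64_t limit, unsigned int journals,
			       unsigned int num, unsigned int den)
{
	if (value <= 0 || qb_value >= limit)
		return 0;
	value *= journals * num;	/* pessimistic cluster-wide estimate */
	do_div(value, den);
	return value + qb_value >= limit;
}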
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/tty.h>
45#include <linux/sort.h>
46#include <linux/fs.h>
47#include <linux/gfs2_ondisk.h>
48
49#include "gfs2.h"
50#include "lm_interface.h"
51#include "incore.h"
52#include "bmap.h"
53#include "glock.h"
54#include "glops.h"
55#include "log.h"
56#include "lvb.h"
57#include "meta_io.h"
58#include "quota.h"
59#include "rgrp.h"
60#include "super.h"
61#include "trans.h"
62#include "inode.h"
63#include "ops_file.h"
64#include "ops_address.h"
65#include "util.h"
66
67#define QUOTA_USER 1
68#define QUOTA_GROUP 0
69
70static uint64_t qd2offset(struct gfs2_quota_data *qd)
71{
72 uint64_t offset;
73
74 offset = 2 * (uint64_t)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
75 offset *= sizeof(struct gfs2_quota);
76
77 return offset;
78}
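
User and group records for one ID therefore sit side by side in the quota file. A small worked sketch of the same arithmetic (helper name invented):

/* user ID 3 lands at slot 6, group ID 3 at slot 7, each slot being
 * sizeof(struct gfs2_quota) bytes wide. */
static uint64_t example_quota_offset(int user, uint32_t id)
{
	return (2 * (uint64_t)id + (user ? 0 : 1)) *
		sizeof(struct gfs2_quota);
}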
79
80static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id,
81 struct gfs2_quota_data **qdp)
82{
83 struct gfs2_quota_data *qd;
84 int error;
85
86 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
87 if (!qd)
88 return -ENOMEM;
89
90 qd->qd_count = 1;
91 qd->qd_id = id;
92 if (user)
93 set_bit(QDF_USER, &qd->qd_flags);
94 qd->qd_slot = -1;
95
96 error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user,
97 &gfs2_quota_glops, CREATE, &qd->qd_gl);
98 if (error)
99 goto fail;
100
101 error = gfs2_lvb_hold(qd->qd_gl);
102 gfs2_glock_put(qd->qd_gl);
103 if (error)
104 goto fail;
105
106 *qdp = qd;
107
108 return 0;
109
110 fail:
111 kfree(qd);
112 return error;
113}
114
115static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
116 struct gfs2_quota_data **qdp)
117{
118 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
119 int error, found;
120
121 *qdp = NULL;
122
123 for (;;) {
124 found = 0;
125 spin_lock(&sdp->sd_quota_spin);
126 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
127 if (qd->qd_id == id &&
128 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
129 qd->qd_count++;
130 found = 1;
131 break;
132 }
133 }
134
135 if (!found)
136 qd = NULL;
137
138 if (!qd && new_qd) {
139 qd = new_qd;
140 list_add(&qd->qd_list, &sdp->sd_quota_list);
141 atomic_inc(&sdp->sd_quota_count);
142 new_qd = NULL;
143 }
144
145 spin_unlock(&sdp->sd_quota_spin);
146
147 if (qd || !create) {
148 if (new_qd) {
149 gfs2_lvb_unhold(new_qd->qd_gl);
150 kfree(new_qd);
151 }
152 *qdp = qd;
153 return 0;
154 }
155
156 error = qd_alloc(sdp, user, id, &new_qd);
157 if (error)
158 return error;
159 }
160}
161
162static void qd_hold(struct gfs2_quota_data *qd)
163{
164 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
165
166 spin_lock(&sdp->sd_quota_spin);
167 gfs2_assert(sdp, qd->qd_count);
168 qd->qd_count++;
169 spin_unlock(&sdp->sd_quota_spin);
170}
171
172static void qd_put(struct gfs2_quota_data *qd)
173{
174 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
175 spin_lock(&sdp->sd_quota_spin);
176 gfs2_assert(sdp, qd->qd_count);
177 if (!--qd->qd_count)
178 qd->qd_last_touched = jiffies;
179 spin_unlock(&sdp->sd_quota_spin);
180}
181
182static int slot_get(struct gfs2_quota_data *qd)
183{
184 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
185 unsigned int c, o = 0, b;
186 unsigned char byte = 0;
187
188 spin_lock(&sdp->sd_quota_spin);
189
190 if (qd->qd_slot_count++) {
191 spin_unlock(&sdp->sd_quota_spin);
192 return 0;
193 }
194
195 for (c = 0; c < sdp->sd_quota_chunks; c++)
196 for (o = 0; o < PAGE_SIZE; o++) {
197 byte = sdp->sd_quota_bitmap[c][o];
198 if (byte != 0xFF)
199 goto found;
200 }
201
202 goto fail;
203
204 found:
205 for (b = 0; b < 8; b++)
206 if (!(byte & (1 << b)))
207 break;
208 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
209
210 if (qd->qd_slot >= sdp->sd_quota_slots)
211 goto fail;
212
213 sdp->sd_quota_bitmap[c][o] |= 1 << b;
214
215 spin_unlock(&sdp->sd_quota_spin);
216
217 return 0;
218
219 fail:
220 qd->qd_slot_count--;
221 spin_unlock(&sdp->sd_quota_spin);
222 return -ENOSPC;
223}
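
Each bitmap chunk covers 8 * PAGE_SIZE slots. The forward computation is the `c * (8 * PAGE_SIZE) + o * 8 + b` line above; a hedged sketch of the inverse decomposition (helper name invented, cf. gfs2_icbit_munge used by slot_put):

/* Decompose a slot number back into (chunk, byte, bit). */
static void example_slot_decompose(unsigned int slot, unsigned int *c,
				   unsigned int *o, unsigned int *b)
{
	*c = slot / (8 * PAGE_SIZE);
	*o = (slot % (8 * PAGE_SIZE)) / 8;
	*b = slot % 8;
}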
224
225static void slot_hold(struct gfs2_quota_data *qd)
226{
227 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
228
229 spin_lock(&sdp->sd_quota_spin);
230 gfs2_assert(sdp, qd->qd_slot_count);
231 qd->qd_slot_count++;
232 spin_unlock(&sdp->sd_quota_spin);
233}
234
235static void slot_put(struct gfs2_quota_data *qd)
236{
237 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
238
239 spin_lock(&sdp->sd_quota_spin);
240 gfs2_assert(sdp, qd->qd_slot_count);
241 if (!--qd->qd_slot_count) {
242 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
243 qd->qd_slot = -1;
244 }
245 spin_unlock(&sdp->sd_quota_spin);
246}
247
248static int bh_get(struct gfs2_quota_data *qd)
249{
250 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
251 struct gfs2_inode *ip = sdp->sd_qc_inode->u.generic_ip;
252 unsigned int block, offset;
253 uint64_t dblock;
254 int new = 0;
255 struct buffer_head *bh;
256 int error;
257 int boundary;
258
259 mutex_lock(&sdp->sd_quota_mutex);
260
261 if (qd->qd_bh_count++) {
262 mutex_unlock(&sdp->sd_quota_mutex);
263 return 0;
264 }
265
266 block = qd->qd_slot / sdp->sd_qc_per_block;
267	offset = qd->qd_slot % sdp->sd_qc_per_block;
268
269 error = gfs2_block_map(ip->i_vnode, block, &new, &dblock, &boundary);
270 if (error)
271 goto fail;
272 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
273 if (error)
274 goto fail;
275 error = -EIO;
276 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
277 goto fail_brelse;
278
279 qd->qd_bh = bh;
280 qd->qd_bh_qc = (struct gfs2_quota_change *)
281 (bh->b_data + sizeof(struct gfs2_meta_header) +
282 offset * sizeof(struct gfs2_quota_change));
283
284	mutex_unlock(&sdp->sd_quota_mutex);
285
286 return 0;
287
288 fail_brelse:
289 brelse(bh);
290
291 fail:
292 qd->qd_bh_count--;
293 mutex_unlock(&sdp->sd_quota_mutex);
294 return error;
295}
296
297static void bh_put(struct gfs2_quota_data *qd)
298{
299 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
300
301 mutex_lock(&sdp->sd_quota_mutex);
302 gfs2_assert(sdp, qd->qd_bh_count);
303 if (!--qd->qd_bh_count) {
304 brelse(qd->qd_bh);
305 qd->qd_bh = NULL;
306 qd->qd_bh_qc = NULL;
307 }
308 mutex_unlock(&sdp->sd_quota_mutex);
309}
310
311static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
312{
313 struct gfs2_quota_data *qd = NULL;
314 int error;
315 int found = 0;
316
317 *qdp = NULL;
318
319 if (sdp->sd_vfs->s_flags & MS_RDONLY)
320 return 0;
321
322 spin_lock(&sdp->sd_quota_spin);
323
324 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
325 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
326 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
327 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
328 continue;
329
330 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
331
332 set_bit(QDF_LOCKED, &qd->qd_flags);
333 gfs2_assert_warn(sdp, qd->qd_count);
334 qd->qd_count++;
335 qd->qd_change_sync = qd->qd_change;
336 gfs2_assert_warn(sdp, qd->qd_slot_count);
337 qd->qd_slot_count++;
338 found = 1;
339
340 break;
341 }
342
343 if (!found)
344 qd = NULL;
345
346 spin_unlock(&sdp->sd_quota_spin);
347
348 if (qd) {
349 gfs2_assert_warn(sdp, qd->qd_change_sync);
350 error = bh_get(qd);
351 if (error) {
352 clear_bit(QDF_LOCKED, &qd->qd_flags);
353 slot_put(qd);
354 qd_put(qd);
355 return error;
356 }
357 }
358
359 *qdp = qd;
360
361 return 0;
362}
363
364static int qd_trylock(struct gfs2_quota_data *qd)
365{
366 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
367
368 if (sdp->sd_vfs->s_flags & MS_RDONLY)
369 return 0;
370
371 spin_lock(&sdp->sd_quota_spin);
372
373 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
374 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
375 spin_unlock(&sdp->sd_quota_spin);
376 return 0;
377 }
378
379 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
380
381 set_bit(QDF_LOCKED, &qd->qd_flags);
382 gfs2_assert_warn(sdp, qd->qd_count);
383 qd->qd_count++;
384 qd->qd_change_sync = qd->qd_change;
385 gfs2_assert_warn(sdp, qd->qd_slot_count);
386 qd->qd_slot_count++;
387
388 spin_unlock(&sdp->sd_quota_spin);
389
390 gfs2_assert_warn(sdp, qd->qd_change_sync);
391 if (bh_get(qd)) {
392 clear_bit(QDF_LOCKED, &qd->qd_flags);
393 slot_put(qd);
394 qd_put(qd);
395 return 0;
396 }
397
398 return 1;
399}
400
401static void qd_unlock(struct gfs2_quota_data *qd)
402{
403 gfs2_assert_warn(qd->qd_gl->gl_sbd,
404 test_bit(QDF_LOCKED, &qd->qd_flags));
405 clear_bit(QDF_LOCKED, &qd->qd_flags);
406 bh_put(qd);
407 slot_put(qd);
408 qd_put(qd);
409}
410
411static int qdsb_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
412 struct gfs2_quota_data **qdp)
413{
414 int error;
415
416 error = qd_get(sdp, user, id, create, qdp);
417 if (error)
418 return error;
419
420 error = slot_get(*qdp);
421 if (error)
422 goto fail;
423
424 error = bh_get(*qdp);
425 if (error)
426 goto fail_slot;
427
428 return 0;
429
430 fail_slot:
431 slot_put(*qdp);
432
433 fail:
434 qd_put(*qdp);
435 return error;
436}
437
438static void qdsb_put(struct gfs2_quota_data *qd)
439{
440 bh_put(qd);
441 slot_put(qd);
442 qd_put(qd);
443}
444
445int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
446{
447 struct gfs2_sbd *sdp = ip->i_sbd;
448 struct gfs2_alloc *al = &ip->i_alloc;
449 struct gfs2_quota_data **qd = al->al_qd;
450 int error;
451
452 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
453 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
454 return -EIO;
455
456 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
457 return 0;
458
459 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
460 if (error)
461 goto out;
462 al->al_qd_num++;
463 qd++;
464
465 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
466 if (error)
467 goto out;
468 al->al_qd_num++;
469 qd++;
470
471 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
472 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
473 if (error)
474 goto out;
475 al->al_qd_num++;
476 qd++;
477 }
478
479 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
480 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
481 if (error)
482 goto out;
483 al->al_qd_num++;
484 qd++;
485 }
486
487 out:
488 if (error)
489 gfs2_quota_unhold(ip);
490
491 return error;
492}
493
494void gfs2_quota_unhold(struct gfs2_inode *ip)
495{
496 struct gfs2_sbd *sdp = ip->i_sbd;
497 struct gfs2_alloc *al = &ip->i_alloc;
498 unsigned int x;
499
500 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
501
502 for (x = 0; x < al->al_qd_num; x++) {
503 qdsb_put(al->al_qd[x]);
504 al->al_qd[x] = NULL;
505 }
506 al->al_qd_num = 0;
507}
508
509static int sort_qd(const void *a, const void *b)
510{
511 struct gfs2_quota_data *qd_a = *(struct gfs2_quota_data **)a;
512 struct gfs2_quota_data *qd_b = *(struct gfs2_quota_data **)b;
513 int ret = 0;
514
515 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
516 !test_bit(QDF_USER, &qd_b->qd_flags)) {
517 if (test_bit(QDF_USER, &qd_a->qd_flags))
518 ret = -1;
519 else
520 ret = 1;
521 } else {
522 if (qd_a->qd_id < qd_b->qd_id)
523 ret = -1;
524 else if (qd_a->qd_id > qd_b->qd_id)
525 ret = 1;
526 }
527
528 return ret;
529}
530
531static void do_qc(struct gfs2_quota_data *qd, int64_t change)
532{
533 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
534 struct gfs2_inode *ip = sdp->sd_qc_inode->u.generic_ip;
535 struct gfs2_quota_change *qc = qd->qd_bh_qc;
536 int64_t x;
537
538 mutex_lock(&sdp->sd_quota_mutex);
539 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
540
541 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
542 qc->qc_change = 0;
543 qc->qc_flags = 0;
544 if (test_bit(QDF_USER, &qd->qd_flags))
545 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
546 qc->qc_id = cpu_to_be32(qd->qd_id);
547 }
548
549 x = qc->qc_change;
550 x = be64_to_cpu(x) + change;
551 qc->qc_change = cpu_to_be64(x);
552
553 spin_lock(&sdp->sd_quota_spin);
554 qd->qd_change = x;
555 spin_unlock(&sdp->sd_quota_spin);
556
557 if (!x) {
558 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
559 clear_bit(QDF_CHANGE, &qd->qd_flags);
560 qc->qc_flags = 0;
561 qc->qc_id = 0;
562 slot_put(qd);
563 qd_put(qd);
564 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
565 qd_hold(qd);
566 slot_hold(qd);
567 }
568
569 mutex_unlock(&sdp->sd_quota_mutex);
570}
571
572/**
573 * gfs2_adjust_quota
574 *
575 * This function was mostly borrowed from gfs2_block_truncate_page, which was
576 * in turn mostly borrowed from ext3.
577 */
578static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
579 int64_t change, struct gfs2_quota_data *qd)
580{
581 struct inode *inode = ip->i_vnode;
582 struct address_space *mapping = inode->i_mapping;
583 unsigned long index = loc >> PAGE_CACHE_SHIFT;
584	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
585 unsigned blocksize, iblock, pos;
586 struct buffer_head *bh;
587 struct page *page;
588 void *kaddr;
589 __be64 *ptr;
590 u64 value;
591 int err = -EIO;
592
593 page = grab_cache_page(mapping, index);
594 if (!page)
595 return -ENOMEM;
596
597 blocksize = inode->i_sb->s_blocksize;
598 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
599
600 if (!page_has_buffers(page))
601 create_empty_buffers(page, blocksize, 0);
602
603 bh = page_buffers(page);
604 pos = blocksize;
605 while (offset >= pos) {
606 bh = bh->b_this_page;
607 iblock++;
608 pos += blocksize;
609 }
610
611 if (!buffer_mapped(bh)) {
612 gfs2_get_block(inode, iblock, bh, 1);
613 if (!buffer_mapped(bh))
614 goto unlock;
615 }
616
617 if (PageUptodate(page))
618 set_buffer_uptodate(bh);
619
620 if (!buffer_uptodate(bh)) {
621 ll_rw_block(READ, 1, &bh);
622 wait_on_buffer(bh);
623 if (!buffer_uptodate(bh))
624 goto unlock;
625 }
626
627 gfs2_trans_add_bh(ip->i_gl, bh, 0);
628
629 kaddr = kmap_atomic(page, KM_USER0);
630 ptr = (__be64 *)(kaddr + offset);
631 value = *ptr = cpu_to_be64(be64_to_cpu(*ptr) + change);
632 flush_dcache_page(page);
633 kunmap_atomic(kaddr, KM_USER0);
634 err = 0;
635 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
636#if 0
637 qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit);
638 qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn);
639#endif
640 qd->qd_qb.qb_value = cpu_to_be64(value);
641unlock:
642 unlock_page(page);
643 page_cache_release(page);
644 return err;
645}
646
647static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
648{
649 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
650 struct gfs2_inode *ip = sdp->sd_quota_inode->u.generic_ip;
651 unsigned int data_blocks, ind_blocks;
652 struct file_ra_state ra_state;
653 struct gfs2_holder *ghs, i_gh;
654 unsigned int qx, x;
655 struct gfs2_quota_data *qd;
656 loff_t offset;
657 unsigned int nalloc = 0;
658 struct gfs2_alloc *al = NULL;
659 int error;
660
661 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
662 &data_blocks, &ind_blocks);
663
664 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
665 if (!ghs)
666 return -ENOMEM;
667
668 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
669 for (qx = 0; qx < num_qd; qx++) {
670 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
671 LM_ST_EXCLUSIVE,
672 GL_NOCACHE, &ghs[qx]);
673 if (error)
674 goto out;
675 }
676
677 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
678 if (error)
679 goto out;
680
681 for (x = 0; x < num_qd; x++) {
682 int alloc_required;
683
684 offset = qd2offset(qda[x]);
685 error = gfs2_write_alloc_required(ip, offset,
686 sizeof(struct gfs2_quota),
687 &alloc_required);
688 if (error)
689 goto out_gunlock;
690 if (alloc_required)
691 nalloc++;
692 }
693
694 if (nalloc) {
695 al = gfs2_alloc_get(ip);
696
697 al->al_requested = nalloc * (data_blocks + ind_blocks);
698
699 error = gfs2_inplace_reserve(ip);
700 if (error)
701 goto out_alloc;
702
703 error = gfs2_trans_begin(sdp,
704 al->al_rgd->rd_ri.ri_length +
705 num_qd * data_blocks +
706 nalloc * ind_blocks +
707 RES_DINODE + num_qd +
708 RES_STATFS, 0);
709 if (error)
710 goto out_ipres;
711 } else {
712 error = gfs2_trans_begin(sdp,
713 num_qd * data_blocks +
714 RES_DINODE + num_qd, 0);
715 if (error)
716 goto out_gunlock;
717 }
718
719 file_ra_state_init(&ra_state, ip->i_vnode->i_mapping);
720 for (x = 0; x < num_qd; x++) {
721 qd = qda[x];
722 offset = qd2offset(qd);
723 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
724 (struct gfs2_quota_data *)
725 qd->qd_gl->gl_lvb);
726 if (error)
727 goto out_end_trans;
728
729 do_qc(qd, -qd->qd_change_sync);
730 }
731
732 error = 0;
733
734 out_end_trans:
735 gfs2_trans_end(sdp);
736
737 out_ipres:
738 if (nalloc)
739 gfs2_inplace_release(ip);
740
741 out_alloc:
742 if (nalloc)
743 gfs2_alloc_put(ip);
744
745 out_gunlock:
746 gfs2_glock_dq_uninit(&i_gh);
747
748 out:
749 while (qx--)
750 gfs2_glock_dq_uninit(&ghs[qx]);
751 kfree(ghs);
752 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
753
754 return error;
755}
756
757static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
758 struct gfs2_holder *q_gh)
759{
760 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
761 struct gfs2_inode *ip = sdp->sd_quota_inode->u.generic_ip;
762 struct gfs2_holder i_gh;
763 struct gfs2_quota q;
764 char buf[sizeof(struct gfs2_quota)];
765 struct file_ra_state ra_state;
766 int error;
767
768 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
769 restart:
770 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
771 if (error)
772 return error;
773
774 gfs2_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);
775
776 if (force_refresh || qd->qd_qb.qb_magic != GFS2_MAGIC) {
777 loff_t pos;
778 gfs2_glock_dq_uninit(q_gh);
779 error = gfs2_glock_nq_init(qd->qd_gl,
780 LM_ST_EXCLUSIVE, GL_NOCACHE,
781 q_gh);
782 if (error)
783 return error;
784
785 error = gfs2_glock_nq_init(ip->i_gl,
786 LM_ST_SHARED, 0,
787 &i_gh);
788 if (error)
789 goto fail;
790
791 memset(buf, 0, sizeof(struct gfs2_quota));
792 pos = qd2offset(qd);
793 error = gfs2_internal_read(ip,
794 &ra_state, buf,
795 &pos,
796 sizeof(struct gfs2_quota));
797 if (error < 0)
798 goto fail_gunlock;
799
800 gfs2_glock_dq_uninit(&i_gh);
801
802 gfs2_quota_in(&q, buf);
803
804 memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb));
805 qd->qd_qb.qb_magic = GFS2_MAGIC;
806 qd->qd_qb.qb_limit = q.qu_limit;
807 qd->qd_qb.qb_warn = q.qu_warn;
808 qd->qd_qb.qb_value = q.qu_value;
809
810 gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);
811
812 if (gfs2_glock_is_blocking(qd->qd_gl)) {
813 gfs2_glock_dq_uninit(q_gh);
814 force_refresh = 0;
815 goto restart;
816 }
817 }
818
819 return 0;
820
821 fail_gunlock:
822 gfs2_glock_dq_uninit(&i_gh);
823
824 fail:
825 gfs2_glock_dq_uninit(q_gh);
826
827 return error;
828}
829
830int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
831{
832 struct gfs2_sbd *sdp = ip->i_sbd;
833 struct gfs2_alloc *al = &ip->i_alloc;
834 unsigned int x;
835 int error = 0;
836
837 gfs2_quota_hold(ip, uid, gid);
838
839 if (capable(CAP_SYS_RESOURCE) ||
840 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
841 return 0;
842
843 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
844 sort_qd, NULL);
845
846 for (x = 0; x < al->al_qd_num; x++) {
847 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
848 if (error)
849 break;
850 }
851
852 if (!error)
853 set_bit(GIF_QD_LOCKED, &ip->i_flags);
854 else {
855 while (x--)
856 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
857 gfs2_quota_unhold(ip);
858 }
859
860 return error;
861}
862
863static int need_sync(struct gfs2_quota_data *qd)
864{
865 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
866 struct gfs2_tune *gt = &sdp->sd_tune;
867 int64_t value;
868 unsigned int num, den;
869 int do_sync = 1;
870
871 if (!qd->qd_qb.qb_limit)
872 return 0;
873
874 spin_lock(&sdp->sd_quota_spin);
875 value = qd->qd_change;
876 spin_unlock(&sdp->sd_quota_spin);
877
878 spin_lock(&gt->gt_spin);
879 num = gt->gt_quota_scale_num;
880 den = gt->gt_quota_scale_den;
881 spin_unlock(&gt->gt_spin);
882
883 if (value < 0)
884 do_sync = 0;
885 else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
886 do_sync = 0;
887 else {
888 value *= gfs2_jindex_size(sdp) * num;
889 do_div(value, den);
890 value += qd->qd_qb.qb_value;
891 if (value < (int64_t)qd->qd_qb.qb_limit)
892 do_sync = 0;
893 }
894
895 return do_sync;
896}
897
898void gfs2_quota_unlock(struct gfs2_inode *ip)
899{
900 struct gfs2_alloc *al = &ip->i_alloc;
901 struct gfs2_quota_data *qda[4];
902 unsigned int count = 0;
903 unsigned int x;
904
905 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
906 goto out;
907
908 for (x = 0; x < al->al_qd_num; x++) {
909 struct gfs2_quota_data *qd;
910 int sync;
911
912 qd = al->al_qd[x];
913 sync = need_sync(qd);
914
915 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
916
917 if (sync && qd_trylock(qd))
918 qda[count++] = qd;
919 }
920
921 if (count) {
922 do_sync(count, qda);
923 for (x = 0; x < count; x++)
924 qd_unlock(qda[x]);
925 }
926
927 out:
928 gfs2_quota_unhold(ip);
929}
930
931#define MAX_LINE 256
932
933static int print_message(struct gfs2_quota_data *qd, char *type)
934{
935 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
936 char *line;
937 int len;
938
939 line = kmalloc(MAX_LINE, GFP_KERNEL);
940 if (!line)
941 return -ENOMEM;
942
943 len = snprintf(line, MAX_LINE-1,
944 "GFS2: fsid=%s: quota %s for %s %u\r\n",
945 sdp->sd_fsname, type,
946 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
947 qd->qd_id);
948 line[MAX_LINE-1] = 0;
949
950 if (current->signal) { /* Is this test still required? */
951 tty_write_message(current->signal->tty, line);
952 }
953
954 kfree(line);
955
956 return 0;
957}
958
959int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
960{
961 struct gfs2_sbd *sdp = ip->i_sbd;
962 struct gfs2_alloc *al = &ip->i_alloc;
963 struct gfs2_quota_data *qd;
964 int64_t value;
965 unsigned int x;
966 int error = 0;
967
968 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
969 return 0;
970
971 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
972 return 0;
973
974 for (x = 0; x < al->al_qd_num; x++) {
975 qd = al->al_qd[x];
976
977 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
978 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
979 continue;
980
981 value = qd->qd_qb.qb_value;
982 spin_lock(&sdp->sd_quota_spin);
983 value += qd->qd_change;
984 spin_unlock(&sdp->sd_quota_spin);
985
986 if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
987 print_message(qd, "exceeded");
988 error = -EDQUOT;
989 break;
990 } else if (qd->qd_qb.qb_warn &&
991 (int64_t)qd->qd_qb.qb_warn < value &&
992 time_after_eq(jiffies, qd->qd_last_warn +
993 gfs2_tune_get(sdp,
994 gt_quota_warn_period) * HZ)) {
995 error = print_message(qd, "warning");
996 qd->qd_last_warn = jiffies;
997 }
998 }
999
1000 return error;
1001}
1002
1003void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
1004 uint32_t uid, uint32_t gid)
1005{
1006 struct gfs2_alloc *al = &ip->i_alloc;
1007 struct gfs2_quota_data *qd;
1008 unsigned int x;
1009 unsigned int found = 0;
1010
1011 if (gfs2_assert_warn(ip->i_sbd, change))
1012 return;
1013 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
1014 return;
1015
1016 for (x = 0; x < al->al_qd_num; x++) {
1017 qd = al->al_qd[x];
1018
1019 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1020 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1021 do_qc(qd, change);
1022 found++;
1023 }
1024 }
1025}
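
Taken together, allocating callers in this patch (alloc_page_backing in ops_vm.c is one visible example) bracket their block allocations with these calls. A condensed, hedged sketch; the helper name and error handling are invented:

/* Condensed caller sequence: lock -> check -> allocate -> change -> unlock. */
static int example_alloc_with_quota(struct gfs2_inode *ip, int64_t blocks)
{
	int error;

	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
	if (error)
		return error;
	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
	if (!error) {
		/* ... allocate the blocks inside a transaction ... */
		gfs2_quota_change(ip, blocks, ip->i_di.di_uid,
				  ip->i_di.di_gid);
	}
	gfs2_quota_unlock(ip);
	return error;
}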
1026
1027int gfs2_quota_sync(struct gfs2_sbd *sdp)
1028{
1029 struct gfs2_quota_data **qda;
1030 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
1031 unsigned int num_qd;
1032 unsigned int x;
1033 int error = 0;
1034
1035 sdp->sd_quota_sync_gen++;
1036
1037 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1038 if (!qda)
1039 return -ENOMEM;
1040
1041 do {
1042 num_qd = 0;
1043
1044 for (;;) {
1045 error = qd_fish(sdp, qda + num_qd);
1046 if (error || !qda[num_qd])
1047 break;
1048 if (++num_qd == max_qd)
1049 break;
1050 }
1051
1052 if (num_qd) {
1053 if (!error)
1054 error = do_sync(num_qd, qda);
1055 if (!error)
1056 for (x = 0; x < num_qd; x++)
1057 qda[x]->qd_sync_gen =
1058 sdp->sd_quota_sync_gen;
1059
1060 for (x = 0; x < num_qd; x++)
1061 qd_unlock(qda[x]);
1062 }
1063 } while (!error && num_qd == max_qd);
1064
1065 kfree(qda);
1066
1067 return error;
1068}
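
gfs2_quota_sync() is meant to be driven periodically. A hedged sketch of the expected caller, modeled loosely on the quotad kthread in daemon.c (elsewhere in this patch); the loop body and timing are illustrative:

/* Sketch of a periodic sync thread (cf. gfs2_quotad in daemon.c). */
static int example_quotad(void *data)
{
	struct gfs2_sbd *sdp = data;

	while (!kthread_should_stop()) {
		gfs2_quota_sync(sdp);	/* push local changes out */
		gfs2_quota_scan(sdp);	/* drop stale cached IDs */
		schedule_timeout_interruptible(
			gfs2_tune_get(sdp, gt_quota_quantum) * HZ);
	}
	return 0;
}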
1069
1070int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id)
1071{
1072 struct gfs2_quota_data *qd;
1073 struct gfs2_holder q_gh;
1074 int error;
1075
1076 error = qd_get(sdp, user, id, CREATE, &qd);
1077 if (error)
1078 return error;
1079
1080 error = do_glock(qd, FORCE, &q_gh);
1081 if (!error)
1082 gfs2_glock_dq_uninit(&q_gh);
1083
1084 qd_put(qd);
1085
1086 return error;
1087}
1088
1089#if 0
1090int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
1091 struct gfs2_quota *q)
1092{
1093 struct gfs2_quota_data *qd;
1094 struct gfs2_holder q_gh;
1095 int error;
1096
1097 if (((user) ? (id != current->fsuid) : (!in_group_p(id))) &&
1098 !capable(CAP_SYS_ADMIN))
1099 return -EACCES;
1100
1101 error = qd_get(sdp, user, id, CREATE, &qd);
1102 if (error)
1103 return error;
1104
1105 error = do_glock(qd, NO_FORCE, &q_gh);
1106 if (error)
1107 goto out;
1108
1109 memset(q, 0, sizeof(struct gfs2_quota));
1110 q->qu_limit = qd->qd_qb.qb_limit;
1111 q->qu_warn = qd->qd_qb.qb_warn;
1112 q->qu_value = qd->qd_qb.qb_value;
1113
1114 spin_lock(&sdp->sd_quota_spin);
1115 q->qu_value += qd->qd_change;
1116 spin_unlock(&sdp->sd_quota_spin);
1117
1118 gfs2_glock_dq_uninit(&q_gh);
1119
1120 out:
1121 qd_put(qd);
1122
1123 return error;
1124}
1125#endif /* 0 */
1126
1127int gfs2_quota_init(struct gfs2_sbd *sdp)
1128{
1129 struct gfs2_inode *ip = sdp->sd_qc_inode->u.generic_ip;
1130 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1131 unsigned int x, slot = 0;
1132 unsigned int found = 0;
1133 uint64_t dblock;
1134 uint32_t extlen = 0;
1135 int error;
1136
1137 if (!ip->i_di.di_size ||
1138 ip->i_di.di_size > (64 << 20) ||
1139 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1140 gfs2_consist_inode(ip);
1141 return -EIO;
1142 }
1143 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1144 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1145
1146 error = -ENOMEM;
1147
1148 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1149 sizeof(unsigned char *), GFP_KERNEL);
1150 if (!sdp->sd_quota_bitmap)
1151 return error;
1152
1153 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1154 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1155 if (!sdp->sd_quota_bitmap[x])
1156 goto fail;
1157 }
1158
1159 for (x = 0; x < blocks; x++) {
1160 struct buffer_head *bh;
1161 unsigned int y;
1162
1163 if (!extlen) {
1164 int new = 0;
1165 error = gfs2_extent_map(ip->i_vnode, x, &new, &dblock, &extlen);
1166 if (error)
1167 goto fail;
1168 }
1169 gfs2_meta_ra(ip->i_gl, dblock, extlen);
1170 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
1171 &bh);
1172 if (error)
1173 goto fail;
1174 error = -EIO;
1175 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1176 brelse(bh);
1177 goto fail;
1178 }
1179
1180 for (y = 0;
1181 y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1182 y++, slot++) {
1183 struct gfs2_quota_change qc;
1184 struct gfs2_quota_data *qd;
1185
1186 gfs2_quota_change_in(&qc, bh->b_data +
1187 sizeof(struct gfs2_meta_header) +
1188 y * sizeof(struct gfs2_quota_change));
1189 if (!qc.qc_change)
1190 continue;
1191
1192 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1193 qc.qc_id, &qd);
1194 if (error) {
1195 brelse(bh);
1196 goto fail;
1197 }
1198
1199 set_bit(QDF_CHANGE, &qd->qd_flags);
1200 qd->qd_change = qc.qc_change;
1201 qd->qd_slot = slot;
1202 qd->qd_slot_count = 1;
1203 qd->qd_last_touched = jiffies;
1204
1205 spin_lock(&sdp->sd_quota_spin);
1206 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1207 list_add(&qd->qd_list, &sdp->sd_quota_list);
1208 atomic_inc(&sdp->sd_quota_count);
1209 spin_unlock(&sdp->sd_quota_spin);
1210
1211 found++;
1212 }
1213
1214 brelse(bh);
1215 dblock++;
1216 extlen--;
1217 }
1218
1219 if (found)
1220 fs_info(sdp, "found %u quota changes\n", found);
1221
1222 return 0;
1223
1224 fail:
1225 gfs2_quota_cleanup(sdp);
1226 return error;
1227}
1228
1229void gfs2_quota_scan(struct gfs2_sbd *sdp)
1230{
1231 struct gfs2_quota_data *qd, *safe;
1232 LIST_HEAD(dead);
1233
1234 spin_lock(&sdp->sd_quota_spin);
1235 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1236 if (!qd->qd_count &&
1237 time_after_eq(jiffies, qd->qd_last_touched +
1238 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1239 list_move(&qd->qd_list, &dead);
1240 gfs2_assert_warn(sdp,
1241 atomic_read(&sdp->sd_quota_count) > 0);
1242 atomic_dec(&sdp->sd_quota_count);
1243 }
1244 }
1245 spin_unlock(&sdp->sd_quota_spin);
1246
1247 while (!list_empty(&dead)) {
1248 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1249 list_del(&qd->qd_list);
1250
1251 gfs2_assert_warn(sdp, !qd->qd_change);
1252 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1253 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1254
1255 gfs2_lvb_unhold(qd->qd_gl);
1256 kfree(qd);
1257 }
1258}
1259
1260void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1261{
1262 struct list_head *head = &sdp->sd_quota_list;
1263 struct gfs2_quota_data *qd;
1264 unsigned int x;
1265
1266 spin_lock(&sdp->sd_quota_spin);
1267 while (!list_empty(head)) {
1268 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1269
1270 if (qd->qd_count > 1 ||
1271 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1272 list_move(&qd->qd_list, head);
1273 spin_unlock(&sdp->sd_quota_spin);
1274 schedule();
1275 spin_lock(&sdp->sd_quota_spin);
1276 continue;
1277 }
1278
1279 list_del(&qd->qd_list);
1280 atomic_dec(&sdp->sd_quota_count);
1281 spin_unlock(&sdp->sd_quota_spin);
1282
1283 if (!qd->qd_count) {
1284 gfs2_assert_warn(sdp, !qd->qd_change);
1285 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1286 } else
1287 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1288 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1289
1290 gfs2_lvb_unhold(qd->qd_gl);
1291 kfree(qd);
1292
1293 spin_lock(&sdp->sd_quota_spin);
1294 }
1295 spin_unlock(&sdp->sd_quota_spin);
1296
1297 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1298
1299 if (sdp->sd_quota_bitmap) {
1300 for (x = 0; x < sdp->sd_quota_chunks; x++)
1301 kfree(sdp->sd_quota_bitmap[x]);
1302 kfree(sdp->sd_quota_bitmap);
1303 }
1304}
1305
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..af05492f9644
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13#define NO_QUOTA_CHANGE ((uint32_t)-1)
14
15int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
16void gfs2_quota_unhold(struct gfs2_inode *ip);
17
18int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
19void gfs2_quota_unlock(struct gfs2_inode *ip);
20
21int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
22void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
23 uint32_t uid, uint32_t gid);
24
25int gfs2_quota_sync(struct gfs2_sbd *sdp);
26int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id);
27
28int gfs2_quota_init(struct gfs2_sbd *sdp);
29void gfs2_quota_scan(struct gfs2_sbd *sdp);
30void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
31
32#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..c504ac1b831d
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,576 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "lm.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30#include "dir.h"
31
32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
33 struct buffer_head **bh)
34{
35 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
36 struct gfs2_glock *gl = ip->i_gl;
37 int new = 0;
38 uint64_t dblock;
39 uint32_t extlen;
40 int error;
41
42 error = gfs2_extent_map(ip->i_vnode, blk, &new, &dblock, &extlen);
43 if (error)
44 return error;
45 if (!dblock) {
46 gfs2_consist_inode(ip);
47 return -EIO;
48 }
49
50 gfs2_meta_ra(gl, dblock, extlen);
51 error = gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bh);
52
53 return error;
54}
55
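/**
 * gfs2_revoke_add - remember a revoke seen during journal replay
 * @sdp: the filesystem
 * @blkno: the block number the revoke applies to
 * @where: the position in the log where the revoke was found
 *
 * Returns: 1 if a new revoke was recorded, 0 if an existing entry was
 * updated, -ENOMEM on allocation failure
 */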
56int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
57{
58 struct list_head *head = &sdp->sd_revoke_list;
59 struct gfs2_revoke_replay *rr;
60 int found = 0;
61
62 list_for_each_entry(rr, head, rr_list) {
63 if (rr->rr_blkno == blkno) {
64 found = 1;
65 break;
66 }
67 }
68
69 if (found) {
70 rr->rr_where = where;
71 return 0;
72 }
73
74 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
75 if (!rr)
76 return -ENOMEM;
77
78 rr->rr_blkno = blkno;
79 rr->rr_where = where;
80 list_add(&rr->rr_list, head);
81
82 return 1;
83}
84
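/**
 * gfs2_revoke_check - test whether a journal entry is revoked
 * @sdp: the filesystem
 * @blkno: the block number of the entry being replayed
 * @where: the position in the log of the entry being replayed
 *
 * The entry is revoked if @where falls inside the circular interval
 * running from the replay tail to the revoke's own position
 * (rr_where), taking wrap-around of the log into account.
 *
 * Returns: 1 if the entry is revoked, 0 if not
 */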
85int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
86{
87 struct gfs2_revoke_replay *rr;
88 int wrap, a, b, revoke;
89 int found = 0;
90
91 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
92 if (rr->rr_blkno == blkno) {
93 found = 1;
94 break;
95 }
96 }
97
98 if (!found)
99 return 0;
100
101 wrap = (rr->rr_where < sdp->sd_replay_tail);
102 a = (sdp->sd_replay_tail < where);
103 b = (where < rr->rr_where);
104 revoke = (wrap) ? (a || b) : (a && b);
105
106 return revoke;
107}
108
109void gfs2_revoke_clean(struct gfs2_sbd *sdp)
110{
111 struct list_head *head = &sdp->sd_revoke_list;
112 struct gfs2_revoke_replay *rr;
113
114 while (!list_empty(head)) {
115 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
116 list_del(&rr->rr_list);
117 kfree(rr);
118 }
119}
120
121/**
122 * get_log_header - read the log header for a given segment
123 * @jd: the journal
124 * @blk: the block to look at
125 * @head: the log header to return
126 *
127 * Read the log header for a given segment in a given journal. Do a few
128 * sanity checks on it.
129 *
130 * Returns: 0 on success,
131 * 1 if the header was invalid or incomplete,
132 * errno on error
133 */
134
135static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
136 struct gfs2_log_header *head)
137{
138 struct buffer_head *bh;
139 struct gfs2_log_header lh;
140 uint32_t hash;
141 int error;
142
143 error = gfs2_replay_read_block(jd, blk, &bh);
144 if (error)
145 return error;
146
147 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
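	/* the on-disk hash was computed with the hash field zeroed, so
	   zero it here before re-hashing the raw header */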
148 lh.lh_hash = 0;
149 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
150 gfs2_log_header_in(&lh, bh->b_data);
151
152 brelse(bh);
153
154 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
155 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
156 lh.lh_blkno != blk ||
157 lh.lh_hash != hash)
158 return 1;
159
160 *head = lh;
161
162 return 0;
163}
164
165/**
166 * find_good_lh - find a good log header
167 * @jd: the journal
168 * @blk: the segment to start searching from
169 * @head: the log header to fill in
170 *
171 * Call get_log_header() to get a log header for a segment, but if the
172 * segment is bad, scan forward (wrapping at the end of the journal)
173 * until we find a good one.
174 *
175 * Returns: errno
176 */
177
178static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
179 struct gfs2_log_header *head)
180{
181 unsigned int orig_blk = *blk;
182 int error;
183
184 for (;;) {
185 error = get_log_header(jd, *blk, head);
186 if (error <= 0)
187 return error;
188
189 if (++*blk == jd->jd_blocks)
190 *blk = 0;
191
192 if (*blk == orig_blk) {
193 gfs2_consist_inode(jd->jd_inode->u.generic_ip);
194 return -EIO;
195 }
196 }
197}
198
199/**
200 * jhead_scan - make sure we've found the head of the log
201 * @jd: the journal
202 * @head: this is filled in with the log descriptor of the head
203 *
204 * At this point, @head should be either the head of the log or just
205 * before it. Scan forward until we find the head.
206 *
207 * Returns: errno
208 */
209
210static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
211{
212 unsigned int blk = head->lh_blkno;
213 struct gfs2_log_header lh;
214 int error;
215
216 for (;;) {
217 if (++blk == jd->jd_blocks)
218 blk = 0;
219
220 error = get_log_header(jd, blk, &lh);
221 if (error < 0)
222 return error;
223 if (error == 1)
224 continue;
225
226 if (lh.lh_sequence == head->lh_sequence) {
227 gfs2_consist_inode(jd->jd_inode->u.generic_ip);
228 return -EIO;
229 }
230 if (lh.lh_sequence < head->lh_sequence)
231 break;
232
233 *head = lh;
234 }
235
236 return 0;
237}
238
239/**
240 * gfs2_find_jhead - find the head of a log
241 * @jd: the journal
242 * @head: the log descriptor for the head of the log is returned here
243 *
244 * Do a binary search of a journal and find the valid log entry with the
245 * highest sequence number. (i.e. the log head)
246 *
247 * Returns: errno
248 */
249
250int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
251{
252 struct gfs2_log_header lh_1, lh_m;
253 uint32_t blk_1, blk_2, blk_m;
254 int error;
255
256 blk_1 = 0;
257 blk_2 = jd->jd_blocks - 1;
258
259 for (;;) {
260 blk_m = (blk_1 + blk_2) / 2;
261
262 error = find_good_lh(jd, &blk_1, &lh_1);
263 if (error)
264 return error;
265
266 error = find_good_lh(jd, &blk_m, &lh_m);
267 if (error)
268 return error;
269
270 if (blk_1 == blk_m || blk_m == blk_2)
271 break;
272
273 if (lh_1.lh_sequence <= lh_m.lh_sequence)
274 blk_1 = blk_m;
275 else
276 blk_2 = blk_m;
277 }
278
279 error = jhead_scan(jd, &lh_1);
280 if (error)
281 return error;
282
283 *head = lh_1;
284
285 return error;
286}
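
/*
 * A sketch of the bisection above (illustration only, assuming every
 * header is readable): the journal holds strictly increasing sequence
 * numbers that wrap around at most once, e.g. 5 6 7 8 2 3 4, and we
 * bisect for the maximum:
 *
 *	unsigned int lo = 0, hi = n - 1, mid;
 *
 *	for (;;) {
 *		mid = (lo + hi) / 2;
 *		if (mid == lo || mid == hi)
 *			break;
 *		if (seq[lo] <= seq[mid])
 *			lo = mid;	-- head is at or after mid
 *		else
 *			hi = mid;	-- the wrap point is before mid
 *	}
 *	while (lo + 1 < n && seq[lo + 1] > seq[lo])
 *		lo++;			-- jhead_scan() does this part
 *
 * find_good_lh() additionally skips unreadable headers at each probe.
 */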
287
288/**
289 * foreach_descriptor - go through the active part of the log
290 * @jd: the journal
291 * @start: the first log header in the active region
292 * @end: the last log header (don't process the contents of this entry)
293 * @pass: the recovery pass, passed through to lops_scan_elements()
294 * Call a given function once for every log descriptor in the active
295 * portion of the log.
296 *
297 * Returns: errno
298 */
299
300static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
301 unsigned int end, int pass)
302{
303 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
304 struct gfs2_sbd *sdp = ip->i_sbd;
305 struct buffer_head *bh;
306 struct gfs2_log_descriptor *ld;
307 int error = 0;
308 u32 length;
309 __be64 *ptr;
310 unsigned int offset = sizeof(struct gfs2_log_descriptor);
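	/* round the descriptor size up to the next __be64 boundary */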
311 offset += (sizeof(__be64)-1);
312 offset &= ~(sizeof(__be64)-1);
313
314 while (start != end) {
315 error = gfs2_replay_read_block(jd, start, &bh);
316 if (error)
317 return error;
318 if (gfs2_meta_check(sdp, bh)) {
319 brelse(bh);
320 return -EIO;
321 }
322 ld = (struct gfs2_log_descriptor *)bh->b_data;
323 length = be32_to_cpu(ld->ld_length);
324
325 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
326 struct gfs2_log_header lh;
327 error = get_log_header(jd, start, &lh);
328 if (!error) {
329 gfs2_replay_incr_blk(sdp, &start);
330 continue;
331 }
332 if (error == 1) {
333 gfs2_consist_inode(jd->jd_inode->u.generic_ip);
334 error = -EIO;
335 }
336 brelse(bh);
337 return error;
338 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
339 brelse(bh);
340 return -EIO;
341 }
342 ptr = (__be64 *)(bh->b_data + offset);
343 error = lops_scan_elements(jd, start, ld, ptr, pass);
344 if (error) {
345 brelse(bh);
346 return error;
347 }
348
349 while (length--)
350 gfs2_replay_incr_blk(sdp, &start);
351
352 brelse(bh);
353 }
354
355 return 0;
356}
357
358/**
359 * clean_journal - mark a dirty journal as being clean
360 * @jd: the journal
361 * @head: the log header of the current journal head; a new log header
362 * with the GFS2_LOG_HEAD_UNMOUNT flag is written in the block just
363 * after it to mark the journal clean
364 *
365 * Returns: errno
366 */
367
368static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
369{
370 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
371 struct gfs2_sbd *sdp = ip->i_sbd;
372 unsigned int lblock;
373 int new = 0;
374 uint64_t dblock;
375 struct gfs2_log_header *lh;
376 uint32_t hash;
377 struct buffer_head *bh;
378 int error;
379 int boundary;
380
381 lblock = head->lh_blkno;
382 gfs2_replay_incr_blk(sdp, &lblock);
383 error = gfs2_block_map(ip->i_vnode, lblock, &new, &dblock, &boundary);
384 if (error)
385 return error;
386 if (!dblock) {
387 gfs2_consist_inode(ip);
388 return -EIO;
389 }
390
391 bh = sb_getblk(sdp->sd_vfs, dblock);
392 lock_buffer(bh);
393 memset(bh->b_data, 0, bh->b_size);
394 set_buffer_uptodate(bh);
395 clear_buffer_dirty(bh);
396 unlock_buffer(bh);
397
398 lh = (struct gfs2_log_header *)bh->b_data;
399 memset(lh, 0, sizeof(struct gfs2_log_header));
400 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
401 lh->lh_header.mh_type = cpu_to_be16(GFS2_METATYPE_LH);
402 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
403 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
404 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
405 lh->lh_blkno = cpu_to_be32(lblock);
406 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
407 lh->lh_hash = cpu_to_be32(hash);
408
409 set_buffer_dirty(bh);
410 if (sync_dirty_buffer(bh))
411 gfs2_io_error_bh(sdp, bh);
412 brelse(bh);
413
414 return error;
415}
416
417/**
418 * gfs2_recover_journal - recover a given journal
419 * @jd: the struct gfs2_jdesc describing the journal
420 *
421 * Acquire the journal's lock, check to see if the journal is clean, and
422 * do recovery if necessary.
423 *
424 * Returns: errno
425 */
426
427int gfs2_recover_journal(struct gfs2_jdesc *jd)
428{
429 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
430 struct gfs2_sbd *sdp = ip->i_sbd;
431 struct gfs2_log_header head;
432 struct gfs2_holder j_gh, ji_gh, t_gh;
433 unsigned long t;
434 int ro = 0;
435 unsigned int pass;
436 int error;
437
438 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
439 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
440 jd->jd_jid);
441
442 /* Acquire the journal lock so we can do recovery */
443
444 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
445 LM_ST_EXCLUSIVE,
446 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
447 &j_gh);
448 switch (error) {
449 case 0:
450 break;
451
452 case GLR_TRYFAILED:
453 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
454 error = 0;
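			/* fall through: another node holds the journal lock,
			   so report that we gave up on recovering it */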
455
456 default:
457 goto fail;
458 }
459
460 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
461 LM_FLAG_NOEXP, &ji_gh);
462 if (error)
463 goto fail_gunlock_j;
464 } else {
465 fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
466 }
467
468 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
469
470 error = gfs2_jdesc_check(jd);
471 if (error)
472 goto fail_gunlock_ji;
473
474 error = gfs2_find_jhead(jd, &head);
475 if (error)
476 goto fail_gunlock_ji;
477
478 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
479 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
480 jd->jd_jid);
481
482 t = jiffies;
483
484 /* Acquire a shared hold on the transaction lock */
485
486 error = gfs2_glock_nq_init(sdp->sd_trans_gl,
487 LM_ST_SHARED,
488 LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
489 GL_NOCANCEL | GL_NOCACHE,
490 &t_gh);
491 if (error)
492 goto fail_gunlock_ji;
493
494 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
495 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
496 ro = 1;
497 } else {
498 if (sdp->sd_vfs->s_flags & MS_RDONLY)
499 ro = 1;
500 }
501
502 if (ro) {
503 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
504 jd->jd_jid);
505 error = -EROFS;
506 goto fail_gunlock_tr;
507 }
508
509 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
510
511 for (pass = 0; pass < 2; pass++) {
512 lops_before_scan(jd, &head, pass);
513 error = foreach_descriptor(jd, head.lh_tail,
514 head.lh_blkno, pass);
515 lops_after_scan(jd, error, pass);
516 if (error)
517 goto fail_gunlock_tr;
518 }
519
520 error = clean_journal(jd, &head);
521 if (error)
522 goto fail_gunlock_tr;
523
524 gfs2_glock_dq_uninit(&t_gh);
525 t = DIV_ROUND_UP(jiffies - t, HZ);
526 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
527 jd->jd_jid, t);
528 }
529
530 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
531 gfs2_glock_dq_uninit(&ji_gh);
532
533 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
534
535 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
536 gfs2_glock_dq_uninit(&j_gh);
537
538 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
539 return 0;
540
541fail_gunlock_tr:
542 gfs2_glock_dq_uninit(&t_gh);
543fail_gunlock_ji:
544 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
545 gfs2_glock_dq_uninit(&ji_gh);
546fail_gunlock_j:
547 gfs2_glock_dq_uninit(&j_gh);
548 }
549
550 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
551
552fail:
553 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
554 return error;
555}
556
557/**
558 * gfs2_check_journals - Recover any dirty journals
559 * @sdp: the filesystem
560 *
561 */
562
563void gfs2_check_journals(struct gfs2_sbd *sdp)
564{
565 struct gfs2_jdesc *jd;
566
567 for (;;) {
568 jd = gfs2_jdesc_find_dirty(sdp);
569 if (!jd)
570 break;
571
572 if (jd != sdp->sd_jdesc)
573 gfs2_recover_journal(jd);
574 }
575}
576
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..ac0f1d6ce456
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
14{
15 if (++*blk == sdp->sd_jdesc->jd_blocks)
16 *blk = 0;
17}
18
19int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
20 struct buffer_head **bh);
21
22int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
23int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
24void gfs2_revoke_clean(struct gfs2_sbd *sdp);
25
26int gfs2_find_jhead(struct gfs2_jdesc *jd,
27 struct gfs2_log_header *head);
28int gfs2_recover_journal(struct gfs2_jdesc *jd);
29void gfs2_check_journals(struct gfs2_sbd *sdp);
30
31#endif /* __RECOVERY_DOT_H__ */
32
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..691e6f3ce43b
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1524 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "glops.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "quota.h"
26#include "rgrp.h"
27#include "super.h"
28#include "trans.h"
29#include "ops_file.h"
30#include "util.h"
31
32#define BFITNOENT 0xFFFFFFFF
33
34/*
35 * These routines are used by the resource group routines (rgrp.c)
36 * to keep track of block allocation. Each block is represented by two
37 * bits. One bit indicates whether or not the block is used. (1=used,
38 * 0=free) The other bit indicates whether the block contains a
39 * dinode. (1=dinode, 0=not-dinode) So, each byte represents
40 * GFS2_NBBY (i.e. 4) blocks.
41 */
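
/*
 * Illustration only (not used by the code): with GFS2_NBBY == 4,
 * GFS2_BIT_SIZE == 2 and GFS2_BIT_MASK == 0x3 from gfs2_ondisk.h, the
 * two-bit state of block number blk within a bitmap buffer buf is
 *
 *	(buf[blk / GFS2_NBBY] >> ((blk % GFS2_NBBY) * GFS2_BIT_SIZE))
 *							& GFS2_BIT_MASK
 *
 * which is exactly the lookup gfs2_setbit() and gfs2_testbit() below
 * perform.
 */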
42
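/*
 * valid_change is indexed as valid_change[new_state * 4 + cur_state]:
 * e.g. FREE(0) -> USED(1) looks up index 1*4 + 0 == 1 (allowed), while
 * USED -> USED looks up index 1*4 + 1 == 0 and is reported as
 * filesystem corruption by gfs2_setbit() below.
 */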
43static const char valid_change[16] = {
44 /* current */
45 /* n */ 0, 1, 0, 1,
46 /* e */ 1, 0, 0, 0,
47 /* w */ 0, 0, 0, 0,
48 1, 0, 0, 0
49};
50
51/**
52 * gfs2_setbit - Set a bit in the bitmaps
53 * @buffer: the buffer that holds the bitmaps
54 * @buflen: the length (in bytes) of the buffer
55 * @block: the block to set
56 * @new_state: the new state of the block
57 *
58 */
59
60static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
61 unsigned int buflen, uint32_t block,
62 unsigned char new_state)
63{
64 unsigned char *byte, *end, cur_state;
65 unsigned int bit;
66
67 byte = buffer + (block / GFS2_NBBY);
68 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
69 end = buffer + buflen;
70
71 gfs2_assert(rgd->rd_sbd, byte < end);
72
73 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
74
75 if (valid_change[new_state * 4 + cur_state]) {
76 *byte ^= cur_state << bit;
77 *byte |= new_state << bit;
78 } else
79 gfs2_consist_rgrpd(rgd);
80}
81
82/**
83 * gfs2_testbit - test a bit in the bitmaps
84 * @buffer: the buffer that holds the bitmaps
85 * @buflen: the length (in bytes) of the buffer
86 * @block: the block to read
87 *
88 */
89
90static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
91 unsigned int buflen, uint32_t block)
92{
93 unsigned char *byte, *end, cur_state;
94 unsigned int bit;
95
96 byte = buffer + (block / GFS2_NBBY);
97 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
98 end = buffer + buflen;
99
100 gfs2_assert(rgd->rd_sbd, byte < end);
101
102 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
103
104 return cur_state;
105}
106
107/**
108 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
109 * a block in a given allocation state.
110 * @buffer: the buffer that holds the bitmaps
111 * @buflen: the length (in bytes) of the buffer
112 * @goal: start search at this block's bit-pair (within @buffer)
113 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
114 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
115 *
116 * Scope of @goal and returned block number is only within this bitmap buffer,
117 * not entire rgrp or filesystem. @buffer will be offset from the actual
118 * beginning of a bitmap block buffer, skipping any header structures.
119 *
120 * Returns: the block number (bitmap buffer scope) found, or BFITNOENT
121 */
122
123static uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
124 unsigned int buflen, uint32_t goal,
125 unsigned char old_state)
126{
127 unsigned char *byte, *end, alloc;
128 uint32_t blk = goal;
129 unsigned int bit;
130
131 byte = buffer + (goal / GFS2_NBBY);
132 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
133 end = buffer + buflen;
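	/*
	 * 0x55 masks bit 0 (the "allocated" bit) of each of the four
	 * 2-bit pairs in a byte; a byte equal to alloc under that mask
	 * cannot contain @old_state, so the loop skips it wholesale.
	 */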
134 alloc = (old_state & 1) ? 0 : 0x55;
135
136 while (byte < end) {
137 if ((*byte & 0x55) == alloc) {
138 blk += (8 - bit) >> 1;
139
140 bit = 0;
141 byte++;
142
143 continue;
144 }
145
146 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
147 return blk;
148
149 bit += GFS2_BIT_SIZE;
150 if (bit >= 8) {
151 bit = 0;
152 byte++;
153 }
154
155 blk++;
156 }
157
158 return BFITNOENT;
159}
160
161/**
162 * gfs2_bitcount - count the number of bits in a certain state
163 * @buffer: the buffer that holds the bitmaps
164 * @buflen: the length (in bytes) of the buffer
165 * @state: the state of the block we're looking for
166 *
167 * Returns: The number of bits
168 */
169
170static uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
171 unsigned int buflen, unsigned char state)
172{
173 unsigned char *byte = buffer;
174 unsigned char *end = buffer + buflen;
175 unsigned char state1 = state << 2;
176 unsigned char state2 = state << 4;
177 unsigned char state3 = state << 6;
178 uint32_t count = 0;
179
180 for (; byte < end; byte++) {
181 if (((*byte) & 0x03) == state)
182 count++;
183 if (((*byte) & 0x0C) == state1)
184 count++;
185 if (((*byte) & 0x30) == state2)
186 count++;
187 if (((*byte) & 0xC0) == state3)
188 count++;
189 }
190
191 return count;
192}
193
194/**
195 * gfs2_rgrp_verify - Verify that a resource group is consistent
196 * @rgd: the rgrp to verify
198 *
199 */
200
201void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
202{
203 struct gfs2_sbd *sdp = rgd->rd_sbd;
204 struct gfs2_bitmap *bi = NULL;
205 uint32_t length = rgd->rd_ri.ri_length;
206 uint32_t count[4], tmp;
207 int buf, x;
208
209 memset(count, 0, 4 * sizeof(uint32_t));
210
211 /* Count # blocks in each of 4 possible allocation states */
212 for (buf = 0; buf < length; buf++) {
213 bi = rgd->rd_bits + buf;
214 for (x = 0; x < 4; x++)
215 count[x] += gfs2_bitcount(rgd,
216 bi->bi_bh->b_data +
217 bi->bi_offset,
218 bi->bi_len, x);
219 }
220
221 if (count[0] != rgd->rd_rg.rg_free) {
222 if (gfs2_consist_rgrpd(rgd))
223 fs_err(sdp, "free data mismatch: %u != %u\n",
224 count[0], rgd->rd_rg.rg_free);
225 return;
226 }
227
228 tmp = rgd->rd_ri.ri_data -
229 rgd->rd_rg.rg_free -
230 rgd->rd_rg.rg_dinodes;
231 if (count[1] != tmp) {
232 if (gfs2_consist_rgrpd(rgd))
233 fs_err(sdp, "used data mismatch: %u != %u\n",
234 count[1], tmp);
235 return;
236 }
237
238 if (count[2]) {
239 if (gfs2_consist_rgrpd(rgd))
240 fs_err(sdp, "free metadata mismatch: %u != 0\n",
241 count[2]);
242 return;
243 }
244
245 if (count[3] != rgd->rd_rg.rg_dinodes) {
246 if (gfs2_consist_rgrpd(rgd))
247 fs_err(sdp, "used metadata mismatch: %u != %u\n",
248 count[3], rgd->rd_rg.rg_dinodes);
249 return;
250 }
251}
252
253static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block)
254{
255 uint64_t first = ri->ri_data0;
256 uint64_t last = first + ri->ri_data;
257 return !!(first <= block && block < last);
258}
259
260/**
261 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
262 * @sdp: The GFS2 superblock
263 * @blk: The data block number
264 *
265 * Returns: The resource group, or NULL if not found
266 */
267
268struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk)
269{
270 struct gfs2_rgrpd *rgd;
271
272 spin_lock(&sdp->sd_rindex_spin);
273
274 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
275 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
276 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
277 spin_unlock(&sdp->sd_rindex_spin);
278 return rgd;
279 }
280 }
281
282 spin_unlock(&sdp->sd_rindex_spin);
283
284 return NULL;
285}
286
287/**
288 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
289 * @sdp: The GFS2 superblock
290 *
291 * Returns: The first rgrp in the filesystem
292 */
293
294struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
295{
296 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
297 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
298}
299
300/**
301 * gfs2_rgrpd_get_next - get the next RG
302 * @rgd: A RG
303 *
304 * Returns: The next rgrp
305 */
306
307struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
308{
309 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
310 return NULL;
311 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
312}
313
314static void clear_rgrpdi(struct gfs2_sbd *sdp)
315{
316 struct list_head *head;
317 struct gfs2_rgrpd *rgd;
318 struct gfs2_glock *gl;
319
320 spin_lock(&sdp->sd_rindex_spin);
321 sdp->sd_rindex_forward = NULL;
322 head = &sdp->sd_rindex_recent_list;
323 while (!list_empty(head)) {
324 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
325 list_del(&rgd->rd_recent);
326 }
327 spin_unlock(&sdp->sd_rindex_spin);
328
329 head = &sdp->sd_rindex_list;
330 while (!list_empty(head)) {
331 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
332 gl = rgd->rd_gl;
333
334 list_del(&rgd->rd_list);
335 list_del(&rgd->rd_list_mru);
336
337 if (gl) {
338 gl->gl_object = NULL;
339 gfs2_glock_put(gl);
340 }
341
342 kfree(rgd->rd_bits);
343 kfree(rgd);
344 }
345}
346
347void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
348{
349 mutex_lock(&sdp->sd_rindex_mutex);
350 clear_rgrpdi(sdp);
351 mutex_unlock(&sdp->sd_rindex_mutex);
352}
353
354/**
355 * compute_bitstructs - Compute the bitmap sizes
356 * @rgd: The resource group descriptor
357 *
358 * Calculates bitmap descriptors, one for each block that contains bitmap data
359 *
360 * Returns: errno
361 */
362
363static int compute_bitstructs(struct gfs2_rgrpd *rgd)
364{
365 struct gfs2_sbd *sdp = rgd->rd_sbd;
366 struct gfs2_bitmap *bi;
367 uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
368 uint32_t bytes_left, bytes;
369 int x;
370
371 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_KERNEL);
372 if (!rgd->rd_bits)
373 return -ENOMEM;
374
375 bytes_left = rgd->rd_ri.ri_bitbytes;
376
377 for (x = 0; x < length; x++) {
378 bi = rgd->rd_bits + x;
379
380 /* small rgrp; bitmap stored completely in header block */
381 if (length == 1) {
382 bytes = bytes_left;
383 bi->bi_offset = sizeof(struct gfs2_rgrp);
384 bi->bi_start = 0;
385 bi->bi_len = bytes;
386 /* header block */
387 } else if (x == 0) {
388 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
389 bi->bi_offset = sizeof(struct gfs2_rgrp);
390 bi->bi_start = 0;
391 bi->bi_len = bytes;
392 /* last block */
393 } else if (x + 1 == length) {
394 bytes = bytes_left;
395 bi->bi_offset = sizeof(struct gfs2_meta_header);
396 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
397 bi->bi_len = bytes;
398 /* other blocks */
399 } else {
400 bytes = sdp->sd_sb.sb_bsize -
401 sizeof(struct gfs2_meta_header);
402 bi->bi_offset = sizeof(struct gfs2_meta_header);
403 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
404 bi->bi_len = bytes;
405 }
406
407 bytes_left -= bytes;
408 }
409
410 if (bytes_left) {
411 gfs2_consist_rgrpd(rgd);
412 return -EIO;
413 }
414 bi = rgd->rd_bits + (length - 1);
415 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
416 if (gfs2_consist_rgrpd(rgd)) {
417 gfs2_rindex_print(&rgd->rd_ri);
418 fs_err(sdp, "start=%u len=%u offset=%u\n",
419 bi->bi_start, bi->bi_len, bi->bi_offset);
420 }
421 return -EIO;
422 }
423
424 return 0;
425}
426
427/**
428 * gfs2_ri_update - Pull in a new resource index from the disk
429 * @ip: the rindex inode
430 *
431 * Returns: 0 on successful update, error code otherwise
432 */
433
434static int gfs2_ri_update(struct gfs2_inode *ip)
435{
436 struct gfs2_sbd *sdp = ip->i_sbd;
437 struct inode *inode = ip->i_vnode;
438 struct gfs2_rgrpd *rgd;
439 char buf[sizeof(struct gfs2_rindex)];
440 struct file_ra_state ra_state;
441 uint64_t junk = ip->i_di.di_size;
442 int error;
443
444 if (do_div(junk, sizeof(struct gfs2_rindex))) {
445 gfs2_consist_inode(ip);
446 return -EIO;
447 }
448
449 clear_rgrpdi(sdp);
450
451 file_ra_state_init(&ra_state, inode->i_mapping);
452 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
453 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
454 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
455 sizeof(struct gfs2_rindex));
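		/* gfs2_internal_read() returning 0 means we hit end of file */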
456 if (!error)
457 break;
458 if (error != sizeof(struct gfs2_rindex)) {
459 if (error > 0)
460 error = -EIO;
461 goto fail;
462 }
463
464 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_KERNEL);
465 error = -ENOMEM;
466 if (!rgd)
467 goto fail;
468
469 mutex_init(&rgd->rd_mutex);
470 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
471 rgd->rd_sbd = sdp;
472
473 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
474 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
475
476 gfs2_rindex_in(&rgd->rd_ri, buf);
477
478 error = compute_bitstructs(rgd);
479 if (error)
480 goto fail;
481
482 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
483 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
484 if (error)
485 goto fail;
486
487 rgd->rd_gl->gl_object = rgd;
488 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
489 }
490
491 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
492
493 return 0;
494
495 fail:
496 clear_rgrpdi(sdp);
497
498 return error;
499}
500
501/**
502 * gfs2_rindex_hold - Grab a lock on the rindex
503 * @sdp: The GFS2 superblock
504 * @ri_gh: the glock holder
505 *
506 * We grab a lock on the rindex inode to make sure that it doesn't
507 * change whilst we are performing an operation. We keep this lock
508 * for quite long periods of time compared to other locks. This
509 * doesn't matter, since it is shared and it is very, very rarely
510 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
511 *
512 * This makes sure that we're using the latest copy of the resource index
513 * special file, which might have been updated if someone expanded the
514 * filesystem (via gfs2_grow utility), which adds new resource groups.
515 *
516 * Returns: 0 on success, error code otherwise
517 */
518
519int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
520{
521 struct gfs2_inode *ip = sdp->sd_rindex->u.generic_ip;
522 struct gfs2_glock *gl = ip->i_gl;
523 int error;
524
525 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
526 if (error)
527 return error;
528
529 /* Read new copy from disk if we don't have the latest */
530 if (sdp->sd_rindex_vn != gl->gl_vn) {
531 mutex_lock(&sdp->sd_rindex_mutex);
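		/* re-check under the mutex: another holder may already have
		   refreshed the rindex while we waited */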
532 if (sdp->sd_rindex_vn != gl->gl_vn) {
533 error = gfs2_ri_update(ip);
534 if (error)
535 gfs2_glock_dq_uninit(ri_gh);
536 }
537 mutex_unlock(&sdp->sd_rindex_mutex);
538 }
539
540 return error;
541}
542
543/**
544 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
545 * @rgd: the struct gfs2_rgrpd describing the RG to read in
546 *
547 * Read in all of a Resource Group's header and bitmap blocks.
548 * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
549 *
550 * Returns: errno
551 */
552
553int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
554{
555 struct gfs2_sbd *sdp = rgd->rd_sbd;
556 struct gfs2_glock *gl = rgd->rd_gl;
557 unsigned int length = rgd->rd_ri.ri_length;
558 struct gfs2_bitmap *bi;
559 unsigned int x, y;
560 int error;
561
562 mutex_lock(&rgd->rd_mutex);
563
564 spin_lock(&sdp->sd_rindex_spin);
565 if (rgd->rd_bh_count) {
566 rgd->rd_bh_count++;
567 spin_unlock(&sdp->sd_rindex_spin);
568 mutex_unlock(&rgd->rd_mutex);
569 return 0;
570 }
571 spin_unlock(&sdp->sd_rindex_spin);
572
573 for (x = 0; x < length; x++) {
574 bi = rgd->rd_bits + x;
575 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, DIO_START,
576 &bi->bi_bh);
577 if (error)
578 goto fail;
579 }
580
581 for (y = length; y--;) {
582 bi = rgd->rd_bits + y;
583 error = gfs2_meta_reread(sdp, bi->bi_bh, DIO_WAIT);
584 if (error)
585 goto fail;
586 if (gfs2_metatype_check(sdp, bi->bi_bh,
587 (y) ? GFS2_METATYPE_RB :
588 GFS2_METATYPE_RG)) {
589 error = -EIO;
590 goto fail;
591 }
592 }
593
594 if (rgd->rd_rg_vn != gl->gl_vn) {
595 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
596 rgd->rd_rg_vn = gl->gl_vn;
597 }
598
599 spin_lock(&sdp->sd_rindex_spin);
600 rgd->rd_free_clone = rgd->rd_rg.rg_free;
601 rgd->rd_bh_count++;
602 spin_unlock(&sdp->sd_rindex_spin);
603
604 mutex_unlock(&rgd->rd_mutex);
605
606 return 0;
607
608 fail:
609 while (x--) {
610 bi = rgd->rd_bits + x;
611 brelse(bi->bi_bh);
612 bi->bi_bh = NULL;
613 gfs2_assert_warn(sdp, !bi->bi_clone);
614 }
615 mutex_unlock(&rgd->rd_mutex);
616
617 return error;
618}
619
620void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
621{
622 struct gfs2_sbd *sdp = rgd->rd_sbd;
623
624 spin_lock(&sdp->sd_rindex_spin);
625 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
626 rgd->rd_bh_count++;
627 spin_unlock(&sdp->sd_rindex_spin);
628}
629
630/**
631 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
632 * @rgd: the struct gfs2_rgrpd describing the RG to read in
633 *
634 */
635
636void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
637{
638 struct gfs2_sbd *sdp = rgd->rd_sbd;
639 int x, length = rgd->rd_ri.ri_length;
640
641 spin_lock(&sdp->sd_rindex_spin);
642 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
643 if (--rgd->rd_bh_count) {
644 spin_unlock(&sdp->sd_rindex_spin);
645 return;
646 }
647
648 for (x = 0; x < length; x++) {
649 struct gfs2_bitmap *bi = rgd->rd_bits + x;
650 kfree(bi->bi_clone);
651 bi->bi_clone = NULL;
652 brelse(bi->bi_bh);
653 bi->bi_bh = NULL;
654 }
655
656 spin_unlock(&sdp->sd_rindex_spin);
657}
658
659void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
660{
661 struct gfs2_sbd *sdp = rgd->rd_sbd;
662 unsigned int length = rgd->rd_ri.ri_length;
663 unsigned int x;
664
665 for (x = 0; x < length; x++) {
666 struct gfs2_bitmap *bi = rgd->rd_bits + x;
667 if (!bi->bi_clone)
668 continue;
669 memcpy(bi->bi_clone + bi->bi_offset,
670 bi->bi_bh->b_data + bi->bi_offset,
671 bi->bi_len);
672 }
673
674 spin_lock(&sdp->sd_rindex_spin);
675 rgd->rd_free_clone = rgd->rd_rg.rg_free;
676 spin_unlock(&sdp->sd_rindex_spin);
677}
678
679/**
680 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
681 * @ip: the incore GFS2 inode structure
682 *
683 * Returns: the struct gfs2_alloc
684 */
685
686struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
687{
688 struct gfs2_alloc *al = &ip->i_alloc;
689
690 /* FIXME: Should assert that the correct locks are held here... */
691 memset(al, 0, sizeof(*al));
692 return al;
693}
694
695/**
696 * gfs2_alloc_put - throw away the struct gfs2_alloc for an inode
697 * @ip: the inode
698 *
699 */
700
701void gfs2_alloc_put(struct gfs2_inode *ip)
702{
703 return;
704}
705
706/**
707 * try_rgrp_fit - See if a given reservation will fit in a given RG
708 * @rgd: the RG data
709 * @al: the struct gfs2_alloc structure describing the reservation
710 *
711 * If there's room for the requested blocks to be allocated from the RG
712 * (i.e. its free-clone count covers al->al_requested):
713 * Sets the @al_rgd field in @al.
715 *
716 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
717 */
718
719static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
720{
721 struct gfs2_sbd *sdp = rgd->rd_sbd;
722 int ret = 0;
723
724 spin_lock(&sdp->sd_rindex_spin);
725 if (rgd->rd_free_clone >= al->al_requested) {
726 al->al_rgd = rgd;
727 ret = 1;
728 }
729 spin_unlock(&sdp->sd_rindex_spin);
730
731 return ret;
732}
733
734/**
735 * recent_rgrp_first - get first RG from "recent" list
736 * @sdp: The GFS2 superblock
737 * @rglast: address of the rgrp used last
738 *
739 * Returns: The first rgrp in the recent list
740 */
741
742static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
743 uint64_t rglast)
744{
745 struct gfs2_rgrpd *rgd = NULL;
746
747 spin_lock(&sdp->sd_rindex_spin);
748
749 if (list_empty(&sdp->sd_rindex_recent_list))
750 goto out;
751
752 if (!rglast)
753 goto first;
754
755 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
756 if (rgd->rd_ri.ri_addr == rglast)
757 goto out;
758 }
759
760 first:
761 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
762 rd_recent);
763
764 out:
765 spin_unlock(&sdp->sd_rindex_spin);
766
767 return rgd;
768}
769
770/**
771 * recent_rgrp_next - get next RG from "recent" list
772 * @cur_rgd: current rgrp
773 * @remove: if set, remove @cur_rgd from the "recent" list
774 *
775 * Returns: The next rgrp in the recent list
776 */
777
778static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
779 int remove)
780{
781 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
782 struct list_head *head;
783 struct gfs2_rgrpd *rgd;
784
785 spin_lock(&sdp->sd_rindex_spin);
786
787 head = &sdp->sd_rindex_recent_list;
788
789 list_for_each_entry(rgd, head, rd_recent) {
790 if (rgd == cur_rgd) {
791 if (cur_rgd->rd_recent.next != head)
792 rgd = list_entry(cur_rgd->rd_recent.next,
793 struct gfs2_rgrpd, rd_recent);
794 else
795 rgd = NULL;
796
797 if (remove)
798 list_del(&cur_rgd->rd_recent);
799
800 goto out;
801 }
802 }
803
804 rgd = NULL;
805 if (!list_empty(head))
806 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
807
808 out:
809 spin_unlock(&sdp->sd_rindex_spin);
810
811 return rgd;
812}
813
814/**
815 * recent_rgrp_add - add an RG to tail of "recent" list
816 * @new_rgd: The rgrp to add
817 *
818 */
819
820static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
821{
822 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
823 struct gfs2_rgrpd *rgd;
824 unsigned int count = 0;
825 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
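	/* cap the "recent" list at roughly one journal's share of the
	   rgrps, so different nodes tend to favour different groups */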
826
827 spin_lock(&sdp->sd_rindex_spin);
828
829 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
830 if (rgd == new_rgd)
831 goto out;
832
833 if (++count >= max)
834 goto out;
835 }
836 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
837
838 out:
839 spin_unlock(&sdp->sd_rindex_spin);
840}
841
842/**
843 * forward_rgrp_get - get an rgrp to try next from full list
844 * @sdp: The GFS2 superblock
845 *
846 * Returns: The rgrp to try next
847 */
848
849static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
850{
851 struct gfs2_rgrpd *rgd;
852 unsigned int journals = gfs2_jindex_size(sdp);
853 unsigned int rg = 0, x;
854
855 spin_lock(&sdp->sd_rindex_spin);
856
857 rgd = sdp->sd_rindex_forward;
858 if (!rgd) {
859 if (sdp->sd_rgrps >= journals)
860 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
861
862 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp);
863 x < rg;
864 x++, rgd = gfs2_rgrpd_get_next(rgd))
865 /* Do Nothing */;
866
867 sdp->sd_rindex_forward = rgd;
868 }
869
870 spin_unlock(&sdp->sd_rindex_spin);
871
872 return rgd;
873}
874
875/**
876 * forward_rgrp_set - set the forward rgrp pointer
877 * @sdp: the filesystem
878 * @rgd: The new forward rgrp
879 *
880 */
881
882static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
883{
884 spin_lock(&sdp->sd_rindex_spin);
885 sdp->sd_rindex_forward = rgd;
886 spin_unlock(&sdp->sd_rindex_spin);
887}
888
889/**
890 * get_local_rgrp - Choose and lock a rgrp for allocation
891 * @ip: the inode to reserve space for; the chosen and locked rgrp is
892 * returned in ip->i_alloc.al_rgd
893 *
894 * Try to acquire an rgrp in a way that avoids contending with others.
895 *
896 * Returns: errno
897 */
898
899static int get_local_rgrp(struct gfs2_inode *ip)
900{
901 struct gfs2_sbd *sdp = ip->i_sbd;
902 struct gfs2_rgrpd *rgd, *begin = NULL;
903 struct gfs2_alloc *al = &ip->i_alloc;
904 int flags = LM_FLAG_TRY;
905 int skipped = 0;
906 int loops = 0;
907 int error;
908
909 /* Try recently successful rgrps */
910
911 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
912
913 while (rgd) {
914 error = gfs2_glock_nq_init(rgd->rd_gl,
915 LM_ST_EXCLUSIVE, LM_FLAG_TRY,
916 &al->al_rgd_gh);
917 switch (error) {
918 case 0:
919 if (try_rgrp_fit(rgd, al))
920 goto out;
921 gfs2_glock_dq_uninit(&al->al_rgd_gh);
922 rgd = recent_rgrp_next(rgd, 1);
923 break;
924
925 case GLR_TRYFAILED:
926 rgd = recent_rgrp_next(rgd, 0);
927 break;
928
929 default:
930 return error;
931 }
932 }
933
934 /* Go through full list of rgrps */
935
936 begin = rgd = forward_rgrp_get(sdp);
937
938 for (;;) {
939 error = gfs2_glock_nq_init(rgd->rd_gl,
940 LM_ST_EXCLUSIVE, flags,
941 &al->al_rgd_gh);
942 switch (error) {
943 case 0:
944 if (try_rgrp_fit(rgd, al))
945 goto out;
946 gfs2_glock_dq_uninit(&al->al_rgd_gh);
947 break;
948
949 case GLR_TRYFAILED:
950 skipped++;
951 break;
952
953 default:
954 return error;
955 }
956
957 rgd = gfs2_rgrpd_get_next(rgd);
958 if (!rgd)
959 rgd = gfs2_rgrpd_get_first(sdp);
960
961 if (rgd == begin) {
962 if (++loops >= 2 || !skipped)
963 return -ENOSPC;
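			/* second pass: clear LM_FLAG_TRY and block on the
			   rgrps we skipped the first time around */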
964 flags = 0;
965 }
966 }
967
968 out:
969 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
970
971 if (begin) {
972 recent_rgrp_add(rgd);
973 rgd = gfs2_rgrpd_get_next(rgd);
974 if (!rgd)
975 rgd = gfs2_rgrpd_get_first(sdp);
976 forward_rgrp_set(sdp, rgd);
977 }
978
979 return 0;
980}
981
982/**
983 * gfs2_inplace_reserve_i - Reserve space in the filesystem
984 * @ip: the inode to reserve space for
985 *
986 * Returns: errno
987 */
988
989int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
990{
991 struct gfs2_sbd *sdp = ip->i_sbd;
992 struct gfs2_alloc *al = &ip->i_alloc;
993 int error;
994
995 if (gfs2_assert_warn(sdp, al->al_requested))
996 return -EINVAL;
997
998 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
999 if (error)
1000 return error;
1001
1002 error = get_local_rgrp(ip);
1003 if (error) {
1004 gfs2_glock_dq_uninit(&al->al_ri_gh);
1005 return error;
1006 }
1007
1008 al->al_file = file;
1009 al->al_line = line;
1010
1011 return 0;
1012}
1013
1014/**
1015 * gfs2_inplace_release - release an inplace reservation
1016 * @ip: the inode the reservation was taken out on
1017 *
1018 * Release a reservation made by gfs2_inplace_reserve().
1019 */
1020
1021void gfs2_inplace_release(struct gfs2_inode *ip)
1022{
1023 struct gfs2_sbd *sdp = ip->i_sbd;
1024 struct gfs2_alloc *al = &ip->i_alloc;
1025
1026 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
1027 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
1028 "al_file = %s, al_line = %u\n",
1029 al->al_alloced, al->al_requested, al->al_file,
1030 al->al_line);
1031
1032 al->al_rgd = NULL;
1033 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1034 gfs2_glock_dq_uninit(&al->al_ri_gh);
1035}
1036
1037/**
1038 * gfs2_get_block_type - Determine the allocation state of a block in a RG
1039 * @rgd: the resource group holding the block
1040 * @block: the block number
1041 *
1042 * Returns: The block type (GFS2_BLKST_*)
1043 */
1044
1045unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block)
1046{
1047 struct gfs2_bitmap *bi = NULL;
1048 uint32_t length, rgrp_block, buf_block;
1049 unsigned int buf;
1050 unsigned char type;
1051
1052 length = rgd->rd_ri.ri_length;
1053 rgrp_block = block - rgd->rd_ri.ri_data0;
1054
1055 for (buf = 0; buf < length; buf++) {
1056 bi = rgd->rd_bits + buf;
1057 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1058 break;
1059 }
1060
1061 gfs2_assert(rgd->rd_sbd, buf < length);
1062 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
1063
1064 type = gfs2_testbit(rgd,
1065 bi->bi_bh->b_data + bi->bi_offset,
1066 bi->bi_len, buf_block);
1067
1068 return type;
1069}
1070
1071/**
1072 * rgblk_search - find a block in @old_state, change allocation
1073 * state to @new_state
1074 * @rgd: the resource group descriptor
1075 * @goal: the goal block within the RG (start here to search for avail block)
1076 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1077 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1078 *
1079 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1080 * Add the found bitmap buffer to the transaction.
1081 * Set the found bits to @new_state to change block's allocation state.
1082 *
1083 * This function never fails, because we wouldn't call it unless we
1084 * know (from reservation results, etc.) that a block is available.
1085 *
1086 * Scope of @goal and returned block is just within rgrp, not the whole
1087 * filesystem.
1088 *
1089 * Returns: the block number allocated
1090 */
1091
1092static uint32_t rgblk_search(struct gfs2_rgrpd *rgd, uint32_t goal,
1093 unsigned char old_state, unsigned char new_state)
1094{
1095 struct gfs2_bitmap *bi = NULL;
1096 uint32_t length = rgd->rd_ri.ri_length;
1097 uint32_t blk = 0;
1098 unsigned int buf, x;
1099
1100 /* Find bitmap block that contains bits for goal block */
1101 for (buf = 0; buf < length; buf++) {
1102 bi = rgd->rd_bits + buf;
1103 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1104 break;
1105 }
1106
1107 gfs2_assert(rgd->rd_sbd, buf < length);
1108
1109 /* Convert scope of "goal" from rgrp-wide to within found bit block */
1110 goal -= bi->bi_start * GFS2_NBBY;
1111
1112 /* Search (up to entire) bitmap in this rgrp for allocatable block.
1113 "x <= length", instead of "x < length", because we typically start
1114 the search in the middle of a bit block, but if we can't find an
1115 allocatable block anywhere else, we want to be able to wrap around and
1116 search in the first part of our first-searched bit block. */
1117 for (x = 0; x <= length; x++) {
1118 if (bi->bi_clone)
1119 blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
1120 bi->bi_len, goal, old_state);
1121 else
1122 blk = gfs2_bitfit(rgd,
1123 bi->bi_bh->b_data + bi->bi_offset,
1124 bi->bi_len, goal, old_state);
1125 if (blk != BFITNOENT)
1126 break;
1127
1128 /* Try next bitmap block (wrap back to rgrp header if at end) */
1129 buf = (buf + 1) % length;
1130 bi = rgd->rd_bits + buf;
1131 goal = 0;
1132 }
1133
1134 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
1135 blk = 0;
1136
1137 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1138 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
1139 bi->bi_len, blk, new_state);
1140 if (bi->bi_clone)
1141 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
1142 bi->bi_len, blk, new_state);
1143
1144 return bi->bi_start * GFS2_NBBY + blk;
1145}
1146
1147/**
1148 * rgblk_free - Change alloc state of given block(s)
1149 * @sdp: the filesystem
1150 * @bstart: the start of a run of blocks to free
1151 * @blen: the length of the block run (all must lie within ONE RG!)
1152 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1153 *
1154 * Returns: Resource group containing the block(s)
1155 */
1156
1157static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, uint64_t bstart,
1158 uint32_t blen, unsigned char new_state)
1159{
1160 struct gfs2_rgrpd *rgd;
1161 struct gfs2_bitmap *bi = NULL;
1162 uint32_t length, rgrp_blk, buf_blk;
1163 unsigned int buf;
1164
1165 rgd = gfs2_blk2rgrpd(sdp, bstart);
1166 if (!rgd) {
1167 if (gfs2_consist(sdp))
1168 fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
1169 return NULL;
1170 }
1171
1172 length = rgd->rd_ri.ri_length;
1173
1174 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1175
1176 while (blen--) {
1177 for (buf = 0; buf < length; buf++) {
1178 bi = rgd->rd_bits + buf;
1179 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1180 break;
1181 }
1182
1183 gfs2_assert(rgd->rd_sbd, buf < length);
1184
1185 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1186 rgrp_blk++;
1187
1188 if (!bi->bi_clone) {
1189 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1190 GFP_KERNEL | __GFP_NOFAIL);
1191 memcpy(bi->bi_clone + bi->bi_offset,
1192 bi->bi_bh->b_data + bi->bi_offset,
1193 bi->bi_len);
1194 }
1195 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1196 gfs2_setbit(rgd,
1197 bi->bi_bh->b_data + bi->bi_offset,
1198 bi->bi_len, buf_blk, new_state);
1199 }
1200
1201 return rgd;
1202}
1203
1204/**
1205 * gfs2_alloc_data - Allocate a data block
1206 * @ip: the inode to allocate the data block for
1207 *
1208 * Returns: the allocated block
1209 */
1210
1211uint64_t gfs2_alloc_data(struct gfs2_inode *ip)
1212{
1213 struct gfs2_sbd *sdp = ip->i_sbd;
1214 struct gfs2_alloc *al = &ip->i_alloc;
1215 struct gfs2_rgrpd *rgd = al->al_rgd;
1216 uint32_t goal, blk;
1217 uint64_t block;
1218
1219 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1220 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1221 else
1222 goal = rgd->rd_last_alloc_data;
1223
1224 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1225 rgd->rd_last_alloc_data = blk;
1226
1227 block = rgd->rd_ri.ri_data0 + blk;
1228 ip->i_di.di_goal_data = block;
1229
1230 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1231 rgd->rd_rg.rg_free--;
1232
1233 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1234 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1235
1236 al->al_alloced++;
1237
1238 gfs2_statfs_change(sdp, 0, -1, 0);
1239 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1240
1241 spin_lock(&sdp->sd_rindex_spin);
1242 rgd->rd_free_clone--;
1243 spin_unlock(&sdp->sd_rindex_spin);
1244
1245 return block;
1246}
1247
1248/**
1249 * gfs2_alloc_meta - Allocate a metadata block
1250 * @ip: the inode to allocate the metadata block for
1251 *
1252 * Returns: the allocated block
1253 */
1254
1255uint64_t gfs2_alloc_meta(struct gfs2_inode *ip)
1256{
1257 struct gfs2_sbd *sdp = ip->i_sbd;
1258 struct gfs2_alloc *al = &ip->i_alloc;
1259 struct gfs2_rgrpd *rgd = al->al_rgd;
1260 uint32_t goal, blk;
1261 uint64_t block;
1262
1263 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1264 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1265 else
1266 goal = rgd->rd_last_alloc_meta;
1267
1268 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1269 rgd->rd_last_alloc_meta = blk;
1270
1271 block = rgd->rd_ri.ri_data0 + blk;
1272 ip->i_di.di_goal_meta = block;
1273
1274 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1275 rgd->rd_rg.rg_free--;
1276
1277 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1278 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1279
1280 al->al_alloced++;
1281
1282 gfs2_statfs_change(sdp, 0, -1, 0);
1283 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1284 gfs2_trans_add_unrevoke(sdp, block);
1285
1286 spin_lock(&sdp->sd_rindex_spin);
1287 rgd->rd_free_clone--;
1288 spin_unlock(&sdp->sd_rindex_spin);
1289
1290 return block;
1291}
1292
1293/**
1294 * gfs2_alloc_di - Allocate a dinode
1295 * @dip: the directory that the inode is going in
1296 *
1297 * Returns: the block allocated
1298 */
1299
1300uint64_t gfs2_alloc_di(struct gfs2_inode *dip)
1301{
1302 struct gfs2_sbd *sdp = dip->i_sbd;
1303 struct gfs2_alloc *al = &dip->i_alloc;
1304 struct gfs2_rgrpd *rgd = al->al_rgd;
1305 uint32_t blk;
1306 uint64_t block;
1307
1308 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1309 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1310
1311 rgd->rd_last_alloc_meta = blk;
1312
1313 block = rgd->rd_ri.ri_data0 + blk;
1314
1315 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1316 rgd->rd_rg.rg_free--;
1317 rgd->rd_rg.rg_dinodes++;
1318
1319 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1320 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1321
1322 al->al_alloced++;
1323
1324 gfs2_statfs_change(sdp, 0, -1, +1);
1325 gfs2_trans_add_unrevoke(sdp, block);
1326
1327 spin_lock(&sdp->sd_rindex_spin);
1328 rgd->rd_free_clone--;
1329 spin_unlock(&sdp->sd_rindex_spin);
1330
1331 return block;
1332}
1333
1334/**
1335 * gfs2_free_data - free a contiguous run of data block(s)
1336 * @ip: the inode these blocks are being freed from
1337 * @bstart: first block of a run of contiguous blocks
1338 * @blen: the length of the block run
1339 *
1340 */
1341
1342void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1343{
1344 struct gfs2_sbd *sdp = ip->i_sbd;
1345 struct gfs2_rgrpd *rgd;
1346
1347 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1348 if (!rgd)
1349 return;
1350
1351 rgd->rd_rg.rg_free += blen;
1352
1353 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1354 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1355
1356 gfs2_trans_add_rg(rgd);
1357
1358 gfs2_statfs_change(sdp, 0, +blen, 0);
1359 gfs2_quota_change(ip, -(int64_t)blen,
1360 ip->i_di.di_uid, ip->i_di.di_gid);
1361}
1362
1363/**
1364 * gfs2_free_meta - free a contiguous run of metadata block(s)
1365 * @ip: the inode these blocks are being freed from
1366 * @bstart: first block of a run of contiguous blocks
1367 * @blen: the length of the block run
1368 *
1369 */
1370
1371void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1372{
1373 struct gfs2_sbd *sdp = ip->i_sbd;
1374 struct gfs2_rgrpd *rgd;
1375
1376 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1377 if (!rgd)
1378 return;
1379
1380 rgd->rd_rg.rg_free += blen;
1381
1382 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1383 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1384
1385 gfs2_trans_add_rg(rgd);
1386
1387 gfs2_statfs_change(sdp, 0, +blen, 0);
1388 gfs2_quota_change(ip, -(int64_t)blen,
1389 ip->i_di.di_uid, ip->i_di.di_gid);
1390 gfs2_meta_wipe(ip, bstart, blen);
1391}
1392
1393void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno)
1394{
1395 struct gfs2_sbd *sdp = rgd->rd_sbd;
1396 struct gfs2_rgrpd *tmp_rgd;
1397
1398 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1399 if (!tmp_rgd)
1400 return;
1401 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1402
1403 if (!rgd->rd_rg.rg_dinodes)
1404 gfs2_consist_rgrpd(rgd);
1405 rgd->rd_rg.rg_dinodes--;
1406 rgd->rd_rg.rg_free++;
1407
1408 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1409 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1410
1411 gfs2_statfs_change(sdp, 0, +1, -1);
1412 gfs2_trans_add_rg(rgd);
1413}
1414
1415/**
1416 * gfs2_free_di - free a dinode block
1417 * @rgd: the resource group that contains the dinode
1418 * @ip: the inode
1419 *
1420 */
1421
1422void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1423{
1424 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1425 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1426 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1427}
1428
1429/**
1430 * gfs2_rlist_add - add a RG to a list of RGs
1431 * @sdp: the filesystem
1432 * @rlist: the list of resource groups
1433 * @block: the block
1434 *
1435 * Figure out what RG a block belongs to and add that RG to the list
1436 *
1437 * FIXME: Don't use NOFAIL
1438 *
1439 */
1440
1441void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1442 uint64_t block)
1443{
1444 struct gfs2_rgrpd *rgd;
1445 struct gfs2_rgrpd **tmp;
1446 unsigned int new_space;
1447 unsigned int x;
1448
1449 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1450 return;
1451
1452 rgd = gfs2_blk2rgrpd(sdp, block);
1453 if (!rgd) {
1454 if (gfs2_consist(sdp))
1455 fs_err(sdp, "block = %llu\n", (unsigned long long)block);
1456 return;
1457 }
1458
1459 for (x = 0; x < rlist->rl_rgrps; x++)
1460 if (rlist->rl_rgd[x] == rgd)
1461 return;
1462
1463 if (rlist->rl_rgrps == rlist->rl_space) {
1464 new_space = rlist->rl_space + 10;
1465
1466 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1467 GFP_KERNEL | __GFP_NOFAIL);
1468
1469 if (rlist->rl_rgd) {
1470 memcpy(tmp, rlist->rl_rgd,
1471 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1472 kfree(rlist->rl_rgd);
1473 }
1474
1475 rlist->rl_space = new_space;
1476 rlist->rl_rgd = tmp;
1477 }
1478
1479 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1480}
1481
1482/**
1483 * gfs2_rlist_alloc - allocate and initialize an array of glock holders,
1484 * one for each RG that has been added to the rlist
1485 * @rlist: the list of resource groups
1486 * @state: the lock state to acquire the RG lock in
1487 * @flags: the modifier flags for the holder structures
1488 *
1489 * FIXME: Don't use NOFAIL
1490 *
1491 */
1492
1493void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1494 int flags)
1495{
1496 unsigned int x;
1497
1498 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1499 GFP_KERNEL | __GFP_NOFAIL);
1500 for (x = 0; x < rlist->rl_rgrps; x++)
1501 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1502 state, flags,
1503 &rlist->rl_ghs[x]);
1504}
1505
1506/**
1507 * gfs2_rlist_free - free a resource group list
1508 * @rlist: the list of resource groups
1509 *
1510 */
1511
1512void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1513{
1514 unsigned int x;
1515
1516 kfree(rlist->rl_rgd);
1517
1518 if (rlist->rl_ghs) {
1519 for (x = 0; x < rlist->rl_rgrps; x++)
1520 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1521 kfree(rlist->rl_ghs);
1522 }
1523}
1524
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..d2db3719cc0f
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
14
15struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk);
16struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
17struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
18
19void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
20int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
21
22int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
23void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
24void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
25
26void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
27
28struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
29void gfs2_alloc_put(struct gfs2_inode *ip);
30
31int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
32 char *file, unsigned int line);
33#define gfs2_inplace_reserve(ip) \
34gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
35
36void gfs2_inplace_release(struct gfs2_inode *ip);
37
38unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block);
39
40uint64_t gfs2_alloc_data(struct gfs2_inode *ip);
41uint64_t gfs2_alloc_meta(struct gfs2_inode *ip);
42uint64_t gfs2_alloc_di(struct gfs2_inode *ip);
43
44void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
45void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
46void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno);
47void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
48
49struct gfs2_rgrp_list {
50 unsigned int rl_rgrps;
51 unsigned int rl_space;
52 struct gfs2_rgrpd **rl_rgd;
53 struct gfs2_holder *rl_ghs;
54};
55
56void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
57 uint64_t block);
58void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
59 int flags);
60void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
61
62#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..a943a505bc5a
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,945 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "dir.h"
23#include "format.h"
24#include "glock.h"
25#include "glops.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "quota.h"
30#include "recovery.h"
31#include "rgrp.h"
32#include "super.h"
33#include "trans.h"
34#include "unlinked.h"
35#include "util.h"
36
37/**
38 * gfs2_tune_init - Fill a gfs2_tune structure with default values
39 * @gt: tune
40 *
41 */
42
43void gfs2_tune_init(struct gfs2_tune *gt)
44{
45 spin_lock_init(&gt->gt_spin);
46
47 gt->gt_ilimit = 100;
48 gt->gt_ilimit_tries = 3;
49 gt->gt_ilimit_min = 1;
50 gt->gt_demote_secs = 300;
51 gt->gt_incore_log_blocks = 1024;
52 gt->gt_log_flush_secs = 60;
53 gt->gt_jindex_refresh_secs = 60;
54 gt->gt_scand_secs = 15;
55 gt->gt_recoverd_secs = 60;
56 gt->gt_logd_secs = 1;
57 gt->gt_quotad_secs = 5;
58 gt->gt_inoded_secs = 15;
59 gt->gt_quota_simul_sync = 64;
60 gt->gt_quota_warn_period = 10;
61 gt->gt_quota_scale_num = 1;
62 gt->gt_quota_scale_den = 1;
63 gt->gt_quota_cache_secs = 300;
64 gt->gt_quota_quantum = 60;
65 gt->gt_atime_quantum = 3600;
66 gt->gt_new_files_jdata = 0;
67 gt->gt_new_files_directio = 0;
68 gt->gt_max_atomic_write = 4 << 20;
69 gt->gt_max_readahead = 1 << 18;
70 gt->gt_lockdump_size = 131072;
71 gt->gt_stall_secs = 600;
72 gt->gt_complain_secs = 10;
73 gt->gt_reclaim_limit = 5000;
74 gt->gt_entries_per_readdir = 32;
75 gt->gt_prefetch_secs = 10;
76 gt->gt_greedy_default = HZ / 10;
77 gt->gt_greedy_quantum = HZ / 40;
78 gt->gt_greedy_max = HZ / 4;
79 gt->gt_statfs_quantum = 30;
80 gt->gt_statfs_slow = 0;
81}
82
83/**
84 * gfs2_check_sb - Check superblock
85 * @sdp: the filesystem
86 * @sb: The superblock
87 * @silent: Don't print a message if the check fails
88 *
89 * Checks the version code of the FS is one that we understand how to
90 * read and that the sizes of the various on-disk structures have not
91 * changed.
92 */
93
94int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
95{
96 unsigned int x;
97
98 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
99 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
100 if (!silent)
101 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
102 return -EINVAL;
103 }
104
105 /* If format numbers match exactly, we're done. */
106
107 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
108 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
109 return 0;
110
111 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
112 for (x = 0; gfs2_old_fs_formats[x]; x++)
113 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
114 break;
115
116 if (!gfs2_old_fs_formats[x]) {
117 printk(KERN_WARNING
118 "GFS2: code version (%u, %u) is incompatible "
119 "with ondisk format (%u, %u)\n",
120 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
121 sb->sb_fs_format, sb->sb_multihost_format);
122 printk(KERN_WARNING
123 "GFS2: I don't know how to upgrade this FS\n");
124 return -EINVAL;
125 }
126 }
127
128 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
129 for (x = 0; gfs2_old_multihost_formats[x]; x++)
130 if (gfs2_old_multihost_formats[x] ==
131 sb->sb_multihost_format)
132 break;
133
134 if (!gfs2_old_multihost_formats[x]) {
135 printk(KERN_WARNING
136 "GFS2: code version (%u, %u) is incompatible "
137 "with ondisk format (%u, %u)\n",
138 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
139 sb->sb_fs_format, sb->sb_multihost_format);
140 printk(KERN_WARNING
141 "GFS2: I don't know how to upgrade this FS\n");
142 return -EINVAL;
143 }
144 }
145
146 if (!sdp->sd_args.ar_upgrade) {
147 printk(KERN_WARNING
148 "GFS2: code version (%u, %u) is incompatible "
149 "with ondisk format (%u, %u)\n",
150 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
151 sb->sb_fs_format, sb->sb_multihost_format);
152 printk(KERN_INFO
153 "GFS2: Use the \"upgrade\" mount option to upgrade "
154 "the FS\n");
155 printk(KERN_INFO "GFS2: See the manual for more details\n");
156 return -EINVAL;
157 }
158
159 return 0;
160}
161
162/**
163 * gfs2_read_sb - Read super block
164 * @sdp: The GFS2 superblock
165 * @gl: the glock for the superblock (assumed to be held)
166 * @silent: Don't print a message if the mount fails
167 *
168 */
169
170int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
171{
172 struct buffer_head *bh;
173 uint32_t hash_blocks, ind_blocks, leaf_blocks;
174 uint32_t tmp_blocks;
175 unsigned int x;
176 int error;
177
178 error = gfs2_meta_read(gl, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift,
179 DIO_FORCE | DIO_START | DIO_WAIT, &bh);
180 if (error) {
181 if (!silent)
182 fs_err(sdp, "can't read superblock\n");
183 return error;
184 }
185
186 gfs2_assert(sdp, sizeof(struct gfs2_sb) <= bh->b_size);
187 gfs2_sb_in(&sdp->sd_sb, bh->b_data);
188 brelse(bh);
189
190 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
191 if (error)
192 return error;
193
194 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
195 GFS2_BASIC_BLOCK_SHIFT;
196 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
197 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
198 sizeof(struct gfs2_dinode)) / sizeof(uint64_t);
199 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
200 sizeof(struct gfs2_meta_header)) / sizeof(uint64_t);
201 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
202 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
203 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
204 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
205 sdp->sd_ut_per_block = (sdp->sd_sb.sb_bsize -
206 sizeof(struct gfs2_meta_header)) /
207 sizeof(struct gfs2_unlinked_tag);
208 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
209 sizeof(struct gfs2_meta_header)) /
210 sizeof(struct gfs2_quota_change);
211
212	/* Compute maximum reservation required to add an entry to a directory */
213
214 hash_blocks = DIV_ROUND_UP(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH),
215 sdp->sd_jbsize);
216
217 ind_blocks = 0;
218 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
219 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
220 ind_blocks += tmp_blocks;
221 }
222
223 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
224
225 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
226
227 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
228 sizeof(struct gfs2_dinode);
229 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
230 for (x = 2;; x++) {
231 uint64_t space, d;
232 uint32_t m;
233
234 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
235 d = space;
236 m = do_div(d, sdp->sd_inptrs);
237
238 if (d != sdp->sd_heightsize[x - 1] || m)
239 break;
240 sdp->sd_heightsize[x] = space;
241 }
242 sdp->sd_max_height = x;
243 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
244
245 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
246 sizeof(struct gfs2_dinode);
247 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
248 for (x = 2;; x++) {
249 uint64_t space, d;
250 uint32_t m;
251
252 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
253 d = space;
254 m = do_div(d, sdp->sd_inptrs);
255
256 if (d != sdp->sd_jheightsize[x - 1] || m)
257 break;
258 sdp->sd_jheightsize[x] = space;
259 }
260 sdp->sd_max_jheight = x;
261 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
262
263 return 0;
264}
265
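A worked example of the height tables computed above, assuming 4096-byte blocks and the on-disk structure sizes this code targets (232-byte gfs2_dinode, 24-byte gfs2_meta_header):

	sd_diptrs = (4096 - 232) / 8 = 483      /* pointers in the dinode */
	sd_inptrs = (4096 -  24) / 8 = 509      /* pointers per indirect block */

	sd_heightsize[0] = 4096 - 232    = 3864          /* data stuffed in the dinode */
	sd_heightsize[1] = 4096 * 483    = 1978368       /* ~1.9 MB at height 1 */
	sd_heightsize[2] = 1978368 * 509 = 1006989312    /* ~1 GB at height 2 */

Each additional level of indirection multiplies the limit by sd_inptrs; the loop stops when the 64-bit multiply would overflow (the divide-back check), and sd_max_height is the height at which that happens.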
266/**
267 * gfs2_jindex_hold - Grab a lock on the jindex
268 * @sdp: The GFS2 superblock
269 * @ji_gh: the holder for the jindex glock
270 *
271 * This is very similar to the gfs2_rindex_hold() function, except that
272 * we generally hold the jindex lock for longer periods of time and
273 * grab it far less frequently than the rgrp lock.
274 *
275 * Returns: errno
276 */
277
278int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
279{
280 struct gfs2_inode *dip = sdp->sd_jindex->u.generic_ip;
281 struct qstr name;
282 char buf[20];
283 struct gfs2_jdesc *jd;
284 int error;
285
286 name.name = buf;
287
288 mutex_lock(&sdp->sd_jindex_mutex);
289
290 for (;;) {
291 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
292 GL_LOCAL_EXCL, ji_gh);
293 if (error)
294 break;
295
296 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
297 name.hash = gfs2_disk_hash(name.name, name.len);
298
299 error = gfs2_dir_search(sdp->sd_jindex,
300 &name, NULL, NULL);
301 if (error == -ENOENT) {
302 error = 0;
303 break;
304 }
305
306 gfs2_glock_dq_uninit(ji_gh);
307
308 if (error)
309 break;
310
311 error = -ENOMEM;
312 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
313 if (!jd)
314 break;
315
316 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
317 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
318 if (!jd->jd_inode)
319 error = -ENOENT;
320 else
321 error = PTR_ERR(jd->jd_inode);
322 kfree(jd);
323 break;
324 }
325
326 spin_lock(&sdp->sd_jindex_spin);
327 jd->jd_jid = sdp->sd_journals++;
328 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
329 spin_unlock(&sdp->sd_jindex_spin);
330 }
331
332 mutex_unlock(&sdp->sd_jindex_mutex);
333
334 return error;
335}
336
337/**
338 * gfs2_jindex_free - Clear all the journal index information
339 * @sdp: The GFS2 superblock
340 *
341 */
342
343void gfs2_jindex_free(struct gfs2_sbd *sdp)
344{
345 struct list_head list;
346 struct gfs2_jdesc *jd;
347
348 spin_lock(&sdp->sd_jindex_spin);
349 list_add(&list, &sdp->sd_jindex_list);
350 list_del_init(&sdp->sd_jindex_list);
351 sdp->sd_journals = 0;
352 spin_unlock(&sdp->sd_jindex_spin);
353
354 while (!list_empty(&list)) {
355 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
356 list_del(&jd->jd_list);
357 iput(jd->jd_inode);
358 kfree(jd);
359 }
360}
361
362static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
363{
364 struct gfs2_jdesc *jd;
365 int found = 0;
366
367 list_for_each_entry(jd, head, jd_list) {
368 if (jd->jd_jid == jid) {
369 found = 1;
370 break;
371 }
372 }
373
374 if (!found)
375 jd = NULL;
376
377 return jd;
378}
379
380struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
381{
382 struct gfs2_jdesc *jd;
383
384 spin_lock(&sdp->sd_jindex_spin);
385 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
386 spin_unlock(&sdp->sd_jindex_spin);
387
388 return jd;
389}
390
391void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
392{
393 struct gfs2_jdesc *jd;
394
395 spin_lock(&sdp->sd_jindex_spin);
396 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
397 if (jd)
398 jd->jd_dirty = 1;
399 spin_unlock(&sdp->sd_jindex_spin);
400}
401
402struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
403{
404 struct gfs2_jdesc *jd;
405 int found = 0;
406
407 spin_lock(&sdp->sd_jindex_spin);
408
409 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
410 if (jd->jd_dirty) {
411 jd->jd_dirty = 0;
412 found = 1;
413 break;
414 }
415 }
416 spin_unlock(&sdp->sd_jindex_spin);
417
418 if (!found)
419 jd = NULL;
420
421 return jd;
422}
423
424int gfs2_jdesc_check(struct gfs2_jdesc *jd)
425{
426 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
427 struct gfs2_sbd *sdp = ip->i_sbd;
428 int ar;
429 int error;
430
431 if (ip->i_di.di_size < (8 << 20) ||
432 ip->i_di.di_size > (1 << 30) ||
433 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
434 gfs2_consist_inode(ip);
435 return -EIO;
436 }
437 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
438
439 error = gfs2_write_alloc_required(ip,
440 0, ip->i_di.di_size,
441 &ar);
442 if (!error && ar) {
443 gfs2_consist_inode(ip);
444 error = -EIO;
445 }
446
447 return error;
448}
449
450/**
451 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
452 * @sdp: the filesystem
453 *
454 * Returns: errno
455 */
456
457int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
458{
459 struct gfs2_inode *ip = sdp->sd_jdesc->jd_inode->u.generic_ip;
460 struct gfs2_glock *j_gl = ip->i_gl;
461 struct gfs2_holder t_gh;
462 struct gfs2_log_header head;
463 int error;
464
465 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
466 GL_LOCAL_EXCL, &t_gh);
467 if (error)
468 return error;
469
470 gfs2_meta_cache_flush(ip);
471 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
472
473 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
474 if (error)
475 goto fail;
476
477 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
478 gfs2_consist(sdp);
479 error = -EIO;
480 goto fail;
481 }
482
483	/* Initialize the head of the log */
484 sdp->sd_log_sequence = head.lh_sequence + 1;
485 gfs2_log_pointers_init(sdp, head.lh_blkno);
486
487 error = gfs2_unlinked_init(sdp);
488 if (error)
489 goto fail;
490 error = gfs2_quota_init(sdp);
491 if (error)
492 goto fail_unlinked;
493
494 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
495
496 gfs2_glock_dq_uninit(&t_gh);
497
498 return 0;
499
500 fail_unlinked:
501 gfs2_unlinked_cleanup(sdp);
502
503 fail:
504 t_gh.gh_flags |= GL_NOCACHE;
505 gfs2_glock_dq_uninit(&t_gh);
506
507 return error;
508}
509
510/**
511 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
512 * @sdp: the filesystem
513 *
514 * Returns: errno
515 */
516
517int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
518{
519 struct gfs2_holder t_gh;
520 int error;
521
522 gfs2_unlinked_dealloc(sdp);
523 gfs2_quota_sync(sdp);
524 gfs2_statfs_sync(sdp);
525
526 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
527 GL_LOCAL_EXCL | GL_NOCACHE,
528 &t_gh);
529 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
530 return error;
531
532 gfs2_meta_syncfs(sdp);
533 gfs2_log_shutdown(sdp);
534
535 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
536
537 if (t_gh.gh_gl)
538 gfs2_glock_dq_uninit(&t_gh);
539
540 gfs2_unlinked_cleanup(sdp);
541 gfs2_quota_cleanup(sdp);
542
543 return error;
544}
545
546int gfs2_statfs_init(struct gfs2_sbd *sdp)
547{
548 struct gfs2_inode *m_ip = sdp->sd_statfs_inode->u.generic_ip;
549 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
550 struct gfs2_inode *l_ip = sdp->sd_sc_inode->u.generic_ip;
551 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
552 struct buffer_head *m_bh, *l_bh;
553 struct gfs2_holder gh;
554 int error;
555
556 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
557 &gh);
558 if (error)
559 return error;
560
561 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
562 if (error)
563 goto out;
564
565 if (sdp->sd_args.ar_spectator) {
566 spin_lock(&sdp->sd_statfs_spin);
567 gfs2_statfs_change_in(m_sc, m_bh->b_data +
568 sizeof(struct gfs2_dinode));
569 spin_unlock(&sdp->sd_statfs_spin);
570 } else {
571 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
572 if (error)
573 goto out_m_bh;
574
575 spin_lock(&sdp->sd_statfs_spin);
576 gfs2_statfs_change_in(m_sc, m_bh->b_data +
577 sizeof(struct gfs2_dinode));
578 gfs2_statfs_change_in(l_sc, l_bh->b_data +
579 sizeof(struct gfs2_dinode));
580 spin_unlock(&sdp->sd_statfs_spin);
581
582 brelse(l_bh);
583 }
584
585 out_m_bh:
586 brelse(m_bh);
587
588 out:
589 gfs2_glock_dq_uninit(&gh);
590
591	return error;
592}
593
594void gfs2_statfs_change(struct gfs2_sbd *sdp, int64_t total, int64_t free,
595 int64_t dinodes)
596{
597 struct gfs2_inode *l_ip = sdp->sd_sc_inode->u.generic_ip;
598 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
599 struct buffer_head *l_bh;
600 int error;
601
602 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
603 if (error)
604 return;
605
606 mutex_lock(&sdp->sd_statfs_mutex);
607 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
608 mutex_unlock(&sdp->sd_statfs_mutex);
609
610 spin_lock(&sdp->sd_statfs_spin);
611 l_sc->sc_total += total;
612 l_sc->sc_free += free;
613 l_sc->sc_dinodes += dinodes;
614 gfs2_statfs_change_out(l_sc, l_bh->b_data +
615 sizeof(struct gfs2_dinode));
616 spin_unlock(&sdp->sd_statfs_spin);
617
618 brelse(l_bh);
619}
620
621int gfs2_statfs_sync(struct gfs2_sbd *sdp)
622{
623 struct gfs2_inode *m_ip = sdp->sd_statfs_inode->u.generic_ip;
624 struct gfs2_inode *l_ip = sdp->sd_sc_inode->u.generic_ip;
625 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
626 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
627 struct gfs2_holder gh;
628 struct buffer_head *m_bh, *l_bh;
629 int error;
630
631 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
632 &gh);
633 if (error)
634 return error;
635
636 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
637 if (error)
638 goto out;
639
640 spin_lock(&sdp->sd_statfs_spin);
641 gfs2_statfs_change_in(m_sc, m_bh->b_data +
642 sizeof(struct gfs2_dinode));
643 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
644 spin_unlock(&sdp->sd_statfs_spin);
645 goto out_bh;
646 }
647 spin_unlock(&sdp->sd_statfs_spin);
648
649 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
650 if (error)
651 goto out_bh;
652
653 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
654 if (error)
655 goto out_bh2;
656
657 mutex_lock(&sdp->sd_statfs_mutex);
658 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
659 mutex_unlock(&sdp->sd_statfs_mutex);
660
661 spin_lock(&sdp->sd_statfs_spin);
662 m_sc->sc_total += l_sc->sc_total;
663 m_sc->sc_free += l_sc->sc_free;
664 m_sc->sc_dinodes += l_sc->sc_dinodes;
665 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
666 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
667 0, sizeof(struct gfs2_statfs_change));
668 spin_unlock(&sdp->sd_statfs_spin);
669
670 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
671 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
672
673 gfs2_trans_end(sdp);
674
675 out_bh2:
676 brelse(l_bh);
677
678 out_bh:
679 brelse(m_bh);
680
681 out:
682 gfs2_glock_dq_uninit(&gh);
683
684 return error;
685}
686
687/**
688 * gfs2_statfs_i - Do a statfs
689 * @sdp: the filesystem
690 * @sc: the statfs change structure to fill in
691 *
692 * Returns: errno
693 */
694
695int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
696{
697 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
698 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
699
700 spin_lock(&sdp->sd_statfs_spin);
701
702 *sc = *m_sc;
703 sc->sc_total += l_sc->sc_total;
704 sc->sc_free += l_sc->sc_free;
705 sc->sc_dinodes += l_sc->sc_dinodes;
706
707 spin_unlock(&sdp->sd_statfs_spin);
708
709 if (sc->sc_free < 0)
710 sc->sc_free = 0;
711 if (sc->sc_free > sc->sc_total)
712 sc->sc_free = sc->sc_total;
713 if (sc->sc_dinodes < 0)
714 sc->sc_dinodes = 0;
715
716 return 0;
717}
718
719/**
720 * statfs_slow_fill - fill in the sc for a given RG
721 * @rgd: the RG
722 * @sc: the sc structure
723 *
724 * Returns: 0 on success
725 */
726
727static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
728 struct gfs2_statfs_change *sc)
729{
730 gfs2_rgrp_verify(rgd);
731 sc->sc_total += rgd->rd_ri.ri_data;
732 sc->sc_free += rgd->rd_rg.rg_free;
733 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
734 return 0;
735}
736
737/**
738 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
739 * @sdp: the filesystem
740 * @sc: the sc info that will be returned
741 *
742 * Any error (other than a signal) will cause this routine to fall back
743 * to the synchronous version.
744 *
745 * FIXME: This really shouldn't busy wait like this.
746 *
747 * Returns: errno
748 */
749
750int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
751{
752 struct gfs2_holder ri_gh;
753 struct gfs2_rgrpd *rgd_next;
754 struct gfs2_holder *gha, *gh;
755 unsigned int slots = 64;
756 unsigned int x;
757 int done;
758 int error = 0, err;
759
760 memset(sc, 0, sizeof(struct gfs2_statfs_change));
761 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
762 if (!gha)
763 return -ENOMEM;
764
765 error = gfs2_rindex_hold(sdp, &ri_gh);
766 if (error)
767 goto out;
768
769 rgd_next = gfs2_rgrpd_get_first(sdp);
770
771 for (;;) {
772 done = 1;
773
774 for (x = 0; x < slots; x++) {
775 gh = gha + x;
776
777 if (gh->gh_gl && gfs2_glock_poll(gh)) {
778 err = gfs2_glock_wait(gh);
779 if (err) {
780 gfs2_holder_uninit(gh);
781 error = err;
782 } else {
783 if (!error)
784 error = statfs_slow_fill(
785 gh->gh_gl->gl_object, sc);
786 gfs2_glock_dq_uninit(gh);
787 }
788 }
789
790 if (gh->gh_gl)
791 done = 0;
792 else if (rgd_next && !error) {
793 error = gfs2_glock_nq_init(rgd_next->rd_gl,
794 LM_ST_SHARED,
795 GL_ASYNC,
796 gh);
797 rgd_next = gfs2_rgrpd_get_next(rgd_next);
798 done = 0;
799 }
800
801 if (signal_pending(current))
802 error = -ERESTARTSYS;
803 }
804
805 if (done)
806 break;
807
808 yield();
809 }
810
811 gfs2_glock_dq_uninit(&ri_gh);
812
813 out:
814 kfree(gha);
815
816 return error;
817}
818
819struct lfcc {
820 struct list_head list;
821 struct gfs2_holder gh;
822};
823
824/**
825 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
826 * journals are clean
827 * @sdp: the file system
829 * @t_gh: the hold on the transaction lock
830 *
831 * Returns: errno
832 */
833
834static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
835 struct gfs2_holder *t_gh)
836{
837 struct gfs2_inode *ip;
838 struct gfs2_holder ji_gh;
839 struct gfs2_jdesc *jd;
840 struct lfcc *lfcc;
841 LIST_HEAD(list);
842 struct gfs2_log_header lh;
843 int error;
844
845 error = gfs2_jindex_hold(sdp, &ji_gh);
846 if (error)
847 return error;
848
849 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
850 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
851 if (!lfcc) {
852 error = -ENOMEM;
853 goto out;
854 }
855 ip = jd->jd_inode->u.generic_ip;
856 error = gfs2_glock_nq_init(ip->i_gl,
857 LM_ST_SHARED, 0,
858 &lfcc->gh);
859 if (error) {
860 kfree(lfcc);
861 goto out;
862 }
863 list_add(&lfcc->list, &list);
864 }
865
866 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
867 LM_FLAG_PRIORITY | GL_NOCACHE,
868 t_gh);
869
870 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
871 error = gfs2_jdesc_check(jd);
872 if (error)
873 break;
874 error = gfs2_find_jhead(jd, &lh);
875 if (error)
876 break;
877 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
878 error = -EBUSY;
879 break;
880 }
881 }
882
883 if (error)
884 gfs2_glock_dq_uninit(t_gh);
885
886 out:
887 while (!list_empty(&list)) {
888 lfcc = list_entry(list.next, struct lfcc, list);
889 list_del(&lfcc->list);
890 gfs2_glock_dq_uninit(&lfcc->gh);
891 kfree(lfcc);
892 }
893 gfs2_glock_dq_uninit(&ji_gh);
894
895 return error;
896}
897
898/**
899 * gfs2_freeze_fs - freezes the file system
900 * @sdp: the file system
901 *
902 * This function flushes data and metadata for all machines by
903 * acquiring the transaction log exclusively. All journals are
904 * ensured to be in a clean state as well.
905 *
906 * Returns: errno
907 */
908
909int gfs2_freeze_fs(struct gfs2_sbd *sdp)
910{
911 int error = 0;
912
913 mutex_lock(&sdp->sd_freeze_lock);
914
915 if (!sdp->sd_freeze_count++) {
916 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
917 if (error)
918 sdp->sd_freeze_count--;
919 }
920
921 mutex_unlock(&sdp->sd_freeze_lock);
922
923 return error;
924}
925
926/**
927 * gfs2_unfreeze_fs - unfreezes the file system
928 * @sdp: the file system
929 *
930 * This function allows the file system to proceed by unlocking
931 * the exclusively held transaction lock. Other GFS2 nodes are
932 * now free to acquire the lock shared and go on with their lives.
933 *
934 */
935
936void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
937{
938 mutex_lock(&sdp->sd_freeze_lock);
939
940 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
941 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
942
943 mutex_unlock(&sdp->sd_freeze_lock);
944}
945
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..df2495230402
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13void gfs2_tune_init(struct gfs2_tune *gt);
14
15int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
16int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
17
18static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
19{
20 unsigned int x;
21 spin_lock(&sdp->sd_jindex_spin);
22 x = sdp->sd_journals;
23 spin_unlock(&sdp->sd_jindex_spin);
24 return x;
25}
26
27int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
28void gfs2_jindex_free(struct gfs2_sbd *sdp);
29
30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
31void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
32struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
33int gfs2_jdesc_check(struct gfs2_jdesc *jd);
34
35int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp);
37
38int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
40
41int gfs2_statfs_init(struct gfs2_sbd *sdp);
42void gfs2_statfs_change(struct gfs2_sbd *sdp,
43 int64_t total, int64_t free, int64_t dinodes);
44int gfs2_statfs_sync(struct gfs2_sbd *sdp);
45int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
46int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
47
48int gfs2_freeze_fs(struct gfs2_sbd *sdp);
49void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
50
51#endif /* __SUPER_DOT_H__ */
52
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..d32a2c54daee
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,581 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "lm.h"
24#include "sys.h"
25#include "super.h"
26#include "glock.h"
27#include "quota.h"
28#include "util.h"
29
30char *gfs2_sys_margs;
31spinlock_t gfs2_sys_margs_lock;
32
33static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
34{
35 return sprintf(buf, "%s\n", sdp->sd_vfs->s_id);
36}
37
38static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
39{
40 return sprintf(buf, "%s\n", sdp->sd_fsname);
41}
42
43static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
44{
45 unsigned int count;
46
47 mutex_lock(&sdp->sd_freeze_lock);
48 count = sdp->sd_freeze_count;
49 mutex_unlock(&sdp->sd_freeze_lock);
50
51 return sprintf(buf, "%u\n", count);
52}
53
54static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
55{
56 ssize_t ret = len;
57 int error = 0;
58 int n = simple_strtol(buf, NULL, 0);
59
60 if (!capable(CAP_SYS_ADMIN))
61 return -EACCES;
62
63 switch (n) {
64 case 0:
65 gfs2_unfreeze_fs(sdp);
66 break;
67 case 1:
68 error = gfs2_freeze_fs(sdp);
69 break;
70 default:
71 ret = -EINVAL;
72 }
73
74 if (error)
75 fs_warn(sdp, "freeze %d error %d", n, error);
76
77 return ret;
78}
79
80static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
81{
82 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
83 return sprintf(buf, "%u\n", b);
84}
85
86static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
87{
88 if (!capable(CAP_SYS_ADMIN))
89 return -EACCES;
90
91 if (simple_strtol(buf, NULL, 0) != 1)
92 return -EINVAL;
93
94 gfs2_lm_withdraw(sdp,
95 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
96 sdp->sd_fsname);
97 return len;
98}
99
100static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
101 size_t len)
102{
103 if (!capable(CAP_SYS_ADMIN))
104 return -EACCES;
105
106 if (simple_strtol(buf, NULL, 0) != 1)
107 return -EINVAL;
108
109 gfs2_statfs_sync(sdp);
110 return len;
111}
112
113static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
114{
115 if (!capable(CAP_SYS_ADMIN))
116 return -EACCES;
117
118 if (simple_strtol(buf, NULL, 0) != 1)
119 return -EINVAL;
120
121 gfs2_gl_hash_clear(sdp, NO_WAIT);
122 return len;
123}
124
125static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
126 size_t len)
127{
128 if (!capable(CAP_SYS_ADMIN))
129 return -EACCES;
130
131 if (simple_strtol(buf, NULL, 0) != 1)
132 return -EINVAL;
133
134 gfs2_quota_sync(sdp);
135 return len;
136}
137
138static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
139 size_t len)
140{
141 uint32_t id;
142
143 if (!capable(CAP_SYS_ADMIN))
144 return -EACCES;
145
146 id = simple_strtoul(buf, NULL, 0);
147
148 gfs2_quota_refresh(sdp, 1, id);
149 return len;
150}
151
152static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
153 size_t len)
154{
155 uint32_t id;
156
157 if (!capable(CAP_SYS_ADMIN))
158 return -EACCES;
159
160 id = simple_strtoul(buf, NULL, 0);
161
162 gfs2_quota_refresh(sdp, 0, id);
163 return len;
164}
165
166struct gfs2_attr {
167 struct attribute attr;
168 ssize_t (*show)(struct gfs2_sbd *, char *);
169 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
170};
171
172#define GFS2_ATTR(name, mode, show, store) \
173static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
174
175GFS2_ATTR(id, 0444, id_show, NULL);
176GFS2_ATTR(fsname, 0444, fsname_show, NULL);
177GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
178GFS2_ATTR(shrink, 0200, NULL, shrink_store);
179GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
180GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
181GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
182GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
183GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
184
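To make the macro concrete: GFS2_ATTR(freeze, 0644, freeze_show, freeze_store) above expands, via the kernel's __ATTR() initializer, to roughly:

	static struct gfs2_attr gfs2_attr_freeze = {
		.attr  = { .name = "freeze", .mode = 0644 },
		.show  = freeze_show,
		.store = freeze_store,
	};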
185static struct attribute *gfs2_attrs[] = {
186 &gfs2_attr_id.attr,
187 &gfs2_attr_fsname.attr,
188 &gfs2_attr_freeze.attr,
189 &gfs2_attr_shrink.attr,
190 &gfs2_attr_withdraw.attr,
191 &gfs2_attr_statfs_sync.attr,
192 &gfs2_attr_quota_sync.attr,
193 &gfs2_attr_quota_refresh_user.attr,
194 &gfs2_attr_quota_refresh_group.attr,
195 NULL,
196};
197
198static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
199 char *buf)
200{
201 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
202 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
203 return a->show ? a->show(sdp, buf) : 0;
204}
205
206static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
207 const char *buf, size_t len)
208{
209 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
210 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
211 return a->store ? a->store(sdp, buf, len) : len;
212}
213
214static struct sysfs_ops gfs2_attr_ops = {
215 .show = gfs2_attr_show,
216 .store = gfs2_attr_store,
217};
218
219static struct kobj_type gfs2_ktype = {
220 .default_attrs = gfs2_attrs,
221 .sysfs_ops = &gfs2_attr_ops,
222};
223
224static struct kset gfs2_kset = {
225 .subsys = &fs_subsys,
226 .kobj = {.name = "gfs2",},
227 .ktype = &gfs2_ktype,
228};
229
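Since the kset is registered under fs_subsys with the name "gfs2" and each superblock's kobject is named after its lock table (see gfs2_sys_fs_add() below), these attributes surface under /sys/fs/gfs2/<locktable>/, so a freeze can be requested with, e.g., echo 1 > /sys/fs/gfs2/<locktable>/freeze and undone by writing 0.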
230/*
231 * display struct lm_lockstruct fields
232 */
233
234struct lockstruct_attr {
235 struct attribute attr;
236 ssize_t (*show)(struct gfs2_sbd *, char *);
237};
238
239#define LOCKSTRUCT_ATTR(name, fmt) \
240static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
241{ \
242 return sprintf(buf, fmt, sdp->sd_lockstruct.ls_##name); \
243} \
244static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
245
246LOCKSTRUCT_ATTR(jid, "%u\n");
247LOCKSTRUCT_ATTR(first, "%u\n");
248LOCKSTRUCT_ATTR(lvb_size, "%u\n");
249LOCKSTRUCT_ATTR(flags, "%d\n");
250
251static struct attribute *lockstruct_attrs[] = {
252 &lockstruct_attr_jid.attr,
253 &lockstruct_attr_first.attr,
254 &lockstruct_attr_lvb_size.attr,
255 &lockstruct_attr_flags.attr,
256 NULL
257};
258
259/*
260 * display struct gfs2_args fields
261 */
262
263struct args_attr {
264 struct attribute attr;
265 ssize_t (*show)(struct gfs2_sbd *, char *);
266};
267
268#define ARGS_ATTR(name, fmt) \
269static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
270{ \
271 return sprintf(buf, fmt, sdp->sd_args.ar_##name); \
272} \
273static struct args_attr args_attr_##name = __ATTR_RO(name)
274
275ARGS_ATTR(lockproto, "%s\n");
276ARGS_ATTR(locktable, "%s\n");
277ARGS_ATTR(hostdata, "%s\n");
278ARGS_ATTR(spectator, "%d\n");
279ARGS_ATTR(ignore_local_fs, "%d\n");
280ARGS_ATTR(localcaching, "%d\n");
281ARGS_ATTR(localflocks, "%d\n");
282ARGS_ATTR(debug, "%d\n");
283ARGS_ATTR(upgrade, "%d\n");
284ARGS_ATTR(num_glockd, "%u\n");
285ARGS_ATTR(posix_acl, "%d\n");
286ARGS_ATTR(quota, "%u\n");
287ARGS_ATTR(suiddir, "%d\n");
288ARGS_ATTR(data, "%d\n");
289
290/* one oddball doesn't fit the macro mold */
291static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
292{
293 return sprintf(buf, "%d\n", !!test_bit(SDF_NOATIME, &sdp->sd_flags));
294}
295static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
296
297static struct attribute *args_attrs[] = {
298 &args_attr_lockproto.attr,
299 &args_attr_locktable.attr,
300 &args_attr_hostdata.attr,
301 &args_attr_spectator.attr,
302 &args_attr_ignore_local_fs.attr,
303 &args_attr_localcaching.attr,
304 &args_attr_localflocks.attr,
305 &args_attr_debug.attr,
306 &args_attr_upgrade.attr,
307 &args_attr_num_glockd.attr,
308 &args_attr_posix_acl.attr,
309 &args_attr_quota.attr,
310 &args_attr_suiddir.attr,
311 &args_attr_data.attr,
312 &args_attr_noatime.attr,
313 NULL
314};
315
316/*
317 * display counters from superblock
318 */
319
320struct counters_attr {
321 struct attribute attr;
322 ssize_t (*show)(struct gfs2_sbd *, char *);
323};
324
325#define COUNTERS_ATTR(name, fmt) \
326static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
327{ \
328 return sprintf(buf, fmt, (unsigned int)atomic_read(&sdp->sd_##name)); \
329} \
330static struct counters_attr counters_attr_##name = __ATTR_RO(name)
331
332COUNTERS_ATTR(glock_count, "%u\n");
333COUNTERS_ATTR(glock_held_count, "%u\n");
334COUNTERS_ATTR(inode_count, "%u\n");
335COUNTERS_ATTR(reclaimed, "%u\n");
336
337static struct attribute *counters_attrs[] = {
338 &counters_attr_glock_count.attr,
339 &counters_attr_glock_held_count.attr,
340 &counters_attr_inode_count.attr,
341 &counters_attr_reclaimed.attr,
342 NULL
343};
344
345/*
346 * get and set struct gfs2_tune fields
347 */
348
349static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
350{
351 return sprintf(buf, "%u %u\n", sdp->sd_tune.gt_quota_scale_num,
352 sdp->sd_tune.gt_quota_scale_den);
353}
354
355static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
356 size_t len)
357{
358 struct gfs2_tune *gt = &sdp->sd_tune;
359 unsigned int x, y;
360
361 if (!capable(CAP_SYS_ADMIN))
362 return -EACCES;
363
364 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
365 return -EINVAL;
366
367 spin_lock(&gt->gt_spin);
368 gt->gt_quota_scale_num = x;
369 gt->gt_quota_scale_den = y;
370 spin_unlock(&gt->gt_spin);
371 return len;
372}
373
374static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
375 int check_zero, const char *buf, size_t len)
376{
377 struct gfs2_tune *gt = &sdp->sd_tune;
378 unsigned int x;
379
380 if (!capable(CAP_SYS_ADMIN))
381 return -EACCES;
382
383 x = simple_strtoul(buf, NULL, 0);
384
385 if (check_zero && !x)
386 return -EINVAL;
387
388 spin_lock(&gt->gt_spin);
389 *field = x;
390 spin_unlock(&gt->gt_spin);
391 return len;
392}
393
394struct tune_attr {
395 struct attribute attr;
396 ssize_t (*show)(struct gfs2_sbd *, char *);
397 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
398};
399
400#define TUNE_ATTR_3(name, show, store) \
401static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
402
403#define TUNE_ATTR_2(name, store) \
404static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
405{ \
406 return sprintf(buf, "%u\n", sdp->sd_tune.gt_##name); \
407} \
408TUNE_ATTR_3(name, name##_show, store)
409
410#define TUNE_ATTR(name, check_zero) \
411static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
412{ \
413 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
414} \
415TUNE_ATTR_2(name, name##_store)
416
417#define TUNE_ATTR_DAEMON(name, process) \
418static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
419{ \
420 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
421 wake_up_process(sdp->sd_##process); \
422 return r; \
423} \
424TUNE_ATTR_2(name, name##_store)
425
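For reference, the expansion of a plain TUNE_ATTR is mechanical; TUNE_ATTR(demote_secs, 0) below becomes:

	static ssize_t demote_secs_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
	{
		return tune_set(sdp, &sdp->sd_tune.gt_demote_secs, 0, buf, len);
	}
	static ssize_t demote_secs_show(struct gfs2_sbd *sdp, char *buf)
	{
		return sprintf(buf, "%u\n", sdp->sd_tune.gt_demote_secs);
	}
	static struct tune_attr tune_attr_demote_secs =
		__ATTR(demote_secs, 0644, demote_secs_show, demote_secs_store);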
426TUNE_ATTR(ilimit, 0);
427TUNE_ATTR(ilimit_tries, 0);
428TUNE_ATTR(ilimit_min, 0);
429TUNE_ATTR(demote_secs, 0);
430TUNE_ATTR(incore_log_blocks, 0);
431TUNE_ATTR(log_flush_secs, 0);
432TUNE_ATTR(jindex_refresh_secs, 0);
433TUNE_ATTR(quota_warn_period, 0);
434TUNE_ATTR(quota_quantum, 0);
435TUNE_ATTR(atime_quantum, 0);
436TUNE_ATTR(max_readahead, 0);
437TUNE_ATTR(complain_secs, 0);
438TUNE_ATTR(reclaim_limit, 0);
439TUNE_ATTR(prefetch_secs, 0);
440TUNE_ATTR(statfs_slow, 0);
441TUNE_ATTR(new_files_jdata, 0);
442TUNE_ATTR(new_files_directio, 0);
443TUNE_ATTR(quota_simul_sync, 1);
444TUNE_ATTR(quota_cache_secs, 1);
445TUNE_ATTR(max_atomic_write, 1);
446TUNE_ATTR(stall_secs, 1);
447TUNE_ATTR(entries_per_readdir, 1);
448TUNE_ATTR(greedy_default, 1);
449TUNE_ATTR(greedy_quantum, 1);
450TUNE_ATTR(greedy_max, 1);
451TUNE_ATTR(statfs_quantum, 1);
452TUNE_ATTR_DAEMON(scand_secs, scand_process);
453TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
454TUNE_ATTR_DAEMON(logd_secs, logd_process);
455TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
456TUNE_ATTR_DAEMON(inoded_secs, inoded_process);
457TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
458
459static struct attribute *tune_attrs[] = {
460 &tune_attr_ilimit.attr,
461 &tune_attr_ilimit_tries.attr,
462 &tune_attr_ilimit_min.attr,
463 &tune_attr_demote_secs.attr,
464 &tune_attr_incore_log_blocks.attr,
465 &tune_attr_log_flush_secs.attr,
466 &tune_attr_jindex_refresh_secs.attr,
467 &tune_attr_quota_warn_period.attr,
468 &tune_attr_quota_quantum.attr,
469 &tune_attr_atime_quantum.attr,
470 &tune_attr_max_readahead.attr,
471 &tune_attr_complain_secs.attr,
472 &tune_attr_reclaim_limit.attr,
473 &tune_attr_prefetch_secs.attr,
474 &tune_attr_statfs_slow.attr,
475 &tune_attr_quota_simul_sync.attr,
476 &tune_attr_quota_cache_secs.attr,
477 &tune_attr_max_atomic_write.attr,
478 &tune_attr_stall_secs.attr,
479 &tune_attr_entries_per_readdir.attr,
480 &tune_attr_greedy_default.attr,
481 &tune_attr_greedy_quantum.attr,
482 &tune_attr_greedy_max.attr,
483 &tune_attr_statfs_quantum.attr,
484 &tune_attr_scand_secs.attr,
485 &tune_attr_recoverd_secs.attr,
486 &tune_attr_logd_secs.attr,
487 &tune_attr_quotad_secs.attr,
488 &tune_attr_inoded_secs.attr,
489 &tune_attr_quota_scale.attr,
490 &tune_attr_new_files_jdata.attr,
491 &tune_attr_new_files_directio.attr,
492 NULL
493};
494
495static struct attribute_group lockstruct_group = {
496 .name = "lockstruct",
497 .attrs = lockstruct_attrs
498};
499
500static struct attribute_group counters_group = {
501 .name = "counters",
502 .attrs = counters_attrs
503};
504
505static struct attribute_group args_group = {
506 .name = "args",
507 .attrs = args_attrs
508};
509
510static struct attribute_group tune_group = {
511 .name = "tune",
512 .attrs = tune_attrs
513};
514
515int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
516{
517 int error;
518
519 sdp->sd_kobj.kset = &gfs2_kset;
520 sdp->sd_kobj.ktype = &gfs2_ktype;
521
522 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
523 if (error)
524 goto fail;
525
526 error = kobject_register(&sdp->sd_kobj);
527 if (error)
528 goto fail;
529
530 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
531 if (error)
532 goto fail_reg;
533
534 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
535 if (error)
536 goto fail_lockstruct;
537
538 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
539 if (error)
540 goto fail_counters;
541
542 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
543 if (error)
544 goto fail_args;
545
546 return 0;
547
548 fail_args:
549 sysfs_remove_group(&sdp->sd_kobj, &args_group);
550 fail_counters:
551 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
552 fail_lockstruct:
553 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
554 fail_reg:
555 kobject_unregister(&sdp->sd_kobj);
556 fail:
557 return error;
558}
559
560void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
561{
562 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
563 sysfs_remove_group(&sdp->sd_kobj, &args_group);
564 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
565 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
566 kobject_unregister(&sdp->sd_kobj);
567}
568
569int gfs2_sys_init(void)
570{
571 gfs2_sys_margs = NULL;
572 spin_lock_init(&gfs2_sys_margs_lock);
573 return kset_register(&gfs2_kset);
574}
575
576void gfs2_sys_uninit(void)
577{
578 kfree(gfs2_sys_margs);
579 kset_unregister(&gfs2_kset);
580}
581
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..c46a700e801e
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13/* Allow args to be passed to GFS2 when using an initial ram disk */
14extern char *gfs2_sys_margs;
15extern spinlock_t gfs2_sys_margs_lock;
16
17int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
18void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
19
20int gfs2_sys_init(void);
21void gfs2_sys_uninit(void);
22
23#endif /* __SYS_DOT_H__ */
24
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..05e0b72d56ff
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "util.h"
27
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes)
30{
31 struct gfs2_trans *tr;
32 int error;
33
34 BUG_ON(current->journal_info);
35 BUG_ON(blocks == 0 && revokes == 0);
36
37 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
38 if (!tr)
39 return -ENOMEM;
40
41 tr->tr_ip = (unsigned long)__builtin_return_address(0);
42 tr->tr_blocks = blocks;
43 tr->tr_revokes = revokes;
44 tr->tr_reserved = 1;
45 if (blocks)
46 tr->tr_reserved += 6 + blocks;
47 if (revokes)
48 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
49 sizeof(uint64_t));
50 INIT_LIST_HEAD(&tr->tr_list_buf);
51
52 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
53
54 error = gfs2_glock_nq(&tr->tr_t_gh);
55 if (error)
56 goto fail_holder_uninit;
57
58 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
59 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
60 error = -EROFS;
61 goto fail_gunlock;
62 }
63
64 error = gfs2_log_reserve(sdp, tr->tr_reserved);
65 if (error)
66 goto fail_gunlock;
67
68 current->journal_info = tr;
69
70 return 0;
71
72fail_gunlock:
73 gfs2_glock_dq(&tr->tr_t_gh);
74
75fail_holder_uninit:
76 gfs2_holder_uninit(&tr->tr_t_gh);
77 kfree(tr);
78
79 return error;
80}
81
82void gfs2_trans_end(struct gfs2_sbd *sdp)
83{
84 struct gfs2_trans *tr = current->journal_info;
85
86 BUG_ON(!tr);
87 current->journal_info = NULL;
88
89 if (!tr->tr_touched) {
90 gfs2_log_release(sdp, tr->tr_reserved);
91 gfs2_glock_dq(&tr->tr_t_gh);
92 gfs2_holder_uninit(&tr->tr_t_gh);
93 kfree(tr);
94 return;
95 }
96
97 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
98 fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
99 tr->tr_num_buf, tr->tr_blocks);
100 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
101 }
102 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
103 fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
104 tr->tr_num_revoke, tr->tr_revokes);
105 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
106 }
107
108 gfs2_log_commit(sdp, tr);
109 gfs2_glock_dq(&tr->tr_t_gh);
110 gfs2_holder_uninit(&tr->tr_t_gh);
111 kfree(tr);
112
113 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
114 gfs2_log_flush(sdp, NULL);
115}
116
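Every piece of journaled work in this patch is bracketed by this begin/end pair; gfs2_statfs_sync() in super.c above is a live example. The canonical shape, using the RES_* reservation constants from trans.h:

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);   /* reserve log space */
	if (error)
		return error;

	gfs2_trans_add_bh(ip->i_gl, bh, 1);   /* journal this metadata buffer */
	/* ... modify bh->b_data ... */

	gfs2_trans_end(sdp);                  /* hand the transaction to the log */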
117void gfs2_trans_add_gl(struct gfs2_glock *gl)
118{
119 lops_add(gl->gl_sbd, &gl->gl_le);
120}
121
122/**
123 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
124 * @gl: the glock the buffer belongs to
125 * @bh: The buffer to add
126 * @meta: True in the case of adding metadata
127 *
128 */
129
130void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
131{
132 struct gfs2_sbd *sdp = gl->gl_sbd;
133 struct gfs2_bufdata *bd;
134
135 bd = bh->b_private;
136 if (bd)
137 gfs2_assert(sdp, bd->bd_gl == gl);
138 else {
139 gfs2_attach_bufdata(gl, bh, meta);
140 bd = bh->b_private;
141 }
142 lops_add(sdp, &bd->bd_le);
143}
144
145void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno)
146{
147 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
148 GFP_NOFS | __GFP_NOFAIL);
149 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
150 rv->rv_blkno = blkno;
151 lops_add(sdp, &rv->rv_le);
152}
153
154void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno)
155{
156 struct gfs2_revoke *rv;
157 int found = 0;
158
159 gfs2_log_lock(sdp);
160
161 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
162 if (rv->rv_blkno == blkno) {
163 list_del(&rv->rv_le.le_list);
164 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
165 sdp->sd_log_num_revoke--;
166 found = 1;
167 break;
168 }
169 }
170
171 gfs2_log_unlock(sdp);
172
173 if (found) {
174 struct gfs2_trans *tr = current->journal_info;
175 kfree(rv);
176 tr->tr_num_revoke_rm++;
177 }
178}
179
180void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
181{
182 lops_add(rgd->rd_sbd, &rgd->rd_le);
183}
184
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..60ef163dd9bb
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#define RES_DINODE 1
14#define RES_INDIRECT 1
15#define RES_JDATA 1
16#define RES_DATA 1
17#define RES_LEAF 1
18#define RES_RG_BIT 2
19#define RES_EATTR 1
20#define RES_UNLINKED 1
21#define RES_STATFS 1
22#define RES_QUOTA 2
23
24int gfs2_trans_begin(struct gfs2_sbd *sdp,
25 unsigned int blocks, unsigned int revokes);
26
27void gfs2_trans_end(struct gfs2_sbd *sdp);
28
29void gfs2_trans_add_gl(struct gfs2_glock *gl);
30void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
31void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno);
32void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno);
33void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
34
35#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/unlinked.c b/fs/gfs2/unlinked.c
new file mode 100644
index 000000000000..b92d73002055
--- /dev/null
+++ b/fs/gfs2/unlinked.c
@@ -0,0 +1,459 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/gfs2_ondisk.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "inode.h"
23#include "meta_io.h"
24#include "trans.h"
25#include "unlinked.h"
26#include "util.h"
27
28static int munge_ondisk(struct gfs2_sbd *sdp, unsigned int slot,
29 struct gfs2_unlinked_tag *ut)
30{
31 struct gfs2_inode *ip = sdp->sd_ut_inode->u.generic_ip;
32 unsigned int block, offset;
33 uint64_t dblock;
34 int new = 0;
35 struct buffer_head *bh;
36 int error;
37 int boundary;
38
39 block = slot / sdp->sd_ut_per_block;
40 offset = slot % sdp->sd_ut_per_block;
41
42 error = gfs2_block_map(ip->i_vnode, block, &new, &dblock, &boundary);
43 if (error)
44 return error;
45 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
46 if (error)
47 return error;
48 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_UT)) {
49 error = -EIO;
50 goto out;
51 }
52
53 mutex_lock(&sdp->sd_unlinked_mutex);
54 gfs2_trans_add_bh(ip->i_gl, bh, 1);
55 gfs2_unlinked_tag_out(ut, bh->b_data +
56 sizeof(struct gfs2_meta_header) +
57 offset * sizeof(struct gfs2_unlinked_tag));
58 mutex_unlock(&sdp->sd_unlinked_mutex);
59
60 out:
61 brelse(bh);
62
63 return error;
64}
65
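To illustrate the slot-to-disk mapping above (assuming 4096-byte blocks and the 24-byte on-disk sizes of gfs2_meta_header and gfs2_unlinked_tag):

	sd_ut_per_block = (4096 - 24) / 24 = 169 tags per block
	slot 500  ->  block = 500 / 169 = 2,  offset = 500 % 169 = 162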
66static void ul_hash(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
67{
68 spin_lock(&sdp->sd_unlinked_spin);
69 list_add(&ul->ul_list, &sdp->sd_unlinked_list);
70 gfs2_assert(sdp, ul->ul_count);
71 ul->ul_count++;
72 atomic_inc(&sdp->sd_unlinked_count);
73 spin_unlock(&sdp->sd_unlinked_spin);
74}
75
76static void ul_unhash(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
77{
78 spin_lock(&sdp->sd_unlinked_spin);
79 list_del_init(&ul->ul_list);
80 gfs2_assert(sdp, ul->ul_count > 1);
81 ul->ul_count--;
82 gfs2_assert_warn(sdp, atomic_read(&sdp->sd_unlinked_count) > 0);
83 atomic_dec(&sdp->sd_unlinked_count);
84 spin_unlock(&sdp->sd_unlinked_spin);
85}
86
87static struct gfs2_unlinked *ul_fish(struct gfs2_sbd *sdp)
88{
89 struct list_head *head;
90 struct gfs2_unlinked *ul;
91 int found = 0;
92
93 if (sdp->sd_vfs->s_flags & MS_RDONLY)
94 return NULL;
95
96 spin_lock(&sdp->sd_unlinked_spin);
97
98 head = &sdp->sd_unlinked_list;
99
100 list_for_each_entry(ul, head, ul_list) {
101 if (test_bit(ULF_LOCKED, &ul->ul_flags))
102 continue;
103
104 list_move_tail(&ul->ul_list, head);
105 ul->ul_count++;
106 set_bit(ULF_LOCKED, &ul->ul_flags);
107 found = 1;
108
109 break;
110 }
111
112 if (!found)
113 ul = NULL;
114
115 spin_unlock(&sdp->sd_unlinked_spin);
116
117 return ul;
118}
119
120/**
121 * enforce_limit - limit the number of inodes waiting to be deallocated
122 * @sdp: the filesystem
123 *
125 */
126
127static void enforce_limit(struct gfs2_sbd *sdp)
128{
129 unsigned int tries = 0, min = 0;
130 int error;
131
132 if (atomic_read(&sdp->sd_unlinked_count) <
133 gfs2_tune_get(sdp, gt_ilimit))
134 return;
135
136 tries = gfs2_tune_get(sdp, gt_ilimit_tries);
137 min = gfs2_tune_get(sdp, gt_ilimit_min);
138
139 while (tries--) {
140 struct gfs2_unlinked *ul = ul_fish(sdp);
141 if (!ul)
142 break;
143 error = gfs2_inode_dealloc(sdp, ul);
144 gfs2_unlinked_put(sdp, ul);
145
146 if (!error) {
147 if (!--min)
148 break;
149 } else if (error != 1)
150 break;
151 }
152}
153
154static struct gfs2_unlinked *ul_alloc(struct gfs2_sbd *sdp)
155{
156 struct gfs2_unlinked *ul;
157
158 ul = kzalloc(sizeof(struct gfs2_unlinked), GFP_KERNEL);
159 if (ul) {
160 INIT_LIST_HEAD(&ul->ul_list);
161 ul->ul_count = 1;
162 set_bit(ULF_LOCKED, &ul->ul_flags);
163 }
164
165 return ul;
166}
167
168int gfs2_unlinked_get(struct gfs2_sbd *sdp, struct gfs2_unlinked **ul)
169{
170 unsigned int c, o = 0, b;
171 unsigned char byte = 0;
172
173 enforce_limit(sdp);
174
175 *ul = ul_alloc(sdp);
176 if (!*ul)
177 return -ENOMEM;
178
179 spin_lock(&sdp->sd_unlinked_spin);
180
181 for (c = 0; c < sdp->sd_unlinked_chunks; c++)
182 for (o = 0; o < PAGE_SIZE; o++) {
183 byte = sdp->sd_unlinked_bitmap[c][o];
184 if (byte != 0xFF)
185 goto found;
186 }
187
188 goto fail;
189
190found:
191 for (b = 0; b < 8; b++)
192 if (!(byte & (1 << b)))
193 break;
194 (*ul)->ul_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
195
196 if ((*ul)->ul_slot >= sdp->sd_unlinked_slots)
197 goto fail;
198
199 sdp->sd_unlinked_bitmap[c][o] |= 1 << b;
200
201 spin_unlock(&sdp->sd_unlinked_spin);
202
203 return 0;
204
205fail:
206 spin_unlock(&sdp->sd_unlinked_spin);
207 kfree(*ul);
208 return -ENOSPC;
209}
210
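The bitmap arithmetic above packs one slot per bit into PAGE_SIZE-byte chunks, so with 4096-byte pages each chunk covers 8 * 4096 = 32768 slots:

	ul_slot = c * (8 * PAGE_SIZE) + o * 8 + b
	/* e.g. chunk c = 1, byte o = 904, bit b = 0  ->  32768 + 7232 + 0 = slot 40000 */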
211void gfs2_unlinked_put(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
212{
213 gfs2_assert_warn(sdp, test_and_clear_bit(ULF_LOCKED, &ul->ul_flags));
214
215 spin_lock(&sdp->sd_unlinked_spin);
216 gfs2_assert(sdp, ul->ul_count);
217 ul->ul_count--;
218 if (!ul->ul_count) {
219 gfs2_icbit_munge(sdp, sdp->sd_unlinked_bitmap, ul->ul_slot, 0);
220 spin_unlock(&sdp->sd_unlinked_spin);
221 kfree(ul);
222 } else
223 spin_unlock(&sdp->sd_unlinked_spin);
224}
225
226int gfs2_unlinked_ondisk_add(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
227{
228 int error;
229
230 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
231 gfs2_assert_warn(sdp, list_empty(&ul->ul_list));
232
233 error = munge_ondisk(sdp, ul->ul_slot, &ul->ul_ut);
234 if (!error)
235 ul_hash(sdp, ul);
236
237 return error;
238}
239
240int gfs2_unlinked_ondisk_munge(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
241{
242 int error;
243
244 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
245 gfs2_assert_warn(sdp, !list_empty(&ul->ul_list));
246
247 error = munge_ondisk(sdp, ul->ul_slot, &ul->ul_ut);
248
249 return error;
250}
251
252int gfs2_unlinked_ondisk_rm(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
253{
254 struct gfs2_unlinked_tag ut;
255 int error;
256
257 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
258 gfs2_assert_warn(sdp, !list_empty(&ul->ul_list));
259
260 memset(&ut, 0, sizeof(struct gfs2_unlinked_tag));
261
262 error = munge_ondisk(sdp, ul->ul_slot, &ut);
263 if (error)
264 return error;
265
266 ul_unhash(sdp, ul);
267
268 return 0;
269}
270
271/**
272 * gfs2_unlinked_dealloc - Go through the list of inodes to be deallocated
273 * @sdp: the filesystem
274 *
275 * Returns: errno
276 */
277
278int gfs2_unlinked_dealloc(struct gfs2_sbd *sdp)
279{
280 unsigned int hits, strikes;
281 int error;
282
283 for (;;) {
284 hits = 0;
285 strikes = 0;
286
287 for (;;) {
288 struct gfs2_unlinked *ul = ul_fish(sdp);
289 if (!ul)
290 return 0;
291 error = gfs2_inode_dealloc(sdp, ul);
292 gfs2_unlinked_put(sdp, ul);
293
294 if (!error) {
295 hits++;
296 if (strikes)
297 strikes--;
298 } else if (error == 1) {
299 strikes++;
300 if (strikes >=
301 atomic_read(&sdp->sd_unlinked_count)) {
302 error = 0;
303 break;
304 }
305 } else
306 return error;
307 }
308
309 if (!hits || kthread_should_stop())
310 break;
311
312 cond_resched();
313 }
314
315 return 0;
316}
317
318int gfs2_unlinked_init(struct gfs2_sbd *sdp)
319{
320 struct gfs2_inode *ip = sdp->sd_ut_inode->u.generic_ip;
321 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
322 unsigned int x, slot = 0;
323 unsigned int found = 0;
324 uint64_t dblock;
325 uint32_t extlen = 0;
326 int error;
327
328 if (!ip->i_di.di_size ||
329 ip->i_di.di_size > (64 << 20) ||
330 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
331 gfs2_consist_inode(ip);
332 return -EIO;
333 }
334 sdp->sd_unlinked_slots = blocks * sdp->sd_ut_per_block;
335 sdp->sd_unlinked_chunks = DIV_ROUND_UP(sdp->sd_unlinked_slots,
336 8 * PAGE_SIZE);
337
338 error = -ENOMEM;
339
340 sdp->sd_unlinked_bitmap = kcalloc(sdp->sd_unlinked_chunks,
341 sizeof(unsigned char *),
342 GFP_KERNEL);
343 if (!sdp->sd_unlinked_bitmap)
344 return error;
345
346 for (x = 0; x < sdp->sd_unlinked_chunks; x++) {
347 sdp->sd_unlinked_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
348 if (!sdp->sd_unlinked_bitmap[x])
349 goto fail;
350 }
351
352 for (x = 0; x < blocks; x++) {
353 struct buffer_head *bh;
354 unsigned int y;
355
356 if (!extlen) {
357 int new = 0;
358 error = gfs2_extent_map(ip->i_vnode, x, &new, &dblock, &extlen);
359 if (error)
360 goto fail;
361 }
362 gfs2_meta_ra(ip->i_gl, dblock, extlen);
363 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
364 &bh);
365 if (error)
366 goto fail;
367 error = -EIO;
368 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_UT)) {
369 brelse(bh);
370 goto fail;
371 }
372
373 for (y = 0;
374 y < sdp->sd_ut_per_block && slot < sdp->sd_unlinked_slots;
375 y++, slot++) {
376 struct gfs2_unlinked_tag ut;
377 struct gfs2_unlinked *ul;
378
379 gfs2_unlinked_tag_in(&ut, bh->b_data +
380 sizeof(struct gfs2_meta_header) +
381 y * sizeof(struct gfs2_unlinked_tag));
382 if (!ut.ut_inum.no_addr)
383 continue;
384
385 error = -ENOMEM;
386 ul = ul_alloc(sdp);
387 if (!ul) {
388 brelse(bh);
389 goto fail;
390 }
391 ul->ul_ut = ut;
392 ul->ul_slot = slot;
393
394 spin_lock(&sdp->sd_unlinked_spin);
395 gfs2_icbit_munge(sdp, sdp->sd_unlinked_bitmap, slot, 1);
396 spin_unlock(&sdp->sd_unlinked_spin);
397 ul_hash(sdp, ul);
398
399 gfs2_unlinked_put(sdp, ul);
400 found++;
401 }
402
403 brelse(bh);
404 dblock++;
405 extlen--;
406 }
407
408 if (found)
409 fs_info(sdp, "found %u unlinked inodes\n", found);
410
411 return 0;
412
413fail:
414 gfs2_unlinked_cleanup(sdp);
415 return error;
416}
417
418/**
419 * gfs2_unlinked_cleanup - get rid of any extra struct gfs2_unlinked structures
420 * @sdp: the filesystem
421 *
422 */
423
424void gfs2_unlinked_cleanup(struct gfs2_sbd *sdp)
425{
426 struct list_head *head = &sdp->sd_unlinked_list;
427 struct gfs2_unlinked *ul;
428 unsigned int x;
429
430 spin_lock(&sdp->sd_unlinked_spin);
431 while (!list_empty(head)) {
432 ul = list_entry(head->next, struct gfs2_unlinked, ul_list);
433
434 if (ul->ul_count > 1) {
435 list_move_tail(&ul->ul_list, head);
436 spin_unlock(&sdp->sd_unlinked_spin);
437 schedule();
438 spin_lock(&sdp->sd_unlinked_spin);
439 continue;
440 }
441
442 list_del_init(&ul->ul_list);
443 atomic_dec(&sdp->sd_unlinked_count);
444
445 gfs2_assert_warn(sdp, ul->ul_count == 1);
446 gfs2_assert_warn(sdp, !test_bit(ULF_LOCKED, &ul->ul_flags));
447 kfree(ul);
448 }
449 spin_unlock(&sdp->sd_unlinked_spin);
450
451 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_unlinked_count));
452
453 if (sdp->sd_unlinked_bitmap) {
454 for (x = 0; x < sdp->sd_unlinked_chunks; x++)
455 kfree(sdp->sd_unlinked_bitmap[x]);
456 kfree(sdp->sd_unlinked_bitmap);
457 }
458}
459
diff --git a/fs/gfs2/unlinked.h b/fs/gfs2/unlinked.h
new file mode 100644
index 000000000000..159cf5ffe47e
--- /dev/null
+++ b/fs/gfs2/unlinked.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UNLINKED_DOT_H__
11#define __UNLINKED_DOT_H__
12
13int gfs2_unlinked_get(struct gfs2_sbd *sdp, struct gfs2_unlinked **ul);
14void gfs2_unlinked_put(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
15
16int gfs2_unlinked_ondisk_add(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
17int gfs2_unlinked_ondisk_munge(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
18int gfs2_unlinked_ondisk_rm(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
19
20int gfs2_unlinked_dealloc(struct gfs2_sbd *sdp);
21
22int gfs2_unlinked_init(struct gfs2_sbd *sdp);
23void gfs2_unlinked_cleanup(struct gfs2_sbd *sdp);
24
25#endif /* __UNLINKED_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..88974e9824f7
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "util.h"
25
26kmem_cache_t *gfs2_glock_cachep __read_mostly;
27kmem_cache_t *gfs2_inode_cachep __read_mostly;
28kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
29
30void gfs2_assert_i(struct gfs2_sbd *sdp)
31{
32 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
33 sdp->sd_fsname);
34}
35
36/**
37 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
38 * Returns: -1 if this call withdrew the machine,
39 * -2 if it was already withdrawn
40 */
41
42int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
43 const char *function, char *file, unsigned int line)
44{
45 int me;
46 me = gfs2_lm_withdraw(sdp,
47 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
48 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
49 sdp->sd_fsname, assertion,
50 sdp->sd_fsname, function, file, line);
51 dump_stack();
52 return (me) ? -1 : -2;
53}
54
55/**
56 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
57 * Returns: -1 if we printed something
58 * -2 if we didn't
59 */
60
61int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
62 const char *function, char *file, unsigned int line)
63{
64 if (time_before(jiffies,
65 sdp->sd_last_warning +
66 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
67 return -2;
68
69 printk(KERN_WARNING
70 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
71 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
72 sdp->sd_fsname, assertion,
73 sdp->sd_fsname, function, file, line);
74
75 if (sdp->sd_args.ar_debug)
76 BUG();
77 else
78 dump_stack();
79
80 sdp->sd_last_warning = jiffies;
81
82 return -1;
83}
84
85/**
86 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
87 * Returns: -1 if this call withdrew the machine,
88 * 0 if it was already withdrawn
89 */
90
91int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
92 char *file, unsigned int line)
93{
94 int rv;
95 rv = gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
97 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
98 sdp->sd_fsname,
99 sdp->sd_fsname, function, file, line);
100 return rv;
101}
102
103/**
104 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
105 * Returns: -1 if this call withdrew the machine,
106 * 0 if it was already withdrawn
107 */
108
109int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
110 const char *function, char *file, unsigned int line)
111{
112 struct gfs2_sbd *sdp = ip->i_sbd;
113 int rv;
114 rv = gfs2_lm_withdraw(sdp,
115 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
116 "GFS2: fsid=%s: inode = %llu %llu\n"
117 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
118 sdp->sd_fsname,
119 sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
120 (unsigned long long)ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 uint16_t type, uint16_t t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
245
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..8216d28bd816
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,169 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13
14#define fs_printk(level, fs, fmt, arg...) \
15 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
16
17#define fs_info(fs, fmt, arg...) \
18 fs_printk(KERN_INFO , fs , fmt , ## arg)
19
20#define fs_warn(fs, fmt, arg...) \
21 fs_printk(KERN_WARNING , fs , fmt , ## arg)
22
23#define fs_err(fs, fmt, arg...) \
24 fs_printk(KERN_ERR, fs , fmt , ## arg)
25
26
27void gfs2_assert_i(struct gfs2_sbd *sdp);
28
29#define gfs2_assert(sdp, assertion) \
30do { \
31 if (unlikely(!(assertion))) { \
32 gfs2_assert_i(sdp); \
33 BUG(); \
34 } \
35} while (0)
36
37
38int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
39 const char *function, char *file, unsigned int line);
40
41#define gfs2_assert_withdraw(sdp, assertion) \
42((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
43 __FUNCTION__, __FILE__, __LINE__))
44
45
46int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
47 const char *function, char *file, unsigned int line);
48
49#define gfs2_assert_warn(sdp, assertion) \
50((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
51 __FUNCTION__, __FILE__, __LINE__))
52
53
54int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
55 const char *function, char *file, unsigned int line);
56
57#define gfs2_consist(sdp) \
58gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
59
60
61int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
62 const char *function, char *file, unsigned int line);
63
64#define gfs2_consist_inode(ip) \
65gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
66
67
68int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
69 const char *function, char *file, unsigned int line);
70
71#define gfs2_consist_rgrpd(rgd) \
72gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
73
74
75int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
76 const char *type, const char *function,
77 char *file, unsigned int line);
78
79static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
80 struct buffer_head *bh,
81 const char *function,
82 char *file, unsigned int line)
83{
84 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
85 uint32_t magic = mh->mh_magic;
86 magic = be32_to_cpu(magic);
87 if (unlikely(magic != GFS2_MAGIC))
88 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
89 file, line);
90 return 0;
91}
92
93#define gfs2_meta_check(sdp, bh) \
94gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
95
96
97int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
98 uint16_t type, uint16_t t,
99 const char *function,
100 char *file, unsigned int line);
101
102static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
103 struct buffer_head *bh,
104 uint16_t type,
105 const char *function,
106 char *file, unsigned int line)
107{
108 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
109 uint32_t magic = mh->mh_magic;
110 uint16_t t = be32_to_cpu(mh->mh_type);
111 magic = be32_to_cpu(magic);
112 if (unlikely(magic != GFS2_MAGIC))
113 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
114 file, line);
115 if (unlikely(t != type))
116 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
117 file, line);
118 return 0;
119}
120
121#define gfs2_metatype_check(sdp, bh, type) \
122gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
123
124static inline void gfs2_metatype_set(struct buffer_head *bh, uint16_t type,
125 uint16_t format)
126{
127 struct gfs2_meta_header *mh;
128 mh = (struct gfs2_meta_header *)bh->b_data;
129 mh->mh_type = cpu_to_be32(type);
130 mh->mh_format = cpu_to_be32(format);
131}
132
133
134int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
135 char *file, unsigned int line);
136
137#define gfs2_io_error(sdp) \
138gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)
139
140
141int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
142 const char *function, char *file, unsigned int line);
143
144#define gfs2_io_error_bh(sdp, bh) \
145gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
146
147
148extern kmem_cache_t *gfs2_glock_cachep;
149extern kmem_cache_t *gfs2_inode_cachep;
150extern kmem_cache_t *gfs2_bufdata_cachep;
151
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p)
154{
155 unsigned int x;
156 spin_lock(&gt->gt_spin);
157 x = *p;
158 spin_unlock(&gt->gt_spin);
159 return x;
160}
161
162#define gfs2_tune_get(sdp, field) \
163gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
164
165void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
166 unsigned int bit, int new_value);
167
168#endif /* __UTIL_DOT_H__ */
169
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644
index 000000000000..1b1dcb9a40bb
--- /dev/null
+++ b/include/linux/dlm.h
@@ -0,0 +1,302 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_DOT_H__
15#define __DLM_DOT_H__
16
17/*
18 * Interface to Distributed Lock Manager (DLM)
19 * routines and structures to use DLM lockspaces
20 */
21
22/*
23 * Lock Modes
24 */
25
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37
38#define DLM_RESNAME_MAXLEN 64
39
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been effected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
88 * DLM_LKF_NODLCKWT
89 * DLM_LKF_NODLCKBLK
90 *
91 * not yet implemented
92 *
93 * DLM_LKF_EXPEDITE
94 *
95 * Used only with new requests for NL mode locks. Tells the lock manager
96 * to grant the lock, ignoring other locks in convert and wait queues.
97 *
98 * DLM_LKF_NOQUEUEBAST
99 *
100 * Send blocking ASTs before returning -EAGAIN to the caller. It is only
101 * used along with the NOQUEUE flag. Blocking ASTs are otherwise not sent
102 * for failed NOQUEUE requests.
103 *
104 * DLM_LKF_HEADQUE
105 *
106 * Add a lock to the head of the convert or wait queue rather than the tail.
107 *
108 * DLM_LKF_NOORDER
109 *
110 * Disregard the standard grant order rules and grant a lock as soon as it
111 * is compatible with other granted locks.
112 *
113 * DLM_LKF_ORPHAN
114 *
115 * not yet implemented
116 *
117 * DLM_LKF_ALTPR
118 *
119 * If the requested mode cannot be granted immediately, try to grant the lock
120 * in PR mode instead. If this alternate mode is granted instead of the
121 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
122 *
123 * DLM_LKF_ALTCW
124 *
125 * The same as ALTPR, but the alternate mode is CW.
126 *
127 * DLM_LKF_FORCEUNLOCK
128 *
129 * Unlock the lock even if it is converting or waiting or has sublocks.
130 * Only really for use by the userland device.c code.
131 *
132 */
133
134#define DLM_LKF_NOQUEUE 0x00000001
135#define DLM_LKF_CANCEL 0x00000002
136#define DLM_LKF_CONVERT 0x00000004
137#define DLM_LKF_VALBLK 0x00000008
138#define DLM_LKF_QUECVT 0x00000010
139#define DLM_LKF_IVVALBLK 0x00000020
140#define DLM_LKF_CONVDEADLK 0x00000040
141#define DLM_LKF_PERSISTENT 0x00000080
142#define DLM_LKF_NODLCKWT 0x00000100
143#define DLM_LKF_NODLCKBLK 0x00000200
144#define DLM_LKF_EXPEDITE 0x00000400
145#define DLM_LKF_NOQUEUEBAST 0x00000800
146#define DLM_LKF_HEADQUE 0x00001000
147#define DLM_LKF_NOORDER 0x00002000
148#define DLM_LKF_ORPHAN 0x00004000
149#define DLM_LKF_ALTPR 0x00008000
150#define DLM_LKF_ALTCW 0x00010000
151#define DLM_LKF_FORCEUNLOCK 0x00020000
152
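A minimal sketch of how these flags compose (illustrative only, not part of this patch; the lockspace, lksb and callbacks are assumed to exist elsewhere): a non-blocking EX request that also reads back the LVB.

	static int try_ex_lock(dlm_lockspace_t *ls, struct dlm_lksb *lksb,
			       void (*ast)(void *), void (*bast)(void *, int))
	{
		/* NOQUEUE turns contention into -EAGAIN instead of a wait;
		   VALBLK returns the resource's LVB via lksb->sb_lvbptr */
		return dlm_lock(ls, DLM_LOCK_EX, lksb,
				DLM_LKF_NOQUEUE | DLM_LKF_VALBLK,
				"example", 7, 0, ast, lksb, bast);
	}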
153/*
154 * Some return codes that are not in errno.h
155 */
156
157#define DLM_ECANCEL 0x10001
158#define DLM_EUNLOCK 0x10002
159
160typedef void dlm_lockspace_t;
161
162/*
163 * Lock status block
164 *
165 * Use this structure to specify the contents of the lock value block. For a
166 * conversion request, this structure is used to specify the lock ID of the
167 * lock. DLM writes the status of the lock request and the lock ID assigned
168 * to the request in the lock status block.
169 *
170 * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
171 * It is available when dlm_lock returns.
172 *
173 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
174 * shown for the DLM_LKF_VALBLK flag.
175 *
176 * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
177 * it was first demoted to NL to avoid conversion deadlock.
178 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
179 *
180 * sb_status: the returned status of the lock request set prior to AST
181 * execution. Possible return values:
182 *
183 * 0 if lock request was successful
184 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
185 * -ENOMEM if there is no memory to process request
186 * -EINVAL if there are invalid parameters
187 * -DLM_EUNLOCK if unlock request was successful
188 * -DLM_ECANCEL if a cancel completed successfully
189 */
190
191#define DLM_SBF_DEMOTED 0x01
192#define DLM_SBF_VALNOTVALID 0x02
193#define DLM_SBF_ALTMODE 0x04
194
195struct dlm_lksb {
196 int sb_status;
197 uint32_t sb_lkid;
198 char sb_flags;
199 char * sb_lvbptr;
200};
201
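A hedged sketch of a completion AST dispatching on sb_status (passing the lksb itself as astarg is a convention of this example, not a requirement of the API):

	static void example_ast(void *astarg)
	{
		struct dlm_lksb *lksb = astarg;

		if (lksb->sb_status == 0) {
			/* granted; DEMOTED means the grant went via NL */
			if (lksb->sb_flags & DLM_SBF_DEMOTED)
				printk(KERN_INFO "lkid %x was demoted\n",
				       lksb->sb_lkid);
		} else if (lksb->sb_status == -EAGAIN) {
			/* NOQUEUE request could not be granted */
		} else if (lksb->sb_status == -DLM_EUNLOCK) {
			/* a dlm_unlock on this lock completed */
		}
	}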
202
203#ifdef __KERNEL__
204
205#define DLM_LSFL_NODIR 0x00000001
206
207/*
208 * dlm_new_lockspace
209 *
210 * Starts a lockspace with the given name. If the named lockspace exists in
211 * the cluster, the calling node joins it.
212 */
213
214int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
215 uint32_t flags, int lvblen);
216
217/*
218 * dlm_release_lockspace
219 *
220 * Stop a lockspace.
221 */
222
223int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
224
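A join/leave sketch (illustrative; the lockspace name and the 32-byte LVB length are arbitrary choices here):

	static dlm_lockspace_t *example_join(void)
	{
		dlm_lockspace_t *ls;

		/* joins "example", creating it cluster-wide if needed */
		if (dlm_new_lockspace("example", 7, &ls, 0, 32))
			return NULL;
		return ls;
	}

	/* later: dlm_release_lockspace(ls, 0) leaves the lockspace; a
	   nonzero force argument tears it down even with locks held */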
225/*
226 * dlm_lock
227 *
228 * Make an asynchronous request to acquire or convert a lock on a named
229 * resource.
230 *
231 * lockspace: context for the request
232 * mode: the requested mode of the lock (DLM_LOCK_)
233 * lksb: lock status block for input and async return values
234 * flags: input flags (DLM_LKF_)
235 * name: name of the resource to lock, can be binary
236 * namelen: the length in bytes of the resource name (up to DLM_RESNAME_MAXLEN)
237 * parent: the lock ID of a parent lock or 0 if none
238 * lockast: function DLM executes when it completes processing the request
239 * astarg: argument passed to lockast and bast functions
240 * bast: function DLM executes when this lock later blocks another request
241 *
242 * Returns:
243 * 0 if request is successfully queued for processing
244 * -EINVAL if any input parameters are invalid
245 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
246 * -ENOMEM if there is no memory to process request
247 * -ENOTCONN if there is a communication error
248 *
249 * If the call to dlm_lock returns an error then the operation has failed and
250 * the AST routine will not be called. If dlm_lock returns 0 it is still
251 * possible that the lock operation will fail. The AST routine will be called
252 * when the locking is complete and the status is returned in the lksb.
253 *
254 * If AST routines or an astarg are passed to a conversion operation, they
255 * overwrite the values that were passed to the previous dlm_lock call for
256 * this lock.
257 *
258 * AST routines should not block (at least not for long), but may make
259 * any locking calls they please.
260 */
261
262int dlm_lock(dlm_lockspace_t *lockspace,
263 int mode,
264 struct dlm_lksb *lksb,
265 uint32_t flags,
266 void *name,
267 unsigned int namelen,
268 uint32_t parent_lkid,
269 void (*lockast) (void *astarg),
270 void *astarg,
271 void (*bast) (void *astarg, int mode));
272
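Since dlm_lock only queues the request, callers needing synchronous behaviour typically block on a completion signalled from the AST. A sketch under that assumption (the helper and resource name are invented; <linux/completion.h> is assumed):

	static struct completion example_granted;

	static void example_sync_ast(void *astarg)
	{
		complete(&example_granted);
	}

	static int example_lock_sync(dlm_lockspace_t *ls,
				     struct dlm_lksb *lksb, int mode)
	{
		int error;

		init_completion(&example_granted);
		error = dlm_lock(ls, mode, lksb, 0, "res1", 4, 0,
				 example_sync_ast, lksb, NULL);
		if (error)
			return error;	/* never queued; AST will not run */
		wait_for_completion(&example_granted);
		return lksb->sb_status;	/* set before the AST ran */
	}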
273/*
274 * dlm_unlock
275 *
276 * Asynchronously release a lock on a resource. The AST routine is called
277 * when the resource is successfully unlocked.
278 *
279 * lockspace: context for the request
280 * lkid: the lock ID as returned in the lksb
281 * flags: input flags (DLM_LKF_)
282 * lksb: if NULL, the lksb parameter passed to the last lock request is used
283 * astarg: the arg used with the completion ast for the unlock
284 *
285 * Returns:
286 * 0 if request is successfully queued for processing
287 * -EINVAL if any input parameters are invalid
288 * -ENOTEMPTY if the lock still has sublocks
289 * -EBUSY if the lock is waiting for a remote lock operation
290 * -ENOTCONN if there is a communication error
291 */
292
293int dlm_unlock(dlm_lockspace_t *lockspace,
294 uint32_t lkid,
295 uint32_t flags,
296 struct dlm_lksb *lksb,
297 void *astarg);
298
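The release side, sketched to match the example above (again reusing the lksb as astarg); passing DLM_LKF_CANCEL instead of 0 aborts a request still sitting on a queue:

	static int example_unlock(dlm_lockspace_t *ls, struct dlm_lksb *lksb)
	{
		/* on success the completion AST runs with
		   lksb->sb_status == -DLM_EUNLOCK */
		return dlm_unlock(ls, lksb->sb_lkid, 0, lksb, lksb);
	}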
299#endif /* __KERNEL__ */
300
301#endif /* __DLM_DOT_H__ */
302
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644
index 000000000000..f8ba1981aa96
--- /dev/null
+++ b/include/linux/dlm_device.h
@@ -0,0 +1,83 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/* This is the device interface for dlm, most users will use a library
15 * interface.
16 */
17
18#define DLM_USER_LVB_LEN 32
19
20/* Version of the device interface */
21#define DLM_DEVICE_VERSION_MAJOR 4
22#define DLM_DEVICE_VERSION_MINOR 0
23#define DLM_DEVICE_VERSION_PATCH 0
24
25/* struct passed to the lock write */
26struct dlm_lock_params {
27 __u8 mode;
28 __u16 flags;
29 __u32 lkid;
30 __u32 parent;
31 __u8 namelen;
32 void __user *castparam;
33 void __user *castaddr;
34 void __user *bastparam;
35 void __user *bastaddr;
36 struct dlm_lksb __user *lksb;
37 char lvb[DLM_USER_LVB_LEN];
38 char name[1];
39};
40
41struct dlm_lspace_params {
42 __u32 flags;
43 __u32 minor;
44 char name[1];
45};
46
47struct dlm_write_request {
48 __u32 version[3];
49 __u8 cmd;
50
51 union {
52 struct dlm_lock_params lock;
53 struct dlm_lspace_params lspace;
54 } i;
55};
56
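To make the framing concrete, userspace (normally libdlm rather than applications) fills a dlm_write_request and write()s it to the control device. An illustrative sketch only: the mode choice, resource name, and error handling are assumptions, and it presumes sanitized headers where __user expands to nothing:

	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/dlm.h>
	#include <linux/dlm_device.h>

	static int send_lock_request(int fd, struct dlm_lksb *lksb)
	{
		/* name[1] already supplies one byte, so add namelen - 1 */
		size_t len = sizeof(struct dlm_write_request) + 4 - 1;
		struct dlm_write_request *req = calloc(1, len);
		int rv = -1;

		if (!req)
			return -1;
		req->version[0] = DLM_DEVICE_VERSION_MAJOR;
		req->version[1] = DLM_DEVICE_VERSION_MINOR;
		req->version[2] = DLM_DEVICE_VERSION_PATCH;
		req->cmd = DLM_USER_LOCK;
		req->i.lock.mode = DLM_LOCK_PR;
		req->i.lock.namelen = 4;
		req->i.lock.lksb = lksb;
		memcpy(req->i.lock.name, "res1", 4);
		if (write(fd, req, len) == (ssize_t)len)
			rv = 0;
		free(req);
		return rv;
	}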
57/* Struct read from the "device" fd; it consists mainly of
58 userspace pointers for the library to use. */
59struct dlm_lock_result {
60 __u32 length;
61 void __user * user_astaddr;
62 void __user * user_astparam;
63 struct dlm_lksb __user * user_lksb;
64 struct dlm_lksb lksb;
65 __u8 bast_mode;
66 /* Offsets may be zero if no data is present */
67 __u32 lvb_offset;
68};
69
70/* Commands passed to the device */
71#define DLM_USER_LOCK 1
72#define DLM_USER_UNLOCK 2
73#define DLM_USER_QUERY 3
74#define DLM_USER_CREATE_LOCKSPACE 4
75#define DLM_USER_REMOVE_LOCKSPACE 5
76
77/* Arbitrary length restriction */
78#define MAX_LS_NAME_LEN 64
79
80/* Lockspace flags */
81#define DLM_USER_LSFLG_AUTOFREE 1
82#define DLM_USER_LSFLG_FORCEFREE 2
83
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f813bc8266aa..39e3d117e3d9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1330,6 +1330,9 @@ extern struct subsystem fs_subsys;
1330#define FLOCK_VERIFY_READ 1
1331#define FLOCK_VERIFY_WRITE 2
1332
1333/* /sys/fs */
1334extern struct subsystem fs_subsys;
1335
1336extern int locks_mandatory_locked(struct inode *);
1337extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
1338
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
new file mode 100644
index 000000000000..3893aac4e3ae
--- /dev/null
+++ b/include/linux/gfs2_ondisk.h
@@ -0,0 +1,459 @@
1/*
2* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4*
5* This copyrighted material is made available to anyone wishing to use,
6* modify, copy, or redistribute it subject to the terms and conditions
7* of the GNU General Public License v.2.
8*/
9
10#ifndef __GFS2_ONDISK_DOT_H__
11#define __GFS2_ONDISK_DOT_H__
12
13#define GFS2_MAGIC 0x01161970
14#define GFS2_BASIC_BLOCK 512
15#define GFS2_BASIC_BLOCK_SHIFT 9
16
17/* Lock numbers of the LM_TYPE_NONDISK type */
18
19#define GFS2_MOUNT_LOCK 0
20#define GFS2_LIVE_LOCK 1
21#define GFS2_TRANS_LOCK 2
22#define GFS2_RENAME_LOCK 3
23
24/* Format numbers for various metadata types */
25
26#define GFS2_FORMAT_NONE 0
27#define GFS2_FORMAT_SB 100
28#define GFS2_FORMAT_RG 200
29#define GFS2_FORMAT_RB 300
30#define GFS2_FORMAT_DI 400
31#define GFS2_FORMAT_IN 500
32#define GFS2_FORMAT_LF 600
33#define GFS2_FORMAT_JD 700
34#define GFS2_FORMAT_LH 800
35#define GFS2_FORMAT_LD 900
36#define GFS2_FORMAT_LB 1000
37#define GFS2_FORMAT_EA 1600
38#define GFS2_FORMAT_ED 1700
39#define GFS2_FORMAT_UT 1300
40#define GFS2_FORMAT_QC 1400
41/* These are format numbers for entities contained in files */
42#define GFS2_FORMAT_RI 1100
43#define GFS2_FORMAT_DE 1200
44#define GFS2_FORMAT_QU 1500
45/* These are part of the superblock */
46#define GFS2_FORMAT_FS 1801
47#define GFS2_FORMAT_MULTI 1900
48
49/*
50 * An on-disk inode number
51 */
52
53struct gfs2_inum {
54 __be64 no_formal_ino;
55 __be64 no_addr;
56};
57
58static inline int gfs2_inum_equal(const struct gfs2_inum *ino1,
59 const struct gfs2_inum *ino2)
60{
61 return ino1->no_formal_ino == ino2->no_formal_ino &&
62 ino1->no_addr == ino2->no_addr;
63}
64
65/*
66 * Generic metadata head structure
67 * Every inplace buffer logged in the journal must start with this.
68 */
69
70#define GFS2_METATYPE_NONE 0
71#define GFS2_METATYPE_SB 1
72#define GFS2_METATYPE_RG 2
73#define GFS2_METATYPE_RB 3
74#define GFS2_METATYPE_DI 4
75#define GFS2_METATYPE_IN 5
76#define GFS2_METATYPE_LF 6
77#define GFS2_METATYPE_JD 7
78#define GFS2_METATYPE_LH 8
79#define GFS2_METATYPE_LD 9
80#define GFS2_METATYPE_LB 12
81#define GFS2_METATYPE_EA 10
82#define GFS2_METATYPE_ED 11
83#define GFS2_METATYPE_UT 13
84#define GFS2_METATYPE_QC 14
85
86struct gfs2_meta_header {
87 __be32 mh_magic;
88 __be32 mh_type;
89 __be64 __pad0; /* Was generation number in gfs1 */
90 __be32 mh_format;
91 __be32 __pad1; /* Was incarnation number in gfs1 */
92};
93
94/*
95 * super-block structure
96 *
97 * It's probably good if SIZEOF_SB <= GFS2_BASIC_BLOCK (512 bytes)
98 *
99 * Order is important, need to be able to read old superblocks to do on-disk
100 * version upgrades.
101 */
102
103/* Address of superblock in GFS2 basic blocks */
104#define GFS2_SB_ADDR 128
105
106/* The lock number for the superblock (must be zero) */
107#define GFS2_SB_LOCK 0
108
109/* Requirement: GFS2_LOCKNAME_LEN % 8 == 0
110 Includes: the fencing zero at the end */
111#define GFS2_LOCKNAME_LEN 64
112
113struct gfs2_sb {
114 struct gfs2_meta_header sb_header;
115
116 __be32 sb_fs_format;
117 __be32 sb_multihost_format;
118 __u32 __pad0; /* Was superblock flags in gfs1 */
119
120 __be32 sb_bsize;
121 __be32 sb_bsize_shift;
122 __u32 __pad1; /* Was journal segment size in gfs1 */
123
124 struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
125 struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
126 struct gfs2_inum sb_root_dir;
127
128 char sb_lockproto[GFS2_LOCKNAME_LEN];
129 char sb_locktable[GFS2_LOCKNAME_LEN];
130 /* In gfs1, quota and license dinodes followed */
131};
132
133/*
134 * resource index structure
135 */
136
137struct gfs2_rindex {
138 __be64 ri_addr; /* grp block disk address */
139 __be32 ri_length; /* length of rgrp header in fs blocks */
140 __u32 __pad;
141
142 __be64 ri_data0; /* first data location */
143 __be32 ri_data; /* num of data blocks in rgrp */
144
145 __be32 ri_bitbytes; /* number of bytes in data bitmaps */
146
147 __u8 ri_reserved[64];
148};
149
150/*
151 * resource group header structure
152 */
153
154/* Number of blocks per byte in rgrp */
155#define GFS2_NBBY 4
156#define GFS2_BIT_SIZE 2
157#define GFS2_BIT_MASK 0x00000003
158
159#define GFS2_BLKST_FREE 0
160#define GFS2_BLKST_USED 1
161#define GFS2_BLKST_INVALID 2
162#define GFS2_BLKST_DINODE 3
163
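Each bitmap byte packs GFS2_NBBY block states of GFS2_BIT_SIZE bits each; a sketch of decoding one state (the helper name is hypothetical, the arithmetic follows the constants above):

	static inline unsigned char example_blkst(const unsigned char *bitmap,
						  unsigned int block)
	{
		unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;

		/* yields one of the GFS2_BLKST_* values */
		return (bitmap[block / GFS2_NBBY] >> bit) & GFS2_BIT_MASK;
	}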
164#define GFS2_RGF_JOURNAL 0x00000001
165#define GFS2_RGF_METAONLY 0x00000002
166#define GFS2_RGF_DATAONLY 0x00000004
167#define GFS2_RGF_NOALLOC 0x00000008
168
169struct gfs2_rgrp {
170 struct gfs2_meta_header rg_header;
171
172 __be32 rg_flags;
173 __be32 rg_free;
174 __be32 rg_dinodes;
175
176 __u8 rg_reserved[92]; /* Several fields from gfs1 now reserved */
177};
178
179/*
180 * quota structure
181 */
182
183struct gfs2_quota {
184 __be64 qu_limit;
185 __be64 qu_warn;
186 __be64 qu_value;
187 __u8 qu_reserved[64];
188};
189
190/*
191 * dinode structure
192 */
193
194#define GFS2_MAX_META_HEIGHT 10
195#define GFS2_DIR_MAX_DEPTH 17
196
197#define DT2IF(dt) (((dt) << 12) & S_IFMT)
198#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
199
200enum {
201 gfs2fl_Jdata = 0,
202 gfs2fl_ExHash = 1,
203 gfs2fl_Unused = 2,
204 gfs2fl_EaIndirect = 3,
205 gfs2fl_Directio = 4,
206 gfs2fl_Immutable = 5,
207 gfs2fl_AppendOnly = 6,
208 gfs2fl_NoAtime = 7,
209 gfs2fl_Sync = 8,
210 gfs2fl_System = 9,
211 gfs2fl_TruncInProg = 29,
212 gfs2fl_InheritDirectio = 30,
213 gfs2fl_InheritJdata = 31,
214};
215
216/* Dinode flags */
217#define GFS2_DIF_JDATA 0x00000001
218#define GFS2_DIF_EXHASH 0x00000002
219#define GFS2_DIF_UNUSED 0x00000004 /* only in gfs1 */
220#define GFS2_DIF_EA_INDIRECT 0x00000008
221#define GFS2_DIF_DIRECTIO 0x00000010
222#define GFS2_DIF_IMMUTABLE 0x00000020
223#define GFS2_DIF_APPENDONLY 0x00000040
224#define GFS2_DIF_NOATIME 0x00000080
225#define GFS2_DIF_SYNC 0x00000100
226#define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */
227#define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */
228#define GFS2_DIF_INHERIT_DIRECTIO 0x40000000
229#define GFS2_DIF_INHERIT_JDATA 0x80000000
230
231struct gfs2_dinode {
232 struct gfs2_meta_header di_header;
233
234 struct gfs2_inum di_num;
235
236 __be32 di_mode; /* mode of file */
237 __be32 di_uid; /* owner's user id */
238 __be32 di_gid; /* owner's group id */
239 __be32 di_nlink; /* number of links to this file */
240 __be64 di_size; /* number of bytes in file */
241 __be64 di_blocks; /* number of blocks in file */
242 __be64 di_atime; /* time last accessed */
243 __be64 di_mtime; /* time last modified */
244 __be64 di_ctime; /* time last changed */
245 __be32 di_major; /* device major number */
246 __be32 di_minor; /* device minor number */
247
248 /* This section varies from gfs1. Padding added to align with
249 * remainder of dinode
250 */
251 __be64 di_goal_meta; /* rgrp to alloc from next */
252 __be64 di_goal_data; /* data block goal */
253 __u32 __pad[2];
254
255 __be32 di_flags; /* GFS2_DIF_... */
256 __be32 di_payload_format; /* GFS2_FORMAT_... */
257 __u16 __pad1; /* Was ditype in gfs1 */
258 __be16 di_height; /* height of metadata */
259 __u32 __pad2; /* Unused incarnation number from gfs1 */
260
261 /* These only apply to directories */
262 __u16 __pad3; /* Padding */
263 __be16 di_depth; /* Number of bits in the table */
264 __be32 di_entries; /* The number of entries in the directory */
265
266 struct gfs2_inum __pad4; /* Unused even in current gfs1 */
267
268 __be64 di_eattr; /* extended attribute block number */
269
270 __u8 di_reserved[56];
271};
272
273/*
274 * directory structure - many of these per directory file
275 */
276
277#define GFS2_FNAMESIZE 255
278#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
279
280struct gfs2_dirent {
281 struct gfs2_inum de_inum;
282 __be32 de_hash;
283 __be16 de_rec_len;
284 __be16 de_name_len;
285 __be16 de_type;
286 __u8 __pad[14];
287};
288
289/*
290 * Header of leaf directory nodes
291 */
292
293struct gfs2_leaf {
294 struct gfs2_meta_header lf_header;
295
296 __be16 lf_depth; /* Depth of leaf */
297 __be16 lf_entries; /* Number of dirents in leaf */
298 __be32 lf_dirent_format; /* Format of the dirents */
299 __be64 lf_next; /* Next leaf, if overflow */
300
301 __u8 lf_reserved[64];
302};
303
304/*
305 * Extended attribute header format
306 */
307
308#define GFS2_EA_MAX_NAME_LEN 255
309#define GFS2_EA_MAX_DATA_LEN 65536
310
311#define GFS2_EATYPE_UNUSED 0
312#define GFS2_EATYPE_USR 1
313#define GFS2_EATYPE_SYS 2
314#define GFS2_EATYPE_SECURITY 3
315
316#define GFS2_EATYPE_LAST 3
317#define GFS2_EATYPE_VALID(x) ((x) <= GFS2_EATYPE_LAST)
318
319#define GFS2_EAFLAG_LAST 0x01 /* last ea in block */
320
321struct gfs2_ea_header {
322 __be32 ea_rec_len;
323 __be32 ea_data_len;
324 __u8 ea_name_len; /* no NUL terminator after the string */
325 __u8 ea_type; /* GFS2_EATYPE_... */
326 __u8 ea_flags; /* GFS2_EAFLAG_... */
327 __u8 ea_num_ptrs;
328 __u32 __pad;
329};
330
331/*
332 * Log header structure
333 */
334
335#define GFS2_LOG_HEAD_UNMOUNT 0x00000001 /* log is clean */
336
337struct gfs2_log_header {
338 struct gfs2_meta_header lh_header;
339
340 __be64 lh_sequence; /* Sequence number of this transaction */
341 __be32 lh_flags; /* GFS2_LOG_HEAD_... */
342 __be32 lh_tail; /* Block number of log tail */
343 __be32 lh_blkno;
344 __be32 lh_hash;
345};
346
347/*
348 * Log type descriptor
349 */
350
351#define GFS2_LOG_DESC_METADATA 300
352/* ld_data1 is the number of metadata blocks in the descriptor.
353 ld_data2 is unused. */
354
355#define GFS2_LOG_DESC_REVOKE 301
356/* ld_data1 is the number of revoke blocks in the descriptor.
357 ld_data2 is unused. */
358
359#define GFS2_LOG_DESC_JDATA 302
360/* ld_data1 is the number of data blocks in the descriptor.
361 ld_data2 is unused. */
362
363struct gfs2_log_descriptor {
364 struct gfs2_meta_header ld_header;
365
366 __be32 ld_type; /* GFS2_LOG_DESC_... */
367 __be32 ld_length; /* Number of buffers in this chunk */
368 __be32 ld_data1; /* descriptor-specific field */
369 __be32 ld_data2; /* descriptor-specific field */
370
371 __u8 ld_reserved[32];
372};
373
374/*
375 * Inum Range
376 * Describes a range of formal inode numbers allocated to
377 * one machine to assign to inodes.
378 */
379
380#define GFS2_INUM_QUANTUM 1048576
381
382struct gfs2_inum_range {
383 __be64 ir_start;
384 __be64 ir_length;
385};
386
387/*
388 * Statfs change
389 * Describes a change to the pool of free and allocated
390 * blocks.
391 */
392
393struct gfs2_statfs_change {
394 __be64 sc_total;
395 __be64 sc_free;
396 __be64 sc_dinodes;
397};
398
399/*
400 * Unlinked Tag
401 * Describes an allocated inode that isn't linked into
402 * the directory tree and might need to be deallocated.
403 */
404
405#define GFS2_UTF_UNINIT 0x00000001
406
407struct gfs2_unlinked_tag {
408 struct gfs2_inum ut_inum;
409 __be32 ut_flags; /* GFS2_UTF_... */
410 __u32 __pad;
411};
412
413/*
414 * Quota change
415 * Describes an allocation change for a particular
416 * user or group.
417 */
418
419#define GFS2_QCF_USER 0x00000001
420
421struct gfs2_quota_change {
422 __be64 qc_change;
423 __be32 qc_flags; /* GFS2_QCF_... */
424 __be32 qc_id;
425};
426
427#ifdef __KERNEL__
428/* Translation functions */
429
430extern void gfs2_inum_in(struct gfs2_inum *no, char *buf);
431extern void gfs2_inum_out(const struct gfs2_inum *no, char *buf);
432extern void gfs2_sb_in(struct gfs2_sb *sb, char *buf);
433extern void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf);
434extern void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf);
435extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf);
436extern void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf);
437extern void gfs2_quota_in(struct gfs2_quota *qu, char *buf);
438extern void gfs2_quota_out(struct gfs2_quota *qu, char *buf);
439extern void gfs2_dinode_in(struct gfs2_dinode *di, char *buf);
440extern void gfs2_dinode_out(struct gfs2_dinode *di, char *buf);
441extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf);
442extern void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf);
443extern void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf);
444extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf);
445extern void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf);
446extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf);
447extern void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf);
448extern void gfs2_unlinked_tag_in(struct gfs2_unlinked_tag *ut, char *buf);
449extern void gfs2_unlinked_tag_out(struct gfs2_unlinked_tag *ut, char *buf);
450extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf);
451
452/* Printing functions */
453
454extern void gfs2_rindex_print(struct gfs2_rindex *ri);
455extern void gfs2_dinode_print(struct gfs2_dinode *di);
456
457#endif /* __KERNEL__ */
458
459#endif /* __GFS2_ONDISK_DOT_H__ */
diff --git a/include/linux/iflags.h b/include/linux/iflags.h
new file mode 100644
index 000000000000..5b27102dfeaf
--- /dev/null
+++ b/include/linux/iflags.h
@@ -0,0 +1,102 @@
1#ifndef _LINUX_IFLAGS_H
2#define _LINUX_IFLAGS_H
3
4/*
5 * A universal set of inode flags.
6 *
7 * Originally taken from ext2/3 with additions for other filesystems.
8 * Filesystems supporting this interface should interoperate with
9 * the lsattr and chattr command line tools.
10 *
11 * This interface is supported in whole or in part by:
12 * ext2
13 * ext3
14 * xfs
15 * jfs
16 * gfs2
17 *
18 */
19
20#define IFLAGS_GET_IOC _IOR('f', 1, long)
21#define IFLAGS_SET_IOC _IOW('f', 2, long)
22
23/*
24 * These values are provided for use as indices of an array
25 * for use with the iflags_cvt function below
26 */
27enum {
28 iflag_SecureRm = 0, /* Secure deletion */
29 iflag_Unrm = 1, /* Undelete */
30 iflag_Compress = 2, /* Compress file */
31 iflag_Sync = 3, /* Synchronous updates */
32 iflag_Immutable = 4, /* Immutable */
33 iflag_Append = 5, /* Append */
34 iflag_NoDump = 6, /* Don't dump file */
35 iflag_NoAtime = 7, /* No atime updates */
36 /* Reserved for compression usage */
37 iflag_Dirty = 8,
38 iflag_ComprBlk = 9, /* One or more compressed clusters */
39 iflag_NoComp = 10, /* Don't compress */
40 iflag_Ecompr = 11, /* Compression error */
41 /* End of compression flags */
42 iflag_Btree = 12, /* btree format dir */
43 iflag_Index = 12, /* hash-indexed directory */
44 iflag_Imagic = 13, /* AFS directory */
45 iflag_JournalData = 14, /* file data should be journaled */
46 iflag_NoTail = 15, /* file tail should not be merged */
47 iflag_DirSync = 16, /* dirsync behaviour */
48 iflag_TopDir = 17, /* Top of directory hierarchies */
49 iflag_Extent = 19, /* Extents */
50 iflag_DirectIO = 20, /* Always use direct I/O on this file */
51 iflag_Reserved = 31 /* reserved for ext2/3 lib */
52};
53
54#define __IFL(x) (1<<(iflag_##x))
55#define IFLAG_SECRM __IFL(SecureRm) /* 0x00000001 */
56#define IFLAG_UNRM __IFL(Unrm) /* 0x00000002 */
57#define IFLAG_COMPR __IFL(Compress) /* 0x00000004 */
58#define IFLAG_SYNC __IFL(Sync) /* 0x00000008 */
59#define IFLAG_IMMUTABLE __IFL(Immutable) /* 0x00000010 */
60#define IFLAG_APPEND __IFL(Append) /* 0x00000020 */
61#define IFLAG_NODUMP __IFL(NoDump) /* 0x00000040 */
62#define IFLAG_NOATIME __IFL(NoAtime) /* 0x00000080 */
63#define IFLAG_DIRTY __IFL(Dirty) /* 0x00000100 */
64#define IFLAG_COMPRBLK __IFL(ComprBlk) /* 0x00000200 */
65#define IFLAG_NOCOMP __IFL(NoComp) /* 0x00000400 */
66#define IFLAG_ECOMPR __IFL(Ecompr) /* 0x00000800 */
67#define IFLAG_BTREE __IFL(Btree) /* 0x00001000 */
68#define IFLAG_INDEX __IFL(Index) /* 0x00001000 */
69#define IFLAG_IMAGIC __IFL(Imagic) /* 0x00002000 */
70#define IFLAG_JOURNAL_DATA __IFL(JournalData) /* 0x00004000 */
71#define IFLAG_NOTAIL __IFL(NoTail) /* 0x00008000 */
72#define IFLAG_DIRSYNC __IFL(DirSync) /* 0x00010000 */
73#define IFLAG_TOPDIR __IFL(TopDir) /* 0x00020000 */
74#define IFLAG_EXTENT __IFL(Extent) /* 0x00080000 */
75#define IFLAG_DIRECTIO __IFL(DirectIO) /* 0x00100000 */
76#define IFLAG_RESERVED __IFL(Reserved) /* 0x80000000 */
77
78#ifdef __KERNEL__
79/**
80 * iflags_cvt
81 * @table: A table of 32 u32 flags
82 * @val: a 32 bit value to convert
83 *
84 * This function can be used to convert between IFLAGS values and
85 * the filesystem's own flags values.
86 *
87 * Returns: the converted flags
88 */
89static inline u32 iflags_cvt(const u32 *table, u32 val)
90{
91 u32 res = 0;
92 while(val) {
93 if (val & 1)
94 res |= *table;
95 table++;
96 val >>= 1;
97 }
98 return res;
99}
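A usage sketch (all MYFS_* names are hypothetical): a filesystem keeps a 32-entry table indexed by the iflag_* bit positions and converts with iflags_cvt:

	#define MYFS_FL_SYNC		0x01	/* hypothetical on-disk flags */
	#define MYFS_FL_IMMUTABLE	0x02

	static const u32 myfs_from_iflags[32] = {
		[iflag_Sync]		= MYFS_FL_SYNC,
		[iflag_Immutable]	= MYFS_FL_IMMUTABLE,
	};

	static u32 myfs_flags_from_iflags(u32 iflags)
	{
		return iflags_cvt(myfs_from_iflags, iflags);
	}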
100#endif /* __KERNEL__ */
101
102#endif /* _LINUX_IFLAGS_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f4fc576ed4c4..c217e99d5073 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -29,6 +29,7 @@ extern const char linux_banner[];
29
30#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
31#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
32#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
33
34#define KERN_EMERG "<0>" /* system is unusable */
35#define KERN_ALERT "<1>" /* action must be taken immediately */
diff --git a/include/linux/lock_dlm_plock.h b/include/linux/lock_dlm_plock.h
new file mode 100644
index 000000000000..007b07a178ab
--- /dev/null
+++ b/include/linux/lock_dlm_plock.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __LOCK_DLM_PLOCK_DOT_H__
10#define __LOCK_DLM_PLOCK_DOT_H__
11
12#define GDLM_PLOCK_MISC_NAME "lock_dlm_plock"
13
14#define GDLM_PLOCK_VERSION_MAJOR 1
15#define GDLM_PLOCK_VERSION_MINOR 0
16#define GDLM_PLOCK_VERSION_PATCH 0
17
18enum {
19 GDLM_PLOCK_OP_LOCK = 1,
20 GDLM_PLOCK_OP_UNLOCK,
21 GDLM_PLOCK_OP_GET,
22};
23
24struct gdlm_plock_info {
25 __u32 version[3];
26 __u8 optype;
27 __u8 ex;
28 __u8 wait;
29 __u8 pad;
30 __u32 pid;
31 __s32 nodeid;
32 __s32 rv;
33 __u32 fsid;
34 __u64 number;
35 __u64 start;
36 __u64 end;
37};
38
39#endif
40
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..a33f342b31b7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -997,6 +997,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
997 tty->driver->write(tty, msg, strlen(msg));
998 return;
999}
1000EXPORT_SYMBOL_GPL(tty_write_message);
1001
1002/*
1003 * printk rate limiting, lifted from the networking subsystem.
diff --git a/mm/filemap.c b/mm/filemap.c
index fd57442186cb..82c448898d05 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1044,6 +1044,7 @@ success:
1044 desc->arg.buf += size;
1045 return size;
1046}
1047EXPORT_SYMBOL(file_read_actor);
1048
1049/*
1050 * This is the "read()" routine for all filesystems
diff --git a/mm/readahead.c b/mm/readahead.c
index 0f142a40984b..ba7db816f4c8 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
38 ra->ra_pages = mapping->backing_dev_info->ra_pages;
39 ra->prev_page = -1;
40}
41EXPORT_SYMBOL_GPL(file_ra_state_init);
42
43/*
44 * Return max readahead size for this inode in number-of-pages.