-rw-r--r-- Documentation/filesystems/gfs2.txt | 44
-rw-r--r-- Documentation/ioctl-number.txt | 1
-rw-r--r-- fs/Kconfig | 2
-rw-r--r-- fs/Makefile | 2
-rw-r--r-- fs/configfs/item.c | 2
-rw-r--r-- fs/dlm/Kconfig | 30
-rw-r--r-- fs/dlm/Makefile | 21
-rw-r--r-- fs/dlm/ast.c | 167
-rw-r--r-- fs/dlm/ast.h | 26
-rw-r--r-- fs/dlm/config.c | 787
-rw-r--r-- fs/dlm/config.h | 42
-rw-r--r-- fs/dlm/debug_fs.c | 296
-rw-r--r-- fs/dlm/device.c | 1093
-rw-r--r-- fs/dlm/dir.c | 423
-rw-r--r-- fs/dlm/dir.h | 30
-rw-r--r-- fs/dlm/dlm_internal.h | 493
-rw-r--r-- fs/dlm/lock.c | 3533
-rw-r--r-- fs/dlm/lock.h | 50
-rw-r--r-- fs/dlm/lockspace.c | 665
-rw-r--r-- fs/dlm/lockspace.h | 24
-rw-r--r-- fs/dlm/lowcomms.c | 1218
-rw-r--r-- fs/dlm/lowcomms.h | 25
-rw-r--r-- fs/dlm/lvb_table.h | 18
-rw-r--r-- fs/dlm/main.c | 89
-rw-r--r-- fs/dlm/member.c | 313
-rw-r--r-- fs/dlm/member.h | 24
-rw-r--r-- fs/dlm/memory.c | 106
-rw-r--r-- fs/dlm/memory.h | 29
-rw-r--r-- fs/dlm/midcomms.c | 140
-rw-r--r-- fs/dlm/midcomms.h | 21
-rw-r--r-- fs/dlm/rcom.c | 457
-rw-r--r-- fs/dlm/rcom.h | 24
-rw-r--r-- fs/dlm/recover.c | 762
-rw-r--r-- fs/dlm/recover.h | 34
-rw-r--r-- fs/dlm/recoverd.c | 285
-rw-r--r-- fs/dlm/recoverd.h | 24
-rw-r--r-- fs/dlm/requestqueue.c | 184
-rw-r--r-- fs/dlm/requestqueue.h | 22
-rw-r--r-- fs/dlm/util.c | 161
-rw-r--r-- fs/dlm/util.h | 22
-rw-r--r-- fs/gfs2/Kconfig | 46
-rw-r--r-- fs/gfs2/Makefile | 42
-rw-r--r-- fs/gfs2/acl.c | 316
-rw-r--r-- fs/gfs2/acl.h | 37
-rw-r--r-- fs/gfs2/bits.c | 182
-rw-r--r-- fs/gfs2/bits.h | 28
-rw-r--r-- fs/gfs2/bmap.c | 1098
-rw-r--r-- fs/gfs2/bmap.h | 35
-rw-r--r-- fs/gfs2/daemon.c | 229
-rw-r--r-- fs/gfs2/daemon.h | 20
-rw-r--r-- fs/gfs2/dir.c | 1968
-rw-r--r-- fs/gfs2/dir.h | 73
-rw-r--r-- fs/gfs2/eaops.c | 189
-rw-r--r-- fs/gfs2/eaops.h | 30
-rw-r--r-- fs/gfs2/eattr.c | 1568
-rw-r--r-- fs/gfs2/eattr.h | 88
-rw-r--r-- fs/gfs2/format.h | 21
-rw-r--r-- fs/gfs2/gfs2.h | 31
-rw-r--r-- fs/gfs2/glock.c | 2493
-rw-r--r-- fs/gfs2/glock.h | 167
-rw-r--r-- fs/gfs2/glops.c | 492
-rw-r--r-- fs/gfs2/glops.h | 23
-rw-r--r-- fs/gfs2/incore.h | 680
-rw-r--r-- fs/gfs2/inode.c | 1854
-rw-r--r-- fs/gfs2/inode.h | 72
-rw-r--r-- fs/gfs2/lm.c | 243
-rw-r--r-- fs/gfs2/lm.h | 42
-rw-r--r-- fs/gfs2/lm_interface.h | 295
-rw-r--r-- fs/gfs2/locking.c | 191
-rw-r--r-- fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r-- fs/gfs2/locking/dlm/lock.c | 538
-rw-r--r-- fs/gfs2/locking/dlm/lock_dlm.h | 191
-rw-r--r-- fs/gfs2/locking/dlm/main.c | 64
-rw-r--r-- fs/gfs2/locking/dlm/mount.c | 255
-rw-r--r-- fs/gfs2/locking/dlm/plock.c | 298
-rw-r--r-- fs/gfs2/locking/dlm/sysfs.c | 218
-rw-r--r-- fs/gfs2/locking/dlm/thread.c | 352
-rw-r--r-- fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r-- fs/gfs2/locking/nolock/main.c | 259
-rw-r--r-- fs/gfs2/log.c | 600
-rw-r--r-- fs/gfs2/log.h | 61
-rw-r--r-- fs/gfs2/lops.c | 805
-rw-r--r-- fs/gfs2/lops.h | 96
-rw-r--r-- fs/gfs2/lvb.c | 53
-rw-r--r-- fs/gfs2/lvb.h | 20
-rw-r--r-- fs/gfs2/main.c | 114
-rw-r--r-- fs/gfs2/meta_io.c | 889
-rw-r--r-- fs/gfs2/meta_io.h | 89
-rw-r--r-- fs/gfs2/mount.c | 215
-rw-r--r-- fs/gfs2/mount.h | 15
-rw-r--r-- fs/gfs2/ondisk.c | 517
-rw-r--r-- fs/gfs2/ops_address.c | 582
-rw-r--r-- fs/gfs2/ops_address.h | 17
-rw-r--r-- fs/gfs2/ops_dentry.c | 124
-rw-r--r-- fs/gfs2/ops_dentry.h | 15
-rw-r--r-- fs/gfs2/ops_export.c | 298
-rw-r--r-- fs/gfs2/ops_export.h | 15
-rw-r--r-- fs/gfs2/ops_file.c | 999
-rw-r--r-- fs/gfs2/ops_file.h | 20
-rw-r--r-- fs/gfs2/ops_fstype.c | 904
-rw-r--r-- fs/gfs2/ops_fstype.h | 16
-rw-r--r-- fs/gfs2/ops_inode.c | 1196
-rw-r--r-- fs/gfs2/ops_inode.h | 18
-rw-r--r-- fs/gfs2/ops_super.c | 379
-rw-r--r-- fs/gfs2/ops_super.h | 15
-rw-r--r-- fs/gfs2/ops_vm.c | 198
-rw-r--r-- fs/gfs2/ops_vm.h | 16
-rw-r--r-- fs/gfs2/page.c | 283
-rw-r--r-- fs/gfs2/page.h | 23
-rw-r--r-- fs/gfs2/quota.c | 1303
-rw-r--r-- fs/gfs2/quota.h | 34
-rw-r--r-- fs/gfs2/recovery.c | 577
-rw-r--r-- fs/gfs2/recovery.h | 32
-rw-r--r-- fs/gfs2/rgrp.c | 1369
-rw-r--r-- fs/gfs2/rgrp.h | 62
-rw-r--r-- fs/gfs2/super.c | 950
-rw-r--r-- fs/gfs2/super.h | 54
-rw-r--r-- fs/gfs2/sys.c | 582
-rw-r--r-- fs/gfs2/sys.h | 24
-rw-r--r-- fs/gfs2/trans.c | 186
-rw-r--r-- fs/gfs2/trans.h | 35
-rw-r--r-- fs/gfs2/unlinked.c | 458
-rw-r--r-- fs/gfs2/unlinked.h | 25
-rw-r--r-- fs/gfs2/util.c | 245
-rw-r--r-- fs/gfs2/util.h | 169
-rw-r--r-- include/linux/dlm.h | 302
-rw-r--r-- include/linux/dlm_device.h | 83
-rw-r--r-- include/linux/fs.h | 3
-rw-r--r-- include/linux/gfs2_ondisk.h | 472
-rw-r--r-- include/linux/iflags.h | 104
-rw-r--r-- include/linux/kernel.h | 1
-rw-r--r-- include/linux/lock_dlm_plock.h | 40
-rw-r--r-- kernel/printk.c | 1
-rw-r--r-- mm/filemap.c | 1
-rw-r--r-- mm/readahead.c | 1
135 files changed, 42127 insertions, 1 deletion
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
new file mode 100644
index 000000000000..a20ba19e9fe5
--- /dev/null
+++ b/Documentation/filesystems/gfs2.txt
@@ -0,0 +1,44 @@
1Global File System
2------------------
3
4http://sources.redhat.com/cluster/
5
6GFS is a cluster file system. It allows a cluster of computers to
7simultaneously use a block device that is shared between them (with FC,
8iSCSI, NBD, etc). GFS reads and writes to the block device like a local
9file system, but also uses a lock module to allow the computers to coordinate
10their I/O so file system consistency is maintained. One of the nifty
11features of GFS is perfect consistency -- changes made to the file system
12on one machine show up immediately on all other machines in the cluster.
13
14GFS uses interchangeable inter-node locking mechanisms. Different lock
15modules can plug into GFS and each file system selects the appropriate
16lock module at mount time. Lock modules include:
17
18 lock_nolock -- allows gfs to be used as a local file system
19
20 lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
21 The dlm is found at linux/fs/dlm/
22
23In addition to interfacing with an external locking manager, a gfs lock
24module is responsible for interacting with external cluster management
25systems. Lock_dlm depends on user space cluster management systems found
26at the URL above.
27
28To use gfs as a local file system, no external clustering systems are
29needed; simply:
30
31 $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
32 $ mount -t gfs2 /dev/block_device /dir
33
34GFS2 is not on-disk compatible with previous versions of GFS.
35
36The following man pages can be found at the URL above:
37 gfs2_mkfs to make a filesystem
38 gfs2_fsck to repair a filesystem
39 gfs2_grow to expand a filesystem online
40 gfs2_jadd to add journals to a filesystem online
41 gfs2_tool to manipulate, examine and tune a filesystem
42 gfs2_quota to examine and change quota values in a filesystem
43 mount.gfs2 to find mount options
44
diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt
index 171a44ebd939..93a86ac23cdd 100644
--- a/Documentation/ioctl-number.txt
+++ b/Documentation/ioctl-number.txt
@@ -124,6 +124,7 @@ Code Seq# Include File Comments
124'e' 00-1F linux/video_encoder.h conflict!
125'e' 00-1F net/irda/irtty.h conflict!
126'f' 00-1F linux/ext2_fs.h
127'g' 00-1F linux/gfs2_ioctl.h
128'h' 00-7F Charon filesystem
129 <mailto:zapman@interlan.net>
130'i' 00-3F linux/i2o.h
diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2d..563a59e5e694 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -323,6 +323,7 @@ config FS_POSIX_ACL
323 default n
324
325source "fs/xfs/Kconfig"
326source "fs/gfs2/Kconfig"
327
328config OCFS2_FS
329 tristate "OCFS2 file system support (EXPERIMENTAL)"
@@ -1824,6 +1825,7 @@ source "fs/partitions/Kconfig"
1825endmenu
1826
1827source "fs/nls/Kconfig"
1828source "fs/dlm/Kconfig"
1829
1830endmenu
1831
diff --git a/fs/Makefile b/fs/Makefile
index 83bf478e786b..2c22e282c777 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_SYSFS) += sysfs/
48obj-y += devpts/
49
50obj-$(CONFIG_PROFILING) += dcookies.o
51obj-$(CONFIG_DLM) += dlm/
52
53# Do not add any filesystems before this line
54obj-$(CONFIG_REISERFS_FS) += reiserfs/
@@ -102,3 +103,4 @@ obj-$(CONFIG_HPPFS) += hppfs/
103obj-$(CONFIG_DEBUG_FS) += debugfs/
104obj-$(CONFIG_CONFIGFS_FS) += configfs/
105obj-$(CONFIG_OCFS2_FS) += ocfs2/
106obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put);
227EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..d01f735e6e06
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,30 @@
1menu "Distributed Lock Manager"
2 depends on INET && EXPERIMENTAL
3
4config DLM
5 tristate "Distributed Lock Manager (DLM)"
6 depends on SYSFS
7 depends on IPV6 || IPV6=n
8 select IP_SCTP
9 select CONFIGFS_FS
10 help
11 A general purpose distributed lock manager for kernel or userspace
12 applications.
13
14config DLM_DEVICE
15 tristate "DLM device for userspace access"
16 depends on DLM
17 help
18 This module creates a misc device through which the dlm lockspace
19 and locking functions become available to userspace applications
20 (usually through the libdlm library).
21
22config DLM_DEBUG
23 bool "DLM debugging"
24 depends on DLM
25 help
26 Under the debugfs mount point, the name of each lockspace will
27 appear as a file in the "dlm" directory. The output is the
28 list of resources and locks the local node knows about.
29
30endmenu
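
For kernel callers, the lockspace and locking calls described in the help text
above work roughly as follows. This is a minimal sketch, not part of the patch;
it mirrors the dlm_new_lockspace()/dlm_lock()/dlm_unlock() calls made in
fs/dlm/device.c below, and the names example_ast, example_take_lock, lock_done,
"example" and "res1" are made up for illustration:

	#include <linux/dlm.h>
	#include <linux/dlm_device.h>	/* DLM_USER_LVB_LEN, as used by device.c */
	#include <linux/completion.h>
	#include <linux/string.h>

	static void *ls;			/* lockspace handle */
	static struct dlm_lksb lksb;		/* lock status block */
	static DECLARE_COMPLETION(lock_done);

	/* completion AST: called when the asynchronous request finishes */
	static void example_ast(void *astarg)
	{
		complete(&lock_done);
	}

	static int example_take_lock(void)
	{
		int error;

		/* create or join a lockspace (same argument order as device.c) */
		error = dlm_new_lockspace("example", strlen("example"), &ls, 0,
					  DLM_USER_LVB_LEN);
		if (error)
			return error;

		/* request an exclusive lock on resource "res1"; the result is
		   delivered asynchronously through example_ast() */
		error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "res1", 4, 0,
				 example_ast, &lksb, NULL);
		if (!error) {
			wait_for_completion(&lock_done);
			error = lksb.sb_status;	/* 0 on success */
		}
		return error;
	}

Unlocking follows the same pattern: dlm_unlock(ls, lksb.sb_lkid, 0, &lksb,
astarg) completes through the same AST routine with sb_status set to
-DLM_EUNLOCK (see ast_routine() in device.c below).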
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1e6232e7d8e5
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,21 @@
1obj-$(CONFIG_DLM) += dlm.o
2obj-$(CONFIG_DLM_DEVICE) += dlm_device.o
3
4dlm-y := ast.o \
5 config.o \
6 dir.o \
7 lock.o \
8 lockspace.o \
9 lowcomms.o \
10 main.o \
11 member.o \
12 memory.o \
13 midcomms.o \
14 rcom.o \
15 recover.o \
16 recoverd.o \
17 requestqueue.o \
18 util.o
19dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
20
21dlm_device-y := device.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..57bdf09b520a
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,167 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lock.h"
16#include "ast.h"
17
18#define WAKE_ASTS 0
19
20static struct list_head ast_queue;
21static spinlock_t ast_queue_lock;
22static struct task_struct * astd_task;
23static unsigned long astd_wakeflags;
24static struct mutex astd_running;
25
26
27void dlm_del_ast(struct dlm_lkb *lkb)
28{
29 spin_lock(&ast_queue_lock);
30 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
31 list_del(&lkb->lkb_astqueue);
32 spin_unlock(&ast_queue_lock);
33}
34
35void dlm_add_ast(struct dlm_lkb *lkb, int type)
36{
37 spin_lock(&ast_queue_lock);
38 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
39 kref_get(&lkb->lkb_ref);
40 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
41 }
42 lkb->lkb_ast_type |= type;
43 spin_unlock(&ast_queue_lock);
44
45 set_bit(WAKE_ASTS, &astd_wakeflags);
46 wake_up_process(astd_task);
47}
48
49static void process_asts(void)
50{
51 struct dlm_ls *ls = NULL;
52 struct dlm_rsb *r = NULL;
53 struct dlm_lkb *lkb;
54 void (*cast) (long param);
55 void (*bast) (long param, int mode);
56 int type = 0, found, bmode;
57
58 for (;;) {
59 found = 0;
60 spin_lock(&ast_queue_lock);
61 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
62 r = lkb->lkb_resource;
63 ls = r->res_ls;
64
65 if (dlm_locking_stopped(ls))
66 continue;
67
68 list_del(&lkb->lkb_astqueue);
69 type = lkb->lkb_ast_type;
70 lkb->lkb_ast_type = 0;
71 found = 1;
72 break;
73 }
74 spin_unlock(&ast_queue_lock);
75
76 if (!found)
77 break;
78
79 cast = lkb->lkb_astaddr;
80 bast = lkb->lkb_bastaddr;
81 bmode = lkb->lkb_bastmode;
82
83 if ((type & AST_COMP) && cast)
84 cast(lkb->lkb_astparam);
85
86 /* FIXME: Is it safe to look at lkb_grmode here
87 without doing a lock_rsb() ?
88 Look at other checks in v1 to avoid basts. */
89
90 if ((type & AST_BAST) && bast)
91 if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
92 bast(lkb->lkb_astparam, bmode);
93
94 /* this removes the reference added by dlm_add_ast
95 and may result in the lkb being freed */
96 dlm_put_lkb(lkb);
97
98 schedule();
99 }
100}
101
102static inline int no_asts(void)
103{
104 int ret;
105
106 spin_lock(&ast_queue_lock);
107 ret = list_empty(&ast_queue);
108 spin_unlock(&ast_queue_lock);
109 return ret;
110}
111
112static int dlm_astd(void *data)
113{
114 while (!kthread_should_stop()) {
115 set_current_state(TASK_INTERRUPTIBLE);
116 if (!test_bit(WAKE_ASTS, &astd_wakeflags))
117 schedule();
118 set_current_state(TASK_RUNNING);
119
120 mutex_lock(&astd_running);
121 if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
122 process_asts();
123 mutex_unlock(&astd_running);
124 }
125 return 0;
126}
127
128void dlm_astd_wake(void)
129{
130 if (!no_asts()) {
131 set_bit(WAKE_ASTS, &astd_wakeflags);
132 wake_up_process(astd_task);
133 }
134}
135
136int dlm_astd_start(void)
137{
138 struct task_struct *p;
139 int error = 0;
140
141 INIT_LIST_HEAD(&ast_queue);
142 spin_lock_init(&ast_queue_lock);
143 mutex_init(&astd_running);
144
145 p = kthread_run(dlm_astd, NULL, "dlm_astd");
146 if (IS_ERR(p))
147 error = PTR_ERR(p);
148 else
149 astd_task = p;
150 return error;
151}
152
153void dlm_astd_stop(void)
154{
155 kthread_stop(astd_task);
156}
157
158void dlm_astd_suspend(void)
159{
160 mutex_lock(&astd_running);
161}
162
163void dlm_astd_resume(void)
164{
165 mutex_unlock(&astd_running);
166}
167
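The cast/bast pointers invoked by process_asts() above have the following
shapes; a minimal sketch of a matching callback pair (the names are
illustrative only, not part of the patch):

	/* completion AST: the request identified by param has finished;
	   param is whatever the caller stored in lkb_astparam */
	static void example_cast(long param)
	{
	}

	/* blocking AST: another request wants a mode that conflicts with
	   our granted mode (dlm_modes_compat() failed); release or convert
	   the lock when convenient */
	static void example_bast(long param, int mode)
	{
	}
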
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__
15
16void dlm_add_ast(struct dlm_lkb *lkb, int type);
17void dlm_del_ast(struct dlm_lkb *lkb);
18
19void dlm_astd_wake(void);
20int dlm_astd_start(void);
21void dlm_astd_stop(void);
22void dlm_astd_suspend(void);
23void dlm_astd_resume(void);
24
25#endif
26
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..87df9616415e
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,787 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/configfs.h>
17#include <net/sock.h>
18
19#include "config.h"
20
21/*
22 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
23 * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
24 * /config/dlm/<cluster>/comms/<comm>/nodeid
25 * /config/dlm/<cluster>/comms/<comm>/local
26 * /config/dlm/<cluster>/comms/<comm>/addr
27 * The <cluster> level is useless, but I haven't figured out how to avoid it.
28 */
29
30static struct config_group *space_list;
31static struct config_group *comm_list;
32static struct comm *local_comm;
33
34struct clusters;
35struct cluster;
36struct spaces;
37struct space;
38struct comms;
39struct comm;
40struct nodes;
41struct node;
42
43static struct config_group *make_cluster(struct config_group *, const char *);
44static void drop_cluster(struct config_group *, struct config_item *);
45static void release_cluster(struct config_item *);
46static struct config_group *make_space(struct config_group *, const char *);
47static void drop_space(struct config_group *, struct config_item *);
48static void release_space(struct config_item *);
49static struct config_item *make_comm(struct config_group *, const char *);
50static void drop_comm(struct config_group *, struct config_item *);
51static void release_comm(struct config_item *);
52static struct config_item *make_node(struct config_group *, const char *);
53static void drop_node(struct config_group *, struct config_item *);
54static void release_node(struct config_item *);
55
56static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
57 char *buf);
58static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
59 const char *buf, size_t len);
60static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
61 char *buf);
62static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
63 const char *buf, size_t len);
64
65static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
66static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
67static ssize_t comm_local_read(struct comm *cm, char *buf);
68static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
69static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
70static ssize_t node_nodeid_read(struct node *nd, char *buf);
71static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
72static ssize_t node_weight_read(struct node *nd, char *buf);
73static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
74
75enum {
76 COMM_ATTR_NODEID = 0,
77 COMM_ATTR_LOCAL,
78 COMM_ATTR_ADDR,
79};
80
81struct comm_attribute {
82 struct configfs_attribute attr;
83 ssize_t (*show)(struct comm *, char *);
84 ssize_t (*store)(struct comm *, const char *, size_t);
85};
86
87static struct comm_attribute comm_attr_nodeid = {
88 .attr = { .ca_owner = THIS_MODULE,
89 .ca_name = "nodeid",
90 .ca_mode = S_IRUGO | S_IWUSR },
91 .show = comm_nodeid_read,
92 .store = comm_nodeid_write,
93};
94
95static struct comm_attribute comm_attr_local = {
96 .attr = { .ca_owner = THIS_MODULE,
97 .ca_name = "local",
98 .ca_mode = S_IRUGO | S_IWUSR },
99 .show = comm_local_read,
100 .store = comm_local_write,
101};
102
103static struct comm_attribute comm_attr_addr = {
104 .attr = { .ca_owner = THIS_MODULE,
105 .ca_name = "addr",
106 .ca_mode = S_IRUGO | S_IWUSR },
107 .store = comm_addr_write,
108};
109
110static struct configfs_attribute *comm_attrs[] = {
111 [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
112 [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
113 [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
114 NULL,
115};
116
117enum {
118 NODE_ATTR_NODEID = 0,
119 NODE_ATTR_WEIGHT,
120};
121
122struct node_attribute {
123 struct configfs_attribute attr;
124 ssize_t (*show)(struct node *, char *);
125 ssize_t (*store)(struct node *, const char *, size_t);
126};
127
128static struct node_attribute node_attr_nodeid = {
129 .attr = { .ca_owner = THIS_MODULE,
130 .ca_name = "nodeid",
131 .ca_mode = S_IRUGO | S_IWUSR },
132 .show = node_nodeid_read,
133 .store = node_nodeid_write,
134};
135
136static struct node_attribute node_attr_weight = {
137 .attr = { .ca_owner = THIS_MODULE,
138 .ca_name = "weight",
139 .ca_mode = S_IRUGO | S_IWUSR },
140 .show = node_weight_read,
141 .store = node_weight_write,
142};
143
144static struct configfs_attribute *node_attrs[] = {
145 [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
146 [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
147 NULL,
148};
149
150struct clusters {
151 struct configfs_subsystem subsys;
152};
153
154struct cluster {
155 struct config_group group;
156};
157
158struct spaces {
159 struct config_group ss_group;
160};
161
162struct space {
163 struct config_group group;
164 struct list_head members;
165 struct mutex members_lock;
166 int members_count;
167};
168
169struct comms {
170 struct config_group cs_group;
171};
172
173struct comm {
174 struct config_item item;
175 int nodeid;
176 int local;
177 int addr_count;
178 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
179};
180
181struct nodes {
182 struct config_group ns_group;
183};
184
185struct node {
186 struct config_item item;
187 struct list_head list; /* space->members */
188 int nodeid;
189 int weight;
190};
191
192static struct configfs_group_operations clusters_ops = {
193 .make_group = make_cluster,
194 .drop_item = drop_cluster,
195};
196
197static struct configfs_item_operations cluster_ops = {
198 .release = release_cluster,
199};
200
201static struct configfs_group_operations spaces_ops = {
202 .make_group = make_space,
203 .drop_item = drop_space,
204};
205
206static struct configfs_item_operations space_ops = {
207 .release = release_space,
208};
209
210static struct configfs_group_operations comms_ops = {
211 .make_item = make_comm,
212 .drop_item = drop_comm,
213};
214
215static struct configfs_item_operations comm_ops = {
216 .release = release_comm,
217 .show_attribute = show_comm,
218 .store_attribute = store_comm,
219};
220
221static struct configfs_group_operations nodes_ops = {
222 .make_item = make_node,
223 .drop_item = drop_node,
224};
225
226static struct configfs_item_operations node_ops = {
227 .release = release_node,
228 .show_attribute = show_node,
229 .store_attribute = store_node,
230};
231
232static struct config_item_type clusters_type = {
233 .ct_group_ops = &clusters_ops,
234 .ct_owner = THIS_MODULE,
235};
236
237static struct config_item_type cluster_type = {
238 .ct_item_ops = &cluster_ops,
239 .ct_owner = THIS_MODULE,
240};
241
242static struct config_item_type spaces_type = {
243 .ct_group_ops = &spaces_ops,
244 .ct_owner = THIS_MODULE,
245};
246
247static struct config_item_type space_type = {
248 .ct_item_ops = &space_ops,
249 .ct_owner = THIS_MODULE,
250};
251
252static struct config_item_type comms_type = {
253 .ct_group_ops = &comms_ops,
254 .ct_owner = THIS_MODULE,
255};
256
257static struct config_item_type comm_type = {
258 .ct_item_ops = &comm_ops,
259 .ct_attrs = comm_attrs,
260 .ct_owner = THIS_MODULE,
261};
262
263static struct config_item_type nodes_type = {
264 .ct_group_ops = &nodes_ops,
265 .ct_owner = THIS_MODULE,
266};
267
268static struct config_item_type node_type = {
269 .ct_item_ops = &node_ops,
270 .ct_attrs = node_attrs,
271 .ct_owner = THIS_MODULE,
272};
273
274static struct cluster *to_cluster(struct config_item *i)
275{
276 return i ? container_of(to_config_group(i), struct cluster, group):NULL;
277}
278
279static struct space *to_space(struct config_item *i)
280{
281 return i ? container_of(to_config_group(i), struct space, group) : NULL;
282}
283
284static struct comm *to_comm(struct config_item *i)
285{
286 return i ? container_of(i, struct comm, item) : NULL;
287}
288
289static struct node *to_node(struct config_item *i)
290{
291 return i ? container_of(i, struct node, item) : NULL;
292}
293
294static struct config_group *make_cluster(struct config_group *g,
295 const char *name)
296{
297 struct cluster *cl = NULL;
298 struct spaces *sps = NULL;
299 struct comms *cms = NULL;
300 void *gps = NULL;
301
302 cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
303 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
304 sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
305 cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
306
307 if (!cl || !gps || !sps || !cms)
308 goto fail;
309
310 config_group_init_type_name(&cl->group, name, &cluster_type);
311 config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
312 config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
313
314 cl->group.default_groups = gps;
315 cl->group.default_groups[0] = &sps->ss_group;
316 cl->group.default_groups[1] = &cms->cs_group;
317 cl->group.default_groups[2] = NULL;
318
319 space_list = &sps->ss_group;
320 comm_list = &cms->cs_group;
321 return &cl->group;
322
323 fail:
324 kfree(cl);
325 kfree(gps);
326 kfree(sps);
327 kfree(cms);
328 return NULL;
329}
330
331static void drop_cluster(struct config_group *g, struct config_item *i)
332{
333 struct cluster *cl = to_cluster(i);
334 struct config_item *tmp;
335 int j;
336
337 for (j = 0; cl->group.default_groups[j]; j++) {
338 tmp = &cl->group.default_groups[j]->cg_item;
339 cl->group.default_groups[j] = NULL;
340 config_item_put(tmp);
341 }
342
343 space_list = NULL;
344 comm_list = NULL;
345
346 config_item_put(i);
347}
348
349static void release_cluster(struct config_item *i)
350{
351 struct cluster *cl = to_cluster(i);
352 kfree(cl->group.default_groups);
353 kfree(cl);
354}
355
356static struct config_group *make_space(struct config_group *g, const char *name)
357{
358 struct space *sp = NULL;
359 struct nodes *nds = NULL;
360 void *gps = NULL;
361
362 sp = kzalloc(sizeof(struct space), GFP_KERNEL);
363 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
364 nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
365
366 if (!sp || !gps || !nds)
367 goto fail;
368
369 config_group_init_type_name(&sp->group, name, &space_type);
370 config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
371
372 sp->group.default_groups = gps;
373 sp->group.default_groups[0] = &nds->ns_group;
374 sp->group.default_groups[1] = NULL;
375
376 INIT_LIST_HEAD(&sp->members);
377 mutex_init(&sp->members_lock);
378 sp->members_count = 0;
379 return &sp->group;
380
381 fail:
382 kfree(sp);
383 kfree(gps);
384 kfree(nds);
385 return NULL;
386}
387
388static void drop_space(struct config_group *g, struct config_item *i)
389{
390 struct space *sp = to_space(i);
391 struct config_item *tmp;
392 int j;
393
394 /* assert list_empty(&sp->members) */
395
396 for (j = 0; sp->group.default_groups[j]; j++) {
397 tmp = &sp->group.default_groups[j]->cg_item;
398 sp->group.default_groups[j] = NULL;
399 config_item_put(tmp);
400 }
401
402 config_item_put(i);
403}
404
405static void release_space(struct config_item *i)
406{
407 struct space *sp = to_space(i);
408 kfree(sp->group.default_groups);
409 kfree(sp);
410}
411
412static struct config_item *make_comm(struct config_group *g, const char *name)
413{
414 struct comm *cm;
415
416 cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
417 if (!cm)
418 return NULL;
419
420 config_item_init_type_name(&cm->item, name, &comm_type);
421 cm->nodeid = -1;
422 cm->local = 0;
423 cm->addr_count = 0;
424 return &cm->item;
425}
426
427static void drop_comm(struct config_group *g, struct config_item *i)
428{
429 struct comm *cm = to_comm(i);
430 if (local_comm == cm)
431 local_comm = NULL;
432 while (cm->addr_count--)
433 kfree(cm->addr[cm->addr_count]);
434 config_item_put(i);
435}
436
437static void release_comm(struct config_item *i)
438{
439 struct comm *cm = to_comm(i);
440 kfree(cm);
441}
442
443static struct config_item *make_node(struct config_group *g, const char *name)
444{
445 struct space *sp = to_space(g->cg_item.ci_parent);
446 struct node *nd;
447
448 nd = kzalloc(sizeof(struct node), GFP_KERNEL);
449 if (!nd)
450 return NULL;
451
452 config_item_init_type_name(&nd->item, name, &node_type);
453 nd->nodeid = -1;
454 nd->weight = 1; /* default weight of 1 if none is set */
455
456 mutex_lock(&sp->members_lock);
457 list_add(&nd->list, &sp->members);
458 sp->members_count++;
459 mutex_unlock(&sp->members_lock);
460
461 return &nd->item;
462}
463
464static void drop_node(struct config_group *g, struct config_item *i)
465{
466 struct space *sp = to_space(g->cg_item.ci_parent);
467 struct node *nd = to_node(i);
468
469 mutex_lock(&sp->members_lock);
470 list_del(&nd->list);
471 sp->members_count--;
472 mutex_unlock(&sp->members_lock);
473
474 config_item_put(i);
475}
476
477static void release_node(struct config_item *i)
478{
479 struct node *nd = to_node(i);
480 kfree(nd);
481}
482
483static struct clusters clusters_root = {
484 .subsys = {
485 .su_group = {
486 .cg_item = {
487 .ci_namebuf = "dlm",
488 .ci_type = &clusters_type,
489 },
490 },
491 },
492};
493
494int dlm_config_init(void)
495{
496 config_group_init(&clusters_root.subsys.su_group);
497 init_MUTEX(&clusters_root.subsys.su_sem);
498 return configfs_register_subsystem(&clusters_root.subsys);
499}
500
501void dlm_config_exit(void)
502{
503 configfs_unregister_subsystem(&clusters_root.subsys);
504}
505
506/*
507 * Functions for user space to read/write attributes
508 */
509
510static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
511 char *buf)
512{
513 struct comm *cm = to_comm(i);
514 struct comm_attribute *cma =
515 container_of(a, struct comm_attribute, attr);
516 return cma->show ? cma->show(cm, buf) : 0;
517}
518
519static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
520 const char *buf, size_t len)
521{
522 struct comm *cm = to_comm(i);
523 struct comm_attribute *cma =
524 container_of(a, struct comm_attribute, attr);
525 return cma->store ? cma->store(cm, buf, len) : -EINVAL;
526}
527
528static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
529{
530 return sprintf(buf, "%d\n", cm->nodeid);
531}
532
533static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
534{
535 cm->nodeid = simple_strtol(buf, NULL, 0);
536 return len;
537}
538
539static ssize_t comm_local_read(struct comm *cm, char *buf)
540{
541 return sprintf(buf, "%d\n", cm->local);
542}
543
544static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
545{
546 cm->local = simple_strtol(buf, NULL, 0);
547 if (cm->local && !local_comm)
548 local_comm = cm;
549 return len;
550}
551
552static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
553{
554 struct sockaddr_storage *addr;
555
556 if (len != sizeof(struct sockaddr_storage))
557 return -EINVAL;
558
559 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
560 return -ENOSPC;
561
562 addr = kzalloc(sizeof(*addr), GFP_KERNEL);
563 if (!addr)
564 return -ENOMEM;
565
566 memcpy(addr, buf, len);
567 cm->addr[cm->addr_count++] = addr;
568 return len;
569}
570
571static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
572 char *buf)
573{
574 struct node *nd = to_node(i);
575 struct node_attribute *nda =
576 container_of(a, struct node_attribute, attr);
577 return nda->show ? nda->show(nd, buf) : 0;
578}
579
580static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
581 const char *buf, size_t len)
582{
583 struct node *nd = to_node(i);
584 struct node_attribute *nda =
585 container_of(a, struct node_attribute, attr);
586 return nda->store ? nda->store(nd, buf, len) : -EINVAL;
587}
588
589static ssize_t node_nodeid_read(struct node *nd, char *buf)
590{
591 return sprintf(buf, "%d\n", nd->nodeid);
592}
593
594static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
595{
596 nd->nodeid = simple_strtol(buf, NULL, 0);
597 return len;
598}
599
600static ssize_t node_weight_read(struct node *nd, char *buf)
601{
602 return sprintf(buf, "%d\n", nd->weight);
603}
604
605static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
606{
607 nd->weight = simple_strtol(buf, NULL, 0);
608 return len;
609}
610
611/*
612 * Functions for the dlm to get the info that's been configured
613 */
614
615static struct space *get_space(char *name)
616{
617 if (!space_list)
618 return NULL;
619 return to_space(config_group_find_obj(space_list, name));
620}
621
622static void put_space(struct space *sp)
623{
624 config_item_put(&sp->group.cg_item);
625}
626
627static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
628{
629 struct config_item *i;
630 struct comm *cm = NULL;
631 int found = 0;
632
633 if (!comm_list)
634 return NULL;
635
636 down(&clusters_root.subsys.su_sem);
637
638 list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
639 cm = to_comm(i);
640
641 if (nodeid) {
642 if (cm->nodeid != nodeid)
643 continue;
644 found = 1;
645 break;
646 } else {
647 if (!cm->addr_count ||
648 memcmp(cm->addr[0], addr, sizeof(*addr)))
649 continue;
650 found = 1;
651 break;
652 }
653 }
654 up(&clusters_root.subsys.su_sem);
655
656 if (found)
657 config_item_get(i);
658 else
659 cm = NULL;
660 return cm;
661}
662
663static void put_comm(struct comm *cm)
664{
665 config_item_put(&cm->item);
666}
667
668/* caller must free mem */
669int dlm_nodeid_list(char *lsname, int **ids_out)
670{
671 struct space *sp;
672 struct node *nd;
673 int i = 0, rv = 0;
674 int *ids;
675
676 sp = get_space(lsname);
677 if (!sp)
678 return -EEXIST;
679
680 mutex_lock(&sp->members_lock);
681 if (!sp->members_count) {
682 rv = 0;
683 goto out;
684 }
685
686 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
687 if (!ids) {
688 rv = -ENOMEM;
689 goto out;
690 }
691
692 rv = sp->members_count;
693 list_for_each_entry(nd, &sp->members, list)
694 ids[i++] = nd->nodeid;
695
696 if (rv != i)
697 printk("bad nodeid count %d %d\n", rv, i);
698
699 *ids_out = ids;
700 out:
701 mutex_unlock(&sp->members_lock);
702 put_space(sp);
703 return rv;
704}
705
706int dlm_node_weight(char *lsname, int nodeid)
707{
708 struct space *sp;
709 struct node *nd;
710 int w = -EEXIST;
711
712 sp = get_space(lsname);
713 if (!sp)
714 goto out;
715
716 mutex_lock(&sp->members_lock);
717 list_for_each_entry(nd, &sp->members, list) {
718 if (nd->nodeid != nodeid)
719 continue;
720 w = nd->weight;
721 break;
722 }
723 mutex_unlock(&sp->members_lock);
724 put_space(sp);
725 out:
726 return w;
727}
728
729int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
730{
731 struct comm *cm = get_comm(nodeid, NULL);
732 if (!cm)
733 return -EEXIST;
734 if (!cm->addr_count)
735 return -ENOENT;
736 memcpy(addr, cm->addr[0], sizeof(*addr));
737 put_comm(cm);
738 return 0;
739}
740
741int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
742{
743 struct comm *cm = get_comm(0, addr);
744 if (!cm)
745 return -EEXIST;
746 *nodeid = cm->nodeid;
747 put_comm(cm);
748 return 0;
749}
750
751int dlm_our_nodeid(void)
752{
753 return local_comm ? local_comm->nodeid : 0;
754}
755
756/* num 0 is first addr, num 1 is second addr */
757int dlm_our_addr(struct sockaddr_storage *addr, int num)
758{
759 if (!local_comm)
760 return -1;
761 if (num + 1 > local_comm->addr_count)
762 return -1;
763 memcpy(addr, local_comm->addr[num], sizeof(*addr));
764 return 0;
765}
766
767/* Config file defaults */
768#define DEFAULT_TCP_PORT 21064
769#define DEFAULT_BUFFER_SIZE 4096
770#define DEFAULT_RSBTBL_SIZE 256
771#define DEFAULT_LKBTBL_SIZE 1024
772#define DEFAULT_DIRTBL_SIZE 512
773#define DEFAULT_RECOVER_TIMER 5
774#define DEFAULT_TOSS_SECS 10
775#define DEFAULT_SCAN_SECS 5
776
777struct dlm_config_info dlm_config = {
778 .tcp_port = DEFAULT_TCP_PORT,
779 .buffer_size = DEFAULT_BUFFER_SIZE,
780 .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
781 .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
782 .dirtbl_size = DEFAULT_DIRTBL_SIZE,
783 .recover_timer = DEFAULT_RECOVER_TIMER,
784 .toss_secs = DEFAULT_TOSS_SECS,
785 .scan_secs = DEFAULT_SCAN_SECS
786};
787
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__
16
17#define DLM_MAX_ADDR_COUNT 3
18
19struct dlm_config_info {
20 int tcp_port;
21 int buffer_size;
22 int rsbtbl_size;
23 int lkbtbl_size;
24 int dirtbl_size;
25 int recover_timer;
26 int toss_secs;
27 int scan_secs;
28};
29
30extern struct dlm_config_info dlm_config;
31
32int dlm_config_init(void);
33void dlm_config_exit(void);
34int dlm_node_weight(char *lsname, int nodeid);
35int dlm_nodeid_list(char *lsname, int **ids_out);
36int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
37int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
38int dlm_our_nodeid(void);
39int dlm_our_addr(struct sockaddr_storage *addr, int num);
40
41#endif /* __CONFIG_DOT_H__ */
42
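The query functions declared above are how the rest of the dlm reads back what
userspace wrote into configfs. A minimal sketch of their use (the lockspace
name and the wrapper function are illustrative; note the "caller must free mem"
comment on dlm_nodeid_list() in config.c):

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include "config.h"

	static int example_show_members(char *lsname)
	{
		int *ids, count, i;

		count = dlm_nodeid_list(lsname, &ids);	/* allocates ids[] */
		if (count <= 0)
			return count;	/* no members, or -EEXIST/-ENOMEM */

		for (i = 0; i < count; i++)
			printk("node %d weight %d\n", ids[i],
			       dlm_node_weight(lsname, ids[i]));

		kfree(ids);
		return 0;
	}
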
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..49deca845dba
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,296 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include <linux/pagemap.h>
14#include <linux/seq_file.h>
15#include <linux/module.h>
16#include <linux/ctype.h>
17#include <linux/debugfs.h>
18
19#include "dlm_internal.h"
20
21
22static struct dentry *dlm_root;
23
24struct rsb_iter {
25 int entry;
26 struct dlm_ls *ls;
27 struct list_head *next;
28 struct dlm_rsb *rsb;
29};
30
31static char *print_lockmode(int mode)
32{
33 switch (mode) {
34 case DLM_LOCK_IV:
35 return "--";
36 case DLM_LOCK_NL:
37 return "NL";
38 case DLM_LOCK_CR:
39 return "CR";
40 case DLM_LOCK_CW:
41 return "CW";
42 case DLM_LOCK_PR:
43 return "PR";
44 case DLM_LOCK_PW:
45 return "PW";
46 case DLM_LOCK_EX:
47 return "EX";
48 default:
49 return "??";
50 }
51}
52
53static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
54 struct dlm_rsb *res)
55{
56 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
57
58 if (lkb->lkb_status == DLM_LKSTS_CONVERT
59 || lkb->lkb_status == DLM_LKSTS_WAITING)
60 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
61
62 if (lkb->lkb_nodeid) {
63 if (lkb->lkb_nodeid != res->res_nodeid)
64 seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
65 lkb->lkb_remid);
66 else
67 seq_printf(s, " Master: %08x", lkb->lkb_remid);
68 }
69
70 if (lkb->lkb_wait_type)
71 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
72
73 seq_printf(s, "\n");
74}
75
76static int print_resource(struct dlm_rsb *res, struct seq_file *s)
77{
78 struct dlm_lkb *lkb;
79 int i, lvblen = res->res_ls->ls_lvblen;
80
81 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
82 for (i = 0; i < res->res_length; i++) {
83 if (isprint(res->res_name[i]))
84 seq_printf(s, "%c", res->res_name[i]);
85 else
86 seq_printf(s, "%c", '.');
87 }
88 if (res->res_nodeid > 0)
89 seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
90 res->res_nodeid);
91 else if (res->res_nodeid == 0)
92 seq_printf(s, "\" \nMaster Copy\n");
93 else if (res->res_nodeid == -1)
94 seq_printf(s, "\" \nLooking up master (lkid %x)\n",
95 res->res_first_lkid);
96 else
97 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
98
99 /* Print the LVB: */
100 if (res->res_lvbptr) {
101 seq_printf(s, "LVB: ");
102 for (i = 0; i < lvblen; i++) {
103 if (i == lvblen / 2)
104 seq_printf(s, "\n ");
105 seq_printf(s, "%02x ",
106 (unsigned char) res->res_lvbptr[i]);
107 }
108 if (rsb_flag(res, RSB_VALNOTVALID))
109 seq_printf(s, " (INVALID)");
110 seq_printf(s, "\n");
111 }
112
113 /* Print the locks attached to this resource */
114 seq_printf(s, "Granted Queue\n");
115 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
116 print_lock(s, lkb, res);
117
118 seq_printf(s, "Conversion Queue\n");
119 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
120 print_lock(s, lkb, res);
121
122 seq_printf(s, "Waiting Queue\n");
123 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
124 print_lock(s, lkb, res);
125
126 return 0;
127}
128
129static int rsb_iter_next(struct rsb_iter *ri)
130{
131 struct dlm_ls *ls = ri->ls;
132 int i;
133
134 if (!ri->next) {
135 top:
136 /* Find the next non-empty hash bucket */
137 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
138 read_lock(&ls->ls_rsbtbl[i].lock);
139 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
140 ri->next = ls->ls_rsbtbl[i].list.next;
141 read_unlock(&ls->ls_rsbtbl[i].lock);
142 break;
143 }
144 read_unlock(&ls->ls_rsbtbl[i].lock);
145 }
146 ri->entry = i;
147
148 if (ri->entry >= ls->ls_rsbtbl_size)
149 return 1;
150 } else {
151 i = ri->entry;
152 read_lock(&ls->ls_rsbtbl[i].lock);
153 ri->next = ri->next->next;
154 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
155 /* End of list - move to next bucket */
156 ri->next = NULL;
157 ri->entry++;
158 read_unlock(&ls->ls_rsbtbl[i].lock);
159 goto top;
160 }
161 read_unlock(&ls->ls_rsbtbl[i].lock);
162 }
163 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
164
165 return 0;
166}
167
168static void rsb_iter_free(struct rsb_iter *ri)
169{
170 kfree(ri);
171}
172
173static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
174{
175 struct rsb_iter *ri;
176
177 ri = kmalloc(sizeof *ri, GFP_KERNEL);
178 if (!ri)
179 return NULL;
180
181 ri->ls = ls;
182 ri->entry = 0;
183 ri->next = NULL;
184
185 if (rsb_iter_next(ri)) {
186 rsb_iter_free(ri);
187 return NULL;
188 }
189
190 return ri;
191}
192
193static void *seq_start(struct seq_file *file, loff_t *pos)
194{
195 struct rsb_iter *ri;
196 loff_t n = *pos;
197
198 ri = rsb_iter_init(file->private);
199 if (!ri)
200 return NULL;
201
202 while (n--) {
203 if (rsb_iter_next(ri)) {
204 rsb_iter_free(ri);
205 return NULL;
206 }
207 }
208
209 return ri;
210}
211
212static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
213{
214 struct rsb_iter *ri = iter_ptr;
215
216 (*pos)++;
217
218 if (rsb_iter_next(ri)) {
219 rsb_iter_free(ri);
220 return NULL;
221 }
222
223 return ri;
224}
225
226static void seq_stop(struct seq_file *file, void *iter_ptr)
227{
228 /* nothing for now */
229}
230
231static int seq_show(struct seq_file *file, void *iter_ptr)
232{
233 struct rsb_iter *ri = iter_ptr;
234
235 print_resource(ri->rsb, file);
236
237 return 0;
238}
239
240static struct seq_operations dlm_seq_ops = {
241 .start = seq_start,
242 .next = seq_next,
243 .stop = seq_stop,
244 .show = seq_show,
245};
246
247static int do_open(struct inode *inode, struct file *file)
248{
249 struct seq_file *seq;
250 int ret;
251
252 ret = seq_open(file, &dlm_seq_ops);
253 if (ret)
254 return ret;
255
256 seq = file->private_data;
257 seq->private = inode->u.generic_ip;
258
259 return 0;
260}
261
262static struct file_operations dlm_fops = {
263 .owner = THIS_MODULE,
264 .open = do_open,
265 .read = seq_read,
266 .llseek = seq_lseek,
267 .release = seq_release
268};
269
270int dlm_create_debug_file(struct dlm_ls *ls)
271{
272 ls->ls_debug_dentry = debugfs_create_file(ls->ls_name,
273 S_IFREG | S_IRUGO,
274 dlm_root,
275 ls,
276 &dlm_fops);
277 return ls->ls_debug_dentry ? 0 : -ENOMEM;
278}
279
280void dlm_delete_debug_file(struct dlm_ls *ls)
281{
282 if (ls->ls_debug_dentry)
283 debugfs_remove(ls->ls_debug_dentry);
284}
285
286int dlm_register_debugfs(void)
287{
288 dlm_root = debugfs_create_dir("dlm", NULL);
289 return dlm_root ? 0 : -ENOMEM;
290}
291
292void dlm_unregister_debugfs(void)
293{
294 debugfs_remove(dlm_root);
295}
296
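The four entry points above are meant to be called in pairs by the rest of the
dlm; a sketch of the intended order (error handling elided, ls standing for any
struct dlm_ls):

	dlm_register_debugfs();		/* module init: "dlm" dir under debugfs */
	dlm_create_debug_file(ls);	/* lockspace create: one file per lockspace */
	dlm_delete_debug_file(ls);	/* lockspace release */
	dlm_unregister_debugfs();	/* module exit */
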
diff --git a/fs/dlm/device.c b/fs/dlm/device.c
new file mode 100644
index 000000000000..99d8b6b07fba
--- /dev/null
+++ b/fs/dlm/device.c
@@ -0,0 +1,1093 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * device.c
16 *
17 * This is the userland interface to the DLM.
18 *
19 * The locking is done via a misc char device (find the
20 * registered minor number in /proc/misc).
21 *
22 * User code should not use this interface directly but
23 * call the library routines in libdlm.a instead.
24 *
25 */
26
27#include <linux/miscdevice.h>
28#include <linux/init.h>
29#include <linux/wait.h>
30#include <linux/module.h>
31#include <linux/file.h>
32#include <linux/fs.h>
33#include <linux/poll.h>
34#include <linux/signal.h>
35#include <linux/spinlock.h>
36#include <linux/idr.h>
37
38#include <linux/dlm.h>
39#include <linux/dlm_device.h>
40
41#include "lvb_table.h"
42
43static struct file_operations _dlm_fops;
44static const char *name_prefix = "dlm";
45static struct list_head user_ls_list;
46static struct mutex user_ls_lock;
47
48/* Lock infos are stored in here indexed by lock ID */
49static DEFINE_IDR(lockinfo_idr);
50static rwlock_t lockinfo_lock;
51
52/* Flags in li_flags */
53#define LI_FLAG_COMPLETE 1
54#define LI_FLAG_FIRSTLOCK 2
55#define LI_FLAG_PERSISTENT 3
56#define LI_FLAG_ONLIST 4
57
58/* flags in ls_flags */
59#define LS_FLAG_DELETED 1
60#define LS_FLAG_AUTOFREE 2
61
62
63#define LOCKINFO_MAGIC 0x53595324
64
65struct lock_info {
66 uint32_t li_magic;
67 uint8_t li_cmd;
68 int8_t li_grmode;
69 int8_t li_rqmode;
70 struct dlm_lksb li_lksb;
71 wait_queue_head_t li_waitq;
72 unsigned long li_flags;
73 void __user *li_castparam;
74 void __user *li_castaddr;
75 void __user *li_bastparam;
76 void __user *li_bastaddr;
77 void __user *li_pend_bastparam;
78 void __user *li_pend_bastaddr;
79 struct list_head li_ownerqueue;
80 struct file_info *li_file;
81 struct dlm_lksb __user *li_user_lksb;
82 struct semaphore li_firstlock;
83};
84
85/* A queued AST no less */
86struct ast_info {
87 struct dlm_lock_result result;
88 struct list_head list;
89 uint32_t lvb_updated;
90 uint32_t progress; /* How much has been read */
91};
92
93/* One of these per userland lockspace */
94struct user_ls {
95 void *ls_lockspace;
96 atomic_t ls_refcnt;
97 long ls_flags;
98
99 /* Passed into misc_register() */
100 struct miscdevice ls_miscinfo;
101 struct list_head ls_list;
102};
103
104/* misc_device info for the control device */
105static struct miscdevice ctl_device;
106
107/*
108 * Stuff we hang off the file struct.
109 * The first two are to cope with unlocking all the
110 * locks held by a process when it dies.
111 */
112struct file_info {
113 struct list_head fi_li_list; /* List of active lock_infos */
114 spinlock_t fi_li_lock;
115 struct list_head fi_ast_list; /* Queue of ASTs to be delivered */
116 spinlock_t fi_ast_lock;
117 wait_queue_head_t fi_wait;
118 struct user_ls *fi_ls;
119 atomic_t fi_refcnt; /* Number of users */
120 unsigned long fi_flags; /* Bit 1 means the device is open */
121};
122
123
124/* get and put ops for file_info.
125 Actually I don't really like "get" and "put", but everyone
126 else seems to use them and I can't think of anything
127 nicer at the moment */
128static void get_file_info(struct file_info *f)
129{
130 atomic_inc(&f->fi_refcnt);
131}
132
133static void put_file_info(struct file_info *f)
134{
135 if (atomic_dec_and_test(&f->fi_refcnt))
136 kfree(f);
137}
138
139static void release_lockinfo(struct lock_info *li)
140{
141 put_file_info(li->li_file);
142
143 write_lock(&lockinfo_lock);
144 idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid);
145 write_unlock(&lockinfo_lock);
146
147 if (li->li_lksb.sb_lvbptr)
148 kfree(li->li_lksb.sb_lvbptr);
149 kfree(li);
150
151 module_put(THIS_MODULE);
152}
153
154static struct lock_info *get_lockinfo(uint32_t lockid)
155{
156 struct lock_info *li;
157
158 read_lock(&lockinfo_lock);
159 li = idr_find(&lockinfo_idr, lockid);
160 read_unlock(&lockinfo_lock);
161
162 return li;
163}
164
165static int add_lockinfo(struct lock_info *li)
166{
167 int n;
168 int r;
169 int ret = -EINVAL;
170
171 write_lock(&lockinfo_lock);
172
173 if (idr_find(&lockinfo_idr, li->li_lksb.sb_lkid))
174 goto out_up;
175
176 ret = -ENOMEM;
177 r = idr_pre_get(&lockinfo_idr, GFP_KERNEL);
178 if (!r)
179 goto out_up;
180
181 r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n);
182 if (r)
183 goto out_up;
184
185 if (n != li->li_lksb.sb_lkid) {
186 idr_remove(&lockinfo_idr, n);
187 goto out_up;
188 }
189
190 ret = 0;
191
192 out_up:
193 write_unlock(&lockinfo_lock);
194
195 return ret;
196}
197
198
199static struct user_ls *__find_lockspace(int minor)
200{
201 struct user_ls *lsinfo;
202
203 list_for_each_entry(lsinfo, &user_ls_list, ls_list) {
204 if (lsinfo->ls_miscinfo.minor == minor)
205 return lsinfo;
206 }
207 return NULL;
208}
209
210/* Find a lockspace struct given the device minor number */
211static struct user_ls *find_lockspace(int minor)
212{
213 struct user_ls *lsinfo;
214
215 mutex_lock(&user_ls_lock);
216 lsinfo = __find_lockspace(minor);
217 mutex_unlock(&user_ls_lock);
218
219 return lsinfo;
220}
221
222static void add_lockspace_to_list(struct user_ls *lsinfo)
223{
224 mutex_lock(&user_ls_lock);
225 list_add(&lsinfo->ls_list, &user_ls_list);
226 mutex_unlock(&user_ls_lock);
227}
228
229/* Register a lockspace with the DLM and create a misc
230 device for userland to access it */
231static int register_lockspace(char *name, struct user_ls **ls, int flags)
232{
233 struct user_ls *newls;
234 int status;
235 int namelen;
236
237 namelen = strlen(name)+strlen(name_prefix)+2;
238
239 newls = kzalloc(sizeof(struct user_ls), GFP_KERNEL);
240 if (!newls)
241 return -ENOMEM;
242
243 newls->ls_miscinfo.name = kzalloc(namelen, GFP_KERNEL);
244 if (!newls->ls_miscinfo.name) {
245 kfree(newls);
246 return -ENOMEM;
247 }
248
249 status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0,
250 DLM_USER_LVB_LEN);
251 if (status != 0) {
252 kfree(newls->ls_miscinfo.name);
253 kfree(newls);
254 return status;
255 }
256
257 snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s",
258 name_prefix, name);
259
260 newls->ls_miscinfo.fops = &_dlm_fops;
261 newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR;
262
263 status = misc_register(&newls->ls_miscinfo);
264 if (status) {
265 printk(KERN_ERR "dlm: misc register failed for %s\n", name);
266 dlm_release_lockspace(newls->ls_lockspace, 0);
267 kfree(newls->ls_miscinfo.name);
268 kfree(newls);
269 return status;
270 }
271
272 if (flags & DLM_USER_LSFLG_AUTOFREE)
273 set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags);
274
275 add_lockspace_to_list(newls);
276 *ls = newls;
277 return 0;
278}
279
280/* Called with the user_ls_lock mutex held */
281static int unregister_lockspace(struct user_ls *lsinfo, int force)
282{
283 int status;
284
285 status = dlm_release_lockspace(lsinfo->ls_lockspace, force);
286 if (status)
287 return status;
288
289 status = misc_deregister(&lsinfo->ls_miscinfo);
290 if (status)
291 return status;
292
293 list_del(&lsinfo->ls_list);
294 set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags);
295 lsinfo->ls_lockspace = NULL;
296 if (atomic_read(&lsinfo->ls_refcnt) == 0) {
297 kfree(lsinfo->ls_miscinfo.name);
298 kfree(lsinfo);
299 }
300
301 return 0;
302}
303
304/* Add it to userland's AST queue */
305static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam,
306 int lvb_updated)
307{
308 struct ast_info *ast = kzalloc(sizeof(struct ast_info), GFP_KERNEL);
309 if (!ast)
310 return;
311
312 ast->result.user_astparam = astparam;
313 ast->result.user_astaddr = astaddr;
314 ast->result.user_lksb = li->li_user_lksb;
315 memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb));
316 ast->lvb_updated = lvb_updated;
317
318 spin_lock(&li->li_file->fi_ast_lock);
319 list_add_tail(&ast->list, &li->li_file->fi_ast_list);
320 spin_unlock(&li->li_file->fi_ast_lock);
321 wake_up_interruptible(&li->li_file->fi_wait);
322}
323
324static void bast_routine(void *param, int mode)
325{
326 struct lock_info *li = param;
327
328 if (li && li->li_bastaddr)
329 add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0);
330}
331
332/*
333 * This is the kernel's AST routine.
334 * All lock, unlock & query operations complete here.
335 * The only syncronous ops are those done during device close.
336 */
337static void ast_routine(void *param)
338{
339 struct lock_info *li = param;
340
341 /* Param may be NULL if a persistent lock is unlocked by someone else */
342 if (!li)
343 return;
344
345 /* If this is a successful conversion then activate the blocking ast
346 * args from the conversion request */
347 if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
348 li->li_lksb.sb_status == 0) {
349
350 li->li_bastparam = li->li_pend_bastparam;
351 li->li_bastaddr = li->li_pend_bastaddr;
352 li->li_pend_bastaddr = NULL;
353 }
354
355 /* If it's an async request then post data to the user's AST queue. */
356 if (li->li_castaddr) {
357 int lvb_updated = 0;
358
359 /* See if the lvb has been updated */
360 if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1)
361 lvb_updated = 1;
362
363 if (li->li_lksb.sb_status == 0)
364 li->li_grmode = li->li_rqmode;
365
366 /* Only queue AST if the device is still open */
367 if (test_bit(1, &li->li_file->fi_flags))
368 add_to_astqueue(li, li->li_castaddr, li->li_castparam,
369 lvb_updated);
370
371 /* If it's a new lock operation that failed, then
372 * remove it from the owner queue and free the
373 * lock_info.
374 */
375 if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) &&
376 li->li_lksb.sb_status != 0) {
377
378 /* Wait till dlm_lock() has finished */
379 down(&li->li_firstlock);
380 up(&li->li_firstlock);
381
382 spin_lock(&li->li_file->fi_li_lock);
383 list_del(&li->li_ownerqueue);
384 clear_bit(LI_FLAG_ONLIST, &li->li_flags);
385 spin_unlock(&li->li_file->fi_li_lock);
386 release_lockinfo(li);
387 return;
388 }
389 /* Free unlocks & queries */
390 if (li->li_lksb.sb_status == -DLM_EUNLOCK ||
391 li->li_cmd == DLM_USER_QUERY) {
392 release_lockinfo(li);
393 }
394 } else {
395 /* Synchronous request, just wake up the caller */
396 set_bit(LI_FLAG_COMPLETE, &li->li_flags);
397 wake_up_interruptible(&li->li_waitq);
398 }
399}
400
401/*
402 * Wait for the lock op to complete and return the status.
403 */
404static int wait_for_ast(struct lock_info *li)
405{
406 /* Wait for the AST routine to complete */
407 set_task_state(current, TASK_INTERRUPTIBLE);
408 while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags))
409 schedule();
410
411 set_task_state(current, TASK_RUNNING);
412
413 return li->li_lksb.sb_status;
414}
415
416
417/* Open on control device */
418static int dlm_ctl_open(struct inode *inode, struct file *file)
419{
420 file->private_data = NULL;
421 return 0;
422}
423
424/* Close on control device */
425static int dlm_ctl_close(struct inode *inode, struct file *file)
426{
427 return 0;
428}
429
430/* Open on lockspace device */
431static int dlm_open(struct inode *inode, struct file *file)
432{
433 struct file_info *f;
434 struct user_ls *lsinfo;
435
436 lsinfo = find_lockspace(iminor(inode));
437 if (!lsinfo)
438 return -ENOENT;
439
440 f = kzalloc(sizeof(struct file_info), GFP_KERNEL);
441 if (!f)
442 return -ENOMEM;
443
444 atomic_inc(&lsinfo->ls_refcnt);
445 INIT_LIST_HEAD(&f->fi_li_list);
446 INIT_LIST_HEAD(&f->fi_ast_list);
447 spin_lock_init(&f->fi_li_lock);
448 spin_lock_init(&f->fi_ast_lock);
449 init_waitqueue_head(&f->fi_wait);
450 f->fi_ls = lsinfo;
451 f->fi_flags = 0;
452 get_file_info(f);
453 set_bit(1, &f->fi_flags);
454
455 file->private_data = f;
456
457 return 0;
458}
459
460/* Check the user's version is compatible: major must match, minor must not be newer */
461static int check_version(struct dlm_write_request *req)
462{
463 if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
464 (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
465 req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
466
467 printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
468 "user (%d.%d.%d) kernel (%d.%d.%d)\n",
469 current->comm,
470 current->pid,
471 req->version[0],
472 req->version[1],
473 req->version[2],
474 DLM_DEVICE_VERSION_MAJOR,
475 DLM_DEVICE_VERSION_MINOR,
476 DLM_DEVICE_VERSION_PATCH);
477 return -EINVAL;
478 }
479 return 0;
480}
481
482/* Close on lockspace device */
483static int dlm_close(struct inode *inode, struct file *file)
484{
485 struct file_info *f = file->private_data;
486 struct lock_info li;
487 struct lock_info *old_li, *safe;
488 sigset_t tmpsig;
489 sigset_t allsigs;
490 struct user_ls *lsinfo;
491 DECLARE_WAITQUEUE(wq, current);
492
493 lsinfo = find_lockspace(iminor(inode));
494 if (!lsinfo)
495 return -ENOENT;
496
497 /* Mark this closed so that ASTs will not be delivered any more */
498 clear_bit(1, &f->fi_flags);
499
500 /* Block signals while we are doing this */
501 sigfillset(&allsigs);
502 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
503
504 /* We use our own lock_info struct here, so that any
505 * outstanding "real" ASTs will be delivered with the
506 * corresponding "real" params, thus freeing the lock_info
507 * that belongs to the lock. This catches the corner case where
508 * a lock is BUSY when we try to unlock it here
509 */
510 memset(&li, 0, sizeof(li));
511 clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
512 init_waitqueue_head(&li.li_waitq);
513 add_wait_queue(&li.li_waitq, &wq);
514
515 /*
516 * Free any outstanding locks. They are on the
517 * list in LIFO order, so there is no risk of
518 * unlocking a parent before its children.
519 */
520 list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) {
521 int status;
522 int flags = 0;
523
524 /* Don't unlock persistent locks, just mark them orphaned */
525 if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) {
526 list_del(&old_li->li_ownerqueue);
527
528 /* Update master copy */
529 /* TODO: Check locking core updates the local and
530 remote ORPHAN flags */
531 li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid;
532 status = dlm_lock(f->fi_ls->ls_lockspace,
533 old_li->li_grmode, &li.li_lksb,
534 DLM_LKF_CONVERT|DLM_LKF_ORPHAN,
535 NULL, 0, 0, ast_routine, NULL, NULL);
536 if (status != 0)
537 printk("dlm: Error orphaning lock %x: %d\n",
538 old_li->li_lksb.sb_lkid, status);
539
540 /* But tidy up our own reference to it */
541 release_lockinfo(old_li);
542 continue;
543 }
544
545 clear_bit(LI_FLAG_COMPLETE, &li.li_flags);
546
547 flags = DLM_LKF_FORCEUNLOCK;
548 if (old_li->li_grmode >= DLM_LOCK_PW)
549 flags |= DLM_LKF_IVVALBLK;
550
551 status = dlm_unlock(f->fi_ls->ls_lockspace,
552 old_li->li_lksb.sb_lkid, flags,
553 &li.li_lksb, &li);
554
555 /* Must wait for it to complete as the next lock could be its
556 * parent */
557 if (status == 0)
558 wait_for_ast(&li);
559
560 /* Unlock succeeded, free the lock_info struct. */
561 if (status == 0)
562 release_lockinfo(old_li);
563 }
564
565 remove_wait_queue(&li.li_waitq, &wq);
566
567 /*
568 * If this is the last reference to the lockspace
569 * then free the struct. If it's an AUTOFREE lockspace
570 * then free the whole thing.
571 */
572 mutex_lock(&user_ls_lock);
573 if (atomic_dec_and_test(&lsinfo->ls_refcnt)) {
574
575 if (lsinfo->ls_lockspace) {
576 if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) {
577 unregister_lockspace(lsinfo, 1);
578 }
579 } else {
580 kfree(lsinfo->ls_miscinfo.name);
581 kfree(lsinfo);
582 }
583 }
584 mutex_unlock(&user_ls_lock);
585 put_file_info(f);
586
587 /* Restore signals */
588 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
589 recalc_sigpending();
590
591 return 0;
592}
593
594static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd,
595 struct dlm_lspace_params *kparams)
596{
597 int status;
598 struct user_ls *lsinfo;
599
600 if (!capable(CAP_SYS_ADMIN))
601 return -EPERM;
602
603 status = register_lockspace(kparams->name, &lsinfo, kparams->flags);
604
605 /* If it succeeded then return the minor number */
606 if (status == 0)
607 status = lsinfo->ls_miscinfo.minor;
608
609 return status;
610}
611
612static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd,
613 struct dlm_lspace_params *kparams)
614{
615 int status;
616 int force = 1;
617 struct user_ls *lsinfo;
618
619 if (!capable(CAP_SYS_ADMIN))
620 return -EPERM;
621
622 mutex_lock(&user_ls_lock);
623 lsinfo = __find_lockspace(kparams->minor);
624 if (!lsinfo) {
625 mutex_unlock(&user_ls_lock);
626 return -EINVAL;
627 }
628
629 if (kparams->flags & DLM_USER_LSFLG_FORCEFREE)
630 force = 2;
631
632 status = unregister_lockspace(lsinfo, force);
633 mutex_unlock(&user_ls_lock);
634
635 return status;
636}
637
638/* Read call; may block if no ASTs are waiting.
639 * It will only ever return one message at a time, regardless
640 * of how many are pending.
641 */
642static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count,
643 loff_t *ppos)
644{
645 struct file_info *fi = file->private_data;
646 struct ast_info *ast;
647 int data_size;
648 int offset;
649 DECLARE_WAITQUEUE(wait, current);
650
651 if (count < sizeof(struct dlm_lock_result))
652 return -EINVAL;
653
654 spin_lock(&fi->fi_ast_lock);
655 if (list_empty(&fi->fi_ast_list)) {
656
657 /* No waiting ASTs; return EOF if the lockspace has been deleted. */
658 if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags)) {
659 spin_unlock(&fi->fi_ast_lock);
660 return 0;
661 }
662
663 if (file->f_flags & O_NONBLOCK) {
664 spin_unlock(&fi->fi_ast_lock);
665 return -EAGAIN;
666 }
667
668 add_wait_queue(&fi->fi_wait, &wait);
669
670 repeat:
671 set_current_state(TASK_INTERRUPTIBLE);
672 if (list_empty(&fi->fi_ast_list) &&
673 !signal_pending(current)) {
674
675 spin_unlock(&fi->fi_ast_lock);
676 schedule();
677 spin_lock(&fi->fi_ast_lock);
678 goto repeat;
679 }
680
681 current->state = TASK_RUNNING;
682 remove_wait_queue(&fi->fi_wait, &wait);
683
684 if (signal_pending(current)) {
685 spin_unlock(&fi->fi_ast_lock);
686 return -ERESTARTSYS;
687 }
688 }
689
690 ast = list_entry(fi->fi_ast_list.next, struct ast_info, list);
691 list_del(&ast->list);
692 spin_unlock(&fi->fi_ast_lock);
693
694 /* Work out the size of the returned data */
695 data_size = sizeof(struct dlm_lock_result);
696 if (ast->lvb_updated && ast->result.lksb.sb_lvbptr)
697 data_size += DLM_USER_LVB_LEN;
698
699 offset = sizeof(struct dlm_lock_result);
700
701 /* Room for the extended data ? */
702 if (count >= data_size) {
703
704 if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) {
705 if (copy_to_user(buffer+offset,
706 ast->result.lksb.sb_lvbptr,
707 DLM_USER_LVB_LEN))
708 return -EFAULT;
709 ast->result.lvb_offset = offset;
710 offset += DLM_USER_LVB_LEN;
711 }
712 }
713
714 ast->result.length = data_size;
715 /* Copy the header now it has all the offsets in it */
716 if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result)))
717 offset = -EFAULT;
718
719 /* If we only returned a header and there's more to come then put it
720 back on the list */
721 if (count < data_size) {
722 spin_lock(&fi->fi_ast_lock);
723 list_add(&ast->list, &fi->fi_ast_list);
724 spin_unlock(&fi->fi_ast_lock);
725 } else
726 kfree(ast);
727 return offset;
728}
729
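
For orientation, here is a minimal user-space sketch of driving the read interface above. It is not part of the patch: the header name, the struct dlm_lock_result layout and DLM_USER_LVB_LEN are assumed to be exported to user space, and read_one_ast() is a hypothetical helper.

#include <stdio.h>
#include <unistd.h>
#include <linux/dlm_device.h>	/* assumed user-space export */

/* Hypothetical helper: fetch at most one queued AST from an open
 * lockspace device fd. A buffer too small for the full message gets
 * the header only; the kernel requeues the message for the next read. */
static int read_one_ast(int fd)
{
	struct {
		struct dlm_lock_result res;
		char lvb[DLM_USER_LVB_LEN];
	} buf;
	ssize_t n;

	n = read(fd, &buf, sizeof(buf));	/* blocks unless O_NONBLOCK */
	if (n < 0)
		return -1;	/* e.g. EAGAIN or a pending signal */
	if (n == 0)
		return 0;	/* EOF: the lockspace was deleted */

	printf("ast: sb_status %d length %d\n",
	       buf.res.lksb.sb_status, buf.res.length);
	return 1;
}
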
730static unsigned int dlm_poll(struct file *file, poll_table *wait)
731{
732 struct file_info *fi = file->private_data;
733
734 poll_wait(file, &fi->fi_wait, wait);
735
736 spin_lock(&fi->fi_ast_lock);
737 if (!list_empty(&fi->fi_ast_list)) {
738 spin_unlock(&fi->fi_ast_lock);
739 return POLLIN | POLLRDNORM;
740 }
741
742 spin_unlock(&fi->fi_ast_lock);
743 return 0;
744}
745
746static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd,
747 struct dlm_lock_params *kparams)
748{
749 struct lock_info *li;
750
751 if (!try_module_get(THIS_MODULE))
752 return NULL;
753
754 li = kzalloc(sizeof(struct lock_info), GFP_KERNEL);
755 if (li) {
756 li->li_magic = LOCKINFO_MAGIC;
757 li->li_file = fi;
758 li->li_cmd = cmd;
759 li->li_flags = 0;
760 li->li_grmode = -1;
761 li->li_rqmode = -1;
762 li->li_pend_bastparam = NULL;
763 li->li_pend_bastaddr = NULL;
764 li->li_castaddr = NULL;
765 li->li_castparam = NULL;
766 li->li_lksb.sb_lvbptr = NULL;
767 li->li_bastaddr = kparams->bastaddr;
768 li->li_bastparam = kparams->bastparam;
769
770 get_file_info(fi);
771 }
772 return li;
773}
774
775static int do_user_lock(struct file_info *fi, uint8_t cmd,
776 struct dlm_lock_params *kparams)
777{
778 struct lock_info *li;
779 int status;
780
781 /*
782 * Validate things that we need to have correct.
783 */
784 if (!kparams->castaddr)
785 return -EINVAL;
786
787 if (!kparams->lksb)
788 return -EINVAL;
789
790 /* Persistent child locks are not available yet */
791 if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent)
792 return -EINVAL;
793
794 /* For conversions, there should already be a lockinfo struct,
795 unless we are adopting an orphaned persistent lock */
796 if (kparams->flags & DLM_LKF_CONVERT) {
797
798 li = get_lockinfo(kparams->lkid);
799
800 /* If this is a persistent lock we will have to create a
801 lockinfo again */
802 if (!li && (kparams->flags & DLM_LKF_PERSISTENT)) {
803 li = allocate_lockinfo(fi, cmd, kparams);
804 if (!li)
805 return -ENOMEM;
806
807 li->li_lksb.sb_lkid = kparams->lkid;
808 li->li_castaddr = kparams->castaddr;
809 li->li_castparam = kparams->castparam;
810
811 /* OK, this isn't exactly a FIRSTLOCK but it is the
812 first time we've used this lockinfo, and if things
813 fail we want to be rid of it */
814 init_MUTEX_LOCKED(&li->li_firstlock);
815 set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
816 add_lockinfo(li);
817
818 /* TODO: do a query to get the current state ?? */
819 }
820 if (!li)
821 return -EINVAL;
822
823 if (li->li_magic != LOCKINFO_MAGIC)
824 return -EINVAL;
825
826 /* For conversions don't overwrite the current blocking AST
827 info so that:
828 a) if a blocking AST fires before the conversion is queued
829 it runs the current handler
830 b) if the conversion is cancelled, the original blocking AST
831 declaration is active
832 The pend_ info is made active when the conversion
833 completes.
834 */
835 li->li_pend_bastaddr = kparams->bastaddr;
836 li->li_pend_bastparam = kparams->bastparam;
837 } else {
838 li = allocate_lockinfo(fi, cmd, kparams);
839 if (!li)
840 return -ENOMEM;
841
842 /* semaphore to allow us to complete our work before
843 the AST routine runs. In fact we only need (and use) this
844 when the initial lock fails */
845 init_MUTEX_LOCKED(&li->li_firstlock);
846 set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags);
847 }
848
849 li->li_user_lksb = kparams->lksb;
850 li->li_castaddr = kparams->castaddr;
851 li->li_castparam = kparams->castparam;
852 li->li_lksb.sb_lkid = kparams->lkid;
853 li->li_rqmode = kparams->mode;
854 if (kparams->flags & DLM_LKF_PERSISTENT)
855 set_bit(LI_FLAG_PERSISTENT, &li->li_flags);
856
857 /* Copy in the value block */
858 if (kparams->flags & DLM_LKF_VALBLK) {
859 if (!li->li_lksb.sb_lvbptr) {
860 li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN,
861 GFP_KERNEL);
862 if (!li->li_lksb.sb_lvbptr) {
863 status = -ENOMEM;
864 goto out_err;
865 }
866 }
867
868 memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN);
869 }
870
871 /* Lock it ... */
872 status = dlm_lock(fi->fi_ls->ls_lockspace,
873 kparams->mode, &li->li_lksb,
874 kparams->flags,
875 kparams->name, kparams->namelen,
876 kparams->parent,
877 ast_routine,
878 li,
879 (li->li_pend_bastaddr || li->li_bastaddr) ?
880 bast_routine : NULL);
881 if (status)
882 goto out_err;
883
884 /* If it succeeded (this far) with a new lock then keep track of
885 it on the file's lockinfo list */
886 if (!status && test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) {
887
888 spin_lock(&fi->fi_li_lock);
889 list_add(&li->li_ownerqueue, &fi->fi_li_list);
890 set_bit(LI_FLAG_ONLIST, &li->li_flags);
891 spin_unlock(&fi->fi_li_lock);
892 if (add_lockinfo(li))
893 printk(KERN_WARNING "Add lockinfo failed\n");
894
895 up(&li->li_firstlock);
896 }
897
898 /* Return the lockid as the user needs it /now/ */
899 return li->li_lksb.sb_lkid;
900
901 out_err:
902 if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags))
903 release_lockinfo(li);
904 return status;
905
906}
907
908static int do_user_unlock(struct file_info *fi, uint8_t cmd,
909 struct dlm_lock_params *kparams)
910{
911 struct lock_info *li;
912 int status;
913 int convert_cancel = 0;
914
915 li = get_lockinfo(kparams->lkid);
916 if (!li) {
917 li = allocate_lockinfo(fi, cmd, kparams);
918 if (!li)
919 return -ENOMEM;
920 spin_lock(&fi->fi_li_lock);
921 list_add(&li->li_ownerqueue, &fi->fi_li_list);
922 set_bit(LI_FLAG_ONLIST, &li->li_flags);
923 spin_unlock(&fi->fi_li_lock);
924 }
925
926 if (li->li_magic != LOCKINFO_MAGIC)
927 return -EINVAL;
928
929 li->li_user_lksb = kparams->lksb;
930 li->li_castparam = kparams->castparam;
931 li->li_cmd = cmd;
932
933 /* Cancelling a conversion doesn't remove the lock...*/
934 if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1)
935 convert_cancel = 1;
936
937 /* Wait until dlm_lock() has completed */
938 if (!test_bit(LI_FLAG_ONLIST, &li->li_flags)) {
939 down(&li->li_firstlock);
940 up(&li->li_firstlock);
941 }
942
943 /* dlm_unlock() passes a 0 for castaddr which means don't overwrite
944 the existing li_castaddr as that's the completion routine for
945 unlocks. dlm_unlock_wait() specifies a new AST routine to be
946 executed when the unlock completes. */
947 if (kparams->castaddr)
948 li->li_castaddr = kparams->castaddr;
949
950 /* Use existing lksb & astparams */
951 status = dlm_unlock(fi->fi_ls->ls_lockspace,
952 kparams->lkid,
953 kparams->flags, &li->li_lksb, li);
954
955 if (!status && !convert_cancel) {
956 spin_lock(&fi->fi_li_lock);
957 list_del(&li->li_ownerqueue);
958 clear_bit(LI_FLAG_ONLIST, &li->li_flags);
959 spin_unlock(&fi->fi_li_lock);
960 }
961
962 return status;
963}
964
965/* Write call, submit a locking request */
966static ssize_t dlm_write(struct file *file, const char __user *buffer,
967 size_t count, loff_t *ppos)
968{
969 struct file_info *fi = file->private_data;
970 struct dlm_write_request *kparams;
971 sigset_t tmpsig;
972 sigset_t allsigs;
973 int status;
974
975 /* -1 because lock name is optional */
976 if (count < sizeof(struct dlm_write_request)-1)
977 return -EINVAL;
978
979 /* Has the lockspace been deleted */
980 if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags))
981 return -ENOENT;
982
983 kparams = kmalloc(count, GFP_KERNEL);
984 if (!kparams)
985 return -ENOMEM;
986
987 status = -EFAULT;
988 /* Get the command info */
989 if (copy_from_user(kparams, buffer, count))
990 goto out_free;
991
992 status = -EBADE;
993 if (check_version(kparams))
994 goto out_free;
995
996 /* Block signals while we are doing this */
997 sigfillset(&allsigs);
998 sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
999
1000 status = -EINVAL;
1001 switch (kparams->cmd)
1002 {
1003 case DLM_USER_LOCK:
1004 if (!fi) goto out_sig;
1005 status = do_user_lock(fi, kparams->cmd, &kparams->i.lock);
1006 break;
1007
1008 case DLM_USER_UNLOCK:
1009 if (!fi) goto out_sig;
1010 status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock);
1011 break;
1012
1013 case DLM_USER_CREATE_LOCKSPACE:
1014 if (fi) goto out_sig;
1015 status = do_user_create_lockspace(fi, kparams->cmd,
1016 &kparams->i.lspace);
1017 break;
1018
1019 case DLM_USER_REMOVE_LOCKSPACE:
1020 if (fi) goto out_sig;
1021 status = do_user_remove_lockspace(fi, kparams->cmd,
1022 &kparams->i.lspace);
1023 break;
1024 default:
1025 printk("Unknown command passed to DLM device : %d\n",
1026 kparams->cmd);
1027 break;
1028 }
1029
1030 out_sig:
1031 /* Restore signals */
1032 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
1033 recalc_sigpending();
1034
1035 out_free:
1036 kfree(kparams);
1037 if (status == 0)
1038 return count;
1039 else
1040 return status;
1041}
1042
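
The write side can be sketched the same way. Below is a hedged user-space example of issuing DLM_USER_CREATE_LOCKSPACE through the control device; the /dev/misc/dlm-control path and the exact dlm_write_request layout are assumptions, and create_lockspace() is illustrative only. Per the dispatch above, a successful create hands the new lockspace's minor number back as the return value of write().

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/dlm_device.h>	/* assumed user-space export */

/* Hypothetical helper: returns the new lockspace device minor
 * (do_user_create_lockspace() passes it back as the write status)
 * or -1 on error. Requires CAP_SYS_ADMIN. */
static int create_lockspace(const char *name)
{
	size_t len = sizeof(struct dlm_write_request) + strlen(name) + 1;
	struct dlm_write_request *req = calloc(1, len);
	int fd, rv = -1;

	if (!req)
		return -1;
	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_CREATE_LOCKSPACE;
	strcpy(req->i.lspace.name, name);

	fd = open("/dev/misc/dlm-control", O_WRONLY);
	if (fd >= 0) {
		rv = write(fd, req, len);
		close(fd);
	}
	free(req);
	return rv;
}
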
1043static struct file_operations _dlm_fops = {
1044 .open = dlm_open,
1045 .release = dlm_close,
1046 .read = dlm_read,
1047 .write = dlm_write,
1048 .poll = dlm_poll,
1049 .owner = THIS_MODULE,
1050};
1051
1052static struct file_operations _dlm_ctl_fops = {
1053 .open = dlm_ctl_open,
1054 .release = dlm_ctl_close,
1055 .write = dlm_write,
1056 .owner = THIS_MODULE,
1057};
1058
1059/*
1060 * Create control device
1061 */
1062static int __init dlm_device_init(void)
1063{
1064 int r;
1065
1066 INIT_LIST_HEAD(&user_ls_list);
1067 mutex_init(&user_ls_lock);
1068 rwlock_init(&lockinfo_lock);
1069
1070 ctl_device.name = "dlm-control";
1071 ctl_device.fops = &_dlm_ctl_fops;
1072 ctl_device.minor = MISC_DYNAMIC_MINOR;
1073
1074 r = misc_register(&ctl_device);
1075 if (r) {
1076 printk(KERN_ERR "dlm: misc_register failed for control dev\n");
1077 return r;
1078 }
1079
1080 return 0;
1081}
1082
1083static void __exit dlm_device_exit(void)
1084{
1085 misc_deregister(&ctl_device);
1086}
1087
1088MODULE_DESCRIPTION("Distributed Lock Manager device interface");
1089MODULE_AUTHOR("Red Hat, Inc.");
1090MODULE_LICENSE("GPL");
1091
1092module_init(dlm_device_init);
1093module_exit(dlm_device_exit);
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "rcom.h"
19#include "config.h"
20#include "memory.h"
21#include "recover.h"
22#include "util.h"
23#include "lock.h"
24#include "dir.h"
25
26
27static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
28{
29 spin_lock(&ls->ls_recover_list_lock);
30 list_add(&de->list, &ls->ls_recover_list);
31 spin_unlock(&ls->ls_recover_list_lock);
32}
33
34static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
35{
36 int found = 0;
37 struct dlm_direntry *de;
38
39 spin_lock(&ls->ls_recover_list_lock);
40 list_for_each_entry(de, &ls->ls_recover_list, list) {
41 if (de->length == len) {
42 list_del(&de->list);
43 de->master_nodeid = 0;
44 memset(de->name, 0, len);
45 found = 1;
46 break;
47 }
48 }
49 spin_unlock(&ls->ls_recover_list_lock);
50
51 if (!found)
52 de = allocate_direntry(ls, len);
53 return de;
54}
55
56void dlm_clear_free_entries(struct dlm_ls *ls)
57{
58 struct dlm_direntry *de;
59
60 spin_lock(&ls->ls_recover_list_lock);
61 while (!list_empty(&ls->ls_recover_list)) {
62 de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
63 list);
64 list_del(&de->list);
65 free_direntry(de);
66 }
67 spin_unlock(&ls->ls_recover_list_lock);
68}
69
70/*
71 * We use the upper 16 bits of the hash value to select the directory node.
72 * Low bits are used for distribution of rsb's among hash buckets on each node.
73 *
74 * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
75 * num_nodes to the hash value. This value in the desired range is used as an
76 * offset into the sorted list of nodeid's to give the particular nodeid.
77 */
78
79int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
80{
81 struct list_head *tmp;
82 struct dlm_member *memb = NULL;
83 uint32_t node, n = 0;
84 int nodeid;
85
86 if (ls->ls_num_nodes == 1) {
87 nodeid = dlm_our_nodeid();
88 goto out;
89 }
90
91 if (ls->ls_node_array) {
92 node = (hash >> 16) % ls->ls_total_weight;
93 nodeid = ls->ls_node_array[node];
94 goto out;
95 }
96
97 /* make_member_array() failed to kmalloc ls_node_array... */
98
99 node = (hash >> 16) % ls->ls_num_nodes;
100
101 list_for_each(tmp, &ls->ls_nodes) {
102 if (n++ != node)
103 continue;
104 memb = list_entry(tmp, struct dlm_member, list);
105 break;
106 }
107
108 DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
109 ls->ls_num_nodes, n, node););
110 nodeid = memb->nodeid;
111 out:
112 return nodeid;
113}
114
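A stand-alone toy version of the selection rule may help. The node ids and weights below are invented for the example; the weight-expanded array mirrors what make_member_array() builds into ls_node_array when its allocation succeeds.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical lockspace: node 3 has weight 2, nodes 5 and 8
	 * have weight 1, so node 3 appears twice in the expanded array */
	int node_array[] = { 3, 3, 5, 8 };
	int total_weight = 4;
	uint32_t hash = 0xabcd1234;

	/* the upper 16 bits pick the directory node... */
	int dir_nodeid = node_array[(hash >> 16) % total_weight];

	/* ...the low bits distribute rsb's among local hash buckets */
	printf("hash 0x%08x -> directory node %d\n",
	       (unsigned int)hash, dir_nodeid);	/* prints node 3 */
	return 0;
}
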
115int dlm_dir_nodeid(struct dlm_rsb *r)
116{
117 return dlm_hash2nodeid(r->res_ls, r->res_hash);
118}
119
120static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
121{
122 uint32_t val;
123
124 val = jhash(name, len, 0);
125 val &= (ls->ls_dirtbl_size - 1);
126
127 return val;
128}
129
130static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
131{
132 uint32_t bucket;
133
134 bucket = dir_hash(ls, de->name, de->length);
135 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
136}
137
138static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
139 int namelen, uint32_t bucket)
140{
141 struct dlm_direntry *de;
142
143 list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
144 if (de->length == namelen && !memcmp(name, de->name, namelen))
145 goto out;
146 }
147 de = NULL;
148 out:
149 return de;
150}
151
152void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
153{
154 struct dlm_direntry *de;
155 uint32_t bucket;
156
157 bucket = dir_hash(ls, name, namelen);
158
159 write_lock(&ls->ls_dirtbl[bucket].lock);
160
161 de = search_bucket(ls, name, namelen, bucket);
162
163 if (!de) {
164 log_error(ls, "remove fr %u none", nodeid);
165 goto out;
166 }
167
168 if (de->master_nodeid != nodeid) {
169 log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
170 goto out;
171 }
172
173 list_del(&de->list);
174 free_direntry(de);
175 out:
176 write_unlock(&ls->ls_dirtbl[bucket].lock);
177}
178
179void dlm_dir_clear(struct dlm_ls *ls)
180{
181 struct list_head *head;
182 struct dlm_direntry *de;
183 int i;
184
185 DLM_ASSERT(list_empty(&ls->ls_recover_list), );
186
187 for (i = 0; i < ls->ls_dirtbl_size; i++) {
188 write_lock(&ls->ls_dirtbl[i].lock);
189 head = &ls->ls_dirtbl[i].list;
190 while (!list_empty(head)) {
191 de = list_entry(head->next, struct dlm_direntry, list);
192 list_del(&de->list);
193 put_free_de(ls, de);
194 }
195 write_unlock(&ls->ls_dirtbl[i].lock);
196 }
197}
198
199int dlm_recover_directory(struct dlm_ls *ls)
200{
201 struct dlm_member *memb;
202 struct dlm_direntry *de;
203 char *b, *last_name = NULL;
204 int error = -ENOMEM, last_len, count = 0;
205 uint16_t namelen;
206
207 log_debug(ls, "dlm_recover_directory");
208
209 if (dlm_no_directory(ls))
210 goto out_status;
211
212 dlm_dir_clear(ls);
213
214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
215 if (!last_name)
216 goto out;
217
218 list_for_each_entry(memb, &ls->ls_nodes, list) {
219 memset(last_name, 0, DLM_RESNAME_MAXLEN);
220 last_len = 0;
221
222 for (;;) {
223 error = dlm_recovery_stopped(ls);
224 if (error)
225 goto out_free;
226
227 error = dlm_rcom_names(ls, memb->nodeid,
228 last_name, last_len);
229 if (error)
230 goto out_free;
231
232 schedule();
233
234 /*
235 * pick namelen/name pairs out of received buffer
236 */
237
238 b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
239
240 for (;;) {
241 memcpy(&namelen, b, sizeof(uint16_t));
242 namelen = be16_to_cpu(namelen);
243 b += sizeof(uint16_t);
244
245 /* namelen of 0xFFFF marks end of names for
246 this node; namelen of 0 marks end of the
247 buffer */
248
249 if (namelen == 0xFFFF)
250 goto done;
251 if (!namelen)
252 break;
253
254 error = -ENOMEM;
255 de = get_free_de(ls, namelen);
256 if (!de)
257 goto out_free;
258
259 de->master_nodeid = memb->nodeid;
260 de->length = namelen;
261 last_len = namelen;
262 memcpy(de->name, b, namelen);
263 memcpy(last_name, b, namelen);
264 b += namelen;
265
266 add_entry_to_hash(ls, de);
267 count++;
268 }
269 }
270 done:
271 ;
272 }
273
274 out_status:
275 error = 0;
276 dlm_set_recover_status(ls, DLM_RS_DIR);
277 log_debug(ls, "dlm_recover_directory %d entries", count);
278 out_free:
279 kfree(last_name);
280 out:
281 dlm_clear_free_entries(ls);
282 return error;
283}
284
285static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
286 int namelen, int *r_nodeid)
287{
288 struct dlm_direntry *de, *tmp;
289 uint32_t bucket;
290
291 bucket = dir_hash(ls, name, namelen);
292
293 write_lock(&ls->ls_dirtbl[bucket].lock);
294 de = search_bucket(ls, name, namelen, bucket);
295 if (de) {
296 *r_nodeid = de->master_nodeid;
297 write_unlock(&ls->ls_dirtbl[bucket].lock);
298 if (*r_nodeid == nodeid)
299 return -EEXIST;
300 return 0;
301 }
302
303 write_unlock(&ls->ls_dirtbl[bucket].lock);
304
305 de = allocate_direntry(ls, namelen);
306 if (!de)
307 return -ENOMEM;
308
309 de->master_nodeid = nodeid;
310 de->length = namelen;
311 memcpy(de->name, name, namelen);
312
313 write_lock(&ls->ls_dirtbl[bucket].lock);
314 tmp = search_bucket(ls, name, namelen, bucket);
315 if (tmp) {
316 free_direntry(de);
317 de = tmp;
318 } else {
319 list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
320 }
321 *r_nodeid = de->master_nodeid;
322 write_unlock(&ls->ls_dirtbl[bucket].lock);
323 return 0;
324}
325
326int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
327 int *r_nodeid)
328{
329 return get_entry(ls, nodeid, name, namelen, r_nodeid);
330}
331
332/* Copy the names of master rsb's into the buffer provided.
333 Only select names whose dir node is the given nodeid. */
334
335void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
336 char *outbuf, int outlen, int nodeid)
337{
338 struct list_head *list;
339 struct dlm_rsb *start_r = NULL, *r = NULL;
340 int offset = 0, start_namelen, error, dir_nodeid;
341 char *start_name;
342 uint16_t be_namelen;
343
344 /*
345 * Find the rsb where we left off (or start again)
346 */
347
348 start_namelen = inlen;
349 start_name = inbuf;
350
351 if (start_namelen > 1) {
352 /*
353 * We could also use a find_rsb_root() function here that
354 * searched the ls_root_list.
355 */
356 error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
357 &start_r);
358 DLM_ASSERT(!error && start_r,
359 printk("error %d\n", error););
360 DLM_ASSERT(!list_empty(&start_r->res_root_list),
361 dlm_print_rsb(start_r););
362 dlm_put_rsb(start_r);
363 }
364
365 /*
366 * Send rsb names for rsb's we're master of and whose directory node
367 * matches the requesting node.
368 */
369
370 down_read(&ls->ls_root_sem);
371 if (start_r)
372 list = start_r->res_root_list.next;
373 else
374 list = ls->ls_root_list.next;
375
376 for (offset = 0; list != &ls->ls_root_list; list = list->next) {
377 r = list_entry(list, struct dlm_rsb, res_root_list);
378 if (r->res_nodeid)
379 continue;
380
381 dir_nodeid = dlm_dir_nodeid(r);
382 if (dir_nodeid != nodeid)
383 continue;
384
385 /*
386 * The block ends when we can't fit the following in the
387 * remaining buffer space:
388 * namelen (uint16_t) +
389 * name (r->res_length) +
390 * end-of-block record 0x0000 (uint16_t)
391 */
392
393 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
394 /* Write end-of-block record */
395 be_namelen = 0;
396 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
397 offset += sizeof(uint16_t);
398 goto out;
399 }
400
401 be_namelen = cpu_to_be16(r->res_length);
402 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
403 offset += sizeof(uint16_t);
404 memcpy(outbuf + offset, r->res_name, r->res_length);
405 offset += r->res_length;
406 }
407
408 /*
409 * If we've reached the end of the list (and there's room) write a
410 * terminating record.
411 */
412
413 if ((list == &ls->ls_root_list) &&
414 (offset + sizeof(uint16_t) <= outlen)) {
415 be_namelen = 0xFFFF;
416 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
417 offset += sizeof(uint16_t);
418 }
419
420 out:
421 up_read(&ls->ls_root_sem);
422}
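
As a cross-check of the framing shared by dlm_copy_master_names() above and dlm_recover_directory() earlier: each record is a big-endian uint16_t length followed by the name bytes, a zero length closes the block, and 0xFFFF ends the listing. A minimal walker, a sketch only and not part of the patch:

static int walk_name_records(const char *buf, int buflen)
{
	const char *b = buf;
	uint16_t namelen;

	while (b + sizeof(uint16_t) <= buf + buflen) {
		memcpy(&namelen, b, sizeof(uint16_t));
		namelen = be16_to_cpu(namelen);
		b += sizeof(uint16_t);

		if (namelen == 0xFFFF)
			return 1;	/* no more names from this node */
		if (!namelen)
			return 0;	/* end of buffer; request another */

		b += namelen;		/* skip over the name itself */
	}
	return 0;
}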
423
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DIR_DOT_H__
15#define __DIR_DOT_H__
16
17
18int dlm_dir_nodeid(struct dlm_rsb *rsb);
19int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
20void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
21void dlm_dir_clear(struct dlm_ls *ls);
22void dlm_clear_free_entries(struct dlm_ls *ls);
23int dlm_recover_directory(struct dlm_ls *ls);
24int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
25 int *r_nodeid);
26void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
27 char *outbuf, int outlen, int nodeid);
28
29#endif /* __DIR_DOT_H__ */
30
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..c3299020c8f3
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,493 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_INTERNAL_DOT_H__
15#define __DLM_INTERNAL_DOT_H__
16
17/*
18 * This is the main header file to be included in each DLM source file.
19 */
20
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/sched.h>
24#include <linux/types.h>
25#include <linux/ctype.h>
26#include <linux/spinlock.h>
27#include <linux/vmalloc.h>
28#include <linux/list.h>
29#include <linux/errno.h>
30#include <linux/random.h>
31#include <linux/delay.h>
32#include <linux/socket.h>
33#include <linux/kthread.h>
34#include <linux/kobject.h>
35#include <linux/kref.h>
36#include <linux/kernel.h>
37#include <linux/jhash.h>
38#include <linux/mutex.h>
39#include <asm/semaphore.h>
40#include <asm/uaccess.h>
41
42#include <linux/dlm.h>
43
44#define DLM_LOCKSPACE_LEN 64
45
46/* Size of the temp buffer midcomms allocates on the stack.
47 We try to make this large enough so most messages fit.
48 FIXME: should sctp make this unnecessary? */
49
50#define DLM_INBUF_LEN 148
51
52struct dlm_ls;
53struct dlm_lkb;
54struct dlm_rsb;
55struct dlm_member;
56struct dlm_lkbtable;
57struct dlm_rsbtable;
58struct dlm_dirtable;
59struct dlm_direntry;
60struct dlm_recover;
61struct dlm_header;
62struct dlm_message;
63struct dlm_rcom;
64struct dlm_mhandle;
65
66#define log_print(fmt, args...) \
67 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
68#define log_error(ls, fmt, args...) \
69 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
70
71#ifdef DLM_LOG_DEBUG
72#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
73#else
74#define log_debug(ls, fmt, args...)
75#endif
76
77#define DLM_ASSERT(x, do) \
78{ \
79 if (!(x)) \
80 { \
81 printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
82 "DLM: assertion: \"%s\"\n" \
83 "DLM: time = %lu\n", \
84 __LINE__, __FILE__, #x, jiffies); \
85 {do} \
86 printk("\n"); \
87 BUG(); \
88 panic("DLM: Record message above and reboot.\n"); \
89 } \
90}
91
92
93struct dlm_direntry {
94 struct list_head list;
95 uint32_t master_nodeid;
96 uint16_t length;
97 char name[1];
98};
99
100struct dlm_dirtable {
101 struct list_head list;
102 rwlock_t lock;
103};
104
105struct dlm_rsbtable {
106 struct list_head list;
107 struct list_head toss;
108 rwlock_t lock;
109};
110
111struct dlm_lkbtable {
112 struct list_head list;
113 rwlock_t lock;
114 uint16_t counter;
115};
116
117/*
118 * Lockspace member (per node in a ls)
119 */
120
121struct dlm_member {
122 struct list_head list;
123 int nodeid;
124 int weight;
125};
126
127/*
128 * Save and manage recovery state for a lockspace.
129 */
130
131struct dlm_recover {
132 struct list_head list;
133 int *nodeids;
134 int node_count;
135 uint64_t seq;
136};
137
138/*
139 * Pass input args to second stage locking function.
140 */
141
142struct dlm_args {
143 uint32_t flags;
144 void *astaddr;
145 long astparam;
146 void *bastaddr;
147 int mode;
148 struct dlm_lksb *lksb;
149};
150
151
152/*
153 * Lock block
154 *
155 * A lock can be one of three types:
156 *
157 * local copy lock is mastered locally
158 * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
159 * process copy lock is mastered on a remote node
160 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
161 * master copy master node's copy of a lock owned by remote node
162 * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
163 *
164 * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
165 * dlm_unlock. The dlm does not modify these or use any private flags in
166 * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
167 * are sent as-is to the remote master when the lock is remote.
168 *
169 * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
170 * Some internal flags are shared between the master and process nodes;
171 * these shared flags are kept in the lower two bytes. One of these
172 * flags set on the master copy will be propagated to the process copy
173 * and vice versa. Other internal flags are private to the master or process
174 * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
175 *
176 * lkb_sbflags: status block flags. These flags are copied directly into
177 * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
178 * ast. All defined in dlm.h with DLM_SBF_ prefix.
179 *
180 * lkb_status: the lock status indicates which rsb queue the lock is
181 * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
182 *
183 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
184 * reply is needed. Only set when the lkb is on the lockspace waiters
185 * list awaiting a reply from a remote node.
186 *
187 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
188 * is a master copy, nodeid specifies the remote lock holder, when the
189 * lkb is a process copy, the nodeid specifies the lock master.
190 */
191
192/* lkb_ast_type */
193
194#define AST_COMP 1
195#define AST_BAST 2
196
197/* lkb_status */
198
199#define DLM_LKSTS_WAITING 1
200#define DLM_LKSTS_GRANTED 2
201#define DLM_LKSTS_CONVERT 3
202
203/* lkb_flags */
204
205#define DLM_IFL_MSTCPY 0x00010000
206#define DLM_IFL_RESEND 0x00020000
207
208struct dlm_lkb {
209 struct dlm_rsb *lkb_resource; /* the rsb */
210 struct kref lkb_ref;
211 int lkb_nodeid; /* copied from rsb */
212 int lkb_ownpid; /* pid of lock owner */
213 uint32_t lkb_id; /* our lock ID */
214 uint32_t lkb_remid; /* lock ID on remote partner */
215 uint32_t lkb_exflags; /* external flags from caller */
216 uint32_t lkb_sbflags; /* lksb flags */
217 uint32_t lkb_flags; /* internal flags */
218 uint32_t lkb_lvbseq; /* lvb sequence number */
219
220 int8_t lkb_status; /* granted, waiting, convert */
221 int8_t lkb_rqmode; /* requested lock mode */
222 int8_t lkb_grmode; /* granted lock mode */
223 int8_t lkb_bastmode; /* requested mode */
224 int8_t lkb_highbast; /* highest mode bast sent for */
225
226 int8_t lkb_wait_type; /* type of reply waiting for */
227 int8_t lkb_ast_type; /* type of ast queued for */
228
229 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
230 struct list_head lkb_statequeue; /* rsb g/c/w list */
231 struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
232 struct list_head lkb_wait_reply; /* waiting for remote reply */
233 struct list_head lkb_astqueue; /* need ast to be sent */
234
235 char *lkb_lvbptr;
236 struct dlm_lksb *lkb_lksb; /* caller's status block */
237 void *lkb_astaddr; /* caller's ast function */
238 void *lkb_bastaddr; /* caller's bast function */
239 long lkb_astparam; /* caller's ast arg */
240};
241
242
243struct dlm_rsb {
244 struct dlm_ls *res_ls; /* the lockspace */
245 struct kref res_ref;
246 struct mutex res_mutex;
247 unsigned long res_flags;
248 int res_length; /* length of rsb name */
249 int res_nodeid;
250 uint32_t res_lvbseq;
251 uint32_t res_hash;
252 uint32_t res_bucket; /* rsbtbl */
253 unsigned long res_toss_time;
254 uint32_t res_first_lkid;
255 struct list_head res_lookup; /* lkbs waiting on first */
256 struct list_head res_hashchain; /* rsbtbl */
257 struct list_head res_grantqueue;
258 struct list_head res_convertqueue;
259 struct list_head res_waitqueue;
260
261 struct list_head res_root_list; /* used for recovery */
262 struct list_head res_recover_list; /* used for recovery */
263 int res_recover_locks_count;
264
265 char *res_lvbptr;
266 char res_name[1];
267};
268
269/* find_rsb() flags */
270
271#define R_MASTER 1 /* only return rsb if it's a master */
272#define R_CREATE 2 /* create/add rsb if not found */
273
274/* rsb_flags */
275
276enum rsb_flags {
277 RSB_MASTER_UNCERTAIN,
278 RSB_VALNOTVALID,
279 RSB_VALNOTVALID_PREV,
280 RSB_NEW_MASTER,
281 RSB_NEW_MASTER2,
282 RSB_RECOVER_CONVERT,
283};
284
285static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
286{
287 __set_bit(flag, &r->res_flags);
288}
289
290static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
291{
292 __clear_bit(flag, &r->res_flags);
293}
294
295static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
296{
297 return test_bit(flag, &r->res_flags);
298}
299
300
301/* dlm_header is first element of all structs sent between nodes */
302
303#define DLM_HEADER_MAJOR 0x00020000
304#define DLM_HEADER_MINOR 0x00000001
305
306#define DLM_MSG 1
307#define DLM_RCOM 2
308
309struct dlm_header {
310 uint32_t h_version;
311 uint32_t h_lockspace;
312 uint32_t h_nodeid; /* nodeid of sender */
313 uint16_t h_length;
314 uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
315 uint8_t h_pad;
316};
317
318
319#define DLM_MSG_REQUEST 1
320#define DLM_MSG_CONVERT 2
321#define DLM_MSG_UNLOCK 3
322#define DLM_MSG_CANCEL 4
323#define DLM_MSG_REQUEST_REPLY 5
324#define DLM_MSG_CONVERT_REPLY 6
325#define DLM_MSG_UNLOCK_REPLY 7
326#define DLM_MSG_CANCEL_REPLY 8
327#define DLM_MSG_GRANT 9
328#define DLM_MSG_BAST 10
329#define DLM_MSG_LOOKUP 11
330#define DLM_MSG_REMOVE 12
331#define DLM_MSG_LOOKUP_REPLY 13
332
333struct dlm_message {
334 struct dlm_header m_header;
335 uint32_t m_type; /* DLM_MSG_ */
336 uint32_t m_nodeid;
337 uint32_t m_pid;
338 uint32_t m_lkid; /* lkid on sender */
339 uint32_t m_remid; /* lkid on receiver */
340 uint32_t m_parent_lkid;
341 uint32_t m_parent_remid;
342 uint32_t m_exflags;
343 uint32_t m_sbflags;
344 uint32_t m_flags;
345 uint32_t m_lvbseq;
346 uint32_t m_hash;
347 int m_status;
348 int m_grmode;
349 int m_rqmode;
350 int m_bastmode;
351 int m_asts;
352 int m_result; /* 0 or -EXXX */
353 char m_extra[0]; /* name or lvb */
354};
355
356
357#define DLM_RS_NODES 0x00000001
358#define DLM_RS_NODES_ALL 0x00000002
359#define DLM_RS_DIR 0x00000004
360#define DLM_RS_DIR_ALL 0x00000008
361#define DLM_RS_LOCKS 0x00000010
362#define DLM_RS_LOCKS_ALL 0x00000020
363#define DLM_RS_DONE 0x00000040
364#define DLM_RS_DONE_ALL 0x00000080
365
366#define DLM_RCOM_STATUS 1
367#define DLM_RCOM_NAMES 2
368#define DLM_RCOM_LOOKUP 3
369#define DLM_RCOM_LOCK 4
370#define DLM_RCOM_STATUS_REPLY 5
371#define DLM_RCOM_NAMES_REPLY 6
372#define DLM_RCOM_LOOKUP_REPLY 7
373#define DLM_RCOM_LOCK_REPLY 8
374
375struct dlm_rcom {
376 struct dlm_header rc_header;
377 uint32_t rc_type; /* DLM_RCOM_ */
378 int rc_result; /* multi-purpose */
379 uint64_t rc_id; /* match reply with request */
380 char rc_buf[0];
381};
382
383struct rcom_config {
384 uint32_t rf_lvblen;
385 uint32_t rf_lsflags;
386 uint64_t rf_unused;
387};
388
389struct rcom_lock {
390 uint32_t rl_ownpid;
391 uint32_t rl_lkid;
392 uint32_t rl_remid;
393 uint32_t rl_parent_lkid;
394 uint32_t rl_parent_remid;
395 uint32_t rl_exflags;
396 uint32_t rl_flags;
397 uint32_t rl_lvbseq;
398 int rl_result;
399 int8_t rl_rqmode;
400 int8_t rl_grmode;
401 int8_t rl_status;
402 int8_t rl_asts;
403 uint16_t rl_wait_type;
404 uint16_t rl_namelen;
405 char rl_name[DLM_RESNAME_MAXLEN];
406 char rl_lvb[0];
407};
408
409struct dlm_ls {
410 struct list_head ls_list; /* list of lockspaces */
411 uint32_t ls_global_id; /* global unique lockspace ID */
412 uint32_t ls_exflags;
413 int ls_lvblen;
414 int ls_count; /* reference count */
415 unsigned long ls_flags; /* LSFL_ */
416 struct kobject ls_kobj;
417
418 struct dlm_rsbtable *ls_rsbtbl;
419 uint32_t ls_rsbtbl_size;
420
421 struct dlm_lkbtable *ls_lkbtbl;
422 uint32_t ls_lkbtbl_size;
423
424 struct dlm_dirtable *ls_dirtbl;
425 uint32_t ls_dirtbl_size;
426
427 struct mutex ls_waiters_mutex;
428 struct list_head ls_waiters; /* lkbs needing a reply */
429
430 struct list_head ls_nodes; /* current nodes in ls */
431 struct list_head ls_nodes_gone; /* dead node list, recovery */
432 int ls_num_nodes; /* number of nodes in ls */
433 int ls_low_nodeid;
434 int ls_total_weight;
435 int *ls_node_array;
436
437 struct dlm_rsb ls_stub_rsb; /* for returning errors */
438 struct dlm_lkb ls_stub_lkb; /* for returning errors */
439 struct dlm_message ls_stub_ms; /* for faking a reply */
440
441 struct dentry *ls_debug_dentry; /* debugfs */
442
443 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
444 int ls_uevent_result;
445
446 /* recovery related */
447
448 struct timer_list ls_timer;
449 struct task_struct *ls_recoverd_task;
450 struct mutex ls_recoverd_active;
451 spinlock_t ls_recover_lock;
452 uint32_t ls_recover_status; /* DLM_RS_ */
453 uint64_t ls_recover_seq;
454 struct dlm_recover *ls_recover_args;
455 struct rw_semaphore ls_in_recovery; /* block local requests */
456 struct list_head ls_requestqueue;/* queue remote requests */
457 struct mutex ls_requestqueue_mutex;
458 char *ls_recover_buf;
459 struct list_head ls_recover_list;
460 spinlock_t ls_recover_list_lock;
461 int ls_recover_list_count;
462 wait_queue_head_t ls_wait_general;
463
464 struct list_head ls_root_list; /* root resources */
465 struct rw_semaphore ls_root_sem; /* protect root_list */
466
467 int ls_namelen;
468 char ls_name[1];
469};
470
471#define LSFL_WORK 0
472#define LSFL_RUNNING 1
473#define LSFL_RECOVERY_STOP 2
474#define LSFL_RCOM_READY 3
475#define LSFL_UEVENT_WAIT 4
476
477static inline int dlm_locking_stopped(struct dlm_ls *ls)
478{
479 return !test_bit(LSFL_RUNNING, &ls->ls_flags);
480}
481
482static inline int dlm_recovery_stopped(struct dlm_ls *ls)
483{
484 return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
485}
486
487static inline int dlm_no_directory(struct dlm_ls *ls)
488{
489 return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
490}
491
492#endif /* __DLM_INTERNAL_DOT_H__ */
493
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..85a0e73ba808
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3533 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13/* Central locking logic has four stages:
14
15 dlm_lock()
16 dlm_unlock()
17
18 request_lock(ls, lkb)
19 convert_lock(ls, lkb)
20 unlock_lock(ls, lkb)
21 cancel_lock(ls, lkb)
22
23 _request_lock(r, lkb)
24 _convert_lock(r, lkb)
25 _unlock_lock(r, lkb)
26 _cancel_lock(r, lkb)
27
28 do_request(r, lkb)
29 do_convert(r, lkb)
30 do_unlock(r, lkb)
31 do_cancel(r, lkb)
32
33 Stage 1 (lock, unlock) is mainly about checking input args and
34 splitting into one of the four main operations:
35
36 dlm_lock = request_lock
37 dlm_lock+CONVERT = convert_lock
38 dlm_unlock = unlock_lock
39 dlm_unlock+CANCEL = cancel_lock
40
41 Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
42 provided to the next stage.
43
44 Stage 3, _xxxx_lock(), determines if the operation is local or remote.
45 When remote, it calls send_xxxx(), when local it calls do_xxxx().
46
47 Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
48 given rsb and lkb and queues callbacks.
49
50 For remote operations, send_xxxx() results in the corresponding do_xxxx()
51 function being executed on the remote node. The connecting send/receive
52 calls on local (L) and remote (R) nodes:
53
54 L: send_xxxx() -> R: receive_xxxx()
55 R: do_xxxx()
56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/
58
59#include "dlm_internal.h"
60#include "memory.h"
61#include "lowcomms.h"
62#include "requestqueue.h"
63#include "util.h"
64#include "dir.h"
65#include "member.h"
66#include "lockspace.h"
67#include "ast.h"
68#include "lock.h"
69#include "rcom.h"
70#include "recover.h"
71#include "lvb_table.h"
72#include "config.h"
73
74static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
75static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
76static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
77static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
78static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
79static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
80static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
81static int send_remove(struct dlm_rsb *r);
82static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
83static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
84 struct dlm_message *ms);
85static int receive_extralen(struct dlm_message *ms);
86
87/*
88 * Lock compatibility matrix - thanks Steve
89 * UN = Unlocked state. Not really a state, used as a flag
90 * PD = Padding. Used to make the matrix a nice power of two in size
91 * Other states are the same as the VMS DLM.
92 * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
93 */
94
95static const int __dlm_compat_matrix[8][8] = {
96 /* UN NL CR CW PR PW EX PD */
97 {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
98 {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
99 {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
100 {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
101 {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
102 {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
103 {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
104 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
105};
106
107/*
108 * This defines the direction of transfer of LVB data.
109 * Granted mode is the row; requested mode is the column.
110 * Usage: matrix[grmode+1][rqmode+1]
111 * 1 = LVB is returned to the caller
112 * 0 = LVB is written to the resource
113 * -1 = nothing happens to the LVB
114 */
115
116const int dlm_lvb_operations[8][8] = {
117 /* UN NL CR CW PR PW EX PD*/
118 { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
119 { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
120 { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
121 { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
122 { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
123 { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
124 { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
125 { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
126};
127EXPORT_SYMBOL_GPL(dlm_lvb_operations);
128
129#define modes_compat(gr, rq) \
130 __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
131
132int dlm_modes_compat(int mode1, int mode2)
133{
134 return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
135}
136
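Because DLM_LOCK_IV (the invalid/unlocked mode) is -1, both tables above share the mode+1 indexing. Two sample lookups, consistent with the rows shown; the function itself is a sketch, not part of the patch:

static void matrix_examples(void)
{
	/* granted PR vs requested EX: incompatible, so this is 0 */
	int compat = dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_EX);

	/* granted PW, requested NL: the entry is 0, meaning the LVB
	 * is written back to the resource on this transition */
	int lvb_op = dlm_lvb_operations[DLM_LOCK_PW + 1][DLM_LOCK_NL + 1];

	(void)compat;
	(void)lvb_op;
}
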
137/*
138 * Compatibility matrix for conversions with QUECVT set.
139 * Granted mode is the row; requested mode is the column.
140 * Usage: matrix[grmode+1][rqmode+1]
141 */
142
143static const int __quecvt_compat_matrix[8][8] = {
144 /* UN NL CR CW PR PW EX PD */
145 {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
146 {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
147 {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
148 {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
149 {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
150 {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
151 {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
152 {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
153};
154
155static void dlm_print_lkb(struct dlm_lkb *lkb)
156{
157 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
158 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
159 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
160 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
161 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
162}
163
164void dlm_print_rsb(struct dlm_rsb *r)
165{
166 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
167 r->res_nodeid, r->res_flags, r->res_first_lkid,
168 r->res_recover_locks_count, r->res_name);
169}
170
171/* Threads cannot use the lockspace while it's being recovered */
172
173static inline void lock_recovery(struct dlm_ls *ls)
174{
175 down_read(&ls->ls_in_recovery);
176}
177
178static inline void unlock_recovery(struct dlm_ls *ls)
179{
180 up_read(&ls->ls_in_recovery);
181}
182
183static inline int lock_recovery_try(struct dlm_ls *ls)
184{
185 return down_read_trylock(&ls->ls_in_recovery);
186}
187
188static inline int can_be_queued(struct dlm_lkb *lkb)
189{
190 return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
191}
192
193static inline int force_blocking_asts(struct dlm_lkb *lkb)
194{
195 return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
196}
197
198static inline int is_demoted(struct dlm_lkb *lkb)
199{
200 return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
201}
202
203static inline int is_remote(struct dlm_rsb *r)
204{
205 DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
206 return !!r->res_nodeid;
207}
208
209static inline int is_process_copy(struct dlm_lkb *lkb)
210{
211 return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
212}
213
214static inline int is_master_copy(struct dlm_lkb *lkb)
215{
216 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
217 DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
218 return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
219}
220
221static inline int middle_conversion(struct dlm_lkb *lkb)
222{
223 if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
224 (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
225 return 1;
226 return 0;
227}
228
229static inline int down_conversion(struct dlm_lkb *lkb)
230{
231 return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
232}
233
234static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
235{
236 if (is_master_copy(lkb))
237 return;
238
239 DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
240
241 lkb->lkb_lksb->sb_status = rv;
242 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
243
244 dlm_add_ast(lkb, AST_COMP);
245}
246
247static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
248{
249 if (is_master_copy(lkb))
250 send_bast(r, lkb, rqmode);
251 else {
252 lkb->lkb_bastmode = rqmode;
253 dlm_add_ast(lkb, AST_BAST);
254 }
255}
256
257/*
258 * Basic operations on rsb's and lkb's
259 */
260
261static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
262{
263 struct dlm_rsb *r;
264
265 r = allocate_rsb(ls, len);
266 if (!r)
267 return NULL;
268
269 r->res_ls = ls;
270 r->res_length = len;
271 memcpy(r->res_name, name, len);
272 mutex_init(&r->res_mutex);
273
274 INIT_LIST_HEAD(&r->res_lookup);
275 INIT_LIST_HEAD(&r->res_grantqueue);
276 INIT_LIST_HEAD(&r->res_convertqueue);
277 INIT_LIST_HEAD(&r->res_waitqueue);
278 INIT_LIST_HEAD(&r->res_root_list);
279 INIT_LIST_HEAD(&r->res_recover_list);
280
281 return r;
282}
283
284static int search_rsb_list(struct list_head *head, char *name, int len,
285 unsigned int flags, struct dlm_rsb **r_ret)
286{
287 struct dlm_rsb *r;
288 int error = 0;
289
290 list_for_each_entry(r, head, res_hashchain) {
291 if (len == r->res_length && !memcmp(name, r->res_name, len))
292 goto found;
293 }
294 return -ENOENT;
295
296 found:
297 if (r->res_nodeid && (flags & R_MASTER))
298 error = -ENOTBLK;
299 *r_ret = r;
300 return error;
301}
302
303static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
304 unsigned int flags, struct dlm_rsb **r_ret)
305{
306 struct dlm_rsb *r;
307 int error;
308
309 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
310 if (!error) {
311 kref_get(&r->res_ref);
312 goto out;
313 }
314 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
315 if (error)
316 goto out;
317
318 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
319
320 if (dlm_no_directory(ls))
321 goto out;
322
323 if (r->res_nodeid == -1) {
324 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
325 r->res_first_lkid = 0;
326 } else if (r->res_nodeid > 0) {
327 rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
328 r->res_first_lkid = 0;
329 } else {
330 DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
331 DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
332 }
333 out:
334 *r_ret = r;
335 return error;
336}
337
338static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
339 unsigned int flags, struct dlm_rsb **r_ret)
340{
341 int error;
342 write_lock(&ls->ls_rsbtbl[b].lock);
343 error = _search_rsb(ls, name, len, b, flags, r_ret);
344 write_unlock(&ls->ls_rsbtbl[b].lock);
345 return error;
346}
347
348/*
349 * Find rsb in rsbtbl and potentially create/add one
350 *
351 * Delaying the release of rsb's has a similar benefit to applications keeping
352 * NL locks on an rsb, but without the guarantee that the cached master value
353 * will still be valid when the rsb is reused. Apps aren't always smart enough
354 * to keep NL locks on an rsb that they may lock again shortly; this can lead
355 * to excessive master lookups and removals if we don't delay the release.
356 *
357 * Searching for an rsb means looking through both the normal list and toss
358 * list. When found on the toss list the rsb is moved to the normal list with
359 * ref count of 1; when found on normal list the ref count is incremented.
360 */
361
362static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
363 unsigned int flags, struct dlm_rsb **r_ret)
364{
365 struct dlm_rsb *r, *tmp;
366 uint32_t hash, bucket;
367 int error = 0;
368
369 if (dlm_no_directory(ls))
370 flags |= R_CREATE;
371
372 hash = jhash(name, namelen, 0);
373 bucket = hash & (ls->ls_rsbtbl_size - 1);
374
375 error = search_rsb(ls, name, namelen, bucket, flags, &r);
376 if (!error)
377 goto out;
378
379 if (error == -ENOENT && !(flags & R_CREATE))
380 goto out;
381
382 /* the rsb was found but wasn't a master copy */
383 if (error == -ENOTBLK)
384 goto out;
385
386 error = -ENOMEM;
387 r = create_rsb(ls, name, namelen);
388 if (!r)
389 goto out;
390
391 r->res_hash = hash;
392 r->res_bucket = bucket;
393 r->res_nodeid = -1;
394 kref_init(&r->res_ref);
395
396 /* With no directory, the master can be set immediately */
397 if (dlm_no_directory(ls)) {
398 int nodeid = dlm_dir_nodeid(r);
399 if (nodeid == dlm_our_nodeid())
400 nodeid = 0;
401 r->res_nodeid = nodeid;
402 }
403
404 write_lock(&ls->ls_rsbtbl[bucket].lock);
405 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
406 if (!error) {
407 write_unlock(&ls->ls_rsbtbl[bucket].lock);
408 free_rsb(r);
409 r = tmp;
410 goto out;
411 }
412 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
413 write_unlock(&ls->ls_rsbtbl[bucket].lock);
414 error = 0;
415 out:
416 *r_ret = r;
417 return error;
418}
419
420int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
421 unsigned int flags, struct dlm_rsb **r_ret)
422{
423 return find_rsb(ls, name, namelen, flags, r_ret);
424}
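
/* Illustrative sketch (added commentary, not part of the original file):
   how a caller outside this file might use dlm_find_rsb().  The function
   name example_check_master() is hypothetical. */

static int example_check_master(struct dlm_ls *ls, char *name, int len)
{
	struct dlm_rsb *r;
	int error;

	/* R_MASTER: return -ENOTBLK if we aren't the master;
	   without R_CREATE: return -ENOENT rather than creating one */

	error = dlm_find_rsb(ls, name, len, R_MASTER, &r);
	if (error)
		return error;

	/* ... use r ... */

	dlm_put_rsb(r);		/* drop the ref taken by the search */
	return 0;
}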
425
426/* This is only called to add a reference when the code already holds
427 a valid reference to the rsb, so there's no need for locking. */
428
429static inline void hold_rsb(struct dlm_rsb *r)
430{
431 kref_get(&r->res_ref);
432}
433
434void dlm_hold_rsb(struct dlm_rsb *r)
435{
436 hold_rsb(r);
437}
438
439static void toss_rsb(struct kref *kref)
440{
441 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
442 struct dlm_ls *ls = r->res_ls;
443
444 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
445 kref_init(&r->res_ref);
446 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
447 r->res_toss_time = jiffies;
448 if (r->res_lvbptr) {
449 free_lvb(r->res_lvbptr);
450 r->res_lvbptr = NULL;
451 }
452}
453
454/* When all references to the rsb are gone it's transferred to
455 the toss list for later disposal. */
456
457static void put_rsb(struct dlm_rsb *r)
458{
459 struct dlm_ls *ls = r->res_ls;
460 uint32_t bucket = r->res_bucket;
461
462 write_lock(&ls->ls_rsbtbl[bucket].lock);
463 kref_put(&r->res_ref, toss_rsb);
464 write_unlock(&ls->ls_rsbtbl[bucket].lock);
465}
466
467void dlm_put_rsb(struct dlm_rsb *r)
468{
469 put_rsb(r);
470}
471
472/* See comment for unhold_lkb */
473
474static void unhold_rsb(struct dlm_rsb *r)
475{
476 int rv;
477 rv = kref_put(&r->res_ref, toss_rsb);
478 DLM_ASSERT(!rv, dlm_print_rsb(r););
479}
480
481static void kill_rsb(struct kref *kref)
482{
483 struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
484
485 /* All work is done after the return from kref_put() so we
486 can release the write_lock before the remove and free. */
487
488 DLM_ASSERT(list_empty(&r->res_lookup),);
489 DLM_ASSERT(list_empty(&r->res_grantqueue),);
490 DLM_ASSERT(list_empty(&r->res_convertqueue),);
491 DLM_ASSERT(list_empty(&r->res_waitqueue),);
492 DLM_ASSERT(list_empty(&r->res_root_list),);
493 DLM_ASSERT(list_empty(&r->res_recover_list),);
494}
495
496/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
497 The rsb must exist as long as any lkb's for it do. */
498
499static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
500{
501 hold_rsb(r);
502 lkb->lkb_resource = r;
503}
504
505static void detach_lkb(struct dlm_lkb *lkb)
506{
507 if (lkb->lkb_resource) {
508 put_rsb(lkb->lkb_resource);
509 lkb->lkb_resource = NULL;
510 }
511}
512
513static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
514{
515 struct dlm_lkb *lkb, *tmp;
516 uint32_t lkid = 0;
517 uint16_t bucket;
518
519 lkb = allocate_lkb(ls);
520 if (!lkb)
521 return -ENOMEM;
522
523 lkb->lkb_nodeid = -1;
524 lkb->lkb_grmode = DLM_LOCK_IV;
525 kref_init(&lkb->lkb_ref);
526
527 get_random_bytes(&bucket, sizeof(bucket));
528 bucket &= (ls->ls_lkbtbl_size - 1);
529
530 write_lock(&ls->ls_lkbtbl[bucket].lock);
531
532 /* counter can roll over so we must verify lkid is not in use */
533
534 while (lkid == 0) {
535 lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
536
537 list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
538 lkb_idtbl_list) {
539 if (tmp->lkb_id != lkid)
540 continue;
541 lkid = 0;
542 break;
543 }
544 }
545
546 lkb->lkb_id = lkid;
547 list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
548 write_unlock(&ls->ls_lkbtbl[bucket].lock);
549
550 *lkb_ret = lkb;
551 return 0;
552}
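
/* Added note: an lkid encodes its lkbtbl bucket in the low 16 bits and
   the per-bucket counter in the high 16 bits, e.g.

	lkid 0x002a0007  ->  bucket 0x0007, counter 0x002a

   which is why __find_lkb() below recovers the bucket with
   "lkid & 0xFFFF". */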
553
554static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
555{
556 uint16_t bucket = lkid & 0xFFFF;
557 struct dlm_lkb *lkb;
558
559 list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
560 if (lkb->lkb_id == lkid)
561 return lkb;
562 }
563 return NULL;
564}
565
566static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
567{
568 struct dlm_lkb *lkb;
569 uint16_t bucket = lkid & 0xFFFF;
570
571 if (bucket >= ls->ls_lkbtbl_size)
572 return -EBADSLT;
573
574 read_lock(&ls->ls_lkbtbl[bucket].lock);
575 lkb = __find_lkb(ls, lkid);
576 if (lkb)
577 kref_get(&lkb->lkb_ref);
578 read_unlock(&ls->ls_lkbtbl[bucket].lock);
579
580 *lkb_ret = lkb;
581 return lkb ? 0 : -ENOENT;
582}
583
584static void kill_lkb(struct kref *kref)
585{
586 struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
587
588 /* All work is done after the return from kref_put() so we
589 can release the write_lock before the detach_lkb */
590
591 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
592}
593
594/* __put_lkb() is used when an lkb may not have an rsb attached to
595 it so we need to provide the lockspace explicitly */
596
597static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
598{
599 uint16_t bucket = lkb->lkb_id & 0xFFFF;
600
601 write_lock(&ls->ls_lkbtbl[bucket].lock);
602 if (kref_put(&lkb->lkb_ref, kill_lkb)) {
603 list_del(&lkb->lkb_idtbl_list);
604 write_unlock(&ls->ls_lkbtbl[bucket].lock);
605
606 detach_lkb(lkb);
607
608 /* for local/process lkbs, lvbptr points to caller's lksb */
609 if (lkb->lkb_lvbptr && is_master_copy(lkb))
610 free_lvb(lkb->lkb_lvbptr);
611 free_lkb(lkb);
612 return 1;
613 } else {
614 write_unlock(&ls->ls_lkbtbl[bucket].lock);
615 return 0;
616 }
617}
618
619int dlm_put_lkb(struct dlm_lkb *lkb)
620{
621 struct dlm_ls *ls;
622
623 DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
624 DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
625
626 ls = lkb->lkb_resource->res_ls;
627 return __put_lkb(ls, lkb);
628}
629
630/* This is only called to add a reference when the code already holds
631 a valid reference to the lkb, so there's no need for locking. */
632
633static inline void hold_lkb(struct dlm_lkb *lkb)
634{
635 kref_get(&lkb->lkb_ref);
636}
637
638/* This is called when we need to remove a reference and are certain
639 it's not the last ref. e.g. del_lkb is always called between a
640 find_lkb/put_lkb and is always the inverse of a previous add_lkb.
641 put_lkb would work fine, but would involve unnecessary locking */
642
643static inline void unhold_lkb(struct dlm_lkb *lkb)
644{
645 int rv;
646 rv = kref_put(&lkb->lkb_ref, kill_lkb);
647 DLM_ASSERT(!rv, dlm_print_lkb(lkb););
648}
649
650static void lkb_add_ordered(struct list_head *new, struct list_head *head,
651 int mode)
652{
653 struct dlm_lkb *lkb = NULL;
654
655 list_for_each_entry(lkb, head, lkb_statequeue)
656 if (lkb->lkb_rqmode < mode)
657 break;
658
659 if (!lkb)
660 list_add_tail(new, head);
661 else
662 __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
663}
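
/* Added note: lkb_add_ordered() inserts the new entry in front of the
   first lkb whose rqmode is below the given mode.  If no such entry
   exists, the list_for_each_entry() iterator finishes pointing at the
   list head container rather than NULL, so the !lkb branch is never
   taken and the __list_add() call degenerates to a tail append. */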
664
665/* add/remove lkb to rsb's grant/convert/wait queue */
666
667static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
668{
669 kref_get(&lkb->lkb_ref);
670
671 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
672
673 lkb->lkb_status = status;
674
675 switch (status) {
676 case DLM_LKSTS_WAITING:
677 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
678 list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
679 else
680 list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
681 break;
682 case DLM_LKSTS_GRANTED:
683 /* convention says granted locks kept in order of grmode */
684 lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
685 lkb->lkb_grmode);
686 break;
687 case DLM_LKSTS_CONVERT:
688 if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
689 list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
690 else
691 list_add_tail(&lkb->lkb_statequeue,
692 &r->res_convertqueue);
693 break;
694 default:
695 DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
696 }
697}
698
699static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
700{
701 lkb->lkb_status = 0;
702 list_del(&lkb->lkb_statequeue);
703 unhold_lkb(lkb);
704}
705
706static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
707{
708 hold_lkb(lkb);
709 del_lkb(r, lkb);
710 add_lkb(r, lkb, sts);
711 unhold_lkb(lkb);
712}
713
714/* add/remove lkb from global waiters list of lkb's waiting for
715 a reply from a remote node */
716
717static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
718{
719 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
720
721 mutex_lock(&ls->ls_waiters_mutex);
722 if (lkb->lkb_wait_type) {
723 log_print("add_to_waiters error %d", lkb->lkb_wait_type);
724 goto out;
725 }
726 lkb->lkb_wait_type = mstype;
727 kref_get(&lkb->lkb_ref);
728 list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
729 out:
730 mutex_unlock(&ls->ls_waiters_mutex);
731}
732
733static int _remove_from_waiters(struct dlm_lkb *lkb)
734{
735 int error = 0;
736
737 if (!lkb->lkb_wait_type) {
738 log_print("remove_from_waiters error");
739 error = -EINVAL;
740 goto out;
741 }
742 lkb->lkb_wait_type = 0;
743 list_del(&lkb->lkb_wait_reply);
744 unhold_lkb(lkb);
745 out:
746 return error;
747}
748
749static int remove_from_waiters(struct dlm_lkb *lkb)
750{
751 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
752 int error;
753
754 mutex_lock(&ls->ls_waiters_mutex);
755 error = _remove_from_waiters(lkb);
756 mutex_unlock(&ls->ls_waiters_mutex);
757 return error;
758}
759
760static void dir_remove(struct dlm_rsb *r)
761{
762 int to_nodeid;
763
764 if (dlm_no_directory(r->res_ls))
765 return;
766
767 to_nodeid = dlm_dir_nodeid(r);
768 if (to_nodeid != dlm_our_nodeid())
769 send_remove(r);
770 else
771 dlm_dir_remove_entry(r->res_ls, to_nodeid,
772 r->res_name, r->res_length);
773}
774
775/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
776 found since they are in order of newest to oldest? */
777
778static int shrink_bucket(struct dlm_ls *ls, int b)
779{
780 struct dlm_rsb *r;
781 int count = 0, found;
782
783 for (;;) {
784 found = 0;
785 write_lock(&ls->ls_rsbtbl[b].lock);
786 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
787 res_hashchain) {
788 if (!time_after_eq(jiffies, r->res_toss_time +
789 dlm_config.toss_secs * HZ))
790 continue;
791 found = 1;
792 break;
793 }
794
795 if (!found) {
796 write_unlock(&ls->ls_rsbtbl[b].lock);
797 break;
798 }
799
800 if (kref_put(&r->res_ref, kill_rsb)) {
801 list_del(&r->res_hashchain);
802 write_unlock(&ls->ls_rsbtbl[b].lock);
803
804 if (is_master(r))
805 dir_remove(r);
806 free_rsb(r);
807 count++;
808 } else {
809 write_unlock(&ls->ls_rsbtbl[b].lock);
810 log_error(ls, "tossed rsb in use %s", r->res_name);
811 }
812 }
813
814 return count;
815}
816
817void dlm_scan_rsbs(struct dlm_ls *ls)
818{
819 int i;
820
821 if (dlm_locking_stopped(ls))
822 return;
823
824 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
825 shrink_bucket(ls, i);
826 cond_resched();
827 }
828}
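
/* Added note: dlm_scan_rsbs() is meant to be driven periodically so
   that tossed rsbs older than toss_secs get freed.  A minimal sketch
   of such a scanning loop (see dlm_scand in lockspace.c for the real
   one):

	while (!kthread_should_stop()) {
		dlm_scan_rsbs(ls);
		schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
	}
*/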
829
830/* lkb is master or local copy */
831
832static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
833{
834 int b, len = r->res_ls->ls_lvblen;
835
836 /* b=1 lvb returned to caller
837 b=0 lvb written to rsb or invalidated
838 b=-1 do nothing */
839
840 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
841
842 if (b == 1) {
843 if (!lkb->lkb_lvbptr)
844 return;
845
846 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
847 return;
848
849 if (!r->res_lvbptr)
850 return;
851
852 memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
853 lkb->lkb_lvbseq = r->res_lvbseq;
854
855 } else if (b == 0) {
856 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
857 rsb_set_flag(r, RSB_VALNOTVALID);
858 return;
859 }
860
861 if (!lkb->lkb_lvbptr)
862 return;
863
864 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
865 return;
866
867 if (!r->res_lvbptr)
868 r->res_lvbptr = allocate_lvb(r->res_ls);
869
870 if (!r->res_lvbptr)
871 return;
872
873 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
874 r->res_lvbseq++;
875 lkb->lkb_lvbseq = r->res_lvbseq;
876 rsb_clear_flag(r, RSB_VALNOTVALID);
877 }
878
879 if (rsb_flag(r, RSB_VALNOTVALID))
880 lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
881}
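
/* Added note: dlm_lvb_operations (lvb_table.h) is indexed with +1 so
   that DLM_LOCK_IV (-1) maps to row/column 0.  Roughly, acquiring a
   lock or converting upward returns the rsb's lvb to the caller
   (b == 1), while converting downward from PW or EX writes the
   caller's lvb into the rsb (b == 0); see the table for the exact
   cases. */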
882
883static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
884{
885 if (lkb->lkb_grmode < DLM_LOCK_PW)
886 return;
887
888 if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
889 rsb_set_flag(r, RSB_VALNOTVALID);
890 return;
891 }
892
893 if (!lkb->lkb_lvbptr)
894 return;
895
896 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
897 return;
898
899 if (!r->res_lvbptr)
900 r->res_lvbptr = allocate_lvb(r->res_ls);
901
902 if (!r->res_lvbptr)
903 return;
904
905 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
906 r->res_lvbseq++;
907 rsb_clear_flag(r, RSB_VALNOTVALID);
908}
909
910/* lkb is process copy (pc) */
911
912static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
913 struct dlm_message *ms)
914{
915 int b;
916
917 if (!lkb->lkb_lvbptr)
918 return;
919
920 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
921 return;
922
923 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
924 if (b == 1) {
925 int len = receive_extralen(ms);
926 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
927 lkb->lkb_lvbseq = ms->m_lvbseq;
928 }
929}
930
931/* Manipulate lkb's on rsb's convert/granted/waiting queues
932 remove_lock -- used for unlock, removes lkb from granted
933 revert_lock -- used for cancel, moves lkb from convert to granted
934 grant_lock -- used for request and convert, adds lkb to granted or
935 moves lkb from convert or waiting to granted
936
937 Each of these is used for master or local copy lkb's. There is
938 also a _pc() variation used to make the corresponding change on
939 a process copy (pc) lkb. */
940
941static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
942{
943 del_lkb(r, lkb);
944 lkb->lkb_grmode = DLM_LOCK_IV;
945 /* this unhold undoes the original ref from create_lkb()
946 so this leads to the lkb being freed */
947 unhold_lkb(lkb);
948}
949
950static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
951{
952 set_lvb_unlock(r, lkb);
953 _remove_lock(r, lkb);
954}
955
956static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
957{
958 _remove_lock(r, lkb);
959}
960
961static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
962{
963 lkb->lkb_rqmode = DLM_LOCK_IV;
964
965 switch (lkb->lkb_status) {
966 case DLM_LKSTS_CONVERT:
967 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
968 break;
969 case DLM_LKSTS_WAITING:
970 del_lkb(r, lkb);
971 lkb->lkb_grmode = DLM_LOCK_IV;
972 /* this unhold undoes the original ref from create_lkb()
973 so this leads to the lkb being freed */
974 unhold_lkb(lkb);
975 break;
976 default:
977 log_print("invalid status for revert %d", lkb->lkb_status);
978 }
979}
980
981static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
982{
983 revert_lock(r, lkb);
984}
985
986static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
987{
988 if (lkb->lkb_grmode != lkb->lkb_rqmode) {
989 lkb->lkb_grmode = lkb->lkb_rqmode;
990 if (lkb->lkb_status)
991 move_lkb(r, lkb, DLM_LKSTS_GRANTED);
992 else
993 add_lkb(r, lkb, DLM_LKSTS_GRANTED);
994 }
995
996 lkb->lkb_rqmode = DLM_LOCK_IV;
997}
998
999static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1000{
1001 set_lvb_lock(r, lkb);
1002 _grant_lock(r, lkb);
1003 lkb->lkb_highbast = 0;
1004}
1005
1006static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1007 struct dlm_message *ms)
1008{
1009 set_lvb_lock_pc(r, lkb, ms);
1010 _grant_lock(r, lkb);
1011}
1012
1013/* called by grant_pending_locks(); if the lkb belongs to a remote node,
1014 an async grant message must be sent to the requesting node in addition
1015 to granting the lock. */
1016
1017static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
1018{
1019 grant_lock(r, lkb);
1020 if (is_master_copy(lkb))
1021 send_grant(r, lkb);
1022 else
1023 queue_cast(r, lkb, 0);
1024}
1025
1026static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
1027{
1028 struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
1029 lkb_statequeue);
1030 if (lkb->lkb_id == first->lkb_id)
1031 return 1;
1032
1033 return 0;
1034}
1035
1036/* Check if the given lkb conflicts with another lkb on the queue. */
1037
1038static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
1039{
1040 struct dlm_lkb *this;
1041
1042 list_for_each_entry(this, head, lkb_statequeue) {
1043 if (this == lkb)
1044 continue;
1045 if (!modes_compat(this, lkb))
1046 return 1;
1047 }
1048 return 0;
1049}
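
/* Added note: modes_compat() consults __dlm_compat_matrix, the
   standard VMS compatibility table: NL is compatible with everything,
   CR with everything but EX, PR with NL/CR/PR, CW with NL/CR/CW, PW
   with NL/CR, and EX only with NL.  So a PR lkb on the grant queue
   makes queue_conflict() return 1 for a CW, PW or EX request. */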
1050
1051/*
1052 * "A conversion deadlock arises with a pair of lock requests in the converting
1053 * queue for one resource. The granted mode of each lock blocks the requested
1054 * mode of the other lock."
1055 *
1056 * Part 2: if the granted mode of lkb is preventing the first lkb in the
1057 * convert queue from being granted, then demote lkb (set grmode to NL).
1058 * This second form requires that we check for conv-deadlk even when
1059 * now == 0 in _can_be_granted().
1060 *
1061 * Example:
1062 * Granted Queue: empty
1063 * Convert Queue: NL->EX (first lock)
1064 * PR->EX (second lock)
1065 *
1066 * The first lock can't be granted because of the granted mode of the second
1067 * lock and the second lock can't be granted because it's not first in the
1068 * list. We demote the granted mode of the second lock (the lkb passed to this
1069 * function).
1070 *
1071 * After the resolution, the "grant pending" function needs to go back and try
1072 * to grant locks on the convert queue again since the first lock can now be
1073 * granted.
1074 */
1075
1076static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
1077{
1078 struct dlm_lkb *this, *first = NULL, *self = NULL;
1079
1080 list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
1081 if (!first)
1082 first = this;
1083 if (this == lkb) {
1084 self = lkb;
1085 continue;
1086 }
1087
1088 if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
1089 return 1;
1090 }
1091
1092 /* if lkb is on the convert queue and is preventing the first
1093 from being granted, then there's deadlock and we demote lkb.
1094 multiple converting locks may need to do this before the first
1095 converting lock can be granted. */
1096
1097 if (self && self != first) {
1098 if (!modes_compat(lkb, first) &&
1099 !queue_conflict(&rsb->res_grantqueue, first))
1100 return 1;
1101 }
1102
1103 return 0;
1104}
1105
1106/*
1107 * Return 1 if the lock can be granted, 0 otherwise.
1108 * Also detect and resolve conversion deadlocks.
1109 *
1110 * lkb is the lock to be granted
1111 *
1112 * now is 1 if the function is being called in the context of the
1113 * immediate request, it is 0 if called later, after the lock has been
1114 * queued.
1115 *
1116 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
1117 */
1118
1119static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1120{
1121 int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
1122
1123 /*
1124 * 6-10: Version 5.4 introduced an option to address the phenomenon of
1125 * a new request for a NL mode lock being blocked.
1126 *
1127 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
1128 * request, then it would be granted. In essence, the use of this flag
1129 * tells the Lock Manager to expedite this request by not considering
1130 * what may be in the CONVERTING or WAITING queues... As of this
1131 * writing, the EXPEDITE flag can be used only with new requests for NL
1132 * mode locks. This flag is not valid for conversion requests.
1133 *
1134 * A shortcut. Earlier checks return an error if EXPEDITE is used in a
1135 * conversion or used with a non-NL requested mode. We also know an
1136 * EXPEDITE request is always granted immediately, so now must always
1137 * be 1. The full condition to grant an expedite request: (now &&
1138 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
1139 * therefore be shortened to just checking the flag.
1140 */
1141
1142 if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
1143 return 1;
1144
1145 /*
1146 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
1147 * added to the remaining conditions.
1148 */
1149
1150 if (queue_conflict(&r->res_grantqueue, lkb))
1151 goto out;
1152
1153 /*
1154 * 6-3: By default, a conversion request is immediately granted if the
1155 * requested mode is compatible with the modes of all other granted
1156 * locks
1157 */
1158
1159 if (queue_conflict(&r->res_convertqueue, lkb))
1160 goto out;
1161
1162 /*
1163 * 6-5: But the default algorithm for deciding whether to grant or
1164 * queue conversion requests does not by itself guarantee that such
1165 * requests are serviced on a "first come first serve" basis. This, in
1166 * turn, can lead to a phenomenon known as "indefinite postponement".
1167 *
1168 * 6-7: This issue is dealt with by using the optional QUECVT flag with
1169 * the system service employed to request a lock conversion. This flag
1170 * forces certain conversion requests to be queued, even if they are
1171 * compatible with the granted modes of other locks on the same
1172 * resource. Thus, the use of this flag results in conversion requests
1173 * being ordered on a "first come first serve" basis.
1174 *
1175 * DCT: This condition is all about new conversions being able to occur
1176 * "in place" while the lock remains on the granted queue (assuming
1177 * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
1178 * doesn't _have_ to go onto the convert queue where it's processed in
1179 * order. The "now" variable is necessary to distinguish converts
1180 * being received and processed for the first time now, because once a
1181 * convert is moved to the conversion queue the condition below applies
1182 * requiring fifo granting.
1183 */
1184
1185 if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
1186 return 1;
1187
1188 /*
1189 * The NOORDER flag is set to avoid the standard vms rules on grant
1190 * order.
1191 */
1192
1193 if (lkb->lkb_exflags & DLM_LKF_NOORDER)
1194 return 1;
1195
1196 /*
1197 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
1198 * granted until all other conversion requests ahead of it are granted
1199 * and/or canceled.
1200 */
1201
1202 if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
1203 return 1;
1204
1205 /*
1206 * 6-4: By default, a new request is immediately granted only if all
1207 * three of the following conditions are satisfied when the request is
1208 * issued:
1209 * - The queue of ungranted conversion requests for the resource is
1210 * empty.
1211 * - The queue of ungranted new requests for the resource is empty.
1212 * - The mode of the new request is compatible with the most
1213 * restrictive mode of all granted locks on the resource.
1214 */
1215
1216 if (now && !conv && list_empty(&r->res_convertqueue) &&
1217 list_empty(&r->res_waitqueue))
1218 return 1;
1219
1220 /*
1221 * 6-4: Once a lock request is in the queue of ungranted new requests,
1222 * it cannot be granted until the queue of ungranted conversion
1223 * requests is empty, all ungranted new requests ahead of it are
1224 * granted and/or canceled, and it is compatible with the granted mode
1225 * of the most restrictive lock granted on the resource.
1226 */
1227
1228 if (!now && !conv && list_empty(&r->res_convertqueue) &&
1229 first_in_list(lkb, &r->res_waitqueue))
1230 return 1;
1231
1232 out:
1233 /*
1234 * The following, enabled by CONVDEADLK, departs from VMS.
1235 */
1236
1237 if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
1238 conversion_deadlock_detect(r, lkb)) {
1239 lkb->lkb_grmode = DLM_LOCK_NL;
1240 lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
1241 }
1242
1243 return 0;
1244}
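
/* Added example: with a single PR lock on the grant queue and empty
   convert/wait queues, a new PR request (now == 1, conv == 0) passes
   the 6-4 test above and is granted; a new EX request hits the first
   queue_conflict() check, falls through to "out", and will be queued
   by do_request() (or fail with -EAGAIN under DLM_LKF_NOQUEUE). */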
1245
1246/*
1247 * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
1248 * simple way to provide a big optimization to applications that can use them.
1249 */
1250
1251static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
1252{
1253 uint32_t flags = lkb->lkb_exflags;
1254 int rv;
1255 int8_t alt = 0, rqmode = lkb->lkb_rqmode;
1256
1257 rv = _can_be_granted(r, lkb, now);
1258 if (rv)
1259 goto out;
1260
1261 if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
1262 goto out;
1263
1264 if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
1265 alt = DLM_LOCK_PR;
1266 else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
1267 alt = DLM_LOCK_CW;
1268
1269 if (alt) {
1270 lkb->lkb_rqmode = alt;
1271 rv = _can_be_granted(r, lkb, now);
1272 if (rv)
1273 lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
1274 else
1275 lkb->lkb_rqmode = rqmode;
1276 }
1277 out:
1278 return rv;
1279}
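
/* Added example: a CW request made with DLM_LKF_ALTPR that conflicts
   as CW but is grantable as PR gets granted in PR mode, and the
   caller finds DLM_SBF_ALTMODE in the lock status block's flags. */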
1280
1281static int grant_pending_convert(struct dlm_rsb *r, int high)
1282{
1283 struct dlm_lkb *lkb, *s;
1284 int hi, demoted, quit, grant_restart, demote_restart;
1285
1286 quit = 0;
1287 restart:
1288 grant_restart = 0;
1289 demote_restart = 0;
1290 hi = DLM_LOCK_IV;
1291
1292 list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
1293 demoted = is_demoted(lkb);
1294 if (can_be_granted(r, lkb, 0)) {
1295 grant_lock_pending(r, lkb);
1296 grant_restart = 1;
1297 } else {
1298 hi = max_t(int, lkb->lkb_rqmode, hi);
1299 if (!demoted && is_demoted(lkb))
1300 demote_restart = 1;
1301 }
1302 }
1303
1304 if (grant_restart)
1305 goto restart;
1306 if (demote_restart && !quit) {
1307 quit = 1;
1308 goto restart;
1309 }
1310
1311 return max_t(int, high, hi);
1312}
1313
1314static int grant_pending_wait(struct dlm_rsb *r, int high)
1315{
1316 struct dlm_lkb *lkb, *s;
1317
1318 list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
1319 if (can_be_granted(r, lkb, 0))
1320 grant_lock_pending(r, lkb);
1321 else
1322 high = max_t(int, lkb->lkb_rqmode, high);
1323 }
1324
1325 return high;
1326}
1327
1328static void grant_pending_locks(struct dlm_rsb *r)
1329{
1330 struct dlm_lkb *lkb, *s;
1331 int high = DLM_LOCK_IV;
1332
1333 DLM_ASSERT(is_master(r), dlm_print_rsb(r););
1334
1335 high = grant_pending_convert(r, high);
1336 high = grant_pending_wait(r, high);
1337
1338 if (high == DLM_LOCK_IV)
1339 return;
1340
1341 /*
1342 * If there are locks left on the wait/convert queue then send blocking
1343 * ASTs to granted locks based on the largest requested mode (high)
1344 * found above. FIXME: highbast < high comparison not valid for PR/CW.
1345 */
1346
1347 list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
1348 if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
1349 !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
1350 queue_bast(r, lkb, high);
1351 lkb->lkb_highbast = high;
1352 }
1353 }
1354}
1355
1356static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
1357 struct dlm_lkb *lkb)
1358{
1359 struct dlm_lkb *gr;
1360
1361 list_for_each_entry(gr, head, lkb_statequeue) {
1362 if (gr->lkb_bastaddr &&
1363 gr->lkb_highbast < lkb->lkb_rqmode &&
1364 !modes_compat(gr, lkb)) {
1365 queue_bast(r, gr, lkb->lkb_rqmode);
1366 gr->lkb_highbast = lkb->lkb_rqmode;
1367 }
1368 }
1369}
1370
1371static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
1372{
1373 send_bast_queue(r, &r->res_grantqueue, lkb);
1374}
1375
1376static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
1377{
1378 send_bast_queue(r, &r->res_grantqueue, lkb);
1379 send_bast_queue(r, &r->res_convertqueue, lkb);
1380}
1381
1382/* set_master(r, lkb) -- set the master nodeid of a resource
1383
1384 The purpose of this function is to set the nodeid field in the given
1385 lkb using the nodeid field in the given rsb. If the rsb's nodeid is
1386 known, it can just be copied to the lkb and the function will return
1387 0. If the rsb's nodeid is _not_ known, it needs to be looked up
1388 before it can be copied to the lkb.
1389
1390 When the rsb nodeid is being looked up remotely, the initial lkb
1391 causing the lookup is kept on the ls_waiters list waiting for the
1392 lookup reply. Other lkb's waiting for the same rsb lookup are kept
1393 on the rsb's res_lookup list until the master is verified.
1394
1395 Return values:
1396 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
1397 1: the rsb master is not available and the lkb has been placed on
1398 a wait queue
1399*/
1400
1401static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
1402{
1403 struct dlm_ls *ls = r->res_ls;
1404 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
1405
1406 if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
1407 rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
1408 r->res_first_lkid = lkb->lkb_id;
1409 lkb->lkb_nodeid = r->res_nodeid;
1410 return 0;
1411 }
1412
1413 if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
1414 list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
1415 return 1;
1416 }
1417
1418 if (r->res_nodeid == 0) {
1419 lkb->lkb_nodeid = 0;
1420 return 0;
1421 }
1422
1423 if (r->res_nodeid > 0) {
1424 lkb->lkb_nodeid = r->res_nodeid;
1425 return 0;
1426 }
1427
1428 DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r););
1429
1430 dir_nodeid = dlm_dir_nodeid(r);
1431
1432 if (dir_nodeid != our_nodeid) {
1433 r->res_first_lkid = lkb->lkb_id;
1434 send_lookup(r, lkb);
1435 return 1;
1436 }
1437
1438 for (;;) {
1439		/* It's possible for dlm_scand to remove an old rsb for
1440		   this same resource from the toss list, for us to create
1441		   a new one, look up the master locally, and find that it
1442		   already exists just before dlm_scand does the
1443		   dir_remove() on the previous rsb. */
1444
1445 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
1446 r->res_length, &ret_nodeid);
1447 if (!error)
1448 break;
1449 log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
1450 schedule();
1451 }
1452
1453 if (ret_nodeid == our_nodeid) {
1454 r->res_first_lkid = 0;
1455 r->res_nodeid = 0;
1456 lkb->lkb_nodeid = 0;
1457 } else {
1458 r->res_first_lkid = lkb->lkb_id;
1459 r->res_nodeid = ret_nodeid;
1460 lkb->lkb_nodeid = ret_nodeid;
1461 }
1462 return 0;
1463}
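
/* Added note: callers consume set_master()'s return values as
   _request_lock() below does: a negative value is a real error; a
   positive value means the lkb was parked on ls_waiters or on
   res_lookup and the request will be restarted when the lookup reply
   arrives; zero means the nodeid is known and the caller proceeds
   with do_request() locally or send_request() remotely. */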
1464
1465static void process_lookup_list(struct dlm_rsb *r)
1466{
1467 struct dlm_lkb *lkb, *safe;
1468
1469 list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
1470 list_del(&lkb->lkb_rsb_lookup);
1471 _request_lock(r, lkb);
1472 schedule();
1473 }
1474}
1475
1476/* confirm_master -- confirm (or deny) an rsb's master nodeid */
1477
1478static void confirm_master(struct dlm_rsb *r, int error)
1479{
1480 struct dlm_lkb *lkb;
1481
1482 if (!r->res_first_lkid)
1483 return;
1484
1485 switch (error) {
1486 case 0:
1487 case -EINPROGRESS:
1488 r->res_first_lkid = 0;
1489 process_lookup_list(r);
1490 break;
1491
1492 case -EAGAIN:
1493 /* the remote master didn't queue our NOQUEUE request;
1494 make a waiting lkb the first_lkid */
1495
1496 r->res_first_lkid = 0;
1497
1498 if (!list_empty(&r->res_lookup)) {
1499 lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
1500 lkb_rsb_lookup);
1501 list_del(&lkb->lkb_rsb_lookup);
1502 r->res_first_lkid = lkb->lkb_id;
1503 _request_lock(r, lkb);
1504 } else
1505 r->res_nodeid = -1;
1506 break;
1507
1508 default:
1509 log_error(r->res_ls, "confirm_master unknown error %d", error);
1510 }
1511}
1512
1513static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
1514 int namelen, uint32_t parent_lkid, void *ast,
1515 void *astarg, void *bast, struct dlm_args *args)
1516{
1517 int rv = -EINVAL;
1518
1519 /* check for invalid arg usage */
1520
1521 if (mode < 0 || mode > DLM_LOCK_EX)
1522 goto out;
1523
1524 if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
1525 goto out;
1526
1527 if (flags & DLM_LKF_CANCEL)
1528 goto out;
1529
1530 if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
1531 goto out;
1532
1533 if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
1534 goto out;
1535
1536 if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
1537 goto out;
1538
1539 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
1540 goto out;
1541
1542 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
1543 goto out;
1544
1545 if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
1546 goto out;
1547
1548 if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
1549 goto out;
1550
1551 if (!ast || !lksb)
1552 goto out;
1553
1554 if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
1555 goto out;
1556
1557 /* parent/child locks not yet supported */
1558 if (parent_lkid)
1559 goto out;
1560
1561 if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
1562 goto out;
1563
1564	/* these args will be copied to the lkb in validate_lock_args;
1565	   this cannot be done now because, when converting locks, fields in
1566	   an active lkb cannot be modified before locking the rsb */
1567
1568 args->flags = flags;
1569 args->astaddr = ast;
1570 args->astparam = (long) astarg;
1571 args->bastaddr = bast;
1572 args->mode = mode;
1573 args->lksb = lksb;
1574 rv = 0;
1575 out:
1576 return rv;
1577}
1578
1579static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
1580{
1581 if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
1582 DLM_LKF_FORCEUNLOCK))
1583 return -EINVAL;
1584
1585 args->flags = flags;
1586 args->astparam = (long) astarg;
1587 return 0;
1588}
1589
1590static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
1591 struct dlm_args *args)
1592{
1593 int rv = -EINVAL;
1594
1595 if (args->flags & DLM_LKF_CONVERT) {
1596 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1597 goto out;
1598
1599 if (args->flags & DLM_LKF_QUECVT &&
1600 !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
1601 goto out;
1602
1603 rv = -EBUSY;
1604 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
1605 goto out;
1606
1607 if (lkb->lkb_wait_type)
1608 goto out;
1609 }
1610
1611 lkb->lkb_exflags = args->flags;
1612 lkb->lkb_sbflags = 0;
1613 lkb->lkb_astaddr = args->astaddr;
1614 lkb->lkb_astparam = args->astparam;
1615 lkb->lkb_bastaddr = args->bastaddr;
1616 lkb->lkb_rqmode = args->mode;
1617 lkb->lkb_lksb = args->lksb;
1618 lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
1619 lkb->lkb_ownpid = (int) current->pid;
1620 rv = 0;
1621 out:
1622 return rv;
1623}
1624
1625static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
1626{
1627 int rv = -EINVAL;
1628
1629 if (lkb->lkb_flags & DLM_IFL_MSTCPY)
1630 goto out;
1631
1632 if (args->flags & DLM_LKF_FORCEUNLOCK)
1633 goto out_ok;
1634
1635 if (args->flags & DLM_LKF_CANCEL &&
1636 lkb->lkb_status == DLM_LKSTS_GRANTED)
1637 goto out;
1638
1639 if (!(args->flags & DLM_LKF_CANCEL) &&
1640 lkb->lkb_status != DLM_LKSTS_GRANTED)
1641 goto out;
1642
1643 rv = -EBUSY;
1644 if (lkb->lkb_wait_type)
1645 goto out;
1646
1647 out_ok:
1648 lkb->lkb_exflags = args->flags;
1649 lkb->lkb_sbflags = 0;
1650 lkb->lkb_astparam = args->astparam;
1651
1652 rv = 0;
1653 out:
1654 return rv;
1655}
1656
1657/*
1658 * Four stage 4 varieties:
1659 * do_request(), do_convert(), do_unlock(), do_cancel()
1660 * These are called on the master node for the given lock and
1661 * from the central locking logic.
1662 */
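
/* Added overview: the full path of a new request through the four
   stages, assuming the master is remote:

	dlm_lock()                        stage 1, API entry
	 -> request_lock()                stage 2, find and lock the rsb
	  -> _request_lock()              stage 3, resolve the master
	   -> send_request()              stage 4 runs on the master:
	       receive_request() -> do_request() -> send_request_reply()

   with a local master, _request_lock() calls do_request() directly. */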
1663
1664static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
1665{
1666 int error = 0;
1667
1668 if (can_be_granted(r, lkb, 1)) {
1669 grant_lock(r, lkb);
1670 queue_cast(r, lkb, 0);
1671 goto out;
1672 }
1673
1674 if (can_be_queued(lkb)) {
1675 error = -EINPROGRESS;
1676 add_lkb(r, lkb, DLM_LKSTS_WAITING);
1677 send_blocking_asts(r, lkb);
1678 goto out;
1679 }
1680
1681 error = -EAGAIN;
1682 if (force_blocking_asts(lkb))
1683 send_blocking_asts_all(r, lkb);
1684 queue_cast(r, lkb, -EAGAIN);
1685
1686 out:
1687 return error;
1688}
1689
1690static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
1691{
1692 int error = 0;
1693
1694 /* changing an existing lock may allow others to be granted */
1695
1696 if (can_be_granted(r, lkb, 1)) {
1697 grant_lock(r, lkb);
1698 queue_cast(r, lkb, 0);
1699 grant_pending_locks(r);
1700 goto out;
1701 }
1702
1703 if (can_be_queued(lkb)) {
1704 if (is_demoted(lkb))
1705 grant_pending_locks(r);
1706 error = -EINPROGRESS;
1707 del_lkb(r, lkb);
1708 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
1709 send_blocking_asts(r, lkb);
1710 goto out;
1711 }
1712
1713 error = -EAGAIN;
1714 if (force_blocking_asts(lkb))
1715 send_blocking_asts_all(r, lkb);
1716 queue_cast(r, lkb, -EAGAIN);
1717
1718 out:
1719 return error;
1720}
1721
1722static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1723{
1724 remove_lock(r, lkb);
1725 queue_cast(r, lkb, -DLM_EUNLOCK);
1726 grant_pending_locks(r);
1727 return -DLM_EUNLOCK;
1728}
1729
1730static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
1731{
1732 revert_lock(r, lkb);
1733 queue_cast(r, lkb, -DLM_ECANCEL);
1734 grant_pending_locks(r);
1735 return -DLM_ECANCEL;
1736}
1737
1738/*
1739 * Four stage 3 varieties:
1740 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
1741 */
1742
1743/* add a new lkb to a possibly new rsb, called by requesting process */
1744
1745static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1746{
1747 int error;
1748
1749 /* set_master: sets lkb nodeid from r */
1750
1751 error = set_master(r, lkb);
1752 if (error < 0)
1753 goto out;
1754 if (error) {
1755 error = 0;
1756 goto out;
1757 }
1758
1759 if (is_remote(r))
1760 /* receive_request() calls do_request() on remote node */
1761 error = send_request(r, lkb);
1762 else
1763 error = do_request(r, lkb);
1764 out:
1765 return error;
1766}
1767
1768/* change some property of an existing lkb, e.g. mode */
1769
1770static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1771{
1772 int error;
1773
1774 if (is_remote(r))
1775 /* receive_convert() calls do_convert() on remote node */
1776 error = send_convert(r, lkb);
1777 else
1778 error = do_convert(r, lkb);
1779
1780 return error;
1781}
1782
1783/* remove an existing lkb from the granted queue */
1784
1785static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1786{
1787 int error;
1788
1789 if (is_remote(r))
1790 /* receive_unlock() calls do_unlock() on remote node */
1791 error = send_unlock(r, lkb);
1792 else
1793 error = do_unlock(r, lkb);
1794
1795 return error;
1796}
1797
1798/* remove an existing lkb from the convert or wait queue */
1799
1800static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1801{
1802 int error;
1803
1804 if (is_remote(r))
1805 /* receive_cancel() calls do_cancel() on remote node */
1806 error = send_cancel(r, lkb);
1807 else
1808 error = do_cancel(r, lkb);
1809
1810 return error;
1811}
1812
1813/*
1814 * Four stage 2 varieties:
1815 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
1816 */
1817
1818static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
1819 int len, struct dlm_args *args)
1820{
1821 struct dlm_rsb *r;
1822 int error;
1823
1824 error = validate_lock_args(ls, lkb, args);
1825 if (error)
1826 goto out;
1827
1828 error = find_rsb(ls, name, len, R_CREATE, &r);
1829 if (error)
1830 goto out;
1831
1832 lock_rsb(r);
1833
1834 attach_lkb(r, lkb);
1835 lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
1836
1837 error = _request_lock(r, lkb);
1838
1839 unlock_rsb(r);
1840 put_rsb(r);
1841
1842 out:
1843 return error;
1844}
1845
1846static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1847 struct dlm_args *args)
1848{
1849 struct dlm_rsb *r;
1850 int error;
1851
1852 r = lkb->lkb_resource;
1853
1854 hold_rsb(r);
1855 lock_rsb(r);
1856
1857 error = validate_lock_args(ls, lkb, args);
1858 if (error)
1859 goto out;
1860
1861 error = _convert_lock(r, lkb);
1862 out:
1863 unlock_rsb(r);
1864 put_rsb(r);
1865 return error;
1866}
1867
1868static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1869 struct dlm_args *args)
1870{
1871 struct dlm_rsb *r;
1872 int error;
1873
1874 r = lkb->lkb_resource;
1875
1876 hold_rsb(r);
1877 lock_rsb(r);
1878
1879 error = validate_unlock_args(lkb, args);
1880 if (error)
1881 goto out;
1882
1883 error = _unlock_lock(r, lkb);
1884 out:
1885 unlock_rsb(r);
1886 put_rsb(r);
1887 return error;
1888}
1889
1890static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
1891 struct dlm_args *args)
1892{
1893 struct dlm_rsb *r;
1894 int error;
1895
1896 r = lkb->lkb_resource;
1897
1898 hold_rsb(r);
1899 lock_rsb(r);
1900
1901 error = validate_unlock_args(lkb, args);
1902 if (error)
1903 goto out;
1904
1905 error = _cancel_lock(r, lkb);
1906 out:
1907 unlock_rsb(r);
1908 put_rsb(r);
1909 return error;
1910}
1911
1912/*
1913 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
1914 */
1915
1916int dlm_lock(dlm_lockspace_t *lockspace,
1917 int mode,
1918 struct dlm_lksb *lksb,
1919 uint32_t flags,
1920 void *name,
1921 unsigned int namelen,
1922 uint32_t parent_lkid,
1923 void (*ast) (void *astarg),
1924 void *astarg,
1925 void (*bast) (void *astarg, int mode))
1926{
1927 struct dlm_ls *ls;
1928 struct dlm_lkb *lkb;
1929 struct dlm_args args;
1930 int error, convert = flags & DLM_LKF_CONVERT;
1931
1932 ls = dlm_find_lockspace_local(lockspace);
1933 if (!ls)
1934 return -EINVAL;
1935
1936 lock_recovery(ls);
1937
1938 if (convert)
1939 error = find_lkb(ls, lksb->sb_lkid, &lkb);
1940 else
1941 error = create_lkb(ls, &lkb);
1942
1943 if (error)
1944 goto out;
1945
1946 error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
1947 astarg, bast, &args);
1948 if (error)
1949 goto out_put;
1950
1951 if (convert)
1952 error = convert_lock(ls, lkb, &args);
1953 else
1954 error = request_lock(ls, lkb, name, namelen, &args);
1955
1956 if (error == -EINPROGRESS)
1957 error = 0;
1958 out_put:
1959 if (convert || error)
1960 __put_lkb(ls, lkb);
1961 if (error == -EAGAIN)
1962 error = 0;
1963 out:
1964 unlock_recovery(ls);
1965 dlm_put_lockspace(ls);
1966 return error;
1967}
1968
1969int dlm_unlock(dlm_lockspace_t *lockspace,
1970 uint32_t lkid,
1971 uint32_t flags,
1972 struct dlm_lksb *lksb,
1973 void *astarg)
1974{
1975 struct dlm_ls *ls;
1976 struct dlm_lkb *lkb;
1977 struct dlm_args args;
1978 int error;
1979
1980 ls = dlm_find_lockspace_local(lockspace);
1981 if (!ls)
1982 return -EINVAL;
1983
1984 lock_recovery(ls);
1985
1986 error = find_lkb(ls, lkid, &lkb);
1987 if (error)
1988 goto out;
1989
1990 error = set_unlock_args(flags, astarg, &args);
1991 if (error)
1992 goto out_put;
1993
1994 if (flags & DLM_LKF_CANCEL)
1995 error = cancel_lock(ls, lkb, &args);
1996 else
1997 error = unlock_lock(ls, lkb, &args);
1998
1999 if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
2000 error = 0;
2001 out_put:
2002 dlm_put_lkb(lkb);
2003 out:
2004 unlock_recovery(ls);
2005 dlm_put_lockspace(ls);
2006 return error;
2007}
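
/* Illustrative usage sketch (added, not part of the original file):
   taking, converting and dropping a lock.  The callbacks, lockspace
   handle and resource name are hypothetical, and a real caller must
   keep the lksb valid until the final ast has fired.  Both dlm_lock()
   calls complete asynchronously through my_ast(). */

static struct dlm_lksb my_lksb;

static void my_ast(void *astarg)
{
	/* request completed; the result is in my_lksb.sb_status */
}

static void my_bast(void *astarg, int mode)
{
	/* another node wants a lock incompatible with ours at "mode" */
}

static int example_lock_cycle(dlm_lockspace_t *lockspace)
{
	int error;

	/* new EX request on the resource named "mylock" */
	error = dlm_lock(lockspace, DLM_LOCK_EX, &my_lksb, 0,
			 "mylock", 6, 0, my_ast, NULL, my_bast);
	if (error)
		return error;

	/* ... once my_ast() reports success, convert down to NL ... */
	error = dlm_lock(lockspace, DLM_LOCK_NL, &my_lksb,
			 DLM_LKF_CONVERT, NULL, 0, 0,
			 my_ast, NULL, my_bast);
	if (error)
		return error;

	/* ... and finally release it */
	return dlm_unlock(lockspace, my_lksb.sb_lkid, 0, &my_lksb, NULL);
}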
2008
2009/*
2010 * send/receive routines for remote operations and replies
2011 *
2012 * send_args
2013 * send_common
2014 * send_request receive_request
2015 * send_convert receive_convert
2016 * send_unlock receive_unlock
2017 * send_cancel receive_cancel
2018 * send_grant receive_grant
2019 * send_bast receive_bast
2020 * send_lookup receive_lookup
2021 * send_remove receive_remove
2022 *
2023 * send_common_reply
2024 * receive_request_reply send_request_reply
2025 * receive_convert_reply send_convert_reply
2026 * receive_unlock_reply send_unlock_reply
2027 * receive_cancel_reply send_cancel_reply
2028 * receive_lookup_reply send_lookup_reply
2029 */
2030
2031static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
2032 int to_nodeid, int mstype,
2033 struct dlm_message **ms_ret,
2034 struct dlm_mhandle **mh_ret)
2035{
2036 struct dlm_message *ms;
2037 struct dlm_mhandle *mh;
2038 char *mb;
2039 int mb_len = sizeof(struct dlm_message);
2040
2041 switch (mstype) {
2042 case DLM_MSG_REQUEST:
2043 case DLM_MSG_LOOKUP:
2044 case DLM_MSG_REMOVE:
2045 mb_len += r->res_length;
2046 break;
2047 case DLM_MSG_CONVERT:
2048 case DLM_MSG_UNLOCK:
2049 case DLM_MSG_REQUEST_REPLY:
2050 case DLM_MSG_CONVERT_REPLY:
2051 case DLM_MSG_GRANT:
2052 if (lkb && lkb->lkb_lvbptr)
2053 mb_len += r->res_ls->ls_lvblen;
2054 break;
2055 }
2056
2057 /* get_buffer gives us a message handle (mh) that we need to
2058 pass into lowcomms_commit and a message buffer (mb) that we
2059 write our data into */
2060
2061 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
2062 if (!mh)
2063 return -ENOBUFS;
2064
2065 memset(mb, 0, mb_len);
2066
2067 ms = (struct dlm_message *) mb;
2068
2069 ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
2070 ms->m_header.h_lockspace = r->res_ls->ls_global_id;
2071 ms->m_header.h_nodeid = dlm_our_nodeid();
2072 ms->m_header.h_length = mb_len;
2073 ms->m_header.h_cmd = DLM_MSG;
2074
2075 ms->m_type = mstype;
2076
2077 *mh_ret = mh;
2078 *ms_ret = ms;
2079 return 0;
2080}
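
/* Added note: the message buffer is sized as sizeof(struct dlm_message)
   plus res_length for name-carrying messages, or plus ls_lvblen for
   lvb-carrying ones; the extra bytes are written into ms->m_extra by
   send_args()/send_remove() and their length is recovered on the
   receive side by receive_extralen() from m_header.h_length. */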
2081
2082/* further lowcomms enhancements or alternate implementations may make
2083 the return value from this function useful at some point */
2084
2085static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
2086{
2087 dlm_message_out(ms);
2088 dlm_lowcomms_commit_buffer(mh);
2089 return 0;
2090}
2091
2092static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2093 struct dlm_message *ms)
2094{
2095 ms->m_nodeid = lkb->lkb_nodeid;
2096 ms->m_pid = lkb->lkb_ownpid;
2097 ms->m_lkid = lkb->lkb_id;
2098 ms->m_remid = lkb->lkb_remid;
2099 ms->m_exflags = lkb->lkb_exflags;
2100 ms->m_sbflags = lkb->lkb_sbflags;
2101 ms->m_flags = lkb->lkb_flags;
2102 ms->m_lvbseq = lkb->lkb_lvbseq;
2103 ms->m_status = lkb->lkb_status;
2104 ms->m_grmode = lkb->lkb_grmode;
2105 ms->m_rqmode = lkb->lkb_rqmode;
2106 ms->m_hash = r->res_hash;
2107
2108 /* m_result and m_bastmode are set from function args,
2109 not from lkb fields */
2110
2111 if (lkb->lkb_bastaddr)
2112 ms->m_asts |= AST_BAST;
2113 if (lkb->lkb_astaddr)
2114 ms->m_asts |= AST_COMP;
2115
2116 if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
2117 memcpy(ms->m_extra, r->res_name, r->res_length);
2118
2119 else if (lkb->lkb_lvbptr)
2120 memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
2121
2122}
2123
2124static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
2125{
2126 struct dlm_message *ms;
2127 struct dlm_mhandle *mh;
2128 int to_nodeid, error;
2129
2130 add_to_waiters(lkb, mstype);
2131
2132 to_nodeid = r->res_nodeid;
2133
2134 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2135 if (error)
2136 goto fail;
2137
2138 send_args(r, lkb, ms);
2139
2140 error = send_message(mh, ms);
2141 if (error)
2142 goto fail;
2143 return 0;
2144
2145 fail:
2146 remove_from_waiters(lkb);
2147 return error;
2148}
2149
2150static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
2151{
2152 return send_common(r, lkb, DLM_MSG_REQUEST);
2153}
2154
2155static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
2156{
2157 int error;
2158
2159 error = send_common(r, lkb, DLM_MSG_CONVERT);
2160
2161 /* down conversions go without a reply from the master */
2162 if (!error && down_conversion(lkb)) {
2163 remove_from_waiters(lkb);
2164 r->res_ls->ls_stub_ms.m_result = 0;
2165 __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
2166 }
2167
2168 return error;
2169}
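
/* Added note: a down-conversion is always immediately grantable on the
   master, so its result is known in advance; rather than waiting for a
   reply, send_convert() removes the lkb from the waiters list and
   feeds a stub "success" message to __receive_convert_reply().  The
   master side matches this: receive_convert() clears "reply" for
   down-conversions and sends nothing back. */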
2170
2171/* FIXME: if this lkb is the only lock we hold on the rsb, then set
2172 MASTER_UNCERTAIN to force the next request on the rsb to confirm
2173 that the master is still correct. */
2174
2175static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2176{
2177 return send_common(r, lkb, DLM_MSG_UNLOCK);
2178}
2179
2180static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
2181{
2182 return send_common(r, lkb, DLM_MSG_CANCEL);
2183}
2184
2185static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
2186{
2187 struct dlm_message *ms;
2188 struct dlm_mhandle *mh;
2189 int to_nodeid, error;
2190
2191 to_nodeid = lkb->lkb_nodeid;
2192
2193 error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
2194 if (error)
2195 goto out;
2196
2197 send_args(r, lkb, ms);
2198
2199 ms->m_result = 0;
2200
2201 error = send_message(mh, ms);
2202 out:
2203 return error;
2204}
2205
2206static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
2207{
2208 struct dlm_message *ms;
2209 struct dlm_mhandle *mh;
2210 int to_nodeid, error;
2211
2212 to_nodeid = lkb->lkb_nodeid;
2213
2214 error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
2215 if (error)
2216 goto out;
2217
2218 send_args(r, lkb, ms);
2219
2220 ms->m_bastmode = mode;
2221
2222 error = send_message(mh, ms);
2223 out:
2224 return error;
2225}
2226
2227static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
2228{
2229 struct dlm_message *ms;
2230 struct dlm_mhandle *mh;
2231 int to_nodeid, error;
2232
2233 add_to_waiters(lkb, DLM_MSG_LOOKUP);
2234
2235 to_nodeid = dlm_dir_nodeid(r);
2236
2237 error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
2238 if (error)
2239 goto fail;
2240
2241 send_args(r, lkb, ms);
2242
2243 error = send_message(mh, ms);
2244 if (error)
2245 goto fail;
2246 return 0;
2247
2248 fail:
2249 remove_from_waiters(lkb);
2250 return error;
2251}
2252
2253static int send_remove(struct dlm_rsb *r)
2254{
2255 struct dlm_message *ms;
2256 struct dlm_mhandle *mh;
2257 int to_nodeid, error;
2258
2259 to_nodeid = dlm_dir_nodeid(r);
2260
2261 error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
2262 if (error)
2263 goto out;
2264
2265 memcpy(ms->m_extra, r->res_name, r->res_length);
2266 ms->m_hash = r->res_hash;
2267
2268 error = send_message(mh, ms);
2269 out:
2270 return error;
2271}
2272
2273static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2274 int mstype, int rv)
2275{
2276 struct dlm_message *ms;
2277 struct dlm_mhandle *mh;
2278 int to_nodeid, error;
2279
2280 to_nodeid = lkb->lkb_nodeid;
2281
2282 error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
2283 if (error)
2284 goto out;
2285
2286 send_args(r, lkb, ms);
2287
2288 ms->m_result = rv;
2289
2290 error = send_message(mh, ms);
2291 out:
2292 return error;
2293}
2294
2295static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2296{
2297 return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
2298}
2299
2300static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2301{
2302 return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
2303}
2304
2305static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2306{
2307 return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
2308}
2309
2310static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
2311{
2312 return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
2313}
2314
2315static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
2316 int ret_nodeid, int rv)
2317{
2318 struct dlm_rsb *r = &ls->ls_stub_rsb;
2319 struct dlm_message *ms;
2320 struct dlm_mhandle *mh;
2321 int error, nodeid = ms_in->m_header.h_nodeid;
2322
2323 error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
2324 if (error)
2325 goto out;
2326
2327 ms->m_lkid = ms_in->m_lkid;
2328 ms->m_result = rv;
2329 ms->m_nodeid = ret_nodeid;
2330
2331 error = send_message(mh, ms);
2332 out:
2333 return error;
2334}
2335
2336/* which args we save from a received message depends heavily on the type
2337 of message, unlike the send side where we can safely send everything about
2338 the lkb for any type of message */
2339
2340static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
2341{
2342 lkb->lkb_exflags = ms->m_exflags;
2343 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2344 (ms->m_flags & 0x0000FFFF);
2345}
2346
2347static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2348{
2349 lkb->lkb_sbflags = ms->m_sbflags;
2350 lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
2351 (ms->m_flags & 0x0000FFFF);
2352}
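
/* Added note: only the low 16 bits of lkb_flags travel on the wire in
   m_flags; the high 16 bits hold node-local DLM_IFL_ state (e.g.
   DLM_IFL_MSTCPY) and are preserved across receives, hence the
   0xFFFF0000 / 0x0000FFFF split above. */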
2353
2354static int receive_extralen(struct dlm_message *ms)
2355{
2356 return (ms->m_header.h_length - sizeof(struct dlm_message));
2357}
2358
2359static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
2360 struct dlm_message *ms)
2361{
2362 int len;
2363
2364 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
2365 if (!lkb->lkb_lvbptr)
2366 lkb->lkb_lvbptr = allocate_lvb(ls);
2367 if (!lkb->lkb_lvbptr)
2368 return -ENOMEM;
2369 len = receive_extralen(ms);
2370 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2371 }
2372 return 0;
2373}
2374
2375static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2376 struct dlm_message *ms)
2377{
2378 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2379 lkb->lkb_ownpid = ms->m_pid;
2380 lkb->lkb_remid = ms->m_lkid;
2381 lkb->lkb_grmode = DLM_LOCK_IV;
2382 lkb->lkb_rqmode = ms->m_rqmode;
2383 lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
2384 lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
2385
2386 DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
2387
2388 if (receive_lvb(ls, lkb, ms))
2389 return -ENOMEM;
2390
2391 return 0;
2392}
2393
2394static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2395 struct dlm_message *ms)
2396{
2397 if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
2398 log_error(ls, "convert_args nodeid %d %d lkid %x %x",
2399 lkb->lkb_nodeid, ms->m_header.h_nodeid,
2400 lkb->lkb_id, lkb->lkb_remid);
2401 return -EINVAL;
2402 }
2403
2404 if (!is_master_copy(lkb))
2405 return -EINVAL;
2406
2407 if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2408 return -EBUSY;
2409
2410 if (receive_lvb(ls, lkb, ms))
2411 return -ENOMEM;
2412
2413 lkb->lkb_rqmode = ms->m_rqmode;
2414 lkb->lkb_lvbseq = ms->m_lvbseq;
2415
2416 return 0;
2417}
2418
2419static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2420 struct dlm_message *ms)
2421{
2422 if (!is_master_copy(lkb))
2423 return -EINVAL;
2424 if (receive_lvb(ls, lkb, ms))
2425 return -ENOMEM;
2426 return 0;
2427}
2428
2429/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
2430 uses to send a reply and that the remote end uses to process the reply. */
2431
2432static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
2433{
2434 struct dlm_lkb *lkb = &ls->ls_stub_lkb;
2435 lkb->lkb_nodeid = ms->m_header.h_nodeid;
2436 lkb->lkb_remid = ms->m_lkid;
2437}
2438
2439static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
2440{
2441 struct dlm_lkb *lkb;
2442 struct dlm_rsb *r;
2443 int error, namelen;
2444
2445 error = create_lkb(ls, &lkb);
2446 if (error)
2447 goto fail;
2448
2449 receive_flags(lkb, ms);
2450 lkb->lkb_flags |= DLM_IFL_MSTCPY;
2451 error = receive_request_args(ls, lkb, ms);
2452 if (error) {
2453 __put_lkb(ls, lkb);
2454 goto fail;
2455 }
2456
2457 namelen = receive_extralen(ms);
2458
2459 error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
2460 if (error) {
2461 __put_lkb(ls, lkb);
2462 goto fail;
2463 }
2464
2465 lock_rsb(r);
2466
2467 attach_lkb(r, lkb);
2468 error = do_request(r, lkb);
2469 send_request_reply(r, lkb, error);
2470
2471 unlock_rsb(r);
2472 put_rsb(r);
2473
2474 if (error == -EINPROGRESS)
2475 error = 0;
2476 if (error)
2477 dlm_put_lkb(lkb);
2478 return;
2479
2480 fail:
2481 setup_stub_lkb(ls, ms);
2482 send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2483}
2484
2485static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
2486{
2487 struct dlm_lkb *lkb;
2488 struct dlm_rsb *r;
2489 int error, reply = 1;
2490
2491 error = find_lkb(ls, ms->m_remid, &lkb);
2492 if (error)
2493 goto fail;
2494
2495 r = lkb->lkb_resource;
2496
2497 hold_rsb(r);
2498 lock_rsb(r);
2499
2500 receive_flags(lkb, ms);
2501 error = receive_convert_args(ls, lkb, ms);
2502 if (error)
2503 goto out;
2504 reply = !down_conversion(lkb);
2505
2506 error = do_convert(r, lkb);
2507 out:
2508 if (reply)
2509 send_convert_reply(r, lkb, error);
2510
2511 unlock_rsb(r);
2512 put_rsb(r);
2513 dlm_put_lkb(lkb);
2514 return;
2515
2516 fail:
2517 setup_stub_lkb(ls, ms);
2518 send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2519}
2520
2521static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
2522{
2523 struct dlm_lkb *lkb;
2524 struct dlm_rsb *r;
2525 int error;
2526
2527 error = find_lkb(ls, ms->m_remid, &lkb);
2528 if (error)
2529 goto fail;
2530
2531 r = lkb->lkb_resource;
2532
2533 hold_rsb(r);
2534 lock_rsb(r);
2535
2536 receive_flags(lkb, ms);
2537 error = receive_unlock_args(ls, lkb, ms);
2538 if (error)
2539 goto out;
2540
2541 error = do_unlock(r, lkb);
2542 out:
2543 send_unlock_reply(r, lkb, error);
2544
2545 unlock_rsb(r);
2546 put_rsb(r);
2547 dlm_put_lkb(lkb);
2548 return;
2549
2550 fail:
2551 setup_stub_lkb(ls, ms);
2552 send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2553}
2554
2555static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
2556{
2557 struct dlm_lkb *lkb;
2558 struct dlm_rsb *r;
2559 int error;
2560
2561 error = find_lkb(ls, ms->m_remid, &lkb);
2562 if (error)
2563 goto fail;
2564
2565 receive_flags(lkb, ms);
2566
2567 r = lkb->lkb_resource;
2568
2569 hold_rsb(r);
2570 lock_rsb(r);
2571
2572 error = do_cancel(r, lkb);
2573 send_cancel_reply(r, lkb, error);
2574
2575 unlock_rsb(r);
2576 put_rsb(r);
2577 dlm_put_lkb(lkb);
2578 return;
2579
2580 fail:
2581 setup_stub_lkb(ls, ms);
2582 send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
2583}
2584
2585static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
2586{
2587 struct dlm_lkb *lkb;
2588 struct dlm_rsb *r;
2589 int error;
2590
2591 error = find_lkb(ls, ms->m_remid, &lkb);
2592 if (error) {
2593 log_error(ls, "receive_grant no lkb");
2594 return;
2595 }
2596 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2597
2598 r = lkb->lkb_resource;
2599
2600 hold_rsb(r);
2601 lock_rsb(r);
2602
2603 receive_flags_reply(lkb, ms);
2604 grant_lock_pc(r, lkb, ms);
2605 queue_cast(r, lkb, 0);
2606
2607 unlock_rsb(r);
2608 put_rsb(r);
2609 dlm_put_lkb(lkb);
2610}
2611
2612static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
2613{
2614 struct dlm_lkb *lkb;
2615 struct dlm_rsb *r;
2616 int error;
2617
2618 error = find_lkb(ls, ms->m_remid, &lkb);
2619 if (error) {
2620 log_error(ls, "receive_bast no lkb");
2621 return;
2622 }
2623 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2624
2625 r = lkb->lkb_resource;
2626
2627 hold_rsb(r);
2628 lock_rsb(r);
2629
2630 queue_bast(r, lkb, ms->m_bastmode);
2631
2632 unlock_rsb(r);
2633 put_rsb(r);
2634 dlm_put_lkb(lkb);
2635}
2636
2637static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
2638{
2639 int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
2640
2641 from_nodeid = ms->m_header.h_nodeid;
2642 our_nodeid = dlm_our_nodeid();
2643
2644 len = receive_extralen(ms);
2645
2646 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2647 if (dir_nodeid != our_nodeid) {
2648 log_error(ls, "lookup dir_nodeid %d from %d",
2649 dir_nodeid, from_nodeid);
2650 error = -EINVAL;
2651 ret_nodeid = -1;
2652 goto out;
2653 }
2654
2655 error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
2656
2657 /* Optimization: we're master so treat lookup as a request */
2658 if (!error && ret_nodeid == our_nodeid) {
2659 receive_request(ls, ms);
2660 return;
2661 }
2662 out:
2663 send_lookup_reply(ls, ms, ret_nodeid, error);
2664}
2665
2666static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
2667{
2668 int len, dir_nodeid, from_nodeid;
2669
2670 from_nodeid = ms->m_header.h_nodeid;
2671
2672 len = receive_extralen(ms);
2673
2674 dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
2675 if (dir_nodeid != dlm_our_nodeid()) {
2676 log_error(ls, "remove dir entry dir_nodeid %d from %d",
2677 dir_nodeid, from_nodeid);
2678 return;
2679 }
2680
2681 dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
2682}
2683
2684static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
2685{
2686 struct dlm_lkb *lkb;
2687 struct dlm_rsb *r;
2688 int error, mstype;
2689
2690 error = find_lkb(ls, ms->m_remid, &lkb);
2691 if (error) {
2692 log_error(ls, "receive_request_reply no lkb");
2693 return;
2694 }
2695 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2696
2697 mstype = lkb->lkb_wait_type;
2698 error = remove_from_waiters(lkb);
2699 if (error) {
2700 log_error(ls, "receive_request_reply not on waiters");
2701 goto out;
2702 }
2703
2704 /* this is the value returned from do_request() on the master */
2705 error = ms->m_result;
2706
2707 r = lkb->lkb_resource;
2708 hold_rsb(r);
2709 lock_rsb(r);
2710
2711 /* Optimization: the dir node was also the master, so it took our
2712 lookup as a request and sent request reply instead of lookup reply */
2713 if (mstype == DLM_MSG_LOOKUP) {
2714 r->res_nodeid = ms->m_header.h_nodeid;
2715 lkb->lkb_nodeid = r->res_nodeid;
2716 }
2717
2718 switch (error) {
2719 case -EAGAIN:
2720 /* request would block (be queued) on remote master;
2721 the unhold undoes the original ref from create_lkb()
2722 so it leads to the lkb being freed */
2723 queue_cast(r, lkb, -EAGAIN);
2724 confirm_master(r, -EAGAIN);
2725 unhold_lkb(lkb);
2726 break;
2727
2728 case -EINPROGRESS:
2729 case 0:
2730 /* request was queued or granted on remote master */
2731 receive_flags_reply(lkb, ms);
2732 lkb->lkb_remid = ms->m_lkid;
2733 if (error)
2734 add_lkb(r, lkb, DLM_LKSTS_WAITING);
2735 else {
2736 grant_lock_pc(r, lkb, ms);
2737 queue_cast(r, lkb, 0);
2738 }
2739 confirm_master(r, error);
2740 break;
2741
2742 case -ENOENT:
2743 case -ENOTBLK:
2744 /* find_rsb failed to find rsb or rsb wasn't master */
2745 r->res_nodeid = -1;
2746 lkb->lkb_nodeid = -1;
2747 _request_lock(r, lkb);
2748 break;
2749
2750 default:
2751 log_error(ls, "receive_request_reply error %d", error);
2752 }
2753
2754 unlock_rsb(r);
2755 put_rsb(r);
2756 out:
2757 dlm_put_lkb(lkb);
2758}
2759
2760static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
2761 struct dlm_message *ms)
2762{
2763 int error = ms->m_result;
2764
2765 /* this is the value returned from do_convert() on the master */
2766
2767 switch (error) {
2768 case -EAGAIN:
2769 /* convert would block (be queued) on remote master */
2770 queue_cast(r, lkb, -EAGAIN);
2771 break;
2772
2773 case -EINPROGRESS:
2774 /* convert was queued on remote master */
2775 del_lkb(r, lkb);
2776 add_lkb(r, lkb, DLM_LKSTS_CONVERT);
2777 break;
2778
2779 case 0:
2780 /* convert was granted on remote master */
2781 receive_flags_reply(lkb, ms);
2782 grant_lock_pc(r, lkb, ms);
2783 queue_cast(r, lkb, 0);
2784 break;
2785
2786 default:
2787 log_error(r->res_ls, "receive_convert_reply error %d", error);
2788 }
2789}
2790
2791static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2792{
2793 struct dlm_rsb *r = lkb->lkb_resource;
2794
2795 hold_rsb(r);
2796 lock_rsb(r);
2797
2798 __receive_convert_reply(r, lkb, ms);
2799
2800 unlock_rsb(r);
2801 put_rsb(r);
2802}
2803
2804static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
2805{
2806 struct dlm_lkb *lkb;
2807 int error;
2808
2809 error = find_lkb(ls, ms->m_remid, &lkb);
2810 if (error) {
2811 log_error(ls, "receive_convert_reply no lkb");
2812 return;
2813 }
2814 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2815
2816 error = remove_from_waiters(lkb);
2817 if (error) {
2818 log_error(ls, "receive_convert_reply not on waiters");
2819 goto out;
2820 }
2821
2822 _receive_convert_reply(lkb, ms);
2823 out:
2824 dlm_put_lkb(lkb);
2825}
2826
2827static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2828{
2829 struct dlm_rsb *r = lkb->lkb_resource;
2830 int error = ms->m_result;
2831
2832 hold_rsb(r);
2833 lock_rsb(r);
2834
2835 /* this is the value returned from do_unlock() on the master */
2836
2837 switch (error) {
2838 case -DLM_EUNLOCK:
2839 receive_flags_reply(lkb, ms);
2840 remove_lock_pc(r, lkb);
2841 queue_cast(r, lkb, -DLM_EUNLOCK);
2842 break;
2843 default:
2844 log_error(r->res_ls, "receive_unlock_reply error %d", error);
2845 }
2846
2847 unlock_rsb(r);
2848 put_rsb(r);
2849}
2850
2851static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
2852{
2853 struct dlm_lkb *lkb;
2854 int error;
2855
2856 error = find_lkb(ls, ms->m_remid, &lkb);
2857 if (error) {
2858 log_error(ls, "receive_unlock_reply no lkb");
2859 return;
2860 }
2861 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2862
2863 error = remove_from_waiters(lkb);
2864 if (error) {
2865 log_error(ls, "receive_unlock_reply not on waiters");
2866 goto out;
2867 }
2868
2869 _receive_unlock_reply(lkb, ms);
2870 out:
2871 dlm_put_lkb(lkb);
2872}
2873
2874static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
2875{
2876 struct dlm_rsb *r = lkb->lkb_resource;
2877 int error = ms->m_result;
2878
2879 hold_rsb(r);
2880 lock_rsb(r);
2881
2882 /* this is the value returned from do_cancel() on the master */
2883
2884 switch (error) {
2885 case -DLM_ECANCEL:
2886 receive_flags_reply(lkb, ms);
2887 revert_lock_pc(r, lkb);
2888 queue_cast(r, lkb, -DLM_ECANCEL);
2889 break;
2890 default:
2891 log_error(r->res_ls, "receive_cancel_reply error %d", error);
2892 }
2893
2894 unlock_rsb(r);
2895 put_rsb(r);
2896}
2897
2898static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
2899{
2900 struct dlm_lkb *lkb;
2901 int error;
2902
2903 error = find_lkb(ls, ms->m_remid, &lkb);
2904 if (error) {
2905 log_error(ls, "receive_cancel_reply no lkb");
2906 return;
2907 }
2908 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
2909
2910 error = remove_from_waiters(lkb);
2911 if (error) {
2912 log_error(ls, "receive_cancel_reply not on waiters");
2913 goto out;
2914 }
2915
2916 _receive_cancel_reply(lkb, ms);
2917 out:
2918 dlm_put_lkb(lkb);
2919}
2920
2921static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
2922{
2923 struct dlm_lkb *lkb;
2924 struct dlm_rsb *r;
2925 int error, ret_nodeid;
2926
2927 error = find_lkb(ls, ms->m_lkid, &lkb);
2928 if (error) {
2929 log_error(ls, "receive_lookup_reply no lkb");
2930 return;
2931 }
2932
2933 error = remove_from_waiters(lkb);
2934 if (error) {
2935 log_error(ls, "receive_lookup_reply not on waiters");
2936 goto out;
2937 }
2938
2939 /* this is the value returned by dlm_dir_lookup on dir node
2940 FIXME: will a non-zero error ever be returned? */
2941 error = ms->m_result;
2942
2943 r = lkb->lkb_resource;
2944 hold_rsb(r);
2945 lock_rsb(r);
2946
2947 ret_nodeid = ms->m_nodeid;
2948 if (ret_nodeid == dlm_our_nodeid()) {
2949 r->res_nodeid = 0;
2950 ret_nodeid = 0;
2951 r->res_first_lkid = 0;
2952 } else {
2953 /* set_master() will copy res_nodeid to lkb_nodeid */
2954 r->res_nodeid = ret_nodeid;
2955 }
2956
2957 _request_lock(r, lkb);
2958
2959 if (!ret_nodeid)
2960 process_lookup_list(r);
2961
2962 unlock_rsb(r);
2963 put_rsb(r);
2964 out:
2965 dlm_put_lkb(lkb);
2966}
2967
2968int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
2969{
2970 struct dlm_message *ms = (struct dlm_message *) hd;
2971 struct dlm_ls *ls;
2972 int error;
2973
2974 if (!recovery)
2975 dlm_message_in(ms);
2976
2977 ls = dlm_find_lockspace_global(hd->h_lockspace);
2978 if (!ls) {
2979 log_print("drop message %d from %d for unknown lockspace %d",
2980 ms->m_type, nodeid, hd->h_lockspace);
2981 return -EINVAL;
2982 }
2983
2984 /* recovery may have just ended leaving a bunch of backed-up requests
2985 in the requestqueue; wait while dlm_recoverd clears them */
2986
2987 if (!recovery)
2988 dlm_wait_requestqueue(ls);
2989
2990 /* recovery may have just started while there were a bunch of
2991 in-flight requests -- save them in requestqueue to be processed
2992 after recovery. we can't let dlm_recvd block on the recovery
2993 lock. if dlm_recoverd is calling this function to clear the
2994 requestqueue, it needs to be interrupted (-EINTR) if another
2995 recovery operation is starting. */
2996
2997 while (1) {
2998 if (dlm_locking_stopped(ls)) {
2999 if (!recovery)
3000 dlm_add_requestqueue(ls, nodeid, hd);
3001 error = -EINTR;
3002 goto out;
3003 }
3004
3005 if (lock_recovery_try(ls))
3006 break;
3007 schedule();
3008 }
3009
3010 switch (ms->m_type) {
3011
3012 /* messages sent to a master node */
3013
3014 case DLM_MSG_REQUEST:
3015 receive_request(ls, ms);
3016 break;
3017
3018 case DLM_MSG_CONVERT:
3019 receive_convert(ls, ms);
3020 break;
3021
3022 case DLM_MSG_UNLOCK:
3023 receive_unlock(ls, ms);
3024 break;
3025
3026 case DLM_MSG_CANCEL:
3027 receive_cancel(ls, ms);
3028 break;
3029
3030 /* messages sent from a master node (replies to above) */
3031
3032 case DLM_MSG_REQUEST_REPLY:
3033 receive_request_reply(ls, ms);
3034 break;
3035
3036 case DLM_MSG_CONVERT_REPLY:
3037 receive_convert_reply(ls, ms);
3038 break;
3039
3040 case DLM_MSG_UNLOCK_REPLY:
3041 receive_unlock_reply(ls, ms);
3042 break;
3043
3044 case DLM_MSG_CANCEL_REPLY:
3045 receive_cancel_reply(ls, ms);
3046 break;
3047
3048 /* messages sent from a master node (only two types of async msg) */
3049
3050 case DLM_MSG_GRANT:
3051 receive_grant(ls, ms);
3052 break;
3053
3054 case DLM_MSG_BAST:
3055 receive_bast(ls, ms);
3056 break;
3057
3058 /* messages sent to a dir node */
3059
3060 case DLM_MSG_LOOKUP:
3061 receive_lookup(ls, ms);
3062 break;
3063
3064 case DLM_MSG_REMOVE:
3065 receive_remove(ls, ms);
3066 break;
3067
3068 /* messages sent from a dir node (remove has no reply) */
3069
3070 case DLM_MSG_LOOKUP_REPLY:
3071 receive_lookup_reply(ls, ms);
3072 break;
3073
3074 default:
3075 log_error(ls, "unknown message type %d", ms->m_type);
3076 }
3077
3078 unlock_recovery(ls);
3079 out:
3080 dlm_put_lockspace(ls);
3081 dlm_astd_wake();
3082 return 0;
3083}
3084
3085
3086/*
3087 * Recovery related
3088 */
3089
3090static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
3091{
3092 if (middle_conversion(lkb)) {
3093 hold_lkb(lkb);
3094 ls->ls_stub_ms.m_result = -EINPROGRESS;
3095 _remove_from_waiters(lkb);
3096 _receive_convert_reply(lkb, &ls->ls_stub_ms);
3097
3098 /* Same special case as in receive_rcom_lock_args() */
3099 lkb->lkb_grmode = DLM_LOCK_IV;
3100 rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
3101 unhold_lkb(lkb);
3102
3103 } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
3104 lkb->lkb_flags |= DLM_IFL_RESEND;
3105 }
3106
3107 /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
3108 conversions are async; there's no reply from the remote master */
3109}
3110
3111/* A waiting lkb needs recovery if the master node has failed, or
3112 the master node is changing (only when no directory is used) */
3113
3114static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
3115{
3116 if (dlm_is_removed(ls, lkb->lkb_nodeid))
3117 return 1;
3118
3119 if (!dlm_no_directory(ls))
3120 return 0;
3121
3122 if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
3123 return 1;
3124
3125 return 0;
3126}
3127
3128/* Recovery for locks that are waiting for replies from nodes that are now
3129 gone. We can just complete unlocks and cancels by faking a reply from the
3130 dead node. Requests and up-conversions we flag to be resent after
3131 recovery. Down-conversions can just be completed with a fake reply like
3132 unlocks. Conversions between PR and CW need special attention. */
3133
3134void dlm_recover_waiters_pre(struct dlm_ls *ls)
3135{
3136 struct dlm_lkb *lkb, *safe;
3137
3138 mutex_lock(&ls->ls_waiters_mutex);
3139
3140 list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
3141 log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
3142 lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
3143
3144 /* all outstanding lookups, regardless of destination, will be
3145 resent after recovery is done */
3146
3147 if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
3148 lkb->lkb_flags |= DLM_IFL_RESEND;
3149 continue;
3150 }
3151
3152 if (!waiter_needs_recovery(ls, lkb))
3153 continue;
3154
3155 switch (lkb->lkb_wait_type) {
3156
3157 case DLM_MSG_REQUEST:
3158 lkb->lkb_flags |= DLM_IFL_RESEND;
3159 break;
3160
3161 case DLM_MSG_CONVERT:
3162 recover_convert_waiter(ls, lkb);
3163 break;
3164
3165 case DLM_MSG_UNLOCK:
3166 hold_lkb(lkb);
3167 ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
3168 _remove_from_waiters(lkb);
3169 _receive_unlock_reply(lkb, &ls->ls_stub_ms);
3170 dlm_put_lkb(lkb);
3171 break;
3172
3173 case DLM_MSG_CANCEL:
3174 hold_lkb(lkb);
3175 ls->ls_stub_ms.m_result = -DLM_ECANCEL;
3176 _remove_from_waiters(lkb);
3177 _receive_cancel_reply(lkb, &ls->ls_stub_ms);
3178 dlm_put_lkb(lkb);
3179 break;
3180
3181 default:
3182 log_error(ls, "invalid lkb wait_type %d",
3183 lkb->lkb_wait_type);
3184 }
3185 }
3186 mutex_unlock(&ls->ls_waiters_mutex);
3187}
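
/* Editor's note (summary, not part of the commit): per the comments above,
 * dlm_recover_waiters_pre() disposes of each waiter by wait_type:
 *
 *   DLM_MSG_LOOKUP  -> flag DLM_IFL_RESEND (lookups are always resent)
 *   DLM_MSG_REQUEST -> flag DLM_IFL_RESEND
 *   DLM_MSG_CONVERT -> fake an -EINPROGRESS reply for PR<->CW ("middle")
 *                      conversions; flag up-conversions DLM_IFL_RESEND
 *   DLM_MSG_UNLOCK  -> fake a -DLM_EUNLOCK reply from the dead master
 *   DLM_MSG_CANCEL  -> fake a -DLM_ECANCEL reply from the dead master
 */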
3188
3189static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
3190{
3191 struct dlm_lkb *lkb;
3192 int rv = 0;
3193
3194 mutex_lock(&ls->ls_waiters_mutex);
3195 list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
3196 if (lkb->lkb_flags & DLM_IFL_RESEND) {
3197 rv = lkb->lkb_wait_type;
3198 _remove_from_waiters(lkb);
3199 lkb->lkb_flags &= ~DLM_IFL_RESEND;
3200 break;
3201 }
3202 }
3203 mutex_unlock(&ls->ls_waiters_mutex);
3204
3205 if (!rv)
3206 lkb = NULL;
3207 *lkb_ret = lkb;
3208 return rv;
3209}
3210
3211/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
3212 master or dir-node for r. Processing the lkb may result in it being placed
3213 back on waiters. */
3214
3215int dlm_recover_waiters_post(struct dlm_ls *ls)
3216{
3217 struct dlm_lkb *lkb;
3218 struct dlm_rsb *r;
3219 int error = 0, mstype;
3220
3221 while (1) {
3222 if (dlm_locking_stopped(ls)) {
3223 log_debug(ls, "recover_waiters_post aborted");
3224 error = -EINTR;
3225 break;
3226 }
3227
3228 mstype = remove_resend_waiter(ls, &lkb);
3229 if (!mstype)
3230 break;
3231
3232 r = lkb->lkb_resource;
3233
3234 log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
3235 lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
3236
3237 switch (mstype) {
3238
3239 case DLM_MSG_LOOKUP:
3240 hold_rsb(r);
3241 lock_rsb(r);
3242 _request_lock(r, lkb);
3243 if (is_master(r))
3244 confirm_master(r, 0);
3245 unlock_rsb(r);
3246 put_rsb(r);
3247 break;
3248
3249 case DLM_MSG_REQUEST:
3250 hold_rsb(r);
3251 lock_rsb(r);
3252 _request_lock(r, lkb);
3253 unlock_rsb(r);
3254 put_rsb(r);
3255 break;
3256
3257 case DLM_MSG_CONVERT:
3258 hold_rsb(r);
3259 lock_rsb(r);
3260 _convert_lock(r, lkb);
3261 unlock_rsb(r);
3262 put_rsb(r);
3263 break;
3264
3265 default:
3266 log_error(ls, "recover_waiters_post type %d", mstype);
3267 }
3268 }
3269
3270 return error;
3271}
3272
3273static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
3274 int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
3275{
3276 struct dlm_ls *ls = r->res_ls;
3277 struct dlm_lkb *lkb, *safe;
3278
3279 list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
3280 if (test(ls, lkb)) {
3281 del_lkb(r, lkb);
3282 /* this put should free the lkb */
3283 if (!dlm_put_lkb(lkb))
3284 log_error(ls, "purged lkb not released");
3285 }
3286 }
3287}
3288
3289static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3290{
3291 return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
3292}
3293
3294static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
3295{
3296 return is_master_copy(lkb);
3297}
3298
3299static void purge_dead_locks(struct dlm_rsb *r)
3300{
3301 purge_queue(r, &r->res_grantqueue, &purge_dead_test);
3302 purge_queue(r, &r->res_convertqueue, &purge_dead_test);
3303 purge_queue(r, &r->res_waitqueue, &purge_dead_test);
3304}
3305
3306void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
3307{
3308 purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
3309 purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
3310 purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
3311}
3312
3313/* Get rid of locks held by nodes that are gone. */
3314
3315int dlm_purge_locks(struct dlm_ls *ls)
3316{
3317 struct dlm_rsb *r;
3318
3319 log_debug(ls, "dlm_purge_locks");
3320
3321 down_write(&ls->ls_root_sem);
3322 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
3323 hold_rsb(r);
3324 lock_rsb(r);
3325 if (is_master(r))
3326 purge_dead_locks(r);
3327 unlock_rsb(r);
3328 unhold_rsb(r);
3329
3330 schedule();
3331 }
3332 up_write(&ls->ls_root_sem);
3333
3334 return 0;
3335}
3336
3337int dlm_grant_after_purge(struct dlm_ls *ls)
3338{
3339 struct dlm_rsb *r;
3340 int i;
3341
3342 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
3343 read_lock(&ls->ls_rsbtbl[i].lock);
3344 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
3345 hold_rsb(r);
3346 lock_rsb(r);
3347 if (is_master(r)) {
3348 grant_pending_locks(r);
3349 confirm_master(r, 0);
3350 }
3351 unlock_rsb(r);
3352 put_rsb(r);
3353 }
3354 read_unlock(&ls->ls_rsbtbl[i].lock);
3355 }
3356
3357 return 0;
3358}
3359
3360static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
3361 uint32_t remid)
3362{
3363 struct dlm_lkb *lkb;
3364
3365 list_for_each_entry(lkb, head, lkb_statequeue) {
3366 if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
3367 return lkb;
3368 }
3369 return NULL;
3370}
3371
3372static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
3373 uint32_t remid)
3374{
3375 struct dlm_lkb *lkb;
3376
3377 lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
3378 if (lkb)
3379 return lkb;
3380 lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
3381 if (lkb)
3382 return lkb;
3383 lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
3384 if (lkb)
3385 return lkb;
3386 return NULL;
3387}
3388
3389static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3390 struct dlm_rsb *r, struct dlm_rcom *rc)
3391{
3392 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3393 int lvblen;
3394
3395 lkb->lkb_nodeid = rc->rc_header.h_nodeid;
3396 lkb->lkb_ownpid = rl->rl_ownpid;
3397 lkb->lkb_remid = rl->rl_lkid;
3398 lkb->lkb_exflags = rl->rl_exflags;
3399 lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
3400 lkb->lkb_flags |= DLM_IFL_MSTCPY;
3401 lkb->lkb_lvbseq = rl->rl_lvbseq;
3402 lkb->lkb_rqmode = rl->rl_rqmode;
3403 lkb->lkb_grmode = rl->rl_grmode;
3404 /* don't set lkb_status because add_lkb wants to set it itself */
3405
3406 lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
3407 lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
3408
3409 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3410 lkb->lkb_lvbptr = allocate_lvb(ls);
3411 if (!lkb->lkb_lvbptr)
3412 return -ENOMEM;
3413 lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
3414 sizeof(struct rcom_lock);
3415 memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
3416 }
3417
3418 /* Conversions between PR and CW (middle modes) need special handling.
3419 The real granted mode of these converting locks cannot be determined
3420 until all locks have been rebuilt on the rsb (recover_conversion) */
3421
3422 if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
3423 rl->rl_status = DLM_LKSTS_CONVERT;
3424 lkb->lkb_grmode = DLM_LOCK_IV;
3425 rsb_set_flag(r, RSB_RECOVER_CONVERT);
3426 }
3427
3428 return 0;
3429}
3430
3431/* This lkb may have been recovered in a previous aborted recovery so we need
3432 to check if the rsb already has an lkb with the given remote nodeid/lkid.
3433 If so we just send back a standard reply. If not, we create a new lkb with
3434 the given values and send back our lkid. We send back our lkid by sending
3435 back the rcom_lock struct we got but with the remid field filled in. */
3436
3437int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3438{
3439 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3440 struct dlm_rsb *r;
3441 struct dlm_lkb *lkb;
3442 int error;
3443
3444 if (rl->rl_parent_lkid) {
3445 error = -EOPNOTSUPP;
3446 goto out;
3447 }
3448
3449 error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
3450 if (error)
3451 goto out;
3452
3453 lock_rsb(r);
3454
3455 lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
3456 if (lkb) {
3457 error = -EEXIST;
3458 goto out_remid;
3459 }
3460
3461 error = create_lkb(ls, &lkb);
3462 if (error)
3463 goto out_unlock;
3464
3465 error = receive_rcom_lock_args(ls, lkb, r, rc);
3466 if (error) {
3467 __put_lkb(ls, lkb);
3468 goto out_unlock;
3469 }
3470
3471 attach_lkb(r, lkb);
3472 add_lkb(r, lkb, rl->rl_status);
3473 error = 0;
3474
3475 out_remid:
3476 /* this is the new value returned to the lock holder for
3477 saving in its process-copy lkb */
3478 rl->rl_remid = lkb->lkb_id;
3479
3480 out_unlock:
3481 unlock_rsb(r);
3482 put_rsb(r);
3483 out:
3484 if (error)
3485 log_print("recover_master_copy %d %x", error, rl->rl_lkid);
3486 rl->rl_result = error;
3487 return error;
3488}
3489
3490int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
3491{
3492 struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
3493 struct dlm_rsb *r;
3494 struct dlm_lkb *lkb;
3495 int error;
3496
3497 error = find_lkb(ls, rl->rl_lkid, &lkb);
3498 if (error) {
3499 log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
3500 return error;
3501 }
3502
3503 DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
3504
3505 error = rl->rl_result;
3506
3507 r = lkb->lkb_resource;
3508 hold_rsb(r);
3509 lock_rsb(r);
3510
3511 switch (error) {
3512 case -EEXIST:
3513 log_debug(ls, "master copy exists %x", lkb->lkb_id);
3514 /* fall through */
3515 case 0:
3516 lkb->lkb_remid = rl->rl_remid;
3517 break;
3518 default:
3519 log_error(ls, "dlm_recover_process_copy unknown error %d %x",
3520 error, lkb->lkb_id);
3521 }
3522
3523 /* an ack for dlm_recover_locks() which waits for replies from
3524 all the locks it sends to new masters */
3525 dlm_recovered_lock(r);
3526
3527 unlock_rsb(r);
3528 put_rsb(r);
3529 dlm_put_lkb(lkb);
3530
3531 return 0;
3532}
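
/* Editor's note (summary, not part of the commit): lock recovery is a simple
 * round trip -- dlm_recover_locks() sends each process-copy lkb to its new
 * master, dlm_recover_master_copy() rebuilds (or finds) the master-copy lkb
 * there and returns its lkid in rl_remid, and dlm_recover_process_copy()
 * stores that remid locally and acks via dlm_recovered_lock(). */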
3533
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..bffab9c88b1d
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,50 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__
15
16void dlm_print_rsb(struct dlm_rsb *r);
17int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
18int dlm_modes_compat(int mode1, int mode2);
19int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
20 unsigned int flags, struct dlm_rsb **r_ret);
21void dlm_put_rsb(struct dlm_rsb *r);
22void dlm_hold_rsb(struct dlm_rsb *r);
23int dlm_put_lkb(struct dlm_lkb *lkb);
24void dlm_scan_rsbs(struct dlm_ls *ls);
25
26int dlm_purge_locks(struct dlm_ls *ls);
27void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
28int dlm_grant_after_purge(struct dlm_ls *ls);
29int dlm_recover_waiters_post(struct dlm_ls *ls);
30void dlm_recover_waiters_pre(struct dlm_ls *ls);
31int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
32int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
33
34static inline int is_master(struct dlm_rsb *r)
35{
36 return !r->res_nodeid;
37}
38
39static inline void lock_rsb(struct dlm_rsb *r)
40{
41 mutex_lock(&r->res_mutex);
42}
43
44static inline void unlock_rsb(struct dlm_rsb *r)
45{
46 mutex_unlock(&r->res_mutex);
47}
48
49#endif
50
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..d2ff505d51cd
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,665 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "recoverd.h"
18#include "ast.h"
19#include "dir.h"
20#include "lowcomms.h"
21#include "config.h"
22#include "memory.h"
23#include "lock.h"
24
25#ifdef CONFIG_DLM_DEBUG
26int dlm_create_debug_file(struct dlm_ls *ls);
27void dlm_delete_debug_file(struct dlm_ls *ls);
28#else
29static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
30static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
31#endif
32
33static int ls_count;
34static struct mutex ls_lock;
35static struct list_head lslist;
36static spinlock_t lslist_lock;
37static struct task_struct * scand_task;
38
39
40static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
41{
42 ssize_t ret = len;
43 int n = simple_strtol(buf, NULL, 0);
44
45 switch (n) {
46 case 0:
47 dlm_ls_stop(ls);
48 break;
49 case 1:
50 dlm_ls_start(ls);
51 break;
52 default:
53 ret = -EINVAL;
54 }
55 return ret;
56}
57
58static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
59{
60 ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
61 set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
62 wake_up(&ls->ls_uevent_wait);
63 return len;
64}
65
66static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
67{
68 return sprintf(buf, "%u\n", ls->ls_global_id);
69}
70
71static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
72{
73 ls->ls_global_id = simple_strtoul(buf, NULL, 0);
74 return len;
75}
76
77struct dlm_attr {
78 struct attribute attr;
79 ssize_t (*show)(struct dlm_ls *, char *);
80 ssize_t (*store)(struct dlm_ls *, const char *, size_t);
81};
82
83static struct dlm_attr dlm_attr_control = {
84 .attr = {.name = "control", .mode = S_IWUSR},
85 .store = dlm_control_store
86};
87
88static struct dlm_attr dlm_attr_event = {
89 .attr = {.name = "event_done", .mode = S_IWUSR},
90 .store = dlm_event_store
91};
92
93static struct dlm_attr dlm_attr_id = {
94 .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
95 .show = dlm_id_show,
96 .store = dlm_id_store
97};
98
99static struct attribute *dlm_attrs[] = {
100 &dlm_attr_control.attr,
101 &dlm_attr_event.attr,
102 &dlm_attr_id.attr,
103 NULL,
104};
105
106static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
107 char *buf)
108{
109 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
110 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
111 return a->show ? a->show(ls, buf) : 0;
112}
113
114static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
115 const char *buf, size_t len)
116{
117 struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
118 struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
119 return a->store ? a->store(ls, buf, len) : len;
120}
121
122static struct sysfs_ops dlm_attr_ops = {
123 .show = dlm_attr_show,
124 .store = dlm_attr_store,
125};
126
127static struct kobj_type dlm_ktype = {
128 .default_attrs = dlm_attrs,
129 .sysfs_ops = &dlm_attr_ops,
130};
131
132static struct kset dlm_kset = {
133 .subsys = &kernel_subsys,
134 .kobj = {.name = "dlm",},
135 .ktype = &dlm_ktype,
136};
137
138static int kobject_setup(struct dlm_ls *ls)
139{
140 char lsname[DLM_LOCKSPACE_LEN];
141 int error;
142
143 memset(lsname, 0, DLM_LOCKSPACE_LEN);
144 snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
145
146 error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
147 if (error)
148 return error;
149
150 ls->ls_kobj.kset = &dlm_kset;
151 ls->ls_kobj.ktype = &dlm_ktype;
152 return 0;
153}
154
155static int do_uevent(struct dlm_ls *ls, int in)
156{
157 int error;
158
159 if (in)
160 kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
161 else
162 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
163
164 error = wait_event_interruptible(ls->ls_uevent_wait,
165 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
166 if (error)
167 goto out;
168
169 error = ls->ls_uevent_result;
170 out:
171 return error;
172}
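
/* Editor's note: a hedged userspace-side sketch (not part of the commit).
 * With dlm_kset registered as "dlm" under kernel_subsys and each lockspace
 * kobject named after the lockspace, the attributes above should appear as
 * /sys/kernel/dlm/<lsname>/{control,event_done,id} (path assumed). A cluster
 * manager would answer the ONLINE/OFFLINE uevent emitted by do_uevent() by
 * writing a result to event_done, which dlm_event_store() parses: */
#if 0	/* userspace illustration only, not kernel code */
#include <stdio.h>

static int dlm_event_done(const char *lsname, int result)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/dlm/%s/event_done", lsname);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", result);	/* 0 lets do_uevent() proceed */
	return fclose(f);
}
#endif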
173
174
175int dlm_lockspace_init(void)
176{
177 int error;
178
179 ls_count = 0;
180 mutex_init(&ls_lock);
181 INIT_LIST_HEAD(&lslist);
182 spin_lock_init(&lslist_lock);
183
184 error = kset_register(&dlm_kset);
185 if (error)
186 printk("dlm_lockspace_init: cannot register kset %d\n", error);
187 return error;
188}
189
190void dlm_lockspace_exit(void)
191{
192 kset_unregister(&dlm_kset);
193}
194
195static int dlm_scand(void *data)
196{
197 struct dlm_ls *ls;
198
199 while (!kthread_should_stop()) {
200 list_for_each_entry(ls, &lslist, ls_list)
201 dlm_scan_rsbs(ls);
202 schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
203 }
204 return 0;
205}
206
207static int dlm_scand_start(void)
208{
209 struct task_struct *p;
210 int error = 0;
211
212 p = kthread_run(dlm_scand, NULL, "dlm_scand");
213 if (IS_ERR(p))
214 error = PTR_ERR(p);
215 else
216 scand_task = p;
217 return error;
218}
219
220static void dlm_scand_stop(void)
221{
222 kthread_stop(scand_task);
223}
224
225static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
226{
227 struct dlm_ls *ls;
228
229 spin_lock(&lslist_lock);
230
231 list_for_each_entry(ls, &lslist, ls_list) {
232 if (ls->ls_namelen == namelen &&
233 memcmp(ls->ls_name, name, namelen) == 0)
234 goto out;
235 }
236 ls = NULL;
237 out:
238 spin_unlock(&lslist_lock);
239 return ls;
240}
241
242struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
243{
244 struct dlm_ls *ls;
245
246 spin_lock(&lslist_lock);
247
248 list_for_each_entry(ls, &lslist, ls_list) {
249 if (ls->ls_global_id == id) {
250 ls->ls_count++;
251 goto out;
252 }
253 }
254 ls = NULL;
255 out:
256 spin_unlock(&lslist_lock);
257 return ls;
258}
259
260struct dlm_ls *dlm_find_lockspace_local(void *id)
261{
262 struct dlm_ls *ls = id;
263
264 spin_lock(&lslist_lock);
265 ls->ls_count++;
266 spin_unlock(&lslist_lock);
267 return ls;
268}
269
270void dlm_put_lockspace(struct dlm_ls *ls)
271{
272 spin_lock(&lslist_lock);
273 ls->ls_count--;
274 spin_unlock(&lslist_lock);
275}
276
277static void remove_lockspace(struct dlm_ls *ls)
278{
279 for (;;) {
280 spin_lock(&lslist_lock);
281 if (ls->ls_count == 0) {
282 list_del(&ls->ls_list);
283 spin_unlock(&lslist_lock);
284 return;
285 }
286 spin_unlock(&lslist_lock);
287 ssleep(1);
288 }
289}
290
291static int threads_start(void)
292{
293 int error;
294
295 /* Thread which processes lock requests for all lockspaces */
296 error = dlm_astd_start();
297 if (error) {
298 log_print("cannot start dlm_astd thread %d", error);
299 goto fail;
300 }
301
302 error = dlm_scand_start();
303 if (error) {
304 log_print("cannot start dlm_scand thread %d", error);
305 goto astd_fail;
306 }
307
308 /* Thread for sending/receiving messages for all lockspaces */
309 error = dlm_lowcomms_start();
310 if (error) {
311 log_print("cannot start dlm lowcomms %d", error);
312 goto scand_fail;
313 }
314
315 return 0;
316
317 scand_fail:
318 dlm_scand_stop();
319 astd_fail:
320 dlm_astd_stop();
321 fail:
322 return error;
323}
324
325static void threads_stop(void)
326{
327 dlm_scand_stop();
328 dlm_lowcomms_stop();
329 dlm_astd_stop();
330}
331
332static int new_lockspace(char *name, int namelen, void **lockspace,
333 uint32_t flags, int lvblen)
334{
335 struct dlm_ls *ls;
336 int i, size, error = -ENOMEM;
337
338 if (namelen > DLM_LOCKSPACE_LEN)
339 return -EINVAL;
340
341 if (!lvblen || (lvblen % 8))
342 return -EINVAL;
343
344 if (!try_module_get(THIS_MODULE))
345 return -EINVAL;
346
347 ls = dlm_find_lockspace_name(name, namelen);
348 if (ls) {
349 *lockspace = ls;
350 module_put(THIS_MODULE);
351 return -EEXIST;
352 }
353
354 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
355 if (!ls)
356 goto out;
357 memcpy(ls->ls_name, name, namelen);
358 ls->ls_namelen = namelen;
359 ls->ls_exflags = flags;
360 ls->ls_lvblen = lvblen;
361 ls->ls_count = 0;
362 ls->ls_flags = 0;
363
364 size = dlm_config.rsbtbl_size;
365 ls->ls_rsbtbl_size = size;
366
367 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
368 if (!ls->ls_rsbtbl)
369 goto out_lsfree;
370 for (i = 0; i < size; i++) {
371 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
372 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
373 rwlock_init(&ls->ls_rsbtbl[i].lock);
374 }
375
376 size = dlm_config.lkbtbl_size;
377 ls->ls_lkbtbl_size = size;
378
379 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
380 if (!ls->ls_lkbtbl)
381 goto out_rsbfree;
382 for (i = 0; i < size; i++) {
383 INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
384 rwlock_init(&ls->ls_lkbtbl[i].lock);
385 ls->ls_lkbtbl[i].counter = 1;
386 }
387
388 size = dlm_config.dirtbl_size;
389 ls->ls_dirtbl_size = size;
390
391 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
392 if (!ls->ls_dirtbl)
393 goto out_lkbfree;
394 for (i = 0; i < size; i++) {
395 INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
396 rwlock_init(&ls->ls_dirtbl[i].lock);
397 }
398
399 INIT_LIST_HEAD(&ls->ls_waiters);
400 mutex_init(&ls->ls_waiters_mutex);
401
402 INIT_LIST_HEAD(&ls->ls_nodes);
403 INIT_LIST_HEAD(&ls->ls_nodes_gone);
404 ls->ls_num_nodes = 0;
405 ls->ls_low_nodeid = 0;
406 ls->ls_total_weight = 0;
407 ls->ls_node_array = NULL;
408
409 memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
410 ls->ls_stub_rsb.res_ls = ls;
411
412 ls->ls_debug_dentry = NULL;
413
414 init_waitqueue_head(&ls->ls_uevent_wait);
415 ls->ls_uevent_result = 0;
416
417 ls->ls_recoverd_task = NULL;
418 mutex_init(&ls->ls_recoverd_active);
419 spin_lock_init(&ls->ls_recover_lock);
420 ls->ls_recover_status = 0;
421 ls->ls_recover_seq = 0;
422 ls->ls_recover_args = NULL;
423 init_rwsem(&ls->ls_in_recovery);
424 INIT_LIST_HEAD(&ls->ls_requestqueue);
425 mutex_init(&ls->ls_requestqueue_mutex);
426
427 ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
428 if (!ls->ls_recover_buf)
429 goto out_dirfree;
430
431 INIT_LIST_HEAD(&ls->ls_recover_list);
432 spin_lock_init(&ls->ls_recover_list_lock);
433 ls->ls_recover_list_count = 0;
434 init_waitqueue_head(&ls->ls_wait_general);
435 INIT_LIST_HEAD(&ls->ls_root_list);
436 init_rwsem(&ls->ls_root_sem);
437
438 down_write(&ls->ls_in_recovery);
439
440 error = dlm_recoverd_start(ls);
441 if (error) {
442 log_error(ls, "can't start dlm_recoverd %d", error);
443 goto out_rcomfree;
444 }
445
446 spin_lock(&lslist_lock);
447 list_add(&ls->ls_list, &lslist);
448 spin_unlock(&lslist_lock);
449
450 dlm_create_debug_file(ls);
451
452 error = kobject_setup(ls);
453 if (error)
454 goto out_del;
455
456 error = kobject_register(&ls->ls_kobj);
457 if (error)
458 goto out_del;
459
460 error = do_uevent(ls, 1);
461 if (error)
462 goto out_unreg;
463
464 *lockspace = ls;
465 return 0;
466
467 out_unreg:
468 kobject_unregister(&ls->ls_kobj);
469 out_del:
470 dlm_delete_debug_file(ls);
471 spin_lock(&lslist_lock);
472 list_del(&ls->ls_list);
473 spin_unlock(&lslist_lock);
474 dlm_recoverd_stop(ls);
475 out_rcomfree:
476 kfree(ls->ls_recover_buf);
477 out_dirfree:
478 kfree(ls->ls_dirtbl);
479 out_lkbfree:
480 kfree(ls->ls_lkbtbl);
481 out_rsbfree:
482 kfree(ls->ls_rsbtbl);
483 out_lsfree:
484 kfree(ls);
485 out:
486 module_put(THIS_MODULE);
487 return error;
488}
489
490int dlm_new_lockspace(char *name, int namelen, void **lockspace,
491 uint32_t flags, int lvblen)
492{
493 int error = 0;
494
495 mutex_lock(&ls_lock);
496 if (!ls_count)
497 error = threads_start();
498 if (error)
499 goto out;
500
501 error = new_lockspace(name, namelen, lockspace, flags, lvblen);
502 if (!error)
503 ls_count++;
504 out:
505 mutex_unlock(&ls_lock);
506 return error;
507}
508
509/* Return 1 if the lockspace still has active remote locks,
510 * 2 if the lockspace still has active local locks.
511 */
512static int lockspace_busy(struct dlm_ls *ls)
513{
514 int i, lkb_found = 0;
515 struct dlm_lkb *lkb;
516
517 /* NOTE: We check the lockidtbl here rather than the resource table.
518 This is because there may be LKBs queued as ASTs that have been
519 unlinked from their RSBs and are pending deletion once the AST has
520 been delivered */
521
522 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
523 read_lock(&ls->ls_lkbtbl[i].lock);
524 if (!list_empty(&ls->ls_lkbtbl[i].list)) {
525 lkb_found = 1;
526 list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
527 lkb_idtbl_list) {
528 if (!lkb->lkb_nodeid) {
529 read_unlock(&ls->ls_lkbtbl[i].lock);
530 return 2;
531 }
532 }
533 }
534 read_unlock(&ls->ls_lkbtbl[i].lock);
535 }
536 return lkb_found;
537}
538
539static int release_lockspace(struct dlm_ls *ls, int force)
540{
541 struct dlm_lkb *lkb;
542 struct dlm_rsb *rsb;
543 struct list_head *head;
544 int i;
545 int busy = lockspace_busy(ls);
546
547 if (busy > force)
548 return -EBUSY;
549
550 if (force < 3)
551 do_uevent(ls, 0);
552
553 dlm_recoverd_stop(ls);
554
555 remove_lockspace(ls);
556
557 dlm_delete_debug_file(ls);
558
559 dlm_astd_suspend();
560
561 kfree(ls->ls_recover_buf);
562
563 /*
564 * Free direntry structs.
565 */
566
567 dlm_dir_clear(ls);
568 kfree(ls->ls_dirtbl);
569
570 /*
571 * Free all lkb's on lkbtbl[] lists.
572 */
573
574 for (i = 0; i < ls->ls_lkbtbl_size; i++) {
575 head = &ls->ls_lkbtbl[i].list;
576 while (!list_empty(head)) {
577 lkb = list_entry(head->next, struct dlm_lkb,
578 lkb_idtbl_list);
579
580 list_del(&lkb->lkb_idtbl_list);
581
582 dlm_del_ast(lkb);
583
584 if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
585 free_lvb(lkb->lkb_lvbptr);
586
587 free_lkb(lkb);
588 }
589 }
590 dlm_astd_resume();
591
592 kfree(ls->ls_lkbtbl);
593
594 /*
595 * Free all rsb's on rsbtbl[] lists
596 */
597
598 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
599 head = &ls->ls_rsbtbl[i].list;
600 while (!list_empty(head)) {
601 rsb = list_entry(head->next, struct dlm_rsb,
602 res_hashchain);
603
604 list_del(&rsb->res_hashchain);
605 free_rsb(rsb);
606 }
607
608 head = &ls->ls_rsbtbl[i].toss;
609 while (!list_empty(head)) {
610 rsb = list_entry(head->next, struct dlm_rsb,
611 res_hashchain);
612 list_del(&rsb->res_hashchain);
613 free_rsb(rsb);
614 }
615 }
616
617 kfree(ls->ls_rsbtbl);
618
619 /*
620 * Free structures on any other lists
621 */
622
623 kfree(ls->ls_recover_args);
624 dlm_clear_free_entries(ls);
625 dlm_clear_members(ls);
626 dlm_clear_members_gone(ls);
627 kfree(ls->ls_node_array);
628 kobject_unregister(&ls->ls_kobj);
629 kfree(ls);
630
631 mutex_lock(&ls_lock);
632 ls_count--;
633 if (!ls_count)
634 threads_stop();
635 mutex_unlock(&ls_lock);
636
637 module_put(THIS_MODULE);
638 return 0;
639}
640
641/*
642 * Called when a system has released all its locks and is not going to use the
643 * lockspace any longer. We free everything we're managing for this lockspace.
644 * Remaining nodes will go through the recovery process as if we'd died. The
645 * lockspace must continue to function as usual, participating in recoveries,
646 * until this returns.
647 *
648 * Force has 4 possible values:
649 * 0 - don't destroy lockspace if it has any LKBs
650 * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
651 * 2 - destroy lockspace regardless of LKBs
652 * 3 - destroy lockspace as part of a forced shutdown
653 */
654
655int dlm_release_lockspace(void *lockspace, int force)
656{
657 struct dlm_ls *ls;
658
659 ls = dlm_find_lockspace_local(lockspace);
660 if (!ls)
661 return -EINVAL;
662 dlm_put_lockspace(ls);
663 return release_lockspace(ls, force);
664}
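
/* Editor's note: a hedged usage sketch (not part of the commit) of the two
 * entry points above, from a hypothetical kernel client; "example" and the
 * comments are illustrative. Per new_lockspace(), lvblen must be a non-zero
 * multiple of 8, and creating a lockspace whose name already exists returns
 * -EEXIST with *lockspace set to the existing one. */

static int example_client(void)
{
	void *ls;
	int error;

	error = dlm_new_lockspace("example", 7, &ls, 0, 32);
	if (error)
		return error;

	/* ... request, convert and unlock locks against ls ... */

	/* force=0: fails with -EBUSY if any LKBs remain in the lockspace */
	return dlm_release_lockspace(ls, 0);
}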
665
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..17bd3ba863a9
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOCKSPACE_DOT_H__
15#define __LOCKSPACE_DOT_H__
16
17int dlm_lockspace_init(void);
18void dlm_lockspace_exit(void);
19struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
20struct dlm_ls *dlm_find_lockspace_local(void *id);
21void dlm_put_lockspace(struct dlm_ls *ls);
22
23#endif /* __LOCKSPACE_DOT_H__ */
24
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..09b0124f7fc4
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1218 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * lowcomms.c
16 *
17 * This is the "low-level" comms layer.
18 *
19 * It is responsible for sending/receiving messages
20 * to/from other nodes in the cluster.
21 *
22 * Cluster nodes are referred to by their nodeids. nodeids are
23 * simply 32 bit numbers to the locking module - if they need to
24 * be expanded for the cluster infrastructure then that is its
25 * responsibility. It is this layer's responsibility to resolve
26 * these nodeids into IP addresses or whatever else it needs for
27 * inter-node communication.
28 *
29 * The comms level is two kernel threads that deal mainly with
30 * the receiving of messages from other nodes and passing them
31 * up to the mid-level comms layer (which understands the
32 * message format) for execution by the locking core, and
33 * a send thread which does all the setting up of connections
34 * to remote nodes and the sending of data. Threads are not allowed
35 * to send their own data because it may cause them to wait in times
36 * of high load. Also, this way, the sending thread can collect together
37 * messages bound for one node and send them in one block.
38 *
39 * I don't see any problem with the recv thread executing the locking
40 * code on behalf of remote processes as the locking code is
41 * short, efficient and never (well, hardly ever) waits.
42 *
43 */
44
45#include <asm/ioctls.h>
46#include <net/sock.h>
47#include <net/tcp.h>
48#include <net/sctp/user.h>
49#include <linux/pagemap.h>
50#include <linux/socket.h>
51#include <linux/idr.h>
52
53#include "dlm_internal.h"
54#include "lowcomms.h"
55#include "config.h"
56#include "midcomms.h"
57
58static struct sockaddr_storage *local_addr[DLM_MAX_ADDR_COUNT];
59static int local_count;
60static int local_nodeid;
61
62/* One of these per connected node */
63
64#define NI_INIT_PENDING 1
65#define NI_WRITE_PENDING 2
66
67struct nodeinfo {
68 spinlock_t lock;
69 sctp_assoc_t assoc_id;
70 unsigned long flags;
71 struct list_head write_list; /* nodes with pending writes */
72 struct list_head writequeue; /* outgoing writequeue_entries */
73 spinlock_t writequeue_lock;
74 int nodeid;
75};
76
77static DEFINE_IDR(nodeinfo_idr);
78static struct rw_semaphore nodeinfo_lock;
79static int max_nodeid;
80
81struct cbuf {
82 unsigned base;
83 unsigned len;
84 unsigned mask;
85};
86
87/* Just the one of these, now. But this struct keeps
88 the connection-specific variables together */
89
90#define CF_READ_PENDING 1
91
92struct connection {
93 struct socket *sock;
94 unsigned long flags;
95 struct page *rx_page;
96 atomic_t waiting_requests;
97 struct cbuf cb;
98 int eagain_flag;
99};
100
101/* An entry waiting to be sent */
102
103struct writequeue_entry {
104 struct list_head list;
105 struct page *page;
106 int offset;
107 int len;
108 int end;
109 int users;
110 struct nodeinfo *ni;
111};
112
113#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
114#define CBUF_EMPTY(cb) ((cb)->len == 0)
115#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
116#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
117
118#define CBUF_INIT(cb, size) \
119do { \
120 (cb)->base = (cb)->len = 0; \
121 (cb)->mask = ((size)-1); \
122} while(0)
123
124#define CBUF_EAT(cb, n) \
125do { \
126 (cb)->len -= (n); \
127 (cb)->base += (n); \
128 (cb)->base &= (cb)->mask; \
129} while(0)
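
/* Editor's note: a minimal sketch (not part of the commit) exercising the
 * circular-buffer macros above. The size must be a power of two so the mask
 * works; CBUF_DATA() gives the offset where new data lands, wrapping past
 * the end of the buffer, which is why receive_from_sock() below reads into
 * two iovecs. */

static void cbuf_example(void)
{
	struct cbuf cb;

	CBUF_INIT(&cb, 16);	/* base=0, len=0, mask=15 */
	CBUF_ADD(&cb, 10);	/* data occupies offsets 0..9 */
	CBUF_EAT(&cb, 8);	/* consume 8: base=8, len=2 */
	CBUF_ADD(&cb, 10);	/* len=12; CBUF_DATA == (8+12)&15 == 4, wrapped */
}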
130
131
132/* List of nodes which have writes pending */
133static struct list_head write_nodes;
134static spinlock_t write_nodes_lock;
135
136/* Maximum number of incoming messages to process before
137 * doing a schedule()
138 */
139#define MAX_RX_MSG_COUNT 25
140
141/* Manage daemons */
142static struct task_struct *recv_task;
143static struct task_struct *send_task;
144static wait_queue_head_t lowcomms_recv_wait;
145static atomic_t accepting;
146
147/* The SCTP connection */
148static struct connection sctp_con;
149
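
/* Editor's note: a minimal sketch (not part of the commit) of the send-side
 * thread described in this file's header comment, built on the structures
 * above; example_send_daemon and send_to_sock are hypothetical names, and
 * <linux/kthread.h> is assumed. The real send thread also batches the
 * writequeue entries queued per node. */

static int example_send_daemon(void *data)
{
	struct nodeinfo *ni;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (list_empty(&write_nodes))
			schedule();	/* until wake_up_process(send_task) */
		set_current_state(TASK_RUNNING);

		spin_lock_bh(&write_nodes_lock);
		while (!list_empty(&write_nodes)) {
			ni = list_entry(write_nodes.next, struct nodeinfo,
					write_list);
			list_del(&ni->write_list);
			clear_bit(NI_WRITE_PENDING, &ni->flags);
			spin_unlock_bh(&write_nodes_lock);

			/* send_to_sock(ni); -- flush ni->writequeue here */

			spin_lock_bh(&write_nodes_lock);
		}
		spin_unlock_bh(&write_nodes_lock);
	}
	return 0;
}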
150
151static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
152{
153 struct sockaddr_storage addr;
154 int error;
155
156 if (!local_count)
157 return -1;
158
159 error = dlm_nodeid_to_addr(nodeid, &addr);
160 if (error)
161 return error;
162
163 if (local_addr[0]->ss_family == AF_INET) {
164 struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
165 struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
166 ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
167 } else {
168 struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
169 struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
170 memcpy(&ret6->sin6_addr, &in6->sin6_addr,
171 sizeof(in6->sin6_addr));
172 }
173
174 return 0;
175}
176
177static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc)
178{
179 struct nodeinfo *ni;
180 int r;
181 int n;
182
183 down_read(&nodeinfo_lock);
184 ni = idr_find(&nodeinfo_idr, nodeid);
185 up_read(&nodeinfo_lock);
186
187 if (!ni && alloc) {
188 down_write(&nodeinfo_lock);
189
190 ni = idr_find(&nodeinfo_idr, nodeid);
191 if (ni)
192 goto out_up;
193
194 r = idr_pre_get(&nodeinfo_idr, alloc);
195 if (!r)
196 goto out_up;
197
198 ni = kmalloc(sizeof(struct nodeinfo), alloc);
199 if (!ni)
200 goto out_up;
201
202 r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
203 if (r) {
204 kfree(ni);
205 ni = NULL;
206 goto out_up;
207 }
208 if (n != nodeid) {
209 idr_remove(&nodeinfo_idr, n);
210 kfree(ni);
211 ni = NULL;
212 goto out_up;
213 }
214 memset(ni, 0, sizeof(struct nodeinfo));
215 spin_lock_init(&ni->lock);
216 INIT_LIST_HEAD(&ni->writequeue);
217 spin_lock_init(&ni->writequeue_lock);
218 ni->nodeid = nodeid;
219
220 if (nodeid > max_nodeid)
221 max_nodeid = nodeid;
222 out_up:
223 up_write(&nodeinfo_lock);
224 }
225
226 return ni;
227}
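
/* Editor's note (not part of the commit): the allocation path above is a
 * double-checked lookup -- the idr is searched again under the write lock,
 * so two callers racing to add the same nodeid cannot both insert a
 * nodeinfo; if idr_get_new_above() hands back an unexpected id, the entry
 * is backed out and the lookup fails rather than aliasing another node. */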
228
229/* Don't call this too often... */
230static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
231{
232 int i;
233 struct nodeinfo *ni;
234
235 for (i=1; i<=max_nodeid; i++) {
236 ni = nodeid2nodeinfo(i, 0);
237 if (ni && ni->assoc_id == assoc)
238 return ni;
239 }
240 return NULL;
241}
242
243/* Data or notification available on socket */
244static void lowcomms_data_ready(struct sock *sk, int count_unused)
245{
246 atomic_inc(&sctp_con.waiting_requests);
247 if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
248 return;
249
250 wake_up_interruptible(&lowcomms_recv_wait);
251}
252
253
254/* Add the port number to an IPv6 or IPv4 sockaddr and return the address
255 length. Also pad out the struct with zeros to make comparisons meaningful */
256
257static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
258 int *addr_len)
259{
260 struct sockaddr_in *local4_addr;
261 struct sockaddr_in6 *local6_addr;
262
263 if (!local_count)
264 return;
265
266 if (!port) {
267 if (local_addr[0]->ss_family == AF_INET) {
268 local4_addr = (struct sockaddr_in *)local_addr[0];
269 port = be16_to_cpu(local4_addr->sin_port);
270 } else {
271 local6_addr = (struct sockaddr_in6 *)local_addr[0];
272 port = be16_to_cpu(local6_addr->sin6_port);
273 }
274 }
275
276 saddr->ss_family = local_addr[0]->ss_family;
277 if (local_addr[0]->ss_family == AF_INET) {
278 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
279 in4_addr->sin_port = cpu_to_be16(port);
280 memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
281 memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
282 sizeof(struct sockaddr_in));
283 *addr_len = sizeof(struct sockaddr_in);
284 } else {
285 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
286 in6_addr->sin6_port = cpu_to_be16(port);
287 memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
288 sizeof(struct sockaddr_in6));
289 *addr_len = sizeof(struct sockaddr_in6);
290 }
291}
292
293/* Close the connection and tidy up */
294static void close_connection(void)
295{
296 if (sctp_con.sock) {
297 sock_release(sctp_con.sock);
298 sctp_con.sock = NULL;
299 }
300
301 if (sctp_con.rx_page) {
302 __free_page(sctp_con.rx_page);
303 sctp_con.rx_page = NULL;
304 }
305}
306
307/* We only send shutdown messages to nodes that are not part of the cluster */
308static void send_shutdown(sctp_assoc_t associd)
309{
310 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
311 struct msghdr outmessage;
312 struct cmsghdr *cmsg;
313 struct sctp_sndrcvinfo *sinfo;
314 int ret;
315
316 outmessage.msg_name = NULL;
317 outmessage.msg_namelen = 0;
318 outmessage.msg_control = outcmsg;
319 outmessage.msg_controllen = sizeof(outcmsg);
320 outmessage.msg_flags = MSG_EOR;
321
322 cmsg = CMSG_FIRSTHDR(&outmessage);
323 cmsg->cmsg_level = IPPROTO_SCTP;
324 cmsg->cmsg_type = SCTP_SNDRCV;
325 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
326 outmessage.msg_controllen = cmsg->cmsg_len;
327 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
328 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
329
330 sinfo->sinfo_flags |= MSG_EOF;
331 sinfo->sinfo_assoc_id = associd;
332
333 ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
334
335 if (ret != 0)
336 log_print("send EOF to node failed: %d", ret);
337}
338
339
340/* INIT failed but we don't know which node...
341 restart INIT on all pending nodes */
342static void init_failed(void)
343{
344 int i;
345 struct nodeinfo *ni;
346
347 for (i=1; i<=max_nodeid; i++) {
348 ni = nodeid2nodeinfo(i, 0);
349 if (!ni)
350 continue;
351
352 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
353 ni->assoc_id = 0;
354 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
355 spin_lock_bh(&write_nodes_lock);
356 list_add_tail(&ni->write_list, &write_nodes);
357 spin_unlock_bh(&write_nodes_lock);
358 }
359 }
360 }
361 wake_up_process(send_task);
362}
363
364/* Something happened to an association */
365static void process_sctp_notification(struct msghdr *msg, char *buf)
366{
367 union sctp_notification *sn = (union sctp_notification *)buf;
368
369 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
370 switch (sn->sn_assoc_change.sac_state) {
371
372 case SCTP_COMM_UP:
373 case SCTP_RESTART:
374 {
375 /* Check that the new node is in the lockspace */
376 struct sctp_prim prim;
377 mm_segment_t fs;
378 int nodeid;
379 int prim_len, ret;
380 int addr_len;
381 struct nodeinfo *ni;
382
383 /* This seems to happen when we receive a connection
384 * too early... or something... anyway, it happens but
385 * we always seem to get a real message too, see
386 * receive_from_sock */
387
388 if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
389 log_print("COMM_UP for invalid assoc ID %d",
390 (int)sn->sn_assoc_change.sac_assoc_id);
391 init_failed();
392 return;
393 }
394 memset(&prim, 0, sizeof(struct sctp_prim));
395 prim_len = sizeof(struct sctp_prim);
396 prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
397
398 fs = get_fs();
399 set_fs(get_ds());
400 ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
401 IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
402 (char*)&prim, &prim_len);
403 set_fs(fs);
404 if (ret < 0) {
405 struct nodeinfo *ni;
406
407 log_print("getsockopt/sctp_primary_addr on "
408 "new assoc %d failed : %d",
409 (int)sn->sn_assoc_change.sac_assoc_id, ret);
410
411 /* Retry INIT later */
412 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
413 if (ni)
414 clear_bit(NI_INIT_PENDING, &ni->flags);
415 return;
416 }
417 make_sockaddr(&prim.ssp_addr, 0, &addr_len);
418 if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
419 log_print("reject connect from unknown addr");
420 send_shutdown(prim.ssp_assoc_id);
421 return;
422 }
423
424 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
425 if (!ni)
426 return;
427
428 /* Save the assoc ID */
429 spin_lock(&ni->lock);
430 ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
431 spin_unlock(&ni->lock);
432
433 log_print("got new/restarted association %d nodeid %d",
434 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
435
436 /* Send any pending writes */
437 clear_bit(NI_INIT_PENDING, &ni->flags);
438 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
439 spin_lock_bh(&write_nodes_lock);
440 list_add_tail(&ni->write_list, &write_nodes);
441 spin_unlock_bh(&write_nodes_lock);
442 }
443 wake_up_process(send_task);
444 }
445 break;
446
447 case SCTP_COMM_LOST:
448 case SCTP_SHUTDOWN_COMP:
449 {
450 struct nodeinfo *ni;
451
452 ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
453 if (ni) {
454 spin_lock(&ni->lock);
455 ni->assoc_id = 0;
456 spin_unlock(&ni->lock);
457 }
458 }
459 break;
460
461 /* We don't know which INIT failed, so clear the PENDING flags
462 * on them all. If assoc_id is zero then it will try
463 * again */
464
465 case SCTP_CANT_STR_ASSOC:
466 {
467 log_print("Can't start SCTP association - retrying");
468 init_failed();
469 }
470 break;
471
472 default:
473 log_print("unexpected SCTP assoc change id=%d state=%d",
474 (int)sn->sn_assoc_change.sac_assoc_id,
475 sn->sn_assoc_change.sac_state);
476 }
477 }
478}
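
[Editor's note: the sac_state dispatch above mirrors what a user-space one-to-many SCTP application does with the standard lksctp notification types (RFC 6458 era API). A minimal sketch, assuming the buffer came from a recvmsg() that returned with MSG_NOTIFICATION set in msg_flags; error handling and the enclosing receive loop are omitted:

	#include <netinet/sctp.h>

	static void handle_notification(const char *buf)
	{
		const union sctp_notification *sn =
			(const union sctp_notification *)buf;

		if (sn->sn_header.sn_type != SCTP_ASSOC_CHANGE)
			return;

		switch (sn->sn_assoc_change.sac_state) {
		case SCTP_COMM_UP:
		case SCTP_RESTART:
			/* association sac_assoc_id is now usable for sends */
			break;
		case SCTP_COMM_LOST:
		case SCTP_SHUTDOWN_COMP:
			/* the peer is gone; forget its association id */
			break;
		case SCTP_CANT_STR_ASSOC:
			/* our association setup failed; retry later */
			break;
		}
	}
]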
479
480/* Data received from remote end */
481static int receive_from_sock(void)
482{
483 int ret = 0;
484 struct msghdr msg;
485 struct kvec iov[2];
486 unsigned len;
487 int r;
488 struct sctp_sndrcvinfo *sinfo;
489 struct cmsghdr *cmsg;
490 struct nodeinfo *ni;
491
492 /* These two are marginally too big for stack allocation, but this
493 * function is (currently) only called by dlm_recvd so static should be
494 * OK.
495 */
496 static struct sockaddr_storage msgname;
497 static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
498
499 if (sctp_con.sock == NULL)
500 goto out;
501
502 if (sctp_con.rx_page == NULL) {
503 /*
504 * This allocation doesn't strictly need to be atomic, but
505 * GFP_ATOMIC keeps the receive path from sleeping here.
506 */
507 sctp_con.rx_page = alloc_page(GFP_ATOMIC);
508 if (sctp_con.rx_page == NULL)
509 goto out_resched;
510 CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
511 }
512
513 memset(&incmsg, 0, sizeof(incmsg));
514 memset(&msgname, 0, sizeof(msgname));
515
516 msg.msg_iovlen = 1;
517 msg.msg_name = &msgname;
518 msg.msg_namelen = sizeof(msgname);
519 msg.msg_flags = 0;
520 msg.msg_control = incmsg;
521 msg.msg_controllen = sizeof(incmsg);
522
523 /* I don't see why this circular buffer stuff is necessary for SCTP
524 * which is a packet-based protocol, but the whole thing breaks under
525 * load without it! The overhead is minimal (and is in the TCP lowcomms
526 * anyway, of course) so I'll leave it in until I can figure out what's
527 * really happening.
528 */
529
530 /*
531 * iov[0] is the free space starting at the current end of data
532 * (cb.base + cb.len), running up to cb.base or to the page end.
533 */
534 iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
535 iov[0].iov_base = page_address(sctp_con.rx_page) +
536 CBUF_DATA(&sctp_con.cb);
537 iov[1].iov_len = 0;
538
539 /*
540 * If the free space wraps past the page end, iov[1] covers the
541 * bit between the start of the buffer and cb.base.
542 */
543 if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
544 iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
545 iov[1].iov_len = sctp_con.cb.base;
546 iov[1].iov_base = page_address(sctp_con.rx_page);
547 msg.msg_iovlen = 2;
548 }
549 len = iov[0].iov_len + iov[1].iov_len;
550
551 r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
552 MSG_NOSIGNAL | MSG_DONTWAIT);
553 if (ret <= 0)
554 goto out_close;
555
556 msg.msg_control = incmsg;
557 msg.msg_controllen = sizeof(incmsg);
558 cmsg = CMSG_FIRSTHDR(&msg);
559 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
560
561 if (msg.msg_flags & MSG_NOTIFICATION) {
562 process_sctp_notification(&msg, page_address(sctp_con.rx_page));
563 return 0;
564 }
565
566 /* Is this a new association ? */
567 ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
568 if (ni) {
569 ni->assoc_id = sinfo->sinfo_assoc_id;
570 if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
571
572 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
573 spin_lock_bh(&write_nodes_lock);
574 list_add_tail(&ni->write_list, &write_nodes);
575 spin_unlock_bh(&write_nodes_lock);
576 }
577 wake_up_process(send_task);
578 }
579 }
580
581 /* INIT sends a message with length of 1 - ignore it */
582 if (r == 1)
583 return 0;
584
585 CBUF_ADD(&sctp_con.cb, ret);
586 ret = dlm_process_incoming_buffer(le32_to_cpu(sinfo->sinfo_ppid),
587 page_address(sctp_con.rx_page),
588 sctp_con.cb.base, sctp_con.cb.len,
589 PAGE_CACHE_SIZE);
590 if (ret < 0)
591 goto out_close;
592 CBUF_EAT(&sctp_con.cb, ret);
593
594 out:
595 ret = 0;
596 goto out_ret;
597
598 out_resched:
599 lowcomms_data_ready(sctp_con.sock->sk, 0);
600 ret = 0;
601 schedule();
602 goto out_ret;
603
604 out_close:
605 if (ret != -EAGAIN)
606 log_print("error reading from sctp socket: %d", ret);
607 out_ret:
608 return ret;
609}
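
[Editor's note: the circular-buffer bookkeeping above is easier to see in isolation. A compilable sketch of the same two-iovec split over a power-of-two ring; head below corresponds to CBUF_DATA, and base/len have the same meaning as in the cb struct:

	#include <stddef.h>
	#include <sys/uio.h>

	/* Describe the free space of a power-of-two ring as at most two
	 * iovecs. Unread data starts at "base" and is "len" bytes long;
	 * new data is written at head = (base + len) & (sz - 1).
	 * Returns the iovec count. */
	static int ring_free_iovs(char *ring, size_t sz, size_t base,
				  size_t len, struct iovec iov[2])
	{
		size_t head = (base + len) & (sz - 1);

		if (head >= base) {
			/* free space wraps: [head, sz) then [0, base) */
			iov[0].iov_base = ring + head;
			iov[0].iov_len = sz - head;
			iov[1].iov_base = ring;
			iov[1].iov_len = base;
			return 2;
		}
		/* free space is the single run [head, base) */
		iov[0].iov_base = ring + head;
		iov[0].iov_len = base - head;
		return 1;
	}
]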
610
611/* Bind to an IP address. SCTP allows multiple addresses, so it can multi-home */
612static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
613{
614 mm_segment_t fs;
615 int result = 0;
616
617 fs = get_fs();
618 set_fs(get_ds());
619 if (num == 1)
620 result = sctp_con.sock->ops->bind(sctp_con.sock,
621 (struct sockaddr *) addr, addr_len);
622 else
623 result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
624 SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
625 set_fs(fs);
626
627 if (result < 0)
628 log_print("Can't bind to port %d addr number %d",
629 dlm_config.tcp_port, num);
630
631 return result;
632}
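
[Editor's note: in user space the SCTP_SOCKOPT_BINDX_ADD setsockopt is wrapped by sctp_bindx(3); the kernel has no such helper, hence the get_fs()/set_fs() dance above. A sketch of the equivalent user-space call, using the lksctp API:

	#include <sys/socket.h>
	#include <netinet/sctp.h>

	/* addrs is a packed array of sockaddr_in/sockaddr_in6 structures,
	 * addrcnt entries long; all get bound to the one socket. */
	static int bind_extra_addrs(int fd, struct sockaddr *addrs, int addrcnt)
	{
		return sctp_bindx(fd, addrs, addrcnt, SCTP_BINDX_ADD_ADDR);
	}
]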
633
634static void init_local(void)
635{
636 struct sockaddr_storage sas, *addr;
637 int i;
638
639 local_nodeid = dlm_our_nodeid();
640
641 for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
642 if (dlm_our_addr(&sas, i))
643 break;
644
645 addr = kmalloc(sizeof(*addr), GFP_KERNEL);
646 if (!addr)
647 break;
648 memcpy(addr, &sas, sizeof(*addr));
649 local_addr[local_count++] = addr;
650 }
651}
652
653/* Initialise SCTP socket and bind to all interfaces */
654static int init_sock(void)
655{
656 mm_segment_t fs;
657 struct socket *sock = NULL;
658 struct sockaddr_storage localaddr;
659 struct sctp_event_subscribe subscribe;
660 int result = -EINVAL, num = 1, i, addr_len;
661
662 if (!local_count) {
663 init_local();
664 if (!local_count) {
665 log_print("no local IP address has been set");
666 goto out;
667 }
668 }
669
670 result = sock_create_kern(local_addr[0]->ss_family, SOCK_SEQPACKET,
671 IPPROTO_SCTP, &sock);
672 if (result < 0) {
673 log_print("Can't create comms socket, check SCTP is loaded");
674 goto out;
675 }
676
677 /* Listen for events */
678 memset(&subscribe, 0, sizeof(subscribe));
679 subscribe.sctp_data_io_event = 1;
680 subscribe.sctp_association_event = 1;
681 subscribe.sctp_send_failure_event = 1;
682 subscribe.sctp_shutdown_event = 1;
683 subscribe.sctp_partial_delivery_event = 1;
684
685 fs = get_fs();
686 set_fs(get_ds());
687 result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
688 (char *)&subscribe, sizeof(subscribe));
689 set_fs(fs);
690
691 if (result < 0) {
692 log_print("Failed to set SCTP_EVENTS on socket: result=%d",
693 result);
694 goto create_delsock;
695 }
696
697 /* Init con struct */
698 sock->sk->sk_user_data = &sctp_con;
699 sctp_con.sock = sock;
700 sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
701
702 /* Bind to all interfaces. */
703 for (i = 0; i < local_count; i++) {
704 memcpy(&localaddr, local_addr[i], sizeof(localaddr));
705 make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
706
707 result = add_bind_addr(&localaddr, addr_len, num);
708 if (result)
709 goto create_delsock;
710 ++num;
711 }
712
713 result = sock->ops->listen(sock, 5);
714 if (result < 0) {
715 log_print("Can't set socket listening");
716 goto create_delsock;
717 }
718
719 return 0;
720
721 create_delsock:
722 sock_release(sock);
723 sctp_con.sock = NULL;
724 out:
725 return result;
726}
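
[Editor's note: the socket setup above (one-to-many SOCK_SEQPACKET socket plus SCTP_EVENTS subscription) has a direct user-space analogue, which may make the event plumbing clearer. A sketch, with make_dlm_style_socket() as a hypothetical helper name:

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/sctp.h>

	static int make_dlm_style_socket(void)
	{
		struct sctp_event_subscribe ev;
		int fd;

		fd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
		if (fd < 0)
			return -1;

		memset(&ev, 0, sizeof(ev));
		ev.sctp_data_io_event = 1;	/* sndrcvinfo cmsg on recvmsg */
		ev.sctp_association_event = 1;	/* SCTP_ASSOC_CHANGE events */
		ev.sctp_shutdown_event = 1;
		if (setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS,
			       &ev, sizeof(ev)) < 0) {
			close(fd);
			return -1;
		}
		return fd;	/* then bind/sctp_bindx and listen() */
	}
]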
727
728
729static struct writequeue_entry *new_writequeue_entry(int allocation)
730{
731 struct writequeue_entry *entry;
732
733 entry = kmalloc(sizeof(struct writequeue_entry), allocation);
734 if (!entry)
735 return NULL;
736
737 entry->page = alloc_page(allocation);
738 if (!entry->page) {
739 kfree(entry);
740 return NULL;
741 }
742
743 entry->offset = 0;
744 entry->len = 0;
745 entry->end = 0;
746 entry->users = 0;
747
748 return entry;
749}
750
751void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc)
752{
753 struct writequeue_entry *e;
754 int offset = 0;
755 int users = 0;
756 struct nodeinfo *ni;
757
758 if (!atomic_read(&accepting))
759 return NULL;
760
761 ni = nodeid2nodeinfo(nodeid, allocation);
762 if (!ni)
763 return NULL;
764
765 spin_lock(&ni->writequeue_lock);
766 e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
767 if (((struct list_head *) e == &ni->writequeue) ||
768 (PAGE_CACHE_SIZE - e->end < len)) {
769 e = NULL;
770 } else {
771 offset = e->end;
772 e->end += len;
773 users = e->users++;
774 }
775 spin_unlock(&ni->writequeue_lock);
776
777 if (e) {
778 got_one:
779 if (users == 0)
780 kmap(e->page);
781 *ppc = page_address(e->page) + offset;
782 return e;
783 }
784
785 e = new_writequeue_entry(allocation);
786 if (e) {
787 spin_lock(&ni->writequeue_lock);
788 offset = e->end;
789 e->end += len;
790 e->ni = ni;
791 users = e->users++;
792 list_add_tail(&e->list, &ni->writequeue);
793 spin_unlock(&ni->writequeue_lock);
794 goto got_one;
795 }
796 return NULL;
797}
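
[Editor's note: the write-queue scheme amortizes allocation by packing many small DLM messages into one page; "users" counts reservations handed out by get_buffer but not yet returned through commit_buffer. The core carving step, in a self-contained sketch where wq_entry stands in for writequeue_entry and locking is left to the caller, as above:

	struct wq_entry {
		char buf[4096];		/* stands in for the page */
		int end;		/* next free byte in buf */
		int users;		/* reserved but uncommitted buffers */
	};

	/* Reserve len bytes at the tail of e, or fail so the caller can
	 * allocate a fresh entry. */
	static char *wq_reserve(struct wq_entry *e, int len)
	{
		char *p;

		if ((int)sizeof(e->buf) - e->end < len)
			return NULL;
		p = e->buf + e->end;
		e->end += len;
		e->users++;
		return p;
	}
]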
798
799void dlm_lowcomms_commit_buffer(void *arg)
800{
801 struct writequeue_entry *e = (struct writequeue_entry *) arg;
802 int users;
803 struct nodeinfo *ni = e->ni;
804
805 if (!atomic_read(&accepting))
806 return;
807
808 spin_lock(&ni->writequeue_lock);
809 users = --e->users;
810 if (users)
811 goto out;
812 e->len = e->end - e->offset;
813 kunmap(e->page);
814 spin_unlock(&ni->writequeue_lock);
815
816 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
817 spin_lock_bh(&write_nodes_lock);
818 list_add_tail(&ni->write_list, &write_nodes);
819 spin_unlock_bh(&write_nodes_lock);
820 wake_up_process(send_task);
821 }
822 return;
823
824 out:
825 spin_unlock(&ni->writequeue_lock);
826 return;
827}
828
829static void free_entry(struct writequeue_entry *e)
830{
831 __free_page(e->page);
832 kfree(e);
833}
834
835/* Initiate an SCTP association. In theory we could just use sendmsg() on
836 the first IP address and it should work, but this allows us to set up the
837 association before sending any valuable data that we can't afford to lose.
838 It also keeps the send path clean as it can now always use the association ID */
839static void initiate_association(int nodeid)
840{
841 struct sockaddr_storage rem_addr;
842 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
843 struct msghdr outmessage;
844 struct cmsghdr *cmsg;
845 struct sctp_sndrcvinfo *sinfo;
846 int ret;
847 int addrlen;
848 char buf[1];
849 struct kvec iov[1];
850 struct nodeinfo *ni;
851
852 log_print("Initiating association with node %d", nodeid);
853
854 ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
855 if (!ni)
856 return;
857
858 if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
859 log_print("no address for nodeid %d", nodeid);
860 return;
861 }
862
863 make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
864
865 outmessage.msg_name = &rem_addr;
866 outmessage.msg_namelen = addrlen;
867 outmessage.msg_control = outcmsg;
868 outmessage.msg_controllen = sizeof(outcmsg);
869 outmessage.msg_flags = MSG_EOR;
870
871 iov[0].iov_base = buf;
872 iov[0].iov_len = 1;
873
874 /* Real INIT messages seem to cause trouble. Just send a 1 byte message
875 we can afford to lose */
876 cmsg = CMSG_FIRSTHDR(&outmessage);
877 cmsg->cmsg_level = IPPROTO_SCTP;
878 cmsg->cmsg_type = SCTP_SNDRCV;
879 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
880 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
881 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
882 sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
883
884 outmessage.msg_controllen = cmsg->cmsg_len;
885 ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
886 if (ret < 0) {
887 log_print("send INIT to node failed: %d", ret);
888 /* Try again later */
889 clear_bit(NI_INIT_PENDING, &ni->flags);
890 }
891}
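
[Editor's note: the cmsg construction above is the standard SCTP ancillary-data pattern and looks the same in user space. A sketch that stamps a pre-encoded nodeid into sinfo_ppid before sendmsg(); cbuf must be at least CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)) bytes, and the byte order of the ppid is just a convention (the code above uses little-endian):

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/sctp.h>

	static void set_ppid(struct msghdr *msg, char *cbuf, size_t cbuflen,
			     unsigned int nodeid_encoded)
	{
		struct cmsghdr *cmsg;
		struct sctp_sndrcvinfo *sinfo;

		msg->msg_control = cbuf;
		msg->msg_controllen = cbuflen;

		cmsg = CMSG_FIRSTHDR(msg);
		cmsg->cmsg_level = IPPROTO_SCTP;
		cmsg->cmsg_type = SCTP_SNDRCV;
		cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));

		sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
		memset(sinfo, 0, sizeof(*sinfo));
		sinfo->sinfo_ppid = nodeid_encoded;

		/* trim to the one cmsg actually filled in */
		msg->msg_controllen = cmsg->cmsg_len;
	}
]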
892
893/* Send a message */
894static int send_to_sock(struct nodeinfo *ni)
895{
896 int ret = 0;
897 struct writequeue_entry *e;
898 int len, offset;
899 struct msghdr outmsg;
900 static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
901 struct cmsghdr *cmsg;
902 struct sctp_sndrcvinfo *sinfo;
903 struct kvec iov;
904
905 /* See if we need to init an association before we start
906 sending precious messages */
907 spin_lock(&ni->lock);
908 if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
909 spin_unlock(&ni->lock);
910 initiate_association(ni->nodeid);
911 return 0;
912 }
913 spin_unlock(&ni->lock);
914
915 outmsg.msg_name = NULL; /* We use assoc_id */
916 outmsg.msg_namelen = 0;
917 outmsg.msg_control = outcmsg;
918 outmsg.msg_controllen = sizeof(outcmsg);
919 outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
920
921 cmsg = CMSG_FIRSTHDR(&outmsg);
922 cmsg->cmsg_level = IPPROTO_SCTP;
923 cmsg->cmsg_type = SCTP_SNDRCV;
924 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
925 sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
926 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
927 sinfo->sinfo_ppid = cpu_to_le32(local_nodeid);
928 sinfo->sinfo_assoc_id = ni->assoc_id;
929 outmsg.msg_controllen = cmsg->cmsg_len;
930
931 spin_lock(&ni->writequeue_lock);
932 for (;;) {
933 if (list_empty(&ni->writequeue))
934 break;
935 e = list_entry(ni->writequeue.next, struct writequeue_entry,
936 list);
937 kmap(e->page);
938 len = e->len;
939 offset = e->offset;
940 BUG_ON(len == 0 && e->users == 0);
941 spin_unlock(&ni->writequeue_lock);
942
943 ret = 0;
944 if (len) {
945 iov.iov_base = page_address(e->page)+offset;
946 iov.iov_len = len;
947
948 ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
949 len);
950 if (ret == -EAGAIN) {
951 sctp_con.eagain_flag = 1;
952 goto out;
953 } else if (ret < 0)
954 goto send_error;
955 } else {
956 /* Don't starve people filling buffers */
957 schedule();
958 }
959
960 spin_lock(&ni->writequeue_lock);
961 e->offset += ret;
962 e->len -= ret;
963
964 if (e->len == 0 && e->users == 0) {
965 list_del(&e->list);
966 free_entry(e);
967 continue;
968 }
969 }
970 spin_unlock(&ni->writequeue_lock);
971 out:
972 return ret;
973
974 send_error:
975 log_print("Error sending to node %d %d", ni->nodeid, ret);
976 spin_lock(&ni->lock);
977 if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
978 ni->assoc_id = 0;
979 spin_unlock(&ni->lock);
980 initiate_association(ni->nodeid);
981 } else
982 spin_unlock(&ni->lock);
983
984 return ret;
985}
986
987/* Try to send any messages that are pending */
988static void process_output_queue(void)
989{
990 struct list_head *list;
991 struct list_head *temp;
992
993 spin_lock_bh(&write_nodes_lock);
994 list_for_each_safe(list, temp, &write_nodes) {
995 struct nodeinfo *ni =
996 list_entry(list, struct nodeinfo, write_list);
997 clear_bit(NI_WRITE_PENDING, &ni->flags);
998 list_del(&ni->write_list);
999
1000 spin_unlock_bh(&write_nodes_lock);
1001
1002 send_to_sock(ni);
1003 spin_lock_bh(&write_nodes_lock);
1004 }
1005 spin_unlock_bh(&write_nodes_lock);
1006}
1007
1008/* Called after we've had -EAGAIN and been woken up */
1009static void refill_write_queue(void)
1010{
1011 int i;
1012
1013 for (i = 1; i <= max_nodeid; i++) {
1014 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1015
1016 if (ni) {
1017 if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
1018 spin_lock_bh(&write_nodes_lock);
1019 list_add_tail(&ni->write_list, &write_nodes);
1020 spin_unlock_bh(&write_nodes_lock);
1021 }
1022 }
1023 }
1024}
1025
1026static void clean_one_writequeue(struct nodeinfo *ni)
1027{
1028 struct list_head *list;
1029 struct list_head *temp;
1030
1031 spin_lock(&ni->writequeue_lock);
1032 list_for_each_safe(list, temp, &ni->writequeue) {
1033 struct writequeue_entry *e =
1034 list_entry(list, struct writequeue_entry, list);
1035 list_del(&e->list);
1036 free_entry(e);
1037 }
1038 spin_unlock(&ni->writequeue_lock);
1039}
1040
1041static void clean_writequeues(void)
1042{
1043 int i;
1044
1045 for (i = 1; i <= max_nodeid; i++) {
1046 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1047 if (ni)
1048 clean_one_writequeue(ni);
1049 }
1050}
1051
1052
1053static void dealloc_nodeinfo(void)
1054{
1055 int i;
1056
1057 for (i = 1; i <= max_nodeid; i++) {
1058 struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
1059 if (ni) {
1060 idr_remove(&nodeinfo_idr, i);
1061 kfree(ni);
1062 }
1063 }
1064}
1065
1066static int write_list_empty(void)
1067{
1068 int status;
1069
1070 spin_lock_bh(&write_nodes_lock);
1071 status = list_empty(&write_nodes);
1072 spin_unlock_bh(&write_nodes_lock);
1073
1074 return status;
1075}
1076
1077static int dlm_recvd(void *data)
1078{
1079 DECLARE_WAITQUEUE(wait, current);
1080
1081 while (!kthread_should_stop()) {
1082 int count = 0;
1083
1084 set_current_state(TASK_INTERRUPTIBLE);
1085 add_wait_queue(&lowcomms_recv_wait, &wait);
1086 if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
1087 schedule();
1088 remove_wait_queue(&lowcomms_recv_wait, &wait);
1089 set_current_state(TASK_RUNNING);
1090
1091 if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
1092 int ret;
1093
1094 do {
1095 ret = receive_from_sock();
1096
1097 /* Don't starve out everyone else */
1098 if (++count >= MAX_RX_MSG_COUNT) {
1099 schedule();
1100 count = 0;
1101 }
1102 } while (!kthread_should_stop() && ret >= 0);
1103 }
1104 schedule();
1105 }
1106
1107 return 0;
1108}
1109
1110static int dlm_sendd(void *data)
1111{
1112 DECLARE_WAITQUEUE(wait, current);
1113
1114 add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1115
1116 while (!kthread_should_stop()) {
1117 set_current_state(TASK_INTERRUPTIBLE);
1118 if (write_list_empty())
1119 schedule();
1120 set_current_state(TASK_RUNNING);
1121
1122 if (sctp_con.eagain_flag) {
1123 sctp_con.eagain_flag = 0;
1124 refill_write_queue();
1125 }
1126 process_output_queue();
1127 }
1128
1129 remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
1130
1131 return 0;
1132}
1133
1134static void daemons_stop(void)
1135{
1136 kthread_stop(recv_task);
1137 kthread_stop(send_task);
1138}
1139
1140static int daemons_start(void)
1141{
1142 struct task_struct *p;
1143 int error;
1144
1145 p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
1146 if (IS_ERR(p)) {
1147 error = PTR_ERR(p);
1148 log_print("can't start dlm_recvd %d", error);
1149 return error;
1150 }
1151 recv_task = p;
1152 
1153 p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
1154 if (IS_ERR(p)) {
1155 error = PTR_ERR(p);
1156 log_print("can't start dlm_sendd %d", error);
1157 kthread_stop(recv_task);
1158 return error;
1159 }
1160 send_task = p;
1161
1162 return 0;
1163}
1164
1165/*
1166 * This is quite likely to sleep...
1167 */
1168int dlm_lowcomms_start(void)
1169{
1170 int error;
1171
1172 spin_lock_init(&write_nodes_lock);
1173 INIT_LIST_HEAD(&write_nodes);
1174 init_rwsem(&nodeinfo_lock);
1175
1176 error = init_sock();
1177 if (error)
1178 goto fail_sock;
1179 error = daemons_start();
1180 if (error)
1181 goto fail_sock;
1182 atomic_set(&accepting, 1);
1183 return 0;
1184
1185 fail_sock:
1186 close_connection();
1187 return error;
1188}
1189
1190/* Set all the activity flags to prevent any socket activity. */
1191
1192void dlm_lowcomms_stop(void)
1193{
1194 atomic_set(&accepting, 0);
1195 sctp_con.flags = 0x7;
1196 daemons_stop();
1197 clean_writequeues();
1198 close_connection();
1199 dealloc_nodeinfo();
1200 max_nodeid = 0;
1201}
1202
1203int dlm_lowcomms_init(void)
1204{
1205 init_waitqueue_head(&lowcomms_recv_wait);
1206 return 0;
1207}
1208
1209void dlm_lowcomms_exit(void)
1210{
1211 int i;
1212
1213 for (i = 0; i < local_count; i++)
1214 kfree(local_addr[i]);
1215 local_count = 0;
1216 local_nodeid = 0;
1217}
1218
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..3af8035ff12f
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,25 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __LOWCOMMS_DOT_H__
15#define __LOWCOMMS_DOT_H__
16
17int dlm_lowcomms_init(void);
18void dlm_lowcomms_exit(void);
19int dlm_lowcomms_start(void);
20void dlm_lowcomms_stop(void);
21void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
22void dlm_lowcomms_commit_buffer(void *mh);
23
24#endif /* __LOWCOMMS_DOT_H__ */
25
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __LVB_TABLE_DOT_H__
14#define __LVB_TABLE_DOT_H__
15
16extern const int dlm_lvb_operations[8][8];
17
18#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..81bf4cb22033
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,89 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "lock.h"
17#include "memory.h"
18#include "lowcomms.h"
19#include "config.h"
20
21#ifdef CONFIG_DLM_DEBUG
22int dlm_register_debugfs(void);
23void dlm_unregister_debugfs(void);
24#else
25static inline int dlm_register_debugfs(void) { return 0; }
26static inline void dlm_unregister_debugfs(void) { }
27#endif
28
29static int __init init_dlm(void)
30{
31 int error;
32
33 error = dlm_memory_init();
34 if (error)
35 goto out;
36
37 error = dlm_lockspace_init();
38 if (error)
39 goto out_mem;
40
41 error = dlm_config_init();
42 if (error)
43 goto out_lockspace;
44
45 error = dlm_register_debugfs();
46 if (error)
47 goto out_config;
48
49 error = dlm_lowcomms_init();
50 if (error)
51 goto out_debug;
52
53 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
54
55 return 0;
56
57 out_debug:
58 dlm_unregister_debugfs();
59 out_config:
60 dlm_config_exit();
61 out_lockspace:
62 dlm_lockspace_exit();
63 out_mem:
64 dlm_memory_exit();
65 out:
66 return error;
67}
68
69static void __exit exit_dlm(void)
70{
71 dlm_lowcomms_exit();
72 dlm_config_exit();
73 dlm_memory_exit();
74 dlm_lockspace_exit();
75 dlm_unregister_debugfs();
76}
77
78module_init(init_dlm);
79module_exit(exit_dlm);
80
81MODULE_DESCRIPTION("Distributed Lock Manager");
82MODULE_AUTHOR("Red Hat, Inc.");
83MODULE_LICENSE("GPL");
84
85EXPORT_SYMBOL_GPL(dlm_new_lockspace);
86EXPORT_SYMBOL_GPL(dlm_release_lockspace);
87EXPORT_SYMBOL_GPL(dlm_lock);
88EXPORT_SYMBOL_GPL(dlm_unlock);
89
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..926cd0cb6bff
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,313 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "lockspace.h"
15#include "member.h"
16#include "recoverd.h"
17#include "recover.h"
18#include "lowcomms.h"
19#include "rcom.h"
20#include "config.h"
21
22/*
23 * Following called by dlm_recoverd thread
24 */
25
26static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
27{
28 struct dlm_member *memb = NULL;
29 struct list_head *tmp;
30 struct list_head *newlist = &new->list;
31 struct list_head *head = &ls->ls_nodes;
32
33 list_for_each(tmp, head) {
34 memb = list_entry(tmp, struct dlm_member, list);
35 if (new->nodeid < memb->nodeid)
36 break;
37 }
38
39 if (!memb)
40 list_add_tail(newlist, head);
41 else {
42 /* FIXME: can use list macro here */
43 newlist->prev = tmp->prev;
44 newlist->next = tmp;
45 tmp->prev->next = newlist;
46 tmp->prev = newlist;
47 }
48}
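
[Editor's note: on the FIXME above: list_add_tail(new, pos) inserts new immediately before pos, and when the loop runs to completion tmp ends up equal to head, so the open-coded splice and the if/else collapse to a single call. One way to write the equivalent:

	static void add_ordered_member(struct dlm_ls *ls,
				       struct dlm_member *new)
	{
		struct dlm_member *memb;
		struct list_head *tmp, *head = &ls->ls_nodes;

		list_for_each(tmp, head) {
			memb = list_entry(tmp, struct dlm_member, list);
			if (new->nodeid < memb->nodeid)
				break;
		}
		/* tmp is the first member with a larger nodeid, or head
		   itself if there is none; insert just before it */
		list_add_tail(&new->list, tmp);
	}
]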
49
50static int dlm_add_member(struct dlm_ls *ls, int nodeid)
51{
52 struct dlm_member *memb;
53 int w;
54
55 w = dlm_node_weight(ls->ls_name, nodeid);
56 if (w < 0)
57 return w;
58 
59 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
60 if (!memb)
61 return -ENOMEM;
62 
63 memb->nodeid = nodeid;
64 memb->weight = w;
65 add_ordered_member(ls, memb);
66 ls->ls_num_nodes++;
67 return 0;
68}
69
70static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
71{
72 list_move(&memb->list, &ls->ls_nodes_gone);
73 ls->ls_num_nodes--;
74}
75
76static int dlm_is_member(struct dlm_ls *ls, int nodeid)
77{
78 struct dlm_member *memb;
79
80 list_for_each_entry(memb, &ls->ls_nodes, list) {
81 if (memb->nodeid == nodeid)
82 return 1;
83 }
84 return 0;
85}
86
87int dlm_is_removed(struct dlm_ls *ls, int nodeid)
88{
89 struct dlm_member *memb;
90
91 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
92 if (memb->nodeid == nodeid)
93 return 1;
94 }
95 return 0;
96}
97
98static void clear_memb_list(struct list_head *head)
99{
100 struct dlm_member *memb;
101
102 while (!list_empty(head)) {
103 memb = list_entry(head->next, struct dlm_member, list);
104 list_del(&memb->list);
105 kfree(memb);
106 }
107}
108
109void dlm_clear_members(struct dlm_ls *ls)
110{
111 clear_memb_list(&ls->ls_nodes);
112 ls->ls_num_nodes = 0;
113}
114
115void dlm_clear_members_gone(struct dlm_ls *ls)
116{
117 clear_memb_list(&ls->ls_nodes_gone);
118}
119
120static void make_member_array(struct dlm_ls *ls)
121{
122 struct dlm_member *memb;
123 int i, w, x = 0, total = 0, all_zero = 0, *array;
124
125 kfree(ls->ls_node_array);
126 ls->ls_node_array = NULL;
127
128 list_for_each_entry(memb, &ls->ls_nodes, list) {
129 if (memb->weight)
130 total += memb->weight;
131 }
132
133 /* all nodes revert to weight of 1 if all have weight 0 */
134
135 if (!total) {
136 total = ls->ls_num_nodes;
137 all_zero = 1;
138 }
139
140 ls->ls_total_weight = total;
141
142 array = kmalloc(sizeof(int) * total, GFP_KERNEL);
143 if (!array)
144 return;
145
146 list_for_each_entry(memb, &ls->ls_nodes, list) {
147 if (!all_zero && !memb->weight)
148 continue;
149
150 if (all_zero)
151 w = 1;
152 else
153 w = memb->weight;
154
155 DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
156
157 for (i = 0; i < w; i++)
158 array[x++] = memb->nodeid;
159 }
160
161 ls->ls_node_array = array;
162}
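
[Editor's note: a worked example of what make_member_array() builds: members with (nodeid, weight) of (10, 2), (20, 0) and (30, 1) give ls_total_weight = 3 and ls_node_array = {10, 10, 30}. A weighted lookup over that array, sketched below, then lands on node 10 about twice as often as on node 30 and never on node 20; the actual hashing used by the directory code lives elsewhere in this patch:

	#include <stdint.h>

	/* weights {2, 0, 1} for nodes {10, 20, 30} expand to {10, 10, 30} */
	static int pick_node(uint32_t hash, int *node_array, int total_weight)
	{
		return node_array[hash % total_weight];
	}
]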
163
164/* send a status request to all members just to establish comms connections */
165
166static void ping_members(struct dlm_ls *ls)
167{
168 struct dlm_member *memb;
169 list_for_each_entry(memb, &ls->ls_nodes, list)
170 dlm_rcom_status(ls, memb->nodeid);
171}
172
173int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
174{
175 struct dlm_member *memb, *safe;
176 int i, error, found, pos = 0, neg = 0, low = -1;
177
178 /* move departed members from ls_nodes to ls_nodes_gone */
179
180 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
181 found = 0;
182 for (i = 0; i < rv->node_count; i++) {
183 if (memb->nodeid == rv->nodeids[i]) {
184 found = 1;
185 break;
186 }
187 }
188
189 if (!found) {
190 neg++;
191 dlm_remove_member(ls, memb);
192 log_debug(ls, "remove member %d", memb->nodeid);
193 }
194 }
195
196 /* add new members to ls_nodes */
197
198 for (i = 0; i < rv->node_count; i++) {
199 if (dlm_is_member(ls, rv->nodeids[i]))
200 continue;
201 dlm_add_member(ls, rv->nodeids[i]);
202 pos++;
203 log_debug(ls, "add member %d", rv->nodeids[i]);
204 }
205
206 list_for_each_entry(memb, &ls->ls_nodes, list) {
207 if (low == -1 || memb->nodeid < low)
208 low = memb->nodeid;
209 }
210 ls->ls_low_nodeid = low;
211
212 make_member_array(ls);
213 dlm_set_recover_status(ls, DLM_RS_NODES);
214 *neg_out = neg;
215
216 ping_members(ls);
217
218 error = dlm_recover_members_wait(ls);
219 log_debug(ls, "total members %d", ls->ls_num_nodes);
220 return error;
221}
222
223/*
224 * Following called from lockspace.c
225 */
226
227int dlm_ls_stop(struct dlm_ls *ls)
228{
229 int new;
230
231 /*
232 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
233 * dlm_recovery_stopped()) and prevents any new locks from being
234 * processed (see RUNNING, dlm_locking_stopped()).
235 */
236
237 spin_lock(&ls->ls_recover_lock);
238 set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
239 new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
240 ls->ls_recover_seq++;
241 spin_unlock(&ls->ls_recover_lock);
242
243 /*
244 * This in_recovery lock does two things:
245 *
246 * 1) Keeps this function from returning until all threads are out
247 * of locking routines and locking is truly stopped.
248 * 2) Keeps any new requests from being processed until it's unlocked
249 * when recovery is complete.
250 */
251
252 if (new)
253 down_write(&ls->ls_in_recovery);
254
255 /*
256 * The recoverd suspend/resume makes sure that dlm_recoverd (if
257 * running) has noticed the clearing of RUNNING above and quit
258 * processing the previous recovery. This will be true for all nodes
259 * before any nodes start the new recovery.
260 */
261
262 dlm_recoverd_suspend(ls);
263 ls->ls_recover_status = 0;
264 dlm_recoverd_resume(ls);
265 return 0;
266}
267
268int dlm_ls_start(struct dlm_ls *ls)
269{
270 struct dlm_recover *rv = NULL, *rv_old;
271 int *ids = NULL;
272 int error, count;
273
274 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
275 if (!rv)
276 return -ENOMEM;
277
278 error = count = dlm_nodeid_list(ls->ls_name, &ids);
279 if (error <= 0)
280 goto fail;
281
282 spin_lock(&ls->ls_recover_lock);
283
284 /* the lockspace needs to be stopped before it can be started */
285
286 if (!dlm_locking_stopped(ls)) {
287 spin_unlock(&ls->ls_recover_lock);
288 log_error(ls, "start ignored: lockspace running");
289 error = -EINVAL;
290 goto fail;
291 }
292
293 rv->nodeids = ids;
294 rv->node_count = count;
295 rv->seq = ++ls->ls_recover_seq;
296 rv_old = ls->ls_recover_args;
297 ls->ls_recover_args = rv;
298 spin_unlock(&ls->ls_recover_lock);
299
300 if (rv_old) {
301 kfree(rv_old->nodeids);
302 kfree(rv_old);
303 }
304
305 dlm_recoverd_kick(ls);
306 return 0;
307
308 fail:
309 kfree(rv);
310 kfree(ids);
311 return error;
312}
313
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __MEMBER_DOT_H__
14#define __MEMBER_DOT_H__
15
16int dlm_ls_stop(struct dlm_ls *ls);
17int dlm_ls_start(struct dlm_ls *ls);
18void dlm_clear_members(struct dlm_ls *ls);
19void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22
23#endif /* __MEMBER_DOT_H__ */
24
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..f7cf4589fae8
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,106 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "config.h"
16#include "memory.h"
17
18static kmem_cache_t *lkb_cache;
19
20
21int dlm_memory_init(void)
22{
23 int ret = 0;
24
25 lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
26 __alignof__(struct dlm_lkb), 0, NULL, NULL);
27 if (!lkb_cache)
28 ret = -ENOMEM;
29 return ret;
30}
31
32void dlm_memory_exit(void)
33{
34 if (lkb_cache)
35 kmem_cache_destroy(lkb_cache);
36}
37
38char *allocate_lvb(struct dlm_ls *ls)
39{
40 char *p;
41
42 p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
43 if (p)
44 memset(p, 0, ls->ls_lvblen);
45 return p;
46}
47
48void free_lvb(char *p)
49{
50 kfree(p);
51}
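
[Editor's note: the kmalloc()+memset() pairs in this file are equivalent to single kzalloc() calls, which member.c in this same patch already uses; allocate_lvb(), for instance, could be written:

	char *allocate_lvb(struct dlm_ls *ls)
	{
		return kzalloc(ls->ls_lvblen, GFP_KERNEL);
	}
]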
52
53/* FIXME: have some minimal space built-in to rsb for the name and
54 kmalloc a separate name if needed, like dentries are done */
55
56struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
57{
58 struct dlm_rsb *r;
59
60 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
61
62 r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
63 if (r)
64 memset(r, 0, sizeof(*r) + namelen);
65 return r;
66}
67
68void free_rsb(struct dlm_rsb *r)
69{
70 if (r->res_lvbptr)
71 free_lvb(r->res_lvbptr);
72 kfree(r);
73}
74
75struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
76{
77 struct dlm_lkb *lkb;
78
79 lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
80 if (lkb)
81 memset(lkb, 0, sizeof(*lkb));
82 return lkb;
83}
84
85void free_lkb(struct dlm_lkb *lkb)
86{
87 kmem_cache_free(lkb_cache, lkb);
88}
89
90struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
91{
92 struct dlm_direntry *de;
93
94 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
95
96 de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
97 if (de)
98 memset(de, 0, sizeof(*de) + namelen);
99 return de;
100}
101
102void free_direntry(struct dlm_direntry *de)
103{
104 kfree(de);
105}
106
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MEMORY_DOT_H__
15#define __MEMORY_DOT_H__
16
17int dlm_memory_init(void);
18void dlm_memory_exit(void);
19struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
20void free_rsb(struct dlm_rsb *r);
21struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
22void free_lkb(struct dlm_lkb *l);
23struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
24void free_direntry(struct dlm_direntry *de);
25char *allocate_lvb(struct dlm_ls *ls);
26void free_lvb(char *l);
27
28#endif /* __MEMORY_DOT_H__ */
29
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/*
15 * midcomms.c
16 *
17 * This is the appallingly named "mid-level" comms layer.
18 *
19 * Its purpose is to take buffers from the "real" comms layer,
20 * split them up into complete messages and pass them to the
21 * interested part of the locking mechanism.
22 *
23 * It also takes messages from the locking layer, formats them
24 * into packets and sends them to the comms layer.
25 */
26
27#include "dlm_internal.h"
28#include "lowcomms.h"
29#include "config.h"
30#include "rcom.h"
31#include "lock.h"
32#include "midcomms.h"
33
34
35static void copy_from_cb(void *dst, const void *base, unsigned offset,
36 unsigned len, unsigned limit)
37{
38 unsigned copy = len;
39
40 if ((copy + offset) > limit)
41 copy = limit - offset;
42 memcpy(dst, base + offset, copy);
43 len -= copy;
44 if (len)
45 memcpy(dst + copy, base, len);
46}
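
[Editor's note: a quick worked example of the wrap-around path in copy_from_cb(): with base pointing at "ABCDEFGH" (limit 8), offset 6 and len 4, the first memcpy takes "GH" from the tail and the second takes "AB" from the head:

	char dst[4];

	copy_from_cb(dst, "ABCDEFGH", 6, 4, 8);	/* dst now holds "GHAB" */
]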
47
48/*
49 * Called from the low-level comms layer to process a buffer of
50 * commands.
51 *
52 * Only complete messages are processed here, any "spare" bytes from
53 * the end of a buffer are saved and tacked onto the front of the next
54 * message that comes in. I doubt this will happen very often but we
55 * need to be able to cope with it and I don't want the task to be waiting
56 * for packets to come in when there is useful work to be done.
57 */
58
59int dlm_process_incoming_buffer(int nodeid, const void *base,
60 unsigned offset, unsigned len, unsigned limit)
61{
62 unsigned char __tmp[DLM_INBUF_LEN];
63 struct dlm_header *msg = (struct dlm_header *) __tmp;
64 int ret = 0;
65 int err = 0;
66 uint16_t msglen;
67 uint32_t lockspace;
68
69 while (len > sizeof(struct dlm_header)) {
70
71 /* Copy just the header to check the total length. The
72 message may wrap around the end of the buffer back to the
73 start, so we need to use a temp buffer and copy_from_cb. */
74
75 copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
76 limit);
77
78 msglen = le16_to_cpu(msg->h_length);
79 lockspace = msg->h_lockspace;
80
81 err = -EINVAL;
82 if (msglen < sizeof(struct dlm_header))
83 break;
84 err = -E2BIG;
85 if (msglen > dlm_config.buffer_size) {
86 log_print("message size %d from %d too big, buf len %d",
87 msglen, nodeid, len);
88 break;
89 }
90 err = 0;
91
92 /* If only part of the full message is contained in this
93 buffer, then do nothing and wait for lowcomms to call
94 us again later with more data. We return 0 meaning
95 we've consumed none of the input buffer. */
96
97 if (msglen > len)
98 break;
99
100 /* Allocate a larger temp buffer if the full message won't fit
101 in the buffer on the stack (which should work for most
102 ordinary messages). */
103
104 if (msglen > sizeof(__tmp) &&
105 msg == (struct dlm_header *) __tmp) {
106 msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
107 if (msg == NULL)
108 return ret;
109 }
110
111 copy_from_cb(msg, base, offset, msglen, limit);
112
113 BUG_ON(lockspace != msg->h_lockspace);
114
115 ret += msglen;
116 offset += msglen;
117 offset &= (limit - 1);
118 len -= msglen;
119
120 switch (msg->h_cmd) {
121 case DLM_MSG:
122 dlm_receive_message(msg, nodeid, 0);
123 break;
124
125 case DLM_RCOM:
126 dlm_receive_rcom(msg, nodeid);
127 break;
128
129 default:
130 log_print("unknown msg type %x from %u: %u %u %u %u",
131 msg->h_cmd, nodeid, msglen, len, offset, ret);
132 }
133 }
134
135 if (msg != (struct dlm_header *) __tmp)
136 kfree(msg);
137
138 return err ? err : ret;
139}
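
[Editor's note: stripped of the ring-buffer details, the framing discipline above is: peek at the fixed header, learn the message length, and only ever consume whole messages, reporting back how many bytes were eaten. A flat-buffer sketch of the same loop, where struct hdr stands in for dlm_header:

	#include <stdint.h>
	#include <string.h>

	struct hdr { uint16_t len; };	/* stands in for dlm_header */

	static int consume_messages(const char *buf, unsigned int avail,
				    void (*deliver)(const char *, unsigned int))
	{
		unsigned int used = 0;

		while (avail - used > sizeof(struct hdr)) {
			struct hdr h;

			memcpy(&h, buf + used, sizeof(h));
			if (h.len < sizeof(h))
				return -1;	/* corrupt stream */
			if (h.len > avail - used)
				break;		/* partial message; wait */
			deliver(buf + used, h.len);
			used += h.len;
		}
		return used;	/* bytes consumed; the rest carries over */
	}
]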
140
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __MIDCOMMS_DOT_H__
15#define __MIDCOMMS_DOT_H__
16
17int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
18 unsigned len, unsigned limit);
19
20#endif /* __MIDCOMMS_DOT_H__ */
21
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..55fbe313340e
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,457 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "lowcomms.h"
18#include "midcomms.h"
19#include "rcom.h"
20#include "recover.h"
21#include "dir.h"
22#include "config.h"
23#include "memory.h"
24#include "lock.h"
25#include "util.h"
26
27
28static int rcom_response(struct dlm_ls *ls)
29{
30 return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
31}
32
33static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
34 struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
35{
36 struct dlm_rcom *rc;
37 struct dlm_mhandle *mh;
38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len;
40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len);
45 return -ENOBUFS;
46 }
47 memset(mb, 0, mb_len);
48
49 rc = (struct dlm_rcom *) mb;
50
51 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
52 rc->rc_header.h_lockspace = ls->ls_global_id;
53 rc->rc_header.h_nodeid = dlm_our_nodeid();
54 rc->rc_header.h_length = mb_len;
55 rc->rc_header.h_cmd = DLM_RCOM;
56
57 rc->rc_type = type;
58
59 *mh_ret = mh;
60 *rc_ret = rc;
61 return 0;
62}
63
64static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
65 struct dlm_rcom *rc)
66{
67 dlm_rcom_out(rc);
68 dlm_lowcomms_commit_buffer(mh);
69}
70
71/* When replying to a status request, a node also sends back its
72 configuration values. The requesting node then checks that the remote
73 node is configured the same way as itself. */
74
75static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
76{
77 rf->rf_lvblen = ls->ls_lvblen;
78 rf->rf_lsflags = ls->ls_exflags;
79}
80
81static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
82{
83 if (rf->rf_lvblen != ls->ls_lvblen ||
84 rf->rf_lsflags != ls->ls_exflags) {
85 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
86 ls->ls_lvblen, ls->ls_exflags,
87 nodeid, rf->rf_lvblen, rf->rf_lsflags);
88 return -EINVAL;
89 }
90 return 0;
91}
92
93int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
94{
95 struct dlm_rcom *rc;
96 struct dlm_mhandle *mh;
97 int error = 0;
98
99 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
100
101 if (nodeid == dlm_our_nodeid()) {
102 rc = (struct dlm_rcom *) ls->ls_recover_buf;
103 rc->rc_result = dlm_recover_status(ls);
104 goto out;
105 }
106
107 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
108 if (error)
109 goto out;
110
111 send_rcom(ls, mh, rc);
112
113 error = dlm_wait_function(ls, &rcom_response);
114 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
115 if (error)
116 goto out;
117
118 rc = (struct dlm_rcom *) ls->ls_recover_buf;
119
120 if (rc->rc_result == -ESRCH) {
121 /* we pretend the remote lockspace exists with 0 status */
122 log_debug(ls, "remote node %d not ready", nodeid);
123 rc->rc_result = 0;
124 } else
125 error = check_config(ls, (struct rcom_config *) rc->rc_buf,
126 nodeid);
127 /* the caller looks at rc_result for the remote recovery status */
128 out:
129 return error;
130}
131
132static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
133{
134 struct dlm_rcom *rc;
135 struct dlm_mhandle *mh;
136 int error, nodeid = rc_in->rc_header.h_nodeid;
137
138 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
139 sizeof(struct rcom_config), &rc, &mh);
140 if (error)
141 return;
142 rc->rc_result = dlm_recover_status(ls);
143 make_config(ls, (struct rcom_config *) rc->rc_buf);
144
145 send_rcom(ls, mh, rc);
146}
147
148static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
149{
150 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
151 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
152 wake_up(&ls->ls_wait_general);
153}
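
[Editor's note: the STATUS round trip is the flag-plus-waitqueue rendezvous used throughout recovery: the requester parks in dlm_wait_function() (see recover.c below) until the reply handler above fills ls_recover_buf, sets LSFL_RCOM_READY and wakes ls_wait_general. Reduced to its essentials, with send_status_request() as a placeholder:

	/* requester, in dlm_recoverd */
	send_status_request(ls, nodeid);
	wait_event(ls->ls_wait_general,
		   test_bit(LSFL_RCOM_READY, &ls->ls_flags) ||
		   dlm_recovery_stopped(ls));
	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
	/* the reply is now in ls->ls_recover_buf */

	/* reply handler, in dlm_recvd (receive_rcom_status_reply above) */
	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
	set_bit(LSFL_RCOM_READY, &ls->ls_flags);
	wake_up(&ls->ls_wait_general);
]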
154
155int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
156{
157 struct dlm_rcom *rc;
158 struct dlm_mhandle *mh;
159 int error = 0, len = sizeof(struct dlm_rcom);
160
161 memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
162
163 if (nodeid == dlm_our_nodeid()) {
164 dlm_copy_master_names(ls, last_name, last_len,
165 ls->ls_recover_buf + len,
166 dlm_config.buffer_size - len, nodeid);
167 goto out;
168 }
169
170 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
171 if (error)
172 goto out;
173 memcpy(rc->rc_buf, last_name, last_len);
174
175 send_rcom(ls, mh, rc);
176
177 error = dlm_wait_function(ls, &rcom_response);
178 clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
179 out:
180 return error;
181}
182
183static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
184{
185 struct dlm_rcom *rc;
186 struct dlm_mhandle *mh;
187 int error, inlen, outlen;
188 int nodeid = rc_in->rc_header.h_nodeid;
189 uint32_t status = dlm_recover_status(ls);
190
191 /*
192 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
193 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
194 * It could only happen in rare cases where we get a late NAMES
195 * message from a previous instance of recovery.
196 */
197
198 if (!(status & DLM_RS_NODES)) {
199 log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
200 return;
201 }
202
204 inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
205 outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
206
207 error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
208 if (error)
209 return;
210
211 dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
212 nodeid);
213 send_rcom(ls, mh, rc);
214}
215
216static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
217{
218 memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
219 set_bit(LSFL_RCOM_READY, &ls->ls_flags);
220 wake_up(&ls->ls_wait_general);
221}
222
223int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
224{
225 struct dlm_rcom *rc;
226 struct dlm_mhandle *mh;
227 struct dlm_ls *ls = r->res_ls;
228 int error;
229
230 error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
231 &rc, &mh);
232 if (error)
233 goto out;
234 memcpy(rc->rc_buf, r->res_name, r->res_length);
235 rc->rc_id = (unsigned long) r;
236
237 send_rcom(ls, mh, rc);
238 out:
239 return error;
240}
241
242static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
243{
244 struct dlm_rcom *rc;
245 struct dlm_mhandle *mh;
246 int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
247 int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
248
249 error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
250 if (error)
251 return;
252
253 error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
254 if (error)
255 ret_nodeid = error;
256 rc->rc_result = ret_nodeid;
257 rc->rc_id = rc_in->rc_id;
258
259 send_rcom(ls, mh, rc);
260}
261
262static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
263{
264 dlm_recover_master_reply(ls, rc_in);
265}
266
267static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
268 struct rcom_lock *rl)
269{
270 memset(rl, 0, sizeof(*rl));
271
272 rl->rl_ownpid = lkb->lkb_ownpid;
273 rl->rl_lkid = lkb->lkb_id;
274 rl->rl_exflags = lkb->lkb_exflags;
275 rl->rl_flags = lkb->lkb_flags;
276 rl->rl_lvbseq = lkb->lkb_lvbseq;
277 rl->rl_rqmode = lkb->lkb_rqmode;
278 rl->rl_grmode = lkb->lkb_grmode;
279 rl->rl_status = lkb->lkb_status;
280 rl->rl_wait_type = lkb->lkb_wait_type;
281
282 if (lkb->lkb_bastaddr)
283 rl->rl_asts |= AST_BAST;
284 if (lkb->lkb_astaddr)
285 rl->rl_asts |= AST_COMP;
286
287 rl->rl_namelen = r->res_length;
288 memcpy(rl->rl_name, r->res_name, r->res_length);
289
290 /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
291 If so, receive_rcom_lock_args() won't take this copy. */
292
293 if (lkb->lkb_lvbptr)
294 memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
295}
296
297int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
298{
299 struct dlm_ls *ls = r->res_ls;
300 struct dlm_rcom *rc;
301 struct dlm_mhandle *mh;
302 struct rcom_lock *rl;
303 int error, len = sizeof(struct rcom_lock);
304
305 if (lkb->lkb_lvbptr)
306 len += ls->ls_lvblen;
307
308 error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
309 if (error)
310 goto out;
311
312 rl = (struct rcom_lock *) rc->rc_buf;
313 pack_rcom_lock(r, lkb, rl);
314 rc->rc_id = (unsigned long) r;
315
316 send_rcom(ls, mh, rc);
317 out:
318 return error;
319}
320
321static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
322{
323 struct dlm_rcom *rc;
324 struct dlm_mhandle *mh;
325 int error, nodeid = rc_in->rc_header.h_nodeid;
326
327 dlm_recover_master_copy(ls, rc_in);
328
329 error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
330 sizeof(struct rcom_lock), &rc, &mh);
331 if (error)
332 return;
333
334 /* We send back the same rcom_lock struct we received, but
335 dlm_recover_master_copy() has filled in rl_remid and rl_result */
336
337 memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
338 rc->rc_id = rc_in->rc_id;
339
340 send_rcom(ls, mh, rc);
341}
342
343static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
344{
345 uint32_t status = dlm_recover_status(ls);
346
347 if (!(status & DLM_RS_DIR)) {
348 log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
349 rc_in->rc_header.h_nodeid);
350 return;
351 }
352
353 dlm_recover_process_copy(ls, rc_in);
354}
355
356static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
357{
358 struct dlm_rcom *rc;
359 struct dlm_mhandle *mh;
360 char *mb;
361 int mb_len = sizeof(struct dlm_rcom);
362
363 mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
364 if (!mh)
365 return -ENOBUFS;
366 memset(mb, 0, mb_len);
367
368 rc = (struct dlm_rcom *) mb;
369
370 rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
371 rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
372 rc->rc_header.h_nodeid = dlm_our_nodeid();
373 rc->rc_header.h_length = mb_len;
374 rc->rc_header.h_cmd = DLM_RCOM;
375
376 rc->rc_type = DLM_RCOM_STATUS_REPLY;
377 rc->rc_result = -ESRCH;
378
379 dlm_rcom_out(rc);
380 dlm_lowcomms_commit_buffer(mh);
381
382 return 0;
383}
384
385/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
386 recovery-only comms are sent through here. */
387
388void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
389{
390 struct dlm_rcom *rc = (struct dlm_rcom *) hd;
391 struct dlm_ls *ls;
392
393 dlm_rcom_in(rc);
394
395 /* If the lockspace doesn't exist then still send a status message
396 back; it's possible that it just doesn't have its global_id yet. */
397
398 ls = dlm_find_lockspace_global(hd->h_lockspace);
399 if (!ls) {
400 log_print("lockspace %x from %d not found",
401 hd->h_lockspace, nodeid);
402 send_ls_not_ready(nodeid, rc);
403 return;
404 }
405
406 if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
407 log_error(ls, "ignoring recovery message %x from %d",
408 rc->rc_type, nodeid);
409 goto out;
410 }
411
412 if (nodeid != rc->rc_header.h_nodeid) {
413 log_error(ls, "bad rcom nodeid %d from %d",
414 rc->rc_header.h_nodeid, nodeid);
415 goto out;
416 }
417
418 switch (rc->rc_type) {
419 case DLM_RCOM_STATUS:
420 receive_rcom_status(ls, rc);
421 break;
422
423 case DLM_RCOM_NAMES:
424 receive_rcom_names(ls, rc);
425 break;
426
427 case DLM_RCOM_LOOKUP:
428 receive_rcom_lookup(ls, rc);
429 break;
430
431 case DLM_RCOM_LOCK:
432 receive_rcom_lock(ls, rc);
433 break;
434
435 case DLM_RCOM_STATUS_REPLY:
436 receive_rcom_status_reply(ls, rc);
437 break;
438
439 case DLM_RCOM_NAMES_REPLY:
440 receive_rcom_names_reply(ls, rc);
441 break;
442
443 case DLM_RCOM_LOOKUP_REPLY:
444 receive_rcom_lookup_reply(ls, rc);
445 break;
446
447 case DLM_RCOM_LOCK_REPLY:
448 receive_rcom_lock_reply(ls, rc);
449 break;
450
451 default:
452 DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
453 }
454 out:
455 dlm_put_lockspace(ls);
456}
457
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__
16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
21void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
22
23#endif
24
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..b036ee7dcb32
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,762 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "dir.h"
17#include "config.h"
18#include "ast.h"
19#include "memory.h"
20#include "rcom.h"
21#include "lock.h"
22#include "lowcomms.h"
23#include "member.h"
24#include "recover.h"
25
26
27/*
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
32 */
33
34/*
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, it should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
42 */
43
44static void dlm_wait_timer_fn(unsigned long data)
45{
46 struct dlm_ls *ls = (struct dlm_ls *) data;
47 mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
48 wake_up(&ls->ls_wait_general);
49}
50
51int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
52{
53 int error = 0;
54
55 init_timer(&ls->ls_timer);
56 ls->ls_timer.function = dlm_wait_timer_fn;
57 ls->ls_timer.data = (long) ls;
58 ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
59 add_timer(&ls->ls_timer);
60
61 wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
62 del_timer_sync(&ls->ls_timer);
63
64 if (dlm_recovery_stopped(ls)) {
65 log_debug(ls, "dlm_wait_function aborted");
66 error = -EINTR;
67 }
68 return error;
69}
70
71/*
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
77 */
78
79uint32_t dlm_recover_status(struct dlm_ls *ls)
80{
81 uint32_t status;
82 spin_lock(&ls->ls_recover_lock);
83 status = ls->ls_recover_status;
84 spin_unlock(&ls->ls_recover_lock);
85 return status;
86}
87
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status;
92 spin_unlock(&ls->ls_recover_lock);
93}
94
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
96{
97 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
98 struct dlm_member *memb;
99 int error = 0, delay;
100
101 list_for_each_entry(memb, &ls->ls_nodes, list) {
102 delay = 0;
103 for (;;) {
104 if (dlm_recovery_stopped(ls)) {
105 error = -EINTR;
106 goto out;
107 }
108
109 error = dlm_rcom_status(ls, memb->nodeid);
110 if (error)
111 goto out;
112
113 if (rc->rc_result & wait_status)
114 break;
115 if (delay < 1000)
116 delay += 20;
117 msleep(delay);
118 }
119 }
120 out:
121 return error;
122}
123
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
125{
126 struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
128
129 for (;;) {
130 if (dlm_recovery_stopped(ls)) {
131 error = -EINTR;
132 goto out;
133 }
134
135 error = dlm_rcom_status(ls, nodeid);
136 if (error)
137 break;
138
139 if (rc->rc_result & wait_status)
140 break;
141 if (delay < 1000)
142 delay += 20;
143 msleep(delay);
144 }
145 out:
146 return error;
147}
148
149static int wait_status(struct dlm_ls *ls, uint32_t status)
150{
151 uint32_t status_all = status << 1;
152 int error;
153
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status);
156 if (!error)
157 dlm_set_recover_status(ls, status_all);
158 } else
159 error = wait_status_low(ls, status_all);
160
161 return error;
162}
163
164int dlm_recover_members_wait(struct dlm_ls *ls)
165{
166 return wait_status(ls, DLM_RS_NODES);
167}
168
169int dlm_recover_directory_wait(struct dlm_ls *ls)
170{
171 return wait_status(ls, DLM_RS_DIR);
172}
173
174int dlm_recover_locks_wait(struct dlm_ls *ls)
175{
176 return wait_status(ls, DLM_RS_LOCKS);
177}
178
179int dlm_recover_done_wait(struct dlm_ls *ls)
180{
181 return wait_status(ls, DLM_RS_DONE);
182}
183
184/*
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories, the
187 * rsb's are removed from the list. When the list is empty we're done.
188 *
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
191 *
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
194 */
195
196static int recover_list_empty(struct dlm_ls *ls)
197{
198 int empty;
199
200 spin_lock(&ls->ls_recover_list_lock);
201 empty = list_empty(&ls->ls_recover_list);
202 spin_unlock(&ls->ls_recover_list_lock);
203
204 return empty;
205}
206
207static void recover_list_add(struct dlm_rsb *r)
208{
209 struct dlm_ls *ls = r->res_ls;
210
211 spin_lock(&ls->ls_recover_list_lock);
212 if (list_empty(&r->res_recover_list)) {
213 list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
214 ls->ls_recover_list_count++;
215 dlm_hold_rsb(r);
216 }
217 spin_unlock(&ls->ls_recover_list_lock);
218}
219
220static void recover_list_del(struct dlm_rsb *r)
221{
222 struct dlm_ls *ls = r->res_ls;
223
224 spin_lock(&ls->ls_recover_list_lock);
225 list_del_init(&r->res_recover_list);
226 ls->ls_recover_list_count--;
227 spin_unlock(&ls->ls_recover_list_lock);
228
229 dlm_put_rsb(r);
230}
231
232static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
233{
234 struct dlm_rsb *r = NULL;
235
236 spin_lock(&ls->ls_recover_list_lock);
237
238 list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
239 if (id == (unsigned long) r)
240 goto out;
241 }
242 r = NULL;
243 out:
244 spin_unlock(&ls->ls_recover_list_lock);
245 return r;
246}
247
248static void recover_list_clear(struct dlm_ls *ls)
249{
250 struct dlm_rsb *r, *s;
251
252 spin_lock(&ls->ls_recover_list_lock);
253 list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
254 list_del_init(&r->res_recover_list);
255 dlm_put_rsb(r);
256 ls->ls_recover_list_count--;
257 }
258
259 if (ls->ls_recover_list_count != 0) {
260 log_error(ls, "warning: recover_list_count %d",
261 ls->ls_recover_list_count);
262 ls->ls_recover_list_count = 0;
263 }
264 spin_unlock(&ls->ls_recover_list_lock);
265}
266
267
268/* Master recovery: find new master node for rsb's that were
269 mastered on nodes that have been removed.
270
271 dlm_recover_masters
272 recover_master
273 dlm_send_rcom_lookup -> receive_rcom_lookup
274 dlm_dir_lookup
275 receive_rcom_lookup_reply <-
276 dlm_recover_master_reply
277 set_new_master
278 set_master_lkbs
279 set_lock_master
280*/
281
282/*
283 * Set the lock master for all LKBs in a lock queue
284 * If we are the new master of the rsb, we may have received new
285 * MSTCPY locks from other nodes already which we need to ignore
286 * when setting the new nodeid.
287 */
288
289static void set_lock_master(struct list_head *queue, int nodeid)
290{
291 struct dlm_lkb *lkb;
292
293 list_for_each_entry(lkb, queue, lkb_statequeue)
294 if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
295 lkb->lkb_nodeid = nodeid;
296}
297
298static void set_master_lkbs(struct dlm_rsb *r)
299{
300 set_lock_master(&r->res_grantqueue, r->res_nodeid);
301 set_lock_master(&r->res_convertqueue, r->res_nodeid);
302 set_lock_master(&r->res_waitqueue, r->res_nodeid);
303}
304
305/*
306 * Propagate the new master nodeid to locks.
307 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
308 * The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
309 */
310
311static void set_new_master(struct dlm_rsb *r, int nodeid)
312{
313 lock_rsb(r);
314 r->res_nodeid = nodeid;
315 set_master_lkbs(r);
316 rsb_set_flag(r, RSB_NEW_MASTER);
317 rsb_set_flag(r, RSB_NEW_MASTER2);
318 unlock_rsb(r);
319}
320
321/*
322 * We do async lookups on rsb's that need new masters. The rsb's
323 * waiting for a lookup reply are kept on the recover_list.
324 */
325
326static int recover_master(struct dlm_rsb *r)
327{
328 struct dlm_ls *ls = r->res_ls;
329 int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
330
331 dir_nodeid = dlm_dir_nodeid(r);
332
333 if (dir_nodeid == our_nodeid) {
334 error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
335 r->res_length, &ret_nodeid);
336 if (error)
337 log_error(ls, "recover dir lookup error %d", error);
338
339 if (ret_nodeid == our_nodeid)
340 ret_nodeid = 0;
341 set_new_master(r, ret_nodeid);
342 } else {
343 recover_list_add(r);
344 error = dlm_send_rcom_lookup(r, dir_nodeid);
345 }
346
347 return error;
348}
349
350/*
351 * When not using a directory, most resource names will hash to a new static
352 * master nodeid and the resource will need to be remastered.
353 */
354
355static int recover_master_static(struct dlm_rsb *r)
356{
357 int master = dlm_dir_nodeid(r);
358
359 if (master == dlm_our_nodeid())
360 master = 0;
361
362 if (r->res_nodeid != master) {
363 if (is_master(r))
364 dlm_purge_mstcpy_locks(r);
365 set_new_master(r, master);
366 return 1;
367 }
368 return 0;
369}
370
371/*
372 * Go through local root resources and for each rsb which has a master which
373 * has departed, get the new master nodeid from the directory. The dir will
374 * assign mastery to the first node to look up the new master. That means
375 * we'll discover in this lookup if we're the new master of any rsb's.
376 *
377 * We fire off all the dir lookup requests individually and asynchronously to
378 * the correct dir node.
379 */
380
381int dlm_recover_masters(struct dlm_ls *ls)
382{
383 struct dlm_rsb *r;
384 int error = 0, count = 0;
385
386 log_debug(ls, "dlm_recover_masters");
387
388 down_read(&ls->ls_root_sem);
389 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
390 if (dlm_recovery_stopped(ls)) {
391 up_read(&ls->ls_root_sem);
392 error = -EINTR;
393 goto out;
394 }
395
396 if (dlm_no_directory(ls))
397 count += recover_master_static(r);
398 else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
399 recover_master(r);
400 count++;
401 }
402
403 schedule();
404 }
405 up_read(&ls->ls_root_sem);
406
407 log_debug(ls, "dlm_recover_masters %d resources", count);
408
409 error = dlm_wait_function(ls, &recover_list_empty);
410 out:
411 if (error)
412 recover_list_clear(ls);
413 return error;
414}
415
416int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
417{
418 struct dlm_rsb *r;
419 int nodeid;
420
421 r = recover_list_find(ls, rc->rc_id);
422 if (!r) {
423 log_error(ls, "dlm_recover_master_reply no id %llx",
424 rc->rc_id);
425 goto out;
426 }
427
428 nodeid = rc->rc_result;
429 if (nodeid == dlm_our_nodeid())
430 nodeid = 0;
431
432 set_new_master(r, nodeid);
433 recover_list_del(r);
434
435 if (recover_list_empty(ls))
436 wake_up(&ls->ls_wait_general);
437 out:
438 return 0;
439}
440
441
442/* Lock recovery: rebuild the process-copy locks we hold on a
443 remastered rsb on the new rsb master.
444
445 dlm_recover_locks
446 recover_locks
447 recover_locks_queue
448 dlm_send_rcom_lock -> receive_rcom_lock
449 dlm_recover_master_copy
450 receive_rcom_lock_reply <-
451 dlm_recover_process_copy
452*/
453
454
455/*
456 * Keep a count of the number of lkb's we send to the new master; when we
457 * get an equal number of replies, recovery for the rsb is done.
458 */
459
460static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
461{
462 struct dlm_lkb *lkb;
463 int error = 0;
464
465 list_for_each_entry(lkb, head, lkb_statequeue) {
466 error = dlm_send_rcom_lock(r, lkb);
467 if (error)
468 break;
469 r->res_recover_locks_count++;
470 }
471
472 return error;
473}
474
475static int all_queues_empty(struct dlm_rsb *r)
476{
477 if (!list_empty(&r->res_grantqueue) ||
478 !list_empty(&r->res_convertqueue) ||
479 !list_empty(&r->res_waitqueue))
480 return 0;
481 return 1;
482}
483
484static int recover_locks(struct dlm_rsb *r)
485{
486 int error = 0;
487
488 lock_rsb(r);
489 if (all_queues_empty(r))
490 goto out;
491
492 DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););
493
494 error = recover_locks_queue(r, &r->res_grantqueue);
495 if (error)
496 goto out;
497 error = recover_locks_queue(r, &r->res_convertqueue);
498 if (error)
499 goto out;
500 error = recover_locks_queue(r, &r->res_waitqueue);
501 if (error)
502 goto out;
503
504 if (r->res_recover_locks_count)
505 recover_list_add(r);
506 else
507 rsb_clear_flag(r, RSB_NEW_MASTER);
508 out:
509 unlock_rsb(r);
510 return error;
511}
512
513int dlm_recover_locks(struct dlm_ls *ls)
514{
515 struct dlm_rsb *r;
516 int error, count = 0;
517
518 log_debug(ls, "dlm_recover_locks");
519
520 down_read(&ls->ls_root_sem);
521 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
522 if (is_master(r)) {
523 rsb_clear_flag(r, RSB_NEW_MASTER);
524 continue;
525 }
526
527 if (!rsb_flag(r, RSB_NEW_MASTER))
528 continue;
529
530 if (dlm_recovery_stopped(ls)) {
531 error = -EINTR;
532 up_read(&ls->ls_root_sem);
533 goto out;
534 }
535
536 error = recover_locks(r);
537 if (error) {
538 up_read(&ls->ls_root_sem);
539 goto out;
540 }
541
542 count += r->res_recover_locks_count;
543 }
544 up_read(&ls->ls_root_sem);
545
546 log_debug(ls, "dlm_recover_locks %d locks", count);
547
548 error = dlm_wait_function(ls, &recover_list_empty);
549 out:
550 if (error)
551 recover_list_clear(ls);
552 else
553 dlm_set_recover_status(ls, DLM_RS_LOCKS);
554 return error;
555}
556
557void dlm_recovered_lock(struct dlm_rsb *r)
558{
559 DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););
560
561 r->res_recover_locks_count--;
562 if (!r->res_recover_locks_count) {
563 rsb_clear_flag(r, RSB_NEW_MASTER);
564 recover_list_del(r);
565 }
566
567 if (recover_list_empty(r->res_ls))
568 wake_up(&r->res_ls->ls_wait_general);
569}
570
571/*
572 * The lvb needs to be recovered on all master rsb's. This includes setting
573 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
574 * based on the lvb's of the locks held on the rsb.
575 *
576 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
577 * was already set prior to recovery, it's not cleared, regardless of locks.
578 *
579 * The LVB contents are only considered for changing when this is a new master
580 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
581 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
582 * from the lkb with the largest lvb sequence number.
583 */
584
585static void recover_lvb(struct dlm_rsb *r)
586{
587 struct dlm_lkb *lkb, *high_lkb = NULL;
588 uint32_t high_seq = 0;
589 int lock_lvb_exists = 0;
590 int big_lock_exists = 0;
591 int lvblen = r->res_ls->ls_lvblen;
592
593 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
594 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
595 continue;
596
597 lock_lvb_exists = 1;
598
599 if (lkb->lkb_grmode > DLM_LOCK_CR) {
600 big_lock_exists = 1;
601 goto setflag;
602 }
603
604 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
605 high_lkb = lkb;
606 high_seq = lkb->lkb_lvbseq;
607 }
608 }
609
610 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
611 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
612 continue;
613
614 lock_lvb_exists = 1;
615
616 if (lkb->lkb_grmode > DLM_LOCK_CR) {
617 big_lock_exists = 1;
618 goto setflag;
619 }
620
621 if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
622 high_lkb = lkb;
623 high_seq = lkb->lkb_lvbseq;
624 }
625 }
626
627 setflag:
628 if (!lock_lvb_exists)
629 goto out;
630
631 if (!big_lock_exists)
632 rsb_set_flag(r, RSB_VALNOTVALID);
633
634 /* don't mess with the lvb unless we're the new master */
635 if (!rsb_flag(r, RSB_NEW_MASTER2))
636 goto out;
637
638 if (!r->res_lvbptr) {
639 r->res_lvbptr = allocate_lvb(r->res_ls);
640 if (!r->res_lvbptr)
641 goto out;
642 }
643
644 if (big_lock_exists) {
645 r->res_lvbseq = lkb->lkb_lvbseq;
646 memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
647 } else if (high_lkb) {
648 r->res_lvbseq = high_lkb->lkb_lvbseq;
649 memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
650 } else {
651 r->res_lvbseq = 0;
652 memset(r->res_lvbptr, 0, lvblen);
653 }
654 out:
655 return;
656}
657
658/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
659 converting PR->CW or CW->PR need to have their lkb_grmode set. */
660
661static void recover_conversion(struct dlm_rsb *r)
662{
663 struct dlm_lkb *lkb;
664 int grmode = -1;
665
666 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
667 if (lkb->lkb_grmode == DLM_LOCK_PR ||
668 lkb->lkb_grmode == DLM_LOCK_CW) {
669 grmode = lkb->lkb_grmode;
670 break;
671 }
672 }
673
674 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
675 if (lkb->lkb_grmode != DLM_LOCK_IV)
676 continue;
677 if (grmode == -1)
678 lkb->lkb_grmode = lkb->lkb_rqmode;
679 else
680 lkb->lkb_grmode = grmode;
681 }
682}
683
684void dlm_recover_rsbs(struct dlm_ls *ls)
685{
686 struct dlm_rsb *r;
687 int count = 0;
688
689 log_debug(ls, "dlm_recover_rsbs");
690
691 down_read(&ls->ls_root_sem);
692 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
693 lock_rsb(r);
694 if (is_master(r)) {
695 if (rsb_flag(r, RSB_RECOVER_CONVERT))
696 recover_conversion(r);
697 recover_lvb(r);
698 count++;
699 }
700 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
701 unlock_rsb(r);
702 }
703 up_read(&ls->ls_root_sem);
704
705 log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
706}
707
708/* Create a single list of all root rsb's to be used during recovery */
709
710int dlm_create_root_list(struct dlm_ls *ls)
711{
712 struct dlm_rsb *r;
713 int i, error = 0;
714
715 down_write(&ls->ls_root_sem);
716 if (!list_empty(&ls->ls_root_list)) {
717 log_error(ls, "root list not empty");
718 error = -EINVAL;
719 goto out;
720 }
721
722 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
723 read_lock(&ls->ls_rsbtbl[i].lock);
724 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
725 list_add(&r->res_root_list, &ls->ls_root_list);
726 dlm_hold_rsb(r);
727 }
728 read_unlock(&ls->ls_rsbtbl[i].lock);
729 }
730 out:
731 up_write(&ls->ls_root_sem);
732 return error;
733}
734
735void dlm_release_root_list(struct dlm_ls *ls)
736{
737 struct dlm_rsb *r, *safe;
738
739 down_write(&ls->ls_root_sem);
740 list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
741 list_del_init(&r->res_root_list);
742 dlm_put_rsb(r);
743 }
744 up_write(&ls->ls_root_sem);
745}
746
747void dlm_clear_toss_list(struct dlm_ls *ls)
748{
749 struct dlm_rsb *r, *safe;
750 int i;
751
752 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
753 write_lock(&ls->ls_rsbtbl[i].lock);
754 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
755 res_hashchain) {
756 list_del(&r->res_hashchain);
757 free_rsb(r);
758 }
759 write_unlock(&ls->ls_rsbtbl[i].lock);
760 }
761}
762
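
The status flags used by wait_status() come in pairs: once every node has set
stage flag X, the low nodeid publishes X_ALL, which is simply X shifted left by
one (status_all = status << 1). A minimal userspace sketch of that convention,
with hypothetical flag values (the real DLM_RS_* constants are defined in
dlm_internal.h and may differ):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stage flags following the "ALL = stage << 1" convention
 * used by wait_status(); the real DLM_RS_* values may differ. */
#define RS_NODES	0x00000001
#define RS_NODES_ALL	(RS_NODES << 1)
#define RS_DIR		0x00000004
#define RS_DIR_ALL	(RS_DIR << 1)

/* Status word of the low nodeid, as the other nodes would poll it. */
static uint32_t low_node_status;

/* wait_status_all() analog: the low nodeid saw stage X on every node,
 * so it publishes X_ALL (dlm_set_recover_status() in the real code). */
static void low_node_saw_all(uint32_t stage)
{
	low_node_status |= stage << 1;
}

/* wait_status_low() analog: another node is done once X_ALL appears. */
static int other_node_done(uint32_t stage)
{
	return (low_node_status & (stage << 1)) != 0;
}

int main(void)
{
	printf("nodes done? %d\n", other_node_done(RS_NODES));	/* 0 */
	low_node_saw_all(RS_NODES);
	printf("nodes done? %d\n", other_node_done(RS_NODES));	/* 1 */
	return 0;
}

The shift convention lets wait_status() derive the aggregate flag from the
per-stage flag with no lookup table.
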
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVER_DOT_H__
15#define __RECOVER_DOT_H__
16
17int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
18uint32_t dlm_recover_status(struct dlm_ls *ls);
19void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
20int dlm_recover_members_wait(struct dlm_ls *ls);
21int dlm_recover_directory_wait(struct dlm_ls *ls);
22int dlm_recover_locks_wait(struct dlm_ls *ls);
23int dlm_recover_done_wait(struct dlm_ls *ls);
24int dlm_recover_masters(struct dlm_ls *ls);
25int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
26int dlm_recover_locks(struct dlm_ls *ls);
27void dlm_recovered_lock(struct dlm_rsb *r);
28int dlm_create_root_list(struct dlm_ls *ls);
29void dlm_release_root_list(struct dlm_ls *ls);
30void dlm_clear_toss_list(struct dlm_ls *ls);
31void dlm_recover_rsbs(struct dlm_ls *ls);
32
33#endif /* __RECOVER_DOT_H__ */
34
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..70103533677d
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,285 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#include "dlm_internal.h"
15#include "lockspace.h"
16#include "member.h"
17#include "dir.h"
18#include "ast.h"
19#include "recover.h"
20#include "lowcomms.h"
21#include "lock.h"
22#include "requestqueue.h"
23#include "recoverd.h"
24
25
26/* If the start for which we're re-enabling locking (seq) has been superseded
27 by a newer stop (ls_recover_seq), we need to leave locking disabled. */
28
29static int enable_locking(struct dlm_ls *ls, uint64_t seq)
30{
31 int error = -EINTR;
32
33 spin_lock(&ls->ls_recover_lock);
34 if (ls->ls_recover_seq == seq) {
35 set_bit(LSFL_RUNNING, &ls->ls_flags);
36 up_write(&ls->ls_in_recovery);
37 error = 0;
38 }
39 spin_unlock(&ls->ls_recover_lock);
40 return error;
41}
42
43static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
44{
45 unsigned long start;
46 int error, neg = 0;
47
48 log_debug(ls, "recover %llx", rv->seq);
49
50 mutex_lock(&ls->ls_recoverd_active);
51
52 /*
53 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
54 * will be processed by dlm_astd during recovery.
55 */
56
57 dlm_astd_suspend();
58 dlm_astd_resume();
59
60 /*
61 * This list of root rsb's will be the basis of most of the recovery
62 * routines.
63 */
64
65 dlm_create_root_list(ls);
66
67 /*
68 * Free all the tossed rsb's so we don't have to recover them.
69 */
70
71 dlm_clear_toss_list(ls);
72
73 /*
74 * Add or remove nodes from the lockspace's ls_nodes list.
75 * Also waits for all nodes to complete dlm_recover_members.
76 */
77
78 error = dlm_recover_members(ls, rv, &neg);
79 if (error) {
80 log_error(ls, "recover_members failed %d", error);
81 goto fail;
82 }
83 start = jiffies;
84
85 /*
86 * Rebuild our own share of the directory by collecting from all other
87 * nodes their master rsb names that hash to us.
88 */
89
90 error = dlm_recover_directory(ls);
91 if (error) {
92 log_error(ls, "recover_directory failed %d", error);
93 goto fail;
94 }
95
96 /*
97 * Purge directory-related requests that are saved in requestqueue.
98 * All dir requests from before recovery are invalid now due to the dir
99 * rebuild and will be resent by the requesting nodes.
100 */
101
102 dlm_purge_requestqueue(ls);
103
104 /*
105 * Wait for all nodes to complete directory rebuild.
106 */
107
108 error = dlm_recover_directory_wait(ls);
109 if (error) {
110 log_error(ls, "recover_directory_wait failed %d", error);
111 goto fail;
112 }
113
114 /*
115 * We may have outstanding operations that are waiting for a reply from
116 * a failed node. Mark these to be resent after recovery. Unlock and
117 * cancel ops can just be completed.
118 */
119
120 dlm_recover_waiters_pre(ls);
121
122 error = dlm_recovery_stopped(ls);
123 if (error)
124 goto fail;
125
126 if (neg || dlm_no_directory(ls)) {
127 /*
128 * Clear lkb's for departed nodes.
129 */
130
131 dlm_purge_locks(ls);
132
133 /*
134 * Get new master nodeid's for rsb's that were mastered on
135 * departed nodes.
136 */
137
138 error = dlm_recover_masters(ls);
139 if (error) {
140 log_error(ls, "recover_masters failed %d", error);
141 goto fail;
142 }
143
144 /*
145 * Send our locks on remastered rsb's to the new masters.
146 */
147
148 error = dlm_recover_locks(ls);
149 if (error) {
150 log_error(ls, "recover_locks failed %d", error);
151 goto fail;
152 }
153
154 error = dlm_recover_locks_wait(ls);
155 if (error) {
156 log_error(ls, "recover_locks_wait failed %d", error);
157 goto fail;
158 }
159
160 /*
161 * Finalize state in master rsb's now that all locks can be
162 * checked. This includes conversion resolution and lvb
163 * settings.
164 */
165
166 dlm_recover_rsbs(ls);
167 }
168
169 dlm_release_root_list(ls);
170
171 dlm_set_recover_status(ls, DLM_RS_DONE);
172 error = dlm_recover_done_wait(ls);
173 if (error) {
174 log_error(ls, "recover_done_wait failed %d", error);
175 goto fail;
176 }
177
178 dlm_clear_members_gone(ls);
179
180 error = enable_locking(ls, rv->seq);
181 if (error) {
182 log_error(ls, "enable_locking failed %d", error);
183 goto fail;
184 }
185
186 error = dlm_process_requestqueue(ls);
187 if (error) {
188 log_error(ls, "process_requestqueue failed %d", error);
189 goto fail;
190 }
191
192 error = dlm_recover_waiters_post(ls);
193 if (error) {
194 log_error(ls, "recover_waiters_post failed %d", error);
195 goto fail;
196 }
197
198 dlm_grant_after_purge(ls);
199
200 dlm_astd_wake();
201
202 log_debug(ls, "recover %llx done: %u ms", rv->seq,
203 jiffies_to_msecs(jiffies - start));
204 mutex_unlock(&ls->ls_recoverd_active);
205
206 return 0;
207
208 fail:
209 dlm_release_root_list(ls);
210 log_debug(ls, "recover %llx error %d", rv->seq, error);
211 mutex_unlock(&ls->ls_recoverd_active);
212 return error;
213}
214
215static void do_ls_recovery(struct dlm_ls *ls)
216{
217 struct dlm_recover *rv = NULL;
218
219 spin_lock(&ls->ls_recover_lock);
220 rv = ls->ls_recover_args;
221 ls->ls_recover_args = NULL;
222 clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
223 spin_unlock(&ls->ls_recover_lock);
224
225 if (rv) {
226 ls_recover(ls, rv);
227 kfree(rv->nodeids);
228 kfree(rv);
229 }
230}
231
232static int dlm_recoverd(void *arg)
233{
234 struct dlm_ls *ls;
235
236 ls = dlm_find_lockspace_local(arg);
237
238 while (!kthread_should_stop()) {
239 set_current_state(TASK_INTERRUPTIBLE);
240 if (!test_bit(LSFL_WORK, &ls->ls_flags))
241 schedule();
242 set_current_state(TASK_RUNNING);
243
244 if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
245 do_ls_recovery(ls);
246 }
247
248 dlm_put_lockspace(ls);
249 return 0;
250}
251
252void dlm_recoverd_kick(struct dlm_ls *ls)
253{
254 set_bit(LSFL_WORK, &ls->ls_flags);
255 wake_up_process(ls->ls_recoverd_task);
256}
257
258int dlm_recoverd_start(struct dlm_ls *ls)
259{
260 struct task_struct *p;
261 int error = 0;
262
263 p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
264 if (IS_ERR(p))
265 error = PTR_ERR(p);
266 else
267 ls->ls_recoverd_task = p;
268 return error;
269}
270
271void dlm_recoverd_stop(struct dlm_ls *ls)
272{
273 kthread_stop(ls->ls_recoverd_task);
274}
275
276void dlm_recoverd_suspend(struct dlm_ls *ls)
277{
278 mutex_lock(&ls->ls_recoverd_active);
279}
280
281void dlm_recoverd_resume(struct dlm_ls *ls)
282{
283 mutex_unlock(&ls->ls_recoverd_active);
284}
285
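
dlm_recoverd() uses the usual kernel worker-thread handshake:
dlm_recoverd_kick() sets a work bit and wakes the thread, and the thread
re-checks the bit after marking itself TASK_INTERRUPTIBLE, so a kick that
races with the sleep is never lost. A rough userspace analog of the same
handshake built on pthreads instead of set_current_state()/wake_up_process()
(all names below are hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool work, stop;

static void *recoverd(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop) {
		/* like dlm_recoverd: sleep only if no work is pending */
		while (!work && !stop)
			pthread_cond_wait(&cond, &lock);
		if (work) {
			work = false;
			pthread_mutex_unlock(&lock);
			puts("do_ls_recovery()");	/* placeholder */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void kick(void)		/* dlm_recoverd_kick() analog */
{
	pthread_mutex_lock(&lock);
	work = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, recoverd, NULL);
	kick();
	pthread_mutex_lock(&lock);
	stop = true;			/* dlm_recoverd_stop() analog */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}
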
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __RECOVERD_DOT_H__
15#define __RECOVERD_DOT_H__
16
17void dlm_recoverd_kick(struct dlm_ls *ls);
18void dlm_recoverd_stop(struct dlm_ls *ls);
19int dlm_recoverd_start(struct dlm_ls *ls);
20void dlm_recoverd_suspend(struct dlm_ls *ls);
21void dlm_recoverd_resume(struct dlm_ls *ls);
22
23#endif /* __RECOVERD_DOT_H__ */
24
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "member.h"
15#include "lock.h"
16#include "dir.h"
17#include "config.h"
18#include "requestqueue.h"
19
20struct rq_entry {
21 struct list_head list;
22 int nodeid;
23 char request[1];
24};
25
26/*
27 * Requests received while the lockspace is in recovery get added to the
28 * request queue and processed when recovery is complete. This happens when
29 * the lockspace is suspended on some nodes before it is on others, or the
30 * lockspace is enabled on some while still suspended on others.
31 */
32
33void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
34{
35 struct rq_entry *e;
36 int length = hd->h_length;
37
38 if (dlm_is_removed(ls, nodeid))
39 return;
40
41 e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
42 if (!e) {
43 log_print("dlm_add_requestqueue: out of memory\n");
44 return;
45 }
46
47 e->nodeid = nodeid;
48 memcpy(e->request, hd, length);
49
50 mutex_lock(&ls->ls_requestqueue_mutex);
51 list_add_tail(&e->list, &ls->ls_requestqueue);
52 mutex_unlock(&ls->ls_requestqueue_mutex);
53}
54
55int dlm_process_requestqueue(struct dlm_ls *ls)
56{
57 struct rq_entry *e;
58 struct dlm_header *hd;
59 int error = 0;
60
61 mutex_lock(&ls->ls_requestqueue_mutex);
62
63 for (;;) {
64 if (list_empty(&ls->ls_requestqueue)) {
65 mutex_unlock(&ls->ls_requestqueue_mutex);
66 error = 0;
67 break;
68 }
69 e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
70 mutex_unlock(&ls->ls_requestqueue_mutex);
71
72 hd = (struct dlm_header *) e->request;
73 error = dlm_receive_message(hd, e->nodeid, 1);
74
75 if (error == -EINTR) {
76 /* entry is left on requestqueue */
77 log_debug(ls, "process_requestqueue abort eintr");
78 break;
79 }
80
81 mutex_lock(&ls->ls_requestqueue_mutex);
82 list_del(&e->list);
83 kfree(e);
84
85 if (dlm_locking_stopped(ls)) {
86 log_debug(ls, "process_requestqueue abort running");
87 mutex_unlock(&ls->ls_requestqueue_mutex);
88 error = -EINTR;
89 break;
90 }
91 schedule();
92 }
93
94 return error;
95}
96
97/*
98 * After recovery is done, locking is resumed and dlm_recoverd takes all the
99 * saved requests and processes them as dlm_recvd would have. At
100 * the same time, dlm_recvd will start receiving new requests from remote
101 * nodes. We want to delay dlm_recvd processing new requests until
102 * dlm_recoverd has finished processing the old saved requests.
103 */
104
105void dlm_wait_requestqueue(struct dlm_ls *ls)
106{
107 for (;;) {
108 mutex_lock(&ls->ls_requestqueue_mutex);
109 if (list_empty(&ls->ls_requestqueue))
110 break;
111 if (dlm_locking_stopped(ls))
112 break;
113 mutex_unlock(&ls->ls_requestqueue_mutex);
114 schedule();
115 }
116 mutex_unlock(&ls->ls_requestqueue_mutex);
117}
118
119static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
120{
121 uint32_t type = ms->m_type;
122
123 if (dlm_is_removed(ls, nodeid))
124 return 1;
125
126 /* directory operations are always purged because the directory is
127 always rebuilt during recovery and the lookups resent */
128
129 if (type == DLM_MSG_REMOVE ||
130 type == DLM_MSG_LOOKUP ||
131 type == DLM_MSG_LOOKUP_REPLY)
132 return 1;
133
134 if (!dlm_no_directory(ls))
135 return 0;
136
137 /* with no directory, the master is likely to change as a part of
138 recovery; requests to/from the defunct master need to be purged */
139
140 switch (type) {
141 case DLM_MSG_REQUEST:
142 case DLM_MSG_CONVERT:
143 case DLM_MSG_UNLOCK:
144 case DLM_MSG_CANCEL:
145 /* we're no longer the master of this resource, the sender
146 will resend to the new master (see waiter_needs_recovery) */
147
148 if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
149 return 1;
150 break;
151
152 case DLM_MSG_REQUEST_REPLY:
153 case DLM_MSG_CONVERT_REPLY:
154 case DLM_MSG_UNLOCK_REPLY:
155 case DLM_MSG_CANCEL_REPLY:
156 case DLM_MSG_GRANT:
157 /* this reply is from the former master of the resource,
158 we'll resend to the new master if needed */
159
160 if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
161 return 1;
162 break;
163 }
164
165 return 0;
166}
167
168void dlm_purge_requestqueue(struct dlm_ls *ls)
169{
170 struct dlm_message *ms;
171 struct rq_entry *e, *safe;
172
173 mutex_lock(&ls->ls_requestqueue_mutex);
174 list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
175 ms = (struct dlm_message *) e->request;
176
177 if (purge_request(ls, ms, e->nodeid)) {
178 list_del(&e->list);
179 kfree(e);
180 }
181 }
182 mutex_unlock(&ls->ls_requestqueue_mutex);
183}
184
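
struct rq_entry above stores each saved message inline through the trailing
char request[1] member, the pre-C99 flexible-array idiom: a single allocation
covers both the list bookkeeping and a copy of the message, sized as
sizeof(struct rq_entry) + length. A small userspace sketch of the same pattern
(names and message are illustrative; modern C would declare char request[]):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	struct entry *next;
	int nodeid;
	char request[1];	/* message bytes stored inline, in place */
};

static struct entry *save_request(int nodeid, const void *msg, size_t len)
{
	/* One allocation for bookkeeping plus payload, mirroring
	 * kmalloc(sizeof(struct rq_entry) + length) above (both
	 * over-allocate by one byte because of request[1]). */
	struct entry *e = malloc(sizeof(*e) + len);

	if (!e)
		return NULL;
	e->next = NULL;
	e->nodeid = nodeid;
	memcpy(e->request, msg, len);
	return e;
}

int main(void)
{
	const char msg[] = "lock request";  /* stand-in for a dlm_header */
	struct entry *e = save_request(7, msg, sizeof(msg));

	if (e) {
		printf("saved from nodeid %d: %s\n", e->nodeid, e->request);
		free(e);
	}
	return 0;
}
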
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __REQUESTQUEUE_DOT_H__
14#define __REQUESTQUEUE_DOT_H__
15
16void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
17int dlm_process_requestqueue(struct dlm_ls *ls);
18void dlm_wait_requestqueue(struct dlm_ls *ls);
19void dlm_purge_requestqueue(struct dlm_ls *ls);
20
21#endif
22
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#include "dlm_internal.h"
14#include "rcom.h"
15#include "util.h"
16
17static void header_out(struct dlm_header *hd)
18{
19 hd->h_version = cpu_to_le32(hd->h_version);
20 hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
21 hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
22 hd->h_length = cpu_to_le16(hd->h_length);
23}
24
25static void header_in(struct dlm_header *hd)
26{
27 hd->h_version = le32_to_cpu(hd->h_version);
28 hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
29 hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
30 hd->h_length = le16_to_cpu(hd->h_length);
31}
32
33void dlm_message_out(struct dlm_message *ms)
34{
35 struct dlm_header *hd = (struct dlm_header *) ms;
36
37 header_out(hd);
38
39 ms->m_type = cpu_to_le32(ms->m_type);
40 ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
41 ms->m_pid = cpu_to_le32(ms->m_pid);
42 ms->m_lkid = cpu_to_le32(ms->m_lkid);
43 ms->m_remid = cpu_to_le32(ms->m_remid);
44 ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
45 ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
46 ms->m_exflags = cpu_to_le32(ms->m_exflags);
47 ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
48 ms->m_flags = cpu_to_le32(ms->m_flags);
49 ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
50 ms->m_hash = cpu_to_le32(ms->m_hash);
51 ms->m_status = cpu_to_le32(ms->m_status);
52 ms->m_grmode = cpu_to_le32(ms->m_grmode);
53 ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
54 ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
55 ms->m_asts = cpu_to_le32(ms->m_asts);
56 ms->m_result = cpu_to_le32(ms->m_result);
57}
58
59void dlm_message_in(struct dlm_message *ms)
60{
61 struct dlm_header *hd = (struct dlm_header *) ms;
62
63 header_in(hd);
64
65 ms->m_type = le32_to_cpu(ms->m_type);
66 ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
67 ms->m_pid = le32_to_cpu(ms->m_pid);
68 ms->m_lkid = le32_to_cpu(ms->m_lkid);
69 ms->m_remid = le32_to_cpu(ms->m_remid);
70 ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
71 ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
72 ms->m_exflags = le32_to_cpu(ms->m_exflags);
73 ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
74 ms->m_flags = le32_to_cpu(ms->m_flags);
75 ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
76 ms->m_hash = le32_to_cpu(ms->m_hash);
77 ms->m_status = le32_to_cpu(ms->m_status);
78 ms->m_grmode = le32_to_cpu(ms->m_grmode);
79 ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
80 ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
81 ms->m_asts = le32_to_cpu(ms->m_asts);
82 ms->m_result = le32_to_cpu(ms->m_result);
83}
84
85static void rcom_lock_out(struct rcom_lock *rl)
86{
87 rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
88 rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
89 rl->rl_remid = cpu_to_le32(rl->rl_remid);
90 rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
91 rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
92 rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
93 rl->rl_flags = cpu_to_le32(rl->rl_flags);
94 rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
95 rl->rl_result = cpu_to_le32(rl->rl_result);
96 rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
97 rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
98}
99
100static void rcom_lock_in(struct rcom_lock *rl)
101{
102 rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
103 rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
104 rl->rl_remid = le32_to_cpu(rl->rl_remid);
105 rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
106 rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
107 rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
108 rl->rl_flags = le32_to_cpu(rl->rl_flags);
109 rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
110 rl->rl_result = le32_to_cpu(rl->rl_result);
111 rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
112 rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
113}
114
115static void rcom_config_out(struct rcom_config *rf)
116{
117 rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
118 rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
119}
120
121static void rcom_config_in(struct rcom_config *rf)
122{
123 rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
124 rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
125}
126
127void dlm_rcom_out(struct dlm_rcom *rc)
128{
129 struct dlm_header *hd = (struct dlm_header *) rc;
130 int type = rc->rc_type;
131
132 header_out(hd);
133
134 rc->rc_type = cpu_to_le32(rc->rc_type);
135 rc->rc_result = cpu_to_le32(rc->rc_result);
136 rc->rc_id = cpu_to_le64(rc->rc_id);
137
138 if (type == DLM_RCOM_LOCK)
139 rcom_lock_out((struct rcom_lock *) rc->rc_buf);
140
141 else if (type == DLM_RCOM_STATUS_REPLY)
142 rcom_config_out((struct rcom_config *) rc->rc_buf);
143}
144
145void dlm_rcom_in(struct dlm_rcom *rc)
146{
147 struct dlm_header *hd = (struct dlm_header *) rc;
148
149 header_in(hd);
150
151 rc->rc_type = le32_to_cpu(rc->rc_type);
152 rc->rc_result = le32_to_cpu(rc->rc_result);
153 rc->rc_id = le64_to_cpu(rc->rc_id);
154
155 if (rc->rc_type == DLM_RCOM_LOCK)
156 rcom_lock_in((struct rcom_lock *) rc->rc_buf);
157
158 else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
159 rcom_config_in((struct rcom_config *) rc->rc_buf);
160}
161
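
dlm_message_out()/dlm_message_in() convert every multi-byte field between CPU
and little-endian wire order in place, so the same struct layout serves both
as the wire format and the in-memory form. A userspace sketch of the idiom,
with glibc's htole32()/le32toh() from <endian.h> standing in for the kernel's
cpu_to_le32()/le32_to_cpu(); the struct below is illustrative, not the real
struct dlm_header:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct wire_header {			/* illustrative, not dlm_header */
	uint32_t h_version;
	uint32_t h_nodeid;
	uint16_t h_length;
};

static void header_out(struct wire_header *hd)	/* CPU -> wire order */
{
	hd->h_version = htole32(hd->h_version);
	hd->h_nodeid = htole32(hd->h_nodeid);
	hd->h_length = htole16(hd->h_length);
}

static void header_in(struct wire_header *hd)	/* wire -> CPU order */
{
	hd->h_version = le32toh(hd->h_version);
	hd->h_nodeid = le32toh(hd->h_nodeid);
	hd->h_length = le16toh(hd->h_length);
}

int main(void)
{
	struct wire_header hd = { 0x010203, 4, 24 };

	header_out(&hd);	/* buffer is now safe to put on the wire */
	header_in(&hd);		/* receiver runs the inverse in place */
	printf("version %x nodeid %u len %u\n",
	       (unsigned)hd.h_version, (unsigned)hd.h_nodeid,
	       (unsigned)hd.h_length);
	return 0;
}

On little-endian machines all of these calls are no-ops, which is why the
conversion can safely run in place in both directions.
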
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
5**
6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions
8** of the GNU General Public License v.2.
9**
10*******************************************************************************
11******************************************************************************/
12
13#ifndef __UTIL_DOT_H__
14#define __UTIL_DOT_H__
15
16void dlm_message_out(struct dlm_message *ms);
17void dlm_message_in(struct dlm_message *ms);
18void dlm_rcom_out(struct dlm_rcom *rc);
19void dlm_rcom_in(struct dlm_rcom *rc);
20
21#endif
22
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..17cb44bea1c0
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,46 @@
1config GFS2_FS
2 tristate "GFS2 file system support"
3 default m
4 depends on EXPERIMENTAL
5 select FS_POSIX_ACL
6 select SYSFS
7 help
8 A cluster filesystem.
9
10 Allows a cluster of computers to simultaneously use a block device
11 that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads
12 and writes to the block device like a local filesystem, but also uses
13 a lock module to allow the computers to coordinate their I/O so
14 filesystem consistency is maintained. One of the nifty features of
15 GFS2 is perfect consistency -- changes made to the filesystem on one
16 machine show up immediately on all other machines in the cluster.
17
18 To use the GFS2 filesystem, you will need to enable one or more of
19 the locking modules below. Documentation and utilities for GFS2 can
20 be found here: http://sources.redhat.com/cluster/gfs/
21
22config GFS2_FS_LOCKING_NOLOCK
23 tristate "GFS2 \"nolock\" locking module"
24 depends on GFS2_FS
25 help
26 Single node locking module for GFS2.
27
28 Use this module if you want to use GFS2 on a single node without
29 its clustering features. You can still take advantage of the
30 large file support, and upgrade to running a full cluster later on
31 if required.
32
33 If you will only be using GFS2 in cluster mode, you do not need this
34 module.
35
36config GFS2_FS_LOCKING_DLM
37 tristate "GFS2 DLM locking module"
38 depends on GFS2_FS
39 select DLM
40 help
41 Multiple node locking module for GFS2.
42
43 Most users of GFS2 will require this module. It provides the locking
44 interface between GFS2 and the DLM, which is required to use GFS2
45 in a cluster environment.
46
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..88f927948113
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,42 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := \
3 acl.o \
4 bits.o \
5 bmap.o \
6 daemon.o \
7 dir.o \
8 eaops.o \
9 eattr.o \
10 glock.o \
11 glops.o \
12 inode.o \
13 lm.o \
14 log.o \
15 lops.o \
16 locking.o \
17 lvb.o \
18 main.o \
19 meta_io.o \
20 mount.o \
21 ondisk.o \
22 ops_address.o \
23 ops_dentry.o \
24 ops_export.o \
25 ops_file.o \
26 ops_fstype.o \
27 ops_inode.o \
28 ops_super.o \
29 ops_vm.o \
30 page.o \
31 quota.o \
32 recovery.o \
33 rgrp.o \
34 super.o \
35 sys.o \
36 trans.o \
37 unlinked.o \
38 util.o
39
40obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
41obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
42
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..e9d05fe94357
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,316 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
17#include <asm/semaphore.h>
18#include <linux/gfs2_ondisk.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "acl.h"
24#include "eaops.h"
25#include "eattr.h"
26#include "glock.h"
27#include "inode.h"
28#include "meta_io.h"
29#include "trans.h"
30#include "util.h"
31
32#define ACL_ACCESS 1
33#define ACL_DEFAULT 0
34
35int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
36 struct gfs2_ea_request *er,
37 int *remove, mode_t *mode)
38{
39 struct posix_acl *acl;
40 int error;
41
42 error = gfs2_acl_validate_remove(ip, access);
43 if (error)
44 return error;
45
46 if (!er->er_data)
47 return -EINVAL;
48
49 acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
50 if (IS_ERR(acl))
51 return PTR_ERR(acl);
52 if (!acl) {
53 *remove = 1;
54 return 0;
55 }
56
57 error = posix_acl_valid(acl);
58 if (error)
59 goto out;
60
61 if (access) {
62 error = posix_acl_equiv_mode(acl, mode);
63 if (!error)
64 *remove = 1;
65 else if (error > 0)
66 error = 0;
67 }
68
69 out:
70 posix_acl_release(acl);
71
72 return error;
73}
74
75int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
76{
77 if (!ip->i_sbd->sd_args.ar_posix_acl)
78 return -EOPNOTSUPP;
79 if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
80 return -EPERM;
81 if (S_ISLNK(ip->i_di.di_mode))
82 return -EOPNOTSUPP;
83 if (!access && !S_ISDIR(ip->i_di.di_mode))
84 return -EACCES;
85
86 return 0;
87}
88
89static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
90 struct gfs2_ea_location *el, char **data, unsigned int *len)
91{
92 struct gfs2_ea_request er;
93 struct gfs2_ea_location el_this;
94 int error;
95
96 if (!ip->i_di.di_eattr)
97 return 0;
98
99 memset(&er, 0, sizeof(struct gfs2_ea_request));
100 if (access) {
101 er.er_name = GFS2_POSIX_ACL_ACCESS;
102 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
103 } else {
104 er.er_name = GFS2_POSIX_ACL_DEFAULT;
105 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
106 }
107 er.er_type = GFS2_EATYPE_SYS;
108
109 if (!el)
110 el = &el_this;
111
112 error = gfs2_ea_find(ip, &er, el);
113 if (error)
114 return error;
115 if (!el->el_ea)
116 return 0;
117 if (!GFS2_EA_DATA_LEN(el->el_ea))
118 goto out;
119
120 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
121 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
122 error = -ENOMEM;
123 if (!er.er_data)
124 goto out;
125
126 error = gfs2_ea_get_copy(ip, el, er.er_data);
127 if (error)
128 goto out_kfree;
129
130 if (acl) {
131 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
132 if (IS_ERR(*acl))
133 error = PTR_ERR(*acl);
134 }
135
136 out_kfree:
137 if (error || !data)
138 kfree(er.er_data);
139 else {
140 *data = er.er_data;
141 *len = er.er_data_len;
142 }
143
144 out:
145 if (error || el == &el_this)
146 brelse(el->el_bh);
147
148 return error;
149}
150
151/**
152 * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
153 * @inode: the file we want to do something to
154 * @mask: what we want to do
155 *
156 * Returns: errno
157 */
158
159int gfs2_check_acl_locked(struct inode *inode, int mask)
160{
161 struct posix_acl *acl = NULL;
162 int error;
163
164 error = acl_get(inode->u.generic_ip, ACL_ACCESS, &acl, NULL, NULL, NULL);
165 if (error)
166 return error;
167
168 if (acl) {
169 error = posix_acl_permission(inode, acl, mask);
170 posix_acl_release(acl);
171 return error;
172 }
173
174 return -EAGAIN;
175}
176
177int gfs2_check_acl(struct inode *inode, int mask)
178{
179 struct gfs2_inode *ip = inode->u.generic_ip;
180 struct gfs2_holder i_gh;
181 int error;
182
183 error = gfs2_glock_nq_init(ip->i_gl,
184 LM_ST_SHARED, LM_FLAG_ANY,
185 &i_gh);
186 if (!error) {
187 error = gfs2_check_acl_locked(inode, mask);
188 gfs2_glock_dq_uninit(&i_gh);
189 }
190
191 return error;
192}
193
194static int munge_mode(struct gfs2_inode *ip, mode_t mode)
195{
196 struct gfs2_sbd *sdp = ip->i_sbd;
197 struct buffer_head *dibh;
198 int error;
199
200 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
201 if (error)
202 return error;
203
204 error = gfs2_meta_inode_buffer(ip, &dibh);
205 if (!error) {
206 gfs2_assert_withdraw(sdp,
207 (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
208 ip->i_di.di_mode = mode;
209 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
210 gfs2_dinode_out(&ip->i_di, dibh->b_data);
211 brelse(dibh);
212 }
213
214 gfs2_trans_end(sdp);
215
216 return 0;
217}
218
219int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
220{
221 struct gfs2_sbd *sdp = dip->i_sbd;
222 struct posix_acl *acl = NULL, *clone;
223 struct gfs2_ea_request er;
224 mode_t mode = ip->i_di.di_mode;
225 int error;
226
227 if (!sdp->sd_args.ar_posix_acl)
228 return 0;
229 if (S_ISLNK(ip->i_di.di_mode))
230 return 0;
231
232 memset(&er, 0, sizeof(struct gfs2_ea_request));
233 er.er_type = GFS2_EATYPE_SYS;
234
235 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
236 &er.er_data, &er.er_data_len);
237 if (error)
238 return error;
239 if (!acl) {
240 mode &= ~current->fs->umask;
241 if (mode != ip->i_di.di_mode)
242 error = munge_mode(ip, mode);
243 return error;
244 }
245
246 clone = posix_acl_clone(acl, GFP_KERNEL);
247 error = -ENOMEM;
248 if (!clone)
249 goto out;
250 posix_acl_release(acl);
251 acl = clone;
252
253 if (S_ISDIR(ip->i_di.di_mode)) {
254 er.er_name = GFS2_POSIX_ACL_DEFAULT;
255 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
256 error = gfs2_system_eaops.eo_set(ip, &er);
257 if (error)
258 goto out;
259 }
260
261 error = posix_acl_create_masq(acl, &mode);
262 if (error < 0)
263 goto out;
264 if (error > 0) {
265 er.er_name = GFS2_POSIX_ACL_ACCESS;
266 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
267 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
268 er.er_mode = mode;
269 er.er_flags = GFS2_ERF_MODE;
270 error = gfs2_system_eaops.eo_set(ip, &er);
271 if (error)
272 goto out;
273 } else
274 munge_mode(ip, mode);
275
276 out:
277 posix_acl_release(acl);
278 kfree(er.er_data);
279 return error;
280}
281
282int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
283{
284 struct posix_acl *acl = NULL, *clone;
285 struct gfs2_ea_location el;
286 char *data;
287 unsigned int len;
288 int error;
289
290 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
291 if (error)
292 return error;
293 if (!acl)
294 return gfs2_setattr_simple(ip, attr);
295
296 clone = posix_acl_clone(acl, GFP_KERNEL);
297 error = -ENOMEM;
298 if (!clone)
299 goto out;
300 posix_acl_release(acl);
301 acl = clone;
302
303 error = posix_acl_chmod_masq(acl, attr->ia_mode);
304 if (!error) {
305 posix_acl_to_xattr(acl, data, len);
306 error = gfs2_ea_acl_chmod(ip, &el, attr, data);
307 }
308
309 out:
310 posix_acl_release(acl);
311 brelse(el.el_bh);
312 kfree(data);
313
314 return error;
315}
316
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..a174b4f6bcc2
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,37 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __ACL_DOT_H__
11#define __ACL_DOT_H__
12
13#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
14#define GFS2_POSIX_ACL_ACCESS_LEN 16
15#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
16#define GFS2_POSIX_ACL_DEFAULT_LEN 17
17
18#define GFS2_ACL_IS_ACCESS(name, len) \
19 ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
20 !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
21
22#define GFS2_ACL_IS_DEFAULT(name, len) \
23 ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
24 !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
25
26struct gfs2_ea_request;
27
28int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
29 struct gfs2_ea_request *er,
30 int *remove, mode_t *mode);
31int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
32int gfs2_check_acl_locked(struct inode *inode, int mask);
33int gfs2_check_acl(struct inode *inode, int mask);
34int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
35int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
36
37#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bits.c b/fs/gfs2/bits.c
new file mode 100644
index 000000000000..49585e3de095
--- /dev/null
+++ b/fs/gfs2/bits.c
@@ -0,0 +1,182 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * These routines are used by the resource group routines (rgrp.c)
12 * to keep track of block allocation. Each block is represented by two
13 * bits. One bit indicates whether or not the block is used. (1=used,
14 * 0=free) The other bit indicates whether or not the block contains a
15 * dinode or not. (1=dinode, 0=not-dinode) So, each byte represents
16 * GFS2_NBBY (i.e. 4) blocks.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/gfs2_ondisk.h>
25#include <asm/semaphore.h>
26
27#include "gfs2.h"
28#include "lm_interface.h"
29#include "incore.h"
30#include "bits.h"
31#include "util.h"
32
33static const char valid_change[16] = {
34 /* current */
35 /* n */ 0, 1, 0, 1,
36 /* e */ 1, 0, 0, 0,
37 /* w */ 0, 0, 0, 0,
38 1, 0, 0, 0
39};
40
41/**
42 * gfs2_setbit - Set a bit in the bitmaps
43 * @buffer: the buffer that holds the bitmaps
44 * @buflen: the length (in bytes) of the buffer
45 * @block: the block to set
46 * @new_state: the new state of the block
47 *
48 */
49
50void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
51 unsigned int buflen, uint32_t block, unsigned char new_state)
52{
53 unsigned char *byte, *end, cur_state;
54 unsigned int bit;
55
56 byte = buffer + (block / GFS2_NBBY);
57 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
58 end = buffer + buflen;
59
60 gfs2_assert(rgd->rd_sbd, byte < end);
61
62 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
63
64 if (valid_change[new_state * 4 + cur_state]) {
65 *byte ^= cur_state << bit;
66 *byte |= new_state << bit;
67 } else
68 gfs2_consist_rgrpd(rgd);
69}
70
71/**
72 * gfs2_testbit - test a bit in the bitmaps
73 * @buffer: the buffer that holds the bitmaps
74 * @buflen: the length (in bytes) of the buffer
75 * @block: the block to read
76 *
77 */
78
79unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
80 unsigned int buflen, uint32_t block)
81{
82 unsigned char *byte, *end, cur_state;
83 unsigned int bit;
84
85 byte = buffer + (block / GFS2_NBBY);
86 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
87 end = buffer + buflen;
88
89 gfs2_assert(rgd->rd_sbd, byte < end);
90
91 cur_state = (*byte >> bit) & GFS2_BIT_MASK;
92
93 return cur_state;
94}
95
96/**
97 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
98 * a block in a given allocation state.
99 * @buffer: the buffer that holds the bitmaps
100 * @buflen: the length (in bytes) of the buffer
101 * @goal: start search at this block's bit-pair (within @buffer)
102 * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
103 * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
104 *
105 * Scope of @goal and returned block number is only within this bitmap buffer,
106 * not entire rgrp or filesystem. @buffer will be offset from the actual
107 * beginning of a bitmap block buffer, skipping any header structures.
108 *
109 * Return: the block number (bitmap buffer scope) that was found
110 */
111
112uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
113 unsigned int buflen, uint32_t goal,
114 unsigned char old_state)
115{
116 unsigned char *byte, *end, alloc;
117 uint32_t blk = goal;
118 unsigned int bit;
119
120 byte = buffer + (goal / GFS2_NBBY);
121 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
122 end = buffer + buflen;
123 alloc = (old_state & 1) ? 0 : 0x55;
124
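	/* 0x55 masks the low (allocated) bit of all four two-bit fields in
	   a byte, so any byte whose masked value equals @alloc cannot hold
	   a match and is skipped whole in the loop below. */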
125 while (byte < end) {
126 if ((*byte & 0x55) == alloc) {
127 blk += (8 - bit) >> 1;
128
129 bit = 0;
130 byte++;
131
132 continue;
133 }
134
135 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
136 return blk;
137
138 bit += GFS2_BIT_SIZE;
139 if (bit >= 8) {
140 bit = 0;
141 byte++;
142 }
143
144 blk++;
145 }
146
147 return BFITNOENT;
148}
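
A hedged sketch of driving gfs2_bitfit() from a caller (rgrp.c is the real
consumer; GFS2_BLKST_FREE is assumed to be the on-disk "free" state):

	uint32_t blk = gfs2_bitfit(rgd, buffer, buflen, goal, GFS2_BLKST_FREE);

	if (blk == BFITNOENT) {
		/* no free block at or after @goal in this bitmap buffer;
		   wrap to the start or move on to the next bitmap block */
	}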
149
150/**
151 * gfs2_bitcount - count the number of bits in a certain state
152 * @buffer: the buffer that holds the bitmaps
153 * @buflen: the length (in bytes) of the buffer
154 * @state: the state of the block we're looking for
155 *
156 * Returns: The number of bits
157 */
158
159uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
160 unsigned int buflen, unsigned char state)
161{
162 unsigned char *byte = buffer;
163 unsigned char *end = buffer + buflen;
164 unsigned char state1 = state << 2;
165 unsigned char state2 = state << 4;
166 unsigned char state3 = state << 6;
167 uint32_t count = 0;
168
169 for (; byte < end; byte++) {
170 if (((*byte) & 0x03) == state)
171 count++;
172 if (((*byte) & 0x0C) == state1)
173 count++;
174 if (((*byte) & 0x30) == state2)
175 count++;
176 if (((*byte) & 0xC0) == state3)
177 count++;
178 }
179
180 return count;
181}
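
The four compares above test each byte's four two-bit fields in place: for
state 1, say, the byte is matched against the patterns 0x01, 0x04, 0x10 and
0x40 under the masks 0x03, 0x0C, 0x30 and 0xC0, so no per-block shifting is
needed.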
182
diff --git a/fs/gfs2/bits.h b/fs/gfs2/bits.h
new file mode 100644
index 000000000000..36ccbdcb1eef
--- /dev/null
+++ b/fs/gfs2/bits.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BITS_DOT_H__
11#define __BITS_DOT_H__
12
13#define BFITNOENT 0xFFFFFFFF
14
15void gfs2_setbit(struct gfs2_rgrpd *rgd,
16 unsigned char *buffer, unsigned int buflen,
17 uint32_t block, unsigned char new_state);
18unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
19 unsigned char *buffer, unsigned int buflen,
20 uint32_t block);
21uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd,
22 unsigned char *buffer, unsigned int buflen,
23 uint32_t goal, unsigned char old_state);
24uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd,
25 unsigned char *buffer, unsigned int buflen,
26 unsigned char state);
27
28#endif /* __BITS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..c7723119acb6
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1098 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "inode.h"
25#include "meta_io.h"
26#include "page.h"
27#include "quota.h"
28#include "rgrp.h"
29#include "trans.h"
30#include "dir.h"
31#include "util.h"
32
 33/* This doesn't need to be that large, as the maximum number of 64-bit
 34 * pointers in a 4k block is 512, so a __u16 index is plenty. Keeping it
 35 * small saves stack space.
 36 */
37struct metapath {
38 __u16 mp_list[GFS2_MAX_META_HEIGHT];
39};
40
41typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
42 struct buffer_head *bh, uint64_t *top,
43 uint64_t *bottom, unsigned int height,
44 void *data);
45
46struct strip_mine {
47 int sm_first;
48 unsigned int sm_height;
49};
50
51/**
 52 * gfs2_unstuffer_sync - Synchronously unstuff a dinode
 53 * @ip: the inode being unstuffed
 54 * @dibh: the dinode buffer holding the stuffed data
 55 * @block: the disk block to copy the data into
 56 * @private: unused (part of the gfs2_unstuffer_t signature)
57 *
58 * Cheat and use a metadata buffer instead of a data page.
59 *
60 * Returns: errno
61 */
62
63int gfs2_unstuffer_sync(struct gfs2_inode *ip, struct buffer_head *dibh,
64 uint64_t block, void *private)
65{
66 struct buffer_head *bh;
67 int error;
68
69 bh = gfs2_meta_new(ip->i_gl, block);
70
71 gfs2_buffer_copy_tail(bh, 0, dibh, sizeof(struct gfs2_dinode));
72
73 set_buffer_dirty(bh);
74 error = sync_dirty_buffer(bh);
75
76 brelse(bh);
77
78 return error;
79}
80
81/**
82 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
83 * @ip: The GFS2 inode to unstuff
84 * @unstuffer: the routine that handles unstuffing a non-zero length file
85 * @private: private data for the unstuffer
86 *
87 * This routine unstuffs a dinode and returns it to a "normal" state such
88 * that the height can be grown in the traditional way.
89 *
90 * Returns: errno
91 */
92
93int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
94 void *private)
95{
96 struct buffer_head *bh, *dibh;
97 uint64_t block = 0;
98 int isdir = gfs2_is_dir(ip);
99 int error;
100
101 down_write(&ip->i_rw_mutex);
102
103 error = gfs2_meta_inode_buffer(ip, &dibh);
104 if (error)
105 goto out;
106
107 if (ip->i_di.di_size) {
108 /* Get a free block, fill it with the stuffed data,
109 and write it out to disk */
110
111 if (isdir) {
112 block = gfs2_alloc_meta(ip);
113
114 error = gfs2_dir_get_buffer(ip, block, 1, &bh);
115 if (error)
116 goto out_brelse;
117 gfs2_buffer_copy_tail(bh,
118 sizeof(struct gfs2_meta_header),
119 dibh, sizeof(struct gfs2_dinode));
120 brelse(bh);
121 } else {
122 block = gfs2_alloc_data(ip);
123
124 error = unstuffer(ip, dibh, block, private);
125 if (error)
126 goto out_brelse;
127 }
128 }
129
130 /* Set up the pointer to the new block */
131
132 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
133
134 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
135
136 if (ip->i_di.di_size) {
137 *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) =
138 cpu_to_be64(block);
139 ip->i_di.di_blocks++;
140 }
141
142 ip->i_di.di_height = 1;
143
144 gfs2_dinode_out(&ip->i_di, dibh->b_data);
145
146 out_brelse:
147 brelse(dibh);
148
149 out:
150 up_write(&ip->i_rw_mutex);
151
152 return error;
153}
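
The transformation, in pictures (a sketch of the code above):

	before:  [ gfs2_dinode | file data stuffed in the block tail ]
	after:   [ gfs2_dinode | one be64 block pointer ] --> [ data block ]

di_height becomes 1 and di_blocks is bumped for the new block; an empty file
skips the allocation and only gains the extra height.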
154
155/**
156 * calc_tree_height - Calculate the height of a metadata tree
157 * @ip: The GFS2 inode
158 * @size: The proposed size of the file
159 *
160 * Work out how tall a metadata tree needs to be in order to accommodate a
161 * file of a particular size. If size is less than the current size of
162 * the inode, then the current size of the inode is used instead of the
163 * supplied one.
164 *
165 * Returns: the height the tree should be
166 */
167
168static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size)
169{
170 struct gfs2_sbd *sdp = ip->i_sbd;
171 uint64_t *arr;
172 unsigned int max, height;
173
174 if (ip->i_di.di_size > size)
175 size = ip->i_di.di_size;
176
177 if (gfs2_is_dir(ip)) {
178 arr = sdp->sd_jheightsize;
179 max = sdp->sd_max_jheight;
180 } else {
181 arr = sdp->sd_heightsize;
182 max = sdp->sd_max_height;
183 }
184
185 for (height = 0; height < max; height++)
186 if (arr[height] >= size)
187 break;
188
189 return height;
190}
191
192/**
193 * build_height - Build a metadata tree of the requested height
194 * @ip: The GFS2 inode
195 * @height: The height to build to
196 *
197 * This routine makes sure that the metadata tree is tall enough to hold
198 * "size" bytes of data.
199 *
200 * Returns: errno
201 */
202
203static int build_height(struct gfs2_inode *ip, int height)
204{
205 struct gfs2_sbd *sdp = ip->i_sbd;
206 struct buffer_head *bh, *dibh;
207 uint64_t block = 0, *bp;
208 unsigned int x;
209 int new_block;
210 int error;
211
212 while (ip->i_di.di_height < height) {
213 error = gfs2_meta_inode_buffer(ip, &dibh);
214 if (error)
215 return error;
216
217 new_block = 0;
218 bp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
219 for (x = 0; x < sdp->sd_diptrs; x++, bp++)
220 if (*bp) {
221 new_block = 1;
222 break;
223 }
224
225 if (new_block) {
226 /* Get a new block, fill it with the old direct
227 pointers, and write it out */
228
229 block = gfs2_alloc_meta(ip);
230
231 bh = gfs2_meta_new(ip->i_gl, block);
232 gfs2_trans_add_bh(ip->i_gl, bh, 1);
233 gfs2_metatype_set(bh,
234 GFS2_METATYPE_IN,
235 GFS2_FORMAT_IN);
236 gfs2_buffer_copy_tail(bh,
237 sizeof(struct gfs2_meta_header),
238 dibh, sizeof(struct gfs2_dinode));
239
240 brelse(bh);
241 }
242
243 /* Set up the new direct pointer and write it out to disk */
244
245 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
246
247 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
248
249 if (new_block) {
250 *(uint64_t *)(dibh->b_data +
251 sizeof(struct gfs2_dinode)) =
252 cpu_to_be64(block);
253 ip->i_di.di_blocks++;
254 }
255
256 ip->i_di.di_height++;
257
258 gfs2_dinode_out(&ip->i_di, dibh->b_data);
259 brelse(dibh);
260 }
261
262 return 0;
263}
264
265/**
266 * find_metapath - Find path through the metadata tree
267 * @ip: The inode pointer
268 * @mp: The metapath to return the result in
269 * @block: The disk block to look up
270 *
271 * This routine returns a struct metapath structure that defines a path
272 * through the metadata of inode "ip" to get to block "block".
273 *
274 * Example:
275 * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
276 * filesystem with a blocksize of 4096.
277 *
 278 * find_metapath() would return a struct metapath structure set to:
 279 * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
281 *
282 * That means that in order to get to the block containing the byte at
283 * offset 101342453, we would load the indirect block pointed to by pointer
284 * 0 in the dinode. We would then load the indirect block pointed to by
285 * pointer 48 in that indirect block. We would then load the data block
286 * pointed to by pointer 165 in that indirect block.
287 *
288 * ----------------------------------------
289 * | Dinode | |
290 * | | 4|
291 * | |0 1 2 3 4 5 9|
292 * | | 6|
293 * ----------------------------------------
294 * |
295 * |
296 * V
297 * ----------------------------------------
298 * | Indirect Block |
299 * | 5|
300 * | 4 4 4 4 4 5 5 1|
301 * |0 5 6 7 8 9 0 1 2|
302 * ----------------------------------------
303 * |
304 * |
305 * V
306 * ----------------------------------------
307 * | Indirect Block |
308 * | 1 1 1 1 1 5|
309 * | 6 6 6 6 6 1|
310 * |0 3 4 5 6 7 2|
311 * ----------------------------------------
312 * |
313 * |
314 * V
315 * ----------------------------------------
316 * | Data block containing offset |
317 * | 101342453 |
318 * | |
319 * | |
320 * ----------------------------------------
321 *
322 */
323
324static void find_metapath(struct gfs2_inode *ip, uint64_t block,
325 struct metapath *mp)
326{
327 struct gfs2_sbd *sdp = ip->i_sbd;
328 uint64_t b = block;
329 unsigned int i;
330
331 for (i = ip->i_di.di_height; i--;)
332 mp->mp_list[i] = (__u16)do_div(b, sdp->sd_inptrs);
333
334}
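
Replaying the worked example above as bare arithmetic (a sketch; 512 pointers
per block matches the simplified diagram, while the code uses the
header-adjusted sdp->sd_inptrs):

	uint64_t b = 101342453 >> 12;	/* 4096-byte blocks: block 24741 */
	unsigned int i, list[3];

	for (i = 3; i--;) {		/* deepest index first */
		list[i] = b % 512;
		b /= 512;
	}
	/* list = { 0, 48, 165 }: dinode pointer 0 -> indirect pointer 48
	   -> data pointer 165, exactly as in the diagram */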
335
336/**
337 * metapointer - Return pointer to start of metadata in a buffer
338 * @bh: The buffer
339 * @height: The metadata height (0 = dinode)
340 * @mp: The metapath
341 *
342 * Return a pointer to the block number of the next height of the metadata
343 * tree given a buffer containing the pointer to the current height of the
344 * metadata tree.
345 */
346
347static inline uint64_t *metapointer(struct buffer_head *bh,
348 unsigned int height, struct metapath *mp)
349{
350 unsigned int head_size = (height > 0) ?
351 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
352
353 return ((uint64_t *)(bh->b_data + head_size)) + mp->mp_list[height];
354}
355
356/**
357 * lookup_block - Get the next metadata block in metadata tree
358 * @ip: The GFS2 inode
359 * @bh: Buffer containing the pointers to metadata blocks
360 * @height: The height of the tree (0 = dinode)
361 * @mp: The metapath
 362 * @create: Non-zero if we may create a new metadata block
363 * @new: Used to indicate if we did create a new metadata block
364 * @block: the returned disk block number
365 *
 366 * Given a metadata tree complete to a particular height, check whether the
 367 * block at the next height exists; if not (and @create is set), allocate it.
 368 * The resulting block number is returned in @block.
369 *
370 */
371
372static void lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
373 unsigned int height, struct metapath *mp, int create,
374 int *new, uint64_t *block)
375{
376 uint64_t *ptr = metapointer(bh, height, mp);
377
378 if (*ptr) {
379 *block = be64_to_cpu(*ptr);
380 return;
381 }
382
383 *block = 0;
384
385 if (!create)
386 return;
387
388 if (height == ip->i_di.di_height - 1 &&
389 !gfs2_is_dir(ip))
390 *block = gfs2_alloc_data(ip);
391 else
392 *block = gfs2_alloc_meta(ip);
393
394 gfs2_trans_add_bh(ip->i_gl, bh, 1);
395
396 *ptr = cpu_to_be64(*block);
397 ip->i_di.di_blocks++;
398
399 *new = 1;
400}
401
402/**
403 * gfs2_block_map - Map a block from an inode to a disk block
404 * @ip: The GFS2 inode
405 * @lblock: The logical block number
406 * @new: Value/Result argument (1 = may create/did create new blocks)
407 * @dblock: the disk block number of the start of an extent
408 * @extlen: the size of the extent
409 *
410 * Find the block number on the current device which corresponds to an
411 * inode's block. If the block had to be created, "new" will be set.
412 *
413 * Returns: errno
414 */
415
416int gfs2_block_map(struct gfs2_inode *ip, uint64_t lblock, int *new,
417 uint64_t *dblock, uint32_t *extlen)
418{
419 struct gfs2_sbd *sdp = ip->i_sbd;
420 struct buffer_head *bh;
421 struct metapath mp;
422 int create = *new;
423 unsigned int bsize;
424 unsigned int height;
425 unsigned int end_of_metadata;
426 unsigned int x;
427 int error = 0;
428
429 *new = 0;
430 *dblock = 0;
431 if (extlen)
432 *extlen = 0;
433
434 if (create)
435 down_write(&ip->i_rw_mutex);
436 else
437 down_read(&ip->i_rw_mutex);
438
439 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
440 goto out;
441
442 bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
443
444 height = calc_tree_height(ip, (lblock + 1) * bsize);
445 if (ip->i_di.di_height < height) {
446 if (!create)
447 goto out;
448
449 error = build_height(ip, height);
450 if (error)
451 goto out;
452 }
453
454 find_metapath(ip, lblock, &mp);
455 end_of_metadata = ip->i_di.di_height - 1;
456
457 error = gfs2_meta_inode_buffer(ip, &bh);
458 if (error)
459 goto out;
460
461 for (x = 0; x < end_of_metadata; x++) {
462 lookup_block(ip, bh, x, &mp, create, new, dblock);
463 brelse(bh);
464 if (!*dblock)
465 goto out;
466
467 error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh);
468 if (error)
469 goto out;
470 }
471
472 lookup_block(ip, bh, end_of_metadata, &mp, create, new, dblock);
473
474 if (extlen && *dblock) {
475 *extlen = 1;
476
477 if (!*new) {
478 uint64_t tmp_dblock;
479 int tmp_new;
480 unsigned int nptrs;
481
482 nptrs = (end_of_metadata) ? sdp->sd_inptrs :
483 sdp->sd_diptrs;
484
485 while (++mp.mp_list[end_of_metadata] < nptrs) {
486 lookup_block(ip, bh, end_of_metadata, &mp,
487 0, &tmp_new, &tmp_dblock);
488
489 if (*dblock + *extlen != tmp_dblock)
490 break;
491
492 (*extlen)++;
493 }
494 }
495 }
496
497 brelse(bh);
498
499 if (*new) {
500 error = gfs2_meta_inode_buffer(ip, &bh);
501 if (!error) {
502 gfs2_trans_add_bh(ip->i_gl, bh, 1);
503 gfs2_dinode_out(&ip->i_di, bh->b_data);
504 brelse(bh);
505 }
506 }
507
508 out:
509 if (create)
510 up_write(&ip->i_rw_mutex);
511 else
512 up_read(&ip->i_rw_mutex);
513
514 return error;
515}
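
Typical lookup-only use of the mapping call (a sketch;
gfs2_write_alloc_required() at the bottom of this file is a real caller):

	int new = 0;		/* 0: just look up; 1: allocate if missing */
	uint64_t dblock;
	uint32_t extlen;
	int error;

	error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
	if (!error && !dblock) {
		/* hole: lblock has no backing disk block */
	}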
516
517/**
518 * recursive_scan - recursively scan through the end of a file
519 * @ip: the inode
520 * @dibh: the dinode buffer
521 * @mp: the path through the metadata to the point to start
522 * @height: the height the recursion is at
523 * @block: the indirect block to look at
524 * @first: 1 if this is the first block
525 * @bc: the call to make for each piece of metadata
526 * @data: data opaque to this function to pass to @bc
527 *
528 * When this is first called @height and @block should be zero and
529 * @first should be 1.
530 *
531 * Returns: errno
532 */
533
534static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
535 struct metapath *mp, unsigned int height,
536 uint64_t block, int first, block_call_t bc,
537 void *data)
538{
539 struct gfs2_sbd *sdp = ip->i_sbd;
540 struct buffer_head *bh = NULL;
541 uint64_t *top, *bottom;
542 uint64_t bn;
543 int error;
544 int mh_size = sizeof(struct gfs2_meta_header);
545
546 if (!height) {
547 error = gfs2_meta_inode_buffer(ip, &bh);
548 if (error)
549 return error;
550 dibh = bh;
551
552 top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
553 mp->mp_list[0];
554 bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) +
555 sdp->sd_diptrs;
556 } else {
557 error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
558 if (error)
559 return error;
560
561 top = (uint64_t *)(bh->b_data + mh_size) +
562 ((first) ? mp->mp_list[height] : 0);
563
564 bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs;
565 }
566
567 error = bc(ip, dibh, bh, top, bottom, height, data);
568 if (error)
569 goto out;
570
571 if (height < ip->i_di.di_height - 1)
572 for (; top < bottom; top++, first = 0) {
573 if (!*top)
574 continue;
575
576 bn = be64_to_cpu(*top);
577
578 error = recursive_scan(ip, dibh, mp, height + 1, bn,
579 first, bc, data);
580 if (error)
581 break;
582 }
583
584 out:
585 brelse(bh);
586
587 return error;
588}
589
590/**
 591 * do_strip - Look for a particular layer of the file and strip it off
592 * @ip: the inode
593 * @dibh: the dinode buffer
594 * @bh: A buffer of pointers
595 * @top: The first pointer in the buffer
596 * @bottom: One more than the last pointer
597 * @height: the height this buffer is at
598 * @data: a pointer to a struct strip_mine
599 *
600 * Returns: errno
601 */
602
603static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
604 struct buffer_head *bh, uint64_t *top, uint64_t *bottom,
605 unsigned int height, void *data)
606{
607 struct strip_mine *sm = (struct strip_mine *)data;
608 struct gfs2_sbd *sdp = ip->i_sbd;
609 struct gfs2_rgrp_list rlist;
610 uint64_t bn, bstart;
611 uint32_t blen;
612 uint64_t *p;
613 unsigned int rg_blocks = 0;
614 int metadata;
615 unsigned int revokes = 0;
616 int x;
617 int error;
618
619 if (!*top)
620 sm->sm_first = 0;
621
622 if (height != sm->sm_height)
623 return 0;
624
625 if (sm->sm_first) {
626 top++;
627 sm->sm_first = 0;
628 }
629
630 metadata = (height != ip->i_di.di_height - 1);
631 if (metadata)
632 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
633
634 error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
635 if (error)
636 return error;
637
638 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
639 bstart = 0;
640 blen = 0;
641
642 for (p = top; p < bottom; p++) {
643 if (!*p)
644 continue;
645
646 bn = be64_to_cpu(*p);
647
648 if (bstart + blen == bn)
649 blen++;
650 else {
651 if (bstart)
652 gfs2_rlist_add(sdp, &rlist, bstart);
653
654 bstart = bn;
655 blen = 1;
656 }
657 }
658
659 if (bstart)
660 gfs2_rlist_add(sdp, &rlist, bstart);
661 else
662 goto out; /* Nothing to do */
663
664 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
665
666 for (x = 0; x < rlist.rl_rgrps; x++) {
667 struct gfs2_rgrpd *rgd;
668 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
669 rg_blocks += rgd->rd_ri.ri_length;
670 }
671
672 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
673 if (error)
674 goto out_rlist;
675
676 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
677 RES_INDIRECT + RES_STATFS + RES_QUOTA,
678 revokes);
679 if (error)
680 goto out_rg_gunlock;
681
682 down_write(&ip->i_rw_mutex);
683
684 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
685 gfs2_trans_add_bh(ip->i_gl, bh, 1);
686
687 bstart = 0;
688 blen = 0;
689
690 for (p = top; p < bottom; p++) {
691 if (!*p)
692 continue;
693
694 bn = be64_to_cpu(*p);
695
696 if (bstart + blen == bn)
697 blen++;
698 else {
699 if (bstart) {
700 if (metadata)
701 gfs2_free_meta(ip, bstart, blen);
702 else
703 gfs2_free_data(ip, bstart, blen);
704 }
705
706 bstart = bn;
707 blen = 1;
708 }
709
710 *p = 0;
711 if (!ip->i_di.di_blocks)
712 gfs2_consist_inode(ip);
713 ip->i_di.di_blocks--;
714 }
715 if (bstart) {
716 if (metadata)
717 gfs2_free_meta(ip, bstart, blen);
718 else
719 gfs2_free_data(ip, bstart, blen);
720 }
721
722 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
723
724 gfs2_dinode_out(&ip->i_di, dibh->b_data);
725
726 up_write(&ip->i_rw_mutex);
727
728 gfs2_trans_end(sdp);
729
730 out_rg_gunlock:
731 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
732
733 out_rlist:
734 gfs2_rlist_free(&rlist);
735
736 out:
737 gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
738
739 return error;
740}
741
742/**
743 * do_grow - Make a file look bigger than it is
744 * @ip: the inode
745 * @size: the size to set the file to
746 *
747 * Called with an exclusive lock on @ip.
748 *
749 * Returns: errno
750 */
751
752static int do_grow(struct gfs2_inode *ip, uint64_t size)
753{
754 struct gfs2_sbd *sdp = ip->i_sbd;
755 struct gfs2_alloc *al;
756 struct buffer_head *dibh;
757 unsigned int h;
758 int error;
759
760 al = gfs2_alloc_get(ip);
761
762 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
763 if (error)
764 goto out;
765
766 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
767 if (error)
768 goto out_gunlock_q;
769
770 al->al_requested = sdp->sd_max_height + RES_DATA;
771
772 error = gfs2_inplace_reserve(ip);
773 if (error)
774 goto out_gunlock_q;
775
776 error = gfs2_trans_begin(sdp,
777 sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
778 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
779 if (error)
780 goto out_ipres;
781
782 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
783 if (gfs2_is_stuffed(ip)) {
784 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
785 NULL);
786 if (error)
787 goto out_end_trans;
788 }
789
790 h = calc_tree_height(ip, size);
791 if (ip->i_di.di_height < h) {
792 down_write(&ip->i_rw_mutex);
793 error = build_height(ip, h);
794 up_write(&ip->i_rw_mutex);
795 if (error)
796 goto out_end_trans;
797 }
798 }
799
800 ip->i_di.di_size = size;
801 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
802
803 error = gfs2_meta_inode_buffer(ip, &dibh);
804 if (error)
805 goto out_end_trans;
806
807 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
808 gfs2_dinode_out(&ip->i_di, dibh->b_data);
809 brelse(dibh);
810
811 out_end_trans:
812 gfs2_trans_end(sdp);
813
814 out_ipres:
815 gfs2_inplace_release(ip);
816
817 out_gunlock_q:
818 gfs2_quota_unlock(ip);
819
820 out:
821 gfs2_alloc_put(ip);
822
823 return error;
824}
825
826static int trunc_start(struct gfs2_inode *ip, uint64_t size)
827{
828 struct gfs2_sbd *sdp = ip->i_sbd;
829 struct buffer_head *dibh;
830 int journaled = gfs2_is_jdata(ip);
831 int error;
832
833 error = gfs2_trans_begin(sdp,
834 RES_DINODE + ((journaled) ? RES_JDATA : 0), 0);
835 if (error)
836 return error;
837
838 error = gfs2_meta_inode_buffer(ip, &dibh);
839 if (error)
840 goto out;
841
842 if (gfs2_is_stuffed(ip)) {
843 ip->i_di.di_size = size;
844 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
845 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
846 gfs2_dinode_out(&ip->i_di, dibh->b_data);
847 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
848 error = 1;
849
850 } else {
851 if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1))
852 error = gfs2_block_truncate_page(ip->i_vnode->i_mapping);
853
854 if (!error) {
855 ip->i_di.di_size = size;
856 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
857 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
858 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
859 gfs2_dinode_out(&ip->i_di, dibh->b_data);
860 }
861 }
862
863 brelse(dibh);
864
865 out:
866 gfs2_trans_end(sdp);
867
868 return error;
869}
870
871static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size)
872{
873 unsigned int height = ip->i_di.di_height;
874 uint64_t lblock;
875 struct metapath mp;
876 int error;
877
878 if (!size)
879 lblock = 0;
880 else
881 lblock = (size - 1) >> ip->i_sbd->sd_sb.sb_bsize_shift;
882
883 find_metapath(ip, lblock, &mp);
884 gfs2_alloc_get(ip);
885
886 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
887 if (error)
888 goto out;
889
890 while (height--) {
891 struct strip_mine sm;
892 sm.sm_first = !!size;
893 sm.sm_height = height;
894
895 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
896 if (error)
897 break;
898 }
899
900 gfs2_quota_unhold(ip);
901
902 out:
903 gfs2_alloc_put(ip);
904 return error;
905}
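
The while (height--) loop strips the tree bottom-up: the first pass
(sm_height == di_height - 1) makes do_strip() free the blocks referenced from
the deepest pointer level, i.e. the file data, and each later pass frees the
now-unreferenced indirect level above it, so no pass frees a block that is
still pointed to from below.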
906
907static int trunc_end(struct gfs2_inode *ip)
908{
909 struct gfs2_sbd *sdp = ip->i_sbd;
910 struct buffer_head *dibh;
911 int error;
912
913 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
914 if (error)
915 return error;
916
917 down_write(&ip->i_rw_mutex);
918
919 error = gfs2_meta_inode_buffer(ip, &dibh);
920 if (error)
921 goto out;
922
923 if (!ip->i_di.di_size) {
924 ip->i_di.di_height = 0;
925 ip->i_di.di_goal_meta =
926 ip->i_di.di_goal_data =
927 ip->i_num.no_addr;
928 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
929 }
930 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
931 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
932
933 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
934 gfs2_dinode_out(&ip->i_di, dibh->b_data);
935 brelse(dibh);
936
937 out:
938 up_write(&ip->i_rw_mutex);
939
940 gfs2_trans_end(sdp);
941
942 return error;
943}
944
945/**
946 * do_shrink - make a file smaller
947 * @ip: the inode
948 * @size: the size to make the file
950 *
951 * Called with an exclusive lock on @ip.
952 *
953 * Returns: errno
954 */
955
956static int do_shrink(struct gfs2_inode *ip, uint64_t size)
957{
958 int error;
959
960 error = trunc_start(ip, size);
961 if (error < 0)
962 return error;
963 if (error > 0)
964 return 0;
965
966 error = trunc_dealloc(ip, size);
967 if (!error)
968 error = trunc_end(ip);
969
970 return error;
971}
972
973/**
974 * gfs2_truncatei - make a file a given size
975 * @ip: the inode
976 * @size: the size to make the file
978 *
979 * The file size can grow, shrink, or stay the same size.
980 *
981 * Returns: errno
982 */
983
984int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size)
985{
986 int error;
987
988 if (gfs2_assert_warn(ip->i_sbd, S_ISREG(ip->i_di.di_mode)))
989 return -EINVAL;
990
991 if (size > ip->i_di.di_size)
992 error = do_grow(ip, size);
993 else
994 error = do_shrink(ip, size);
995
996 return error;
997}
998
999int gfs2_truncatei_resume(struct gfs2_inode *ip)
1000{
1001 int error;
1002 error = trunc_dealloc(ip, ip->i_di.di_size);
1003 if (!error)
1004 error = trunc_end(ip);
1005 return error;
1006}
1007
1008int gfs2_file_dealloc(struct gfs2_inode *ip)
1009{
1010 return trunc_dealloc(ip, 0);
1011}
1012
1013/**
1014 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1015 * @ip: the file
1016 * @len: the number of bytes to be written to the file
1017 * @data_blocks: returns the number of data blocks required
1018 * @ind_blocks: returns the number of indirect blocks required
1019 *
1020 */
1021
1022void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1023 unsigned int *data_blocks, unsigned int *ind_blocks)
1024{
1025 struct gfs2_sbd *sdp = ip->i_sbd;
1026 unsigned int tmp;
1027
1028 if (gfs2_is_dir(ip)) {
1029 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1030 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1031 } else {
1032 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1033 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1034 }
1035
1036 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1037 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1038 *ind_blocks += tmp;
1039 }
1040}
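
A rough worked example, assuming 4KB blocks (sd_diptrs about 483 and
sd_inptrs about 509 once header overhead is subtracted): a 16MB write gives
data_blocks = 4096 + 3 = 4099, and since 4099 > 483 the loop adds
DIV_ROUND_UP(4099, 509) = 9 indirect blocks on top of the fixed
3 * (sd_max_height - 1).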
1041
1042/**
1043 * gfs2_write_alloc_required - figure out if a write will require an allocation
1044 * @ip: the file being written to
1045 * @offset: the offset to write to
1046 * @len: the number of bytes being written
1047 * @alloc_required: set to 1 if an alloc is required, 0 otherwise
1048 *
1049 * Returns: errno
1050 */
1051
1052int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
1053 unsigned int len, int *alloc_required)
1054{
1055 struct gfs2_sbd *sdp = ip->i_sbd;
1056 uint64_t lblock, lblock_stop, dblock;
1057 uint32_t extlen;
1058 int new = 0;
1059 int error = 0;
1060
1061 *alloc_required = 0;
1062
1063 if (!len)
1064 return 0;
1065
1066 if (gfs2_is_stuffed(ip)) {
1067 if (offset + len >
1068 sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1069 *alloc_required = 1;
1070 return 0;
1071 }
1072
1073 if (gfs2_is_dir(ip)) {
1074 unsigned int bsize = sdp->sd_jbsize;
1075 lblock = offset;
1076 do_div(lblock, bsize);
1077 lblock_stop = offset + len + bsize - 1;
1078 do_div(lblock_stop, bsize);
1079 } else {
1080 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1081 lblock = offset >> shift;
1082 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1083 }
1084
1085 for (; lblock < lblock_stop; lblock += extlen) {
1086 error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
1087 if (error)
1088 return error;
1089
1090 if (!dblock) {
1091 *alloc_required = 1;
1092 return 0;
1093 }
1094 }
1095
1096 return 0;
1097}
1098
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..ee9ec8d7515c
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__
12
13typedef int (*gfs2_unstuffer_t) (struct gfs2_inode * ip,
14 struct buffer_head * dibh, uint64_t block,
15 void *private);
16int gfs2_unstuffer_sync(struct gfs2_inode *ip, struct buffer_head *dibh,
17 uint64_t block, void *private);
18int gfs2_unstuff_dinode(struct gfs2_inode *ip, gfs2_unstuffer_t unstuffer,
19 void *private);
20
21int gfs2_block_map(struct gfs2_inode *ip,
22 uint64_t lblock, int *new,
23 uint64_t *dblock, uint32_t *extlen);
24
25int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size);
26int gfs2_truncatei_resume(struct gfs2_inode *ip);
27int gfs2_file_dealloc(struct gfs2_inode *ip);
28
29void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
30 unsigned int *data_blocks,
31 unsigned int *ind_blocks);
32int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset,
33 unsigned int len, int *alloc_required);
34
35#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..b3830b92d78c
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,229 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/semaphore.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "daemon.h"
24#include "glock.h"
25#include "log.h"
26#include "quota.h"
27#include "recovery.h"
28#include "super.h"
29#include "unlinked.h"
30#include "util.h"
31
 32/* This uses schedule_timeout() instead of msleep() so that the daemons
 33   can be woken before the timeout expires when unmounting; otherwise the
 34   user's unmount could sit there waiting out a full sleep interval.
35
36 The kthread functions used to start these daemons block and flush signals. */
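
Every daemon below follows the same skeleton (a sketch; do_work() and secs
stand in for the per-daemon body and tunable):

	while (!kthread_should_stop()) {
		do_work(sdp);
		schedule_timeout_interruptible(secs * HZ);
	}

kthread_stop() wakes the task, so the interruptible sleep ends early at
unmount instead of running out its full interval.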
37
38/**
39 * gfs2_scand - Look for cached glocks and inodes to toss from memory
40 * @sdp: Pointer to GFS2 superblock
41 *
42 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
43 * See gfs2_glockd()
44 */
45
46int gfs2_scand(void *data)
47{
48 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
49 unsigned long t;
50
51 while (!kthread_should_stop()) {
52 gfs2_scand_internal(sdp);
53 t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
54 schedule_timeout_interruptible(t);
55 }
56
57 return 0;
58}
59
60/**
61 * gfs2_glockd - Reclaim unused glock structures
62 * @sdp: Pointer to GFS2 superblock
63 *
64 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
 65 * The number of daemons can be set by the user with the num_glockd mount option.
66 */
67
68int gfs2_glockd(void *data)
69{
70 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
71 DECLARE_WAITQUEUE(wait_chan, current);
72
73 while (!kthread_should_stop()) {
74 while (atomic_read(&sdp->sd_reclaim_count))
75 gfs2_reclaim_glock(sdp);
76
77 set_current_state(TASK_INTERRUPTIBLE);
78 add_wait_queue(&sdp->sd_reclaim_wq, &wait_chan);
79 if (!atomic_read(&sdp->sd_reclaim_count) &&
80 !kthread_should_stop())
81 schedule();
82 remove_wait_queue(&sdp->sd_reclaim_wq, &wait_chan);
83 set_current_state(TASK_RUNNING);
84 }
85
86 return 0;
87}
88
89/**
90 * gfs2_recoverd - Recover dead machine's journals
91 * @sdp: Pointer to GFS2 superblock
92 *
93 */
94
95int gfs2_recoverd(void *data)
96{
97 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
98 unsigned long t;
99
100 while (!kthread_should_stop()) {
101 gfs2_check_journals(sdp);
102 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
103 schedule_timeout_interruptible(t);
104 }
105
106 return 0;
107}
108
109/**
110 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
111 * @sdp: Pointer to GFS2 superblock
112 *
113 * Also, periodically check to make sure that we're using the most recent
114 * journal index.
115 */
116
117int gfs2_logd(void *data)
118{
119 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
120 struct gfs2_holder ji_gh;
121 unsigned long t;
122
123 while (!kthread_should_stop()) {
124 /* Advance the log tail */
125
126 t = sdp->sd_log_flush_time +
127 gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
128
129 gfs2_ail1_empty(sdp, DIO_ALL);
130
131 if (time_after_eq(jiffies, t)) {
132 gfs2_log_flush(sdp, NULL);
133 sdp->sd_log_flush_time = jiffies;
134 }
135
136 /* Check for latest journal index */
137
138 t = sdp->sd_jindex_refresh_time +
139 gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
140
141 if (time_after_eq(jiffies, t)) {
142 if (!gfs2_jindex_hold(sdp, &ji_gh))
143 gfs2_glock_dq_uninit(&ji_gh);
144 sdp->sd_jindex_refresh_time = jiffies;
145 }
146
147 t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
148 schedule_timeout_interruptible(t);
149 }
150
151 return 0;
152}
153
154/**
155 * gfs2_quotad - Write cached quota changes into the quota file
156 * @sdp: Pointer to GFS2 superblock
157 *
158 */
159
160int gfs2_quotad(void *data)
161{
162 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
163 unsigned long t;
164 int error;
165
166 while (!kthread_should_stop()) {
167 /* Update the master statfs file */
168
169 t = sdp->sd_statfs_sync_time +
170 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
171
172 if (time_after_eq(jiffies, t)) {
173 error = gfs2_statfs_sync(sdp);
174 if (error &&
175 error != -EROFS &&
176 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
177 fs_err(sdp, "quotad: (1) error=%d\n", error);
178 sdp->sd_statfs_sync_time = jiffies;
179 }
180
181 /* Update quota file */
182
183 t = sdp->sd_quota_sync_time +
184 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
185
186 if (time_after_eq(jiffies, t)) {
187 error = gfs2_quota_sync(sdp);
188 if (error &&
189 error != -EROFS &&
190 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
191 fs_err(sdp, "quotad: (2) error=%d\n", error);
192 sdp->sd_quota_sync_time = jiffies;
193 }
194
195 gfs2_quota_scan(sdp);
196
197 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
198 schedule_timeout_interruptible(t);
199 }
200
201 return 0;
202}
203
204/**
205 * gfs2_inoded - Deallocate unlinked inodes
206 * @sdp: Pointer to GFS2 superblock
207 *
208 */
209
210int gfs2_inoded(void *data)
211{
212 struct gfs2_sbd *sdp = (struct gfs2_sbd *)data;
213 unsigned long t;
214 int error;
215
216 while (!kthread_should_stop()) {
217 error = gfs2_unlinked_dealloc(sdp);
218 if (error &&
219 error != -EROFS &&
220 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
221 fs_err(sdp, "inoded: error = %d\n", error);
222
223 t = gfs2_tune_get(sdp, gt_inoded_secs) * HZ;
224 schedule_timeout_interruptible(t);
225 }
226
227 return 0;
228}
229
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..a27fdeda5fbb
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_scand(void *data);
14int gfs2_glockd(void *data);
15int gfs2_recoverd(void *data);
16int gfs2_logd(void *data);
17int gfs2_quotad(void *data);
18int gfs2_inoded(void *data);
19
20#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..eb68cdd41d48
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1968 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11* Implements Extendible Hashing as described in:
12* "Extendible Hashing" by Fagin, et al in
13* __ACM Trans. on Database Systems__, Sept 1979.
14*
15*
16* Here's the layout of dirents which is essentially the same as that of ext2
17* within a single block. The field de_name_len is the number of bytes
18* actually required for the name (no null terminator). The field de_rec_len
19* is the number of bytes allocated to the dirent. The offset of the next
20* dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21* deleted, the preceding dirent inherits its allocated space, ie
22* prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23* by adding de_rec_len to the current dirent, this essentially causes the
24* deleted dirent to get jumped over when iterating through all the dirents.
25*
26* When deleting the first dirent in a block, there is no previous dirent so
27* the field de_inum.no_addr is set to zero to mark it deleted. When allocating
28* a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29* first dirent has (de_inum.no_addr == 0) and de_rec_len is large enough, this first
30* dirent is allocated. Otherwise it must go through all the 'used' dirents
31* searching for one in which the amount of total space minus the amount of
32* used space will provide enough space for the new dirent.
33*
34* There are two types of blocks in which dirents reside. In a stuffed dinode,
35* the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36* the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37* beginning of the leaf block. The dirents reside in leaves when
38*
39* dip->i_di.di_flags & GFS2_DIF_EXHASH is true
40*
41* Otherwise, the dirents are "linear", within a single stuffed dinode block.
42*
43* When the dirents are in leaves, the actual contents of the directory file are
44* used as an array of 64-bit block pointers pointing to the leaf blocks. The
45* dirents are NOT in the directory file itself. There can be more than one block
46* pointer in the array that points to the same leaf. In fact, when a directory
47* is first converted from linear to exhash, all of the pointers point to the
48* same leaf.
49*
50* When a leaf is completely full, the size of the hash table can be
51* doubled unless it is already at the maximum size, which is hard-coded into
52* GFS2_DIR_MAX_DEPTH. Only once the hash table has reached that maximum size
53* are full leaves chained together in a linked list.
54*/
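
A sketch of walking one block's dirents under the de_rec_len scheme described
above (gfs2_dirent_scan() below is the real, validity-checked iterator;
first_dirent() and visit() are illustrative only):

	struct gfs2_dirent *dent = first_dirent(block);	/* past the header */

	while ((char *)dent < block_end) {
		if (dent->de_inum.no_addr)	/* zero: deleted first dirent */
			visit(dent);
		dent = (struct gfs2_dirent *)((char *)dent +
					      be16_to_cpu(dent->de_rec_len));
	}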
55
56#include <linux/sched.h>
57#include <linux/slab.h>
58#include <linux/spinlock.h>
59#include <linux/completion.h>
60#include <linux/buffer_head.h>
61#include <linux/sort.h>
62#include <linux/gfs2_ondisk.h>
63#include <linux/crc32.h>
64#include <linux/vmalloc.h>
65#include <asm/semaphore.h>
66
67#include "gfs2.h"
68#include "lm_interface.h"
69#include "incore.h"
70#include "dir.h"
71#include "glock.h"
72#include "inode.h"
73#include "meta_io.h"
74#include "quota.h"
75#include "rgrp.h"
76#include "trans.h"
77#include "bmap.h"
78#include "util.h"
79
80#define IS_LEAF 1 /* Hashed (leaf) directory */
81#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
82
83#if 1
84#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1)
85#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1))
86#else
87#define gfs2_disk_hash2offset(h) (((uint64_t)(h)))
88#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p))))
89#endif
90
91typedef int (*leaf_call_t) (struct gfs2_inode *dip,
92 uint32_t index, uint32_t len, uint64_t leaf_no,
93 void *data);
94
95int gfs2_dir_get_buffer(struct gfs2_inode *ip, uint64_t block, int new,
96 struct buffer_head **bhp)
97{
98 struct buffer_head *bh;
99 int error = 0;
100
101 if (new) {
102 bh = gfs2_meta_new(ip->i_gl, block);
103 gfs2_trans_add_bh(ip->i_gl, bh, 1);
104 gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
105 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
106 } else {
107 error = gfs2_meta_read(ip->i_gl, block, DIO_START | DIO_WAIT,
108 &bh);
109 if (error)
110 return error;
111 if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_JD)) {
112 brelse(bh);
113 return -EIO;
114 }
115 }
116
117 *bhp = bh;
118 return 0;
119}
120
121
122
123static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
124 unsigned int offset, unsigned int size)
125
126{
127 struct buffer_head *dibh;
128 int error;
129
130 error = gfs2_meta_inode_buffer(ip, &dibh);
131 if (error)
132 return error;
133
134 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
135 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
136 if (ip->i_di.di_size < offset + size)
137 ip->i_di.di_size = offset + size;
138 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
139 gfs2_dinode_out(&ip->i_di, dibh->b_data);
140
141 brelse(dibh);
142
143 return size;
144}
145
146
147
148/**
149 * gfs2_dir_write_data - Write directory information to the inode
150 * @ip: The GFS2 inode
151 * @buf: The buffer containing information to be written
152 * @offset: The file offset to start writing at
153 * @size: The amount of data to write
154 *
155 * Returns: The number of bytes correctly written or error code
156 */
157static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
158 uint64_t offset, unsigned int size)
159{
160 struct gfs2_sbd *sdp = ip->i_sbd;
161 struct buffer_head *dibh;
162 uint64_t lblock, dblock;
163 uint32_t extlen = 0;
164 unsigned int o;
165 int copied = 0;
166 int error = 0;
167
168 if (!size)
169 return 0;
170
171 if (gfs2_is_stuffed(ip) &&
172 offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
173 return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
174 size);
175
176 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
177 return -EINVAL;
178
179 if (gfs2_is_stuffed(ip)) {
180 error = gfs2_unstuff_dinode(ip, NULL, NULL);
181 if (error)
182 return error;
183 }
184
185 lblock = offset;
186 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
187
188 while (copied < size) {
189 unsigned int amount;
190 struct buffer_head *bh;
191 int new;
192
193 amount = size - copied;
194 if (amount > sdp->sd_sb.sb_bsize - o)
195 amount = sdp->sd_sb.sb_bsize - o;
196
197 if (!extlen) {
198 new = 1;
199 error = gfs2_block_map(ip, lblock, &new, &dblock,
200 &extlen);
201 if (error)
202 goto fail;
203 error = -EIO;
204 if (gfs2_assert_withdraw(sdp, dblock))
205 goto fail;
206 }
207
208 error = gfs2_dir_get_buffer(ip, dblock,
209 (amount == sdp->sd_jbsize) ?
210 1 : new, &bh);
211 if (error)
212 goto fail;
213
214 gfs2_trans_add_bh(ip->i_gl, bh, 1);
215 memcpy(bh->b_data + o, buf, amount);
216 brelse(bh);
217 if (error)
218 goto fail;
219
 220		buf += amount;	/* advance the source buffer too */
 221		copied += amount;
 222		lblock++;
 223		dblock++;
 224		extlen--;
224
225 o = sizeof(struct gfs2_meta_header);
226 }
227
228out:
229 error = gfs2_meta_inode_buffer(ip, &dibh);
230 if (error)
231 return error;
232
233 if (ip->i_di.di_size < offset + copied)
234 ip->i_di.di_size = offset + copied;
235 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
236
237 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
238 gfs2_dinode_out(&ip->i_di, dibh->b_data);
239 brelse(dibh);
240
241 return copied;
242fail:
243 if (copied)
244 goto out;
245 return error;
246}
247
248static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
249 unsigned int offset, unsigned int size)
250{
251 struct buffer_head *dibh;
252 int error;
253
254 error = gfs2_meta_inode_buffer(ip, &dibh);
255 if (!error) {
256 offset += sizeof(struct gfs2_dinode);
257 memcpy(buf, dibh->b_data + offset, size);
258 brelse(dibh);
259 }
260
261 return (error) ? error : size;
262}
263
264
265/**
 266 * gfs2_dir_read_data - Read data from a directory inode
 267 * @ip: The GFS2 Inode
 268 * @buf: The buffer to place result into
 269 * @offset: File offset to begin reading from
270 * @size: Amount of data to transfer
271 *
272 * Returns: The amount of data actually copied or the error
273 */
274static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf,
275 uint64_t offset, unsigned int size)
276{
277 struct gfs2_sbd *sdp = ip->i_sbd;
278 uint64_t lblock, dblock;
279 uint32_t extlen = 0;
280 unsigned int o;
281 int copied = 0;
282 int error = 0;
283
284 if (offset >= ip->i_di.di_size)
285 return 0;
286
287 if ((offset + size) > ip->i_di.di_size)
288 size = ip->i_di.di_size - offset;
289
290 if (!size)
291 return 0;
292
293 if (gfs2_is_stuffed(ip))
294 return gfs2_dir_read_stuffed(ip, buf, (unsigned int)offset,
295 size);
296
297 if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
298 return -EINVAL;
299
300 lblock = offset;
301 o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
302
303 while (copied < size) {
304 unsigned int amount;
305 struct buffer_head *bh;
306 int new;
307
308 amount = size - copied;
309 if (amount > sdp->sd_sb.sb_bsize - o)
310 amount = sdp->sd_sb.sb_bsize - o;
311
312 if (!extlen) {
313 new = 0;
314 error = gfs2_block_map(ip, lblock, &new, &dblock,
315 &extlen);
316 if (error)
317 goto fail;
318 }
319
320 if (extlen > 1)
321 gfs2_meta_ra(ip->i_gl, dblock, extlen);
322
323 if (dblock) {
324 error = gfs2_dir_get_buffer(ip, dblock, new, &bh);
325 if (error)
326 goto fail;
327 dblock++;
328 extlen--;
329 } else
330 bh = NULL;
331
 332		if (bh) {	/* a directory hole would otherwise oops here */
 333			memcpy(buf, bh->b_data + o, amount);
 334			brelse(bh);
 335		} else
 336			memset(buf, 0, amount);
 337
 338		buf += amount;	/* advance the destination buffer too */
 339		copied += amount;
338 lblock++;
339
340 o = sizeof(struct gfs2_meta_header);
341 }
342
343 return copied;
344fail:
345 return (copied) ? copied : error;
346}
347
348typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
349 const struct qstr *name,
350 void *opaque);
351
352static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
353 const struct qstr *name, int ret)
354{
355 if (dent->de_inum.no_addr != 0 &&
356 be32_to_cpu(dent->de_hash) == name->hash &&
357 be16_to_cpu(dent->de_name_len) == name->len &&
358 memcmp((char *)(dent+1), name->name, name->len) == 0)
359 return ret;
360 return 0;
361}
362
363static int gfs2_dirent_find(const struct gfs2_dirent *dent,
364 const struct qstr *name,
365 void *opaque)
366{
367 return __gfs2_dirent_find(dent, name, 1);
368}
369
370static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
371 const struct qstr *name,
372 void *opaque)
373{
374 return __gfs2_dirent_find(dent, name, 2);
375}
376
377/*
378 * name->name holds ptr to start of block.
379 * name->len holds size of block.
380 */
381static int gfs2_dirent_last(const struct gfs2_dirent *dent,
382 const struct qstr *name,
383 void *opaque)
384{
385 const char *start = name->name;
386 const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
387 if (name->len == (end - start))
388 return 1;
389 return 0;
390}
391
392static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
393 const struct qstr *name,
394 void *opaque)
395{
396 unsigned required = GFS2_DIRENT_SIZE(name->len);
397 unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
398 unsigned totlen = be16_to_cpu(dent->de_rec_len);
399
400 if (!dent->de_inum.no_addr)
401 actual = GFS2_DIRENT_SIZE(0);
402 if ((totlen - actual) >= required)
403 return 1;
404 return 0;
405}
406
407struct dirent_gather {
408 const struct gfs2_dirent **pdent;
409 unsigned offset;
410};
411
412static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
413 const struct qstr *name,
414 void *opaque)
415{
416 struct dirent_gather *g = opaque;
417 if (dent->de_inum.no_addr) {
418 g->pdent[g->offset++] = dent;
419 }
420 return 0;
421}
422
423/*
424 * Other possible things to check:
425 * - Inode located within filesystem size (and on valid block)
426 * - Valid directory entry type
427 * Not sure how heavy-weight we want to make this... could also check
428 * hash is correct for example, but that would take a lot of extra time.
429 * For now the most important thing is to check that the various sizes
430 * are correct.
431 */
432static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
433 unsigned int size, unsigned int len, int first)
434{
435 const char *msg = "gfs2_dirent too small";
436 if (unlikely(size < sizeof(struct gfs2_dirent)))
437 goto error;
438 msg = "gfs2_dirent misaligned";
439 if (unlikely(offset & 0x7))
440 goto error;
441 msg = "gfs2_dirent points beyond end of block";
442 if (unlikely(offset + size > len))
443 goto error;
444 msg = "zero inode number";
445 if (unlikely(!first && !dent->de_inum.no_addr))
446 goto error;
447 msg = "name length is greater than space in dirent";
448 if (dent->de_inum.no_addr &&
449 unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
450 size))
451 goto error;
452 return 0;
453error:
454 printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
455 first ? "first in block" : "not first in block");
456 return -EIO;
457}
458
459static int gfs2_dirent_offset(const void *buf)
460{
461 const struct gfs2_meta_header *h = buf;
462 int offset;
463
464 BUG_ON(buf == NULL);
465
466 switch(be32_to_cpu(h->mh_type)) {
467 case GFS2_METATYPE_LF:
468 offset = sizeof(struct gfs2_leaf);
469 break;
470 case GFS2_METATYPE_DI:
471 offset = sizeof(struct gfs2_dinode);
472 break;
473 default:
474 goto wrong_type;
475 }
476 return offset;
477wrong_type:
 478	printk(KERN_WARNING "gfs2_dirent_offset: wrong block type %u\n",
479 be32_to_cpu(h->mh_type));
480 return -1;
481}
482
483static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode,
484 void *buf,
485 unsigned int len, gfs2_dscan_t scan,
486 const struct qstr *name,
487 void *opaque)
488{
489 struct gfs2_dirent *dent, *prev;
490 unsigned offset;
491 unsigned size;
492 int ret = 0;
493
494 ret = gfs2_dirent_offset(buf);
495 if (ret < 0)
496 goto consist_inode;
497
498 offset = ret;
499 prev = NULL;
500 dent = (struct gfs2_dirent *)(buf + offset);
501 size = be16_to_cpu(dent->de_rec_len);
502 if (gfs2_check_dirent(dent, offset, size, len, 1))
503 goto consist_inode;
504 do {
505 ret = scan(dent, name, opaque);
506 if (ret)
507 break;
508 offset += size;
509 if (offset == len)
510 break;
511 prev = dent;
512 dent = (struct gfs2_dirent *)(buf + offset);
513 size = be16_to_cpu(dent->de_rec_len);
514 if (gfs2_check_dirent(dent, offset, size, len, 0))
515 goto consist_inode;
516 } while(1);
517
518 switch(ret) {
519 case 0:
520 return NULL;
521 case 1:
522 return dent;
523 case 2:
524 return prev ? prev : dent;
525 default:
526 BUG_ON(ret > 0);
527 return ERR_PTR(ret);
528 }
529
530consist_inode:
531 gfs2_consist_inode(inode->u.generic_ip);
532 return ERR_PTR(-EIO);
533}
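
The scan callback decides what the walk returns: 0 keeps scanning, 1 returns
the current dirent, 2 returns the previous one (see gfs2_dirent_prev above).
A typical by-name lookup, as issued by gfs2_dirent_search() below:

	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
				gfs2_dirent_find, name, NULL);
	/* NULL: no match; ERR_PTR(): corrupt block; otherwise the dirent */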
534
535
536/**
537 * dirent_first - Return the first dirent
538 * @dip: the directory
539 * @bh: The buffer
540 * @dent: Pointer to list of dirents
541 *
542 * return first dirent whether bh points to leaf or stuffed dinode
543 *
544 * Returns: IS_LEAF, IS_DINODE, or -errno
545 */
546
547static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
548 struct gfs2_dirent **dent)
549{
550 struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
551
552 if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
553 if (gfs2_meta_check(dip->i_sbd, bh))
554 return -EIO;
555 *dent = (struct gfs2_dirent *)(bh->b_data +
556 sizeof(struct gfs2_leaf));
557 return IS_LEAF;
558 } else {
559 if (gfs2_metatype_check(dip->i_sbd, bh, GFS2_METATYPE_DI))
560 return -EIO;
561 *dent = (struct gfs2_dirent *)(bh->b_data +
562 sizeof(struct gfs2_dinode));
563 return IS_DINODE;
564 }
565}
566
567/**
568 * dirent_next - Next dirent
569 * @dip: the directory
570 * @bh: The buffer
571 * @dent: Pointer to list of dirents
572 *
573 * Returns: 0 on success, error code otherwise
574 */
575
576static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
577 struct gfs2_dirent **dent)
578{
579 struct gfs2_dirent *tmp, *cur;
580 char *bh_end;
581 uint16_t cur_rec_len;
582
583 cur = *dent;
584 bh_end = bh->b_data + bh->b_size;
585 cur_rec_len = be16_to_cpu(cur->de_rec_len);
586
587 if ((char *)cur + cur_rec_len >= bh_end) {
588 if ((char *)cur + cur_rec_len > bh_end) {
589 gfs2_consist_inode(dip);
590 return -EIO;
591 }
592 return -ENOENT;
593 }
594
595 tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len);
596
597 if ((char *)tmp + be16_to_cpu(tmp->de_rec_len) > bh_end) {
598 gfs2_consist_inode(dip);
599 return -EIO;
600 }
601
602 if (cur_rec_len == 0) {
603 gfs2_consist_inode(dip);
604 return -EIO;
605 }
606
607 /* Only the first dent could ever have de_inum.no_addr == 0 */
608 if (!tmp->de_inum.no_addr) {
609 gfs2_consist_inode(dip);
610 return -EIO;
611 }
612
613 *dent = tmp;
614
615 return 0;
616}
617
618/**
619 * dirent_del - Delete a dirent
620 * @dip: The GFS2 inode
621 * @bh: The buffer
622 * @prev: The previous dirent
623 * @cur: The current dirent
624 *
625 */
626
627static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
628 struct gfs2_dirent *prev, struct gfs2_dirent *cur)
629{
630 uint16_t cur_rec_len, prev_rec_len;
631
632 if (!cur->de_inum.no_addr) {
633 gfs2_consist_inode(dip);
634 return;
635 }
636
637 gfs2_trans_add_bh(dip->i_gl, bh, 1);
638
639 /* If there is no prev entry, this is the first entry in the block.
640 The de_rec_len is already as big as it needs to be. Just zero
641 out the inode number and return. */
642
643 if (!prev) {
644 cur->de_inum.no_addr = 0; /* No endianess worries */
645 return;
646 }
647
648 /* Combine this dentry with the previous one. */
649
650 prev_rec_len = be16_to_cpu(prev->de_rec_len);
651 cur_rec_len = be16_to_cpu(cur->de_rec_len);
652
653 if ((char *)prev + prev_rec_len != (char *)cur)
654 gfs2_consist_inode(dip);
655 if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
656 gfs2_consist_inode(dip);
657
658 prev_rec_len += cur_rec_len;
659 prev->de_rec_len = cpu_to_be16(prev_rec_len);
660}
661
662/*
663 * Takes a dent from which to grab space as an argument. Returns the
664 * newly created dent.
665 */
666struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
667 struct gfs2_dirent *dent,
668 const struct qstr *name,
669 struct buffer_head *bh)
670{
671 struct gfs2_inode *ip = inode->u.generic_ip;
672 struct gfs2_dirent *ndent;
673 unsigned offset = 0, totlen;
674
675 if (dent->de_inum.no_addr)
676 offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
677 totlen = be16_to_cpu(dent->de_rec_len);
678 BUG_ON(offset + name->len > totlen);
679 gfs2_trans_add_bh(ip->i_gl, bh, 1);
680 ndent = (struct gfs2_dirent *)((char *)dent + offset);
681 dent->de_rec_len = cpu_to_be16(offset);
682 gfs2_qstr2dirent(name, totlen - offset, ndent);
683 return ndent;
684}
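/*
 * Sketch (editorial addition, not part of this patch): gfs2_init_dirent()
 * carves the new entry out of the free tail of an existing record.  The
 * old record keeps only the bytes it actually uses (zero for an unused
 * slot) and the remainder becomes the new record:
 */
#include <stdint.h>

static void toy_carve(uint16_t *old_rec_len, uint16_t *new_rec_len,
		      uint16_t used)	/* bytes the old entry occupies */
{
	uint16_t total = *old_rec_len;

	*old_rec_len = used;		/* old record shrinks to its name */
	*new_rec_len = total - used;	/* new record owns the free tail */
}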
685
686static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
687 struct buffer_head *bh,
688 const struct qstr *name)
689{
690 struct gfs2_dirent *dent;
691 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
692 gfs2_dirent_find_space, name, NULL);
693 if (!dent || IS_ERR(dent))
694 return dent;
695 return gfs2_init_dirent(inode, dent, name, bh);
696}
697
698static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no,
699 struct buffer_head **bhp)
700{
701 int error;
702
703 error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp);
704 if (!error && gfs2_metatype_check(dip->i_sbd, *bhp, GFS2_METATYPE_LF))
705 error = -EIO;
706
707 return error;
708}
709
710/**
711 * get_leaf_nr - Get a leaf number associated with the index
712 * @dip: The GFS2 inode
713 * @index: hash table index of the targeted leaf pointer
714 * @leaf_out: (out) the leaf block number
715 *
716 * Returns: 0 on success, error code otherwise
717 */
718
719static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index,
720 uint64_t *leaf_out)
721{
722 uint64_t leaf_no;
723 int error;
724
725 error = gfs2_dir_read_data(dip, (char *)&leaf_no,
726 index * sizeof(uint64_t),
727 sizeof(uint64_t));
728 if (error != sizeof(uint64_t))
729 return (error < 0) ? error : -EIO;
730
731 *leaf_out = be64_to_cpu(leaf_no);
732
733 return 0;
734}
735
736static int get_first_leaf(struct gfs2_inode *dip, uint32_t index,
737 struct buffer_head **bh_out)
738{
739 uint64_t leaf_no;
740 int error;
741
742 error = get_leaf_nr(dip, index, &leaf_no);
743 if (!error)
744 error = get_leaf(dip, leaf_no, bh_out);
745
746 return error;
747}
748
749static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
750 const struct qstr *name,
751 gfs2_dscan_t scan,
752 struct buffer_head **pbh)
753{
754 struct buffer_head *bh;
755 struct gfs2_dirent *dent;
756 struct gfs2_inode *ip = inode->u.generic_ip;
757 int error;
758
759 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
760 struct gfs2_leaf *leaf;
761 unsigned hsize = 1 << ip->i_di.di_depth;
762 unsigned index;
763 u64 ln;
764 if (hsize * sizeof(u64) != ip->i_di.di_size) {
765 gfs2_consist_inode(ip);
766 return ERR_PTR(-EIO);
767 }
768
769 index = name->hash >> (32 - ip->i_di.di_depth);
770 error = get_first_leaf(ip, index, &bh);
771 if (error)
772 return ERR_PTR(error);
773 do {
774 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
775 scan, name, NULL);
776 if (dent)
777 goto got_dent;
778 leaf = (struct gfs2_leaf *)bh->b_data;
779 ln = be64_to_cpu(leaf->lf_next);
780 brelse(bh);
781 if (!ln)
782 break;
783 error = get_leaf(ip, ln, &bh);
784 } while(!error);
785
786 return error ? ERR_PTR(error) : NULL;
787 }
788
789 error = gfs2_meta_inode_buffer(ip, &bh);
790 if (error)
791 return ERR_PTR(error);
792 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
793got_dent:
794 if (unlikely(dent == NULL || IS_ERR(dent))) {
795 brelse(bh);
796 bh = NULL;
797 }
798 *pbh = bh;
799 return dent;
800}
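/*
 * Sketch (editorial addition, not part of this patch): in an ExHash
 * directory the top di_depth bits of the 32-bit name hash index the
 * on-disk table of leaf block numbers, so leaf selection is one shift:
 */
#include <stdint.h>

static uint32_t toy_leaf_index(uint32_t name_hash, unsigned int depth)
{
	return name_hash >> (32 - depth);	/* top "depth" bits */
}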
801
802static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
803{
804 struct gfs2_inode *ip = inode->u.generic_ip;
805 u64 bn = gfs2_alloc_meta(ip);
806 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
807 struct gfs2_leaf *leaf;
808 struct gfs2_dirent *dent;
809 struct qstr name = { .name = "", .len = 0, .hash = 0 };
810 if (!bh)
811 return NULL;
812 gfs2_trans_add_bh(ip->i_gl, bh, 1);
813 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
814 leaf = (struct gfs2_leaf *)bh->b_data;
815 leaf->lf_depth = cpu_to_be16(depth);
816 leaf->lf_entries = cpu_to_be16(0);
817 leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE);
818 leaf->lf_next = cpu_to_be64(0);
819 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
820 dent = (struct gfs2_dirent *)(leaf+1);
821 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
822 *pbh = bh;
823 return leaf;
824}
825
826/**
827 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
828 * @inode: The directory inode to convert
829 *
830 * Returns: 0 on success, error code otherwise
831 */
832
833static int dir_make_exhash(struct inode *inode)
834{
835 struct gfs2_inode *dip = inode->u.generic_ip;
836 struct gfs2_sbd *sdp = dip->i_sbd;
837 struct gfs2_dirent *dent;
838 struct qstr args;
839 struct buffer_head *bh, *dibh;
840 struct gfs2_leaf *leaf;
841 int y;
842 uint32_t x;
843 uint64_t *lp, bn;
844 int error;
845
846 error = gfs2_meta_inode_buffer(dip, &dibh);
847 if (error)
848 return error;
849
850 /* Turn over a new leaf */
851
852 leaf = new_leaf(inode, &bh, 0);
853 if (!leaf)
854 return -ENOSPC;
855 bn = bh->b_blocknr;
856
857 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
858 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
859
860 /* Copy dirents */
861
862 gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
863 sizeof(struct gfs2_dinode));
864
865 /* Find last entry */
866
867 x = 0;
868 args.len = bh->b_size - sizeof(struct gfs2_dinode) +
869 sizeof(struct gfs2_leaf);
870 args.name = bh->b_data;
871 dent = gfs2_dirent_scan(dip->i_vnode, bh->b_data, bh->b_size,
872 gfs2_dirent_last, &args, NULL);
873 if (!dent) {
874 brelse(bh);
875 brelse(dibh);
876 return -EIO;
877 }
878 if (IS_ERR(dent)) {
879 brelse(bh);
880 brelse(dibh);
881 return PTR_ERR(dent);
882 }
883
884 /* Adjust the last dirent's record length
885 (Remember that dent still points to the last entry.) */
886
887 dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
888 sizeof(struct gfs2_dinode) -
889 sizeof(struct gfs2_leaf));
890
891 brelse(bh);
892
893	/* We're done with the new leaf block, now set up the new
894 hash table. */
895
896 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
897 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
898
899 lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode));
900
901 for (x = sdp->sd_hash_ptrs; x--; lp++)
902 *lp = cpu_to_be64(bn);
903
904 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
905 dip->i_di.di_blocks++;
906 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
907 dip->i_di.di_payload_format = 0;
908
909 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
910 dip->i_di.di_depth = y;
911
912 gfs2_dinode_out(&dip->i_di, dibh->b_data);
913
914 brelse(dibh);
915
916 return 0;
917}
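/*
 * Sketch (editorial addition, not part of this patch): the for-loop above
 * is an open-coded ilog2() -- sd_hash_ptrs is a power of two and the
 * directory's initial hash depth is its base-2 logarithm:
 */
#include <stdint.h>

static unsigned int toy_ilog2(uint32_t x)	/* x: a power of two, > 0 */
{
	unsigned int y = 0;

	while (x >>= 1)
		y++;
	return y;		/* e.g. toy_ilog2(256) == 8 */
}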
918
919/**
920 * dir_split_leaf - Split a leaf block into two
921 * @inode: The directory inode
922 * @name: the name being added; its hash selects the leaf to split
923 *
924 *
925 * Returns: 0 on success, error code on failure
926 */
927
928static int dir_split_leaf(struct inode *inode, const struct qstr *name)
929{
930 struct gfs2_inode *dip = inode->u.generic_ip;
931 struct buffer_head *nbh, *obh, *dibh;
932 struct gfs2_leaf *nleaf, *oleaf;
933 struct gfs2_dirent *dent, *prev = NULL, *next = NULL, *new;
934 uint32_t start, len, half_len, divider;
935 uint64_t bn, *lp, leaf_no;
936 uint32_t index;
937 int x, moved = 0;
938 int error;
939
940 index = name->hash >> (32 - dip->i_di.di_depth);
941 error = get_leaf_nr(dip, index, &leaf_no);
942 if (error)
943 return error;
944
945 /* Get the old leaf block */
946 error = get_leaf(dip, leaf_no, &obh);
947 if (error)
948 return error;
949
950 oleaf = (struct gfs2_leaf *)obh->b_data;
951 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
952 brelse(obh);
953 return 1; /* can't split */
954 }
955
956 gfs2_trans_add_bh(dip->i_gl, obh, 1);
957
958 nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
959 if (!nleaf) {
960 brelse(obh);
961 return -ENOSPC;
962 }
963 bn = nbh->b_blocknr;
964
965 /* Compute the start and len of leaf pointers in the hash table. */
966 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
967 half_len = len >> 1;
968 if (!half_len) {
969 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
970 gfs2_consist_inode(dip);
971 error = -EIO;
972 goto fail_brelse;
973 }
974
975 start = (index & ~(len - 1));
976
977 /* Change the pointers.
978 Don't bother distinguishing stuffed from non-stuffed.
979 This code is complicated enough already. */
980 lp = kmalloc(half_len * sizeof(uint64_t), GFP_NOFS | __GFP_NOFAIL);
981	/* Point the lower half of the hash range at the new leaf */
982 for (x = 0; x < half_len; x++)
983 lp[x] = cpu_to_be64(bn);
984
985 error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(uint64_t),
986 half_len * sizeof(uint64_t));
987 if (error != half_len * sizeof(uint64_t)) {
988 if (error >= 0)
989 error = -EIO;
990 goto fail_lpfree;
991 }
992
993 kfree(lp);
994
995 /* Compute the divider */
996 divider = (start + half_len) << (32 - dip->i_di.di_depth);
997
998 /* Copy the entries */
999 dirent_first(dip, obh, &dent);
1000
1001 do {
1002 next = dent;
1003 if (dirent_next(dip, obh, &next))
1004 next = NULL;
1005
1006 if (dent->de_inum.no_addr &&
1007 be32_to_cpu(dent->de_hash) < divider) {
1008 struct qstr str;
1009 str.name = (char*)(dent+1);
1010 str.len = be16_to_cpu(dent->de_name_len);
1011 str.hash = be32_to_cpu(dent->de_hash);
1012 new = gfs2_dirent_alloc(inode, nbh, &str);
1013 if (IS_ERR(new)) {
1014 error = PTR_ERR(new);
1015 break;
1016 }
1017
1018 new->de_inum = dent->de_inum; /* No endian worries */
1019 new->de_type = dent->de_type; /* No endian worries */
1020 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
1021
1022 dirent_del(dip, obh, prev, dent);
1023
1024 if (!oleaf->lf_entries)
1025 gfs2_consist_inode(dip);
1026 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
1027
1028 if (!prev)
1029 prev = dent;
1030
1031 moved = 1;
1032 } else {
1033 prev = dent;
1034 }
1035 dent = next;
1036 } while (dent);
1037
1038 oleaf->lf_depth = nleaf->lf_depth;
1039
1040 error = gfs2_meta_inode_buffer(dip, &dibh);
1041 if (!gfs2_assert_withdraw(dip->i_sbd, !error)) {
1042 dip->i_di.di_blocks++;
1043 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1044 brelse(dibh);
1045 }
1046
1047 brelse(obh);
1048 brelse(nbh);
1049
1050 return error;
1051
1052fail_lpfree:
1053 kfree(lp);
1054
1055fail_brelse:
1056 brelse(obh);
1057 brelse(nbh);
1058 return error;
1059}
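/*
 * Sketch (editorial addition, not part of this patch): a leaf at depth d
 * is aliased by len = 1 << (di_depth - d) consecutive hash-table slots.
 * The split above repoints the lower half of those slots at the new leaf,
 * so an entry migrates exactly when its hash lies below the boundary
 * between the two halves:
 */
#include <stdint.h>

static int toy_entry_moves(uint32_t hash, uint32_t start, uint32_t half_len,
			   unsigned int di_depth)
{
	/* smallest hash that still maps into the upper (old) half */
	uint32_t divider = (start + half_len) << (32 - di_depth);

	return hash < divider;		/* below the divider -> new leaf */
}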
1060
1061/**
1062 * dir_double_exhash - Double size of ExHash table
1063 * @dip: The GFS2 dinode
1064 *
1065 * Returns: 0 on success, error code on failure
1066 */
1067
1068static int dir_double_exhash(struct gfs2_inode *dip)
1069{
1070 struct gfs2_sbd *sdp = dip->i_sbd;
1071 struct buffer_head *dibh;
1072 uint32_t hsize;
1073 uint64_t *buf;
1074 uint64_t *from, *to;
1075 uint64_t block;
1076 int x;
1077 int error = 0;
1078
1079 hsize = 1 << dip->i_di.di_depth;
1080 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1081 gfs2_consist_inode(dip);
1082 return -EIO;
1083 }
1084
1085 /* Allocate both the "from" and "to" buffers in one big chunk */
1086
1087 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
1088
1089 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1090 error = gfs2_dir_read_data(dip, (char *)buf,
1091 block * sdp->sd_hash_bsize,
1092 sdp->sd_hash_bsize);
1093 if (error != sdp->sd_hash_bsize) {
1094 if (error >= 0)
1095 error = -EIO;
1096 goto fail;
1097 }
1098
1099 from = buf;
1100 to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize);
1101
1102 for (x = sdp->sd_hash_ptrs; x--; from++) {
1103			*to++ = *from;	/* No endianness worries */
1104 *to++ = *from;
1105 }
1106
1107 error = gfs2_dir_write_data(dip,
1108 (char *)buf + sdp->sd_hash_bsize,
1109 block * sdp->sd_sb.sb_bsize,
1110 sdp->sd_sb.sb_bsize);
1111 if (error != sdp->sd_sb.sb_bsize) {
1112 if (error >= 0)
1113 error = -EIO;
1114 goto fail;
1115 }
1116 }
1117
1118 kfree(buf);
1119
1120 error = gfs2_meta_inode_buffer(dip, &dibh);
1121 if (!gfs2_assert_withdraw(sdp, !error)) {
1122 dip->i_di.di_depth++;
1123 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1124 brelse(dibh);
1125 }
1126
1127 return error;
1128
1129 fail:
1130 kfree(buf);
1131
1132 return error;
1133}
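/*
 * Sketch (editorial addition, not part of this patch): doubling the table
 * just writes each leaf pointer out twice, so both children of every old
 * slot keep aliasing the same leaf until a later split separates them:
 */
#include <stdint.h>

static void toy_double_table(const uint64_t *from, uint64_t *to,
			     unsigned int nptrs)
{
	unsigned int i;

	for (i = 0; i < nptrs; i++) {
		to[2 * i] = from[i];		/* even child */
		to[2 * i + 1] = from[i];	/* odd child */
	}
}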
1134
1135/**
1136 * compare_dents - compare directory entries by hash value
1137 * @a: first dent
1138 * @b: second dent
1139 *
1140 * When comparing the hash entries of @a to @b:
1141 * gt: returns 1
1142 * lt: returns -1
1143 * eq: returns 0
1144 */
1145
1146static int compare_dents(const void *a, const void *b)
1147{
1148 struct gfs2_dirent *dent_a, *dent_b;
1149 uint32_t hash_a, hash_b;
1150 int ret = 0;
1151
1152 dent_a = *(struct gfs2_dirent **)a;
1153 hash_a = be32_to_cpu(dent_a->de_hash);
1154
1155 dent_b = *(struct gfs2_dirent **)b;
1156 hash_b = be32_to_cpu(dent_b->de_hash);
1157
1158 if (hash_a > hash_b)
1159 ret = 1;
1160 else if (hash_a < hash_b)
1161 ret = -1;
1162 else {
1163 unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1164 unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1165
1166 if (len_a > len_b)
1167 ret = 1;
1168 else if (len_a < len_b)
1169 ret = -1;
1170 else
1171 ret = memcmp((char *)(dent_a + 1),
1172 (char *)(dent_b + 1),
1173 len_a);
1174 }
1175
1176 return ret;
1177}
1178
1179/**
1180 * do_filldir_main - read out directory entries
1181 * @dip: The GFS2 inode
1182 * @offset: The offset in the file to read from
1183 * @opaque: opaque data to pass to filldir
1184 * @filldir: The function to pass entries to
1185 * @darr: an array of struct gfs2_dirent pointers to read
1186 * @entries: the number of entries in darr
1187 * @copied: pointer to int that's non-zero if an entry has been copied out
1188 *
1189 * Jump through some hoops to make sure that if there are hash collisions,
1190 * they are read out at the beginning of a buffer. We want to minimize
1191 * the possibility that they will fall into different readdir buffers or
1192 * that someone will want to seek to that location.
1193 *
1194 * Returns: errno, >0 on exception from filldir
1195 */
1196
1197static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset,
1198 void *opaque, gfs2_filldir_t filldir,
1199 const struct gfs2_dirent **darr, uint32_t entries,
1200 int *copied)
1201{
1202 const struct gfs2_dirent *dent, *dent_next;
1203 struct gfs2_inum inum;
1204 uint64_t off, off_next;
1205 unsigned int x, y;
1206 int run = 0;
1207 int error = 0;
1208
1209 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1210
1211 dent_next = darr[0];
1212 off_next = be32_to_cpu(dent_next->de_hash);
1213 off_next = gfs2_disk_hash2offset(off_next);
1214
1215 for (x = 0, y = 1; x < entries; x++, y++) {
1216 dent = dent_next;
1217 off = off_next;
1218
1219 if (y < entries) {
1220 dent_next = darr[y];
1221 off_next = be32_to_cpu(dent_next->de_hash);
1222 off_next = gfs2_disk_hash2offset(off_next);
1223
1224 if (off < *offset)
1225 continue;
1226 *offset = off;
1227
1228 if (off_next == off) {
1229 if (*copied && !run)
1230 return 1;
1231 run = 1;
1232 } else
1233 run = 0;
1234 } else {
1235 if (off < *offset)
1236 continue;
1237 *offset = off;
1238 }
1239
1240 gfs2_inum_in(&inum, (char *)&dent->de_inum);
1241
1242 error = filldir(opaque, (char *)(dent + 1),
1243 be16_to_cpu(dent->de_name_len),
1244 off, &inum,
1245 be16_to_cpu(dent->de_type));
1246 if (error)
1247 return 1;
1248
1249 *copied = 1;
1250 }
1251
1252 /* Increment the *offset by one, so the next time we come into the
1253	   do_filldir function, we get the next entry instead of the last one in the
1254 current leaf */
1255
1256 (*offset)++;
1257
1258 return 0;
1259}
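/*
 * Sketch (editorial addition, not part of this patch): the readdir offset
 * is derived from the entry hash, so a run of equal-hash entries shares a
 * single offset and must land in one buffer -- resuming at that offset
 * would otherwise skip or repeat entries.  The run logic above amounts to:
 */
#include <stdint.h>

static int toy_flush_before_run(uint64_t off, uint64_t off_next,
				int copied, int *run)
{
	if (off_next == off) {		/* a collision run starts/continues */
		if (copied && !*run)
			return 1;	/* flush now; redo run in a fresh buffer */
		*run = 1;
	} else {
		*run = 0;
	}
	return 0;
}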
1260
1261static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1262 gfs2_filldir_t filldir, int *copied,
1263 unsigned *depth, u64 leaf_no)
1264{
1265 struct gfs2_inode *ip = inode->u.generic_ip;
1266 struct buffer_head *bh;
1267 struct gfs2_leaf *lf;
1268 unsigned entries = 0;
1269 unsigned leaves = 0;
1270 const struct gfs2_dirent **darr, *dent;
1271 struct dirent_gather g;
1272 struct buffer_head **larr;
1273 int leaf = 0;
1274 int error, i;
1275 u64 lfn = leaf_no;
1276
1277 do {
1278 error = get_leaf(ip, lfn, &bh);
1279 if (error)
1280 goto out;
1281 lf = (struct gfs2_leaf *)bh->b_data;
1282 if (leaves == 0)
1283 *depth = be16_to_cpu(lf->lf_depth);
1284 entries += be16_to_cpu(lf->lf_entries);
1285 leaves++;
1286 lfn = be64_to_cpu(lf->lf_next);
1287 brelse(bh);
1288 } while(lfn);
1289
1290 if (!entries)
1291 return 0;
1292
1293 error = -ENOMEM;
1294 larr = vmalloc((leaves + entries) * sizeof(void*));
1295 if (!larr)
1296 goto out;
1297 darr = (const struct gfs2_dirent **)(larr + leaves);
1298 g.pdent = darr;
1299 g.offset = 0;
1300 lfn = leaf_no;
1301
1302 do {
1303 error = get_leaf(ip, lfn, &bh);
1304 if (error)
1305 goto out_kfree;
1306 lf = (struct gfs2_leaf *)bh->b_data;
1307 lfn = be64_to_cpu(lf->lf_next);
1308 if (lf->lf_entries) {
1309 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1310 gfs2_dirent_gather, NULL, &g);
1311 error = PTR_ERR(dent);
1312 if (IS_ERR(dent)) {
1313 goto out_kfree;
1314 }
1315 error = 0;
1316 larr[leaf++] = bh;
1317 } else {
1318 brelse(bh);
1319 }
1320 } while(lfn);
1321
1322 error = do_filldir_main(ip, offset, opaque, filldir, darr,
1323 entries, copied);
1324out_kfree:
1325 for(i = 0; i < leaf; i++)
1326 brelse(larr[i]);
1327 vfree(larr);
1328out:
1329 return error;
1330}
1331
1332/**
1333 * dir_e_read - Reads the entries from a directory into a filldir buffer
1334 * @inode: the directory inode
1335 * @offset: the hash of the last entry read shifted to the right once
1336 * @opaque: buffer for the filldir function to fill
1337 * @filldir: points to the filldir function to use
1338 *
1339 * Returns: errno
1340 */
1341
1342static int dir_e_read(struct inode *inode, uint64_t *offset, void *opaque,
1343 gfs2_filldir_t filldir)
1344{
1345 struct gfs2_inode *dip = inode->u.generic_ip;
1346 struct gfs2_sbd *sdp = dip->i_sbd;
1347 uint32_t hsize, len = 0;
1348 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1349 uint32_t hash, index;
1350 uint64_t *lp;
1351 int copied = 0;
1352 int error = 0;
1353 unsigned depth;
1354
1355 hsize = 1 << dip->i_di.di_depth;
1356 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1357 gfs2_consist_inode(dip);
1358 return -EIO;
1359 }
1360
1361 hash = gfs2_dir_offset2hash(*offset);
1362 index = hash >> (32 - dip->i_di.di_depth);
1363
1364 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1365 if (!lp)
1366 return -ENOMEM;
1367
1368 while (index < hsize) {
1369 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1370 ht_offset = index - lp_offset;
1371
1372 if (ht_offset_cur != ht_offset) {
1373 error = gfs2_dir_read_data(dip, (char *)lp,
1374 ht_offset * sizeof(uint64_t),
1375 sdp->sd_hash_bsize);
1376 if (error != sdp->sd_hash_bsize) {
1377 if (error >= 0)
1378 error = -EIO;
1379 goto out;
1380 }
1381 ht_offset_cur = ht_offset;
1382 }
1383
1384 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1385 &copied, &depth,
1386 be64_to_cpu(lp[lp_offset]));
1387 if (error)
1388 break;
1389
1390 len = 1 << (dip->i_di.di_depth - depth);
1391 index = (index & ~(len - 1)) + len;
1392 }
1393
1394out:
1395 kfree(lp);
1396 if (error > 0)
1397 error = 0;
1398 return error;
1399}
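/*
 * Sketch (editorial addition, not part of this patch): once a leaf chain
 * at depth "leaf_depth" has been read, every one of the
 * 1 << (di_depth - leaf_depth) table slots aliasing it is done with, so
 * the scan jumps straight past them:
 */
#include <stdint.h>

static uint32_t toy_next_slot(uint32_t index, unsigned int di_depth,
			      unsigned int leaf_depth)
{
	uint32_t len = 1U << (di_depth - leaf_depth);

	return (index & ~(len - 1)) + len;	/* first slot past this leaf */
}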
1400
1401int gfs2_dir_read(struct inode *inode, uint64_t *offset, void *opaque,
1402 gfs2_filldir_t filldir)
1403{
1404 struct gfs2_inode *dip = inode->u.generic_ip;
1405 struct dirent_gather g;
1406 const struct gfs2_dirent **darr, *dent;
1407 struct buffer_head *dibh;
1408 int copied = 0;
1409 int error;
1410
1411 if (!dip->i_di.di_entries)
1412 return 0;
1413
1414 if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
1415 return dir_e_read(inode, offset, opaque, filldir);
1416
1417 if (!gfs2_is_stuffed(dip)) {
1418 gfs2_consist_inode(dip);
1419 return -EIO;
1420 }
1421
1422 error = gfs2_meta_inode_buffer(dip, &dibh);
1423 if (error)
1424 return error;
1425
1426 error = -ENOMEM;
1427 darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
1428 GFP_KERNEL);
1429 if (darr) {
1430 g.pdent = darr;
1431 g.offset = 0;
1432 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1433 gfs2_dirent_gather, NULL, &g);
1434 if (IS_ERR(dent)) {
1435 error = PTR_ERR(dent);
1436 goto out;
1437 }
1438 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1439 dip->i_di.di_entries, &copied);
1440out:
1441 kfree(darr);
1442 }
1443
1444 if (error > 0)
1445 error = 0;
1446
1447 brelse(dibh);
1448
1449 return error;
1450}
1451
1452/**
1453 * gfs2_dir_search - Search a directory
1454 * @dir: The directory inode
1455 * @name: the name to look up
1456 * @inum: (out) inode number; @type: (out) entry type (either may be NULL)
1457 *
1458 * This routine searches a directory for a file or another directory.
1459 * Assumes a glock is held on @dir.
1460 *
1461 * Returns: errno
1462 */
1463
1464int gfs2_dir_search(struct inode *dir, const struct qstr *name,
1465 struct gfs2_inum *inum, unsigned int *type)
1466{
1467 struct buffer_head *bh;
1468 struct gfs2_dirent *dent;
1469
1470 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1471 if (dent) {
1472 if (IS_ERR(dent))
1473 return PTR_ERR(dent);
1474 if (inum)
1475 gfs2_inum_in(inum, (char *)&dent->de_inum);
1476 if (type)
1477 *type = be16_to_cpu(dent->de_type);
1478 brelse(bh);
1479 return 0;
1480 }
1481 return -ENOENT;
1482}
1483
1484static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1485{
1486 struct buffer_head *bh, *obh;
1487 struct gfs2_inode *ip = inode->u.generic_ip;
1488 struct gfs2_leaf *leaf, *oleaf;
1489 int error;
1490 u32 index;
1491 u64 bn;
1492
1493 index = name->hash >> (32 - ip->i_di.di_depth);
1494 error = get_first_leaf(ip, index, &obh);
1495 if (error)
1496 return error;
1497 do {
1498 oleaf = (struct gfs2_leaf *)obh->b_data;
1499 bn = be64_to_cpu(oleaf->lf_next);
1500 if (!bn)
1501 break;
1502 brelse(obh);
1503 error = get_leaf(ip, bn, &obh);
1504 if (error)
1505 return error;
1506 } while(1);
1507
1508 gfs2_trans_add_bh(ip->i_gl, obh, 1);
1509
1510 leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1511 if (!leaf) {
1512 brelse(obh);
1513 return -ENOSPC;
1514 }
1515 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1516 brelse(bh);
1517 brelse(obh);
1518
1519 error = gfs2_meta_inode_buffer(ip, &bh);
1520 if (error)
1521 return error;
1522 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1523 ip->i_di.di_blocks++;
1524 gfs2_dinode_out(&ip->i_di, bh->b_data);
1525 brelse(bh);
1526 return 0;
1527}
1528
1529/**
1530 * gfs2_dir_add - Add new filename into directory
1531 * @inode: The directory inode
1532 * @name: The new name
1533 * @inum: The inode number of the entry
1534 * @type: The type of the entry
1535 *
1536 * Returns: 0 on success, error code on failure
1537 */
1538
1539int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1540 const struct gfs2_inum *inum, unsigned type)
1541{
1542 struct gfs2_inode *ip = inode->u.generic_ip;
1543 struct buffer_head *bh;
1544 struct gfs2_dirent *dent;
1545 struct gfs2_leaf *leaf;
1546 int error;
1547
1548 while(1) {
1549 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1550 &bh);
1551 if (dent) {
1552 if (IS_ERR(dent))
1553 return PTR_ERR(dent);
1554 dent = gfs2_init_dirent(inode, dent, name, bh);
1555 gfs2_inum_out(inum, (char *)&dent->de_inum);
1556 dent->de_type = cpu_to_be16(type);
1557 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1558 leaf = (struct gfs2_leaf *)bh->b_data;
1559 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
1560 }
1561 brelse(bh);
1562 error = gfs2_meta_inode_buffer(ip, &bh);
1563 if (error)
1564 break;
1565 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1566 ip->i_di.di_entries++;
1567 ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
1568 gfs2_dinode_out(&ip->i_di, bh->b_data);
1569 brelse(bh);
1570 error = 0;
1571 break;
1572 }
1573 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
1574 error = dir_make_exhash(inode);
1575 if (error)
1576 break;
1577 continue;
1578 }
1579 error = dir_split_leaf(inode, name);
1580 if (error == 0)
1581 continue;
1582 if (error < 0)
1583 break;
1584 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
1585 error = dir_double_exhash(ip);
1586 if (error)
1587 break;
1588 error = dir_split_leaf(inode, name);
1589 if (error < 0)
1590 break;
1591 if (error == 0)
1592 continue;
1593 }
1594 error = dir_new_leaf(inode, name);
1595 if (!error)
1596 continue;
1597 error = -ENOSPC;
1598 break;
1599 }
1600 return error;
1601}
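/*
 * Note (editorial addition, not part of this patch): the loop above
 * escalates through the directory formats until the insert fits,
 * schematically:
 *
 *	for (;;) {
 *		free slot found?             write the entry, done
 *		still stuffed?               dir_make_exhash(), retry
 *		dir_split_leaf() == 0?       retry
 *		depth < GFS2_DIR_MAX_DEPTH?  dir_double_exhash() +
 *		                             dir_split_leaf(), retry
 *		dir_new_leaf() == 0?         retry
 *		otherwise                    fail with -ENOSPC
 *	}
 */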
1602
1603
1604/**
1605 * gfs2_dir_del - Delete a directory entry
1606 * @dip: The GFS2 inode
1607 * @name: The filename to remove
1608 *
1609 * Returns: 0 on success, error code on failure
1610 */
1611
1612int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1613{
1614 struct gfs2_dirent *dent, *prev = NULL;
1615 struct buffer_head *bh;
1616 int error;
1617
1618	/* Returns _either_ the entry (if it's first in the block) or the
1619 previous entry otherwise */
1620 dent = gfs2_dirent_search(dip->i_vnode, name, gfs2_dirent_prev, &bh);
1621 if (!dent) {
1622 gfs2_consist_inode(dip);
1623 return -EIO;
1624 }
1625 if (IS_ERR(dent)) {
1626 gfs2_consist_inode(dip);
1627 return PTR_ERR(dent);
1628 }
1629 /* If not first in block, adjust pointers accordingly */
1630 if (gfs2_dirent_find(dent, name, NULL) == 0) {
1631 prev = dent;
1632 dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1633 }
1634
1635 dirent_del(dip, bh, prev, dent);
1636 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1637 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1638 u16 entries = be16_to_cpu(leaf->lf_entries);
1639 if (!entries)
1640 gfs2_consist_inode(dip);
1641 leaf->lf_entries = cpu_to_be16(--entries);
1642 }
1643 brelse(bh);
1644
1645 error = gfs2_meta_inode_buffer(dip, &bh);
1646 if (error)
1647 return error;
1648
1649 if (!dip->i_di.di_entries)
1650 gfs2_consist_inode(dip);
1651 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1652 dip->i_di.di_entries--;
1653 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1654 gfs2_dinode_out(&dip->i_di, bh->b_data);
1655 brelse(bh);
1656
1657 return error;
1658}
1659
1660/**
1661 * gfs2_dir_mvino - Change inode number of directory entry
1662 * @dip: The GFS2 inode
1663 * @filename: the name of the entry to update
1664 * @inum: the new inode number; the entry's type is set to @new_type
1665 *
1666 * This routine changes the inode number of a directory entry. It's used
1667 * by rename to change ".." when a directory is moved.
1668 * Assumes a glock is held on dip.
1669 *
1670 * Returns: errno
1671 */
1672
1673int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1674 struct gfs2_inum *inum, unsigned int new_type)
1675{
1676 struct buffer_head *bh;
1677 struct gfs2_dirent *dent;
1678 int error;
1679
1680 dent = gfs2_dirent_search(dip->i_vnode, filename, gfs2_dirent_find, &bh);
1681 if (!dent) {
1682 gfs2_consist_inode(dip);
1683 return -EIO;
1684 }
1685 if (IS_ERR(dent))
1686 return PTR_ERR(dent);
1687
1688 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1689 gfs2_inum_out(inum, (char *)&dent->de_inum);
1690 dent->de_type = cpu_to_be16(new_type);
1691
1692 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
1693 brelse(bh);
1694 error = gfs2_meta_inode_buffer(dip, &bh);
1695 if (error)
1696 return error;
1697 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1698 }
1699
1700 dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
1701 gfs2_dinode_out(&dip->i_di, bh->b_data);
1702 brelse(bh);
1703 return 0;
1704}
1705
1706/**
1707 * foreach_leaf - call a function for each leaf in a directory
1708 * @dip: the directory
1709 * @lc: the function to call for each leaf
1710 * @data: private data to pass to it
1711 *
1712 * Returns: errno
1713 */
1714
1715static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1716{
1717 struct gfs2_sbd *sdp = dip->i_sbd;
1718 struct buffer_head *bh;
1719 struct gfs2_leaf *leaf;
1720 uint32_t hsize, len;
1721 uint32_t ht_offset, lp_offset, ht_offset_cur = -1;
1722 uint32_t index = 0;
1723 uint64_t *lp;
1724 uint64_t leaf_no;
1725 int error = 0;
1726
1727 hsize = 1 << dip->i_di.di_depth;
1728 if (hsize * sizeof(uint64_t) != dip->i_di.di_size) {
1729 gfs2_consist_inode(dip);
1730 return -EIO;
1731 }
1732
1733 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
1734 if (!lp)
1735 return -ENOMEM;
1736
1737 while (index < hsize) {
1738 lp_offset = index & (sdp->sd_hash_ptrs - 1);
1739 ht_offset = index - lp_offset;
1740
1741 if (ht_offset_cur != ht_offset) {
1742 error = gfs2_dir_read_data(dip, (char *)lp,
1743 ht_offset * sizeof(uint64_t),
1744 sdp->sd_hash_bsize);
1745 if (error != sdp->sd_hash_bsize) {
1746 if (error >= 0)
1747 error = -EIO;
1748 goto out;
1749 }
1750 ht_offset_cur = ht_offset;
1751 }
1752
1753 leaf_no = be64_to_cpu(lp[lp_offset]);
1754 if (leaf_no) {
1755 error = get_leaf(dip, leaf_no, &bh);
1756 if (error)
1757 goto out;
1758 leaf = (struct gfs2_leaf *)bh->b_data;
1759 brelse(bh);
1760
1761 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
1762
1763 error = lc(dip, index, len, leaf_no, data);
1764 if (error)
1765 goto out;
1766
1767 index = (index & ~(len - 1)) + len;
1768 } else
1769 index++;
1770 }
1771
1772 if (index != hsize) {
1773 gfs2_consist_inode(dip);
1774 error = -EIO;
1775 }
1776
1777 out:
1778 kfree(lp);
1779
1780 return error;
1781}
1782
1783/**
1784 * leaf_dealloc - Deallocate a directory leaf
1785 * @dip: the directory
1786 * @index: the hash table offset in the directory
1787 * @len: the number of pointers to this leaf
1788 * @leaf_no: the leaf number
1789 * @data: not used
1790 *
1791 * Returns: errno
1792 */
1793
1794static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len,
1795 uint64_t leaf_no, void *data)
1796{
1797 struct gfs2_sbd *sdp = dip->i_sbd;
1798 struct gfs2_leaf *tmp_leaf;
1799 struct gfs2_rgrp_list rlist;
1800 struct buffer_head *bh, *dibh;
1801 uint64_t blk, nblk;
1802 unsigned int rg_blocks = 0, l_blocks = 0;
1803 char *ht;
1804 unsigned int x, size = len * sizeof(uint64_t);
1805 int error;
1806
1807 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1808
1809 ht = kzalloc(size, GFP_KERNEL);
1810 if (!ht)
1811 return -ENOMEM;
1812
1813 gfs2_alloc_get(dip);
1814
1815 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1816 if (error)
1817 goto out;
1818
1819 error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
1820 if (error)
1821 goto out_qs;
1822
1823 /* Count the number of leaves */
1824
1825 for (blk = leaf_no; blk; blk = nblk) {
1826 error = get_leaf(dip, blk, &bh);
1827 if (error)
1828 goto out_rlist;
1829 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1830 nblk = be64_to_cpu(tmp_leaf->lf_next);
1831 brelse(bh);
1832
1833 gfs2_rlist_add(sdp, &rlist, blk);
1834 l_blocks++;
1835 }
1836
1837 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1838
1839 for (x = 0; x < rlist.rl_rgrps; x++) {
1840 struct gfs2_rgrpd *rgd;
1841 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1842 rg_blocks += rgd->rd_ri.ri_length;
1843 }
1844
1845 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1846 if (error)
1847 goto out_rlist;
1848
1849 error = gfs2_trans_begin(sdp,
1850 rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1851 RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1852 if (error)
1853 goto out_rg_gunlock;
1854
1855 for (blk = leaf_no; blk; blk = nblk) {
1856 error = get_leaf(dip, blk, &bh);
1857 if (error)
1858 goto out_end_trans;
1859 tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1860 nblk = be64_to_cpu(tmp_leaf->lf_next);
1861 brelse(bh);
1862
1863 gfs2_free_meta(dip, blk, 1);
1864
1865 if (!dip->i_di.di_blocks)
1866 gfs2_consist_inode(dip);
1867 dip->i_di.di_blocks--;
1868 }
1869
1870 error = gfs2_dir_write_data(dip, ht, index * sizeof(uint64_t), size);
1871 if (error != size) {
1872 if (error >= 0)
1873 error = -EIO;
1874 goto out_end_trans;
1875 }
1876
1877 error = gfs2_meta_inode_buffer(dip, &dibh);
1878 if (error)
1879 goto out_end_trans;
1880
1881 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1882 gfs2_dinode_out(&dip->i_di, dibh->b_data);
1883 brelse(dibh);
1884
1885 out_end_trans:
1886 gfs2_trans_end(sdp);
1887
1888 out_rg_gunlock:
1889 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1890
1891 out_rlist:
1892 gfs2_rlist_free(&rlist);
1893 gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
1894
1895 out_qs:
1896 gfs2_quota_unhold(dip);
1897
1898 out:
1899 gfs2_alloc_put(dip);
1900 kfree(ht);
1901
1902 return error;
1903}
1904
1905/**
1906 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1907 * @dip: the directory
1908 *
1909 * Dealloc all on-disk directory leaves to FREEMETA state
1910 * Change on-disk inode type to "regular file"
1911 *
1912 * Returns: errno
1913 */
1914
1915int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1916{
1917 struct gfs2_sbd *sdp = dip->i_sbd;
1918 struct buffer_head *bh;
1919 int error;
1920
1921 /* Dealloc on-disk leaves to FREEMETA state */
1922 error = foreach_leaf(dip, leaf_dealloc, NULL);
1923 if (error)
1924 return error;
1925
1926 /* Make this a regular file in case we crash.
1927 (We don't want to free these blocks a second time.) */
1928
1929 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1930 if (error)
1931 return error;
1932
1933 error = gfs2_meta_inode_buffer(dip, &bh);
1934 if (!error) {
1935 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1936 ((struct gfs2_dinode *)bh->b_data)->di_mode =
1937 cpu_to_be32(S_IFREG);
1938 brelse(bh);
1939 }
1940
1941 gfs2_trans_end(sdp);
1942
1943 return error;
1944}
1945
1946/**
1947 * gfs2_diradd_alloc_required - find if adding an entry will require an allocation
1948 * @inode: the directory being written to
1949 * @name: the filename that's going to be added
1950 *
1951 * Returns: 1 if alloc required, 0 if not, -ve on error
1952 */
1953
1954int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
1955{
1956 struct gfs2_dirent *dent;
1957 struct buffer_head *bh;
1958
1959 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
1960 if (!dent) {
1961 return 1;
1962 }
1963 if (IS_ERR(dent))
1964 return PTR_ERR(dent);
1965 brelse(bh);
1966 return 0;
1967}
1968
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..42b3a1f34deb
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,73 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIR_DOT_H__
11#define __DIR_DOT_H__
12
13/**
14 * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
15 * @opaque: opaque data used by the function
16 * @name: the name of the directory entry
17 * @length: the length of the name
18 * @offset: the entry's offset in the directory
19 * @inum: the inode number the entry points to
20 * @type: the type of inode the entry points to
21 *
22 * Returns: 0 on success, 1 if buffer full
23 */
24
25typedef int (*gfs2_filldir_t) (void *opaque,
26 const char *name, unsigned int length,
27 uint64_t offset,
28 struct gfs2_inum *inum, unsigned int type);
29
30int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
31 struct gfs2_inum *inum, unsigned int *type);
32int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
33 const struct gfs2_inum *inum, unsigned int type);
34int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
35int gfs2_dir_read(struct inode *inode, uint64_t *offset, void *opaque,
36 gfs2_filldir_t filldir);
37int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
38 struct gfs2_inum *new_inum, unsigned int new_type);
39
40int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
41
42int gfs2_diradd_alloc_required(struct inode *dir,
43 const struct qstr *filename);
44int gfs2_dir_get_buffer(struct gfs2_inode *ip, uint64_t block, int new,
45 struct buffer_head **bhp);
46
47static inline uint32_t gfs2_disk_hash(const char *data, int len)
48{
49 return crc32_le(0xFFFFFFFF, data, len) ^ 0xFFFFFFFF;
50}
51
52
53static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
54{
55 name->name = fname;
56 name->len = strlen(fname);
57 name->hash = gfs2_disk_hash(name->name, name->len);
58}
59
60/* N.B. This probably ought to take inum & type as args as well */
61static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
62{
63 dent->de_inum.no_addr = cpu_to_be64(0);
64 dent->de_inum.no_formal_ino = cpu_to_be64(0);
65 dent->de_hash = cpu_to_be32(name->hash);
66 dent->de_rec_len = cpu_to_be16(reclen);
67 dent->de_name_len = cpu_to_be16(name->len);
68 dent->de_type = cpu_to_be16(0);
69 memset(dent->__pad, 0, sizeof(dent->__pad));
70 memcpy((char*)(dent+1), name->name, name->len);
71}
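/*
 * Usage sketch (editorial addition, not part of this patch): callers build
 * the qstr once so its cached hash serves both the hash-table index and
 * the on-disk de_hash, e.g.:
 *
 *	struct qstr q;
 *	struct gfs2_inum inum;
 *	unsigned int type;
 *
 *	gfs2_str2qstr(&q, "foo");
 *	error = gfs2_dir_search(dir, &q, &inum, &type);
 */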
72
73#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..4b9f6cff7a34
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,189 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/semaphore.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "acl.h"
24#include "eaops.h"
25#include "eattr.h"
26#include "util.h"
27
28/**
29 * gfs2_ea_name2type - get the type of the ea, and strip the type prefix
30 * @name: ea name, possibly prefixed with its type ("user." or "system.")
31 *
32 * Returns: GFS2_EATYPE_XXX
33 */
34
35unsigned int gfs2_ea_name2type(const char *name, char **truncated_name)
36{
37 unsigned int type;
38
39 if (strncmp(name, "system.", 7) == 0) {
40 type = GFS2_EATYPE_SYS;
41 if (truncated_name)
42 *truncated_name = strchr(name, '.') + 1;
43 } else if (strncmp(name, "user.", 5) == 0) {
44 type = GFS2_EATYPE_USR;
45 if (truncated_name)
46 *truncated_name = strchr(name, '.') + 1;
47 } else {
48 type = GFS2_EATYPE_UNUSED;
49 if (truncated_name)
50 *truncated_name = NULL;
51 }
52
53 return type;
54}
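/*
 * Usage sketch (editorial addition, not part of this patch):
 *
 *	char *stripped;
 *	unsigned int type = gfs2_ea_name2type("user.foo", &stripped);
 *
 * leaves type == GFS2_EATYPE_USR with stripped pointing at "foo"; an
 * unrecognized prefix yields GFS2_EATYPE_UNUSED and a NULL name.
 */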
55
56static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
57{
58 struct inode *inode = ip->i_vnode;
59 int error = permission(inode, MAY_READ, NULL);
60 if (error)
61 return error;
62
63 return gfs2_ea_get_i(ip, er);
64}
65
66static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
67{
68 struct inode *inode = ip->i_vnode;
69
70 if (S_ISREG(inode->i_mode) ||
71 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
72 int error = permission(inode, MAY_WRITE, NULL);
73 if (error)
74 return error;
75 } else
76 return -EPERM;
77
78 return gfs2_ea_set_i(ip, er);
79}
80
81static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
82{
83 struct inode *inode = ip->i_vnode;
84
85 if (S_ISREG(inode->i_mode) ||
86 (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
87 int error = permission(inode, MAY_WRITE, NULL);
88 if (error)
89 return error;
90 } else
91 return -EPERM;
92
93 return gfs2_ea_remove_i(ip, er);
94}
95
96static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
97{
98 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
99 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
100 !capable(CAP_SYS_ADMIN))
101 return -EPERM;
102
103 if (ip->i_sbd->sd_args.ar_posix_acl == 0 &&
104 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
105 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
106 return -EOPNOTSUPP;
107
110 return gfs2_ea_get_i(ip, er);
111}
112
113static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
114{
115 int remove = 0;
116 int error;
117
118 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
119 if (!(er->er_flags & GFS2_ERF_MODE)) {
120 er->er_mode = ip->i_di.di_mode;
121 er->er_flags |= GFS2_ERF_MODE;
122 }
123 error = gfs2_acl_validate_set(ip, 1, er,
124 &remove, &er->er_mode);
125 if (error)
126 return error;
127 error = gfs2_ea_set_i(ip, er);
128 if (error)
129 return error;
130 if (remove)
131 gfs2_ea_remove_i(ip, er);
132 return 0;
133
134 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
135 error = gfs2_acl_validate_set(ip, 0, er,
136 &remove, NULL);
137 if (error)
138 return error;
139 if (!remove)
140 error = gfs2_ea_set_i(ip, er);
141 else {
142 error = gfs2_ea_remove_i(ip, er);
143 if (error == -ENODATA)
144 error = 0;
145 }
146 return error;
147 }
148
149 return -EPERM;
150}
151
152static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
153{
154 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
155 int error = gfs2_acl_validate_remove(ip, 1);
156 if (error)
157 return error;
158
159 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
160 int error = gfs2_acl_validate_remove(ip, 0);
161 if (error)
162 return error;
163
164 } else
165 return -EPERM;
166
167 return gfs2_ea_remove_i(ip, er);
168}
169
170struct gfs2_eattr_operations gfs2_user_eaops = {
171 .eo_get = user_eo_get,
172 .eo_set = user_eo_set,
173 .eo_remove = user_eo_remove,
174 .eo_name = "user",
175};
176
177struct gfs2_eattr_operations gfs2_system_eaops = {
178 .eo_get = system_eo_get,
179 .eo_set = system_eo_set,
180 .eo_remove = system_eo_remove,
181 .eo_name = "system",
182};
183
184struct gfs2_eattr_operations *gfs2_ea_ops[] = {
185 NULL,
186 &gfs2_user_eaops,
187 &gfs2_system_eaops,
188};
189
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..f83c497eddca
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14
15struct gfs2_eattr_operations {
16 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
17 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 char *eo_name;
20};
21
22unsigned int gfs2_ea_name2type(const char *name, char **truncated_name);
23
24extern struct gfs2_eattr_operations gfs2_user_eaops;
25extern struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..8219d471f06c
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1568 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/semaphore.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "acl.h"
24#include "eaops.h"
25#include "eattr.h"
26#include "glock.h"
27#include "inode.h"
28#include "meta_io.h"
29#include "quota.h"
30#include "rgrp.h"
31#include "trans.h"
32#include "util.h"
33
34/**
35 * ea_calc_size - returns the actual number of bytes the request will take up
36 * (not counting any unstuffed data blocks)
37 * @sdp: the superblock
38 * @er: the ea request
39 * @size: (out) the number of bytes the ea will occupy
40 *
41 * Returns: 1 if the EA should be stuffed
42 */
43
44static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
45 unsigned int *size)
46{
47 *size = GFS2_EAREQ_SIZE_STUFFED(er);
48 if (*size <= sdp->sd_jbsize)
49 return 1;
50
51 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
52
53 return 0;
54}
55
56static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
57{
58 unsigned int size;
59
60 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
61 return -ERANGE;
62
63 ea_calc_size(sdp, er, &size);
64
65 /* This can only happen with 512 byte blocks */
66 if (size > sdp->sd_jbsize)
67 return -ERANGE;
68
69 return 0;
70}
71
72typedef int (*ea_call_t) (struct gfs2_inode *ip,
73 struct buffer_head *bh,
74 struct gfs2_ea_header *ea,
75 struct gfs2_ea_header *prev,
76 void *private);
77
78static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
79 ea_call_t ea_call, void *data)
80{
81 struct gfs2_ea_header *ea, *prev = NULL;
82 int error = 0;
83
84 if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_EA))
85 return -EIO;
86
87 for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
88 if (!GFS2_EA_REC_LEN(ea))
89 goto fail;
90 if (!(bh->b_data <= (char *)ea &&
91 (char *)GFS2_EA2NEXT(ea) <=
92 bh->b_data + bh->b_size))
93 goto fail;
94 if (!GFS2_EATYPE_VALID(ea->ea_type))
95 goto fail;
96
97 error = ea_call(ip, bh, ea, prev, data);
98 if (error)
99 return error;
100
101 if (GFS2_EA_IS_LAST(ea)) {
102 if ((char *)GFS2_EA2NEXT(ea) !=
103 bh->b_data + bh->b_size)
104 goto fail;
105 break;
106 }
107 }
108
109 return error;
110
111 fail:
112 gfs2_consist_inode(ip);
113 return -EIO;
114}
115
116static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
117{
118 struct buffer_head *bh, *eabh;
119 uint64_t *eablk, *end;
120 int error;
121
122 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
123 DIO_START | DIO_WAIT, &bh);
124 if (error)
125 return error;
126
127 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
128 error = ea_foreach_i(ip, bh, ea_call, data);
129 goto out;
130 }
131
132 if (gfs2_metatype_check(ip->i_sbd, bh, GFS2_METATYPE_IN)) {
133 error = -EIO;
134 goto out;
135 }
136
137 eablk = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header));
138 end = eablk + ip->i_sbd->sd_inptrs;
139
140 for (; eablk < end; eablk++) {
141 uint64_t bn;
142
143 if (!*eablk)
144 break;
145 bn = be64_to_cpu(*eablk);
146
147 error = gfs2_meta_read(ip->i_gl, bn, DIO_START | DIO_WAIT,
148 &eabh);
149 if (error)
150 break;
151 error = ea_foreach_i(ip, eabh, ea_call, data);
152 brelse(eabh);
153 if (error)
154 break;
155 }
156 out:
157 brelse(bh);
158
159 return error;
160}
161
162struct ea_find {
163 struct gfs2_ea_request *ef_er;
164 struct gfs2_ea_location *ef_el;
165};
166
167static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
168 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
169 void *private)
170{
171 struct ea_find *ef = private;
172 struct gfs2_ea_request *er = ef->ef_er;
173
174 if (ea->ea_type == GFS2_EATYPE_UNUSED)
175 return 0;
176
177 if (ea->ea_type == er->er_type) {
178 if (ea->ea_name_len == er->er_name_len &&
179 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
180 struct gfs2_ea_location *el = ef->ef_el;
181 get_bh(bh);
182 el->el_bh = bh;
183 el->el_ea = ea;
184 el->el_prev = prev;
185 return 1;
186 }
187 }
188
189#if 0
190 else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) &&
191 er->er_type == GFS2_EATYPE_SYS)
192 return 1;
193#endif
194
195 return 0;
196}
197
198int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
199 struct gfs2_ea_location *el)
200{
201 struct ea_find ef;
202 int error;
203
204 ef.ef_er = er;
205 ef.ef_el = el;
206
207 memset(el, 0, sizeof(struct gfs2_ea_location));
208
209 error = ea_foreach(ip, ea_find_i, &ef);
210 if (error > 0)
211 return 0;
212
213 return error;
214}
215
216/**
217 * ea_dealloc_unstuffed - free the data blocks of an unstuffed ea
218 * @ip: the inode the ea belongs to
219 * @bh: the buffer holding the ea header
220 * @ea: the ea whose data blocks are freed
221 * @prev: the ea header preceding @ea, if any
222 * @private: if non-NULL, keep the header as an unused record instead of merging it
223 *
224 * Takes advantage of the fact that all unstuffed blocks are
225 * allocated from the same resource group. Beware: this
226 * assumption may not always hold.
227 *
228 * Returns: errno
229 */
230
231static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
232 struct gfs2_ea_header *ea,
233 struct gfs2_ea_header *prev, void *private)
234{
235 int *leave = private;
236 struct gfs2_sbd *sdp = ip->i_sbd;
237 struct gfs2_rgrpd *rgd;
238 struct gfs2_holder rg_gh;
239 struct buffer_head *dibh;
240 uint64_t *dataptrs, bn = 0;
241 uint64_t bstart = 0;
242 unsigned int blen = 0;
243 unsigned int blks = 0;
244 unsigned int x;
245 int error;
246
247 if (GFS2_EA_IS_STUFFED(ea))
248 return 0;
249
250 dataptrs = GFS2_EA2DATAPTRS(ea);
251 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++)
252 if (*dataptrs) {
253 blks++;
254 bn = be64_to_cpu(*dataptrs);
255 }
256 if (!blks)
257 return 0;
258
259 rgd = gfs2_blk2rgrpd(sdp, bn);
260 if (!rgd) {
261 gfs2_consist_inode(ip);
262 return -EIO;
263 }
264
265 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
266 if (error)
267 return error;
268
269 error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length +
270 RES_DINODE + RES_EATTR + RES_STATFS +
271 RES_QUOTA, blks);
272 if (error)
273 goto out_gunlock;
274
275 gfs2_trans_add_bh(ip->i_gl, bh, 1);
276
277 dataptrs = GFS2_EA2DATAPTRS(ea);
278 for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
279 if (!*dataptrs)
280 break;
281 bn = be64_to_cpu(*dataptrs);
282
283 if (bstart + blen == bn)
284 blen++;
285 else {
286 if (bstart)
287 gfs2_free_meta(ip, bstart, blen);
288 bstart = bn;
289 blen = 1;
290 }
291
292 *dataptrs = 0;
293 if (!ip->i_di.di_blocks)
294 gfs2_consist_inode(ip);
295 ip->i_di.di_blocks--;
296 }
297 if (bstart)
298 gfs2_free_meta(ip, bstart, blen);
299
300 if (prev && !leave) {
301 uint32_t len;
302
303 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
304 prev->ea_rec_len = cpu_to_be32(len);
305
306 if (GFS2_EA_IS_LAST(ea))
307 prev->ea_flags |= GFS2_EAFLAG_LAST;
308 } else {
309 ea->ea_type = GFS2_EATYPE_UNUSED;
310 ea->ea_num_ptrs = 0;
311 }
312
313 error = gfs2_meta_inode_buffer(ip, &dibh);
314 if (!error) {
315 ip->i_di.di_ctime = get_seconds();
316 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
317 gfs2_dinode_out(&ip->i_di, dibh->b_data);
318 brelse(dibh);
319 }
320
321 gfs2_trans_end(sdp);
322
323 out_gunlock:
324 gfs2_glock_dq_uninit(&rg_gh);
325
326 return error;
327}
328
329static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
330 struct gfs2_ea_header *ea,
331 struct gfs2_ea_header *prev, int leave)
332{
333 struct gfs2_alloc *al;
334 int error;
335
336 al = gfs2_alloc_get(ip);
337
338 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
339 if (error)
340 goto out_alloc;
341
342 error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
343 if (error)
344 goto out_quota;
345
346 error = ea_dealloc_unstuffed(ip,
347 bh, ea, prev,
348 (leave) ? &error : NULL);
349
350 gfs2_glock_dq_uninit(&al->al_ri_gh);
351
352 out_quota:
353 gfs2_quota_unhold(ip);
354
355 out_alloc:
356 gfs2_alloc_put(ip);
357
358 return error;
359}
360
361
362static int gfs2_ea_repack_i(struct gfs2_inode *ip)
363{
364 return -EOPNOTSUPP;
365}
366
367int gfs2_ea_repack(struct gfs2_inode *ip)
368{
369 struct gfs2_holder gh;
370 int error;
371
372 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
373 if (error)
374 return error;
375
376 /* Some sort of permissions checking would be nice */
377
378 error = gfs2_ea_repack_i(ip);
379
380 gfs2_glock_dq_uninit(&gh);
381
382 return error;
383}
384
385struct ea_list {
386 struct gfs2_ea_request *ei_er;
387 unsigned int ei_size;
388};
389
390static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
391 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
392 void *private)
393{
394 struct ea_list *ei = private;
395 struct gfs2_ea_request *er = ei->ei_er;
396 unsigned int ea_size = GFS2_EA_STRLEN(ea);
397
398 if (ea->ea_type == GFS2_EATYPE_UNUSED)
399 return 0;
400
401 if (er->er_data_len) {
402 char *prefix;
403 unsigned int l;
404 char c = 0;
405
406 if (ei->ei_size + ea_size > er->er_data_len)
407 return -ERANGE;
408
409 if (ea->ea_type == GFS2_EATYPE_USR) {
410 prefix = "user.";
411 l = 5;
412 } else {
413 prefix = "system.";
414 l = 7;
415 }
416
417 memcpy(er->er_data + ei->ei_size,
418 prefix, l);
419 memcpy(er->er_data + ei->ei_size + l,
420 GFS2_EA2NAME(ea),
421 ea->ea_name_len);
422 memcpy(er->er_data + ei->ei_size +
423 ea_size - 1,
424 &c, 1);
425 }
426
427 ei->ei_size += ea_size;
428
429 return 0;
430}
431
432/**
433 * gfs2_ea_list - list the names of an inode's extended attributes
434 * @ip: the inode
435 * @er: the request; er_data/er_data_len describe the caller's buffer
436 *
437 * Returns: actual size of data on success, -errno on error
438 */
439
440int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
441{
442 struct gfs2_holder i_gh;
443 int error;
444
445 if (!er->er_data || !er->er_data_len) {
446 er->er_data = NULL;
447 er->er_data_len = 0;
448 }
449
450 error = gfs2_glock_nq_init(ip->i_gl,
451 LM_ST_SHARED, LM_FLAG_ANY,
452 &i_gh);
453 if (error)
454 return error;
455
456 if (ip->i_di.di_eattr) {
457 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
458
459 error = ea_foreach(ip, ea_list_i, &ei);
460 if (!error)
461 error = ei.ei_size;
462 }
463
464 gfs2_glock_dq_uninit(&i_gh);
465
466 return error;
467}
468
469/**
470 * ea_get_unstuffed - actually copies the unstuffed data into the
471 * request buffer
472 * @ip: the inode
473 * @ea: the ea whose data blocks are read
474 * @data: the destination buffer
475 *
476 * Returns: errno
477 */
478
479static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
480 char *data)
481{
482 struct gfs2_sbd *sdp = ip->i_sbd;
483 struct buffer_head **bh;
484 unsigned int amount = GFS2_EA_DATA_LEN(ea);
485 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
486 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
487 unsigned int x;
488 int error = 0;
489
490 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
491 if (!bh)
492 return -ENOMEM;
493
494 for (x = 0; x < nptrs; x++) {
495 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
496 DIO_START, bh + x);
497 if (error) {
498 while (x--)
499 brelse(bh[x]);
500 goto out;
501 }
502 dataptrs++;
503 }
504
505 for (x = 0; x < nptrs; x++) {
506 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
507 if (error) {
508 for (; x < nptrs; x++)
509 brelse(bh[x]);
510 goto out;
511 }
512 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
513 for (; x < nptrs; x++)
514 brelse(bh[x]);
515 error = -EIO;
516 goto out;
517 }
518
519 memcpy(data,
520 bh[x]->b_data + sizeof(struct gfs2_meta_header),
521 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
522
523 amount -= sdp->sd_jbsize;
524 data += sdp->sd_jbsize;
525
526 brelse(bh[x]);
527 }
528
529 out:
530 kfree(bh);
531
532 return error;
533}
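/*
 * Note (editorial addition, not part of this patch): the two loops above
 * pipeline the I/O -- the first submits a read (DIO_START) for every data
 * block so the requests overlap, and the second waits for each buffer
 * (DIO_WAIT), checks its metatype, and copies out at most sd_jbsize bytes
 * per block.
 */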
534
535int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
536 char *data)
537{
538 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
539 memcpy(data,
540 GFS2_EA2DATA(el->el_ea),
541 GFS2_EA_DATA_LEN(el->el_ea));
542 return 0;
543 } else
544 return ea_get_unstuffed(ip, el->el_ea, data);
545}
546
547/**
548 * gfs2_ea_get_i - read an ea (caller already holds the inode glock)
549 * @ip: the inode
550 * @er: the read request
551 *
552 * Returns: actual size of data on success, -errno on error
553 */
554
555int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
556{
557 struct gfs2_ea_location el;
558 int error;
559
560 if (!ip->i_di.di_eattr)
561 return -ENODATA;
562
563 error = gfs2_ea_find(ip, er, &el);
564 if (error)
565 return error;
566 if (!el.el_ea)
567 return -ENODATA;
568
569 if (er->er_data_len) {
570 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
571 error = -ERANGE;
572 else
573 error = gfs2_ea_get_copy(ip, &el, er->er_data);
574 }
575 if (!error)
576 error = GFS2_EA_DATA_LEN(el.el_ea);
577
578 brelse(el.el_bh);
579
580 return error;
581}
582
583/**
584 * gfs2_ea_get - lock the inode and read an ea into the request buffer
585 * @ip: the inode
586 * @er: the read request
587 *
588 * Returns: actual size of data on success, -errno on error
589 */
590
591int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
592{
593 struct gfs2_holder i_gh;
594 int error;
595
596 if (!er->er_name_len ||
597 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
598 return -EINVAL;
599 if (!er->er_data || !er->er_data_len) {
600 er->er_data = NULL;
601 er->er_data_len = 0;
602 }
603
604 error = gfs2_glock_nq_init(ip->i_gl,
605 LM_ST_SHARED, LM_FLAG_ANY,
606 &i_gh);
607 if (error)
608 return error;
609
610 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
611
612 gfs2_glock_dq_uninit(&i_gh);
613
614 return error;
615}
616
617/**
618 * ea_alloc_blk - allocates a new block for extended attributes.
619 * @ip: A pointer to the inode that's getting extended attributes
620 * @bhp: (out) the new ea block's buffer head
621 *
622 * Returns: errno
623 */
624
625static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
626{
627 struct gfs2_sbd *sdp = ip->i_sbd;
628 struct gfs2_ea_header *ea;
629 uint64_t block;
630
631 block = gfs2_alloc_meta(ip);
632
633 *bhp = gfs2_meta_new(ip->i_gl, block);
634 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
635 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
636 gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
637
638 ea = GFS2_EA_BH2FIRST(*bhp);
639 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
640 ea->ea_type = GFS2_EATYPE_UNUSED;
641 ea->ea_flags = GFS2_EAFLAG_LAST;
642 ea->ea_num_ptrs = 0;
643
644 ip->i_di.di_blocks++;
645
646 return 0;
647}
648
649/**
650 * ea_write - writes the request info to an ea, creating new blocks if
651 * necessary
652 * @ip: inode that is being modified
653 * @ea: the location of the new ea in a block
654 * @er: the write request
655 *
656 * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
657 *
658 * Returns: errno
659 */
660
661static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
662 struct gfs2_ea_request *er)
663{
664 struct gfs2_sbd *sdp = ip->i_sbd;
665
666 ea->ea_data_len = cpu_to_be32(er->er_data_len);
667 ea->ea_name_len = er->er_name_len;
668 ea->ea_type = er->er_type;
669 ea->__pad = 0;
670
671 memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
672
673 if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
674 ea->ea_num_ptrs = 0;
675 memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
676 } else {
677 uint64_t *dataptr = GFS2_EA2DATAPTRS(ea);
678 const char *data = er->er_data;
679 unsigned int data_len = er->er_data_len;
680 unsigned int copy;
681 unsigned int x;
682
683 ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
684 for (x = 0; x < ea->ea_num_ptrs; x++) {
685 struct buffer_head *bh;
686 uint64_t block;
687 int mh_size = sizeof(struct gfs2_meta_header);
688
689 block = gfs2_alloc_meta(ip);
690
691 bh = gfs2_meta_new(ip->i_gl, block);
692 gfs2_trans_add_bh(ip->i_gl, bh, 1);
693 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
694
695 ip->i_di.di_blocks++;
696
697 copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize :
698 data_len;
699 memcpy(bh->b_data + mh_size, data, copy);
700 if (copy < sdp->sd_jbsize)
701 memset(bh->b_data + mh_size + copy, 0,
702 sdp->sd_jbsize - copy);
703
704 *dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr);
705 data += copy;
706 data_len -= copy;
707
708 brelse(bh);
709 }
710
711 gfs2_assert_withdraw(sdp, !data_len);
712 }
713
714 return 0;
715}
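
A sketch (not in the patch) of the layout decision ea_write() makes, pulled
out as a helper: a request is stuffed when header, name, and data fit in one
journaled block; otherwise the value occupies ea_num_ptrs blocks of sd_jbsize
bytes each, addressed by a table of big-endian block pointers.

static inline unsigned int example_ea_data_blocks(struct gfs2_sbd *sdp,
						  struct gfs2_ea_request *er)
{
	/* Stuffed: the value lives in the EA block itself. */
	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize)
		return 0;
	/* Unstuffed: one journaled data block per sd_jbsize bytes. */
	return DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
}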
716
717typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
718 struct gfs2_ea_request *er,
719 void *private);
720
721static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
722 unsigned int blks,
723 ea_skeleton_call_t skeleton_call,
724 void *private)
725{
726 struct gfs2_alloc *al;
727 struct buffer_head *dibh;
728 int error;
729
730 al = gfs2_alloc_get(ip);
731
732 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
733 if (error)
734 goto out;
735
736 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
737 if (error)
738 goto out_gunlock_q;
739
740 al->al_requested = blks;
741
742 error = gfs2_inplace_reserve(ip);
743 if (error)
744 goto out_gunlock_q;
745
746 error = gfs2_trans_begin(ip->i_sbd,
747 blks + al->al_rgd->rd_ri.ri_length +
748 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
749 if (error)
750 goto out_ipres;
751
752 error = skeleton_call(ip, er, private);
753 if (error)
754 goto out_end_trans;
755
756 error = gfs2_meta_inode_buffer(ip, &dibh);
757 if (!error) {
758 if (er->er_flags & GFS2_ERF_MODE) {
759 gfs2_assert_withdraw(ip->i_sbd,
760 (ip->i_di.di_mode & S_IFMT) ==
761 (er->er_mode & S_IFMT));
762 ip->i_di.di_mode = er->er_mode;
763 }
764 ip->i_di.di_ctime = get_seconds();
765 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
766 gfs2_dinode_out(&ip->i_di, dibh->b_data);
767 brelse(dibh);
768 }
769
770 out_end_trans:
771 gfs2_trans_end(ip->i_sbd);
772
773 out_ipres:
774 gfs2_inplace_release(ip);
775
776 out_gunlock_q:
777 gfs2_quota_unlock(ip);
778
779 out:
780 gfs2_alloc_put(ip);
781
782 return error;
783}
784
785static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
786 void *private)
787{
788 struct buffer_head *bh;
789 int error;
790
791 error = ea_alloc_blk(ip, &bh);
792 if (error)
793 return error;
794
795 ip->i_di.di_eattr = bh->b_blocknr;
796 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
797
798 brelse(bh);
799
800 return error;
801}
802
803/**
804 * ea_init - initializes a new eattr block
805 * @ip: the inode getting its first extended attribute block
806 * @er: the attribute request to store in the new block
807 *
808 * Returns: errno
809 */
810
811static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
812{
813 unsigned int jbsize = ip->i_sbd->sd_jbsize;
814 unsigned int blks = 1;
815
816 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
817 blks += DIV_ROUND_UP(er->er_data_len, jbsize);
818
819 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
820}
821
822static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
823{
824 uint32_t ea_size = GFS2_EA_SIZE(ea);
825 struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
826 ea_size);
827 uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size;
828 int last = ea->ea_flags & GFS2_EAFLAG_LAST;
829
830 ea->ea_rec_len = cpu_to_be32(ea_size);
831 ea->ea_flags ^= last;
832
833 new->ea_rec_len = cpu_to_be32(new_size);
834 new->ea_flags = last;
835
836 return new;
837}
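
A worked example of the split arithmetic, with illustrative numbers:

/*
 * Suppose an in-use record has GFS2_EA_REC_LEN(ea) == 104 but its live
 * contents only need GFS2_EA_SIZE(ea) == 40 (both multiples of 8 by
 * construction).  ea_split_ea() shrinks ea->ea_rec_len to 40 and builds
 * a new 64-byte record at (char *)ea + 40 for the incoming attribute;
 * if ea carried GFS2_EAFLAG_LAST, the flag migrates to the new record.
 */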
838
839static void ea_set_remove_stuffed(struct gfs2_inode *ip,
840 struct gfs2_ea_location *el)
841{
842 struct gfs2_ea_header *ea = el->el_ea;
843 struct gfs2_ea_header *prev = el->el_prev;
844 uint32_t len;
845
846 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
847
848 if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
849 ea->ea_type = GFS2_EATYPE_UNUSED;
850 return;
851 } else if (GFS2_EA2NEXT(prev) != ea) {
852 prev = GFS2_EA2NEXT(prev);
853 gfs2_assert_withdraw(ip->i_sbd, GFS2_EA2NEXT(prev) == ea);
854 }
855
856 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
857 prev->ea_rec_len = cpu_to_be32(len);
858
859 if (GFS2_EA_IS_LAST(ea))
860 prev->ea_flags |= GFS2_EAFLAG_LAST;
861}
862
863struct ea_set {
864 int ea_split;
865
866 struct gfs2_ea_request *es_er;
867 struct gfs2_ea_location *es_el;
868
869 struct buffer_head *es_bh;
870 struct gfs2_ea_header *es_ea;
871};
872
873static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
874 struct gfs2_ea_header *ea, struct ea_set *es)
875{
876 struct gfs2_ea_request *er = es->es_er;
877 struct buffer_head *dibh;
878 int error;
879
880 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + 2 * RES_EATTR, 0);
881 if (error)
882 return error;
883
884 gfs2_trans_add_bh(ip->i_gl, bh, 1);
885
886 if (es->ea_split)
887 ea = ea_split_ea(ea);
888
889 ea_write(ip, ea, er);
890
891 if (es->es_el)
892 ea_set_remove_stuffed(ip, es->es_el);
893
894 error = gfs2_meta_inode_buffer(ip, &dibh);
895 if (error)
896 goto out;
897
898 if (er->er_flags & GFS2_ERF_MODE) {
899 gfs2_assert_withdraw(ip->i_sbd,
900 (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
901 ip->i_di.di_mode = er->er_mode;
902 }
903 ip->i_di.di_ctime = get_seconds();
904 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
905 gfs2_dinode_out(&ip->i_di, dibh->b_data);
906 brelse(dibh);
907 out:
908 gfs2_trans_end(ip->i_sbd);
909
910 return error;
911}
912
913static int ea_set_simple_alloc(struct gfs2_inode *ip,
914 struct gfs2_ea_request *er, void *private)
915{
916 struct ea_set *es = private;
917 struct gfs2_ea_header *ea = es->es_ea;
918 int error;
919
920 gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
921
922 if (es->ea_split)
923 ea = ea_split_ea(ea);
924
925 error = ea_write(ip, ea, er);
926 if (error)
927 return error;
928
929 if (es->es_el)
930 ea_set_remove_stuffed(ip, es->es_el);
931
932 return 0;
933}
934
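A note on the callback convention used below (editorial):

/*
 * ea_foreach() callbacks such as ea_set_simple() return 0 to keep
 * scanning, a positive value once the request has been handled in
 * place, and -errno on failure.  ea_set_i() further down treats a
 * positive return as "done, no new block needed".
 */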
935static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
936 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
937 void *private)
938{
939 struct ea_set *es = private;
940 unsigned int size;
941 int stuffed;
942 int error;
943
944 stuffed = ea_calc_size(ip->i_sbd, es->es_er, &size);
945
946 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
947 if (GFS2_EA_REC_LEN(ea) < size)
948 return 0;
949 if (!GFS2_EA_IS_STUFFED(ea)) {
950 error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
951 if (error)
952 return error;
953 }
954 es->ea_split = 0;
955 } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
956 es->ea_split = 1;
957 else
958 return 0;
959
960 if (stuffed) {
961 error = ea_set_simple_noalloc(ip, bh, ea, es);
962 if (error)
963 return error;
964 } else {
965 unsigned int blks;
966
967 es->es_bh = bh;
968 es->es_ea = ea;
969 blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
970 ip->i_sbd->sd_jbsize);
971
972 error = ea_alloc_skeleton(ip, es->es_er, blks,
973 ea_set_simple_alloc, es);
974 if (error)
975 return error;
976 }
977
978 return 1;
979}
980
981static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
982 void *private)
983{
984 struct gfs2_sbd *sdp = ip->i_sbd;
985 struct buffer_head *indbh, *newbh;
986 uint64_t *eablk;
987 int error;
988 int mh_size = sizeof(struct gfs2_meta_header);
989
990 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
991 uint64_t *end;
992
993 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
994 DIO_START | DIO_WAIT, &indbh);
995 if (error)
996 return error;
997
998 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
999 error = -EIO;
1000 goto out;
1001 }
1002
1003 eablk = (uint64_t *)(indbh->b_data + mh_size);
1004 end = eablk + sdp->sd_inptrs;
1005
1006 for (; eablk < end; eablk++)
1007 if (!*eablk)
1008 break;
1009
1010 if (eablk == end) {
1011 error = -ENOSPC;
1012 goto out;
1013 }
1014
1015 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1016 } else {
1017 uint64_t blk;
1018
1019 blk = gfs2_alloc_meta(ip);
1020
1021 indbh = gfs2_meta_new(ip->i_gl, blk);
1022 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1023 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
1024 gfs2_buffer_clear_tail(indbh, mh_size);
1025
1026 eablk = (uint64_t *)(indbh->b_data + mh_size);
1027 *eablk = cpu_to_be64(ip->i_di.di_eattr);
1028 ip->i_di.di_eattr = blk;
1029 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
1030 ip->i_di.di_blocks++;
1031
1032 eablk++;
1033 }
1034
1035 error = ea_alloc_blk(ip, &newbh);
1036 if (error)
1037 goto out;
1038
1039 *eablk = cpu_to_be64((uint64_t)newbh->b_blocknr);
1040 error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
1041 brelse(newbh);
1042 if (error)
1043 goto out;
1044
1045 if (private)
1046 ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private);
1047
1048 out:
1049 brelse(indbh);
1050
1051 return error;
1052}
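
The one-time promotion above, summarized (editorial):

/*
 * Direct:   di_eattr -> [GFS2_METATYPE_EA block]
 * Indirect: di_eattr -> [GFS2_METATYPE_IN block: up to sd_inptrs
 *                        big-endian pointers] -> EA blocks
 *
 * ea_set_block() promotes from direct to indirect exactly once, sliding
 * the old di_eattr value into slot 0 of the new indirect block and
 * setting GFS2_DIF_EA_INDIRECT.
 */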
1053
1054static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1055 struct gfs2_ea_location *el)
1056{
1057 struct ea_set es;
1058 unsigned int blks = 2;
1059 int error;
1060
1061 memset(&es, 0, sizeof(struct ea_set));
1062 es.es_er = er;
1063 es.es_el = el;
1064
1065 error = ea_foreach(ip, ea_set_simple, &es);
1066 if (error > 0)
1067 return 0;
1068 if (error)
1069 return error;
1070
1071 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
1072 blks++;
1073 if (GFS2_EAREQ_SIZE_STUFFED(er) > ip->i_sbd->sd_jbsize)
1074 blks += DIV_ROUND_UP(er->er_data_len, ip->i_sbd->sd_jbsize);
1075
1076 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
1077}
1078
1079static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1080 struct gfs2_ea_location *el)
1081{
1082 if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
1083 el->el_prev = GFS2_EA2NEXT(el->el_prev);
1084 gfs2_assert_withdraw(ip->i_sbd,
1085 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1086 }
1087
1088	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1089}
1090
1091int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1092{
1093 struct gfs2_ea_location el;
1094 int error;
1095
1096 if (!ip->i_di.di_eattr) {
1097 if (er->er_flags & XATTR_REPLACE)
1098 return -ENODATA;
1099 return ea_init(ip, er);
1100 }
1101
1102 error = gfs2_ea_find(ip, er, &el);
1103 if (error)
1104 return error;
1105
1106 if (el.el_ea) {
1107 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
1108 brelse(el.el_bh);
1109 return -EPERM;
1110 }
1111
1112 error = -EEXIST;
1113 if (!(er->er_flags & XATTR_CREATE)) {
1114 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1115 error = ea_set_i(ip, er, &el);
1116 if (!error && unstuffed)
1117 ea_set_remove_unstuffed(ip, &el);
1118 }
1119
1120 brelse(el.el_bh);
1121 } else {
1122 error = -ENODATA;
1123 if (!(er->er_flags & XATTR_REPLACE))
1124 error = ea_set_i(ip, er, NULL);
1125 }
1126
1127 return error;
1128}
1129
1130int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1131{
1132 struct gfs2_holder i_gh;
1133 int error;
1134
1135 if (!er->er_name_len ||
1136 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1137 return -EINVAL;
1138 if (!er->er_data || !er->er_data_len) {
1139 er->er_data = NULL;
1140 er->er_data_len = 0;
1141 }
1142 error = ea_check_size(ip->i_sbd, er);
1143 if (error)
1144 return error;
1145
1146 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1147 if (error)
1148 return error;
1149
1150 if (IS_IMMUTABLE(ip->i_vnode))
1151 error = -EPERM;
1152 else
1153 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1154
1155 gfs2_glock_dq_uninit(&i_gh);
1156
1157 return error;
1158}
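
A matching caller-side sketch for gfs2_ea_set() (editorial; the name and
value are illustrative).  With XATTR_CREATE set, an attribute that already
exists fails the call with -EEXIST instead of being replaced:

static int example_create_user_ea(struct gfs2_inode *ip, char *value,
				  unsigned int len)
{
	static char name[] = "bar";		/* hypothetical name */
	struct gfs2_ea_request er;

	memset(&er, 0, sizeof(er));
	er.er_name = name;
	er.er_name_len = sizeof(name) - 1;
	er.er_data = value;
	er.er_data_len = len;
	er.er_type = GFS2_EATYPE_USR;
	er.er_flags = XATTR_CREATE;

	return gfs2_ea_set(ip, &er);
}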
1159
1160static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1161{
1162 struct gfs2_ea_header *ea = el->el_ea;
1163 struct gfs2_ea_header *prev = el->el_prev;
1164 struct buffer_head *dibh;
1165 int error;
1166
1167 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
1168 if (error)
1169 return error;
1170
1171 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1172
1173 if (prev) {
1174 uint32_t len;
1175
1176 len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
1177 prev->ea_rec_len = cpu_to_be32(len);
1178
1179 if (GFS2_EA_IS_LAST(ea))
1180 prev->ea_flags |= GFS2_EAFLAG_LAST;
1181 } else
1182 ea->ea_type = GFS2_EATYPE_UNUSED;
1183
1184 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (!error) {
1186 ip->i_di.di_ctime = get_seconds();
1187 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1188 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1189 brelse(dibh);
1190 }
1191
1192 gfs2_trans_end(ip->i_sbd);
1193
1194 return error;
1195}
1196
1197int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1198{
1199 struct gfs2_ea_location el;
1200 int error;
1201
1202 if (!ip->i_di.di_eattr)
1203 return -ENODATA;
1204
1205 error = gfs2_ea_find(ip, er, &el);
1206 if (error)
1207 return error;
1208 if (!el.el_ea)
1209 return -ENODATA;
1210
1211 if (GFS2_EA_IS_STUFFED(el.el_ea))
1212 error = ea_remove_stuffed(ip, &el);
1213 else
1214 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
1215 0);
1216
1217 brelse(el.el_bh);
1218
1219 return error;
1220}
1221
1222/**
1223 * gfs2_ea_remove - removes an extended attribute
1224 * @ip: pointer to the inode of the target file
1225 * @er: request information
1226 *
1227 * Returns: errno
1228 */
1229
1230int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1231{
1232 struct gfs2_holder i_gh;
1233 int error;
1234
1235 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1236 return -EINVAL;
1237
1238 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1239 if (error)
1240 return error;
1241
1242 if (IS_IMMUTABLE(ip->i_vnode) || IS_APPEND(ip->i_vnode))
1243 error = -EPERM;
1244 else
1245 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
1246
1247 gfs2_glock_dq_uninit(&i_gh);
1248
1249 return error;
1250}
1251
1252static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1253 struct gfs2_ea_header *ea, char *data)
1254{
1255 struct gfs2_sbd *sdp = ip->i_sbd;
1256 struct buffer_head **bh;
1257 unsigned int amount = GFS2_EA_DATA_LEN(ea);
1258 unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
1259 uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea);
1260 unsigned int x;
1261 int error;
1262
1263 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
1264 if (!bh)
1265 return -ENOMEM;
1266
1267 error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
1268 if (error)
1269 goto out;
1270
1271 for (x = 0; x < nptrs; x++) {
1272 error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs),
1273 DIO_START, bh + x);
1274 if (error) {
1275 while (x--)
1276 brelse(bh[x]);
1277 goto fail;
1278 }
1279 dataptrs++;
1280 }
1281
1282 for (x = 0; x < nptrs; x++) {
1283 error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT);
1284 if (error) {
1285 for (; x < nptrs; x++)
1286 brelse(bh[x]);
1287 goto fail;
1288 }
1289 if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
1290 for (; x < nptrs; x++)
1291 brelse(bh[x]);
1292 error = -EIO;
1293 goto fail;
1294 }
1295
1296 gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
1297
1298 memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header),
1299 data,
1300 (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
1301
1302 amount -= sdp->sd_jbsize;
1303 data += sdp->sd_jbsize;
1304
1305 brelse(bh[x]);
1306 }
1307
1308 out:
1309 kfree(bh);
1310
1311 return error;
1312
1313 fail:
1314 gfs2_trans_end(sdp);
1315 kfree(bh);
1316
1317 return error;
1318}
1319
1320int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
1321 struct iattr *attr, char *data)
1322{
1323 struct buffer_head *dibh;
1324 int error;
1325
1326 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
1327 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE + RES_EATTR, 0);
1328 if (error)
1329 return error;
1330
1331 gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
1332 memcpy(GFS2_EA2DATA(el->el_ea),
1333 data,
1334 GFS2_EA_DATA_LEN(el->el_ea));
1335 } else
1336 error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
1337
1338 if (error)
1339 return error;
1340
1341 error = gfs2_meta_inode_buffer(ip, &dibh);
1342 if (!error) {
1343 error = inode_setattr(ip->i_vnode, attr);
1344 gfs2_assert_warn(ip->i_sbd, !error);
1345 gfs2_inode_attr_out(ip);
1346 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1347 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1348 brelse(dibh);
1349 }
1350
1351 gfs2_trans_end(ip->i_sbd);
1352
1353 return error;
1354}
1355
1356static int ea_dealloc_indirect(struct gfs2_inode *ip)
1357{
1358 struct gfs2_sbd *sdp = ip->i_sbd;
1359 struct gfs2_rgrp_list rlist;
1360 struct buffer_head *indbh, *dibh;
1361 uint64_t *eablk, *end;
1362 unsigned int rg_blocks = 0;
1363 uint64_t bstart = 0;
1364 unsigned int blen = 0;
1365 unsigned int blks = 0;
1366 unsigned int x;
1367 int error;
1368
1369 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1370
1371 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr,
1372 DIO_START | DIO_WAIT, &indbh);
1373 if (error)
1374 return error;
1375
1376 if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
1377 error = -EIO;
1378 goto out;
1379 }
1380
1381 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1382 end = eablk + sdp->sd_inptrs;
1383
1384 for (; eablk < end; eablk++) {
1385 uint64_t bn;
1386
1387 if (!*eablk)
1388 break;
1389 bn = be64_to_cpu(*eablk);
1390
1391 if (bstart + blen == bn)
1392 blen++;
1393 else {
1394 if (bstart)
1395 gfs2_rlist_add(sdp, &rlist, bstart);
1396 bstart = bn;
1397 blen = 1;
1398 }
1399 blks++;
1400 }
1401 if (bstart)
1402 gfs2_rlist_add(sdp, &rlist, bstart);
1403 else
1404 goto out;
1405
1406 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
1407
1408 for (x = 0; x < rlist.rl_rgrps; x++) {
1409 struct gfs2_rgrpd *rgd;
1410 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1411 rg_blocks += rgd->rd_ri.ri_length;
1412 }
1413
1414 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1415 if (error)
1416 goto out_rlist_free;
1417
1418 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
1419 RES_INDIRECT + RES_STATFS +
1420 RES_QUOTA, blks);
1421 if (error)
1422 goto out_gunlock;
1423
1424 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
1425
1426 eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header));
1427 bstart = 0;
1428 blen = 0;
1429
1430 for (; eablk < end; eablk++) {
1431 uint64_t bn;
1432
1433 if (!*eablk)
1434 break;
1435 bn = be64_to_cpu(*eablk);
1436
1437 if (bstart + blen == bn)
1438 blen++;
1439 else {
1440 if (bstart)
1441 gfs2_free_meta(ip, bstart, blen);
1442 bstart = bn;
1443 blen = 1;
1444 }
1445
1446 *eablk = 0;
1447 if (!ip->i_di.di_blocks)
1448 gfs2_consist_inode(ip);
1449 ip->i_di.di_blocks--;
1450 }
1451 if (bstart)
1452 gfs2_free_meta(ip, bstart, blen);
1453
1454 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
1455
1456 error = gfs2_meta_inode_buffer(ip, &dibh);
1457 if (!error) {
1458 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1459 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1460 brelse(dibh);
1461 }
1462
1463 gfs2_trans_end(sdp);
1464
1465 out_gunlock:
1466 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1467
1468 out_rlist_free:
1469 gfs2_rlist_free(&rlist);
1470
1471 out:
1472 brelse(indbh);
1473
1474 return error;
1475}
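
The bstart/blen run-coalescing used twice above, reduced to a standalone
sketch (editorial; assumes nonzero block numbers, which the !*eablk checks
guarantee in the callers).  Each run then costs a single gfs2_rlist_add()
or gfs2_free_meta() call instead of one per block:

static void example_coalesce_runs(const uint64_t *blocks, unsigned int n,
				  void (*emit)(uint64_t start, unsigned int len))
{
	uint64_t bstart = 0;		/* 0 means "no run open yet" */
	unsigned int blen = 0;
	unsigned int x;

	for (x = 0; x < n; x++) {
		if (bstart + blen == blocks[x])
			blen++;		/* extends the current run */
		else {
			if (bstart)
				emit(bstart, blen);
			bstart = blocks[x];
			blen = 1;
		}
	}
	if (bstart)
		emit(bstart, blen);	/* flush the final run */
}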
1476
1477static int ea_dealloc_block(struct gfs2_inode *ip)
1478{
1479 struct gfs2_sbd *sdp = ip->i_sbd;
1480 struct gfs2_alloc *al = &ip->i_alloc;
1481 struct gfs2_rgrpd *rgd;
1482 struct buffer_head *dibh;
1483 int error;
1484
1485 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
1486 if (!rgd) {
1487 gfs2_consist_inode(ip);
1488 return -EIO;
1489 }
1490
1491 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
1492 &al->al_rgd_gh);
1493 if (error)
1494 return error;
1495
1496 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE +
1497 RES_STATFS + RES_QUOTA, 1);
1498 if (error)
1499 goto out_gunlock;
1500
1501 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1502
1503 ip->i_di.di_eattr = 0;
1504 if (!ip->i_di.di_blocks)
1505 gfs2_consist_inode(ip);
1506 ip->i_di.di_blocks--;
1507
1508 error = gfs2_meta_inode_buffer(ip, &dibh);
1509 if (!error) {
1510 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1511 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1512 brelse(dibh);
1513 }
1514
1515 gfs2_trans_end(sdp);
1516
1517 out_gunlock:
1518 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1519
1520 return error;
1521}
1522
1523/**
1524 * gfs2_ea_dealloc - deallocate the extended attribute fork
1525 * @ip: the inode
1526 *
1527 * Returns: errno
1528 */
1529
1530int gfs2_ea_dealloc(struct gfs2_inode *ip)
1531{
1532 struct gfs2_alloc *al;
1533 int error;
1534
1535 al = gfs2_alloc_get(ip);
1536
1537 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1538 if (error)
1539 goto out_alloc;
1540
1541 error = gfs2_rindex_hold(ip->i_sbd, &al->al_ri_gh);
1542 if (error)
1543 goto out_quota;
1544
1545 error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
1546 if (error)
1547 goto out_rindex;
1548
1549 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
1550 error = ea_dealloc_indirect(ip);
1551 if (error)
1552 goto out_rindex;
1553 }
1554
1555 error = ea_dealloc_block(ip);
1556
1557 out_rindex:
1558 gfs2_glock_dq_uninit(&al->al_ri_gh);
1559
1560 out_quota:
1561 gfs2_quota_unhold(ip);
1562
1563 out_alloc:
1564 gfs2_alloc_put(ip);
1565
1566 return error;
1567}
1568
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..2b4152b1fcbe
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __EATTR_DOT_H__
11#define __EATTR_DOT_H__
12
13#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
14#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
15
16#define GFS2_EA_SIZE(ea) \
17ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
18 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
19 (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8)
20
21#define GFS2_EA_STRLEN(ea) \
22((((ea)->ea_type == GFS2_EATYPE_USR) ? 5 : 7) + (ea)->ea_name_len + 1)
23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
26
27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(uint64_t) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36
37#define GFS2_EA2DATAPTRS(ea) \
38((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
39
40#define GFS2_EA2NEXT(ea) \
41((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
42
43#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request {
49 char *er_name;
50 char *er_data;
51 unsigned int er_name_len;
52 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56};
57
58struct gfs2_ea_location {
59 struct buffer_head *el_bh;
60 struct gfs2_ea_header *el_ea;
61 struct gfs2_ea_header *el_prev;
62};
63
64int gfs2_ea_repack(struct gfs2_inode *ip);
65
66int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
67int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
68int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
69
70int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
73int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
74
75int gfs2_ea_dealloc(struct gfs2_inode *ip);
76
77/* Exported to acl.c */
78
79int gfs2_ea_find(struct gfs2_inode *ip,
80 struct gfs2_ea_request *er,
81 struct gfs2_ea_location *el);
82int gfs2_ea_get_copy(struct gfs2_inode *ip,
83 struct gfs2_ea_location *el,
84 char *data);
85int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
86 struct iattr *attr, char *data);
87
88#endif /* __EATTR_DOT_H__ */
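
An illustrative walk of one EA block using the accessors above (editorial,
not part of the header; the printk is a stand-in for real processing):

static inline void example_walk_ea_block(struct buffer_head *bh)
{
	struct gfs2_ea_header *ea;

	for (ea = GFS2_EA_BH2FIRST(bh); ; ea = GFS2_EA2NEXT(ea)) {
		if (ea->ea_type != GFS2_EATYPE_UNUSED)
			printk(KERN_DEBUG "ea %.*s: %u bytes%s\n",
			       ea->ea_name_len, GFS2_EA2NAME(ea),
			       GFS2_EA_DATA_LEN(ea),
			       GFS2_EA_IS_STUFFED(ea) ? "" : " (unstuffed)");
		if (GFS2_EA_IS_LAST(ea))
			break;
	}
}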
diff --git a/fs/gfs2/format.h b/fs/gfs2/format.h
new file mode 100644
index 000000000000..c7bf32ce3eca
--- /dev/null
+++ b/fs/gfs2/format.h
@@ -0,0 +1,21 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __FORMAT_DOT_H__
11#define __FORMAT_DOT_H__
12
13static const uint32_t gfs2_old_fs_formats[] = {
14 0
15};
16
17static const uint32_t gfs2_old_multihost_formats[] = {
18 0
19};
20
21#endif /* __FORMAT_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..57175f70e2bd
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GFS2_DOT_H__
11#define __GFS2_DOT_H__
12
13enum {
14 NO_CREATE = 0,
15 CREATE = 1,
16};
17
18enum {
19 NO_WAIT = 0,
20 WAIT = 1,
21};
22
23enum {
24 NO_FORCE = 0,
25 FORCE = 1,
26};
27
28#define GFS2_FAST_NAME_SIZE 8
29
30#endif /* __GFS2_DOT_H__ */
31
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..32cc4005307d
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2493 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/sort.h>
17#include <linux/jhash.h>
18#include <linux/kref.h>
19#include <linux/kallsyms.h>
20#include <linux/gfs2_ondisk.h>
21#include <asm/semaphore.h>
22#include <asm/uaccess.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "glock.h"
28#include "glops.h"
29#include "inode.h"
30#include "lm.h"
31#include "lops.h"
32#include "meta_io.h"
33#include "quota.h"
34#include "super.h"
35#include "util.h"
36
37/* Must be kept in sync with the beginning of struct gfs2_glock */
38struct glock_plug {
39 struct list_head gl_list;
40 unsigned long gl_flags;
41};
42
43struct greedy {
44 struct gfs2_holder gr_gh;
45 struct work_struct gr_work;
46};
47
48typedef void (*glock_examiner) (struct gfs2_glock * gl);
49
50/**
51 * relaxed_state_ok - is a requested lock compatible with the current lock mode?
52 * @actual: the current state of the lock
53 * @requested: the lock state that was requested by the caller
54 * @flags: the modifier flags passed in by the caller
55 *
56 * Returns: 1 if the locks are compatible, 0 otherwise
57 */
58
59static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
60 int flags)
61{
62 if (actual == requested)
63 return 1;
64
65 if (flags & GL_EXACT)
66 return 0;
67
68 if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
69 return 1;
70
71 if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
72 return 1;
73
74 return 0;
75}
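
The decision table these checks imply, for the common states (editorial;
LM_ST_DEFERRED matches only exactly or via LM_FLAG_ANY):

/*
 *   actual \ requested    SHARED    EXCLUSIVE
 *   UNLOCKED                 0          0
 *   SHARED                   1          0
 *   EXCLUSIVE                1*         1       (* 0 when GL_EXACT is set)
 *
 * LM_FLAG_ANY additionally satisfies the request from any actual state
 * other than LM_ST_UNLOCKED.
 */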
76
77/**
78 * gl_hash() - Turn a lock name into a hash bucket number
79 * @name: The lock name
80 *
81 * Returns: The number of the corresponding hash bucket
82 */
83
84static unsigned int gl_hash(struct lm_lockname *name)
85{
86 unsigned int h;
87
88 h = jhash(&name->ln_number, sizeof(uint64_t), 0);
89 h = jhash(&name->ln_type, sizeof(unsigned int), h);
90 h &= GFS2_GL_HASH_MASK;
91
92 return h;
93}
94
95/**
96 * glock_free() - Perform a few checks and then release struct gfs2_glock
97 * @gl: The glock to release
98 *
99 * Also calls lock module to release its internal structure for this glock.
100 *
101 */
102
103static void glock_free(struct gfs2_glock *gl)
104{
105 struct gfs2_sbd *sdp = gl->gl_sbd;
106 struct inode *aspace = gl->gl_aspace;
107
108 gfs2_lm_put_lock(sdp, gl->gl_lock);
109
110 if (aspace)
111 gfs2_aspace_put(aspace);
112
113 kmem_cache_free(gfs2_glock_cachep, gl);
114}
115
116/**
117 * gfs2_glock_hold() - increment reference count on glock
118 * @gl: The glock to hold
119 *
120 */
121
122void gfs2_glock_hold(struct gfs2_glock *gl)
123{
124 kref_get(&gl->gl_ref);
125}
126
127/* All work is done after the return from kref_put() so we
128 can release the write_lock before the free. */
129
130static void kill_glock(struct kref *kref)
131{
132 struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref);
133 struct gfs2_sbd *sdp = gl->gl_sbd;
134
135 gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
136 gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
137 gfs2_assert(sdp, list_empty(&gl->gl_holders));
138 gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
139 gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
140 gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
141}
142
143/**
144 * gfs2_glock_put() - Decrement reference count on glock
145 * @gl: The glock to put
146 * Returns: 1 if the glock was freed, 0 if it is still referenced
147 */
148
149int gfs2_glock_put(struct gfs2_glock *gl)
150{
151 struct gfs2_sbd *sdp = gl->gl_sbd;
152 struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket;
153 int rv = 0;
154
155 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
156
157 write_lock(&bucket->hb_lock);
158 if (kref_put(&gl->gl_ref, kill_glock)) {
159 list_del_init(&gl->gl_list);
160 write_unlock(&bucket->hb_lock);
161 BUG_ON(spin_is_locked(&gl->gl_spin));
162 glock_free(gl);
163 rv = 1;
164 goto out;
165 }
166 write_unlock(&bucket->hb_lock);
167 out:
168 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
169 return rv;
170}
171
172/**
173 * queue_empty - check to see if a glock's queue is empty
174 * @gl: the glock
175 * @head: the head of the queue to check
176 *
177 * This function protects the list in the event that a process already
178 * has a holder on the list and is adding a second holder for itself.
179 * The glmutex lock is what generally prevents processes from working
180 * on the same glock at once, but the special case of adding a second
181 * holder for yourself ("recursive" locking) doesn't involve locking
182 * glmutex, making the spin lock necessary.
183 *
184 * Returns: 1 if the queue is empty
185 */
186
187static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
188{
189 int empty;
190 spin_lock(&gl->gl_spin);
191 empty = list_empty(head);
192 spin_unlock(&gl->gl_spin);
193 return empty;
194}
195
196/**
197 * search_bucket() - Find struct gfs2_glock by lock number
198 * @bucket: the bucket to search
199 * @name: The lock name
200 *
201 * Returns: NULL, or the struct gfs2_glock with the requested number
202 */
203
204static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket,
205 struct lm_lockname *name)
206{
207 struct gfs2_glock *gl;
208
209 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
210 if (test_bit(GLF_PLUG, &gl->gl_flags))
211 continue;
212 if (!lm_name_equal(&gl->gl_name, name))
213 continue;
214
215 kref_get(&gl->gl_ref);
216
217 return gl;
218 }
219
220 return NULL;
221}
222
223/**
224 * gfs2_glock_find() - Find glock by lock number
225 * @sdp: The GFS2 superblock
226 * @name: The lock name
227 *
228 * Returns: NULL, or the struct gfs2_glock with the requested number
229 */
230
231struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
232 struct lm_lockname *name)
233{
234 struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)];
235 struct gfs2_glock *gl;
236
237 read_lock(&bucket->hb_lock);
238 gl = search_bucket(bucket, name);
239 read_unlock(&bucket->hb_lock);
240
241 return gl;
242}
243
244/**
245 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
246 * @sdp: The GFS2 superblock
247 * @number: the lock number
248 * @glops: The glock_operations to use
249 * @create: If 0, don't create the glock if it doesn't exist
250 * @glp: the glock is returned here
251 *
252 * This does not lock a glock, just finds/creates structures for one.
253 *
254 * Returns: errno
255 */
256
257int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number,
258 struct gfs2_glock_operations *glops, int create,
259 struct gfs2_glock **glp)
260{
261 struct lm_lockname name;
262 struct gfs2_glock *gl, *tmp;
263 struct gfs2_gl_hash_bucket *bucket;
264 int error;
265
266 name.ln_number = number;
267 name.ln_type = glops->go_type;
268 bucket = &sdp->sd_gl_hash[gl_hash(&name)];
269
270 read_lock(&bucket->hb_lock);
271 gl = search_bucket(bucket, &name);
272 read_unlock(&bucket->hb_lock);
273
274 if (gl || !create) {
275 *glp = gl;
276 return 0;
277 }
278
279 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
280 if (!gl)
281 return -ENOMEM;
282
283 memset(gl, 0, sizeof(struct gfs2_glock));
284
285 INIT_LIST_HEAD(&gl->gl_list);
286 gl->gl_name = name;
287 kref_init(&gl->gl_ref);
288
289 spin_lock_init(&gl->gl_spin);
290
291 gl->gl_state = LM_ST_UNLOCKED;
292 INIT_LIST_HEAD(&gl->gl_holders);
293 INIT_LIST_HEAD(&gl->gl_waiters1);
294 INIT_LIST_HEAD(&gl->gl_waiters2);
295 INIT_LIST_HEAD(&gl->gl_waiters3);
296
297 gl->gl_ops = glops;
298
299 gl->gl_bucket = bucket;
300 INIT_LIST_HEAD(&gl->gl_reclaim);
301
302 gl->gl_sbd = sdp;
303
304 lops_init_le(&gl->gl_le, &gfs2_glock_lops);
305 INIT_LIST_HEAD(&gl->gl_ail_list);
306
307 /* If this glock protects actual on-disk data or metadata blocks,
308 create a VFS inode to manage the pages/buffers holding them. */
309 if (glops == &gfs2_inode_glops ||
310 glops == &gfs2_rgrp_glops ||
311 glops == &gfs2_meta_glops) {
312 gl->gl_aspace = gfs2_aspace_get(sdp);
313 if (!gl->gl_aspace) {
314 error = -ENOMEM;
315 goto fail;
316 }
317 }
318
319 error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
320 if (error)
321 goto fail_aspace;
322
323 write_lock(&bucket->hb_lock);
324 tmp = search_bucket(bucket, &name);
325 if (tmp) {
326 write_unlock(&bucket->hb_lock);
327 glock_free(gl);
328 gl = tmp;
329 } else {
330 list_add_tail(&gl->gl_list, &bucket->hb_list);
331 write_unlock(&bucket->hb_lock);
332 }
333
334 *glp = gl;
335
336 return 0;
337
338 fail_aspace:
339 if (gl->gl_aspace)
340 gfs2_aspace_put(gl->gl_aspace);
341
342 fail:
343 kmem_cache_free(gfs2_glock_cachep, gl);
344
345 return error;
346}
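
A caller-side sketch (editorial): look up a glock and acquire it through a
stack holder.  gfs2_glock_nq_init()/gfs2_glock_dq_uninit(), used throughout
the filesystem code, bundle the same steps; the long form is shown here for
clarity, with error handling abbreviated:

static int example_lock_inode_glock(struct gfs2_sbd *sdp, uint64_t number)
{
	struct gfs2_glock *gl;
	struct gfs2_holder gh;
	int error;

	error = gfs2_glock_get(sdp, number, &gfs2_inode_glops, CREATE, &gl);
	if (error)
		return error;

	gfs2_holder_init(gl, LM_ST_SHARED, 0, &gh);	/* takes its own ref */
	error = gfs2_glock_nq(&gh);		/* may block until granted */
	if (!error)
		gfs2_glock_dq(&gh);
	gfs2_holder_uninit(&gh);		/* drops the holder's ref */

	gfs2_glock_put(gl);	/* drop the ref gfs2_glock_get() returned */
	return error;
}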
347
348/**
349 * gfs2_holder_init - initialize a struct gfs2_holder in the default way
350 * @gl: the glock
351 * @state: the state we're requesting
352 * @flags: the modifier flags
353 * @gh: the holder structure
354 *
355 */
356
357void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
358 struct gfs2_holder *gh)
359{
360 flags |= GL_NEVER_RECURSE;
361 INIT_LIST_HEAD(&gh->gh_list);
362 gh->gh_gl = gl;
363 gh->gh_ip = (unsigned long)__builtin_return_address(0);
364 gh->gh_owner = current;
365 gh->gh_state = state;
366 gh->gh_flags = flags;
367 gh->gh_error = 0;
368 gh->gh_iflags = 0;
369 init_completion(&gh->gh_wait);
370
371 if (gh->gh_state == LM_ST_EXCLUSIVE)
372 gh->gh_flags |= GL_LOCAL_EXCL;
373
374 gfs2_glock_hold(gl);
375}
376
377/**
378 * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
379 * @state: the state we're requesting
380 * @flags: the modifier flags
381 * @gh: the holder structure
382 *
383 * Don't mess with the glock.
384 *
385 */
386
387void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
388{
389 gh->gh_state = state;
390 gh->gh_flags = flags | GL_NEVER_RECURSE;
391 if (gh->gh_state == LM_ST_EXCLUSIVE)
392 gh->gh_flags |= GL_LOCAL_EXCL;
393
394 gh->gh_iflags &= 1 << HIF_ALLOCED;
395 gh->gh_ip = (unsigned long)__builtin_return_address(0);
396}
397
398/**
399 * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
400 * @gh: the holder structure
401 *
402 */
403
404void gfs2_holder_uninit(struct gfs2_holder *gh)
405{
406 gfs2_glock_put(gh->gh_gl);
407 gh->gh_gl = NULL;
408 gh->gh_ip = 0;
409}
410
411/**
412 * gfs2_holder_get - get a struct gfs2_holder structure
413 * @gl: the glock
414 * @state: the state we're requesting
415 * @flags: the modifier flags
416 * @gfp_flags: memory allocation flags (callers pass GFP_KERNEL | __GFP_NOFAIL)
417 *
418 * Figure out how big an impact this function has. Either:
419 * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
420 * 2) Leave it like it is
421 *
422 * Returns: the holder structure, NULL on ENOMEM
423 */
424
425struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, unsigned int state,
426 int flags, gfp_t gfp_flags)
427{
428 struct gfs2_holder *gh;
429
430 gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
431 if (!gh)
432 return NULL;
433
434 gfs2_holder_init(gl, state, flags, gh);
435 set_bit(HIF_ALLOCED, &gh->gh_iflags);
436 gh->gh_ip = (unsigned long)__builtin_return_address(0);
437 return gh;
438}
439
440/**
441 * gfs2_holder_put - get rid of a struct gfs2_holder structure
442 * @gh: the holder structure
443 *
444 */
445
446void gfs2_holder_put(struct gfs2_holder *gh)
447{
448 gfs2_holder_uninit(gh);
449 kfree(gh);
450}
451
452/**
453 * handle_recurse - put other holder structures (marked recursive)
454 * into the holders list
455 * @gh: the holder structure
456 *
457 */
458
459static void handle_recurse(struct gfs2_holder *gh)
460{
461 struct gfs2_glock *gl = gh->gh_gl;
462 struct gfs2_sbd *sdp = gl->gl_sbd;
463 struct gfs2_holder *tmp_gh, *safe;
464 int found = 0;
465
466 BUG_ON(!spin_is_locked(&gl->gl_spin));
467
468	printk(KERN_INFO "recursion %016llx, %u\n",
469	       (unsigned long long)gl->gl_name.ln_number, gl->gl_name.ln_type);
470
471 if (gfs2_assert_warn(sdp, gh->gh_owner))
472 return;
473
474 list_for_each_entry_safe(tmp_gh, safe, &gl->gl_waiters3, gh_list) {
475 if (tmp_gh->gh_owner != gh->gh_owner)
476 continue;
477
478 gfs2_assert_warn(sdp,
479 test_bit(HIF_RECURSE, &tmp_gh->gh_iflags));
480
481 list_move_tail(&tmp_gh->gh_list, &gl->gl_holders);
482 tmp_gh->gh_error = 0;
483 set_bit(HIF_HOLDER, &tmp_gh->gh_iflags);
484
485 complete(&tmp_gh->gh_wait);
486
487 found = 1;
488 }
489
490 gfs2_assert_warn(sdp, found);
491}
492
493/**
494 * do_unrecurse - a recursive holder was just dropped off the waiters3 list
495 * @gh: the holder
496 *
497 * If there is only one other recursive holder, clear its HIF_RECURSE bit.
498 * If there is more than one, leave them alone.
499 *
500 */
501
502static void do_unrecurse(struct gfs2_holder *gh)
503{
504 struct gfs2_glock *gl = gh->gh_gl;
505 struct gfs2_sbd *sdp = gl->gl_sbd;
506 struct gfs2_holder *tmp_gh, *last_gh = NULL;
507 int found = 0;
508
509 BUG_ON(!spin_is_locked(&gl->gl_spin));
510
511 if (gfs2_assert_warn(sdp, gh->gh_owner))
512 return;
513
514 list_for_each_entry(tmp_gh, &gl->gl_waiters3, gh_list) {
515 if (tmp_gh->gh_owner != gh->gh_owner)
516 continue;
517
518 gfs2_assert_warn(sdp,
519 test_bit(HIF_RECURSE, &tmp_gh->gh_iflags));
520
521 if (found)
522 return;
523
524 found = 1;
525 last_gh = tmp_gh;
526 }
527
528 if (!gfs2_assert_warn(sdp, found))
529 clear_bit(HIF_RECURSE, &last_gh->gh_iflags);
530}
531
532/**
533 * rq_mutex - process a mutex request in the queue
534 * @gh: the glock holder
535 *
536 * Returns: 1 if the queue is blocked
537 */
538
539static int rq_mutex(struct gfs2_holder *gh)
540{
541 struct gfs2_glock *gl = gh->gh_gl;
542
543 list_del_init(&gh->gh_list);
544 /* gh->gh_error never examined. */
545 set_bit(GLF_LOCK, &gl->gl_flags);
546 complete(&gh->gh_wait);
547
548 return 1;
549}
550
551/**
552 * rq_promote - process a promote request in the queue
553 * @gh: the glock holder
554 *
555 * Acquire a new inter-node lock, or change a lock to a more restrictive state.
556 *
557 * Returns: 1 if the queue is blocked
558 */
559
560static int rq_promote(struct gfs2_holder *gh)
561{
562 struct gfs2_glock *gl = gh->gh_gl;
563 struct gfs2_sbd *sdp = gl->gl_sbd;
564 struct gfs2_glock_operations *glops = gl->gl_ops;
565 int recurse;
566
567 if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
568 if (list_empty(&gl->gl_holders)) {
569 gl->gl_req_gh = gh;
570 set_bit(GLF_LOCK, &gl->gl_flags);
571 spin_unlock(&gl->gl_spin);
572
573 if (atomic_read(&sdp->sd_reclaim_count) >
574 gfs2_tune_get(sdp, gt_reclaim_limit) &&
575 !(gh->gh_flags & LM_FLAG_PRIORITY)) {
576 gfs2_reclaim_glock(sdp);
577 gfs2_reclaim_glock(sdp);
578 }
579
580 glops->go_xmote_th(gl, gh->gh_state,
581 gh->gh_flags);
582
583 spin_lock(&gl->gl_spin);
584 }
585 return 1;
586 }
587
588 if (list_empty(&gl->gl_holders)) {
589 set_bit(HIF_FIRST, &gh->gh_iflags);
590 set_bit(GLF_LOCK, &gl->gl_flags);
591 recurse = 0;
592 } else {
593 struct gfs2_holder *next_gh;
594 if (gh->gh_flags & GL_LOCAL_EXCL)
595 return 1;
596 next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
597 gh_list);
598 if (next_gh->gh_flags & GL_LOCAL_EXCL)
599 return 1;
600 recurse = test_bit(HIF_RECURSE, &gh->gh_iflags);
601 }
602
603 list_move_tail(&gh->gh_list, &gl->gl_holders);
604 gh->gh_error = 0;
605 set_bit(HIF_HOLDER, &gh->gh_iflags);
606
607 if (recurse)
608 handle_recurse(gh);
609
610 complete(&gh->gh_wait);
611
612 return 0;
613}
614
615/**
616 * rq_demote - process a demote request in the queue
617 * @gh: the glock holder
618 *
619 * Returns: 1 if the queue is blocked
620 */
621
622static int rq_demote(struct gfs2_holder *gh)
623{
624 struct gfs2_glock *gl = gh->gh_gl;
625 struct gfs2_glock_operations *glops = gl->gl_ops;
626
627 if (!list_empty(&gl->gl_holders))
628 return 1;
629
630 if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
631 list_del_init(&gh->gh_list);
632 gh->gh_error = 0;
633 spin_unlock(&gl->gl_spin);
634 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
635 gfs2_holder_put(gh);
636 else
637 complete(&gh->gh_wait);
638 spin_lock(&gl->gl_spin);
639 } else {
640 gl->gl_req_gh = gh;
641 set_bit(GLF_LOCK, &gl->gl_flags);
642 spin_unlock(&gl->gl_spin);
643
644 if (gh->gh_state == LM_ST_UNLOCKED ||
645 gl->gl_state != LM_ST_EXCLUSIVE)
646 glops->go_drop_th(gl);
647 else
648 glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
649
650 spin_lock(&gl->gl_spin);
651 }
652
653 return 0;
654}
655
656/**
657 * rq_greedy - process a queued request to drop greedy status
658 * @gh: the glock holder
659 *
660 * Returns: 1 if the queue is blocked
661 */
662
663static int rq_greedy(struct gfs2_holder *gh)
664{
665 struct gfs2_glock *gl = gh->gh_gl;
666
667 list_del_init(&gh->gh_list);
668 /* gh->gh_error never examined. */
669 clear_bit(GLF_GREEDY, &gl->gl_flags);
670 spin_unlock(&gl->gl_spin);
671
672 gfs2_holder_uninit(gh);
673 kfree(container_of(gh, struct greedy, gr_gh));
674
675 spin_lock(&gl->gl_spin);
676
677 return 0;
678}
679
680/**
681 * run_queue - process holder structures on a glock
682 * @gl: the glock
683 *
684 */
685static void run_queue(struct gfs2_glock *gl)
686{
687 struct gfs2_holder *gh;
688 int blocked = 1;
689
690 for (;;) {
691 if (test_bit(GLF_LOCK, &gl->gl_flags))
692 break;
693
694 if (!list_empty(&gl->gl_waiters1)) {
695 gh = list_entry(gl->gl_waiters1.next,
696 struct gfs2_holder, gh_list);
697
698 if (test_bit(HIF_MUTEX, &gh->gh_iflags))
699 blocked = rq_mutex(gh);
700 else
701 gfs2_assert_warn(gl->gl_sbd, 0);
702
703 } else if (!list_empty(&gl->gl_waiters2) &&
704 !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
705 gh = list_entry(gl->gl_waiters2.next,
706 struct gfs2_holder, gh_list);
707
708 if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
709 blocked = rq_demote(gh);
710 else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
711 blocked = rq_greedy(gh);
712 else
713 gfs2_assert_warn(gl->gl_sbd, 0);
714
715 } else if (!list_empty(&gl->gl_waiters3)) {
716 gh = list_entry(gl->gl_waiters3.next,
717 struct gfs2_holder, gh_list);
718
719 if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
720 blocked = rq_promote(gh);
721 else
722 gfs2_assert_warn(gl->gl_sbd, 0);
723
724 } else
725 break;
726
727 if (blocked)
728 break;
729 }
730}
731
732/**
733 * gfs2_glmutex_lock - acquire a local lock on a glock
734 * @gl: the glock
735 *
736 * Gives caller exclusive access to manipulate a glock structure.
737 */
738
739void gfs2_glmutex_lock(struct gfs2_glock *gl)
740{
741 struct gfs2_holder gh;
742
743 gfs2_holder_init(gl, 0, 0, &gh);
744 set_bit(HIF_MUTEX, &gh.gh_iflags);
745
746 spin_lock(&gl->gl_spin);
747 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
748 list_add_tail(&gh.gh_list, &gl->gl_waiters1);
749 else
750 complete(&gh.gh_wait);
751 spin_unlock(&gl->gl_spin);
752
753 wait_for_completion(&gh.gh_wait);
754 gfs2_holder_uninit(&gh);
755}
756
757/**
758 * gfs2_glmutex_trylock - try to acquire a local lock on a glock
759 * @gl: the glock
760 *
761 * Returns: 1 if the glock is acquired
762 */
763
764int gfs2_glmutex_trylock(struct gfs2_glock *gl)
765{
766 int acquired = 1;
767
768 spin_lock(&gl->gl_spin);
769 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
770 acquired = 0;
771 spin_unlock(&gl->gl_spin);
772
773 return acquired;
774}
775
776/**
777 * gfs2_glmutex_unlock - release a local lock on a glock
778 * @gl: the glock
779 *
780 */
781
782void gfs2_glmutex_unlock(struct gfs2_glock *gl)
783{
784 spin_lock(&gl->gl_spin);
785 clear_bit(GLF_LOCK, &gl->gl_flags);
786 run_queue(gl);
787 BUG_ON(!spin_is_locked(&gl->gl_spin));
788 spin_unlock(&gl->gl_spin);
789}
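
A sketch of the glmutex pattern (editorial): these calls serialize access to
the glock structure itself (the GLF_LOCK bit), independently of the
inter-node lock state:

static void example_probe_glock(struct gfs2_glock *gl)
{
	if (!gfs2_glmutex_trylock(gl))
		return;		/* another task owns GLF_LOCK right now */

	/* ... the glock structure may be examined or modified safely ... */

	gfs2_glmutex_unlock(gl);	/* clears GLF_LOCK, runs the queue */
}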
790
791/**
792 * handle_callback - add a demote request to a lock's queue
793 * @gl: the glock
794 * @state: the state the caller wants us to change to
795 *
796 */
797
798static void handle_callback(struct gfs2_glock *gl, unsigned int state)
799{
800 struct gfs2_holder *gh, *new_gh = NULL;
801
802 restart:
803 spin_lock(&gl->gl_spin);
804
805 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
806 if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
807 gl->gl_req_gh != gh) {
808 if (gh->gh_state != state)
809 gh->gh_state = LM_ST_UNLOCKED;
810 goto out;
811 }
812 }
813
814 if (new_gh) {
815 list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
816 new_gh = NULL;
817 } else {
818 spin_unlock(&gl->gl_spin);
819
820 new_gh = gfs2_holder_get(gl, state,
821 LM_FLAG_TRY | GL_NEVER_RECURSE,
822				 GFP_KERNEL | __GFP_NOFAIL);
823 set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
824 set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
825
826 goto restart;
827 }
828
829 out:
830 spin_unlock(&gl->gl_spin);
831
832 if (new_gh)
833 gfs2_holder_put(new_gh);
834}
835
836/**
837 * state_change - record that the glock is now in a different state
838 * @gl: the glock
839 * @new_state: the new state
840 *
841 */
842
843static void state_change(struct gfs2_glock *gl, unsigned int new_state)
844{
845 int held1, held2;
846
847 held1 = (gl->gl_state != LM_ST_UNLOCKED);
848 held2 = (new_state != LM_ST_UNLOCKED);
849
850 if (held1 != held2) {
851 if (held2)
852 gfs2_glock_hold(gl);
853 else
854 gfs2_glock_put(gl);
855 }
856
857 gl->gl_state = new_state;
858}
859
860/**
861 * xmote_bh - Called after the lock module is done acquiring a lock
862 * @gl: The glock in question
863 * @ret: the int returned from the lock module
864 *
865 */
866
867static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
868{
869 struct gfs2_sbd *sdp = gl->gl_sbd;
870 struct gfs2_glock_operations *glops = gl->gl_ops;
871 struct gfs2_holder *gh = gl->gl_req_gh;
872 int prev_state = gl->gl_state;
873 int op_done = 1;
874
875 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
876 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
877 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
878
879 state_change(gl, ret & LM_OUT_ST_MASK);
880
881 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
882 if (glops->go_inval)
883 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
884 } else if (gl->gl_state == LM_ST_DEFERRED) {
885 /* We might not want to do this here.
886 Look at moving to the inode glops. */
887 if (glops->go_inval)
888 glops->go_inval(gl, DIO_DATA);
889 }
890
891 /* Deal with each possible exit condition */
892
893 if (!gh)
894 gl->gl_stamp = jiffies;
895
896 else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
897 spin_lock(&gl->gl_spin);
898 list_del_init(&gh->gh_list);
899 gh->gh_error = -EIO;
900 if (test_bit(HIF_RECURSE, &gh->gh_iflags))
901 do_unrecurse(gh);
902 spin_unlock(&gl->gl_spin);
903
904 } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
905 spin_lock(&gl->gl_spin);
906 list_del_init(&gh->gh_list);
907 if (gl->gl_state == gh->gh_state ||
908 gl->gl_state == LM_ST_UNLOCKED)
909 gh->gh_error = 0;
910 else {
911 if (gfs2_assert_warn(sdp, gh->gh_flags &
912 (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
913 fs_warn(sdp, "ret = 0x%.8X\n", ret);
914 gh->gh_error = GLR_TRYFAILED;
915 }
916 spin_unlock(&gl->gl_spin);
917
918 if (ret & LM_OUT_CANCELED)
919 handle_callback(gl, LM_ST_UNLOCKED); /* Lame */
920
921 } else if (ret & LM_OUT_CANCELED) {
922 spin_lock(&gl->gl_spin);
923 list_del_init(&gh->gh_list);
924 gh->gh_error = GLR_CANCELED;
925 if (test_bit(HIF_RECURSE, &gh->gh_iflags))
926 do_unrecurse(gh);
927 spin_unlock(&gl->gl_spin);
928
929 } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
930 spin_lock(&gl->gl_spin);
931 list_move_tail(&gh->gh_list, &gl->gl_holders);
932 gh->gh_error = 0;
933 set_bit(HIF_HOLDER, &gh->gh_iflags);
934 spin_unlock(&gl->gl_spin);
935
936 set_bit(HIF_FIRST, &gh->gh_iflags);
937
938 op_done = 0;
939
940 } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
941 spin_lock(&gl->gl_spin);
942 list_del_init(&gh->gh_list);
943 gh->gh_error = GLR_TRYFAILED;
944 if (test_bit(HIF_RECURSE, &gh->gh_iflags))
945 do_unrecurse(gh);
946 spin_unlock(&gl->gl_spin);
947
948 } else {
949 if (gfs2_assert_withdraw(sdp, 0) == -1)
950 fs_err(sdp, "ret = 0x%.8X\n", ret);
951 }
952
953 if (glops->go_xmote_bh)
954 glops->go_xmote_bh(gl);
955
956 if (op_done) {
957 spin_lock(&gl->gl_spin);
958 gl->gl_req_gh = NULL;
959 gl->gl_req_bh = NULL;
960 clear_bit(GLF_LOCK, &gl->gl_flags);
961 run_queue(gl);
962 spin_unlock(&gl->gl_spin);
963 }
964
965 gfs2_glock_put(gl);
966
967 if (gh) {
968 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
969 gfs2_holder_put(gh);
970 else
971 complete(&gh->gh_wait);
972 }
973}
974
975/**
976 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
977 * @gl: The glock in question
978 * @state: the requested state
979 * @flags: modifier flags to the lock call
980 *
981 */
982
983void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
984{
985 struct gfs2_sbd *sdp = gl->gl_sbd;
986 struct gfs2_glock_operations *glops = gl->gl_ops;
987 int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
988 LM_FLAG_NOEXP | LM_FLAG_ANY |
989 LM_FLAG_PRIORITY);
990 unsigned int lck_ret;
991
992 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
993 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
994 gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
995 gfs2_assert_warn(sdp, state != gl->gl_state);
996
997 if (gl->gl_state == LM_ST_EXCLUSIVE) {
998 if (glops->go_sync)
999 glops->go_sync(gl,
1000 DIO_METADATA | DIO_DATA | DIO_RELEASE);
1001 }
1002
1003 gfs2_glock_hold(gl);
1004 gl->gl_req_bh = xmote_bh;
1005
1006 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state,
1007 lck_flags);
1008
1009 if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
1010 return;
1011
1012 if (lck_ret & LM_OUT_ASYNC)
1013 gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
1014 else
1015 xmote_bh(gl, lck_ret);
1016}
1017
1018/**
1019 * drop_bh - Called after a lock module unlock completes
1020 * @gl: the glock
1021 * @ret: the return status
1022 *
1023 * Wakes up the process waiting on the struct gfs2_holder (if any)
1024 * and drops the reference on the glock that the top half took out
1025 *
1026 */
1027
1028static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
1029{
1030 struct gfs2_sbd *sdp = gl->gl_sbd;
1031 struct gfs2_glock_operations *glops = gl->gl_ops;
1032 struct gfs2_holder *gh = gl->gl_req_gh;
1033
1034 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1035
1036 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1037 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1038 gfs2_assert_warn(sdp, !ret);
1039
1040 state_change(gl, LM_ST_UNLOCKED);
1041
1042 if (glops->go_inval)
1043 glops->go_inval(gl, DIO_METADATA | DIO_DATA);
1044
1045 if (gh) {
1046 spin_lock(&gl->gl_spin);
1047 list_del_init(&gh->gh_list);
1048 gh->gh_error = 0;
1049 spin_unlock(&gl->gl_spin);
1050 }
1051
1052 if (glops->go_drop_bh)
1053 glops->go_drop_bh(gl);
1054
1055 spin_lock(&gl->gl_spin);
1056 gl->gl_req_gh = NULL;
1057 gl->gl_req_bh = NULL;
1058 clear_bit(GLF_LOCK, &gl->gl_flags);
1059 run_queue(gl);
1060 spin_unlock(&gl->gl_spin);
1061
1062 gfs2_glock_put(gl);
1063
1064 if (gh) {
1065 if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
1066 gfs2_holder_put(gh);
1067 else
1068 complete(&gh->gh_wait);
1069 }
1070}
1071
1072/**
1073 * gfs2_glock_drop_th - call into the lock module to unlock a lock
1074 * @gl: the glock
1075 *
1076 */
1077
1078void gfs2_glock_drop_th(struct gfs2_glock *gl)
1079{
1080 struct gfs2_sbd *sdp = gl->gl_sbd;
1081 struct gfs2_glock_operations *glops = gl->gl_ops;
1082 unsigned int ret;
1083
1084 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1085 gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
1086 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
1087
1088 if (gl->gl_state == LM_ST_EXCLUSIVE) {
1089 if (glops->go_sync)
1090 glops->go_sync(gl,
1091 DIO_METADATA | DIO_DATA | DIO_RELEASE);
1092 }
1093
1094 gfs2_glock_hold(gl);
1095 gl->gl_req_bh = drop_bh;
1096
1097 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
1098
1099 if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
1100 return;
1101
1102 if (!ret)
1103 drop_bh(gl, ret);
1104 else
1105 gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
1106}
1107
1108/**
1109 * do_cancels - cancel requests for locks stuck waiting on an expire flag
1110 * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
1111 *
1112 * Don't cancel GL_NOCANCEL requests.
1113 */
1114
1115static void do_cancels(struct gfs2_holder *gh)
1116{
1117 struct gfs2_glock *gl = gh->gh_gl;
1118
1119 spin_lock(&gl->gl_spin);
1120
1121 while (gl->gl_req_gh != gh &&
1122 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1123 !list_empty(&gh->gh_list)) {
1124 if (gl->gl_req_bh &&
1125 !(gl->gl_req_gh &&
1126 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
1127 spin_unlock(&gl->gl_spin);
1128 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
1129 msleep(100);
1130 spin_lock(&gl->gl_spin);
1131 } else {
1132 spin_unlock(&gl->gl_spin);
1133 msleep(100);
1134 spin_lock(&gl->gl_spin);
1135 }
1136 }
1137
1138 spin_unlock(&gl->gl_spin);
1139}
1140
1141/**
1142 * glock_wait_internal - wait on a glock acquisition
1143 * @gh: the glock holder
1144 *
1145 * Returns: 0 on success
1146 */
1147
1148static int glock_wait_internal(struct gfs2_holder *gh)
1149{
1150 struct gfs2_glock *gl = gh->gh_gl;
1151 struct gfs2_sbd *sdp = gl->gl_sbd;
1152 struct gfs2_glock_operations *glops = gl->gl_ops;
1153
1154 if (test_bit(HIF_ABORTED, &gh->gh_iflags))
1155 return -EIO;
1156
1157 if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
1158 spin_lock(&gl->gl_spin);
1159 if (gl->gl_req_gh != gh &&
1160 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
1161 !list_empty(&gh->gh_list)) {
1162 list_del_init(&gh->gh_list);
1163 gh->gh_error = GLR_TRYFAILED;
1164 if (test_bit(HIF_RECURSE, &gh->gh_iflags))
1165 do_unrecurse(gh);
1166 run_queue(gl);
1167 spin_unlock(&gl->gl_spin);
1168 return gh->gh_error;
1169 }
1170 spin_unlock(&gl->gl_spin);
1171 }
1172
1173 if (gh->gh_flags & LM_FLAG_PRIORITY)
1174 do_cancels(gh);
1175
1176 wait_for_completion(&gh->gh_wait);
1177
1178 if (gh->gh_error)
1179 return gh->gh_error;
1180
1181 gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
1182 gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state,
1183 gh->gh_state,
1184 gh->gh_flags));
1185
1186 if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
1187 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
1188
1189 if (glops->go_lock) {
1190 gh->gh_error = glops->go_lock(gh);
1191 if (gh->gh_error) {
1192 spin_lock(&gl->gl_spin);
1193 list_del_init(&gh->gh_list);
1194 if (test_and_clear_bit(HIF_RECURSE,
1195 &gh->gh_iflags))
1196 do_unrecurse(gh);
1197 spin_unlock(&gl->gl_spin);
1198 }
1199 }
1200
1201 spin_lock(&gl->gl_spin);
1202 gl->gl_req_gh = NULL;
1203 gl->gl_req_bh = NULL;
1204 clear_bit(GLF_LOCK, &gl->gl_flags);
1205 if (test_bit(HIF_RECURSE, &gh->gh_iflags))
1206 handle_recurse(gh);
1207 run_queue(gl);
1208 spin_unlock(&gl->gl_spin);
1209 }
1210
1211 return gh->gh_error;
1212}
1213
1214static inline struct gfs2_holder *
1215find_holder_by_owner(struct list_head *head, struct task_struct *owner)
1216{
1217 struct gfs2_holder *gh;
1218
1219 list_for_each_entry(gh, head, gh_list) {
1220 if (gh->gh_owner == owner)
1221 return gh;
1222 }
1223
1224 return NULL;
1225}
1226
1227/**
1228 * recurse_check - check that a new holder is compatible with an existing one
1229 *
1230 * Make sure the new holder is compatible with the pre-existing one.
1231 *
1232 */
1233
1234static int recurse_check(struct gfs2_holder *existing, struct gfs2_holder *new,
1235 unsigned int state)
1236{
1237 struct gfs2_sbd *sdp = existing->gh_gl->gl_sbd;
1238
1239 if (gfs2_assert_warn(sdp, (new->gh_flags & LM_FLAG_ANY) ||
1240 !(existing->gh_flags & LM_FLAG_ANY)))
1241 goto fail;
1242
1243 if (gfs2_assert_warn(sdp, (existing->gh_flags & GL_LOCAL_EXCL) ||
1244 !(new->gh_flags & GL_LOCAL_EXCL)))
1245 goto fail;
1246
1247 if (gfs2_assert_warn(sdp, relaxed_state_ok(state, new->gh_state,
1248 new->gh_flags)))
1249 goto fail;
1250
1251 return 0;
1252
1253fail:
1254 print_symbol(KERN_WARNING "GFS2: Existing holder from %s\n",
1255 existing->gh_ip);
1256 print_symbol(KERN_WARNING "GFS2: New holder from %s\n", new->gh_ip);
1257 set_bit(HIF_ABORTED, &new->gh_iflags);
1258 return -EINVAL;
1259}
1260
1261/**
1262 * add_to_queue - Add a holder to the wait queue (but look for recursion)
1263 * @gh: the holder structure to add
1264 *
1265 */
1266
1267static void add_to_queue(struct gfs2_holder *gh)
1268{
1269 struct gfs2_glock *gl = gh->gh_gl;
1270 struct gfs2_holder *existing;
1271
1272 /* An ownerless holder can never recurse; just queue it normally */
1273
1274 if (!gh->gh_owner)
1275 goto out;
1276
1277 existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
1278 if (existing) {
1279 if (recurse_check(existing, gh, gl->gl_state))
1280 return;
1281
1282 list_add_tail(&gh->gh_list, &gl->gl_holders);
1283 set_bit(HIF_HOLDER, &gh->gh_iflags);
1284
1285 gh->gh_error = 0;
1286 complete(&gh->gh_wait);
1287
1288 return;
1289 }
1290
1291 existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
1292 if (existing) {
1293 if (recurse_check(existing, gh, existing->gh_state))
1294 return;
1295
1296 set_bit(HIF_RECURSE, &gh->gh_iflags);
1297 set_bit(HIF_RECURSE, &existing->gh_iflags);
1298
1299 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1300
1301 return;
1302 }
1303
1304 out:
1305 if (gh->gh_flags & LM_FLAG_PRIORITY)
1306 list_add(&gh->gh_list, &gl->gl_waiters3);
1307 else
1308 list_add_tail(&gh->gh_list, &gl->gl_waiters3);
1309}
1310
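/*
 * Illustrative sketch, not part of the original commit: the recursion
 * handling above means a task that already holds a glock can queue a
 * second, compatible holder and have it granted at once instead of
 * deadlocking against itself.
 */
static int example_recursive_hold(struct gfs2_glock *gl)
{
	struct gfs2_holder gh1, gh2;
	int error;

	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh1);
	if (error)
		return error;

	/* Same owner, compatible state: granted via recurse_check() */
	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh2);
	if (!error)
		gfs2_glock_dq_uninit(&gh2);

	gfs2_glock_dq_uninit(&gh1);
	return error;
}
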
1311/**
1312 * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
1313 * @gh: the holder structure
1314 *
1315 * if (gh->gh_flags & GL_ASYNC), this never returns an error
1316 *
1317 * Returns: 0, GLR_TRYFAILED, or errno on failure
1318 */
1319
1320int gfs2_glock_nq(struct gfs2_holder *gh)
1321{
1322 struct gfs2_glock *gl = gh->gh_gl;
1323 struct gfs2_sbd *sdp = gl->gl_sbd;
1324 int error = 0;
1325
1326 restart:
1327 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
1328 set_bit(HIF_ABORTED, &gh->gh_iflags);
1329 return -EIO;
1330 }
1331
1332 set_bit(HIF_PROMOTE, &gh->gh_iflags);
1333
1334 spin_lock(&gl->gl_spin);
1335 add_to_queue(gh);
1336 run_queue(gl);
1337 spin_unlock(&gl->gl_spin);
1338
1339 if (!(gh->gh_flags & GL_ASYNC)) {
1340 error = glock_wait_internal(gh);
1341 if (error == GLR_CANCELED) {
1342 msleep(100);
1343 goto restart;
1344 }
1345 }
1346
1347 clear_bit(GLF_PREFETCH, &gl->gl_flags);
1348
1349 return error;
1350}
1351
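/*
 * Illustrative sketch, not part of the original commit: a trylock-style
 * caller passes LM_FLAG_TRY and must treat GLR_TRYFAILED as "busy"
 * rather than as a hard error; the -EBUSY mapping here is only an
 * example convention.
 */
static int example_try_lock(struct gfs2_glock *gl)
{
	struct gfs2_holder gh;
	int error;

	gfs2_holder_init(gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, &gh);
	error = gfs2_glock_nq(&gh);
	if (error) {
		gfs2_holder_uninit(&gh);
		return (error == GLR_TRYFAILED) ? -EBUSY : error;
	}
	/* ... exclusive work ... */
	gfs2_glock_dq_uninit(&gh);
	return 0;
}
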
1352/**
1353 * gfs2_glock_poll - poll to see if an async request has been completed
1354 * @gh: the holder
1355 *
1356 * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
1357 */
1358
1359int gfs2_glock_poll(struct gfs2_holder *gh)
1360{
1361 struct gfs2_glock *gl = gh->gh_gl;
1362 int ready = 0;
1363
1364 spin_lock(&gl->gl_spin);
1365
1366 if (test_bit(HIF_HOLDER, &gh->gh_iflags))
1367 ready = 1;
1368 else if (list_empty(&gh->gh_list)) {
1369 if (gh->gh_error == GLR_CANCELED) {
1370 spin_unlock(&gl->gl_spin);
1371 msleep(100);
1372 if (gfs2_glock_nq(gh))
1373 return 1;
1374 return 0;
1375 } else
1376 ready = 1;
1377 }
1378
1379 spin_unlock(&gl->gl_spin);
1380
1381 return ready;
1382}
1383
1384/**
1385 * gfs2_glock_wait - wait for a lock acquisition started with GL_ASYNC to complete
1386 * @gh: the holder structure
1387 *
1388 * Returns: 0, GLR_TRYFAILED, or errno on failure
1389 */
1390
1391int gfs2_glock_wait(struct gfs2_holder *gh)
1392{
1393 int error;
1394
1395 error = glock_wait_internal(gh);
1396 if (error == GLR_CANCELED) {
1397 msleep(100);
1398 gh->gh_flags &= ~GL_ASYNC;
1399 error = gfs2_glock_nq(gh);
1400 }
1401
1402 return error;
1403}
1404
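/*
 * Illustrative sketch, not part of the original commit: the GL_ASYNC
 * pattern queues the request, overlaps other work while polling, then
 * collects the final result with gfs2_glock_wait().
 */
static int example_async_acquire(struct gfs2_glock *gl)
{
	struct gfs2_holder gh;
	int error;

	gfs2_holder_init(gl, LM_ST_EXCLUSIVE, GL_ASYNC, &gh);
	gfs2_glock_nq(&gh);	/* never returns an error with GL_ASYNC */

	while (!gfs2_glock_poll(&gh))
		yield();	/* ... do other useful work here ... */

	error = gfs2_glock_wait(&gh);
	if (error) {
		gfs2_holder_uninit(&gh);
		return error;
	}
	/* ... exclusive work ... */
	gfs2_glock_dq_uninit(&gh);
	return 0;
}
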
1405/**
1406 * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
1407 * @gh: the glock holder
1408 *
1409 */
1410
1411void gfs2_glock_dq(struct gfs2_holder *gh)
1412{
1413 struct gfs2_glock *gl = gh->gh_gl;
1414 struct gfs2_glock_operations *glops = gl->gl_ops;
1415
1416 if (gh->gh_flags & GL_SYNC)
1417 set_bit(GLF_SYNC, &gl->gl_flags);
1418
1419 if (gh->gh_flags & GL_NOCACHE)
1420 handle_callback(gl, LM_ST_UNLOCKED);
1421
1422 gfs2_glmutex_lock(gl);
1423
1424 spin_lock(&gl->gl_spin);
1425 list_del_init(&gh->gh_list);
1426
1427 if (list_empty(&gl->gl_holders)) {
1428 spin_unlock(&gl->gl_spin);
1429
1430 if (glops->go_unlock)
1431 glops->go_unlock(gh);
1432
1433 if (test_bit(GLF_SYNC, &gl->gl_flags)) {
1434 if (glops->go_sync)
1435 glops->go_sync(gl, DIO_METADATA | DIO_DATA);
1436 }
1437
1438 gl->gl_stamp = jiffies;
1439
1440 spin_lock(&gl->gl_spin);
1441 }
1442
1443 clear_bit(GLF_LOCK, &gl->gl_flags);
1444 run_queue(gl);
1445 spin_unlock(&gl->gl_spin);
1446}
1447
1448/**
1449 * gfs2_glock_prefetch - Try to prefetch a glock
1450 * @gl: the glock
1451 * @state: the state to prefetch in
1452 * @flags: flags passed to go_xmote_th()
1453 *
1454 */
1455
1456void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, int flags)
1457{
1458 struct gfs2_glock_operations *glops = gl->gl_ops;
1459
1460 spin_lock(&gl->gl_spin);
1461
1462 if (test_bit(GLF_LOCK, &gl->gl_flags) ||
1463 !list_empty(&gl->gl_holders) ||
1464 !list_empty(&gl->gl_waiters1) ||
1465 !list_empty(&gl->gl_waiters2) ||
1466 !list_empty(&gl->gl_waiters3) ||
1467 relaxed_state_ok(gl->gl_state, state, flags)) {
1468 spin_unlock(&gl->gl_spin);
1469 return;
1470 }
1471
1472 set_bit(GLF_PREFETCH, &gl->gl_flags);
1473 set_bit(GLF_LOCK, &gl->gl_flags);
1474 spin_unlock(&gl->gl_spin);
1475
1476 glops->go_xmote_th(gl, state, flags);
1477}
1478
1479/**
1480 * gfs2_glock_force_drop - Force a glock to be uncached
1481 * @gl: the glock
1482 *
1483 */
1484
1485void gfs2_glock_force_drop(struct gfs2_glock *gl)
1486{
1487 struct gfs2_holder gh;
1488
1489 gfs2_holder_init(gl, LM_ST_UNLOCKED, GL_NEVER_RECURSE, &gh);
1490 set_bit(HIF_DEMOTE, &gh.gh_iflags);
1491
1492 spin_lock(&gl->gl_spin);
1493 list_add_tail(&gh.gh_list, &gl->gl_waiters2);
1494 run_queue(gl);
1495 spin_unlock(&gl->gl_spin);
1496
1497 wait_for_completion(&gh.gh_wait);
1498 gfs2_holder_uninit(&gh);
1499}
1500
1501static void greedy_work(void *data)
1502{
1503 struct greedy *gr = (struct greedy *)data;
1504 struct gfs2_holder *gh = &gr->gr_gh;
1505 struct gfs2_glock *gl = gh->gh_gl;
1506 struct gfs2_glock_operations *glops = gl->gl_ops;
1507
1508 clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1509
1510 if (glops->go_greedy)
1511 glops->go_greedy(gl);
1512
1513 spin_lock(&gl->gl_spin);
1514
1515 if (list_empty(&gl->gl_waiters2)) {
1516 clear_bit(GLF_GREEDY, &gl->gl_flags);
1517 spin_unlock(&gl->gl_spin);
1518 gfs2_holder_uninit(gh);
1519 kfree(gr);
1520 } else {
1521 gfs2_glock_hold(gl);
1522 list_add_tail(&gh->gh_list, &gl->gl_waiters2);
1523 run_queue(gl);
1524 spin_unlock(&gl->gl_spin);
1525 gfs2_glock_put(gl);
1526 }
1527}
1528
1529/**
1530 * gfs2_glock_be_greedy - defer demote requests on a glock for a while
1531 * @gl: the glock
1532 * @time: the delay, in jiffies, before go_greedy is called
1533 *
1534 * Returns: 0 if go_greedy will be called, 1 otherwise
1535 */
1536
1537int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
1538{
1539 struct greedy *gr;
1540 struct gfs2_holder *gh;
1541
1542 if (!time ||
1543 gl->gl_sbd->sd_args.ar_localcaching ||
1544 test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
1545 return 1;
1546
1547 gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
1548 if (!gr) {
1549 clear_bit(GLF_GREEDY, &gl->gl_flags);
1550 return 1;
1551 }
1552 gh = &gr->gr_gh;
1553
1554 gfs2_holder_init(gl, 0, GL_NEVER_RECURSE, gh);
1555 set_bit(HIF_GREEDY, &gh->gh_iflags);
1556 INIT_WORK(&gr->gr_work, greedy_work, gr);
1557
1558 set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
1559 schedule_delayed_work(&gr->gr_work, time);
1560
1561 return 0;
1562}
1563
1564/**
1565 * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
1566 * @gh: the holder structure
1567 *
1568 */
1569
1570void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1571{
1572 gfs2_glock_dq(gh);
1573 gfs2_holder_uninit(gh);
1574}
1575
1576/**
1577 * gfs2_glock_nq_num - acquire a glock based on lock number
1578 * @sdp: the filesystem
1579 * @number: the lock number
1580 * @glops: the glock operations for the type of glock
1581 * @state: the state to acquire the glock in
1582 * @flags: modifier flags for the acquisition
1583 * @gh: the struct gfs2_holder
1584 *
1585 * Returns: errno
1586 */
1587
1588int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number,
1589 struct gfs2_glock_operations *glops, unsigned int state,
1590 int flags, struct gfs2_holder *gh)
1591{
1592 struct gfs2_glock *gl;
1593 int error;
1594
1595 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1596 if (!error) {
1597 error = gfs2_glock_nq_init(gl, state, flags, gh);
1598 gfs2_glock_put(gl);
1599 }
1600
1601 return error;
1602}
1603
1604/**
1605 * glock_compare - Compare two struct gfs2_glock structures for sorting
1606 * @arg_a: the first structure
1607 * @arg_b: the second structure
1608 *
1609 */
1610
1611static int glock_compare(const void *arg_a, const void *arg_b)
1612{
1613 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1614 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1615 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1616 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1617 int ret = 0;
1618
1619 if (a->ln_number > b->ln_number)
1620 ret = 1;
1621 else if (a->ln_number < b->ln_number)
1622 ret = -1;
1623 else {
1624 if (gh_a->gh_state == LM_ST_SHARED &&
1625 gh_b->gh_state == LM_ST_EXCLUSIVE)
1626 ret = 1;
1627 else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) &&
1628 (gh_b->gh_flags & GL_LOCAL_EXCL))
1629 ret = 1;
1630 }
1631
1632 return ret;
1633}
1634
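/*
 * Note, not part of the original commit: glock_compare() orders holders
 * by ascending lock number, breaking ties so that EXCLUSIVE and
 * GL_LOCAL_EXCL requests sort first; acquiring in this single global
 * order is what makes nq_m_sync() below deadlock-free.
 */
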
1635/**
1636 * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
1637 * @num_gh: the number of structures
1638 * @ghs: an array of struct gfs2_holder structures
1639 *
1640 * Returns: 0 on success (all glocks acquired),
1641 * errno on failure (no glocks acquired)
1642 */
1643
1644static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
1645 struct gfs2_holder **p)
1646{
1647 unsigned int x;
1648 int error = 0;
1649
1650 for (x = 0; x < num_gh; x++)
1651 p[x] = &ghs[x];
1652
1653 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
1654
1655 for (x = 0; x < num_gh; x++) {
1656 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1657
1658 error = gfs2_glock_nq(p[x]);
1659 if (error) {
1660 while (x--)
1661 gfs2_glock_dq(p[x]);
1662 break;
1663 }
1664 }
1665
1666 return error;
1667}
1668
1669/**
1670 * gfs2_glock_nq_m - acquire multiple glocks
1671 * @num_gh: the number of structures
1672 * @ghs: an array of struct gfs2_holder structures
1673 *
1674 * Figure out how big an impact this function has. Either:
1675 * 1) Replace this code with code that calls gfs2_glock_prefetch()
1676 * 2) Forget async stuff and just call nq_m_sync()
1677 * 3) Leave it like it is
1678 *
1679 * Returns: 0 on success (all glocks acquired),
1680 * errno on failure (no glocks acquired)
1681 */
1682
1683int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1684{
1685 int *e; /* holds per-lock error codes; reused as the holder-pointer array for nq_m_sync() */
1686 unsigned int x;
1687 int borked = 0, serious = 0;
1688 int error = 0;
1689
1690 if (!num_gh)
1691 return 0;
1692
1693 if (num_gh == 1) {
1694 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1695 return gfs2_glock_nq(ghs);
1696 }
1697
1698 e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1699 if (!e)
1700 return -ENOMEM;
1701
1702 for (x = 0; x < num_gh; x++) {
1703 ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
1704 error = gfs2_glock_nq(&ghs[x]);
1705 if (error) {
1706 borked = 1;
1707 serious = error;
1708 num_gh = x;
1709 break;
1710 }
1711 }
1712
1713 for (x = 0; x < num_gh; x++) {
1714 error = e[x] = glock_wait_internal(&ghs[x]);
1715 if (error) {
1716 borked = 1;
1717 if (error != GLR_TRYFAILED && error != GLR_CANCELED)
1718 serious = error;
1719 }
1720 }
1721
1722 if (!borked) {
1723 kfree(e);
1724 return 0;
1725 }
1726
1727 for (x = 0; x < num_gh; x++)
1728 if (!e[x])
1729 gfs2_glock_dq(&ghs[x]);
1730
1731 if (serious)
1732 error = serious;
1733 else {
1734 for (x = 0; x < num_gh; x++)
1735 gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
1736 &ghs[x]);
1737 error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
1738 }
1739
1740 kfree(e);
1741
1742 return error;
1743}
1744
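/*
 * Illustrative sketch, not part of the original commit: acquiring two
 * glocks through gfs2_glock_nq_m() so that its all-or-nothing,
 * deadlock-free ordering applies.
 */
static int example_double_lock(struct gfs2_glock *a, struct gfs2_glock *b)
{
	struct gfs2_holder ghs[2];
	int error;

	gfs2_holder_init(a, LM_ST_EXCLUSIVE, 0, &ghs[0]);
	gfs2_holder_init(b, LM_ST_EXCLUSIVE, 0, &ghs[1]);

	error = gfs2_glock_nq_m(2, ghs);
	if (!error) {
		/* ... work under both locks ... */
		gfs2_glock_dq_m(2, ghs);
	}

	gfs2_holder_uninit(&ghs[0]);
	gfs2_holder_uninit(&ghs[1]);
	return error;
}
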
1745/**
1746 * gfs2_glock_dq_m - release multiple glocks
1747 * @num_gh: the number of structures
1748 * @ghs: an array of struct gfs2_holder structures
1749 *
1750 */
1751
1752void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1753{
1754 unsigned int x;
1755
1756 for (x = 0; x < num_gh; x++)
1757 gfs2_glock_dq(&ghs[x]);
1758}
1759
1760/**
1761 * gfs2_glock_dq_uninit_m - release multiple glocks
1762 * @num_gh: the number of structures
1763 * @ghs: an array of struct gfs2_holder structures
1764 *
1765 */
1766
1767void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1768{
1769 unsigned int x;
1770
1771 for (x = 0; x < num_gh; x++)
1772 gfs2_glock_dq_uninit(&ghs[x]);
1773}
1774
1775/**
1776 * gfs2_glock_prefetch_num - prefetch a glock based on lock number
1777 * @sdp: the filesystem
1778 * @number: the lock number
1779 * @glops: the glock operations for the type of glock
1780 * @state: the state to acquire the glock in
1781 * @flags: modifier flags for the acquisition
1782 *
1783 * Does nothing if too many glocks are already queued for reclaim.
1784 */
1785
1786void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
1787 struct gfs2_glock_operations *glops,
1788 unsigned int state, int flags)
1789{
1790 struct gfs2_glock *gl;
1791 int error;
1792
1793 if (atomic_read(&sdp->sd_reclaim_count) <
1794 gfs2_tune_get(sdp, gt_reclaim_limit)) {
1795 error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
1796 if (!error) {
1797 gfs2_glock_prefetch(gl, state, flags);
1798 gfs2_glock_put(gl);
1799 }
1800 }
1801}
1802
1803/**
1804 * gfs2_lvb_hold - attach an LVB to a glock
1805 * @gl: The glock in question
1806 *
1807 */
1808
1809int gfs2_lvb_hold(struct gfs2_glock *gl)
1810{
1811 int error;
1812
1813 gfs2_glmutex_lock(gl);
1814
1815 if (!atomic_read(&gl->gl_lvb_count)) {
1816 error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
1817 if (error) {
1818 gfs2_glmutex_unlock(gl);
1819 return error;
1820 }
1821 gfs2_glock_hold(gl);
1822 }
1823 atomic_inc(&gl->gl_lvb_count);
1824
1825 gfs2_glmutex_unlock(gl);
1826
1827 return 0;
1828}
1829
1830/**
1831 * gfs2_lvb_unhold - detach an LVB from a glock
1832 * @gl: The glock in question
1833 *
1834 */
1835
1836void gfs2_lvb_unhold(struct gfs2_glock *gl)
1837{
1838 gfs2_glock_hold(gl);
1839 gfs2_glmutex_lock(gl);
1840
1841 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1842 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1843 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1844 gl->gl_lvb = NULL;
1845 gfs2_glock_put(gl);
1846 }
1847
1848 gfs2_glmutex_unlock(gl);
1849 gfs2_glock_put(gl);
1850}
1851
1852void gfs2_lvb_sync(struct gfs2_glock *gl)
1853{
1854 gfs2_glmutex_lock(gl);
1855
1856 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count));
1857 if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl)))
1858 gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
1859
1860 gfs2_glmutex_unlock(gl);
1861}
1862
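/*
 * Illustrative sketch, not part of the original commit: LVB attachment
 * is refcounted, so hold/unhold must pair up, and a writer is expected
 * to hold the glock in EXCLUSIVE before syncing its modifications.
 */
static int example_lvb(struct gfs2_glock *gl)
{
	int error;

	error = gfs2_lvb_hold(gl);	/* first hold attaches gl->gl_lvb */
	if (error)
		return error;

	/* ... read gl->gl_lvb, or modify it while holding the glock EX ... */
	if (gfs2_glock_is_held_excl(gl))
		gfs2_lvb_sync(gl);	/* push changes to the lock module */

	gfs2_lvb_unhold(gl);		/* last unhold detaches the buffer */
	return 0;
}
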
1863static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1864 unsigned int state)
1865{
1866 struct gfs2_glock *gl;
1867
1868 gl = gfs2_glock_find(sdp, name);
1869 if (!gl)
1870 return;
1871
1872 if (gl->gl_ops->go_callback)
1873 gl->gl_ops->go_callback(gl, state);
1874 handle_callback(gl, state);
1875
1876 spin_lock(&gl->gl_spin);
1877 run_queue(gl);
1878 spin_unlock(&gl->gl_spin);
1879
1880 gfs2_glock_put(gl);
1881}
1882
1883/**
1884 * gfs2_glock_cb - Callback used by locking module
1885 * @fsdata: Pointer to the superblock
1886 * @type: Type of callback
1887 * @data: Type dependent data pointer
1888 *
1889 * Called by the locking module when it wants to tell us something.
1890 * Either we need to drop a lock, one of our ASYNC requests completed, or
1891 * a journal from another client needs to be recovered.
1892 */
1893
1894void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data)
1895{
1896 struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata;
1897
1898 switch (type) {
1899 case LM_CB_NEED_E:
1900 blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_UNLOCKED);
1901 return;
1902
1903 case LM_CB_NEED_D:
1904 blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_DEFERRED);
1905 return;
1906
1907 case LM_CB_NEED_S:
1908 blocking_cb(sdp, (struct lm_lockname *)data, LM_ST_SHARED);
1909 return;
1910
1911 case LM_CB_ASYNC: {
1912 struct lm_async_cb *async = (struct lm_async_cb *)data;
1913 struct gfs2_glock *gl;
1914
1915 gl = gfs2_glock_find(sdp, &async->lc_name);
1916 if (gfs2_assert_warn(sdp, gl))
1917 return;
1918 if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
1919 gl->gl_req_bh(gl, async->lc_ret);
1920 gfs2_glock_put(gl);
1921
1922 return;
1923 }
1924
1925 case LM_CB_NEED_RECOVERY:
1926 gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
1927 if (sdp->sd_recoverd_process)
1928 wake_up_process(sdp->sd_recoverd_process);
1929 return;
1930
1931 case LM_CB_DROPLOCKS:
1932 gfs2_gl_hash_clear(sdp, NO_WAIT);
1933 gfs2_quota_scan(sdp);
1934 return;
1935
1936 default:
1937 gfs2_assert_warn(sdp, 0);
1938 return;
1939 }
1940}
1941
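/*
 * Illustrative sketch, not part of the original commit: how a lock
 * module might deliver a blocking callback. "my_fsdata" stands for
 * whatever lm_fsdata_t the module was handed at mount time.
 */
static void example_module_callback(lm_fsdata_t *my_fsdata,
				    struct lm_lockname *name)
{
	/* Ask GFS2 to demote the named lock to SHARED */
	gfs2_glock_cb(my_fsdata, LM_CB_NEED_S, name);
}
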
1942/**
1943 * gfs2_try_toss_inode - try to remove a particular inode struct from cache
1944 * @sdp: the filesystem
1945 * @inum: the inode number
1946 *
1947 */
1948
1949void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum)
1950{
1951 struct gfs2_glock *gl;
1952 struct gfs2_inode *ip;
1953 int error;
1954
1955 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops,
1956 NO_CREATE, &gl);
1957 if (error || !gl)
1958 return;
1959
1960 if (!gfs2_glmutex_trylock(gl))
1961 goto out;
1962
1963 ip = gl->gl_object;
1964 if (!ip)
1965 goto out_unlock;
1966
1967 if (atomic_read(&ip->i_count))
1968 goto out_unlock;
1969
1970 gfs2_inode_destroy(ip);
1971
1972 out_unlock:
1973 gfs2_glmutex_unlock(gl);
1974
1975 out:
1976 gfs2_glock_put(gl);
1977}
1978
1979/**
1980 * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an
1981 * iopen glock from memory
1982 * @io_gl: the iopen glock
1983 * @state: the state into which the glock should be put
1984 *
1985 */
1986
1987void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state)
1988{
1989 struct gfs2_glock *i_gl;
1990
1991 if (state != LM_ST_UNLOCKED)
1992 return;
1993
1994 spin_lock(&io_gl->gl_spin);
1995 i_gl = io_gl->gl_object;
1996 if (i_gl) {
1997 gfs2_glock_hold(i_gl);
1998 spin_unlock(&io_gl->gl_spin);
1999 } else {
2000 spin_unlock(&io_gl->gl_spin);
2001 return;
2002 }
2003
2004 if (gfs2_glmutex_trylock(i_gl)) {
2005 struct gfs2_inode *ip = i_gl->gl_object;
2006 if (ip) {
2007 gfs2_try_toss_vnode(ip);
2008 gfs2_glmutex_unlock(i_gl);
2009 gfs2_glock_schedule_for_reclaim(i_gl);
2010 goto out;
2011 }
2012 gfs2_glmutex_unlock(i_gl);
2013 }
2014
2015 out:
2016 gfs2_glock_put(i_gl);
2017}
2018
2019/**
2020 * demote_ok - Check to see if it's ok to unlock a glock
2021 * @gl: the glock
2022 *
2023 * Returns: 1 if it's ok
2024 */
2025
2026static int demote_ok(struct gfs2_glock *gl)
2027{
2028 struct gfs2_sbd *sdp = gl->gl_sbd;
2029 struct gfs2_glock_operations *glops = gl->gl_ops;
2030 int demote = 1;
2031
2032 if (test_bit(GLF_STICKY, &gl->gl_flags))
2033 demote = 0;
2034 else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
2035 demote = time_after_eq(jiffies,
2036 gl->gl_stamp +
2037 gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
2038 else if (glops->go_demote_ok)
2039 demote = glops->go_demote_ok(gl);
2040
2041 return demote;
2042}
2043
2044/**
2045 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
2046 * @gl: the glock
2047 *
2048 */
2049
2050void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
2051{
2052 struct gfs2_sbd *sdp = gl->gl_sbd;
2053
2054 spin_lock(&sdp->sd_reclaim_lock);
2055 if (list_empty(&gl->gl_reclaim)) {
2056 gfs2_glock_hold(gl);
2057 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
2058 atomic_inc(&sdp->sd_reclaim_count);
2059 }
2060 spin_unlock(&sdp->sd_reclaim_lock);
2061
2062 wake_up(&sdp->sd_reclaim_wq);
2063}
2064
2065/**
2066 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
2067 * @sdp: the filesystem
2068 *
2069 * Called from the gfs2_glockd() glock reclaim daemon, or when promoting a
2070 * different glock and we notice that there are a lot of glocks in the
2071 * reclaim list.
2072 *
2073 */
2074
2075void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
2076{
2077 struct gfs2_glock *gl;
2078
2079 spin_lock(&sdp->sd_reclaim_lock);
2080 if (list_empty(&sdp->sd_reclaim_list)) {
2081 spin_unlock(&sdp->sd_reclaim_lock);
2082 return;
2083 }
2084 gl = list_entry(sdp->sd_reclaim_list.next,
2085 struct gfs2_glock, gl_reclaim);
2086 list_del_init(&gl->gl_reclaim);
2087 spin_unlock(&sdp->sd_reclaim_lock);
2088
2089 atomic_dec(&sdp->sd_reclaim_count);
2090 atomic_inc(&sdp->sd_reclaimed);
2091
2092 if (gfs2_glmutex_trylock(gl)) {
2093 if (gl->gl_ops == &gfs2_inode_glops) {
2094 struct gfs2_inode *ip = gl->gl_object;
2095 if (ip && !atomic_read(&ip->i_count))
2096 gfs2_inode_destroy(ip);
2097 }
2098 if (queue_empty(gl, &gl->gl_holders) &&
2099 gl->gl_state != LM_ST_UNLOCKED &&
2100 demote_ok(gl))
2101 handle_callback(gl, LM_ST_UNLOCKED);
2102 gfs2_glmutex_unlock(gl);
2103 }
2104
2105 gfs2_glock_put(gl);
2106}
2107
2108/**
2109 * examine_bucket - Call a function for each glock in a hash bucket
2110 * @examiner: the function
2111 * @sdp: the filesystem
2112 * @bucket: the bucket
2113 *
2114 * Returns: 1 if the bucket has entries
2115 */
2116
2117static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
2118 struct gfs2_gl_hash_bucket *bucket)
2119{
2120 struct glock_plug plug;
2121 struct list_head *tmp;
2122 struct gfs2_glock *gl;
2123 int entries;
2124
2125 /* Add "plug" to end of bucket list, work back up list from there */
2126 memset(&plug.gl_flags, 0, sizeof(unsigned long));
2127 set_bit(GLF_PLUG, &plug.gl_flags);
2128
2129 write_lock(&bucket->hb_lock);
2130 list_add(&plug.gl_list, &bucket->hb_list);
2131 write_unlock(&bucket->hb_lock);
2132
2133 for (;;) {
2134 write_lock(&bucket->hb_lock);
2135
2136 for (;;) {
2137 tmp = plug.gl_list.next;
2138
2139 if (tmp == &bucket->hb_list) {
2140 list_del(&plug.gl_list);
2141 entries = !list_empty(&bucket->hb_list);
2142 write_unlock(&bucket->hb_lock);
2143 return entries;
2144 }
2145 gl = list_entry(tmp, struct gfs2_glock, gl_list);
2146
2147 /* Move plug up list */
2148 list_move(&plug.gl_list, &gl->gl_list);
2149
2150 if (test_bit(GLF_PLUG, &gl->gl_flags))
2151 continue;
2152
2153 /* examiner() must glock_put() */
2154 gfs2_glock_hold(gl);
2155
2156 break;
2157 }
2158
2159 write_unlock(&bucket->hb_lock);
2160
2161 examiner(gl);
2162 }
2163}
2164
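/*
 * Illustrative sketch, not part of the original commit: an examiner
 * passed to examine_bucket() receives a glock with an extra reference
 * and must drop it itself, as scan_glock() and clear_glock() below do.
 */
static void example_examiner(struct gfs2_glock *gl)
{
	/* ... inspect gl here ... */
	gfs2_glock_put(gl);	/* required: examine_bucket() took a hold */
}
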
2165/**
2166 * scan_glock - look at a glock and see if we can reclaim it
2167 * @gl: the glock to look at
2168 *
2169 */
2170
2171static void scan_glock(struct gfs2_glock *gl)
2172{
2173 if (gfs2_glmutex_trylock(gl)) {
2174 if (gl->gl_ops == &gfs2_inode_glops) {
2175 struct gfs2_inode *ip = gl->gl_object;
2176 if (ip && !atomic_read(&ip->i_count))
2177 goto out_schedule;
2178 }
2179 if (queue_empty(gl, &gl->gl_holders) &&
2180 gl->gl_state != LM_ST_UNLOCKED &&
2181 demote_ok(gl))
2182 goto out_schedule;
2183
2184 gfs2_glmutex_unlock(gl);
2185 }
2186
2187 gfs2_glock_put(gl);
2188
2189 return;
2190
2191 out_schedule:
2192 gfs2_glmutex_unlock(gl);
2193 gfs2_glock_schedule_for_reclaim(gl);
2194 gfs2_glock_put(gl);
2195}
2196
2197/**
2198 * gfs2_scand_internal - Look for glocks and inodes to toss from memory
2199 * @sdp: the filesystem
2200 *
2201 */
2202
2203void gfs2_scand_internal(struct gfs2_sbd *sdp)
2204{
2205 unsigned int x;
2206
2207 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2208 examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]);
2209 cond_resched();
2210 }
2211}
2212
2213/**
2214 * clear_glock - look at a glock and see if we can free it from glock cache
2215 * @gl: the glock to look at
2216 *
2217 */
2218
2219static void clear_glock(struct gfs2_glock *gl)
2220{
2221 struct gfs2_sbd *sdp = gl->gl_sbd;
2222 int released;
2223
2224 spin_lock(&sdp->sd_reclaim_lock);
2225 if (!list_empty(&gl->gl_reclaim)) {
2226 list_del_init(&gl->gl_reclaim);
2227 atomic_dec(&sdp->sd_reclaim_count);
2228 spin_unlock(&sdp->sd_reclaim_lock);
2229 released = gfs2_glock_put(gl);
2230 gfs2_assert(sdp, !released);
2231 } else {
2232 spin_unlock(&sdp->sd_reclaim_lock);
2233 }
2234
2235 if (gfs2_glmutex_trylock(gl)) {
2236 if (gl->gl_ops == &gfs2_inode_glops) {
2237 struct gfs2_inode *ip = gl->gl_object;
2238 if (ip && !atomic_read(&ip->i_count))
2239 gfs2_inode_destroy(ip);
2240 }
2241 if (queue_empty(gl, &gl->gl_holders) &&
2242 gl->gl_state != LM_ST_UNLOCKED)
2243 handle_callback(gl, LM_ST_UNLOCKED);
2244
2245 gfs2_glmutex_unlock(gl);
2246 }
2247
2248 gfs2_glock_put(gl);
2249}
2250
2251/**
2252 * gfs2_gl_hash_clear - Empty out the glock hash table
2253 * @sdp: the filesystem
2254 * @wait: wait until it's all gone
2255 *
2256 * Called when unmounting the filesystem, or when inter-node lock manager
2257 * requests DROPLOCKS because it is running out of capacity.
2258 */
2259
2260void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
2261{
2262 unsigned long t;
2263 unsigned int x;
2264 int cont;
2265
2266 t = jiffies;
2267
2268 for (;;) {
2269 cont = 0;
2270
2271 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
2272 if (examine_bucket(clear_glock, sdp,
2273 &sdp->sd_gl_hash[x]))
2274 cont = 1;
2275
2276 if (!wait || !cont)
2277 break;
2278
2279 if (time_after_eq(jiffies,
2280 t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
2281 fs_warn(sdp, "Unmount seems to be stalled. "
2282 "Dumping lock state...\n");
2283 gfs2_dump_lockstate(sdp);
2284 t = jiffies;
2285 }
2286
2287 /* invalidate_inodes() requires that the sb inodes list
2288 not change, but an async completion callback for an
2289 unlock can occur which does glock_put() which
2290 can call iput() which will change the sb inodes list.
2291 invalidate_inodes_mutex prevents glock_put()'s during
2292 an invalidate_inodes() */
2293
2294 mutex_lock(&sdp->sd_invalidate_inodes_mutex);
2295 invalidate_inodes(sdp->sd_vfs);
2296 mutex_unlock(&sdp->sd_invalidate_inodes_mutex);
2297 yield();
2298 }
2299}
2300
2301/*
2302 * Diagnostic routines to help debug distributed deadlock
2303 */
2304
2305/**
2306 * dump_holder - print information about a glock holder
2307 * @str: a string naming the type of holder
2308 * @gh: the glock holder
2309 *
2310 * Returns: 0 on success, -ENOBUFS when we run out of space
2311 */
2312
2313static int dump_holder(char *str, struct gfs2_holder *gh)
2314{
2315 unsigned int x;
2316 int error = -ENOBUFS;
2317
2318 printk(KERN_INFO " %s\n", str);
2319 printk(KERN_INFO " owner = %ld\n",
2320 (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
2321 printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
2322 printk(KERN_INFO " gh_flags =");
2323 for (x = 0; x < 32; x++)
2324 if (gh->gh_flags & (1 << x))
2325 printk(" %u", x);
2326 printk(" \n");
2327 printk(KERN_INFO " error = %d\n", gh->gh_error);
2328 printk(KERN_INFO " gh_iflags =");
2329 for (x = 0; x < 32; x++)
2330 if (test_bit(x, &gh->gh_iflags))
2331 printk(" %u", x);
2332 printk(" \n");
2333 print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
2334
2335 error = 0;
2336
2337 return error;
2338}
2339
2340/**
2341 * dump_inode - print information about an inode
2342 * @ip: the inode
2343 *
2344 * Returns: 0 on success, -ENOBUFS when we run out of space
2345 */
2346
2347static int dump_inode(struct gfs2_inode *ip)
2348{
2349 unsigned int x;
2350 int error = -ENOBUFS;
2351
2352 printk(KERN_INFO " Inode:\n");
2353 printk(KERN_INFO " num = %llu %llu\n",
2354 ip->i_num.no_formal_ino, ip->i_num.no_addr);
2355 printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
2356 printk(KERN_INFO " i_count = %d\n", atomic_read(&ip->i_count));
2357 printk(KERN_INFO " i_flags =");
2358 for (x = 0; x < 32; x++)
2359 if (test_bit(x, &ip->i_flags))
2360 printk(" %u", x);
2361 printk(" \n");
2362 printk(KERN_INFO " vnode = %s\n", (ip->i_vnode) ? "yes" : "no");
2363
2364 error = 0;
2365
2366 return error;
2367}
2368
2369/**
2370 * dump_glock - print information about a glock
2371 * @gl: the glock
2373 *
2374 * Returns: 0 on success, -ENOBUFS when we run out of space
2375 */
2376
2377static int dump_glock(struct gfs2_glock *gl)
2378{
2379 struct gfs2_holder *gh;
2380 unsigned int x;
2381 int error = -ENOBUFS;
2382
2383 spin_lock(&gl->gl_spin);
2384
2385 printk(KERN_INFO "Glock (%u, %llu)\n",
2386 gl->gl_name.ln_type,
2387 gl->gl_name.ln_number);
2388 printk(KERN_INFO " gl_flags =");
2389 for (x = 0; x < 32; x++)
2390 if (test_bit(x, &gl->gl_flags))
2391 printk(" %u", x);
2392 printk(" \n");
2393 printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount));
2394 printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
2395 printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
2396 printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
2397 printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
2398 printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
2399 printk(KERN_INFO " le = %s\n",
2400 (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
2401 printk(KERN_INFO " reclaim = %s\n",
2402 (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
2403 if (gl->gl_aspace)
2404 printk(KERN_INFO " aspace = %lu\n",
2405 gl->gl_aspace->i_mapping->nrpages);
2406 else
2407 printk(KERN_INFO " aspace = no\n");
2408 printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
2409 if (gl->gl_req_gh) {
2410 error = dump_holder("Request", gl->gl_req_gh);
2411 if (error)
2412 goto out;
2413 }
2414 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
2415 error = dump_holder("Holder", gh);
2416 if (error)
2417 goto out;
2418 }
2419 list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
2420 error = dump_holder("Waiter1", gh);
2421 if (error)
2422 goto out;
2423 }
2424 list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
2425 error = dump_holder("Waiter2", gh);
2426 if (error)
2427 goto out;
2428 }
2429 list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
2430 error = dump_holder("Waiter3", gh);
2431 if (error)
2432 goto out;
2433 }
2434 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
2435 if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
2436 list_empty(&gl->gl_holders)) {
2437 error = dump_inode(gl->gl_object);
2438 if (error)
2439 goto out;
2440 } else {
2441 error = -ENOBUFS;
2442 printk(KERN_INFO " Inode: busy\n");
2443 }
2444 }
2445
2446 error = 0;
2447
2448 out:
2449 spin_unlock(&gl->gl_spin);
2450
2451 return error;
2452}
2453
2454/**
2455 * gfs2_dump_lockstate - print out the current lockstate
2456 * @sdp: the filesystem
2457 *
2458 * Dumps the state of every glock to the console.
2460 *
2461 */
2462
2463int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
2464{
2465 struct gfs2_gl_hash_bucket *bucket;
2466 struct gfs2_glock *gl;
2467 unsigned int x;
2468 int error = 0;
2469
2470 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
2471 bucket = &sdp->sd_gl_hash[x];
2472
2473 read_lock(&bucket->hb_lock);
2474
2475 list_for_each_entry(gl, &bucket->hb_list, gl_list) {
2476 if (test_bit(GLF_PLUG, &gl->gl_flags))
2477 continue;
2478
2479 error = dump_glock(gl);
2480 if (error)
2481 break;
2482 }
2483
2484 read_unlock(&bucket->hb_lock);
2485
2486 if (error)
2487 break;
2488 }
2489
2491 return error;
2492}
2493
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..ed5bc3e65397
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,167 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOCK_DOT_H__
11#define __GLOCK_DOT_H__
12
13/* Flags for lock requests; used in gfs2_holder gh_flags field.
14 From lm_interface.h:
15#define LM_FLAG_TRY 0x00000001
16#define LM_FLAG_TRY_1CB 0x00000002
17#define LM_FLAG_NOEXP 0x00000004
18#define LM_FLAG_ANY 0x00000008
19#define LM_FLAG_PRIORITY 0x00000010 */
20
21#define GL_LOCAL_EXCL 0x00000020
22#define GL_ASYNC 0x00000040
23#define GL_EXACT 0x00000080
24#define GL_SKIP 0x00000100
25#define GL_ATIME 0x00000200
26#define GL_NOCACHE 0x00000400
27#define GL_SYNC 0x00000800
28#define GL_NOCANCEL 0x00001000
29#define GL_NEVER_RECURSE 0x00002000
30#define GL_AOP 0x00004000
31
32#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14
34
35static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{
37 struct gfs2_holder *gh;
38 int locked = 0;
39
40 /* Look in glock's list of holders for one with current task as owner */
41 spin_lock(&gl->gl_spin);
42 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
43 if (gh->gh_owner == current) {
44 locked = 1;
45 break;
46 }
47 }
48 spin_unlock(&gl->gl_spin);
49
50 return locked;
51}
52
53static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
54{
55 return (gl->gl_state == LM_ST_EXCLUSIVE);
56}
57
58static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
59{
60 return (gl->gl_state == LM_ST_DEFERRED);
61}
62
63static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
64{
65 return (gl->gl_state == LM_ST_SHARED);
66}
67
68static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
69{
70 int ret;
71 spin_lock(&gl->gl_spin);
72 ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
73 spin_unlock(&gl->gl_spin);
74 return ret;
75}
76
77struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp,
78 struct lm_lockname *name);
79int gfs2_glock_get(struct gfs2_sbd *sdp,
80 uint64_t number, struct gfs2_glock_operations *glops,
81 int create, struct gfs2_glock **glp);
82void gfs2_glock_hold(struct gfs2_glock *gl);
83int gfs2_glock_put(struct gfs2_glock *gl);
84void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
85 struct gfs2_holder *gh);
86void gfs2_holder_reinit(unsigned int state, unsigned flags,
87 struct gfs2_holder *gh);
88void gfs2_holder_uninit(struct gfs2_holder *gh);
89struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, unsigned int state,
90 int flags, gfp_t gfp_flags);
91void gfs2_holder_put(struct gfs2_holder *gh);
92
93void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
94void gfs2_glock_drop_th(struct gfs2_glock *gl);
95
96void gfs2_glmutex_lock(struct gfs2_glock *gl);
97int gfs2_glmutex_trylock(struct gfs2_glock *gl);
98void gfs2_glmutex_unlock(struct gfs2_glock *gl);
99
100int gfs2_glock_nq(struct gfs2_holder *gh);
101int gfs2_glock_poll(struct gfs2_holder *gh);
102int gfs2_glock_wait(struct gfs2_holder *gh);
103void gfs2_glock_dq(struct gfs2_holder *gh);
104
105void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, int flags);
106void gfs2_glock_force_drop(struct gfs2_glock *gl);
107
108int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
109
110void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
111int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
112 uint64_t number, struct gfs2_glock_operations *glops,
113 unsigned int state, int flags, struct gfs2_holder *gh);
114
115int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
116void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
117void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
118
119void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number,
120 struct gfs2_glock_operations *glops,
121 unsigned int state, int flags);
122
123/**
124 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
125 * @gl: the glock
126 * @state: the state we're requesting
127 * @flags: the modifier flags
128 * @gh: the holder structure
129 *
130 * Returns: 0, GLR_*, or errno
131 */
132
133static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
134 unsigned int state, int flags,
135 struct gfs2_holder *gh)
136{
137 int error;
138
139 gfs2_holder_init(gl, state, flags, gh);
140
141 error = gfs2_glock_nq(gh);
142 if (error)
143 gfs2_holder_uninit(gh);
144
145 return error;
146}
147
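/*
 * Note, not part of the original commit: on success the holder is left
 * initialized and queued, so the matching teardown is a single call to
 * gfs2_glock_dq_uninit(); on failure the holder has already been
 * uninitialized and needs no further cleanup.
 */
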
148/* Lock Value Block functions */
149
150int gfs2_lvb_hold(struct gfs2_glock *gl);
151void gfs2_lvb_unhold(struct gfs2_glock *gl);
152void gfs2_lvb_sync(struct gfs2_glock *gl);
153
154void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data);
155
156void gfs2_try_toss_inode(struct gfs2_sbd *sdp, struct gfs2_inum *inum);
157void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state);
158
159void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
160void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
161
162void gfs2_scand_internal(struct gfs2_sbd *sdp);
163void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
164
165int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
166
167#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..d180c89dd567
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,492 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "bmap.h"
22#include "glock.h"
23#include "glops.h"
24#include "inode.h"
25#include "log.h"
26#include "meta_io.h"
27#include "page.h"
28#include "recovery.h"
29#include "rgrp.h"
30#include "util.h"
31
32/**
33 * meta_go_sync - sync out the metadata for this glock
34 * @gl: the glock
35 * @flags: DIO_*
36 *
37 * Called when demoting or unlocking an EX glock. We must flush
38 * to disk all dirty buffers/pages relating to this glock, and must not
39 * return to the caller to demote/unlock the glock until I/O is complete.
40 */
41
42static void meta_go_sync(struct gfs2_glock *gl, int flags)
43{
44 if (!(flags & DIO_METADATA))
45 return;
46
47 if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
48 gfs2_log_flush(gl->gl_sbd, gl);
49 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
50 if (flags & DIO_RELEASE)
51 gfs2_ail_empty_gl(gl);
52 }
53
54 clear_bit(GLF_SYNC, &gl->gl_flags);
55}
56
57/**
58 * meta_go_inval - invalidate the metadata for this glock
59 * @gl: the glock
60 * @flags:
61 *
62 */
63
64static void meta_go_inval(struct gfs2_glock *gl, int flags)
65{
66 if (!(flags & DIO_METADATA))
67 return;
68
69 gfs2_meta_inval(gl);
70 gl->gl_vn++;
71}
72
73/**
74 * meta_go_demote_ok - Check to see if it's ok to unlock a glock
75 * @gl: the glock
76 *
77 * Returns: 1 if we have no cached data; ok to demote meta glock
78 */
79
80static int meta_go_demote_ok(struct gfs2_glock *gl)
81{
82 return !gl->gl_aspace->i_mapping->nrpages;
83}
84
85/**
86 * inode_go_xmote_th - promote/demote a glock
87 * @gl: the glock
88 * @state: the requested state
89 * @flags:
90 *
91 */
92
93static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
94 int flags)
95{
96 if (gl->gl_state != LM_ST_UNLOCKED)
97 gfs2_pte_inval(gl);
98 gfs2_glock_xmote_th(gl, state, flags);
99}
100
101/**
102 * inode_go_xmote_bh - After promoting/demoting a glock
103 * @gl: the glock
104 *
105 */
106
107static void inode_go_xmote_bh(struct gfs2_glock *gl)
108{
109 struct gfs2_holder *gh = gl->gl_req_gh;
110 struct buffer_head *bh;
111 int error;
112
113 if (gl->gl_state != LM_ST_UNLOCKED &&
114 (!gh || !(gh->gh_flags & GL_SKIP))) {
115 error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START,
116 &bh);
117 if (!error)
118 brelse(bh);
119 }
120}
121
122/**
123 * inode_go_drop_th - unlock a glock
124 * @gl: the glock
125 *
126 * Invoked from rq_demote().
127 * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for too
128 * long) is being purged from our node's glock cache; we're dropping the lock.
129 */
130
131static void inode_go_drop_th(struct gfs2_glock *gl)
132{
133 gfs2_pte_inval(gl);
134 gfs2_glock_drop_th(gl);
135}
136
137/**
138 * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
139 * @gl: the glock protecting the inode
140 * @flags:
141 *
142 */
143
144static void inode_go_sync(struct gfs2_glock *gl, int flags)
145{
146 int meta = (flags & DIO_METADATA);
147 int data = (flags & DIO_DATA);
148
149 if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
150 if (meta && data) {
151 gfs2_page_sync(gl, flags | DIO_START);
152 gfs2_log_flush(gl->gl_sbd, gl);
153 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
154 gfs2_page_sync(gl, flags | DIO_WAIT);
155 clear_bit(GLF_DIRTY, &gl->gl_flags);
156 } else if (meta) {
157 gfs2_log_flush(gl->gl_sbd, gl);
158 gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT);
159 } else if (data)
160 gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT);
161 if (flags & DIO_RELEASE)
162 gfs2_ail_empty_gl(gl);
163 }
164
165 clear_bit(GLF_SYNC, &gl->gl_flags);
166}
167
168/**
169 * inode_go_inval - prepare an inode glock to be released
170 * @gl: the glock
171 * @flags:
172 *
173 */
174
175static void inode_go_inval(struct gfs2_glock *gl, int flags)
176{
177 int meta = (flags & DIO_METADATA);
178 int data = (flags & DIO_DATA);
179
180 if (meta) {
181 gfs2_meta_inval(gl);
182 gl->gl_vn++;
183 }
184 if (data)
185 gfs2_page_inval(gl);
186}
187
188/**
189 * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
190 * @gl: the glock
191 *
192 * Returns: 1 if it's ok
193 */
194
195static int inode_go_demote_ok(struct gfs2_glock *gl)
196{
197 struct gfs2_sbd *sdp = gl->gl_sbd;
198 int demote = 0;
199
200 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
201 demote = 1;
202 else if (!sdp->sd_args.ar_localcaching &&
203 time_after_eq(jiffies, gl->gl_stamp +
204 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
205 demote = 1;
206
207 return demote;
208}
209
210/**
211 * inode_go_lock - operation done after an inode lock is locked by a process
212 * @gh: the glock holder
214 *
215 * Returns: errno
216 */
217
218static int inode_go_lock(struct gfs2_holder *gh)
219{
220 struct gfs2_glock *gl = gh->gh_gl;
221 struct gfs2_inode *ip = gl->gl_object;
222 int error = 0;
223
224 if (!ip)
225 return 0;
226
227 if (ip->i_vn != gl->gl_vn) {
228 error = gfs2_inode_refresh(ip);
229 if (error)
230 return error;
231 gfs2_inode_attr_in(ip);
232 }
233
234 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
235 (gl->gl_state == LM_ST_EXCLUSIVE) &&
236 (gh->gh_flags & GL_LOCAL_EXCL))
237 error = gfs2_truncatei_resume(ip);
238
239 return error;
240}
241
242/**
243 * inode_go_unlock - operation done before an inode lock is unlocked by a
244 * process
245 * @gh: the glock holder
247 *
248 */
249
250static void inode_go_unlock(struct gfs2_holder *gh)
251{
252 struct gfs2_glock *gl = gh->gh_gl;
253 struct gfs2_inode *ip = gl->gl_object;
254
255 if (ip && test_bit(GLF_DIRTY, &gl->gl_flags))
256 gfs2_inode_attr_in(ip);
257
258 if (ip)
259 gfs2_meta_cache_flush(ip);
260}
261
262/**
263 * inode_greedy - adjust an inode's greedy period after a page fault
264 * @gl: the glock
265 *
266 */
267
268static void inode_greedy(struct gfs2_glock *gl)
269{
270 struct gfs2_sbd *sdp = gl->gl_sbd;
271 struct gfs2_inode *ip = gl->gl_object;
272 unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
273 unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
274 unsigned int new_time;
275
276 spin_lock(&ip->i_spin);
277
278 if (time_after(ip->i_last_pfault + quantum, jiffies)) {
279 new_time = ip->i_greedy + quantum;
280 if (new_time > max)
281 new_time = max;
282 } else {
283 new_time = ip->i_greedy - quantum;
284 if (!new_time || new_time > max)
285 new_time = 1;
286 }
287
288 ip->i_greedy = new_time;
289
290 spin_unlock(&ip->i_spin);
291
292 gfs2_inode_put(ip);
293}
294
295/**
296 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
297 * @gl: the glock
298 *
299 * Returns: 1 if it's ok
300 */
301
302static int rgrp_go_demote_ok(struct gfs2_glock *gl)
303{
304 return !gl->gl_aspace->i_mapping->nrpages;
305}
306
307/**
308 * rgrp_go_lock - operation done after an rgrp lock is locked by
309 * a first holder on this node.
310 * @gh: the glock holder
312 *
313 * Returns: errno
314 */
315
316static int rgrp_go_lock(struct gfs2_holder *gh)
317{
318 return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
319}
320
321/**
322 * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
323 * a last holder on this node.
324 * @gh: the glock holder
326 *
327 */
328
329static void rgrp_go_unlock(struct gfs2_holder *gh)
330{
331 gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
332}
333
334/**
335 * trans_go_xmote_th - promote/demote the transaction glock
336 * @gl: the glock
337 * @state: the requested state
338 * @flags:
339 *
340 */
341
342static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
343 int flags)
344{
345 struct gfs2_sbd *sdp = gl->gl_sbd;
346
347 if (gl->gl_state != LM_ST_UNLOCKED &&
348 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
349 gfs2_meta_syncfs(sdp);
350 gfs2_log_shutdown(sdp);
351 }
352
353 gfs2_glock_xmote_th(gl, state, flags);
354}
355
356/**
357 * trans_go_xmote_bh - After promoting/demoting the transaction glock
358 * @gl: the glock
359 *
360 */
361
362static void trans_go_xmote_bh(struct gfs2_glock *gl)
363{
364 struct gfs2_sbd *sdp = gl->gl_sbd;
365 struct gfs2_inode *ip = sdp->sd_jdesc->jd_inode->u.generic_ip;
366 struct gfs2_glock *j_gl = ip->i_gl;
367 struct gfs2_log_header head;
368 int error;
369
370 if (gl->gl_state != LM_ST_UNLOCKED &&
371 test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
372 gfs2_meta_cache_flush(sdp->sd_jdesc->jd_inode->u.generic_ip);
373 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
374
375 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
376 if (error)
377 gfs2_consist(sdp);
378 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
379 gfs2_consist(sdp);
380
381 /* Initialize some head of the log stuff */
382 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
383 sdp->sd_log_sequence = head.lh_sequence + 1;
384 gfs2_log_pointers_init(sdp, head.lh_blkno);
385 }
386 }
387}
388
389/**
390 * trans_go_drop_th - unlock the transaction glock
391 * @gl: the glock
392 *
393 * We want to sync the device even with localcaching. Remember
394 * that localcaching journal replay only marks buffers dirty.
395 */
396
397static void trans_go_drop_th(struct gfs2_glock *gl)
398{
399 struct gfs2_sbd *sdp = gl->gl_sbd;
400
401 if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
402 gfs2_meta_syncfs(sdp);
403 gfs2_log_shutdown(sdp);
404 }
405
406 gfs2_glock_drop_th(gl);
407}
408
409/**
410 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
411 * @gl: the glock
412 *
413 * Returns: 1 if it's ok
414 */
415
416static int quota_go_demote_ok(struct gfs2_glock *gl)
417{
418 return !atomic_read(&gl->gl_lvb_count);
419}
420
421struct gfs2_glock_operations gfs2_meta_glops = {
422 .go_xmote_th = gfs2_glock_xmote_th,
423 .go_drop_th = gfs2_glock_drop_th,
424 .go_sync = meta_go_sync,
425 .go_inval = meta_go_inval,
426 .go_demote_ok = meta_go_demote_ok,
427 .go_type = LM_TYPE_META
428};
429
430struct gfs2_glock_operations gfs2_inode_glops = {
431 .go_xmote_th = inode_go_xmote_th,
432 .go_xmote_bh = inode_go_xmote_bh,
433 .go_drop_th = inode_go_drop_th,
434 .go_sync = inode_go_sync,
435 .go_inval = inode_go_inval,
436 .go_demote_ok = inode_go_demote_ok,
437 .go_lock = inode_go_lock,
438 .go_unlock = inode_go_unlock,
439 .go_greedy = inode_greedy,
440 .go_type = LM_TYPE_INODE
441};
442
443struct gfs2_glock_operations gfs2_rgrp_glops = {
444 .go_xmote_th = gfs2_glock_xmote_th,
445 .go_drop_th = gfs2_glock_drop_th,
446 .go_sync = meta_go_sync,
447 .go_inval = meta_go_inval,
448 .go_demote_ok = rgrp_go_demote_ok,
449 .go_lock = rgrp_go_lock,
450 .go_unlock = rgrp_go_unlock,
451 .go_type = LM_TYPE_RGRP
452};
453
454struct gfs2_glock_operations gfs2_trans_glops = {
455 .go_xmote_th = trans_go_xmote_th,
456 .go_xmote_bh = trans_go_xmote_bh,
457 .go_drop_th = trans_go_drop_th,
458 .go_type = LM_TYPE_NONDISK
459};
460
461struct gfs2_glock_operations gfs2_iopen_glops = {
462 .go_xmote_th = gfs2_glock_xmote_th,
463 .go_drop_th = gfs2_glock_drop_th,
464 .go_callback = gfs2_iopen_go_callback,
465 .go_type = LM_TYPE_IOPEN
466};
467
468struct gfs2_glock_operations gfs2_flock_glops = {
469 .go_xmote_th = gfs2_glock_xmote_th,
470 .go_drop_th = gfs2_glock_drop_th,
471 .go_type = LM_TYPE_FLOCK
472};
473
474struct gfs2_glock_operations gfs2_nondisk_glops = {
475 .go_xmote_th = gfs2_glock_xmote_th,
476 .go_drop_th = gfs2_glock_drop_th,
477 .go_type = LM_TYPE_NONDISK
478};
479
480struct gfs2_glock_operations gfs2_quota_glops = {
481 .go_xmote_th = gfs2_glock_xmote_th,
482 .go_drop_th = gfs2_glock_drop_th,
483 .go_demote_ok = quota_go_demote_ok,
484 .go_type = LM_TYPE_QUOTA
485};
486
487struct gfs2_glock_operations gfs2_journal_glops = {
488 .go_xmote_th = gfs2_glock_xmote_th,
489 .go_drop_th = gfs2_glock_drop_th,
490 .go_type = LM_TYPE_JOURNAL
491};
492
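/*
 * Illustrative sketch, not part of the original commit: a minimal
 * operations vector for a hypothetical new glock type. Only the
 * promote/demote thunks are needed; the core skips any other hook
 * that is left NULL.
 */
static struct gfs2_glock_operations example_glops = {
	.go_xmote_th = gfs2_glock_xmote_th,
	.go_drop_th = gfs2_glock_drop_th,
	.go_type = LM_TYPE_NONDISK,
};
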
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..94f2d264aa64
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __GLOPS_DOT_H__
11#define __GLOPS_DOT_H__
12
13extern struct gfs2_glock_operations gfs2_meta_glops;
14extern struct gfs2_glock_operations gfs2_inode_glops;
15extern struct gfs2_glock_operations gfs2_rgrp_glops;
16extern struct gfs2_glock_operations gfs2_trans_glops;
17extern struct gfs2_glock_operations gfs2_iopen_glops;
18extern struct gfs2_glock_operations gfs2_flock_glops;
19extern struct gfs2_glock_operations gfs2_nondisk_glops;
20extern struct gfs2_glock_operations gfs2_quota_glops;
21extern struct gfs2_glock_operations gfs2_journal_glops;
22
23#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..761f00153d43
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,680 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INCORE_DOT_H__
11#define __INCORE_DOT_H__
12
13#define DIO_FORCE 0x00000001
14#define DIO_CLEAN 0x00000002
15#define DIO_DIRTY 0x00000004
16#define DIO_START 0x00000008
17#define DIO_WAIT 0x00000010
18#define DIO_METADATA 0x00000020
19#define DIO_DATA 0x00000040
20#define DIO_RELEASE 0x00000080
21#define DIO_ALL 0x00000100
22
23struct gfs2_log_operations;
24struct gfs2_log_element;
25struct gfs2_bitmap;
26struct gfs2_rgrpd;
27struct gfs2_bufdata;
28struct gfs2_glock_operations;
29struct gfs2_holder;
30struct gfs2_glock;
31struct gfs2_alloc;
32struct gfs2_inode;
33struct gfs2_file;
34struct gfs2_revoke;
35struct gfs2_revoke_replay;
36struct gfs2_unlinked;
37struct gfs2_quota_data;
38struct gfs2_log_buf;
39struct gfs2_trans;
40struct gfs2_ail;
41struct gfs2_jdesc;
42struct gfs2_args;
43struct gfs2_tune;
44struct gfs2_gl_hash_bucket;
45struct gfs2_sbd;
46
47typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
48
49/*
50 * Structure of operations that are associated with each
51 * type of element in the log.
52 */
53
54struct gfs2_log_operations {
55 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
56 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
57 void (*lo_before_commit) (struct gfs2_sbd *sdp);
58 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
59 void (*lo_before_scan) (struct gfs2_jdesc *jd,
60 struct gfs2_log_header *head, int pass);
61 int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
62 struct gfs2_log_descriptor *ld, __be64 *ptr,
63 int pass);
64 void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
65 const char *lo_name;
66};
67
68struct gfs2_log_element {
69 struct list_head le_list;
70 const struct gfs2_log_operations *le_ops;
71};
72
73struct gfs2_bitmap {
74 struct buffer_head *bi_bh;
75 char *bi_clone;
76 uint32_t bi_offset;
77 uint32_t bi_start;
78 uint32_t bi_len;
79};
80
81struct gfs2_rgrpd {
82 struct list_head rd_list; /* Link with superblock */
83 struct list_head rd_list_mru;
84 struct list_head rd_recent; /* Recently used rgrps */
85 struct gfs2_glock *rd_gl; /* Glock for this rgrp */
86 struct gfs2_rindex rd_ri;
87 struct gfs2_rgrp rd_rg;
88 uint64_t rd_rg_vn;
89 struct gfs2_bitmap *rd_bits;
90 unsigned int rd_bh_count;
91 struct mutex rd_mutex;
92 uint32_t rd_free_clone;
93 struct gfs2_log_element rd_le;
94 uint32_t rd_last_alloc_data;
95 uint32_t rd_last_alloc_meta;
96 struct gfs2_sbd *rd_sbd;
97};
98
99enum gfs2_state_bits {
100 BH_Pinned = BH_PrivateStart,
101 BH_Escaped = BH_PrivateStart + 1,
102};
103
104BUFFER_FNS(Pinned, pinned)
105TAS_BUFFER_FNS(Pinned, pinned)
106BUFFER_FNS(Escaped, escaped)
107TAS_BUFFER_FNS(Escaped, escaped)
108
109struct gfs2_bufdata {
110 struct buffer_head *bd_bh;
111 struct gfs2_glock *bd_gl;
112
113 struct list_head bd_list_tr;
114 struct gfs2_log_element bd_le;
115
116 struct gfs2_ail *bd_ail;
117 struct list_head bd_ail_st_list;
118 struct list_head bd_ail_gl_list;
119};
120
121struct gfs2_glock_operations {
122 void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
123 int flags);
124 void (*go_xmote_bh) (struct gfs2_glock * gl);
125 void (*go_drop_th) (struct gfs2_glock * gl);
126 void (*go_drop_bh) (struct gfs2_glock * gl);
127 void (*go_sync) (struct gfs2_glock * gl, int flags);
128 void (*go_inval) (struct gfs2_glock * gl, int flags);
129 int (*go_demote_ok) (struct gfs2_glock * gl);
130 int (*go_lock) (struct gfs2_holder * gh);
131 void (*go_unlock) (struct gfs2_holder * gh);
132 void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
133 void (*go_greedy) (struct gfs2_glock * gl);
134 int go_type;
135};
136
137enum {
138 /* Actions */
139 HIF_MUTEX = 0,
140 HIF_PROMOTE = 1,
141 HIF_DEMOTE = 2,
142 HIF_GREEDY = 3,
143
144 /* States */
145 HIF_ALLOCED = 4,
146 HIF_DEALLOC = 5,
147 HIF_HOLDER = 6,
148 HIF_FIRST = 7,
149 HIF_RECURSE = 8,
150 HIF_ABORTED = 9,
151};
152
153struct gfs2_holder {
154 struct list_head gh_list;
155
156 struct gfs2_glock *gh_gl;
157 struct task_struct *gh_owner;
158 unsigned int gh_state;
159 unsigned gh_flags;
160
161 int gh_error;
162 unsigned long gh_iflags;
163 struct completion gh_wait;
164 unsigned long gh_ip;
165};
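/*
 * Illustrative usage (mirroring patterns used throughout this patch):
 * a holder is normally stack-allocated, then acquired and released as
 *
 *	struct gfs2_holder gh;
 *	int error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
 *	if (!error) {
 *		... locked section ...
 *		gfs2_glock_dq_uninit(&gh);
 *	}
 */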
166
167enum {
168 GLF_PLUG = 0,
169 GLF_LOCK = 1,
170 GLF_STICKY = 2,
171 GLF_PREFETCH = 3,
172 GLF_SYNC = 4,
173 GLF_DIRTY = 5,
174 GLF_SKIP_WAITERS2 = 6,
175 GLF_GREEDY = 7,
176};
177
178struct gfs2_glock {
179 struct list_head gl_list;
180 unsigned long gl_flags; /* GLF_... */
181 struct lm_lockname gl_name;
182 struct kref gl_ref;
183
184 spinlock_t gl_spin;
185
186 unsigned int gl_state;
187 struct list_head gl_holders;
188 struct list_head gl_waiters1; /* HIF_MUTEX */
189 struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
190 struct list_head gl_waiters3; /* HIF_PROMOTE */
191
192 struct gfs2_glock_operations *gl_ops;
193
194 struct gfs2_holder *gl_req_gh;
195 gfs2_glop_bh_t gl_req_bh;
196
197 lm_lock_t *gl_lock;
198 char *gl_lvb;
199 atomic_t gl_lvb_count;
200
201 uint64_t gl_vn;
202 unsigned long gl_stamp;
203 void *gl_object;
204
205 struct gfs2_gl_hash_bucket *gl_bucket;
206 struct list_head gl_reclaim;
207
208 struct gfs2_sbd *gl_sbd;
209
210 struct inode *gl_aspace;
211 struct gfs2_log_element gl_le;
212 struct list_head gl_ail_list;
213 atomic_t gl_ail_count;
214};
215
216struct gfs2_alloc {
217 /* Quota stuff */
218
219 unsigned int al_qd_num;
220 struct gfs2_quota_data *al_qd[4];
221 struct gfs2_holder al_qd_ghs[4];
222
223 /* Filled in by the caller to gfs2_inplace_reserve() */
224
225 uint32_t al_requested;
226
227 /* Filled in by gfs2_inplace_reserve() */
228
229 char *al_file;
230 unsigned int al_line;
231 struct gfs2_holder al_ri_gh;
232 struct gfs2_holder al_rgd_gh;
233 struct gfs2_rgrpd *al_rgd;
234
235 /* Filled in by gfs2_alloc_*() */
236
237 uint32_t al_alloced;
238};
239
240enum {
241 GIF_MIN_INIT = 0,
242 GIF_QD_LOCKED = 1,
243 GIF_PAGED = 2,
244 GIF_SW_PAGED = 3,
245};
246
247struct gfs2_inode {
248 struct gfs2_inum i_num;
249
250 atomic_t i_count;
251 unsigned long i_flags; /* GIF_... */
252
253 uint64_t i_vn;
254 struct gfs2_dinode i_di;
255
256 struct gfs2_glock *i_gl;
257 struct gfs2_sbd *i_sbd;
258 struct inode *i_vnode;
259
260 struct gfs2_holder i_iopen_gh;
261 struct gfs2_holder i_gh; /* for prepare/commit_write only */
262 struct gfs2_alloc i_alloc;
263 uint64_t i_last_rg_alloc;
264
265 spinlock_t i_spin;
266 struct rw_semaphore i_rw_mutex;
267
268 unsigned int i_greedy;
269 unsigned long i_last_pfault;
270
271 struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
272};
273
274enum {
275 GFF_DID_DIRECT_ALLOC = 0,
276};
277
278struct gfs2_file {
279 unsigned long f_flags; /* GFF_... */
280 struct mutex f_fl_mutex;
281 struct gfs2_holder f_fl_gh;
282};
283
284struct gfs2_revoke {
285 struct gfs2_log_element rv_le;
286 uint64_t rv_blkno;
287};
288
289struct gfs2_revoke_replay {
290 struct list_head rr_list;
291 uint64_t rr_blkno;
292 unsigned int rr_where;
293};
294
295enum {
296 ULF_LOCKED = 0,
297};
298
299struct gfs2_unlinked {
300 struct list_head ul_list;
301 unsigned int ul_count;
302 struct gfs2_unlinked_tag ul_ut;
303 unsigned long ul_flags; /* ULF_... */
304 unsigned int ul_slot;
305};
306
307enum {
308 QDF_USER = 0,
309 QDF_CHANGE = 1,
310 QDF_LOCKED = 2,
311};
312
313struct gfs2_quota_lvb {
314 uint32_t qb_magic;
315 uint32_t __pad;
316 uint64_t qb_limit; /* Hard limit of # blocks to alloc */
317 uint64_t qb_warn; /* Warn user when alloc is above this # */
318 int64_t qb_value; /* Current # blocks allocated */
319};
320
321struct gfs2_quota_data {
322 struct list_head qd_list;
323 unsigned int qd_count;
324
325 uint32_t qd_id;
326 unsigned long qd_flags; /* QDF_... */
327
328 int64_t qd_change;
329 int64_t qd_change_sync;
330
331 unsigned int qd_slot;
332 unsigned int qd_slot_count;
333
334 struct buffer_head *qd_bh;
335 struct gfs2_quota_change *qd_bh_qc;
336 unsigned int qd_bh_count;
337
338 struct gfs2_glock *qd_gl;
339 struct gfs2_quota_lvb qd_qb;
340
341 uint64_t qd_sync_gen;
342 unsigned long qd_last_warn;
343 unsigned long qd_last_touched;
344};
345
346struct gfs2_log_buf {
347 struct list_head lb_list;
348 struct buffer_head *lb_bh;
349 struct buffer_head *lb_real;
350};
351
352struct gfs2_trans {
353 unsigned long tr_ip;
354
355 unsigned int tr_blocks;
356 unsigned int tr_revokes;
357 unsigned int tr_reserved;
358
359 struct gfs2_holder tr_t_gh;
360
361 int tr_touched;
362
363 unsigned int tr_num_buf;
364 unsigned int tr_num_buf_new;
365 unsigned int tr_num_buf_rm;
366 struct list_head tr_list_buf;
367
368 unsigned int tr_num_revoke;
369 unsigned int tr_num_revoke_rm;
370};
371
372struct gfs2_ail {
373 struct list_head ai_list;
374
375 unsigned int ai_first;
376 struct list_head ai_ail1_list;
377 struct list_head ai_ail2_list;
378
379 uint64_t ai_sync_gen;
380};
381
382struct gfs2_jdesc {
383 struct list_head jd_list;
384
385 struct inode *jd_inode;
386 unsigned int jd_jid;
387 int jd_dirty;
388
389 unsigned int jd_blocks;
390};
391
392#define GFS2_GLOCKD_DEFAULT 1
393#define GFS2_GLOCKD_MAX 16
394
395#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
396#define GFS2_QUOTA_OFF 0
397#define GFS2_QUOTA_ACCOUNT 1
398#define GFS2_QUOTA_ON 2
399
400#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
401#define GFS2_DATA_WRITEBACK 1
402#define GFS2_DATA_ORDERED 2
403
404struct gfs2_args {
405 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
406 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
407 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
408 int ar_spectator; /* Don't get a journal because we're always RO */
409 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
410 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
411 int ar_localcaching; /* Local-style caching (dangerous on multihost) */
412 int ar_debug; /* Oops on errors instead of trying to be graceful */
413 int ar_upgrade; /* Upgrade ondisk/multihost format */
414 unsigned int ar_num_glockd; /* Number of glockd threads */
415 int ar_posix_acl; /* Enable posix acls */
416 int ar_quota; /* off/account/on */
417 int ar_suiddir; /* suiddir support */
418 int ar_data; /* ordered/writeback */
419};
420
421struct gfs2_tune {
422 spinlock_t gt_spin;
423
424 unsigned int gt_ilimit;
425 unsigned int gt_ilimit_tries;
426 unsigned int gt_ilimit_min;
427 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
428 unsigned int gt_incore_log_blocks;
429 unsigned int gt_log_flush_secs;
430 unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
431
432 unsigned int gt_scand_secs;
433 unsigned int gt_recoverd_secs;
434 unsigned int gt_logd_secs;
435 unsigned int gt_quotad_secs;
436 unsigned int gt_inoded_secs;
437
438 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
439 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
440 unsigned int gt_quota_scale_num; /* Numerator */
441 unsigned int gt_quota_scale_den; /* Denominator */
442 unsigned int gt_quota_cache_secs;
443 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
444 unsigned int gt_atime_quantum; /* Min secs between atime updates */
445 unsigned int gt_new_files_jdata;
446 unsigned int gt_new_files_directio;
447 unsigned int gt_max_atomic_write; /* Split big writes into this size */
448 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
449 unsigned int gt_lockdump_size;
450 unsigned int gt_stall_secs; /* Detects trouble! */
451 unsigned int gt_complain_secs;
452 unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
453 unsigned int gt_entries_per_readdir;
454 unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
455 unsigned int gt_greedy_default;
456 unsigned int gt_greedy_quantum;
457 unsigned int gt_greedy_max;
458 unsigned int gt_statfs_quantum;
459 unsigned int gt_statfs_slow;
460};
461
462struct gfs2_gl_hash_bucket {
463 rwlock_t hb_lock;
464 struct list_head hb_list;
465};
466
467enum {
468 SDF_JOURNAL_CHECKED = 0,
469 SDF_JOURNAL_LIVE = 1,
470 SDF_SHUTDOWN = 2,
471 SDF_NOATIME = 3,
472};
473
474#define GFS2_GL_HASH_SHIFT 13
475#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
476#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
477#define GFS2_FSNAME_LEN 256
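/*
 * Editorial sketch (gl_hash() is an assumed helper name): a lock name
 * maps onto one of the GFS2_GL_HASH_SIZE buckets by masking its hash,
 * along the lines of
 *
 *	bucket = &sdp->sd_gl_hash[gl_hash(&name) & GFS2_GL_HASH_MASK];
 */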
478
479struct gfs2_sbd {
480 struct super_block *sd_vfs;
481 struct kobject sd_kobj;
482 unsigned long sd_flags; /* SDF_... */
483 struct gfs2_sb sd_sb;
484
485 /* Constants computed on mount */
486
487 uint32_t sd_fsb2bb;
488 uint32_t sd_fsb2bb_shift;
489 uint32_t sd_diptrs; /* Number of pointers in a dinode */
490 uint32_t sd_inptrs; /* Number of pointers in an indirect block */
491 uint32_t sd_jbsize; /* Size of a journaled data block */
492 uint32_t sd_hash_bsize; /* sizeof(exhash block) */
493 uint32_t sd_hash_bsize_shift;
494 uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */
495 uint32_t sd_ut_per_block;
496 uint32_t sd_qc_per_block;
497 uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */
498 uint32_t sd_max_height; /* Max height of a file's metadata tree */
499 uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT];
500 uint32_t sd_max_jheight; /* Max height of journaled file's meta tree */
501 uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT];
502
503 struct gfs2_args sd_args; /* Mount arguments */
504 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
505
506 /* Lock Stuff */
507
508 struct lm_lockstruct sd_lockstruct;
509 struct gfs2_gl_hash_bucket sd_gl_hash[GFS2_GL_HASH_SIZE];
510 struct list_head sd_reclaim_list;
511 spinlock_t sd_reclaim_lock;
512 wait_queue_head_t sd_reclaim_wq;
513 atomic_t sd_reclaim_count;
514 struct gfs2_holder sd_live_gh;
515 struct gfs2_glock *sd_rename_gl;
516 struct gfs2_glock *sd_trans_gl;
517 struct mutex sd_invalidate_inodes_mutex;
518
519 /* Inode Stuff */
520
521 struct inode *sd_master_dir;
522 struct inode *sd_jindex;
523 struct inode *sd_inum_inode;
524 struct inode *sd_statfs_inode;
525 struct inode *sd_ir_inode;
526 struct inode *sd_sc_inode;
527 struct inode *sd_ut_inode;
528 struct inode *sd_qc_inode;
529 struct inode *sd_rindex;
530 struct inode *sd_quota_inode;
531
532 /* Inum stuff */
533
534 struct mutex sd_inum_mutex;
535
536 /* StatFS stuff */
537
538 spinlock_t sd_statfs_spin;
539 struct mutex sd_statfs_mutex;
540 struct gfs2_statfs_change sd_statfs_master;
541 struct gfs2_statfs_change sd_statfs_local;
542 unsigned long sd_statfs_sync_time;
543
544 /* Resource group stuff */
545
546 uint64_t sd_rindex_vn;
547 spinlock_t sd_rindex_spin;
548 struct mutex sd_rindex_mutex;
549 struct list_head sd_rindex_list;
550 struct list_head sd_rindex_mru_list;
551 struct list_head sd_rindex_recent_list;
552 struct gfs2_rgrpd *sd_rindex_forward;
553 unsigned int sd_rgrps;
554
555 /* Journal index stuff */
556
557 struct list_head sd_jindex_list;
558 spinlock_t sd_jindex_spin;
559 struct mutex sd_jindex_mutex;
560 unsigned int sd_journals;
561 unsigned long sd_jindex_refresh_time;
562
563 struct gfs2_jdesc *sd_jdesc;
564 struct gfs2_holder sd_journal_gh;
565 struct gfs2_holder sd_jinode_gh;
566
567 struct gfs2_holder sd_ir_gh;
568 struct gfs2_holder sd_sc_gh;
569 struct gfs2_holder sd_ut_gh;
570 struct gfs2_holder sd_qc_gh;
571
572 /* Daemon stuff */
573
574 struct task_struct *sd_scand_process;
575 struct task_struct *sd_recoverd_process;
576 struct task_struct *sd_logd_process;
577 struct task_struct *sd_quotad_process;
578 struct task_struct *sd_inoded_process;
579 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
580 unsigned int sd_glockd_num;
581
582 /* Unlinked inode stuff */
583
584 struct list_head sd_unlinked_list;
585 atomic_t sd_unlinked_count;
586 spinlock_t sd_unlinked_spin;
587 struct mutex sd_unlinked_mutex;
588
589 unsigned int sd_unlinked_slots;
590 unsigned int sd_unlinked_chunks;
591 unsigned char **sd_unlinked_bitmap;
592
593 /* Quota stuff */
594
595 struct list_head sd_quota_list;
596 atomic_t sd_quota_count;
597 spinlock_t sd_quota_spin;
598 struct mutex sd_quota_mutex;
599
600 unsigned int sd_quota_slots;
601 unsigned int sd_quota_chunks;
602 unsigned char **sd_quota_bitmap;
603
604 uint64_t sd_quota_sync_gen;
605 unsigned long sd_quota_sync_time;
606
607 /* Log stuff */
608
609 spinlock_t sd_log_lock;
610
611 unsigned int sd_log_blks_reserved;
612 unsigned int sd_log_commited_buf;
613 unsigned int sd_log_commited_revoke;
614
615 unsigned int sd_log_num_gl;
616 unsigned int sd_log_num_buf;
617 unsigned int sd_log_num_revoke;
618 unsigned int sd_log_num_rg;
619 unsigned int sd_log_num_databuf;
620 unsigned int sd_log_num_jdata;
621 unsigned int sd_log_num_hdrs;
622
623 struct list_head sd_log_le_gl;
624 struct list_head sd_log_le_buf;
625 struct list_head sd_log_le_revoke;
626 struct list_head sd_log_le_rg;
627 struct list_head sd_log_le_databuf;
628
629 unsigned int sd_log_blks_free;
630 struct mutex sd_log_reserve_mutex;
631
632 uint64_t sd_log_sequence;
633 unsigned int sd_log_head;
634 unsigned int sd_log_tail;
635 int sd_log_idle;
636
637 unsigned long sd_log_flush_time;
638 struct rw_semaphore sd_log_flush_lock;
639 struct list_head sd_log_flush_list;
640
641 unsigned int sd_log_flush_head;
642 uint64_t sd_log_flush_wrapped;
643
644 struct list_head sd_ail1_list;
645 struct list_head sd_ail2_list;
646 uint64_t sd_ail_sync_gen;
647
648 /* Replay stuff */
649
650 struct list_head sd_revoke_list;
651 unsigned int sd_replay_tail;
652
653 unsigned int sd_found_blocks;
654 unsigned int sd_found_revokes;
655 unsigned int sd_replayed_blocks;
656
657 /* For quiescing the filesystem */
658
659 struct gfs2_holder sd_freeze_gh;
660 struct mutex sd_freeze_lock;
661 unsigned int sd_freeze_count;
662
663 /* Counters */
664
665 atomic_t sd_glock_count;
666 atomic_t sd_glock_held_count;
667 atomic_t sd_inode_count;
668 atomic_t sd_reclaimed;
669
670 char sd_fsname[GFS2_FSNAME_LEN];
671 char sd_table_name[GFS2_FSNAME_LEN];
672 char sd_proto_name[GFS2_FSNAME_LEN];
673
674 /* Debugging crud */
675
676 unsigned long sd_last_warning;
677};
678
679#endif /* __INCORE_DOT_H__ */
680
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..fb5a4d06e926
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1854 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/posix_acl.h>
16#include <linux/sort.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/crc32.h>
19#include <asm/semaphore.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "acl.h"
25#include "bmap.h"
26#include "dir.h"
27#include "eattr.h"
28#include "glock.h"
29#include "glops.h"
30#include "inode.h"
31#include "log.h"
32#include "meta_io.h"
33#include "ops_address.h"
34#include "ops_file.h"
35#include "ops_inode.h"
36#include "quota.h"
37#include "rgrp.h"
38#include "trans.h"
39#include "unlinked.h"
40#include "util.h"
41
42/**
43 * inode_attr_in - Copy attributes from the dinode into the VFS inode
44 * @ip: The GFS2 inode (with embedded disk inode data)
45 * @inode: The Linux VFS inode
46 *
47 */
48
49static void inode_attr_in(struct gfs2_inode *ip, struct inode *inode)
50{
51 inode->i_ino = ip->i_num.no_formal_ino;
52
53 switch (ip->i_di.di_mode & S_IFMT) {
54 case S_IFBLK:
55 case S_IFCHR:
56 inode->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor);
57 break;
58 default:
59 inode->i_rdev = 0;
60 break;
61 }
62
63 inode->i_mode = ip->i_di.di_mode;
64 inode->i_nlink = ip->i_di.di_nlink;
65 inode->i_uid = ip->i_di.di_uid;
66 inode->i_gid = ip->i_di.di_gid;
67 i_size_write(inode, ip->i_di.di_size);
68 inode->i_atime.tv_sec = ip->i_di.di_atime;
69 inode->i_mtime.tv_sec = ip->i_di.di_mtime;
70 inode->i_ctime.tv_sec = ip->i_di.di_ctime;
71 inode->i_atime.tv_nsec = 0;
72 inode->i_mtime.tv_nsec = 0;
73 inode->i_ctime.tv_nsec = 0;
74 inode->i_blksize = PAGE_SIZE;
75 inode->i_blocks = ip->i_di.di_blocks <<
76 (ip->i_sbd->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
77
78 if (ip->i_di.di_flags & GFS2_DIF_IMMUTABLE)
79 inode->i_flags |= S_IMMUTABLE;
80 else
81 inode->i_flags &= ~S_IMMUTABLE;
82
83 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY)
84 inode->i_flags |= S_APPEND;
85 else
86 inode->i_flags &= ~S_APPEND;
87}
88
89/**
90 * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
91 * @ip: The GFS2 inode (with embedded disk inode data)
92 *
93 */
94
95void gfs2_inode_attr_in(struct gfs2_inode *ip)
96{
97 struct inode *inode;
98
99 inode = gfs2_ip2v_lookup(ip);
100 if (inode) {
101 inode_attr_in(ip, inode);
102 iput(inode);
103 }
104}
105
106/**
107 * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
108 * @ip: The GFS2 inode
109 *
110 * Only copy out the attributes that we want the VFS layer
111 * to be able to modify.
112 */
113
114void gfs2_inode_attr_out(struct gfs2_inode *ip)
115{
116 struct inode *inode = ip->i_vnode;
117
118 gfs2_assert_withdraw(ip->i_sbd,
119 (ip->i_di.di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
120 ip->i_di.di_mode = inode->i_mode;
121 ip->i_di.di_uid = inode->i_uid;
122 ip->i_di.di_gid = inode->i_gid;
123 ip->i_di.di_atime = inode->i_atime.tv_sec;
124 ip->i_di.di_mtime = inode->i_mtime.tv_sec;
125 ip->i_di.di_ctime = inode->i_ctime.tv_sec;
126}
127
128/**
129 * gfs2_ip2v_lookup - Get the struct inode for a struct gfs2_inode
130 * @ip: the struct gfs2_inode to get the struct inode for
131 *
132 * Returns: A VFS inode, or NULL if none
133 */
134
135struct inode *gfs2_ip2v_lookup(struct gfs2_inode *ip)
136{
137 struct inode *inode = NULL;
138
139 gfs2_assert_warn(ip->i_sbd, test_bit(GIF_MIN_INIT, &ip->i_flags));
140
141 spin_lock(&ip->i_spin);
142 if (ip->i_vnode)
143 inode = igrab(ip->i_vnode);
144 spin_unlock(&ip->i_spin);
145
146 return inode;
147}
148
149/**
150 * gfs2_ip2v - Get/Create a struct inode for a struct gfs2_inode
151 * @ip: the struct gfs2_inode to get the struct inode for
152 *
153 * Returns: A VFS inode, or NULL if no mem
154 */
155
156struct inode *gfs2_ip2v(struct gfs2_inode *ip)
157{
158 struct inode *inode, *tmp;
159
160 inode = gfs2_ip2v_lookup(ip);
161 if (inode)
162 return inode;
163
164 tmp = new_inode(ip->i_sbd->sd_vfs);
165 if (!tmp)
166 return NULL;
167
168 inode_attr_in(ip, tmp);
169
170 if (S_ISREG(ip->i_di.di_mode)) {
171 tmp->i_op = &gfs2_file_iops;
172 tmp->i_fop = &gfs2_file_fops;
173 tmp->i_mapping->a_ops = &gfs2_file_aops;
174 } else if (S_ISDIR(ip->i_di.di_mode)) {
175 tmp->i_op = &gfs2_dir_iops;
176 tmp->i_fop = &gfs2_dir_fops;
177 } else if (S_ISLNK(ip->i_di.di_mode)) {
178 tmp->i_op = &gfs2_symlink_iops;
179 } else {
180 tmp->i_op = &gfs2_dev_iops;
181 init_special_inode(tmp, tmp->i_mode, tmp->i_rdev);
182 }
183
184 tmp->u.generic_ip = NULL;
185
186 for (;;) {
187 spin_lock(&ip->i_spin);
188 if (!ip->i_vnode)
189 break;
190 inode = igrab(ip->i_vnode);
191 spin_unlock(&ip->i_spin);
192
193 if (inode) {
194 iput(tmp);
195 return inode;
196 }
197 yield();
198 }
199
200 inode = tmp;
201
202 gfs2_inode_hold(ip);
203 ip->i_vnode = inode;
204 inode->u.generic_ip = ip;
205
206 spin_unlock(&ip->i_spin);
207
208 insert_inode_hash(inode);
209
210 return inode;
211}
212
213static int iget_test(struct inode *inode, void *opaque)
214{
215 struct gfs2_inode *ip = inode->u.generic_ip;
216 struct gfs2_inum *inum = (struct gfs2_inum *)opaque;
217
218 if (ip && ip->i_num.no_addr == inum->no_addr)
219 return 1;
220
221 return 0;
222}
223
224struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
225{
226 return ilookup5(sb, (unsigned long)inum->no_formal_ino,
227 iget_test, inum);
228}
229
230void gfs2_inode_min_init(struct gfs2_inode *ip, unsigned int type)
231{
232 if (!test_and_set_bit(GIF_MIN_INIT, &ip->i_flags)) {
233 ip->i_di.di_nlink = 1;
234 ip->i_di.di_mode = DT2IF(type);
235 }
236}
237
238/**
239 * gfs2_inode_refresh - Refresh the incore copy of the dinode
240 * @ip: The GFS2 inode
241 *
242 * Returns: errno
243 */
244
245int gfs2_inode_refresh(struct gfs2_inode *ip)
246{
247 struct buffer_head *dibh;
248 int error;
249
250 error = gfs2_meta_inode_buffer(ip, &dibh);
251 if (error)
252 return error;
253
254 if (gfs2_metatype_check(ip->i_sbd, dibh, GFS2_METATYPE_DI)) {
255 brelse(dibh);
256 return -EIO;
257 }
258
259 gfs2_dinode_in(&ip->i_di, dibh->b_data);
260 set_bit(GIF_MIN_INIT, &ip->i_flags);
261
262 brelse(dibh);
263
264 if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
265 if (gfs2_consist_inode(ip))
266 gfs2_dinode_print(&ip->i_di);
267 return -EIO;
268 }
269 if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
270 return -ESTALE;
271
272 ip->i_vn = ip->i_gl->gl_vn;
273
274 return 0;
275}
276
277/**
278 * inode_create - create a struct gfs2_inode
279 * @i_gl: The glock covering the inode
280 * @inum: The inode number
281 * @io_gl: the iopen glock to acquire/hold (using holder in new gfs2_inode)
282 * @io_state: the state the iopen glock should be acquired in
283 * @ipp: pointer to put the returned inode in
284 *
285 * Returns: errno
286 */
287
288static int inode_create(struct gfs2_glock *i_gl, const struct gfs2_inum *inum,
289 struct gfs2_glock *io_gl, unsigned int io_state,
290 struct gfs2_inode **ipp)
291{
292 struct gfs2_sbd *sdp = i_gl->gl_sbd;
293 struct gfs2_inode *ip;
294 int error = 0;
295
296 ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
297 if (!ip)
298 return -ENOMEM;
299 memset(ip, 0, sizeof(struct gfs2_inode));
300
301 ip->i_num = *inum;
302
303 atomic_set(&ip->i_count, 1);
304
305 ip->i_vn = i_gl->gl_vn - 1;
306
307 ip->i_gl = i_gl;
308 ip->i_sbd = sdp;
309
310 spin_lock_init(&ip->i_spin);
311 init_rwsem(&ip->i_rw_mutex);
312
313 ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
314
315 error = gfs2_glock_nq_init(io_gl,
316 io_state, GL_LOCAL_EXCL | GL_EXACT,
317 &ip->i_iopen_gh);
318 if (error)
319 goto fail;
320 ip->i_iopen_gh.gh_owner = NULL;
321
322 spin_lock(&io_gl->gl_spin);
323 gfs2_glock_hold(i_gl);
324 io_gl->gl_object = i_gl;
325 spin_unlock(&io_gl->gl_spin);
326
327 gfs2_glock_hold(i_gl);
328 i_gl->gl_object = ip;
329
330 atomic_inc(&sdp->sd_inode_count);
331
332 *ipp = ip;
333
334 return 0;
335
336 fail:
337 gfs2_meta_cache_flush(ip);
338 kmem_cache_free(gfs2_inode_cachep, ip);
339 *ipp = NULL;
340
341 return error;
342}
343
344/**
345 * gfs2_inode_get - Create or get a reference on an inode
346 * @i_gl: The glock covering the inode
347 * @inum: The inode number
348 * @create: Non-zero to create the incore inode if it doesn't already exist
349 * @ipp: pointer to put the returned inode in
350 *
351 * Returns: errno
352 */
353
354int gfs2_inode_get(struct gfs2_glock *i_gl, const struct gfs2_inum *inum,
355 int create, struct gfs2_inode **ipp)
356{
357 struct gfs2_sbd *sdp = i_gl->gl_sbd;
358 struct gfs2_glock *io_gl;
359 int error = 0;
360
361 gfs2_glmutex_lock(i_gl);
362
363 *ipp = i_gl->gl_object;
364 if (*ipp) {
365 error = -ESTALE;
366 if ((*ipp)->i_num.no_formal_ino != inum->no_formal_ino)
367 goto out;
368 atomic_inc(&(*ipp)->i_count);
369 error = 0;
370 goto out;
371 }
372
373 if (!create)
374 goto out;
375
376 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops,
377 CREATE, &io_gl);
378 if (!error) {
379 error = inode_create(i_gl, inum, io_gl, LM_ST_SHARED, ipp);
380 gfs2_glock_put(io_gl);
381 }
382
383 out:
384 gfs2_glmutex_unlock(i_gl);
385
386 return error;
387}
388
389void gfs2_inode_hold(struct gfs2_inode *ip)
390{
391 gfs2_assert(ip->i_sbd, atomic_read(&ip->i_count) > 0);
392 atomic_inc(&ip->i_count);
393}
394
395void gfs2_inode_put(struct gfs2_inode *ip)
396{
397 gfs2_assert(ip->i_sbd, atomic_read(&ip->i_count) > 0);
398 atomic_dec(&ip->i_count);
399}
400
401void gfs2_inode_destroy(struct gfs2_inode *ip)
402{
403 struct gfs2_sbd *sdp = ip->i_sbd;
404 struct gfs2_glock *io_gl = ip->i_iopen_gh.gh_gl;
405 struct gfs2_glock *i_gl = ip->i_gl;
406
407 gfs2_assert_warn(sdp, !atomic_read(&ip->i_count));
408 gfs2_assert(sdp, io_gl->gl_object == i_gl);
409
410 spin_lock(&io_gl->gl_spin);
411 io_gl->gl_object = NULL;
412 spin_unlock(&io_gl->gl_spin);
413 gfs2_glock_put(i_gl);
414
415 gfs2_glock_dq_uninit(&ip->i_iopen_gh);
416
417 gfs2_meta_cache_flush(ip);
418 kmem_cache_free(gfs2_inode_cachep, ip);
419
420 i_gl->gl_object = NULL;
421 gfs2_glock_put(i_gl);
422
423 atomic_dec(&sdp->sd_inode_count);
424}
425
426static int dinode_dealloc(struct gfs2_inode *ip, struct gfs2_unlinked *ul)
427{
428 struct gfs2_sbd *sdp = ip->i_sbd;
429 struct gfs2_alloc *al;
430 struct gfs2_rgrpd *rgd;
431 int error;
432
433 if (ip->i_di.di_blocks != 1) {
434 if (gfs2_consist_inode(ip))
435 gfs2_dinode_print(&ip->i_di);
436 return -EIO;
437 }
438
439 al = gfs2_alloc_get(ip);
440
441 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
442 if (error)
443 goto out;
444
445 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
446 if (error)
447 goto out_qs;
448
449 rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
450 if (!rgd) {
451 gfs2_consist_inode(ip);
452 error = -EIO;
453 goto out_rindex_relse;
454 }
455
456 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
457 &al->al_rgd_gh);
458 if (error)
459 goto out_rindex_relse;
460
461 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_UNLINKED +
462 RES_STATFS + RES_QUOTA, 1);
463 if (error)
464 goto out_rg_gunlock;
465
466 gfs2_trans_add_gl(ip->i_gl);
467
468 gfs2_free_di(rgd, ip);
469
470 error = gfs2_unlinked_ondisk_rm(sdp, ul);
471
472 gfs2_trans_end(sdp);
473 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
474
475 out_rg_gunlock:
476 gfs2_glock_dq_uninit(&al->al_rgd_gh);
477
478 out_rindex_relse:
479 gfs2_glock_dq_uninit(&al->al_ri_gh);
480
481 out_qs:
482 gfs2_quota_unhold(ip);
483
484 out:
485 gfs2_alloc_put(ip);
486
487 return error;
488}
489
490/**
491 * inode_dealloc - Deallocate all on-disk blocks for an inode (dinode)
492 * @sdp: the filesystem
493 * @ul: the unlinked-inode record naming the dinode to deallocate
494 * @io_gh: a holder for the iopen glock for this inode
495 *
496 * Returns: errno (or 1 if the iopen glock was busy)
497 */
498
499static int inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul,
500 struct gfs2_holder *io_gh)
501{
502 struct gfs2_inode *ip;
503 struct gfs2_holder i_gh;
504 int error;
505
506 error = gfs2_glock_nq_num(sdp,
507 ul->ul_ut.ut_inum.no_addr, &gfs2_inode_glops,
508 LM_ST_EXCLUSIVE, 0, &i_gh);
509 if (error)
510 return error;
511
512 /* We reacquire the iopen lock here to avoid a race with the NFS server
513 calling gfs2_read_inode() with the inode number of an inode we're in
514 the process of deallocating. And we can't keep our hold on the lock
515 from inode_dealloc_init() for deadlock reasons. */
516
517 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY, io_gh);
518 error = gfs2_glock_nq(io_gh);
519 switch (error) {
520 case 0:
521 break;
522 case GLR_TRYFAILED:
523 error = 1;
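		/* fall through: a failed try-lock is reported as 1, not an error */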
524 default:
525 goto out;
526 }
527
528 gfs2_assert_warn(sdp, !i_gh.gh_gl->gl_object);
529 error = inode_create(i_gh.gh_gl, &ul->ul_ut.ut_inum, io_gh->gh_gl,
530 LM_ST_EXCLUSIVE, &ip);
531
532 gfs2_glock_dq(io_gh);
533
534 if (error)
535 goto out;
536
537 error = gfs2_inode_refresh(ip);
538 if (error)
539 goto out_iput;
540
541 if (ip->i_di.di_nlink) {
542 if (gfs2_consist_inode(ip))
543 gfs2_dinode_print(&ip->i_di);
544 error = -EIO;
545 goto out_iput;
546 }
547
548 if (S_ISDIR(ip->i_di.di_mode) &&
549 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
550 error = gfs2_dir_exhash_dealloc(ip);
551 if (error)
552 goto out_iput;
553 }
554
555 if (ip->i_di.di_eattr) {
556 error = gfs2_ea_dealloc(ip);
557 if (error)
558 goto out_iput;
559 }
560
561 if (!gfs2_is_stuffed(ip)) {
562 error = gfs2_file_dealloc(ip);
563 if (error)
564 goto out_iput;
565 }
566
567 error = dinode_dealloc(ip, ul);
568 if (error)
569 goto out_iput;
570
571 out_iput:
572 gfs2_glmutex_lock(i_gh.gh_gl);
573 gfs2_inode_put(ip);
574 gfs2_inode_destroy(ip);
575 gfs2_glmutex_unlock(i_gh.gh_gl);
576
577 out:
578 gfs2_glock_dq_uninit(&i_gh);
579
580 return error;
581}
582
583/**
584 * try_inode_dealloc - Try to deallocate an inode and all its blocks
585 * @sdp: the filesystem
586 * @ul: the on-disk unlinked-inode record to process
587 * Returns: 0 on success, -errno on error, 1 on busy (inode open)
588 */
589
590static int try_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
591{
592 struct gfs2_holder io_gh;
593 int error = 0;
594
595 gfs2_try_toss_inode(sdp, &ul->ul_ut.ut_inum);
596
597 error = gfs2_glock_nq_num(sdp,
598 ul->ul_ut.ut_inum.no_addr, &gfs2_iopen_glops,
599 LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB, &io_gh);
600 switch (error) {
601 case 0:
602 break;
603 case GLR_TRYFAILED:
604 return 1;
605 default:
606 return error;
607 }
608
609 gfs2_glock_dq(&io_gh);
610 error = inode_dealloc(sdp, ul, &io_gh);
611 gfs2_holder_uninit(&io_gh);
612
613 return error;
614}
615
616static int inode_dealloc_uninit(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
617{
618 struct gfs2_rgrpd *rgd;
619 struct gfs2_holder ri_gh, rgd_gh;
620 int error;
621
622 error = gfs2_rindex_hold(sdp, &ri_gh);
623 if (error)
624 return error;
625
626 rgd = gfs2_blk2rgrpd(sdp, ul->ul_ut.ut_inum.no_addr);
627 if (!rgd) {
628 gfs2_consist(sdp);
629 error = -EIO;
630 goto out;
631 }
632
633 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rgd_gh);
634 if (error)
635 goto out;
636
637 error = gfs2_trans_begin(sdp,
638 RES_RG_BIT + RES_UNLINKED + RES_STATFS,
639 0);
640 if (error)
641 goto out_gunlock;
642
643 gfs2_free_uninit_di(rgd, ul->ul_ut.ut_inum.no_addr);
644 gfs2_unlinked_ondisk_rm(sdp, ul);
645
646 gfs2_trans_end(sdp);
647
648 out_gunlock:
649 gfs2_glock_dq_uninit(&rgd_gh);
650 out:
651 gfs2_glock_dq_uninit(&ri_gh);
652
653 return error;
654}
655
656int gfs2_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
657{
658 if (ul->ul_ut.ut_flags & GFS2_UTF_UNINIT)
659 return inode_dealloc_uninit(sdp, ul);
660 else
661 return try_inode_dealloc(sdp, ul);
662}
663
664/**
665 * gfs2_change_nlink - Change nlink count on inode
666 * @ip: The GFS2 inode
667 * @diff: The change in the nlink count required
668 *
669 * Returns: errno
670 */
671
672int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
673{
674 struct buffer_head *dibh;
675 uint32_t nlink;
676 int error;
677
678 nlink = ip->i_di.di_nlink + diff;
679
680 /* If we are reducing the nlink count, but the new value ends up being
681 bigger than the old one, we must have underflowed. */
682 if (diff < 0 && nlink > ip->i_di.di_nlink) {
683 if (gfs2_consist_inode(ip))
684 gfs2_dinode_print(&ip->i_di);
685 return -EIO;
686 }
687
688 error = gfs2_meta_inode_buffer(ip, &dibh);
689 if (error)
690 return error;
691
692 ip->i_di.di_nlink = nlink;
693 ip->i_di.di_ctime = get_seconds();
694
695 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
696 gfs2_dinode_out(&ip->i_di, dibh->b_data);
697 brelse(dibh);
698
699 return 0;
700}
701
702struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
703{
704 struct qstr qstr;
705 gfs2_str2qstr(&qstr, name);
706 return gfs2_lookupi(dip, &qstr, 1, NULL);
707}
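/*
 * Illustrative usage (an assumption, not a verbatim quote): callers use
 * this to fetch system inodes by name from a directory they already
 * hold a reference on, e.g.
 *
 *	struct inode *inode = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 *
 * (note that gfs2_lookupi() returns NULL for -ENOENT, so a NULL check
 * may also be needed; "jindex" is an assumed example name.)
 */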
708
709
710/**
711 * gfs2_lookupi - Look up a filename in a directory and return its inode
712 * @dir: The directory to search
713 * @name: The name of the inode to look for
714 * @is_root: If 1, ignore the caller's permissions
715 * @nd: The VFS nameidata from the lookup (may be NULL)
716 *
717 * There will always be a vnode (Linux VFS inode) for @dir unless
718 * @is_root is true.
719 *
720 * Returns: errno
721 */
722
723struct inode *gfs2_lookupi(struct inode *dir, struct qstr *name, int is_root,
724 struct nameidata *nd)
725
726{
727 struct super_block *sb = dir->i_sb;
728 struct gfs2_inode *ipp;
729 struct gfs2_inode *dip = dir->u.generic_ip;
730 struct gfs2_sbd *sdp = dip->i_sbd;
731 struct gfs2_holder d_gh;
732 struct gfs2_inum inum;
733 unsigned int type;
734 struct gfs2_glock *gl;
735 int error = 0;
736 struct inode *inode = NULL;
737
738 if (!name->len || name->len > GFS2_FNAMESIZE)
739 return ERR_PTR(-ENAMETOOLONG);
740
741 if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
742 (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
743 dir == sb->s_root->d_inode)) {
744 gfs2_inode_hold(dip);
745 ipp = dip;
746 goto done;
747 }
748
749 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
750 if (error)
751 return ERR_PTR(error);
752
753 if (!is_root) {
754 error = gfs2_repermission(dip->i_vnode, MAY_EXEC, NULL);
755 if (error)
756 goto out;
757 }
758
759 error = gfs2_dir_search(dir, name, &inum, &type);
760 if (error)
761 goto out;
762
763 error = gfs2_glock_get(sdp, inum.no_addr, &gfs2_inode_glops,
764 CREATE, &gl);
765 if (error)
766 goto out;
767
768 error = gfs2_inode_get(gl, &inum, CREATE, &ipp);
769 if (!error)
770 gfs2_inode_min_init(ipp, type);
771
772 gfs2_glock_put(gl);
773
774out:
775 gfs2_glock_dq_uninit(&d_gh);
776done:
777 if (error == -ENOENT)
778 return NULL;
779 if (error == 0) {
780 inode = gfs2_ip2v(ipp);
781 gfs2_inode_put(ipp);
782 if (!inode)
783 return ERR_PTR(-ENOMEM);
784 return inode;
785 }
786 return ERR_PTR(error);
787}
788
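/*
 * Editorial note: formal inode numbers are handed out in two stages.
 * pick_formal_ino_1() consumes the node-local range kept in sd_ir_inode;
 * when that range is exhausted it returns 1, and pick_formal_ino_2()
 * refills it with GFS2_INUM_QUANTUM numbers carved from the global
 * counter in sd_inum_inode, under that inode's glock.
 */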
789static int pick_formal_ino_1(struct gfs2_sbd *sdp, uint64_t *formal_ino)
790{
791 struct gfs2_inode *ip = sdp->sd_ir_inode->u.generic_ip;
792 struct buffer_head *bh;
793 struct gfs2_inum_range ir;
794 int error;
795
796 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
797 if (error)
798 return error;
799 mutex_lock(&sdp->sd_inum_mutex);
800
801 error = gfs2_meta_inode_buffer(ip, &bh);
802 if (error) {
803 mutex_unlock(&sdp->sd_inum_mutex);
804 gfs2_trans_end(sdp);
805 return error;
806 }
807
808 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
809
810 if (ir.ir_length) {
811 *formal_ino = ir.ir_start++;
812 ir.ir_length--;
813 gfs2_trans_add_bh(ip->i_gl, bh, 1);
814 gfs2_inum_range_out(&ir,
815 bh->b_data + sizeof(struct gfs2_dinode));
816 brelse(bh);
817 mutex_unlock(&sdp->sd_inum_mutex);
818 gfs2_trans_end(sdp);
819 return 0;
820 }
821
822 brelse(bh);
823
824 mutex_unlock(&sdp->sd_inum_mutex);
825 gfs2_trans_end(sdp);
826
827 return 1;
828}
829
830static int pick_formal_ino_2(struct gfs2_sbd *sdp, uint64_t *formal_ino)
831{
832 struct gfs2_inode *ip = sdp->sd_ir_inode->u.generic_ip;
833 struct gfs2_inode *m_ip = sdp->sd_inum_inode->u.generic_ip;
834 struct gfs2_holder gh;
835 struct buffer_head *bh;
836 struct gfs2_inum_range ir;
837 int error;
838
839 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
840 if (error)
841 return error;
842
843 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
844 if (error)
845 goto out;
846 mutex_lock(&sdp->sd_inum_mutex);
847
848 error = gfs2_meta_inode_buffer(ip, &bh);
849 if (error)
850 goto out_end_trans;
851
852 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
853
854 if (!ir.ir_length) {
855 struct buffer_head *m_bh;
856 uint64_t x, y;
857
858 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
859 if (error)
860 goto out_brelse;
861
862 x = *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode));
863 x = y = be64_to_cpu(x);
864 ir.ir_start = x;
865 ir.ir_length = GFS2_INUM_QUANTUM;
866 x += GFS2_INUM_QUANTUM;
867 if (x < y)
868 gfs2_consist_inode(m_ip);
869 x = cpu_to_be64(x);
870 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
871 *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
872
873 brelse(m_bh);
874 }
875
876 *formal_ino = ir.ir_start++;
877 ir.ir_length--;
878
879 gfs2_trans_add_bh(ip->i_gl, bh, 1);
880 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
881
882 out_brelse:
883 brelse(bh);
884
885 out_end_trans:
886 mutex_unlock(&sdp->sd_inum_mutex);
887 gfs2_trans_end(sdp);
888
889 out:
890 gfs2_glock_dq_uninit(&gh);
891
892 return error;
893}
894
895static int pick_formal_ino(struct gfs2_sbd *sdp, uint64_t *inum)
896{
897 int error;
898
899 error = pick_formal_ino_1(sdp, inum);
900 if (error <= 0)
901 return error;
902
903 error = pick_formal_ino_2(sdp, inum);
904
905 return error;
906}
907
908/**
909 * create_ok - OK to create a new on-disk inode here?
910 * @dip: Directory in which dinode is to be created
911 * @name: Name of new dinode
912 * @mode: The file mode of the new dinode
913 *
914 * Returns: errno
915 */
916
917static int create_ok(struct gfs2_inode *dip, struct qstr *name,
918 unsigned int mode)
919{
920 int error;
921
922 error = gfs2_repermission(dip->i_vnode, MAY_WRITE | MAY_EXEC, NULL);
923 if (error)
924 return error;
925
926 /* Don't create entries in an unlinked directory */
927 if (!dip->i_di.di_nlink)
928 return -EPERM;
929
930 error = gfs2_dir_search(dip->i_vnode, name, NULL, NULL);
931 switch (error) {
932 case -ENOENT:
933 error = 0;
934 break;
935 case 0:
936 return -EEXIST;
937 default:
938 return error;
939 }
940
941 if (dip->i_di.di_entries == (uint32_t)-1)
942 return -EFBIG;
943 if (S_ISDIR(mode) && dip->i_di.di_nlink == (uint32_t)-1)
944 return -EMLINK;
945
946 return 0;
947}
948
949static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
950 unsigned int *uid, unsigned int *gid)
951{
952 if (dip->i_sbd->sd_args.ar_suiddir &&
953 (dip->i_di.di_mode & S_ISUID) &&
954 dip->i_di.di_uid) {
955 if (S_ISDIR(*mode))
956 *mode |= S_ISUID;
957 else if (dip->i_di.di_uid != current->fsuid)
958 *mode &= ~07111;
959 *uid = dip->i_di.di_uid;
960 } else
961 *uid = current->fsuid;
962
963 if (dip->i_di.di_mode & S_ISGID) {
964 if (S_ISDIR(*mode))
965 *mode |= S_ISGID;
966 *gid = dip->i_di.di_gid;
967 } else
968 *gid = current->fsgid;
969}
970
971static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_unlinked *ul)
972{
973 struct gfs2_sbd *sdp = dip->i_sbd;
974 int error;
975
976 gfs2_alloc_get(dip);
977
978 dip->i_alloc.al_requested = RES_DINODE;
979 error = gfs2_inplace_reserve(dip);
980 if (error)
981 goto out;
982
983 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_UNLINKED +
984 RES_STATFS, 0);
985 if (error)
986 goto out_ipreserv;
987
988 ul->ul_ut.ut_inum.no_addr = gfs2_alloc_di(dip);
989
990 ul->ul_ut.ut_flags = GFS2_UTF_UNINIT;
991 error = gfs2_unlinked_ondisk_add(sdp, ul);
992
993 gfs2_trans_end(sdp);
994
995 out_ipreserv:
996 gfs2_inplace_release(dip);
997
998 out:
999 gfs2_alloc_put(dip);
1000
1001 return error;
1002}
1003
1004/**
1005 * init_dinode - Fill in a new dinode structure
1006 * @dip: the directory this inode is being created in
1007 * @gl: The glock covering the new inode
1008 * @inum: the inode number
1009 * @mode: the file permissions
1010 * @uid: The uid of the new dinode
1011 * @gid: The gid of the new dinode
1012 *
1013 */
1014
1015static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
1016 struct gfs2_inum *inum, unsigned int mode,
1017 unsigned int uid, unsigned int gid)
1018{
1019 struct gfs2_sbd *sdp = dip->i_sbd;
1020 struct gfs2_dinode *di;
1021 struct buffer_head *dibh;
1022
1023 dibh = gfs2_meta_new(gl, inum->no_addr);
1024 gfs2_trans_add_bh(gl, dibh, 1);
1025 gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
1026 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1027 di = (struct gfs2_dinode *)dibh->b_data;
1028
1029 di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
1030 di->di_num.no_addr = cpu_to_be64(inum->no_addr);
1031 di->di_mode = cpu_to_be32(mode);
1032 di->di_uid = cpu_to_be32(uid);
1033 di->di_gid = cpu_to_be32(gid);
1034 di->di_nlink = cpu_to_be32(0);
1035 di->di_size = cpu_to_be64(0);
1036 di->di_blocks = cpu_to_be64(1);
1037 di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
1038 di->di_major = di->di_minor = cpu_to_be32(0);
1039 di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
1040 di->__pad[0] = di->__pad[1] = 0;
1041 di->di_flags = cpu_to_be32(0);
1042
1043 if (S_ISREG(mode)) {
1044 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
1045 gfs2_tune_get(sdp, gt_new_files_jdata))
1046 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
1047 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
1048 gfs2_tune_get(sdp, gt_new_files_directio))
1049 di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
1050 } else if (S_ISDIR(mode)) {
1051 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
1052 GFS2_DIF_INHERIT_DIRECTIO);
1053 di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
1054 GFS2_DIF_INHERIT_JDATA);
1055 }
1056
1057 di->__pad1 = 0;
1058 di->di_height = cpu_to_be32(0);
1059 di->__pad2 = 0;
1060 di->__pad3 = 0;
1061 di->di_depth = cpu_to_be16(0);
1062 di->di_entries = cpu_to_be32(0);
1063 memset(&di->__pad4, 0, sizeof(di->__pad4));
1064 di->di_eattr = cpu_to_be64(0);
1065 memset(&di->di_reserved, 0, sizeof(di->di_reserved));
1066
1067 brelse(dibh);
1068}
1069
1070static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
1071 unsigned int mode, struct gfs2_unlinked *ul)
1072{
1073 struct gfs2_sbd *sdp = dip->i_sbd;
1074 unsigned int uid, gid;
1075 int error;
1076
1077 munge_mode_uid_gid(dip, &mode, &uid, &gid);
1078
1079 gfs2_alloc_get(dip);
1080
1081 error = gfs2_quota_lock(dip, uid, gid);
1082 if (error)
1083 goto out;
1084
1085 error = gfs2_quota_check(dip, uid, gid);
1086 if (error)
1087 goto out_quota;
1088
1089 error = gfs2_trans_begin(sdp, RES_DINODE + RES_UNLINKED +
1090 RES_QUOTA, 0);
1091 if (error)
1092 goto out_quota;
1093
1094 ul->ul_ut.ut_flags = 0;
1095 error = gfs2_unlinked_ondisk_munge(sdp, ul);
1096
1097 init_dinode(dip, gl, &ul->ul_ut.ut_inum,
1098 mode, uid, gid);
1099
1100 gfs2_quota_change(dip, +1, uid, gid);
1101
1102 gfs2_trans_end(sdp);
1103
1104 out_quota:
1105 gfs2_quota_unlock(dip);
1106
1107 out:
1108 gfs2_alloc_put(dip);
1109
1110 return error;
1111}
1112
1113static int link_dinode(struct gfs2_inode *dip, struct qstr *name,
1114 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
1115{
1116 struct gfs2_sbd *sdp = dip->i_sbd;
1117 struct gfs2_alloc *al;
1118 int alloc_required;
1119 struct buffer_head *dibh;
1120 int error;
1121
1122 al = gfs2_alloc_get(dip);
1123
1124 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1125 if (error)
1126 goto fail;
1127
1128 error = alloc_required = gfs2_diradd_alloc_required(dip->i_vnode, name);
1129 if (alloc_required < 0)
1130 goto fail;
1131 if (alloc_required) {
1132 error = gfs2_quota_check(dip, dip->i_di.di_uid,
1133 dip->i_di.di_gid);
1134 if (error)
1135 goto fail_quota_locks;
1136
1137 al->al_requested = sdp->sd_max_dirres;
1138
1139 error = gfs2_inplace_reserve(dip);
1140 if (error)
1141 goto fail_quota_locks;
1142
1143 error = gfs2_trans_begin(sdp,
1144 sdp->sd_max_dirres +
1145 al->al_rgd->rd_ri.ri_length +
1146 2 * RES_DINODE + RES_UNLINKED +
1147 RES_STATFS + RES_QUOTA, 0);
1148 if (error)
1149 goto fail_ipreserv;
1150 } else {
1151 error = gfs2_trans_begin(sdp,
1152 RES_LEAF +
1153 2 * RES_DINODE +
1154 RES_UNLINKED, 0);
1155 if (error)
1156 goto fail_quota_locks;
1157 }
1158
1159 error = gfs2_dir_add(dip->i_vnode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
1160 if (error)
1161 goto fail_end_trans;
1162
1163 error = gfs2_meta_inode_buffer(ip, &dibh);
1164 if (error)
1165 goto fail_end_trans;
1166 ip->i_di.di_nlink = 1;
1167 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1168 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1169 brelse(dibh);
1170
1171 error = gfs2_unlinked_ondisk_rm(sdp, ul);
1172 if (error)
1173 goto fail_end_trans;
1174
1175 return 0;
1176
1177 fail_end_trans:
1178 gfs2_trans_end(sdp);
1179
1180 fail_ipreserv:
1181 if (dip->i_alloc.al_rgd)
1182 gfs2_inplace_release(dip);
1183
1184 fail_quota_locks:
1185 gfs2_quota_unlock(dip);
1186
1187 fail:
1188 gfs2_alloc_put(dip);
1189
1190 return error;
1191}
1192
1193/**
1194 * gfs2_createi - Create a new inode
1195 * @ghs: An array of two holders
1196 * @name: The name of the new file
1197 * @mode: the permissions on the new inode
1198 *
1199 * @ghs[0] is an initialized holder for the directory
1200 * @ghs[1] is the holder for the inode lock
1201 *
1202 * If the return value is not an error, the glocks on both the directory and the new
1203 * file are held. A transaction has been started and an inplace reservation
1204 * is held, as well.
1205 *
1206 * Returns: The new inode on success, or an ERR_PTR on failure
1207 */
1208
1209struct inode *gfs2_createi(struct gfs2_holder *ghs, struct qstr *name,
1210 unsigned int mode)
1211{
1212 struct inode *inode;
1213 struct gfs2_inode *dip = ghs->gh_gl->gl_object;
1214 struct gfs2_sbd *sdp = dip->i_sbd;
1215 struct gfs2_unlinked *ul;
1216 struct gfs2_inode *ip;
1217 int error;
1218
1219 if (!name->len || name->len > GFS2_FNAMESIZE)
1220 return ERR_PTR(-ENAMETOOLONG);
1221
1222 error = gfs2_unlinked_get(sdp, &ul);
1223 if (error)
1224 return ERR_PTR(error);
1225
1226 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
1227 error = gfs2_glock_nq(ghs);
1228 if (error)
1229 goto fail;
1230
1231 error = create_ok(dip, name, mode);
1232 if (error)
1233 goto fail_gunlock;
1234
1235 error = pick_formal_ino(sdp, &ul->ul_ut.ut_inum.no_formal_ino);
1236 if (error)
1237 goto fail_gunlock;
1238
1239 error = alloc_dinode(dip, ul);
1240 if (error)
1241 goto fail_gunlock;
1242
1243 if (ul->ul_ut.ut_inum.no_addr < dip->i_num.no_addr) {
1244 gfs2_glock_dq(ghs);
1245
1246 error = gfs2_glock_nq_num(sdp,
1247 ul->ul_ut.ut_inum.no_addr,
1248 &gfs2_inode_glops,
1249 LM_ST_EXCLUSIVE, GL_SKIP,
1250 ghs + 1);
1251 if (error) {
1252 gfs2_unlinked_put(sdp, ul);
1253 return ERR_PTR(error);
1254 }
1255
1256 gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
1257 error = gfs2_glock_nq(ghs);
1258 if (error) {
1259 gfs2_glock_dq_uninit(ghs + 1);
1260 gfs2_unlinked_put(sdp, ul);
1261 return ERR_PTR(error);
1262 }
1263
1264 error = create_ok(dip, name, mode);
1265 if (error)
1266 goto fail_gunlock2;
1267 } else {
1268 error = gfs2_glock_nq_num(sdp,
1269 ul->ul_ut.ut_inum.no_addr,
1270 &gfs2_inode_glops,
1271 LM_ST_EXCLUSIVE, GL_SKIP,
1272 ghs + 1);
1273 if (error)
1274 goto fail_gunlock;
1275 }
1276
1277 error = make_dinode(dip, ghs[1].gh_gl, mode, ul);
1278 if (error)
1279 goto fail_gunlock2;
1280
1281 error = gfs2_inode_get(ghs[1].gh_gl, &ul->ul_ut.ut_inum, CREATE, &ip);
1282 if (error)
1283 goto fail_gunlock2;
1284
1285 error = gfs2_inode_refresh(ip);
1286 if (error)
1287 goto fail_iput;
1288
1289 error = gfs2_acl_create(dip, ip);
1290 if (error)
1291 goto fail_iput;
1292
1293 error = link_dinode(dip, name, ip, ul);
1294 if (error)
1295 goto fail_iput;
1296
1297 gfs2_unlinked_put(sdp, ul);
1298
1299 inode = gfs2_ip2v(ip);
1300 gfs2_inode_put(ip);
1301 if (!inode)
1302 return ERR_PTR(-ENOMEM);
1303 return inode;
1304
1305 fail_iput:
1306 gfs2_inode_put(ip);
1307
1308 fail_gunlock2:
1309 gfs2_glock_dq_uninit(ghs + 1);
1310
1311 fail_gunlock:
1312 gfs2_glock_dq(ghs);
1313
1314 fail:
1315 gfs2_unlinked_put(sdp, ul);
1316
1317 return ERR_PTR(error);
1318}
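/*
 * Illustrative caller (a sketch assuming the usual ops_inode.c shape,
 * not a verbatim quote): on success the caller must finish the open
 * transaction and release the reservation and quota locks itself:
 *
 *	struct gfs2_holder ghs[2];
 *	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
 *	inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
 *	if (!IS_ERR(inode)) {
 *		gfs2_trans_end(sdp);
 *		if (dip->i_alloc.al_rgd)
 *			gfs2_inplace_release(dip);
 *		gfs2_quota_unlock(dip);
 *		gfs2_alloc_put(dip);
 *		gfs2_glock_dq_uninit(ghs);
 *		gfs2_glock_dq_uninit(ghs + 1);
 *	}
 */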
1319
1320/**
1321 * gfs2_unlinki - Unlink a file
1322 * @dip: The inode of the directory
1323 * @name: The name of the file to be unlinked
1324 * @ip: The inode of the file to be removed
1325 * @ul: the unlinked-inode record used if the link count drops to zero
1326 * Assumes Glocks on both dip and ip are held.
1327 *
1328 * Returns: errno
1329 */
1330
1331int gfs2_unlinki(struct gfs2_inode *dip, struct qstr *name,
1332 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
1333{
1334 struct gfs2_sbd *sdp = dip->i_sbd;
1335 int error;
1336
1337 error = gfs2_dir_del(dip, name);
1338 if (error)
1339 return error;
1340
1341 error = gfs2_change_nlink(ip, -1);
1342 if (error)
1343 return error;
1344
1345 /* If this inode is being unlinked from the directory structure,
1346 we need to mark that in the log so that it isn't lost during
1347 a crash. */
1348
1349 if (!ip->i_di.di_nlink) {
1350 ul->ul_ut.ut_inum = ip->i_num;
1351 error = gfs2_unlinked_ondisk_add(sdp, ul);
1352 if (!error)
1353 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
1354 }
1355
1356 return error;
1357}
1358
1359/**
1360 * gfs2_rmdiri - Remove a directory
1361 * @dip: The parent directory of the directory to be removed
1362 * @name: The name of the directory to be removed
1363 * @ip: The GFS2 inode of the directory to be removed
1364 * @ul: the unlinked-inode record for the directory being removed
1365 * Assumes Glocks on dip and ip are held
1366 *
1367 * Returns: errno
1368 */
1369
1370int gfs2_rmdiri(struct gfs2_inode *dip, struct qstr *name,
1371 struct gfs2_inode *ip, struct gfs2_unlinked *ul)
1372{
1373 struct gfs2_sbd *sdp = dip->i_sbd;
1374 struct qstr dotname;
1375 int error;
1376
1377 if (ip->i_di.di_entries != 2) {
1378 if (gfs2_consist_inode(ip))
1379 gfs2_dinode_print(&ip->i_di);
1380 return -EIO;
1381 }
1382
1383 error = gfs2_dir_del(dip, name);
1384 if (error)
1385 return error;
1386
1387 error = gfs2_change_nlink(dip, -1);
1388 if (error)
1389 return error;
1390
1391 gfs2_str2qstr(&dotname, ".");
1392 error = gfs2_dir_del(ip, &dotname);
1393 if (error)
1394 return error;
1395
1396 dotname.len = 2;
1397 dotname.name = "..";
1398 dotname.hash = gfs2_disk_hash(dotname.name, dotname.len);
1399 error = gfs2_dir_del(ip, &dotname);
1400 if (error)
1401 return error;
1402
1403 error = gfs2_change_nlink(ip, -2);
1404 if (error)
1405 return error;
1406
1407 /* This inode is being unlinked from the directory structure and
1408 we need to mark that in the log so that it isn't lost during
1409 a crash. */
1410
1411 ul->ul_ut.ut_inum = ip->i_num;
1412 error = gfs2_unlinked_ondisk_add(sdp, ul);
1413 if (!error)
1414 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
1415
1416 return error;
1417}
1418
1419/**
1420 * gfs2_unlink_ok - check that an inode is still in a directory
1421 * @dip: the directory
1422 * @name: the name of the file
1423 * @ip: the inode
1424 *
1425 * Assumes that the lock on (at least) @dip is held.
1426 *
1427 * Returns: 0 if the parent/child relationship is correct, errno if it isn't
1428 */
1429
1430int gfs2_unlink_ok(struct gfs2_inode *dip, struct qstr *name,
1431 struct gfs2_inode *ip)
1432{
1433 struct gfs2_inum inum;
1434 unsigned int type;
1435 int error;
1436
1437 if (IS_IMMUTABLE(ip->i_vnode) || IS_APPEND(ip->i_vnode))
1438 return -EPERM;
1439
1440 if ((dip->i_di.di_mode & S_ISVTX) &&
1441 dip->i_di.di_uid != current->fsuid &&
1442 ip->i_di.di_uid != current->fsuid &&
1443 !capable(CAP_FOWNER))
1444 return -EPERM;
1445
1446 if (IS_APPEND(dip->i_vnode))
1447 return -EPERM;
1448
1449 error = gfs2_repermission(dip->i_vnode, MAY_WRITE | MAY_EXEC, NULL);
1450 if (error)
1451 return error;
1452
1453 error = gfs2_dir_search(dip->i_vnode, name, &inum, &type);
1454 if (error)
1455 return error;
1456
1457 if (!gfs2_inum_equal(&inum, &ip->i_num))
1458 return -ENOENT;
1459
1460 if (IF2DT(ip->i_di.di_mode) != type) {
1461 gfs2_consist_inode(dip);
1462 return -EIO;
1463 }
1464
1465 return 0;
1466}
1467
1468/**
1469 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1470 * @this: the directory being moved
1471 * @to: the proposed new parent directory
1472 *
1473 * Follow @to back to the root and make sure we don't encounter @this
1474 * Assumes we already hold the rename lock.
1475 *
1476 * Returns: errno
1477 */
1478
1479int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1480{
1481 struct inode *dir = to->i_vnode;
1482 struct super_block *sb = dir->i_sb;
1483 struct inode *tmp;
1484 struct qstr dotdot;
1485 int error = 0;
1486
1487 gfs2_str2qstr(&dotdot, "..");
1488
1489 igrab(dir);
1490
1491 for (;;) {
1492 if (dir == this->i_vnode) {
1493 error = -EINVAL;
1494 break;
1495 }
1496 if (dir == sb->s_root->d_inode) {
1497 error = 0;
1498 break;
1499 }
1500
1501 tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
1502 if (IS_ERR(tmp)) {
1503 error = PTR_ERR(tmp);
1504 break;
1505 }
1506
1507 iput(dir);
1508 dir = tmp;
1509 }
1510
1511 iput(dir);
1512
1513 return error;
1514}
1515
1516/**
1517 * gfs2_readlinki - return the contents of a symlink
1518 * @ip: the symlink's inode
1519 * @buf: a pointer to the buffer to be filled
1520 * @len: a pointer to the length of @buf
1521 *
1522 * If @buf is too small, a piece of memory is kmalloc()ed and needs
1523 * to be freed by the caller.
1524 *
1525 * Returns: errno
1526 */
1527
1528int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1529{
1530 struct gfs2_holder i_gh;
1531 struct buffer_head *dibh;
1532 unsigned int x;
1533 int error;
1534
1535 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
1536 error = gfs2_glock_nq_atime(&i_gh);
1537 if (error) {
1538 gfs2_holder_uninit(&i_gh);
1539 return error;
1540 }
1541
1542 if (!ip->i_di.di_size) {
1543 gfs2_consist_inode(ip);
1544 error = -EIO;
1545 goto out;
1546 }
1547
1548 error = gfs2_meta_inode_buffer(ip, &dibh);
1549 if (error)
1550 goto out;
1551
1552 x = ip->i_di.di_size + 1;
1553 if (x > *len) {
1554 *buf = kmalloc(x, GFP_KERNEL);
1555 if (!*buf) {
1556 error = -ENOMEM;
1557 goto out_brelse;
1558 }
1559 }
1560
1561 memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
1562 *len = x;
1563
1564 out_brelse:
1565 brelse(dibh);
1566
1567 out:
1568 gfs2_glock_dq_uninit(&i_gh);
1569
1570 return error;
1571}
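/*
 * Illustrative caller (an assumption): offer a small fixed buffer and
 * free the replacement if gfs2_readlinki() had to kmalloc() a larger one:
 *
 *	char array[64], *buf = array;
 *	unsigned int len = sizeof(array);
 *	error = gfs2_readlinki(ip, &buf, &len);
 *	...
 *	if (buf != array)
 *		kfree(buf);
 */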
1572
1573/**
1574 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1575 * conditionally update the inode's atime
1576 * @gh: the holder to acquire
1577 *
1578 * Tests the atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap,
1579 * and updates it if the difference between the current time and the inode's
1580 * current atime is greater than an interval specified at mount.
1581 *
1582 * Returns: errno
1583 */
1584
1585int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1586{
1587 struct gfs2_glock *gl = gh->gh_gl;
1588 struct gfs2_sbd *sdp = gl->gl_sbd;
1589 struct gfs2_inode *ip = gl->gl_object;
1590 int64_t curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1591 unsigned int state;
1592 int flags;
1593 int error;
1594
1595 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1596 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1597 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1598 return -EINVAL;
1599
1600 state = gh->gh_state;
1601 flags = gh->gh_flags;
1602
1603 error = gfs2_glock_nq(gh);
1604 if (error)
1605 return error;
1606
1607 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1608 (sdp->sd_vfs->s_flags & MS_RDONLY))
1609 return 0;
1610
1611 curtime = get_seconds();
1612 if (curtime - ip->i_di.di_atime >= quantum) {
1613 gfs2_glock_dq(gh);
1614 gfs2_holder_reinit(LM_ST_EXCLUSIVE,
1615 gh->gh_flags & ~LM_FLAG_ANY,
1616 gh);
1617 error = gfs2_glock_nq(gh);
1618 if (error)
1619 return error;
1620
1621 /* Verify that atime hasn't been updated while we were
1622 trying to get the exclusive lock. */
1623
1624 curtime = get_seconds();
1625 if (curtime - ip->i_di.di_atime >= quantum) {
1626 struct buffer_head *dibh;
1627
1628 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1629 if (error == -EROFS)
1630 return 0;
1631 if (error)
1632 goto fail;
1633
1634 error = gfs2_meta_inode_buffer(ip, &dibh);
1635 if (error)
1636 goto fail_end_trans;
1637
1638 ip->i_di.di_atime = curtime;
1639
1640 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1641 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1642 brelse(dibh);
1643
1644 gfs2_trans_end(sdp);
1645 }
1646
1647 /* If someone else has asked for the glock,
1648 unlock and let them have it. Then reacquire
1649 in the original state. */
1650 if (gfs2_glock_is_blocking(gl)) {
1651 gfs2_glock_dq(gh);
1652 gfs2_holder_reinit(state, flags, gh);
1653 return gfs2_glock_nq(gh);
1654 }
1655 }
1656
1657 return 0;
1658
1659 fail_end_trans:
1660 gfs2_trans_end(sdp);
1661
1662 fail:
1663 gfs2_glock_dq(gh);
1664
1665 return error;
1666}
1667
1668/**
1669 * glock_compare_atime - Compare two struct gfs2_glock structures for sort
1670 * @arg_a: the first structure
1671 * @arg_b: the second structure
1672 *
1673 * Returns: 1 if A > B
1674 * -1 if A < B
1675 * 0 if A = B
1676 */
1677
1678static int glock_compare_atime(const void *arg_a, const void *arg_b)
1679{
1680 struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a;
1681 struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b;
1682 struct lm_lockname *a = &gh_a->gh_gl->gl_name;
1683 struct lm_lockname *b = &gh_b->gh_gl->gl_name;
1684 int ret = 0;
1685
1686 if (a->ln_number > b->ln_number)
1687 ret = 1;
1688 else if (a->ln_number < b->ln_number)
1689 ret = -1;
1690 else {
1691 if (gh_a->gh_state == LM_ST_SHARED &&
1692 gh_b->gh_state == LM_ST_EXCLUSIVE)
1693 ret = 1;
1694 else if (gh_a->gh_state == LM_ST_SHARED &&
1695 (gh_b->gh_flags & GL_ATIME))
1696 ret = 1;
1697 }
1698
1699 return ret;
1700}
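/*
 * Editorial note: sorting holders this way gives every caller a single
 * global acquisition order (by lock number, exclusive before shared on
 * ties), which prevents ABBA deadlocks when several glocks are taken
 * together in gfs2_glock_nq_m_atime().
 */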
1701
1702/**
1703 * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
1704 * atime update
1705 * @num_gh: the number of structures
1706 * @ghs: an array of struct gfs2_holder structures
1707 *
1708 * Returns: 0 on success (all glocks acquired),
1709 * errno on failure (no glocks acquired)
1710 */
1711
1712int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
1713{
1714 struct gfs2_holder **p;
1715 unsigned int x;
1716 int error = 0;
1717
1718 if (!num_gh)
1719 return 0;
1720
1721 if (num_gh == 1) {
1722 ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1723 if (ghs->gh_flags & GL_ATIME)
1724 error = gfs2_glock_nq_atime(ghs);
1725 else
1726 error = gfs2_glock_nq(ghs);
1727 return error;
1728 }
1729
1730 p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
1731 if (!p)
1732 return -ENOMEM;
1733
1734 for (x = 0; x < num_gh; x++)
1735 p[x] = &ghs[x];
1736
1737 sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime, NULL);
1738
1739 for (x = 0; x < num_gh; x++) {
1740 p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
1741
1742 if (p[x]->gh_flags & GL_ATIME)
1743 error = gfs2_glock_nq_atime(p[x]);
1744 else
1745 error = gfs2_glock_nq(p[x]);
1746
1747 if (error) {
1748 while (x--)
1749 gfs2_glock_dq(p[x]);
1750 break;
1751 }
1752 }
1753
1754 kfree(p);
1755
1756 return error;
1757}
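/* Illustration (hypothetical caller, not part of this change; dip/ip and
 * the holder helpers are assumed from glock.h): a multi-lock caller fills
 * one holder per glock and lets this function sort and acquire them in a
 * consistent order:
 *
 *	struct gfs2_holder ghs[2];
 *	int error;
 *
 *	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &ghs[0]);
 *	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &ghs[1]);
 *	error = gfs2_glock_nq_m_atime(2, ghs);
 *	if (!error) {
 *		gfs2_glock_dq(&ghs[1]);
 *		gfs2_glock_dq(&ghs[0]);
 *	}
 */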
1758
1759/**
1760 * gfs2_try_toss_vnode - See if we can toss a vnode from memory
1761 * @ip: the inode
1762 *
1764 */
1765
1766void gfs2_try_toss_vnode(struct gfs2_inode *ip)
1767{
1768 struct inode *inode;
1769
1770 inode = gfs2_ip2v_lookup(ip);
1771 if (!inode)
1772 return;
1773
1774 d_prune_aliases(inode);
1775
1776 if (S_ISDIR(ip->i_di.di_mode)) {
1777 struct list_head *head = &inode->i_dentry;
1778 struct dentry *d = NULL;
1779
1780 spin_lock(&dcache_lock);
1781 if (list_empty(head))
1782 spin_unlock(&dcache_lock);
1783 else {
1784 d = list_entry(head->next, struct dentry, d_alias);
1785 dget_locked(d);
1786 spin_unlock(&dcache_lock);
1787
1788 if (have_submounts(d))
1789 dput(d);
1790 else {
1791 shrink_dcache_parent(d);
1792 dput(d);
1793 d_prune_aliases(inode);
1794 }
1795 }
1796 }
1797
1798 inode->i_nlink = 0;
1799 iput(inode);
1800}
1801
1802static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1805{
1806 struct buffer_head *dibh;
1807 int error;
1808
1809 error = gfs2_meta_inode_buffer(ip, &dibh);
1810 if (!error) {
1811 error = inode_setattr(ip->i_vnode, attr);
1812 gfs2_assert_warn(ip->i_sbd, !error);
1813 gfs2_inode_attr_out(ip);
1814
1815 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1816 gfs2_dinode_out(&ip->i_di, dibh->b_data);
1817 brelse(dibh);
1818 }
1819 return error;
1820}
1821
1822/**
1823 * gfs2_setattr_simple - change an inode's attributes
1824 * @ip: the inode
1825 * @attr: the new attributes
1826 *
1827 * Called with a reference on the vnode.
1828 *
1829 * Returns: errno
1830 */
1831
1832int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1833{
1834 int error;
1835
1836 if (current->journal_info)
1837 return __gfs2_setattr_simple(ip, attr);
1838
1839 error = gfs2_trans_begin(ip->i_sbd, RES_DINODE, 0);
1840 if (error)
1841 return error;
1842
1843 error = __gfs2_setattr_simple(ip, attr);
1844
1845 gfs2_trans_end(ip->i_sbd);
1846
1847 return error;
1848}
1849
1850int gfs2_repermission(struct inode *inode, int mask, struct nameidata *nd)
1851{
1852 return permission(inode, mask, nd);
1853}
1854
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..0dd2a26626ec
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,72 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__
12
13static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
14{
15 return !ip->i_di.di_height;
16}
17
18static inline int gfs2_is_jdata(struct gfs2_inode *ip)
19{
20 return ip->i_di.di_flags & GFS2_DIF_JDATA;
21}
22
23static inline int gfs2_is_dir(struct gfs2_inode *ip)
24{
25 return S_ISDIR(ip->i_di.di_mode);
26}
27
28void gfs2_inode_attr_in(struct gfs2_inode *ip);
29void gfs2_inode_attr_out(struct gfs2_inode *ip);
30struct inode *gfs2_ip2v_lookup(struct gfs2_inode *ip);
31struct inode *gfs2_ip2v(struct gfs2_inode *ip);
32struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum);
33
34void gfs2_inode_min_init(struct gfs2_inode *ip, unsigned int type);
35int gfs2_inode_refresh(struct gfs2_inode *ip);
36
37int gfs2_inode_get(struct gfs2_glock *i_gl,
38 const struct gfs2_inum *inum, int create,
39 struct gfs2_inode **ipp);
40void gfs2_inode_hold(struct gfs2_inode *ip);
41void gfs2_inode_put(struct gfs2_inode *ip);
42void gfs2_inode_destroy(struct gfs2_inode *ip);
43
44int gfs2_inode_dealloc(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
45
46int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
47struct inode *gfs2_lookupi(struct inode *dir, struct qstr *name, int is_root,
48 struct nameidata *nd);
49struct inode *gfs2_createi(struct gfs2_holder *ghs, struct qstr *name,
50 unsigned int mode);
51int gfs2_unlinki(struct gfs2_inode *dip, struct qstr *name,
52 struct gfs2_inode *ip, struct gfs2_unlinked *ul);
53int gfs2_rmdiri(struct gfs2_inode *dip, struct qstr *name,
54 struct gfs2_inode *ip, struct gfs2_unlinked *ul);
55int gfs2_unlink_ok(struct gfs2_inode *dip, struct qstr *name,
56 struct gfs2_inode *ip);
57int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
58int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
59
60int gfs2_glock_nq_atime(struct gfs2_holder *gh);
61int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
62
63void gfs2_try_toss_vnode(struct gfs2_inode *ip);
64
65int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
66
67int gfs2_repermission(struct inode *inode, int mask, struct nameidata *nd);
68
69struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
70
71#endif /* __INODE_DOT_H__ */
72
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..5b3c56d2df2f
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,243 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/delay.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "glock.h"
23#include "lm.h"
24#include "super.h"
25#include "util.h"
26#include "lvb.h"
27
28/**
29 * gfs2_lm_mount - mount a locking protocol
30 * @sdp: the filesystem
32 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
33 *
34 * Returns: errno
35 */
36
37int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
38{
39 char *proto = sdp->sd_proto_name;
40 char *table = sdp->sd_table_name;
41 int flags = 0;
42 int error;
43
44 if (sdp->sd_args.ar_spectator)
45 flags |= LM_MFLAG_SPECTATOR;
46
47 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
48
49 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
50 gfs2_glock_cb, sdp,
51 GFS2_MIN_LVB_SIZE, flags,
52 &sdp->sd_lockstruct, &sdp->sd_kobj);
53 if (error) {
54 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
55 proto, table, sdp->sd_args.ar_hostdata);
56 goto out;
57 }
58
59 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
60 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
61 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
62 GFS2_MIN_LVB_SIZE)) {
63 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
64 goto out;
65 }
66
67 if (sdp->sd_args.ar_spectator)
68 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
69 else
70 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
71 sdp->sd_lockstruct.ls_jid);
72
73 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
74
75 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
76 !sdp->sd_args.ar_ignore_local_fs) {
77 sdp->sd_args.ar_localflocks = 1;
78 sdp->sd_args.ar_localcaching = 1;
79 }
80
81 out:
82 return error;
83}
84
85void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
86{
87 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
88 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
89 sdp->sd_lockstruct.ls_lockspace);
90}
91
92void gfs2_lm_unmount(struct gfs2_sbd *sdp)
93{
94 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
95 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
96}
97
98int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
99{
100 va_list args;
101
102 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
103 return 0;
104
105 va_start(args, fmt);
106 vprintk(fmt, args);
107 va_end(args);
108
109 fs_err(sdp, "about to withdraw from the cluster\n");
110 BUG_ON(sdp->sd_args.ar_debug);
111
113 fs_err(sdp, "waiting for outstanding I/O\n");
114
115 /* FIXME: suspend dm device so outstanding bios complete
116 and all further io requests fail */
117
118 fs_err(sdp, "telling LM to withdraw\n");
119 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
120 fs_err(sdp, "withdrawn\n");
121 dump_stack();
122
123 return -1;
124}
125
126int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
127 lm_lock_t **lockp)
128{
129 int error;
130 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
131 error = -EIO;
132 else
133 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
134 sdp->sd_lockstruct.ls_lockspace, name, lockp);
135 return error;
136}
137
138void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock)
139{
140 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
141 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
142}
143
144unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
145 unsigned int cur_state, unsigned int req_state,
146 unsigned int flags)
147{
148 int ret;
149 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
150 ret = 0;
151 else
152 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
153 cur_state,
154 req_state, flags);
155 return ret;
156}
157
158unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
159 unsigned int cur_state)
160{
161 int ret;
162 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
163 ret = 0;
164 else
165 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
166 return ret;
167}
168
169void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock)
170{
171 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
172 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
173}
174
175int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp)
176{
177 int error;
178 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
179 error = -EIO;
180 else
181 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
182 return error;
183}
184
185void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
186{
187 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
188 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
189}
190
191void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb)
192{
193 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
194 sdp->sd_lockstruct.ls_ops->lm_sync_lvb(lock, lvb);
195}
196
197int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
198 struct file *file, struct file_lock *fl)
199{
200 int error;
201 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
202 error = -EIO;
203 else
204 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
205 sdp->sd_lockstruct.ls_lockspace,
206 name, file, fl);
207 return error;
208}
209
210int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
211 struct file *file, int cmd, struct file_lock *fl)
212{
213 int error;
214 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
215 error = -EIO;
216 else
217 error = sdp->sd_lockstruct.ls_ops->lm_plock(
218 sdp->sd_lockstruct.ls_lockspace,
219 name, file, cmd, fl);
220 return error;
221}
222
223int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
224 struct file *file, struct file_lock *fl)
225{
226 int error;
227 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
228 error = -EIO;
229 else
230 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
231 sdp->sd_lockstruct.ls_lockspace,
232 name, file, fl);
233 return error;
234}
235
236void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
237 unsigned int message)
238{
239 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
240 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
241 sdp->sd_lockstruct.ls_lockspace, jid, message);
242}
243
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..ec812424fdec
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
14void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
16int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
17__attribute__ ((format(printf, 2, 3)));
18int gfs2_lm_get_lock(struct gfs2_sbd *sdp,
19 struct lm_lockname *name, lm_lock_t **lockp);
20void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock);
21unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock,
22 unsigned int cur_state, unsigned int req_state,
23 unsigned int flags);
24unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock,
25 unsigned int cur_state);
26void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock);
27int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp);
28void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
29void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb);
30int gfs2_lm_plock_get(struct gfs2_sbd *sdp,
31 struct lm_lockname *name,
32 struct file *file, struct file_lock *fl);
33int gfs2_lm_plock(struct gfs2_sbd *sdp,
34 struct lm_lockname *name,
35 struct file *file, int cmd, struct file_lock *fl);
36int gfs2_lm_punlock(struct gfs2_sbd *sdp,
37 struct lm_lockname *name,
38 struct file *file, struct file_lock *fl);
39void gfs2_lm_recovery_done(struct gfs2_sbd *sdp,
40 unsigned int jid, unsigned int message);
41
42#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/lm_interface.h b/fs/gfs2/lm_interface.h
new file mode 100644
index 000000000000..378432f17f27
--- /dev/null
+++ b/fs/gfs2/lm_interface.h
@@ -0,0 +1,295 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LM_INTERFACE_DOT_H__
11#define __LM_INTERFACE_DOT_H__
12
13/*
14 * Opaque handles represent the lock module's lockspace structure, the lock
15 * module's lock structures, and GFS's file system (superblock) structure.
16 */
17
18typedef void lm_lockspace_t;
19typedef void lm_lock_t;
20typedef void lm_fsdata_t;
21
22typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type,
23 void *data);
24
25/*
26 * lm_mount() flags
27 *
28 * LM_MFLAG_SPECTATOR
29 * GFS is asking to join the filesystem's lockspace, but it doesn't want to
30 * modify the filesystem. The lock module shouldn't assign a journal to the FS
31 * mount. It shouldn't send recovery callbacks to the FS mount. If the node
32 * dies or withdraws, all locks can be wiped immediately.
33 */
34
35#define LM_MFLAG_SPECTATOR 0x00000001
36
37/*
38 * lm_lockstruct flags
39 *
40 * LM_LSFLAG_LOCAL
41 * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS
42 * can make single-node optimizations.
43 */
44
45#define LM_LSFLAG_LOCAL 0x00000001
46
47/*
48 * lm_lockname types
49 */
50
51#define LM_TYPE_RESERVED 0x00
52#define LM_TYPE_NONDISK 0x01
53#define LM_TYPE_INODE 0x02
54#define LM_TYPE_RGRP 0x03
55#define LM_TYPE_META 0x04
56#define LM_TYPE_IOPEN 0x05
57#define LM_TYPE_FLOCK 0x06
58#define LM_TYPE_PLOCK 0x07
59#define LM_TYPE_QUOTA 0x08
60#define LM_TYPE_JOURNAL 0x09
61
62/*
63 * lm_lock() states
64 *
65 * SHARED is compatible with SHARED, not with DEFERRED or EX.
66 * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
67 */
68
69#define LM_ST_UNLOCKED 0
70#define LM_ST_EXCLUSIVE 1
71#define LM_ST_DEFERRED 2
72#define LM_ST_SHARED 3
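
/* Summary sketch of the compatibility rules stated above (illustrative,
 * not part of this interface; EXCLUSIVE conflicts with every held state,
 * UNLOCKED with none):
 *
 *	held \ requested    SHARED    DEFERRED    EXCLUSIVE
 *	SHARED                yes        no           no
 *	DEFERRED              no         yes          no
 *	EXCLUSIVE             no         no           no
 */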
73
74/*
75 * lm_lock() flags
76 *
77 * LM_FLAG_TRY
78 * Don't wait to acquire the lock if it can't be granted immediately.
79 *
80 * LM_FLAG_TRY_1CB
81 * Send one blocking callback if TRY is set and the lock is not granted.
82 *
83 * LM_FLAG_NOEXP
84 * GFS sets this flag on lock requests it makes while doing journal recovery.
85 * These special requests should not be blocked due to the recovery like
86 * ordinary locks would be.
87 *
88 * LM_FLAG_ANY
89 * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
90 * also be granted in SHARED. The preferred state is whichever is compatible
91 * with other granted locks, or the specified state if no other locks exist.
92 *
93 * LM_FLAG_PRIORITY
94 * Override fairness considerations. Suppose a lock is held in a shared state
95 * and there is a pending request for the deferred state. A shared lock
96 * request with the priority flag would be allowed to bypass the deferred
97 * request and directly join the other shared lock. A shared lock request
98 * without the priority flag might be forced to wait until the deferred
 99 * request had acquired and released the lock.
100 */
101
102#define LM_FLAG_TRY 0x00000001
103#define LM_FLAG_TRY_1CB 0x00000002
104#define LM_FLAG_NOEXP 0x00000004
105#define LM_FLAG_ANY 0x00000008
106#define LM_FLAG_PRIORITY 0x00000010
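
/* Illustration (hypothetical combination, not part of this change): a
 * request that should poll rather than queue, and that will accept either
 * compatible granted state, would combine the flags above as
 *
 *	flags = LM_FLAG_TRY | LM_FLAG_ANY;
 *
 * (see make_flags() in locking/dlm/lock.c later in this patch for how
 * lock_dlm maps these onto DLM_LKF_* flags). */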
107
108/*
109 * lm_lock() and lm_async_cb return flags
110 *
111 * LM_OUT_ST_MASK
112 * Masks the lower two bits of lock state in the returned value.
113 *
114 * LM_OUT_CACHEABLE
115 * The lock hasn't been released so GFS can continue to cache data for it.
116 *
117 * LM_OUT_CANCELED
118 * The lock request was canceled.
119 *
120 * LM_OUT_ASYNC
121 * The result of the request will be returned in an LM_CB_ASYNC callback.
122 */
123
124#define LM_OUT_ST_MASK 0x00000003
125#define LM_OUT_CACHEABLE 0x00000004
126#define LM_OUT_CANCELED 0x00000008
127#define LM_OUT_ASYNC 0x00000080
128#define LM_OUT_ERROR 0x00000100
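
/* Illustration (a sketch, assuming the gfs2_lm_lock() wrapper from lm.c
 * in this patch; LM_OUT_ERROR, undocumented above, marks a failed
 * submission): the granted state and the status bits travel in one
 * return word, so a caller decodes them roughly as
 *
 *	unsigned int ret, state;
 *
 *	ret = gfs2_lm_lock(sdp, lock, cur_state, req_state, flags);
 *	state = ret & LM_OUT_ST_MASK;      (granted LM_ST_* state)
 *	if (ret & LM_OUT_ASYNC)
 *		the result arrives later via an LM_CB_ASYNC callback;
 */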
129
130/*
131 * lm_callback_t types
132 *
133 * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S
134 * Blocking callback, a remote node is requesting the given lock in
135 * EXCLUSIVE, DEFERRED, or SHARED.
136 *
137 * LM_CB_NEED_RECOVERY
138 * The given journal needs to be recovered.
139 *
140 * LM_CB_DROPLOCKS
141 * Reduce the number of cached locks.
142 *
143 * LM_CB_ASYNC
144 * The given lock has been granted.
145 */
146
147#define LM_CB_NEED_E 257
148#define LM_CB_NEED_D 258
149#define LM_CB_NEED_S 259
150#define LM_CB_NEED_RECOVERY 260
151#define LM_CB_DROPLOCKS 261
152#define LM_CB_ASYNC 262
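
/* Sketch of the shape of an lm_callback_t handler (illustrative only;
 * the real GFS2 dispatcher is gfs2_glock_cb(), handle_blocking() and
 * handle_grant() are hypothetical, and the payload types are
 * assumptions: LM_CB_NEED_* passing a struct lm_lockname, LM_CB_ASYNC a
 * struct lm_async_cb):
 *
 *	static void example_cb(lm_fsdata_t *fsdata, unsigned int type,
 *			       void *data)
 *	{
 *		switch (type) {
 *		case LM_CB_NEED_E:
 *		case LM_CB_NEED_D:
 *		case LM_CB_NEED_S:
 *			handle_blocking(fsdata, data);
 *			break;
 *		case LM_CB_ASYNC:
 *			handle_grant(fsdata, data);
 *			break;
 *		}
 *	}
 */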
153
154/*
155 * lm_recovery_done() messages
156 */
157
158#define LM_RD_GAVEUP 308
159#define LM_RD_SUCCESS 309
160
161
162struct lm_lockname {
163 uint64_t ln_number;
164 unsigned int ln_type;
165};
166
167#define lm_name_equal(name1, name2) \
168 (((name1)->ln_number == (name2)->ln_number) && \
169 ((name1)->ln_type == (name2)->ln_type))
170
171struct lm_async_cb {
172 struct lm_lockname lc_name;
173 int lc_ret;
174};
175
176struct lm_lockstruct;
177
178struct lm_lockops {
179 char lm_proto_name[256];
180
181 /*
182 * Mount/Unmount
183 */
184
185 int (*lm_mount) (char *table_name, char *host_data,
186 lm_callback_t cb, lm_fsdata_t *fsdata,
187 unsigned int min_lvb_size, int flags,
188 struct lm_lockstruct *lockstruct,
189 struct kobject *fskobj);
190
191 void (*lm_others_may_mount) (lm_lockspace_t *lockspace);
192
193 void (*lm_unmount) (lm_lockspace_t *lockspace);
194
195 void (*lm_withdraw) (lm_lockspace_t *lockspace);
196
197 /*
198 * Lock oriented operations
199 */
200
201 int (*lm_get_lock) (lm_lockspace_t *lockspace,
202 struct lm_lockname *name, lm_lock_t **lockp);
203
204 void (*lm_put_lock) (lm_lock_t *lock);
205
206 unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state,
207 unsigned int req_state, unsigned int flags);
208
209 unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state);
210
211 void (*lm_cancel) (lm_lock_t *lock);
212
213 int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp);
214 void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb);
215 void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb);
216
217 /*
218 * Posix Lock oriented operations
219 */
220
221 int (*lm_plock_get) (lm_lockspace_t *lockspace,
222 struct lm_lockname *name,
223 struct file *file, struct file_lock *fl);
224
225 int (*lm_plock) (lm_lockspace_t *lockspace,
226 struct lm_lockname *name,
227 struct file *file, int cmd, struct file_lock *fl);
228
229 int (*lm_punlock) (lm_lockspace_t *lockspace,
230 struct lm_lockname *name,
231 struct file *file, struct file_lock *fl);
232
233 /*
234 * Client oriented operations
235 */
236
237 void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid,
238 unsigned int message);
239
240 struct module *lm_owner;
241};
242
243/*
244 * lm_mount() return values
245 *
246 * ls_jid - the journal ID this node should use
247 * ls_first - this node is the first to mount the file system
248 * ls_lvb_size - size in bytes of lock value blocks
249 * ls_lockspace - lock module's context for this file system
250 * ls_ops - lock module's functions
251 * ls_flags - lock module features
252 */
253
254struct lm_lockstruct {
255 unsigned int ls_jid;
256 unsigned int ls_first;
257 unsigned int ls_lvb_size;
258 lm_lockspace_t *ls_lockspace;
259 struct lm_lockops *ls_ops;
260 int ls_flags;
261};
262
263void __init gfs2_init_lmh(void);
264
265/*
266 * Lock module bottom interface. A lock module makes itself available to GFS
267 * with these functions.
268 *
269 * For the time being, we copy the gfs1 lock module bottom interface so the
270 * same lock modules can be used with both gfs1 and gfs2 (it won't be possible
271 * to load both gfs1 and gfs2 at once.) Eventually the lock modules will fork
272 * for gfs1/gfs2 and this API can change to the gfs2_ prefix.
273 */
274
275int gfs_register_lockproto(struct lm_lockops *proto);
276
277void gfs_unregister_lockproto(struct lm_lockops *proto);
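
/* Sketch of the registration pattern a lock module follows (compare
 * lock_dlm's main.c later in this patch; "lock_example" and
 * example_mount() are hypothetical):
 *
 *	static struct lm_lockops example_ops = {
 *		.lm_proto_name = "lock_example",
 *		.lm_mount = example_mount,
 *		.lm_owner = THIS_MODULE,
 *	};
 *
 *	static int __init init_example(void)
 *	{
 *		return gfs_register_lockproto(&example_ops);
 *	}
 */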
278
279/*
280 * Lock module top interface. GFS calls these functions when mounting or
281 * unmounting a file system.
282 */
283
284int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
285 lm_callback_t cb, lm_fsdata_t *fsdata,
286 unsigned int min_lvb_size, int flags,
287 struct lm_lockstruct *lockstruct,
288 struct kobject *fskobj);
289
290void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct);
291
292void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct);
293
294#endif /* __LM_INTERFACE_DOT_H__ */
295
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..0f4c50ebcbad
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/string.h>
13#include <linux/slab.h>
14#include <linux/wait.h>
15#include <linux/sched.h>
16#include <linux/kmod.h>
17#include <linux/fs.h>
18#include <linux/delay.h>
19
20#include "lm_interface.h"
21
22struct lmh_wrapper {
23 struct list_head lw_list;
24 struct lm_lockops *lw_ops;
25};
26
27/* List of registered low-level locking protocols. A file system selects one
28 of them by name at mount time, e.g. lock_nolock, lock_dlm. */
29
30static struct list_head lmh_list;
31static struct semaphore lmh_lock;
32
33/**
34 * gfs_register_lockproto - Register a low-level locking protocol
35 * @proto: the protocol definition
36 *
37 * Returns: 0 on success, -EXXX on failure
38 */
39
40int gfs_register_lockproto(struct lm_lockops *proto)
41{
42 struct lmh_wrapper *lw;
43
44 down(&lmh_lock);
45
46 list_for_each_entry(lw, &lmh_list, lw_list) {
47 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
48 up(&lmh_lock);
49 printk(KERN_INFO "GFS2: protocol %s already exists\n",
50 proto->lm_proto_name);
51 return -EEXIST;
52 }
53 }
54
55 lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
56 if (!lw) {
57 up(&lmh_lock);
58 return -ENOMEM;
59 }
60
61 lw->lw_ops = proto;
62 list_add(&lw->lw_list, &lmh_list);
63
64 up(&lmh_lock);
65
66 return 0;
67}
68
69/**
70 * gfs_unregister_lockproto - Unregister a low-level locking protocol
71 * @proto: the protocol definition
72 *
73 */
74
75void gfs_unregister_lockproto(struct lm_lockops *proto)
76{
77 struct lmh_wrapper *lw;
78
79 down(&lmh_lock);
80
81 list_for_each_entry(lw, &lmh_list, lw_list) {
82 if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
83 list_del(&lw->lw_list);
84 up(&lmh_lock);
85 kfree(lw);
86 return;
87 }
88 }
89
90 up(&lmh_lock);
91
92 printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
93 proto->lm_proto_name);
94}
95
96/**
97 * gfs2_mount_lockproto - Mount a lock protocol
98 * @proto_name: the name of the protocol
99 * @table_name: the name of the lock space
100 * @host_data: data specific to this host
101 * @cb: the callback to the code using the lock module
102 * @fsdata: data to pass back with the callback
103 * @min_lvb_size: the minimum LVB size that the caller can deal with
104 * @flags: LM_MFLAG_*
105 * @lockstruct: a structure returned describing the mount
106 *
107 * Returns: 0 on success, -EXXX on failure
108 */
109
110int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
111 lm_callback_t cb, lm_fsdata_t *fsdata,
112 unsigned int min_lvb_size, int flags,
113 struct lm_lockstruct *lockstruct,
114 struct kobject *fskobj)
115{
116 struct lmh_wrapper *lw = NULL;
117 int try = 0;
118 int error, found;
119
120 retry:
121 down(&lmh_lock);
122
123 found = 0;
124 list_for_each_entry(lw, &lmh_list, lw_list) {
125 if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
126 found = 1;
127 break;
128 }
129 }
130
131 if (!found) {
132 if (!try && capable(CAP_SYS_MODULE)) {
133 try = 1;
134 up(&lmh_lock);
135 request_module(proto_name);
136 goto retry;
137 }
138 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
139 error = -ENOENT;
140 goto out;
141 }
142
143 if (!try_module_get(lw->lw_ops->lm_owner)) {
144 try = 0;
145 up(&lmh_lock);
146 msleep(1000);
147 goto retry;
148 }
149
150 error = lw->lw_ops->lm_mount(table_name, host_data, cb, fsdata,
151 min_lvb_size, flags, lockstruct, fskobj);
152 if (error)
153 module_put(lw->lw_ops->lm_owner);
154 out:
155 up(&lmh_lock);
156 return error;
157}
158
159void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
160{
161 down(&lmh_lock);
162 lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
163 if (lockstruct->ls_ops->lm_owner)
164 module_put(lockstruct->ls_ops->lm_owner);
165 up(&lmh_lock);
166}
167
168/**
169 * gfs2_withdraw_lockproto - abnormally unmount a lock module
170 * @lockstruct: the lockstruct passed into mount
171 *
172 */
173
174void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
175{
176 down(&lmh_lock);
177 lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
178 if (lockstruct->ls_ops->lm_owner)
179 module_put(lockstruct->ls_ops->lm_owner);
180 up(&lmh_lock);
181}
182
183void __init gfs2_init_lmh(void)
184{
185 init_MUTEX(&lmh_lock);
186 INIT_LIST_HEAD(&lmh_list);
187}
188
189EXPORT_SYMBOL_GPL(gfs_register_lockproto);
190EXPORT_SYMBOL_GPL(gfs_unregister_lockproto);
191
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..a9733ff80371
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..1799d2237e7e
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,538 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12static char junk_lvb[GDLM_LVB_SIZE];
13
14static void queue_complete(struct gdlm_lock *lp)
15{
16 struct gdlm_ls *ls = lp->ls;
17
18 clear_bit(LFL_ACTIVE, &lp->flags);
19
20 spin_lock(&ls->async_lock);
21 list_add_tail(&lp->clist, &ls->complete);
22 spin_unlock(&ls->async_lock);
23 wake_up(&ls->thread_wait);
24}
25
26static inline void gdlm_ast(void *astarg)
27{
28 queue_complete((struct gdlm_lock *) astarg);
29}
30
31static inline void gdlm_bast(void *astarg, int mode)
32{
33 struct gdlm_lock *lp = astarg;
34 struct gdlm_ls *ls = lp->ls;
35
36 if (!mode) {
37 printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
38 lp->lockname.ln_type, lp->lockname.ln_number);
39 return;
40 }
41
42 spin_lock(&ls->async_lock);
43 if (!lp->bast_mode) {
44 list_add_tail(&lp->blist, &ls->blocking);
45 lp->bast_mode = mode;
46 } else if (lp->bast_mode < mode)
47 lp->bast_mode = mode;
48 spin_unlock(&ls->async_lock);
49 wake_up(&ls->thread_wait);
50}
51
52void gdlm_queue_delayed(struct gdlm_lock *lp)
53{
54 struct gdlm_ls *ls = lp->ls;
55
56 spin_lock(&ls->async_lock);
57 list_add_tail(&lp->delay_list, &ls->delayed);
58 spin_unlock(&ls->async_lock);
59}
60
61/* convert gfs lock-state to dlm lock-mode */
62
63static int16_t make_mode(int16_t lmstate)
64{
65 switch (lmstate) {
66 case LM_ST_UNLOCKED:
67 return DLM_LOCK_NL;
68 case LM_ST_EXCLUSIVE:
69 return DLM_LOCK_EX;
70 case LM_ST_DEFERRED:
71 return DLM_LOCK_CW;
72 case LM_ST_SHARED:
73 return DLM_LOCK_PR;
74 }
75 gdlm_assert(0, "unknown LM state %d", lmstate);
76 return -1;
77}
78
79/* convert dlm lock-mode to gfs lock-state */
80
81int16_t gdlm_make_lmstate(int16_t dlmmode)
82{
83 switch (dlmmode) {
84 case DLM_LOCK_IV:
85 case DLM_LOCK_NL:
86 return LM_ST_UNLOCKED;
87 case DLM_LOCK_EX:
88 return LM_ST_EXCLUSIVE;
89 case DLM_LOCK_CW:
90 return LM_ST_DEFERRED;
91 case DLM_LOCK_PR:
92 return LM_ST_SHARED;
93 }
94 gdlm_assert(0, "unknown DLM mode %d", dlmmode);
95 return -1;
96}
97
98/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
99 DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
100
101static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
102{
103 int16_t cur = make_mode(cur_state);
104 if (lp->cur != DLM_LOCK_IV)
105 gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
106}
107
108static inline unsigned int make_flags(struct gdlm_lock *lp,
109 unsigned int gfs_flags,
110 int16_t cur, int16_t req)
111{
112 unsigned int lkf = 0;
113
114 if (gfs_flags & LM_FLAG_TRY)
115 lkf |= DLM_LKF_NOQUEUE;
116
117 if (gfs_flags & LM_FLAG_TRY_1CB) {
118 lkf |= DLM_LKF_NOQUEUE;
119 lkf |= DLM_LKF_NOQUEUEBAST;
120 }
121
122 if (gfs_flags & LM_FLAG_PRIORITY) {
123 lkf |= DLM_LKF_NOORDER;
124 lkf |= DLM_LKF_HEADQUE;
125 }
126
127 if (gfs_flags & LM_FLAG_ANY) {
128 if (req == DLM_LOCK_PR)
129 lkf |= DLM_LKF_ALTCW;
130 else if (req == DLM_LOCK_CW)
131 lkf |= DLM_LKF_ALTPR;
132 }
133
134 if (lp->lksb.sb_lkid != 0) {
135 lkf |= DLM_LKF_CONVERT;
136
137 /* Conversion deadlock avoidance by DLM */
138
139 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
140 !(lkf & DLM_LKF_NOQUEUE) &&
141 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
142 lkf |= DLM_LKF_CONVDEADLK;
143 }
144
145 if (lp->lvb)
146 lkf |= DLM_LKF_VALBLK;
147
148 return lkf;
149}
150
151/* make_strname - convert GFS lock numbers to a string */
152
153static inline void make_strname(struct lm_lockname *lockname,
154 struct gdlm_strname *str)
155{
156 str->namelen = sprintf(str->name, "%8x%16llx", lockname->ln_type,
157 lockname->ln_number);
159}
160
161int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
162 struct gdlm_lock **lpp)
163{
164 struct gdlm_lock *lp;
165
166 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
167 if (!lp)
168 return -ENOMEM;
169
170 lp->lockname = *name;
171 lp->ls = ls;
172 lp->cur = DLM_LOCK_IV;
173 lp->lvb = NULL;
174 lp->hold_null = NULL;
175 init_completion(&lp->ast_wait);
176 INIT_LIST_HEAD(&lp->clist);
177 INIT_LIST_HEAD(&lp->blist);
178 INIT_LIST_HEAD(&lp->delay_list);
179
180 spin_lock(&ls->async_lock);
181 list_add(&lp->all_list, &ls->all_locks);
182 ls->all_locks_count++;
183 spin_unlock(&ls->async_lock);
184
185 *lpp = lp;
186 return 0;
187}
188
189void gdlm_delete_lp(struct gdlm_lock *lp)
190{
191 struct gdlm_ls *ls = lp->ls;
192
193 spin_lock(&ls->async_lock);
194 if (!list_empty(&lp->clist))
195 list_del_init(&lp->clist);
196 if (!list_empty(&lp->blist))
197 list_del_init(&lp->blist);
198 if (!list_empty(&lp->delay_list))
199 list_del_init(&lp->delay_list);
200 gdlm_assert(!list_empty(&lp->all_list),
201 "%x,%llx", lp->lockname.ln_type, lp->lockname.ln_number);
202 list_del_init(&lp->all_list);
203 ls->all_locks_count--;
204 spin_unlock(&ls->async_lock);
205
206 kfree(lp);
207}
208
209int gdlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
210 lm_lock_t **lockp)
211{
212 struct gdlm_lock *lp;
213 int error;
214
215 error = gdlm_create_lp((struct gdlm_ls *) lockspace, name, &lp);
216
217 *lockp = (lm_lock_t *) lp;
218 return error;
219}
220
221void gdlm_put_lock(lm_lock_t *lock)
222{
223 gdlm_delete_lp((struct gdlm_lock *) lock);
224}
225
226unsigned int gdlm_do_lock(struct gdlm_lock *lp)
227{
228 struct gdlm_ls *ls = lp->ls;
229 struct gdlm_strname str;
230 int error, bast = 1;
231
232 /*
233 * When recovery is in progress, delay lock requests for submission
234 * once recovery is done. Requests for recovery (NOEXP) and unlocks
235 * can pass.
236 */
237
238 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
239 !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
240 gdlm_queue_delayed(lp);
241 return LM_OUT_ASYNC;
242 }
243
244 /*
245 * Submit the actual lock request.
246 */
247
248 if (test_bit(LFL_NOBAST, &lp->flags))
249 bast = 0;
250
251 make_strname(&lp->lockname, &str);
252
253 set_bit(LFL_ACTIVE, &lp->flags);
254
255 log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
256 lp->lockname.ln_number, lp->lksb.sb_lkid,
257 lp->cur, lp->req, lp->lkf);
258
259 error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
260 str.name, str.namelen, 0, gdlm_ast, (void *) lp,
261 bast ? gdlm_bast : NULL);
262
263 if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
264 lp->lksb.sb_status = -EAGAIN;
265 queue_complete(lp);
266 error = 0;
267 }
268
269 if (error) {
270 log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
271 "flags=%lx", ls->fsname, lp->lockname.ln_type,
272 lp->lockname.ln_number, error, lp->cur, lp->req,
273 lp->lkf, lp->flags);
274 return LM_OUT_ERROR;
275 }
276 return LM_OUT_ASYNC;
277}
278
279unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
280{
281 struct gdlm_ls *ls = lp->ls;
282 unsigned int lkf = 0;
283 int error;
284
285 set_bit(LFL_DLM_UNLOCK, &lp->flags);
286 set_bit(LFL_ACTIVE, &lp->flags);
287
288 if (lp->lvb)
289 lkf = DLM_LKF_VALBLK;
290
291 log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
292 lp->lockname.ln_number, lp->lksb.sb_lkid, lp->cur, lkf);
293
294 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
295
296 if (error) {
297 log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
298 "flags=%lx", ls->fsname, lp->lockname.ln_type,
299 lp->lockname.ln_number, error, lp->cur, lp->req,
300 lp->lkf, lp->flags);
301 return LM_OUT_ERROR;
302 }
303 return LM_OUT_ASYNC;
304}
305
306unsigned int gdlm_lock(lm_lock_t *lock, unsigned int cur_state,
307 unsigned int req_state, unsigned int flags)
308{
309 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
310
311 clear_bit(LFL_DLM_CANCEL, &lp->flags);
312 if (flags & LM_FLAG_NOEXP)
313 set_bit(LFL_NOBLOCK, &lp->flags);
314
315 check_cur_state(lp, cur_state);
316 lp->req = make_mode(req_state);
317 lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
318
319 return gdlm_do_lock(lp);
320}
321
322unsigned int gdlm_unlock(lm_lock_t *lock, unsigned int cur_state)
323{
324 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
325
326 clear_bit(LFL_DLM_CANCEL, &lp->flags);
327 if (lp->cur == DLM_LOCK_IV)
328 return 0;
329 return gdlm_do_unlock(lp);
330}
331
332void gdlm_cancel(lm_lock_t *lock)
333{
334 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
335 struct gdlm_ls *ls = lp->ls;
336 int error, delay_list = 0;
337
338 if (test_bit(LFL_DLM_CANCEL, &lp->flags))
339 return;
340
341 log_info("gdlm_cancel %x,%llx flags %lx",
342 lp->lockname.ln_type, lp->lockname.ln_number, lp->flags);
343
344 spin_lock(&ls->async_lock);
345 if (!list_empty(&lp->delay_list)) {
346 list_del_init(&lp->delay_list);
347 delay_list = 1;
348 }
349 spin_unlock(&ls->async_lock);
350
351 if (delay_list) {
352 set_bit(LFL_CANCEL, &lp->flags);
353 set_bit(LFL_ACTIVE, &lp->flags);
354 queue_complete(lp);
355 return;
356 }
357
358 if (!test_bit(LFL_ACTIVE, &lp->flags) ||
359 test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
360 log_info("gdlm_cancel skip %x,%llx flags %lx",
361 lp->lockname.ln_type, lp->lockname.ln_number,
362 lp->flags);
363 return;
364 }
365
366 /* the lock is blocked in the dlm */
367
368 set_bit(LFL_DLM_CANCEL, &lp->flags);
369 set_bit(LFL_ACTIVE, &lp->flags);
370
371 error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
372 NULL, lp);
373
374 log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
375 lp->lockname.ln_type, lp->lockname.ln_number, lp->flags);
376
377 if (error == -EBUSY)
378 clear_bit(LFL_DLM_CANCEL, &lp->flags);
379}
380
381int gdlm_add_lvb(struct gdlm_lock *lp)
382{
383 char *lvb;
384
385 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
386 if (!lvb)
387 return -ENOMEM;
388
389 lp->lksb.sb_lvbptr = lvb;
390 lp->lvb = lvb;
391 return 0;
392}
393
394void gdlm_del_lvb(struct gdlm_lock *lp)
395{
396 kfree(lp->lvb);
397 lp->lvb = NULL;
398 lp->lksb.sb_lvbptr = NULL;
399}
400
401/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
402 the completion) because gfs won't call hold_lvb() during a callback (from
403 the context of a lock_dlm thread). */
404
405static int hold_null_lock(struct gdlm_lock *lp)
406{
407 struct gdlm_lock *lpn = NULL;
408 int error;
409
410 if (lp->hold_null) {
411 printk(KERN_INFO "lock_dlm: lvb already held\n");
412 return 0;
413 }
414
415 error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
416 if (error)
417 goto out;
418
419 lpn->lksb.sb_lvbptr = junk_lvb;
420 lpn->lvb = junk_lvb;
421
422 lpn->req = DLM_LOCK_NL;
423 lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
424 set_bit(LFL_NOBAST, &lpn->flags);
425 set_bit(LFL_INLOCK, &lpn->flags);
426
427 init_completion(&lpn->ast_wait);
428 gdlm_do_lock(lpn);
429 wait_for_completion(&lpn->ast_wait);
430 error = lpn->lksb.sb_status;
431 if (error) {
432 printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
433 error);
434 gdlm_delete_lp(lpn);
435 lpn = NULL;
436 }
437 out:
438 lp->hold_null = lpn;
439 return error;
440}
441
442/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
443 the completion) because gfs may call unhold_lvb() during a callback (from
444 the context of a lock_dlm thread) which could cause a deadlock since the
445 other lock_dlm thread could be engaged in recovery. */
446
447static void unhold_null_lock(struct gdlm_lock *lp)
448{
449 struct gdlm_lock *lpn = lp->hold_null;
450
451 gdlm_assert(lpn, "%x,%llx",
452 lp->lockname.ln_type, lp->lockname.ln_number);
453 lpn->lksb.sb_lvbptr = NULL;
454 lpn->lvb = NULL;
455 set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
456 gdlm_do_unlock(lpn);
457 lp->hold_null = NULL;
458}
459
460/* Acquire a NL lock because gfs requires the value block to remain
461 intact on the resource while the lvb is "held" even if it's holding no locks
462 on the resource. */
463
464int gdlm_hold_lvb(lm_lock_t *lock, char **lvbp)
465{
466 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
467 int error;
468
469 error = gdlm_add_lvb(lp);
470 if (error)
471 return error;
472
473 *lvbp = lp->lvb;
474
475 error = hold_null_lock(lp);
476 if (error)
477 gdlm_del_lvb(lp);
478
479 return error;
480}
481
482void gdlm_unhold_lvb(lm_lock_t *lock, char *lvb)
483{
484 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
485
486 unhold_null_lock(lp);
487 gdlm_del_lvb(lp);
488}
489
490void gdlm_sync_lvb(lm_lock_t *lock, char *lvb)
491{
492 struct gdlm_lock *lp = (struct gdlm_lock *) lock;
493
494 if (lp->cur != DLM_LOCK_EX)
495 return;
496
497 init_completion(&lp->ast_wait);
498 set_bit(LFL_SYNC_LVB, &lp->flags);
499
500 lp->req = DLM_LOCK_EX;
501 lp->lkf = make_flags(lp, 0, lp->cur, lp->req);
502
503 gdlm_do_lock(lp);
504 wait_for_completion(&lp->ast_wait);
505}
506
507void gdlm_submit_delayed(struct gdlm_ls *ls)
508{
509 struct gdlm_lock *lp, *safe;
510
511 spin_lock(&ls->async_lock);
512 list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
513 list_del_init(&lp->delay_list);
514 list_add_tail(&lp->delay_list, &ls->submit);
515 }
516 spin_unlock(&ls->async_lock);
517 wake_up(&ls->thread_wait);
518}
519
520int gdlm_release_all_locks(struct gdlm_ls *ls)
521{
522 struct gdlm_lock *lp, *safe;
523 int count = 0;
524
525 spin_lock(&ls->async_lock);
526 list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
527 list_del_init(&lp->all_list);
528
529 if (lp->lvb && lp->lvb != junk_lvb)
530 kfree(lp->lvb);
531 kfree(lp);
532 count++;
533 }
534 spin_unlock(&ls->async_lock);
535
536 return count;
537}
538
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..6d76146953ce
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef LOCK_DLM_DOT_H
11#define LOCK_DLM_DOT_H
12
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/spinlock.h>
17#include <linux/types.h>
18#include <linux/string.h>
19#include <linux/list.h>
20#include <linux/socket.h>
21#include <linux/delay.h>
22#include <linux/kthread.h>
23#include <linux/kobject.h>
24#include <linux/fcntl.h>
25#include <linux/wait.h>
26#include <net/sock.h>
27
28#include <linux/dlm.h>
29#include "../../lm_interface.h"
30
31/*
32 * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
33 * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
34 * as "lock_dlm".
35 */
36
37#define GDLM_STRNAME_BYTES 25 /* 24 chars from "%8x%16llx" plus '\0' */
38#define GDLM_LVB_SIZE 32
39#define GDLM_DROP_COUNT 50000
40#define GDLM_DROP_PERIOD 60
41#define GDLM_NAME_LEN 128
42
43/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
44 We sprintf these numbers into a 24 byte string of hex values to make them
45 human-readable (to make debugging simpler.) */
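
/* Worked example (illustrative): type LM_TYPE_INODE (0x2), number 0x3039
   is formatted by make_strname() in lock.c ("%8x%16llx") as the 24-char
   string "       2            3039". */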
46
47struct gdlm_strname {
48 unsigned char name[GDLM_STRNAME_BYTES];
49 unsigned short namelen;
50};
51
52enum {
53 DFL_BLOCK_LOCKS = 0,
54 DFL_SPECTATOR = 1,
55 DFL_WITHDRAW = 2,
56};
57
58struct gdlm_ls {
59 uint32_t id;
60 int jid;
61 int first;
62 int first_done;
63 unsigned long flags;
64 struct kobject kobj;
65 char clustername[GDLM_NAME_LEN];
66 char fsname[GDLM_NAME_LEN];
67 int fsflags;
68 dlm_lockspace_t *dlm_lockspace;
69 lm_callback_t fscb;
70 lm_fsdata_t *fsdata;
71 int recover_jid;
72 int recover_jid_done;
73 spinlock_t async_lock;
74 struct list_head complete;
75 struct list_head blocking;
76 struct list_head delayed;
77 struct list_head submit;
78 struct list_head all_locks;
79 uint32_t all_locks_count;
80 wait_queue_head_t wait_control;
81 struct task_struct *thread1;
82 struct task_struct *thread2;
83 wait_queue_head_t thread_wait;
84 unsigned long drop_time;
85 int drop_locks_count;
86 int drop_locks_period;
87};
88
89enum {
90 LFL_NOBLOCK = 0,
91 LFL_NOCACHE = 1,
92 LFL_DLM_UNLOCK = 2,
93 LFL_DLM_CANCEL = 3,
94 LFL_SYNC_LVB = 4,
95 LFL_FORCE_PROMOTE = 5,
96 LFL_REREQUEST = 6,
97 LFL_ACTIVE = 7,
98 LFL_INLOCK = 8,
99 LFL_CANCEL = 9,
100 LFL_NOBAST = 10,
101 LFL_HEADQUE = 11,
102 LFL_UNLOCK_DELETE = 12,
103};
104
105struct gdlm_lock {
106 struct gdlm_ls *ls;
107 struct lm_lockname lockname;
108 char *lvb;
109 struct dlm_lksb lksb;
110
111 int16_t cur;
112 int16_t req;
113 int16_t prev_req;
114 uint32_t lkf; /* dlm flags DLM_LKF_ */
115 unsigned long flags; /* lock_dlm flags LFL_ */
116
117 int bast_mode; /* protected by async_lock */
118 struct completion ast_wait;
119
120 struct list_head clist; /* complete */
121 struct list_head blist; /* blocking */
122 struct list_head delay_list; /* delayed */
123 struct list_head all_list; /* all locks for the fs */
124 struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
125};
126
127#define gdlm_assert(assertion, fmt, args...) \
128do { \
129 if (unlikely(!(assertion))) { \
130 printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
131 "lock_dlm: " fmt "\n", \
132 #assertion, ##args); \
133 BUG(); \
134 } \
135} while (0)
136
137#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
138#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
139#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
140#ifdef LOCK_DLM_LOG_DEBUG
141#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
142#else
143#define log_debug(fmt, arg...)
144#endif
145
146/* sysfs.c */
147
148int gdlm_sysfs_init(void);
149void gdlm_sysfs_exit(void);
150int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
151void gdlm_kobject_release(struct gdlm_ls *);
152
153/* thread.c */
154
155int gdlm_init_threads(struct gdlm_ls *);
156void gdlm_release_threads(struct gdlm_ls *);
157
158/* lock.c */
159
160int16_t gdlm_make_lmstate(int16_t);
161void gdlm_queue_delayed(struct gdlm_lock *);
162void gdlm_submit_delayed(struct gdlm_ls *);
163int gdlm_release_all_locks(struct gdlm_ls *);
164int gdlm_create_lp(struct gdlm_ls *, struct lm_lockname *, struct gdlm_lock **);
165void gdlm_delete_lp(struct gdlm_lock *);
166int gdlm_add_lvb(struct gdlm_lock *);
167void gdlm_del_lvb(struct gdlm_lock *);
168unsigned int gdlm_do_lock(struct gdlm_lock *);
169unsigned int gdlm_do_unlock(struct gdlm_lock *);
170
171int gdlm_get_lock(lm_lockspace_t *, struct lm_lockname *, lm_lock_t **);
172void gdlm_put_lock(lm_lock_t *);
173unsigned int gdlm_lock(lm_lock_t *, unsigned int, unsigned int, unsigned int);
174unsigned int gdlm_unlock(lm_lock_t *, unsigned int);
175void gdlm_cancel(lm_lock_t *);
176int gdlm_hold_lvb(lm_lock_t *, char **);
177void gdlm_unhold_lvb(lm_lock_t *, char *);
178void gdlm_sync_lvb(lm_lock_t *, char *);
179
180/* plock.c */
181
182int gdlm_plock_init(void);
183void gdlm_plock_exit(void);
184int gdlm_plock(lm_lockspace_t *, struct lm_lockname *, struct file *, int,
185 struct file_lock *);
186int gdlm_plock_get(lm_lockspace_t *, struct lm_lockname *, struct file *,
187 struct file_lock *);
188int gdlm_punlock(lm_lockspace_t *, struct lm_lockname *, struct file *,
189 struct file_lock *);
190#endif
191
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..3c9adf18fd9c
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/init.h>
11
12#include "lock_dlm.h"
13
14extern int gdlm_drop_count;
15extern int gdlm_drop_period;
16
17extern struct lm_lockops gdlm_ops;
18
19int __init init_lock_dlm(void)
20{
21 int error;
22
23 error = gfs_register_lockproto(&gdlm_ops);
24 if (error) {
25 printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
26 error);
27 return error;
28 }
29
30 error = gdlm_sysfs_init();
31 if (error) {
32 gfs_unregister_lockproto(&gdlm_ops);
33 return error;
34 }
35
36 error = gdlm_plock_init();
37 if (error) {
38 gdlm_sysfs_exit();
39 gfs_unregister_lockproto(&gdlm_ops);
40 return error;
41 }
42
43 gdlm_drop_count = GDLM_DROP_COUNT;
44 gdlm_drop_period = GDLM_DROP_PERIOD;
45
46 printk(KERN_INFO
47 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
48 return 0;
49}
50
51void __exit exit_lock_dlm(void)
52{
53 gdlm_plock_exit();
54 gdlm_sysfs_exit();
55 gfs_unregister_lockproto(&gdlm_ops);
56}
57
58module_init(init_lock_dlm);
59module_exit(exit_lock_dlm);
60
61MODULE_DESCRIPTION("GFS DLM Locking Module");
62MODULE_AUTHOR("Red Hat, Inc.");
63MODULE_LICENSE("GPL");
64
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..026f05ce168d
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,255 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12int gdlm_drop_count;
13int gdlm_drop_period;
14struct lm_lockops gdlm_ops;
15
16
17static struct gdlm_ls *init_gdlm(lm_callback_t cb, lm_fsdata_t *fsdata,
18 int flags, char *table_name)
19{
20 struct gdlm_ls *ls;
21 char buf[256], *p;
22
23 ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
24 if (!ls)
25 return NULL;
26
27 ls->drop_locks_count = gdlm_drop_count;
28 ls->drop_locks_period = gdlm_drop_period;
29 ls->fscb = cb;
30 ls->fsdata = fsdata;
31 ls->fsflags = flags;
32 spin_lock_init(&ls->async_lock);
33 INIT_LIST_HEAD(&ls->complete);
34 INIT_LIST_HEAD(&ls->blocking);
35 INIT_LIST_HEAD(&ls->delayed);
36 INIT_LIST_HEAD(&ls->submit);
37 INIT_LIST_HEAD(&ls->all_locks);
38 init_waitqueue_head(&ls->thread_wait);
39 init_waitqueue_head(&ls->wait_control);
40 ls->thread1 = NULL;
41 ls->thread2 = NULL;
42 ls->drop_time = jiffies;
43 ls->jid = -1;
44
45 strncpy(buf, table_name, 256);
46 buf[255] = '\0';
47
48 p = strstr(buf, ":");
49 if (!p) {
50 log_info("invalid table_name \"%s\"", table_name);
51 kfree(ls);
52 return NULL;
53 }
54 *p = '\0';
55 p++;
56
57 strncpy(ls->clustername, buf, GDLM_NAME_LEN);
58 strncpy(ls->fsname, p, GDLM_NAME_LEN);
59
60 return ls;
61}
62
63static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
64{
65 char data[256];
66 char *options, *x, *y;
67 int error = 0;
68
69 memset(data, 0, 256);
70 strncpy(data, data_arg, 255);
71
72 for (options = data; (x = strsep(&options, ":")); ) {
73 if (!*x)
74 continue;
75
76 y = strchr(x, '=');
77 if (y)
78 *y++ = 0;
79
80 if (!strcmp(x, "jid")) {
81 if (!y) {
82 log_error("need argument to jid");
83 error = -EINVAL;
84 break;
85 }
86 sscanf(y, "%u", &ls->jid);
87
88 } else if (!strcmp(x, "first")) {
89 if (!y) {
90 log_error("need argument to first");
91 error = -EINVAL;
92 break;
93 }
94 sscanf(y, "%u", &ls->first);
95
96 } else if (!strcmp(x, "id")) {
97 if (!y) {
98 log_error("need argument to id");
99 error = -EINVAL;
100 break;
101 }
102 sscanf(y, "%u", &ls->id);
103
104 } else if (!strcmp(x, "nodir")) {
105 if (!y) {
106 log_error("need argument to nodir");
107 error = -EINVAL;
108 break;
109 }
110 sscanf(y, "%u", nodir);
111
112 } else {
113 log_error("unkonwn option: %s", x);
114 error = -EINVAL;
115 break;
116 }
117 }
118
119 return error;
120}
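
/* Worked example (hypothetical values): a host_data string of the form
 * "jid=1:id=3276804:first=0:nodir=1" parses to ls->jid = 1,
 * ls->id = 3276804, ls->first = 0 and *nodir = 1. */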
121
122static int gdlm_mount(char *table_name, char *host_data,
123 lm_callback_t cb, lm_fsdata_t *fsdata,
124 unsigned int min_lvb_size, int flags,
125 struct lm_lockstruct *lockstruct,
126 struct kobject *fskobj)
127{
128 struct gdlm_ls *ls;
129 int error = -ENOMEM, nodir = 0;
130
131 if (min_lvb_size > GDLM_LVB_SIZE)
132 goto out;
133
134 ls = init_gdlm(cb, fsdata, flags, table_name);
135 if (!ls)
136 goto out;
137
138 error = make_args(ls, host_data, &nodir);
139 if (error)
140 goto out;
141
142 error = gdlm_init_threads(ls);
143 if (error)
144 goto out_free;
145
146 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
147 &ls->dlm_lockspace,
148 nodir ? DLM_LSFL_NODIR : 0,
149 GDLM_LVB_SIZE);
150 if (error) {
151 log_error("dlm_new_lockspace error %d", error);
152 goto out_thread;
153 }
154
155 error = gdlm_kobject_setup(ls, fskobj);
156 if (error)
157 goto out_dlm;
158
159 lockstruct->ls_jid = ls->jid;
160 lockstruct->ls_first = ls->first;
161 lockstruct->ls_lockspace = ls;
162 lockstruct->ls_ops = &gdlm_ops;
163 lockstruct->ls_flags = 0;
164 lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
165 return 0;
166
167 out_dlm:
168 dlm_release_lockspace(ls->dlm_lockspace, 2);
169 out_thread:
170 gdlm_release_threads(ls);
171 out_free:
172 kfree(ls);
173 out:
174 return error;
175}
176
177static void gdlm_unmount(lm_lockspace_t *lockspace)
178{
179 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
180 int rv;
181
182 log_debug("unmount flags %lx", ls->flags);
183
184 /* FIXME: serialize unmount and withdraw in case they
185 happen at once. Also, if unmount follows withdraw,
186 wait for withdraw to finish. */
187
188 if (test_bit(DFL_WITHDRAW, &ls->flags))
189 goto out;
190
191 gdlm_kobject_release(ls);
192 dlm_release_lockspace(ls->dlm_lockspace, 2);
193 gdlm_release_threads(ls);
194 rv = gdlm_release_all_locks(ls);
195 if (rv)
196 log_info("gdlm_unmount: %d stray locks freed", rv);
197 out:
198 kfree(ls);
199}
200
201static void gdlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
202 unsigned int message)
203{
204 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
205 ls->recover_jid_done = jid;
206 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
207}
208
209static void gdlm_others_may_mount(lm_lockspace_t *lockspace)
210{
211 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
212 ls->first_done = 1;
213 kobject_uevent(&ls->kobj, KOBJ_CHANGE);
214}
215
216/* Userspace gets the offline uevent, blocks new gfs locks on
217 other mounters, and lets us know (sets WITHDRAW flag). Then,
218 userspace leaves the mount group while we leave the lockspace. */
219
220static void gdlm_withdraw(lm_lockspace_t *lockspace)
221{
222 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
223
224 kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
225
226 wait_event_interruptible(ls->wait_control,
227 test_bit(DFL_WITHDRAW, &ls->flags));
228
229 dlm_release_lockspace(ls->dlm_lockspace, 2);
230 gdlm_release_threads(ls);
231 gdlm_release_all_locks(ls);
232 gdlm_kobject_release(ls);
233}
234
235struct lm_lockops gdlm_ops = {
236 .lm_proto_name = "lock_dlm",
237 .lm_mount = gdlm_mount,
238 .lm_others_may_mount = gdlm_others_may_mount,
239 .lm_unmount = gdlm_unmount,
240 .lm_withdraw = gdlm_withdraw,
241 .lm_get_lock = gdlm_get_lock,
242 .lm_put_lock = gdlm_put_lock,
243 .lm_lock = gdlm_lock,
244 .lm_unlock = gdlm_unlock,
245 .lm_plock = gdlm_plock,
246 .lm_punlock = gdlm_punlock,
247 .lm_plock_get = gdlm_plock_get,
248 .lm_cancel = gdlm_cancel,
249 .lm_hold_lvb = gdlm_hold_lvb,
250 .lm_unhold_lvb = gdlm_unhold_lvb,
251 .lm_sync_lvb = gdlm_sync_lvb,
252 .lm_recovery_done = gdlm_recovery_done,
253 .lm_owner = THIS_MODULE,
254};
255
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..f7ac5821def9
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,298 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11
12#include "lock_dlm.h"
13
14
15static spinlock_t ops_lock;
16static struct list_head send_list;
17static struct list_head recv_list;
18static wait_queue_head_t send_wq;
19static wait_queue_head_t recv_wq;
20
21struct plock_op {
22 struct list_head list;
23 int done;
24 struct gdlm_plock_info info;
25};
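
/* Note on the flow implemented below (a summary, assuming the misc
 * device read/write handlers later in this file): an operation is queued
 * on send_list for the userspace lock manager to read, the caller sleeps
 * on recv_wq until userspace writes back the result and sets op->done,
 * and op->info.rv then carries the outcome back to the caller. */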
26
27static inline void set_version(struct gdlm_plock_info *info)
28{
29 info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
30 info->version[1] = GDLM_PLOCK_VERSION_MINOR;
31 info->version[2] = GDLM_PLOCK_VERSION_PATCH;
32}
33
34static int check_version(struct gdlm_plock_info *info)
35{
36 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
37 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
38 log_error("plock device version mismatch: "
39 "kernel (%u.%u.%u), user (%u.%u.%u)",
40 GDLM_PLOCK_VERSION_MAJOR,
41 GDLM_PLOCK_VERSION_MINOR,
42 GDLM_PLOCK_VERSION_PATCH,
43 info->version[0],
44 info->version[1],
45 info->version[2]);
46 return -EINVAL;
47 }
48 return 0;
49}
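/*
 * Illustrative note (not part of the patch): the check above accepts a
 * user daemon whose major version matches the kernel's exactly and whose
 * minor version is no newer than the kernel's. For example, a 1.1.x
 * daemon is compatible with a 1.2.x kernel, while a 1.3.x daemon is
 * rejected with -EINVAL.
 */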
50
51static void send_op(struct plock_op *op)
52{
53 set_version(&op->info);
54 INIT_LIST_HEAD(&op->list);
55 spin_lock(&ops_lock);
56 list_add_tail(&op->list, &send_list);
57 spin_unlock(&ops_lock);
58 wake_up(&send_wq);
59}
60
61int gdlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
62 struct file *file, int cmd, struct file_lock *fl)
63{
64 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
65 struct plock_op *op;
66 int rv;
67
68 op = kzalloc(sizeof(*op), GFP_KERNEL);
69 if (!op)
70 return -ENOMEM;
71
72 op->info.optype = GDLM_PLOCK_OP_LOCK;
73 op->info.pid = (uint32_t) fl->fl_owner;
74 op->info.ex = (fl->fl_type == F_WRLCK);
75 op->info.wait = IS_SETLKW(cmd);
76 op->info.fsid = ls->id;
77 op->info.number = name->ln_number;
78 op->info.start = fl->fl_start;
79 op->info.end = fl->fl_end;
80
81 send_op(op);
82 wait_event(recv_wq, (op->done != 0));
83
84 spin_lock(&ops_lock);
85 if (!list_empty(&op->list)) {
86 printk(KERN_INFO "plock op on list\n");
87 list_del(&op->list);
88 }
89 spin_unlock(&ops_lock);
90
91 rv = op->info.rv;
92
93 if (!rv) {
94 if (posix_lock_file_wait(file, fl) < 0)
95 log_error("gdlm_plock: vfs lock error %x,%llx",
96 name->ln_type, name->ln_number);
97 }
98
99 kfree(op);
100 return rv;
101}
102
103int gdlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
104 struct file *file, struct file_lock *fl)
105{
106 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
107 struct plock_op *op;
108 int rv;
109
110 op = kzalloc(sizeof(*op), GFP_KERNEL);
111 if (!op)
112 return -ENOMEM;
113
114 if (posix_lock_file_wait(file, fl) < 0)
115 log_error("gdlm_punlock: vfs unlock error %x,%llx",
116 name->ln_type, name->ln_number);
117
118 op->info.optype = GDLM_PLOCK_OP_UNLOCK;
119 op->info.pid = (uint32_t) fl->fl_owner;
120 op->info.fsid = ls->id;
121 op->info.number = name->ln_number;
122 op->info.start = fl->fl_start;
123 op->info.end = fl->fl_end;
124
125 send_op(op);
126 wait_event(recv_wq, (op->done != 0));
127
128 spin_lock(&ops_lock);
129 if (!list_empty(&op->list)) {
130 printk(KERN_INFO "punlock op on list\n");
131 list_del(&op->list);
132 }
133 spin_unlock(&ops_lock);
134
135 rv = op->info.rv;
136
137 kfree(op);
138 return rv;
139}
140
141int gdlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
142 struct file *file, struct file_lock *fl)
143{
144 struct gdlm_ls *ls = (struct gdlm_ls *) lockspace;
145 struct plock_op *op;
146 int rv;
147
148 op = kzalloc(sizeof(*op), GFP_KERNEL);
149 if (!op)
150 return -ENOMEM;
151
152 op->info.optype = GDLM_PLOCK_OP_GET;
153 op->info.pid = (uint32_t) fl->fl_owner;
154 op->info.ex = (fl->fl_type == F_WRLCK);
155 op->info.fsid = ls->id;
156 op->info.number = name->ln_number;
157 op->info.start = fl->fl_start;
158 op->info.end = fl->fl_end;
159
160 send_op(op);
161 wait_event(recv_wq, (op->done != 0));
162
163 spin_lock(&ops_lock);
164 if (!list_empty(&op->list)) {
165 printk(KERN_INFO "plock_get op on list\n");
166 list_del(&op->list);
167 }
168 spin_unlock(&ops_lock);
169
170 rv = op->info.rv;
171
172 if (rv == 0)
173 fl->fl_type = F_UNLCK;
174 else if (rv > 0) {
175 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
176 fl->fl_pid = op->info.pid;
177 fl->fl_start = op->info.start;
178 fl->fl_end = op->info.end;
179 }
180
181 kfree(op);
182 return rv;
183}
184
185/* a read copies out one plock request from the send list */
186static ssize_t dev_read(struct file *file, char __user *u, size_t count,
187 loff_t *ppos)
188{
189 struct gdlm_plock_info info;
190 struct plock_op *op = NULL;
191
192 if (count < sizeof(info))
193 return -EINVAL;
194
195 spin_lock(&ops_lock);
196 if (!list_empty(&send_list)) {
197 op = list_entry(send_list.next, struct plock_op, list);
198 list_move(&op->list, &recv_list);
199 memcpy(&info, &op->info, sizeof(info));
200 }
201 spin_unlock(&ops_lock);
202
203 if (!op)
204 return -EAGAIN;
205
206 if (copy_to_user(u, &info, sizeof(info)))
207 return -EFAULT;
208 return sizeof(info);
209}
210
211/* a write copies in one plock result that should match a plock_op
212 on the recv list */
213static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
214 loff_t *ppos)
215{
216 struct gdlm_plock_info info;
217 struct plock_op *op;
218 int found = 0;
219
220 if (count != sizeof(info))
221 return -EINVAL;
222
223 if (copy_from_user(&info, u, sizeof(info)))
224 return -EFAULT;
225
226 if (check_version(&info))
227 return -EINVAL;
228
229 spin_lock(&ops_lock);
230 list_for_each_entry(op, &recv_list, list) {
231 if (op->info.fsid == info.fsid &&
232 op->info.number == info.number) {
233 list_del_init(&op->list);
234 found = 1;
235 op->done = 1;
236 memcpy(&op->info, &info, sizeof(info));
237 break;
238 }
239 }
240 spin_unlock(&ops_lock);
241
242 if (found)
243 wake_up(&recv_wq);
244 else
245 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
246 info.number);
247 return count;
248}
249
250static unsigned int dev_poll(struct file *file, poll_table *wait)
251{
252 poll_wait(file, &send_wq, wait);
253
254 spin_lock(&ops_lock);
255 if (!list_empty(&send_list)) {
256 spin_unlock(&ops_lock);
257 return POLLIN | POLLRDNORM;
258 }
259 spin_unlock(&ops_lock);
260 return 0;
261}
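/*
 * Illustrative sketch (not part of the patch): the userspace side of the
 * misc-device protocol implemented by dev_read/dev_write/dev_poll above.
 * A daemon polls for requests, reads one gdlm_plock_info at a time,
 * resolves it against cluster-wide plock state, and writes the same
 * struct back with info.rv filled in. The device path is an assumption;
 * the real node is created from GDLM_PLOCK_MISC_NAME.
 */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>
#include <linux/lock_dlm_plock.h>

static void plock_daemon_loop(void)
{
	struct gdlm_plock_info info;
	struct pollfd pfd;
	int fd = open("/dev/lock_dlm_plock", O_RDWR);	/* assumed path */

	if (fd < 0)
		return;
	pfd.fd = fd;
	pfd.events = POLLIN;

	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;
		if (read(fd, &info, sizeof(info)) != sizeof(info))
			continue;	/* -EAGAIN: raced with another reader */
		/* ... decide the request against cluster plock state ... */
		info.rv = 0;	/* grant; fsid and number are echoed back */
		write(fd, &info, sizeof(info));
	}
	close(fd);
}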
262
263static struct file_operations dev_fops = {
264 .read = dev_read,
265 .write = dev_write,
266 .poll = dev_poll,
267 .owner = THIS_MODULE
268};
269
270static struct miscdevice plock_dev_misc = {
271 .minor = MISC_DYNAMIC_MINOR,
272 .name = GDLM_PLOCK_MISC_NAME,
273 .fops = &dev_fops
274};
275
276int gdlm_plock_init(void)
277{
278 int rv;
279
280 spin_lock_init(&ops_lock);
281 INIT_LIST_HEAD(&send_list);
282 INIT_LIST_HEAD(&recv_list);
283 init_waitqueue_head(&send_wq);
284 init_waitqueue_head(&recv_wq);
285
286 rv = misc_register(&plock_dev_misc);
287 if (rv)
288		printk(KERN_INFO "gdlm_plock_init: misc_register failed %d\n",
289		       rv);
290 return rv;
291}
292
293void gdlm_plock_exit(void)
294{
295 if (misc_deregister(&plock_dev_misc) < 0)
296		printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed\n");
297}
298
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..e1e5186c97c9
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,218 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/ctype.h>
11#include <linux/stat.h>
12
13#include "lock_dlm.h"
14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
20}
21
22static ssize_t block_show(struct gdlm_ls *ls, char *buf)
23{
24 ssize_t ret;
25 int val = 0;
26
27 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
28 val = 1;
29 ret = sprintf(buf, "%d\n", val);
30 return ret;
31}
32
33static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
34{
35 ssize_t ret = len;
36 int val;
37
38 val = simple_strtol(buf, NULL, 0);
39
40 if (val == 1)
41 set_bit(DFL_BLOCK_LOCKS, &ls->flags);
42 else if (val == 0) {
43 clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
44 gdlm_submit_delayed(ls);
45 } else
46 ret = -EINVAL;
47 return ret;
48}
49
50static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
51{
52 ssize_t ret;
53 int val = 0;
54
55 if (test_bit(DFL_WITHDRAW, &ls->flags))
56 val = 1;
57 ret = sprintf(buf, "%d\n", val);
58 return ret;
59}
60
61static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
62{
63 ssize_t ret = len;
64 int val;
65
66 val = simple_strtol(buf, NULL, 0);
67
68 if (val == 1)
69 set_bit(DFL_WITHDRAW, &ls->flags);
70 else
71 ret = -EINVAL;
72 wake_up(&ls->wait_control);
73 return ret;
74}
75
76static ssize_t id_show(struct gdlm_ls *ls, char *buf)
77{
78 return sprintf(buf, "%u\n", ls->id);
79}
80
81static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
82{
83 return sprintf(buf, "%d\n", ls->jid);
84}
85
86static ssize_t first_show(struct gdlm_ls *ls, char *buf)
87{
88 return sprintf(buf, "%d\n", ls->first);
89}
90
91static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
92{
93 return sprintf(buf, "%d\n", ls->first_done);
94}
95
96static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
97{
98 return sprintf(buf, "%d\n", ls->recover_jid);
99}
100
101static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
102{
103 ls->recover_jid = simple_strtol(buf, NULL, 0);
104 ls->fscb(ls->fsdata, LM_CB_NEED_RECOVERY, &ls->recover_jid);
105 return len;
106}
107
108static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
109{
110 return sprintf(buf, "%d\n", ls->recover_jid_done);
111}
112
113struct gdlm_attr {
114 struct attribute attr;
115 ssize_t (*show)(struct gdlm_ls *, char *);
116 ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
117};
118
119#define GDLM_ATTR(_name,_mode,_show,_store) \
120static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
121
122GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
123GDLM_ATTR(block, 0644, block_show, block_store);
124GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
125GDLM_ATTR(id, 0444, id_show, NULL);
126GDLM_ATTR(jid, 0444, jid_show, NULL);
127GDLM_ATTR(first, 0444, first_show, NULL);
128GDLM_ATTR(first_done, 0444, first_done_show, NULL);
129GDLM_ATTR(recover, 0644, recover_show, recover_store);
130GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
131
132static struct attribute *gdlm_attrs[] = {
133 &gdlm_attr_proto_name.attr,
134 &gdlm_attr_block.attr,
135 &gdlm_attr_withdraw.attr,
136 &gdlm_attr_id.attr,
137 &gdlm_attr_jid.attr,
138 &gdlm_attr_first.attr,
139 &gdlm_attr_first_done.attr,
140 &gdlm_attr_recover.attr,
141 &gdlm_attr_recover_done.attr,
142 NULL,
143};
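/*
 * Illustrative sketch (not part of the patch): how cluster tooling might
 * drive the attributes declared above, e.g. toggling "block" around
 * recovery. The sysfs path is an assumption based on the "lock_module"
 * kobject being registered under the filesystem's kobject.
 */
#include <stdio.h>

static int set_block(const char *fsname, int val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/gfs2/%s/lock_module/block", fsname);	/* assumed */
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", val);	/* 1 blocks locks; 0 unblocks and
					   resubmits delayed requests */
	return fclose(f);
}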
144
145static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
146 char *buf)
147{
148 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
149 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
150 return a->show ? a->show(ls, buf) : 0;
151}
152
153static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
154 const char *buf, size_t len)
155{
156 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
157 struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
158 return a->store ? a->store(ls, buf, len) : len;
159}
160
161static struct sysfs_ops gdlm_attr_ops = {
162 .show = gdlm_attr_show,
163 .store = gdlm_attr_store,
164};
165
166static struct kobj_type gdlm_ktype = {
167 .default_attrs = gdlm_attrs,
168 .sysfs_ops = &gdlm_attr_ops,
169};
170
171static struct kset gdlm_kset = {
172 .subsys = &kernel_subsys,
173 .kobj = {.name = "lock_dlm",},
174 .ktype = &gdlm_ktype,
175};
176
177int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
178{
179 int error;
180
181 error = kobject_set_name(&ls->kobj, "%s", "lock_module");
182 if (error) {
183 log_error("can't set kobj name %d", error);
184 return error;
185 }
186
187 ls->kobj.kset = &gdlm_kset;
188 ls->kobj.ktype = &gdlm_ktype;
189 ls->kobj.parent = fskobj;
190
191 error = kobject_register(&ls->kobj);
192 if (error)
193 log_error("can't register kobj %d", error);
194
195 return error;
196}
197
198void gdlm_kobject_release(struct gdlm_ls *ls)
199{
200 kobject_unregister(&ls->kobj);
201}
202
203int gdlm_sysfs_init(void)
204{
205 int error;
206
207 error = kset_register(&gdlm_kset);
208 if (error)
209 printk("lock_dlm: cannot register kset %d\n", error);
210
211 return error;
212}
213
214void gdlm_sysfs_exit(void)
215{
216 kset_unregister(&gdlm_kset);
217}
218
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..3e2edcc2dbf6
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,352 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include "lock_dlm.h"
11
12/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
13 thread gets to it. */
14
15static void queue_submit(struct gdlm_lock *lp)
16{
17 struct gdlm_ls *ls = lp->ls;
18
19 spin_lock(&ls->async_lock);
20 list_add_tail(&lp->delay_list, &ls->submit);
21 spin_unlock(&ls->async_lock);
22 wake_up(&ls->thread_wait);
23}
24
25static void process_blocking(struct gdlm_lock *lp, int bast_mode)
26{
27 struct gdlm_ls *ls = lp->ls;
28 unsigned int cb = 0;
29
30 switch (gdlm_make_lmstate(bast_mode)) {
31 case LM_ST_EXCLUSIVE:
32 cb = LM_CB_NEED_E;
33 break;
34 case LM_ST_DEFERRED:
35 cb = LM_CB_NEED_D;
36 break;
37 case LM_ST_SHARED:
38 cb = LM_CB_NEED_S;
39 break;
40 default:
41		gdlm_assert(0, "unknown bast mode %u", bast_mode);
42 }
43
44 ls->fscb(ls->fsdata, cb, &lp->lockname);
45}
46
47static void process_complete(struct gdlm_lock *lp)
48{
49 struct gdlm_ls *ls = lp->ls;
50 struct lm_async_cb acb;
51 int16_t prev_mode = lp->cur;
52
53 memset(&acb, 0, sizeof(acb));
54
55 if (lp->lksb.sb_status == -DLM_ECANCEL) {
56 log_info("complete dlm cancel %x,%llx flags %lx",
57 lp->lockname.ln_type, lp->lockname.ln_number,
58 lp->flags);
59
60 lp->req = lp->cur;
61 acb.lc_ret |= LM_OUT_CANCELED;
62 if (lp->cur == DLM_LOCK_IV)
63 lp->lksb.sb_lkid = 0;
64 goto out;
65 }
66
67 if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
68 if (lp->lksb.sb_status != -DLM_EUNLOCK) {
69 log_info("unlock sb_status %d %x,%llx flags %lx",
70 lp->lksb.sb_status, lp->lockname.ln_type,
71 lp->lockname.ln_number, lp->flags);
72 return;
73 }
74
75 lp->cur = DLM_LOCK_IV;
76 lp->req = DLM_LOCK_IV;
77 lp->lksb.sb_lkid = 0;
78
79 if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
80 gdlm_delete_lp(lp);
81 return;
82 }
83 goto out;
84 }
85
86 if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
87 memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
88
89 if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
90 if (lp->req == DLM_LOCK_PR)
91 lp->req = DLM_LOCK_CW;
92 else if (lp->req == DLM_LOCK_CW)
93 lp->req = DLM_LOCK_PR;
94 }
95
96 /*
97 * A canceled lock request. The lock was just taken off the delayed
98 * list and was never even submitted to dlm.
99 */
100
101 if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
102 log_info("complete internal cancel %x,%llx",
103 lp->lockname.ln_type, lp->lockname.ln_number);
104 lp->req = lp->cur;
105 acb.lc_ret |= LM_OUT_CANCELED;
106 goto out;
107 }
108
109 /*
110	 * An error occurred.
111 */
112
113 if (lp->lksb.sb_status) {
114 /* a "normal" error */
115 if ((lp->lksb.sb_status == -EAGAIN) &&
116 (lp->lkf & DLM_LKF_NOQUEUE)) {
117 lp->req = lp->cur;
118 if (lp->cur == DLM_LOCK_IV)
119 lp->lksb.sb_lkid = 0;
120 goto out;
121 }
122
123 /* this could only happen with cancels I think */
124 log_info("ast sb_status %d %x,%llx flags %lx",
125 lp->lksb.sb_status, lp->lockname.ln_type,
126 lp->lockname.ln_number, lp->flags);
127 return;
128 }
129
130 /*
131 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
132 */
133
134 if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
135 complete(&lp->ast_wait);
136 return;
137 }
138
139 /*
140 * A lock has been demoted to NL because it initially completed during
141 * BLOCK_LOCKS. Now it must be requested in the originally requested
142 * mode.
143 */
144
145 if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
146 gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
147 lp->lockname.ln_type, lp->lockname.ln_number);
148 gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
149 lp->lockname.ln_type, lp->lockname.ln_number);
150
151 lp->cur = DLM_LOCK_NL;
152 lp->req = lp->prev_req;
153 lp->prev_req = DLM_LOCK_IV;
154 lp->lkf &= ~DLM_LKF_CONVDEADLK;
155
156 set_bit(LFL_NOCACHE, &lp->flags);
157
158 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
159 !test_bit(LFL_NOBLOCK, &lp->flags))
160 gdlm_queue_delayed(lp);
161 else
162 queue_submit(lp);
163 return;
164 }
165
166 /*
167 * A request is granted during dlm recovery. It may be granted
168 * because the locks of a failed node were cleared. In that case,
169 * there may be inconsistent data beneath this lock and we must wait
170	 * for recovery to complete before using it. When gfs recovery is done, this
171 * granted lock will be converted to NL and then reacquired in this
172 * granted state.
173 */
174
175 if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
176 !test_bit(LFL_NOBLOCK, &lp->flags) &&
177 lp->req != DLM_LOCK_NL) {
178
179 lp->cur = lp->req;
180 lp->prev_req = lp->req;
181 lp->req = DLM_LOCK_NL;
182 lp->lkf |= DLM_LKF_CONVERT;
183 lp->lkf &= ~DLM_LKF_CONVDEADLK;
184
185 log_debug("rereq %x,%llx id %x %d,%d",
186 lp->lockname.ln_type, lp->lockname.ln_number,
187 lp->lksb.sb_lkid, lp->cur, lp->req);
188
189 set_bit(LFL_REREQUEST, &lp->flags);
190 queue_submit(lp);
191 return;
192 }
193
194 /*
195 * DLM demoted the lock to NL before it was granted so GFS must be
196 * told it cannot cache data for this lock.
197 */
198
199 if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
200 set_bit(LFL_NOCACHE, &lp->flags);
201
202 out:
203 /*
204 * This is an internal lock_dlm lock
205 */
206
207 if (test_bit(LFL_INLOCK, &lp->flags)) {
208 clear_bit(LFL_NOBLOCK, &lp->flags);
209 lp->cur = lp->req;
210 complete(&lp->ast_wait);
211 return;
212 }
213
214 /*
215 * Normal completion of a lock request. Tell GFS it now has the lock.
216 */
217
218 clear_bit(LFL_NOBLOCK, &lp->flags);
219 lp->cur = lp->req;
220
221 acb.lc_name = lp->lockname;
222 acb.lc_ret |= gdlm_make_lmstate(lp->cur);
223
224 if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
225 (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
226 acb.lc_ret |= LM_OUT_CACHEABLE;
227
228 ls->fscb(ls->fsdata, LM_CB_ASYNC, &acb);
229}
230
231static inline int no_work(struct gdlm_ls *ls, int blocking)
232{
233 int ret;
234
235 spin_lock(&ls->async_lock);
236 ret = list_empty(&ls->complete) && list_empty(&ls->submit);
237 if (ret && blocking)
238 ret = list_empty(&ls->blocking);
239 spin_unlock(&ls->async_lock);
240
241 return ret;
242}
243
244static inline int check_drop(struct gdlm_ls *ls)
245{
246 if (!ls->drop_locks_count)
247 return 0;
248
249 if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
250 ls->drop_time = jiffies;
251 if (ls->all_locks_count >= ls->drop_locks_count)
252 return 1;
253 }
254 return 0;
255}
256
257static int gdlm_thread(void *data)
258{
259 struct gdlm_ls *ls = (struct gdlm_ls *) data;
260 struct gdlm_lock *lp = NULL;
261 int blist = 0;
262 uint8_t complete, blocking, submit, drop;
263 DECLARE_WAITQUEUE(wait, current);
264
265 /* Only thread1 is allowed to do blocking callbacks since gfs
266 may wait for a completion callback within a blocking cb. */
267
268 if (current == ls->thread1)
269 blist = 1;
270
271 while (!kthread_should_stop()) {
272 set_current_state(TASK_INTERRUPTIBLE);
273 add_wait_queue(&ls->thread_wait, &wait);
274 if (no_work(ls, blist))
275 schedule();
276 remove_wait_queue(&ls->thread_wait, &wait);
277 set_current_state(TASK_RUNNING);
278
279 complete = blocking = submit = drop = 0;
280
281 spin_lock(&ls->async_lock);
282
283 if (blist && !list_empty(&ls->blocking)) {
284 lp = list_entry(ls->blocking.next, struct gdlm_lock,
285 blist);
286 list_del_init(&lp->blist);
287 blocking = lp->bast_mode;
288 lp->bast_mode = 0;
289 } else if (!list_empty(&ls->complete)) {
290 lp = list_entry(ls->complete.next, struct gdlm_lock,
291 clist);
292 list_del_init(&lp->clist);
293 complete = 1;
294 } else if (!list_empty(&ls->submit)) {
295 lp = list_entry(ls->submit.next, struct gdlm_lock,
296 delay_list);
297 list_del_init(&lp->delay_list);
298 submit = 1;
299 }
300
301 drop = check_drop(ls);
302 spin_unlock(&ls->async_lock);
303
304 if (complete)
305 process_complete(lp);
306
307 else if (blocking)
308 process_blocking(lp, blocking);
309
310 else if (submit)
311 gdlm_do_lock(lp);
312
313 if (drop)
314 ls->fscb(ls->fsdata, LM_CB_DROPLOCKS, NULL);
315
316 schedule();
317 }
318
319 return 0;
320}
321
322int gdlm_init_threads(struct gdlm_ls *ls)
323{
324 struct task_struct *p;
325 int error;
326
327 p = kthread_run(gdlm_thread, ls, "lock_dlm1");
328 error = IS_ERR(p);
329 if (error) {
330 log_error("can't start lock_dlm1 thread %d", error);
331 return error;
332 }
333 ls->thread1 = p;
334
335 p = kthread_run(gdlm_thread, ls, "lock_dlm2");
336 error = IS_ERR(p);
337 if (error) {
338 log_error("can't start lock_dlm2 thread %d", error);
339 kthread_stop(ls->thread1);
340 return error;
341 }
342 ls->thread2 = p;
343
344 return 0;
345}
346
347void gdlm_release_threads(struct gdlm_ls *ls)
348{
349 kthread_stop(ls->thread1);
350 kthread_stop(ls->thread2);
351}
352
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..cdadf956c831
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_GFS2_FS) += lock_nolock.o
2lock_nolock-y := main.o
3
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..ecd37371eba5
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,259 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/module.h>
11#include <linux/slab.h>
12
13#include <linux/init.h>
14#include <linux/types.h>
15#include <linux/fs.h>
16#include <linux/smp_lock.h>
17
18#include "../../lm_interface.h"
19
20struct nolock_lockspace {
21 unsigned int nl_lvb_size;
22};
23
24struct lm_lockops nolock_ops;
25
26static int nolock_mount(char *table_name, char *host_data,
27 lm_callback_t cb, lm_fsdata_t *fsdata,
28 unsigned int min_lvb_size, int flags,
29 struct lm_lockstruct *lockstruct,
30 struct kobject *fskobj)
31{
32 char *c;
33 unsigned int jid;
34 struct nolock_lockspace *nl;
35
36 c = strstr(host_data, "jid=");
37 if (!c)
38 jid = 0;
39 else {
40 c += 4;
41 sscanf(c, "%u", &jid);
42 }
43
44 nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
45 if (!nl)
46 return -ENOMEM;
47
48 nl->nl_lvb_size = min_lvb_size;
49
50 lockstruct->ls_jid = jid;
51 lockstruct->ls_first = 1;
52 lockstruct->ls_lvb_size = min_lvb_size;
53 lockstruct->ls_lockspace = (lm_lockspace_t *)nl;
54 lockstruct->ls_ops = &nolock_ops;
55 lockstruct->ls_flags = LM_LSFLAG_LOCAL;
56
57 return 0;
58}
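/*
 * Illustrative sketch (not part of the patch): the host_data parsing
 * above expects a mount data string containing "jid=N" and falls back to
 * journal 0 when the token is absent. A standalone restatement:
 */
#include <stdio.h>
#include <string.h>

static unsigned int parse_jid(const char *host_data)
{
	const char *c = strstr(host_data, "jid=");
	unsigned int jid = 0;

	if (c)
		sscanf(c + 4, "%u", &jid);
	return jid;
}

/* parse_jid("jid=2") == 2, parse_jid("") == 0 */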
59
60static void nolock_others_may_mount(lm_lockspace_t *lockspace)
61{
62}
63
64static void nolock_unmount(lm_lockspace_t *lockspace)
65{
66 struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace;
67 kfree(nl);
68}
69
70static void nolock_withdraw(lm_lockspace_t *lockspace)
71{
72}
73
74/**
75 * nolock_get_lock - get a lm_lock_t given a description of the lock
76 * @lockspace: the lockspace the lock lives in
77 * @name: the name of the lock
78 * @lockp: return the lm_lock_t here
79 *
80 * Returns: 0 on success, -EXXX on failure
81 */
82
83static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name,
84 lm_lock_t **lockp)
85{
86 *lockp = (lm_lock_t *)lockspace;
87 return 0;
88}
89
90/**
91 * nolock_put_lock - get rid of a lock structure
92 * @lock: the lock to throw away
93 *
94 */
95
96static void nolock_put_lock(lm_lock_t *lock)
97{
98}
99
100/**
101 * nolock_lock - acquire a lock
102 * @lock: the lock to manipulate
103 * @cur_state: the current state
104 * @req_state: the requested state
105 * @flags: modifier flags
106 *
107 * Returns: A bitmap of LM_OUT_*
108 */
109
110static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state,
111 unsigned int req_state, unsigned int flags)
112{
113 return req_state | LM_OUT_CACHEABLE;
114}
115
116/**
117 * nolock_unlock - unlock a lock
118 * @lock: the lock to manipulate
119 * @cur_state: the current state
120 *
121 * Returns: 0
122 */
123
124static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state)
125{
126 return 0;
127}
128
129static void nolock_cancel(lm_lock_t *lock)
130{
131}
132
133/**
134 * nolock_hold_lvb - hold on to a lock value block
135 * @lock: the lock the LVB is associated with
136 * @lvbp: return the lm_lvb_t here
137 *
138 * Returns: 0 on success, -EXXX on failure
139 */
140
141static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp)
142{
143 struct nolock_lockspace *nl = (struct nolock_lockspace *)lock;
144 int error = 0;
145
146 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
147 if (!*lvbp)
148 error = -ENOMEM;
149
150 return error;
151}
152
153/**
154 * nolock_unhold_lvb - release a LVB
155 * @lock: the lock the LVB is associated with
156 * @lvb: the lock value block
157 *
158 */
159
160static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb)
161{
162 kfree(lvb);
163}
164
165/**
166 * nolock_sync_lvb - sync out the value of an lvb
167 * @lock: the lock the LVB is associated with
168 * @lvb: the lock value block
169 *
170 */
171
172static void nolock_sync_lvb(lm_lock_t *lock, char *lvb)
173{
174}
175
176static int nolock_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name,
177 struct file *file, struct file_lock *fl)
178{
179 struct file_lock tmp;
180 int ret;
181
182 ret = posix_test_lock(file, fl, &tmp);
183 fl->fl_type = F_UNLCK;
184 if (ret)
185 memcpy(fl, &tmp, sizeof(struct file_lock));
186
187 return 0;
188}
189
190static int nolock_plock(lm_lockspace_t *lockspace, struct lm_lockname *name,
191 struct file *file, int cmd, struct file_lock *fl)
192{
193 int error;
194 error = posix_lock_file_wait(file, fl);
195 return error;
196}
197
198static int nolock_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name,
199 struct file *file, struct file_lock *fl)
200{
201 int error;
202 error = posix_lock_file_wait(file, fl);
203 return error;
204}
205
206static void nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid,
207 unsigned int message)
208{
209}
210
211struct lm_lockops nolock_ops = {
212 .lm_proto_name = "lock_nolock",
213 .lm_mount = nolock_mount,
214 .lm_others_may_mount = nolock_others_may_mount,
215 .lm_unmount = nolock_unmount,
216 .lm_withdraw = nolock_withdraw,
217 .lm_get_lock = nolock_get_lock,
218 .lm_put_lock = nolock_put_lock,
219 .lm_lock = nolock_lock,
220 .lm_unlock = nolock_unlock,
221 .lm_cancel = nolock_cancel,
222 .lm_hold_lvb = nolock_hold_lvb,
223 .lm_unhold_lvb = nolock_unhold_lvb,
224 .lm_sync_lvb = nolock_sync_lvb,
225 .lm_plock_get = nolock_plock_get,
226 .lm_plock = nolock_plock,
227 .lm_punlock = nolock_punlock,
228 .lm_recovery_done = nolock_recovery_done,
229 .lm_owner = THIS_MODULE,
230};
231
232int __init init_nolock(void)
233{
234 int error;
235
236 error = gfs_register_lockproto(&nolock_ops);
237 if (error) {
238 printk(KERN_WARNING
239 "lock_nolock: can't register protocol: %d\n", error);
240 return error;
241 }
242
243 printk(KERN_INFO
244 "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
245 return 0;
246}
247
248void __exit exit_nolock(void)
249{
250 gfs_unregister_lockproto(&nolock_ops);
251}
252
253module_init(init_nolock);
254module_exit(exit_nolock);
255
256MODULE_DESCRIPTION("GFS Nolock Locking Module");
257MODULE_AUTHOR("Red Hat, Inc.");
258MODULE_LICENSE("GPL");
259
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..9e32e0faaf20
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,600 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "log.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "util.h"
28#include "dir.h"
29
30#define PULL 1
31
32/**
33 * gfs2_struct2blk - compute the number of log blocks needed
34 * @sdp: the filesystem
35 * @nstruct: the number of structures
36 * @ssize: the size of the structures
37 *
38 * Compute the number of log descriptor blocks needed to hold a certain number
39 * of structures of a certain size.
40 *
41 * Returns: the number of blocks needed (minimum is always 1)
42 */
43
44unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
45 unsigned int ssize)
46{
47 unsigned int blks;
48 unsigned int first, second;
49
50 blks = 1;
51 first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) /
52 ssize;
53
54 if (nstruct > first) {
55 second = (sdp->sd_sb.sb_bsize -
56 sizeof(struct gfs2_meta_header)) / ssize;
57 blks += DIV_ROUND_UP(nstruct - first, second);
58 }
59
60 return blks;
61}
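/*
 * Worked example (not part of the patch): with the sizes in this patch,
 * a 4096-byte block holds 503 8-byte (uint64_t) entries after the
 * gfs2_log_descriptor header (the "limit = 503" noted in
 * buf_lo_before_commit in lops.c), and a continuation block headed only
 * by the smaller gfs2_meta_header holds slightly more. So:
 *
 *	gfs2_struct2blk(sdp, 503, sizeof(uint64_t)) == 1
 *	gfs2_struct2blk(sdp, 504, sizeof(uint64_t)) == 2
 */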
62
63void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
64{
65 struct list_head *head = &sdp->sd_ail1_list;
66 uint64_t sync_gen;
67 struct list_head *first, *tmp;
68 struct gfs2_ail *first_ai, *ai;
69
70 gfs2_log_lock(sdp);
71 if (list_empty(head)) {
72 gfs2_log_unlock(sdp);
73 return;
74 }
75 sync_gen = sdp->sd_ail_sync_gen++;
76
77 first = head->prev;
78 first_ai = list_entry(first, struct gfs2_ail, ai_list);
79 first_ai->ai_sync_gen = sync_gen;
80 gfs2_ail1_start_one(sdp, first_ai);
81
82 if (flags & DIO_ALL)
83 first = NULL;
84
85 for (;;) {
86 if (first && (head->prev != first ||
87 gfs2_ail1_empty_one(sdp, first_ai, 0)))
88 break;
89
90 for (tmp = head->prev; tmp != head; tmp = tmp->prev) {
91 ai = list_entry(tmp, struct gfs2_ail, ai_list);
92 if (ai->ai_sync_gen >= sync_gen)
93 continue;
94 ai->ai_sync_gen = sync_gen;
95 gfs2_ail1_start_one(sdp, ai);
96 break;
97 }
98
99 if (tmp == head)
100 break;
101 }
102
103 gfs2_log_unlock(sdp);
104}
105
106int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
107{
108 struct gfs2_ail *ai, *s;
109 int ret;
110
111 gfs2_log_lock(sdp);
112
113 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
114 if (gfs2_ail1_empty_one(sdp, ai, flags))
115 list_move(&ai->ai_list, &sdp->sd_ail2_list);
116 else if (!(flags & DIO_ALL))
117 break;
118 }
119
120 ret = list_empty(&sdp->sd_ail1_list);
121
122 gfs2_log_unlock(sdp);
123
124 return ret;
125}
126
127static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
128{
129 struct gfs2_ail *ai, *safe;
130 unsigned int old_tail = sdp->sd_log_tail;
131 int wrap = (new_tail < old_tail);
132 int a, b, rm;
133
134 gfs2_log_lock(sdp);
135
136 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
137 a = (old_tail <= ai->ai_first);
138 b = (ai->ai_first < new_tail);
139 rm = (wrap) ? (a || b) : (a && b);
140 if (!rm)
141 continue;
142
143 gfs2_ail2_empty_one(sdp, ai);
144 list_del(&ai->ai_list);
145 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
146 gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
147 kfree(ai);
148 }
149
150 gfs2_log_unlock(sdp);
151}
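/*
 * Worked example (not part of the patch): the a/b/wrap test above asks
 * whether ai_first lies in the half-open interval [old_tail, new_tail)
 * on the circular journal. Without wrap (old_tail=100, new_tail=300),
 * ai_first=200 satisfies both a and b, so the entry is reaped. With wrap
 * (old_tail=900, new_tail=50 in a 1000-block journal), ai_first=950
 * satisfies only a and ai_first=20 satisfies only b, and either alone
 * (a || b) reaps the entry.
 */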
152
153/**
154 * gfs2_log_reserve - Make a log reservation
155 * @sdp: The GFS2 superblock
156 * @blks: The number of blocks to reserve
157 *
158 * Returns: errno
159 */
160
161int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
162{
163 unsigned int try = 0;
164
165 if (gfs2_assert_warn(sdp, blks) ||
166 gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
167 return -EINVAL;
168
169 mutex_lock(&sdp->sd_log_reserve_mutex);
170 gfs2_log_lock(sdp);
171 while(sdp->sd_log_blks_free <= blks) {
172 gfs2_log_unlock(sdp);
173 gfs2_ail1_empty(sdp, 0);
174 gfs2_log_flush(sdp, NULL);
175
176 if (try++)
177 gfs2_ail1_start(sdp, 0);
178 gfs2_log_lock(sdp);
179 }
180 sdp->sd_log_blks_free -= blks;
181 /* printk(KERN_INFO "reserved %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
182 gfs2_log_unlock(sdp);
183 mutex_unlock(&sdp->sd_log_reserve_mutex);
184
185 down_read(&sdp->sd_log_flush_lock);
186
187 return 0;
188}
189
190/**
191 * gfs2_log_release - Release a given number of log blocks
192 * @sdp: The GFS2 superblock
193 * @blks: The number of blocks
194 *
195 */
196
197void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
198{
199
200 gfs2_log_lock(sdp);
201 sdp->sd_log_blks_free += blks;
202 /* printk(KERN_INFO "released %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */
203 gfs2_assert_withdraw(sdp,
204 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
205 gfs2_log_unlock(sdp);
206 up_read(&sdp->sd_log_flush_lock);
207}
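/*
 * Illustrative sketch (not part of the patch): gfs2_log_reserve() and
 * gfs2_log_release() pair up. A successful reserve also takes
 * sd_log_flush_lock for read, which gfs2_log_commit() or a matching
 * release must drop, so a caller backing out of a transaction looks
 * roughly like this:
 */
static int reserve_then_back_out(struct gfs2_sbd *sdp)
{
	int error = gfs2_log_reserve(sdp, 2);	/* header + one buffer */
	if (error)
		return error;
	/* ... decide the transaction cannot proceed ... */
	gfs2_log_release(sdp, 2);	/* refund blocks, drop flush lock */
	return 0;
}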
208
209static uint64_t log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
210{
211 int new = 0;
212 uint64_t dbn;
213 int error;
214
215 error = gfs2_block_map(sdp->sd_jdesc->jd_inode->u.generic_ip,
216 lbn, &new, &dbn, NULL);
217 gfs2_assert_withdraw(sdp, !error && dbn);
218
219 return dbn;
220}
221
222/**
223 * log_distance - Compute distance between two journal blocks
224 * @sdp: The GFS2 superblock
225 * @newer: The most recent journal block of the pair
226 * @older: The older journal block of the pair
227 *
228 * Compute the distance (in the journal direction) between two
229 * blocks in the journal
230 *
231 * Returns: the distance in blocks
232 */
233
234static inline unsigned int log_distance(struct gfs2_sbd *sdp,
235 unsigned int newer,
236 unsigned int older)
237{
238 int dist;
239
240 dist = newer - older;
241 if (dist < 0)
242 dist += sdp->sd_jdesc->jd_blocks;
243
244 return dist;
245}
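/*
 * Worked example (not part of the patch): in a 1000-block journal,
 * log_distance(sdp, 10, 990) = 10 - 990 + 1000 = 20, i.e. the newer
 * block has wrapped past the end of the journal and sits 20 blocks
 * ahead of the older one.
 */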
246
247static unsigned int current_tail(struct gfs2_sbd *sdp)
248{
249 struct gfs2_ail *ai;
250 unsigned int tail;
251
252 gfs2_log_lock(sdp);
253
254 if (list_empty(&sdp->sd_ail1_list))
255 tail = sdp->sd_log_head;
256 else {
257 ai = list_entry(sdp->sd_ail1_list.prev,
258 struct gfs2_ail, ai_list);
259 tail = ai->ai_first;
260 }
261
262 gfs2_log_unlock(sdp);
263
264 return tail;
265}
266
267static inline void log_incr_head(struct gfs2_sbd *sdp)
268{
269 if (sdp->sd_log_flush_head == sdp->sd_log_tail)
270 gfs2_assert_withdraw(sdp,
271 sdp->sd_log_flush_head == sdp->sd_log_head);
272
273 if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
274 sdp->sd_log_flush_head = 0;
275 sdp->sd_log_flush_wrapped = 1;
276 }
277}
278
279/**
280 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
281 * @sdp: The GFS2 superblock
282 *
283 * Returns: the buffer_head
284 */
285
286struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
287{
288 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
289 struct gfs2_log_buf *lb;
290 struct buffer_head *bh;
291
292 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
293 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
294
295 bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
296 lock_buffer(bh);
297 memset(bh->b_data, 0, bh->b_size);
298 set_buffer_uptodate(bh);
299 clear_buffer_dirty(bh);
300 unlock_buffer(bh);
301
302 log_incr_head(sdp);
303
304 return bh;
305}
306
307/**
308 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
309 * @sdp: the filesystem
310 * @data: the data the buffer_head should point to
311 *
312 * Returns: the log buffer descriptor
313 */
314
315struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
316 struct buffer_head *real)
317{
318 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
319 struct gfs2_log_buf *lb;
320 struct buffer_head *bh;
321
322 lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
323 list_add(&lb->lb_list, &sdp->sd_log_flush_list);
324 lb->lb_real = real;
325
326 bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
327 atomic_set(&bh->b_count, 1);
328 bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
329 set_bh_page(bh, real->b_page, bh_offset(real));
330 bh->b_blocknr = blkno;
331 bh->b_size = sdp->sd_sb.sb_bsize;
332 bh->b_bdev = sdp->sd_vfs->s_bdev;
333
334 log_incr_head(sdp);
335
336 return bh;
337}
338
339static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
340{
341 unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
342
343 ail2_empty(sdp, new_tail);
344
345 gfs2_log_lock(sdp);
346 sdp->sd_log_blks_free += dist - ((pull) ? 1 : 0);
347 /* printk(KERN_INFO "pull tail refunding %u blocks (%u left) pull=%d\n", dist - ((pull) ? 1 : 0), sdp->sd_log_blks_free, pull); */
348 gfs2_assert_withdraw(sdp,
349 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
350 gfs2_log_unlock(sdp);
351
352 sdp->sd_log_tail = new_tail;
353}
354
355/**
356 * log_write_header - Write a journal header into the log at the flush head
357 * @sdp: The GFS2 superblock
358 * @flags: log header flags (e.g. GFS2_LOG_HEAD_UNMOUNT)
359 * @pull: non-zero if this header also pulls the log tail forward
360 */
361
362static void log_write_header(struct gfs2_sbd *sdp, uint32_t flags, int pull)
363{
364 uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head);
365 struct buffer_head *bh;
366 struct gfs2_log_header *lh;
367 unsigned int tail;
368 uint32_t hash;
369
370 /* printk(KERN_INFO "log write header start (flags=%08x, pull=%d)\n", flags, pull); */
371
372 bh = sb_getblk(sdp->sd_vfs, blkno);
373 lock_buffer(bh);
374 memset(bh->b_data, 0, bh->b_size);
375 set_buffer_uptodate(bh);
376 clear_buffer_dirty(bh);
377 unlock_buffer(bh);
378
379 gfs2_ail1_empty(sdp, 0);
380 tail = current_tail(sdp);
381
382 lh = (struct gfs2_log_header *)bh->b_data;
383 memset(lh, 0, sizeof(struct gfs2_log_header));
384 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
385 lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
386 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
387	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
388	lh->lh_flags = cpu_to_be32(flags);
389	lh->lh_tail = cpu_to_be32(tail);
390	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
391 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
392 lh->lh_hash = cpu_to_be32(hash);
393
394 set_buffer_dirty(bh);
395 if (sync_dirty_buffer(bh))
396 gfs2_io_error_bh(sdp, bh);
397 brelse(bh);
398
399 if (sdp->sd_log_tail != tail)
400 log_pull_tail(sdp, tail, pull);
401 else
402 gfs2_assert_withdraw(sdp, !pull);
403
404 sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
405 log_incr_head(sdp);
406
407 /* printk(KERN_INFO "log write header out\n"); */
408}
409
410static void log_flush_commit(struct gfs2_sbd *sdp)
411{
412 struct list_head *head = &sdp->sd_log_flush_list;
413 struct gfs2_log_buf *lb;
414 struct buffer_head *bh;
415#if 0
416 unsigned int d;
417
418 d = log_distance(sdp, sdp->sd_log_flush_head, sdp->sd_log_head);
419
420 gfs2_assert_withdraw(sdp, d + 1 == sdp->sd_log_blks_reserved);
421#endif
422
423 while (!list_empty(head)) {
424 lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
425 list_del(&lb->lb_list);
426 bh = lb->lb_bh;
427
428 wait_on_buffer(bh);
429 if (!buffer_uptodate(bh))
430 gfs2_io_error_bh(sdp, bh);
431 if (lb->lb_real) {
432 while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
433 schedule();
434 free_buffer_head(bh);
435 } else
436 brelse(bh);
437 kfree(lb);
438 }
439
440 log_write_header(sdp, 0, 0);
441}
442
443/**
444 * gfs2_log_flush - flush incore transaction(s)
445 * @sdp: the filesystem
446 * @gl: The glock structure to flush. If NULL, flush the whole incore log
447 *
448 */
449
450void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
451{
452 struct gfs2_ail *ai;
453
454 down_write(&sdp->sd_log_flush_lock);
455
456 if (gl) {
457 gfs2_log_lock(sdp);
458 if (list_empty(&gl->gl_le.le_list)) {
459 gfs2_log_unlock(sdp);
460 up_write(&sdp->sd_log_flush_lock);
461 return;
462 }
463 gfs2_log_unlock(sdp);
464 }
465
466 ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
467 INIT_LIST_HEAD(&ai->ai_ail1_list);
468 INIT_LIST_HEAD(&ai->ai_ail2_list);
469
470 gfs2_assert_withdraw(sdp,
471 sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
472 gfs2_assert_withdraw(sdp,
473 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
474
475 sdp->sd_log_flush_head = sdp->sd_log_head;
476 sdp->sd_log_flush_wrapped = 0;
477 ai->ai_first = sdp->sd_log_flush_head;
478
479 lops_before_commit(sdp);
480 if (!list_empty(&sdp->sd_log_flush_list))
481 log_flush_commit(sdp);
482 else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
483 log_write_header(sdp, 0, PULL);
484 lops_after_commit(sdp, ai);
485 sdp->sd_log_head = sdp->sd_log_flush_head;
486
487 /* printk(KERN_INFO "sd_log_num_hdrs %u\n", sdp->sd_log_num_hdrs); */
488 sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
489
490 sdp->sd_log_blks_reserved =
491 sdp->sd_log_commited_buf =
492 sdp->sd_log_num_hdrs =
493 sdp->sd_log_commited_revoke = 0;
494
495 gfs2_log_lock(sdp);
496 if (!list_empty(&ai->ai_ail1_list)) {
497 list_add(&ai->ai_list, &sdp->sd_ail1_list);
498 ai = NULL;
499 }
500 gfs2_log_unlock(sdp);
501
502 sdp->sd_vfs->s_dirt = 0;
503 up_write(&sdp->sd_log_flush_lock);
504
505 kfree(ai);
506}
507
508static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
509{
510 unsigned int reserved = 1;
511 unsigned int old;
512
513 gfs2_log_lock(sdp);
514
515 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
516 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
517 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
518 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
519
520 if (sdp->sd_log_commited_buf)
521 reserved += sdp->sd_log_commited_buf;
522 if (sdp->sd_log_commited_revoke)
523 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
524 sizeof(uint64_t));
525
526 old = sdp->sd_log_blks_free;
527 sdp->sd_log_blks_free += tr->tr_reserved -
528 (reserved - sdp->sd_log_blks_reserved);
529
530 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
531 gfs2_assert_withdraw(sdp,
532 sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
533 sdp->sd_log_num_hdrs);
534
535 sdp->sd_log_blks_reserved = reserved;
536
537 gfs2_log_unlock(sdp);
538}
539
540/**
541 * gfs2_log_commit - Commit a transaction to the log
542 * @sdp: the filesystem
543 * @tr: the transaction
544 *
545 * Returns: errno
546 */
547
548void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
549{
550 log_refund(sdp, tr);
551 lops_incore_commit(sdp, tr);
552
553 sdp->sd_vfs->s_dirt = 1;
554 up_read(&sdp->sd_log_flush_lock);
555
556 gfs2_log_lock(sdp);
557 if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
558 gfs2_log_unlock(sdp);
559 gfs2_log_flush(sdp, NULL);
560 } else
561 gfs2_log_unlock(sdp);
562}
563
564/**
565 * gfs2_log_shutdown - write a shutdown header into a journal
566 * @sdp: the filesystem
567 *
568 */
569
570void gfs2_log_shutdown(struct gfs2_sbd *sdp)
571{
572 down_write(&sdp->sd_log_flush_lock);
573
574 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
575 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
576 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
577 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
578 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
579 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
580 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
581 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
582 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
583
584 sdp->sd_log_flush_head = sdp->sd_log_head;
585 sdp->sd_log_flush_wrapped = 0;
586
587 log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
588
589 /* printk(KERN_INFO "sd_log_blks_free %u, sd_jdesc->jd_blocks %u\n", sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); */
590 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free ==
591 sdp->sd_jdesc->jd_blocks);
592 gfs2_assert_withdraw(sdp, sdp->sd_log_head == sdp->sd_log_tail);
593 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail2_list));
594
595 sdp->sd_log_head = sdp->sd_log_flush_head;
596 sdp->sd_log_tail = sdp->sd_log_head;
597
598 up_write(&sdp->sd_log_flush_lock);
599}
600
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..84a3e902e848
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,61 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOG_DOT_H__
11#define __LOG_DOT_H__
12
13/**
14 * gfs2_log_lock - acquire the right to mess with the log manager
15 * @sdp: the filesystem
16 *
17 */
18
19static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
20{
21 spin_lock(&sdp->sd_log_lock);
22}
23
24/**
25 * gfs2_log_unlock - release the right to mess with the log manager
26 * @sdp: the filesystem
27 *
28 */
29
30static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
31{
32 spin_unlock(&sdp->sd_log_lock);
33}
34
35static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
36 unsigned int value)
37{
38 if (++value == sdp->sd_jdesc->jd_blocks) {
39 value = 0;
40 }
41 sdp->sd_log_head = sdp->sd_log_tail = value;
42}
43
44unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
45 unsigned int ssize);
46
47void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
48int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
49
50int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
51void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
52
53struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
54struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
55 struct buffer_head *real);
56void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
57void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
58
59void gfs2_log_shutdown(struct gfs2_sbd *sdp);
60
61#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..4d90eb311497
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,805 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "glock.h"
22#include "log.h"
23#include "lops.h"
24#include "meta_io.h"
25#include "recovery.h"
26#include "rgrp.h"
27#include "trans.h"
28#include "util.h"
29
30static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
31{
32 struct gfs2_glock *gl;
33 struct gfs2_trans *tr = current->journal_info;
34
35 tr->tr_touched = 1;
36
37 if (!list_empty(&le->le_list))
38 return;
39
40 gl = container_of(le, struct gfs2_glock, gl_le);
41 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
42 return;
43 gfs2_glock_hold(gl);
44 set_bit(GLF_DIRTY, &gl->gl_flags);
45
46 gfs2_log_lock(sdp);
47 sdp->sd_log_num_gl++;
48 list_add(&le->le_list, &sdp->sd_log_le_gl);
49 gfs2_log_unlock(sdp);
50}
51
52static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
53{
54 struct list_head *head = &sdp->sd_log_le_gl;
55 struct gfs2_glock *gl;
56
57 while (!list_empty(head)) {
58 gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
59 list_del_init(&gl->gl_le.le_list);
60 sdp->sd_log_num_gl--;
61
62 gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
63 gfs2_glock_put(gl);
64 }
65 gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
66}
67
68static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
69{
70 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
71 struct gfs2_trans *tr;
72
73 if (!list_empty(&bd->bd_list_tr))
74 return;
75
76 tr = current->journal_info;
77 tr->tr_touched = 1;
78 tr->tr_num_buf++;
79 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
80
81 if (!list_empty(&le->le_list))
82 return;
83
84 gfs2_trans_add_gl(bd->bd_gl);
85
86 gfs2_meta_check(sdp, bd->bd_bh);
87 gfs2_pin(sdp, bd->bd_bh);
88
89 gfs2_log_lock(sdp);
90 sdp->sd_log_num_buf++;
91 list_add(&le->le_list, &sdp->sd_log_le_buf);
92 gfs2_log_unlock(sdp);
93
94 tr->tr_num_buf_new++;
95}
96
97static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
98{
99 struct list_head *head = &tr->tr_list_buf;
100 struct gfs2_bufdata *bd;
101
102 while (!list_empty(head)) {
103 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
104 list_del_init(&bd->bd_list_tr);
105 tr->tr_num_buf--;
106 }
107 gfs2_assert_warn(sdp, !tr->tr_num_buf);
108}
109
110static void buf_lo_before_commit(struct gfs2_sbd *sdp)
111{
112 struct buffer_head *bh;
113 struct gfs2_log_descriptor *ld;
114 struct gfs2_bufdata *bd1 = NULL, *bd2;
115 unsigned int total = sdp->sd_log_num_buf;
116 unsigned int offset = sizeof(struct gfs2_log_descriptor);
117 unsigned int limit;
118 unsigned int num;
119 unsigned n;
120 __be64 *ptr;
121
122 offset += (sizeof(__be64) - 1);
123 offset &= ~(sizeof(__be64) - 1);
124 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
125 /* for 4k blocks, limit = 503 */
126
127 bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
128 while(total) {
129 num = total;
130 if (total > limit)
131 num = limit;
132 bh = gfs2_log_get_buf(sdp);
133 sdp->sd_log_num_hdrs++;
134 ld = (struct gfs2_log_descriptor *)bh->b_data;
135 ptr = (__be64 *)(bh->b_data + offset);
136 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
137 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
138 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
139 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
140 ld->ld_length = cpu_to_be32(num + 1);
141 ld->ld_data1 = cpu_to_be32(num);
142 ld->ld_data2 = cpu_to_be32(0);
143 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
144
145 n = 0;
146 list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
147 bd_le.le_list) {
148 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
149 if (++n >= num)
150 break;
151 }
152
153 set_buffer_dirty(bh);
154 ll_rw_block(WRITE, 1, &bh);
155
156 n = 0;
157 list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
158 bd_le.le_list) {
159 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
160 set_buffer_dirty(bh);
161 ll_rw_block(WRITE, 1, &bh);
162 if (++n >= num)
163 break;
164 }
165
166 total -= num;
167 }
168}
169
170static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
171{
172 struct list_head *head = &sdp->sd_log_le_buf;
173 struct gfs2_bufdata *bd;
174
175 while (!list_empty(head)) {
176 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
177 list_del_init(&bd->bd_le.le_list);
178 sdp->sd_log_num_buf--;
179
180 gfs2_unpin(sdp, bd->bd_bh, ai);
181 }
182 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
183}
184
185static void buf_lo_before_scan(struct gfs2_jdesc *jd,
186 struct gfs2_log_header *head, int pass)
187{
188 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
189 struct gfs2_sbd *sdp = ip->i_sbd;
190
191 if (pass != 0)
192 return;
193
194 sdp->sd_found_blocks = 0;
195 sdp->sd_replayed_blocks = 0;
196}
197
198static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
199 struct gfs2_log_descriptor *ld, __be64 *ptr,
200 int pass)
201{
202 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
203 struct gfs2_sbd *sdp = ip->i_sbd;
204 struct gfs2_glock *gl = ip->i_gl;
205 unsigned int blks = be32_to_cpu(ld->ld_data1);
206 struct buffer_head *bh_log, *bh_ip;
207 uint64_t blkno;
208 int error = 0;
209
210 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
211 return 0;
212
213 gfs2_replay_incr_blk(sdp, &start);
214
215 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
216 blkno = be64_to_cpu(*ptr++);
217
218 sdp->sd_found_blocks++;
219
220 if (gfs2_revoke_check(sdp, blkno, start))
221 continue;
222
223 error = gfs2_replay_read_block(jd, start, &bh_log);
224 if (error)
225 return error;
226
227 bh_ip = gfs2_meta_new(gl, blkno);
228 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
229
230 if (gfs2_meta_check(sdp, bh_ip))
231 error = -EIO;
232 else
233 mark_buffer_dirty(bh_ip);
234
235 brelse(bh_log);
236 brelse(bh_ip);
237
238 if (error)
239 break;
240
241 sdp->sd_replayed_blocks++;
242 }
243
244 return error;
245}
246
247static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
248{
249 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
250 struct gfs2_sbd *sdp = ip->i_sbd;
251
252 if (error) {
253 gfs2_meta_sync(ip->i_gl,
254 DIO_START | DIO_WAIT);
255 return;
256 }
257 if (pass != 1)
258 return;
259
260 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
261
262 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
263 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
264}
265
266static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
267{
268 struct gfs2_trans *tr;
269
270 tr = current->journal_info;
271 tr->tr_touched = 1;
272 tr->tr_num_revoke++;
273
274 gfs2_log_lock(sdp);
275 sdp->sd_log_num_revoke++;
276 list_add(&le->le_list, &sdp->sd_log_le_revoke);
277 gfs2_log_unlock(sdp);
278}
279
280static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
281{
282 struct gfs2_log_descriptor *ld;
283 struct gfs2_meta_header *mh;
284 struct buffer_head *bh;
285 unsigned int offset;
286 struct list_head *head = &sdp->sd_log_le_revoke;
287 struct gfs2_revoke *rv;
288
289 if (!sdp->sd_log_num_revoke)
290 return;
291
292 bh = gfs2_log_get_buf(sdp);
293 ld = (struct gfs2_log_descriptor *)bh->b_data;
294 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
295 ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
296 ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
297 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
298 ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
299 sizeof(uint64_t)));
300 ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
301 ld->ld_data2 = cpu_to_be32(0);
302 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
303 offset = sizeof(struct gfs2_log_descriptor);
304
305 while (!list_empty(head)) {
306 rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
307 list_del_init(&rv->rv_le.le_list);
308 sdp->sd_log_num_revoke--;
309
310 if (offset + sizeof(uint64_t) > sdp->sd_sb.sb_bsize) {
311 set_buffer_dirty(bh);
312 ll_rw_block(WRITE, 1, &bh);
313
314 bh = gfs2_log_get_buf(sdp);
315 mh = (struct gfs2_meta_header *)bh->b_data;
316 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
317 mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
318 mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
319 offset = sizeof(struct gfs2_meta_header);
320 }
321
322 *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
323 kfree(rv);
324
325 offset += sizeof(uint64_t);
326 }
327 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
328
329 set_buffer_dirty(bh);
330 ll_rw_block(WRITE, 1, &bh);
331}
332
333static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
334 struct gfs2_log_header *head, int pass)
335{
336 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
337 struct gfs2_sbd *sdp = ip->i_sbd;
338
339 if (pass != 0)
340 return;
341
342 sdp->sd_found_revokes = 0;
343 sdp->sd_replay_tail = head->lh_tail;
344}
345
346static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
347 struct gfs2_log_descriptor *ld, __be64 *ptr,
348 int pass)
349{
350 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
351 struct gfs2_sbd *sdp = ip->i_sbd;
352 unsigned int blks = be32_to_cpu(ld->ld_length);
353 unsigned int revokes = be32_to_cpu(ld->ld_data1);
354 struct buffer_head *bh;
355 unsigned int offset;
356 uint64_t blkno;
357 int first = 1;
358 int error;
359
360 if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
361 return 0;
362
363 offset = sizeof(struct gfs2_log_descriptor);
364
365 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
366 error = gfs2_replay_read_block(jd, start, &bh);
367 if (error)
368 return error;
369
370 if (!first)
371 gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
372
373 while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) {
374 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
375
376 error = gfs2_revoke_add(sdp, blkno, start);
377 if (error < 0)
378 return error;
379 else if (error)
380 sdp->sd_found_revokes++;
381
382 if (!--revokes)
383 break;
384 offset += sizeof(uint64_t);
385 }
386
387 brelse(bh);
388 offset = sizeof(struct gfs2_meta_header);
389 first = 0;
390 }
391
392 return 0;
393}
394
395static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
396{
397 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
398 struct gfs2_sbd *sdp = ip->i_sbd;
399
400 if (error) {
401 gfs2_revoke_clean(sdp);
402 return;
403 }
404 if (pass != 1)
405 return;
406
407 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
408 jd->jd_jid, sdp->sd_found_revokes);
409
410 gfs2_revoke_clean(sdp);
411}
412
413static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
414{
415 struct gfs2_rgrpd *rgd;
416 struct gfs2_trans *tr = current->journal_info;
417
418 tr->tr_touched = 1;
419
420 if (!list_empty(&le->le_list))
421 return;
422
423 rgd = container_of(le, struct gfs2_rgrpd, rd_le);
424 gfs2_rgrp_bh_hold(rgd);
425
426 gfs2_log_lock(sdp);
427 sdp->sd_log_num_rg++;
428 list_add(&le->le_list, &sdp->sd_log_le_rg);
429 gfs2_log_unlock(sdp);
430}
431
432static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
433{
434 struct list_head *head = &sdp->sd_log_le_rg;
435 struct gfs2_rgrpd *rgd;
436
437 while (!list_empty(head)) {
438 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
439 list_del_init(&rgd->rd_le.le_list);
440 sdp->sd_log_num_rg--;
441
442 gfs2_rgrp_repolish_clones(rgd);
443 gfs2_rgrp_bh_put(rgd);
444 }
445 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
446}
447
448/**
449 * databuf_lo_add - Add a databuf to the transaction.
450 *
451 * This is used in two distinct cases:
452 * i) In ordered write mode
453 * We put the data buffer on a list so that we can ensure that it is
454 * synced to disk at the right time
455 * ii) In journaled data mode
456 * We need to journal the data block in the same way as metadata in
457 * the functions above. The difference is that here we have a tag
458 * which is two __be64's: the block number (as for metadata)
459 * and a flag which says whether the data block needs escaping or
460 * not. This means we need a new log descriptor for every 251 or so data
461 * blocks, which isn't an enormous overhead but twice as much as
462 * for normal metadata blocks.
463 */
464static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
465{
466 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
467 struct gfs2_trans *tr = current->journal_info;
468 struct address_space *mapping = bd->bd_bh->b_page->mapping;
469 struct gfs2_inode *ip = mapping->host->u.generic_ip;
470
471 tr->tr_touched = 1;
472	if (list_empty(&bd->bd_list_tr) &&
473 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
474 tr->tr_num_buf++;
475 gfs2_trans_add_gl(bd->bd_gl);
476 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
477 gfs2_pin(sdp, bd->bd_bh);
478 tr->tr_num_buf_new++;
479 }
480 gfs2_log_lock(sdp);
481	if (list_empty(&le->le_list)) {
482 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
483 sdp->sd_log_num_jdata++;
484 sdp->sd_log_num_databuf++;
485 list_add(&le->le_list, &sdp->sd_log_le_databuf);
486 }
487 gfs2_log_unlock(sdp);
488}
489
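Where the "251 or so" in the comment above comes from: each journaled data block costs a tag of two __be64's, and the descriptor offset is rounded up to a tag boundary exactly as databuf_lo_before_commit() does below. A sketch assuming 4096-byte blocks and a 72-byte log descriptor:

#include <stdio.h>

int main(void)
{
	unsigned int bsize = 4096;
	unsigned int offset = 72;	/* assumed descriptor size */

	offset += (2 * 8 - 1);		/* round up to a 16-byte tag boundary, */
	offset &= ~(2 * 8 - 1);		/* mirroring the commit path below */

	printf("%u\n", (bsize - offset) / (2 * 8));	/* prints 251 */
	return 0;
}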
490static int gfs2_check_magic(struct buffer_head *bh)
491{
492 struct page *page = bh->b_page;
493 void *kaddr;
494 __be32 *ptr;
495 int rv = 0;
496
497 kaddr = kmap_atomic(page, KM_USER0);
498 ptr = kaddr + bh_offset(bh);
499 if (*ptr == cpu_to_be32(GFS2_MAGIC))
500 rv = 1;
501	kunmap_atomic(kaddr, KM_USER0);
502
503 return rv;
504}
505
506/**
507 * databuf_lo_before_commit - Scan the data buffers, writing as we go
508 *
509 * Here we scan through the lists of buffers and make the assumption
510 * that any buffer that's been pinned is being journaled, and that
511 * any unpinned buffer is an ordered write data buffer and therefore
512 * will be written back rather than journaled.
513 */
514static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
515{
516 LIST_HEAD(started);
517 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
518 struct buffer_head *bh = NULL;
519 unsigned int offset = sizeof(struct gfs2_log_descriptor);
520 struct gfs2_log_descriptor *ld;
521 unsigned int limit;
522 unsigned int total_dbuf = sdp->sd_log_num_databuf;
523 unsigned int total_jdata = sdp->sd_log_num_jdata;
524 unsigned int num, n;
525 __be64 *ptr = NULL;
526
527 offset += (2*sizeof(__be64) - 1);
528 offset &= ~(2*sizeof(__be64) - 1);
529 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
530
531 /*
532 * Start writing ordered buffers, write journaled buffers
533 * into the log along with a header
534 */
535 gfs2_log_lock(sdp);
536 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
537 bd_le.le_list);
538 while(total_dbuf) {
539 num = total_jdata;
540 if (num > limit)
541 num = limit;
542 n = 0;
543 list_for_each_entry_safe_continue(bd1, bdt,
544 &sdp->sd_log_le_databuf,
545 bd_le.le_list) {
546 /* An ordered write buffer */
547 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
548 list_move(&bd1->bd_le.le_list, &started);
549 if (bd1 == bd2) {
550 bd2 = NULL;
551 bd2 = list_prepare_entry(bd2,
552 &sdp->sd_log_le_databuf,
553 bd_le.le_list);
554 }
555 total_dbuf--;
556 if (bd1->bd_bh) {
557 get_bh(bd1->bd_bh);
558 if (buffer_dirty(bd1->bd_bh)) {
559 gfs2_log_unlock(sdp);
560 wait_on_buffer(bd1->bd_bh);
561 ll_rw_block(WRITE, 1,
562 &bd1->bd_bh);
563 gfs2_log_lock(sdp);
564 }
565 brelse(bd1->bd_bh);
566 continue;
567 }
568 continue;
569 } else if (bd1->bd_bh) { /* A journaled buffer */
570 int magic;
571 gfs2_log_unlock(sdp);
572 if (!bh) {
573 bh = gfs2_log_get_buf(sdp);
574 sdp->sd_log_num_hdrs++;
575 ld = (struct gfs2_log_descriptor *)
576 bh->b_data;
577 ptr = (__be64 *)(bh->b_data + offset);
578 ld->ld_header.mh_magic =
579 cpu_to_be32(GFS2_MAGIC);
580 ld->ld_header.mh_type =
581 cpu_to_be32(GFS2_METATYPE_LD);
582 ld->ld_header.mh_format =
583 cpu_to_be32(GFS2_FORMAT_LD);
584 ld->ld_type =
585 cpu_to_be32(GFS2_LOG_DESC_JDATA);
586 ld->ld_length = cpu_to_be32(num + 1);
587 ld->ld_data1 = cpu_to_be32(num);
588 ld->ld_data2 = cpu_to_be32(0);
589 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
590 }
591 magic = gfs2_check_magic(bd1->bd_bh);
592 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
593 *ptr++ = cpu_to_be64((__u64)magic);
594 clear_buffer_escaped(bd1->bd_bh);
595 if (unlikely(magic != 0))
596 set_buffer_escaped(bd1->bd_bh);
597 gfs2_log_lock(sdp);
598 if (n++ > num)
599 break;
600 }
601 }
602 gfs2_log_unlock(sdp);
603 if (bh) {
604 set_buffer_dirty(bh);
605 ll_rw_block(WRITE, 1, &bh);
606 bh = NULL;
607 }
608 n = 0;
609 gfs2_log_lock(sdp);
610 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
611 bd_le.le_list) {
612 if (!bd2->bd_bh)
613 continue;
614 /* copy buffer if it needs escaping */
615 gfs2_log_unlock(sdp);
616 if (unlikely(buffer_escaped(bd2->bd_bh))) {
617 void *kaddr;
618 struct page *page = bd2->bd_bh->b_page;
619 bh = gfs2_log_get_buf(sdp);
620 kaddr = kmap_atomic(page, KM_USER0);
621 memcpy(bh->b_data,
622 kaddr + bh_offset(bd2->bd_bh),
623 sdp->sd_sb.sb_bsize);
624				kunmap_atomic(kaddr, KM_USER0);
625 *(__be32 *)bh->b_data = 0;
626 } else {
627 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
628 }
629 set_buffer_dirty(bh);
630 ll_rw_block(WRITE, 1, &bh);
631 gfs2_log_lock(sdp);
632 if (++n >= num)
633 break;
634 }
635 bh = NULL;
636 total_dbuf -= num;
637 total_jdata -= num;
638 }
639 gfs2_log_unlock(sdp);
640
641 /* Wait on all ordered buffers */
642 while (!list_empty(&started)) {
643 gfs2_log_lock(sdp);
644 bd1 = list_entry(started.next, struct gfs2_bufdata,
645 bd_le.le_list);
646 list_del(&bd1->bd_le.le_list);
647 sdp->sd_log_num_databuf--;
648
649 bh = bd1->bd_bh;
650 if (bh) {
651 bh->b_private = NULL;
652 gfs2_log_unlock(sdp);
653 wait_on_buffer(bh);
654 brelse(bh);
655 } else
656 gfs2_log_unlock(sdp);
657
658 kfree(bd1);
659 }
660
661 /* We've removed all the ordered write bufs here, so only jdata left */
662 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
663}
664
665static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
666 struct gfs2_log_descriptor *ld,
667 __be64 *ptr, int pass)
668{
669 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
670 struct gfs2_sbd *sdp = ip->i_sbd;
671 struct gfs2_glock *gl = ip->i_gl;
672 unsigned int blks = be32_to_cpu(ld->ld_data1);
673 struct buffer_head *bh_log, *bh_ip;
674 uint64_t blkno;
675 uint64_t esc;
676 int error = 0;
677
678 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
679 return 0;
680
681 gfs2_replay_incr_blk(sdp, &start);
682 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
683 blkno = be64_to_cpu(*ptr++);
684 esc = be64_to_cpu(*ptr++);
685
686 sdp->sd_found_blocks++;
687
688 if (gfs2_revoke_check(sdp, blkno, start))
689 continue;
690
691 error = gfs2_replay_read_block(jd, start, &bh_log);
692 if (error)
693 return error;
694
695 bh_ip = gfs2_meta_new(gl, blkno);
696 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
697
698 /* Unescape */
699 if (esc) {
700 __be32 *eptr = (__be32 *)bh_ip->b_data;
701 *eptr = cpu_to_be32(GFS2_MAGIC);
702 }
703 mark_buffer_dirty(bh_ip);
704
705 brelse(bh_log);
706 brelse(bh_ip);
707 if (error)
708 break;
709
710 sdp->sd_replayed_blocks++;
711 }
712
713 return error;
714}
715
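The esc flag read back above completes a round trip that starts at commit time: a data block whose first word happens to be GFS2_MAGIC would look like metadata to replay, so the commit path zeroes that word and records a nonzero escape flag in the block's tag, and replay restores the magic. A self-contained sketch of the idea (the kernel compares the big-endian on-disk value; this sketch uses host order for brevity):

#include <assert.h>
#include <stdint.h>

#define MAGIC 0x01161970u	/* GFS2_MAGIC */

static uint64_t escape(uint32_t *block)		/* commit side */
{
	if (*block == MAGIC) {
		*block = 0;	/* written to the log zeroed... */
		return 1;	/* ...with the escape flag in the tag */
	}
	return 0;
}

static void unescape(uint32_t *block, uint64_t esc)	/* replay side */
{
	if (esc)
		*block = MAGIC;
}

int main(void)
{
	uint32_t data[4] = { MAGIC, 1, 2, 3 };
	uint64_t esc = escape(data);

	unescape(data, esc);
	assert(data[0] == MAGIC);
	return 0;
}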
716/* FIXME: sort out accounting for log blocks etc. */
717
718static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
719{
720 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
721 struct gfs2_sbd *sdp = ip->i_sbd;
722
723 if (error) {
724 gfs2_meta_sync(ip->i_gl,
725 DIO_START | DIO_WAIT);
726 return;
727 }
728 if (pass != 1)
729 return;
730
731 /* data sync? */
732 gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT);
733
734 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
735 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
736}
737
738static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
739{
740 struct list_head *head = &sdp->sd_log_le_databuf;
741 struct gfs2_bufdata *bd;
742
743 while (!list_empty(head)) {
744 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
745 list_del(&bd->bd_le.le_list);
746 sdp->sd_log_num_databuf--;
747 sdp->sd_log_num_jdata--;
748 gfs2_unpin(sdp, bd->bd_bh, ai);
749 }
750 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
751 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
752}
753
754
755const struct gfs2_log_operations gfs2_glock_lops = {
756 .lo_add = glock_lo_add,
757 .lo_after_commit = glock_lo_after_commit,
758 .lo_name = "glock"
759};
760
761const struct gfs2_log_operations gfs2_buf_lops = {
762 .lo_add = buf_lo_add,
763 .lo_incore_commit = buf_lo_incore_commit,
764 .lo_before_commit = buf_lo_before_commit,
765 .lo_after_commit = buf_lo_after_commit,
766 .lo_before_scan = buf_lo_before_scan,
767 .lo_scan_elements = buf_lo_scan_elements,
768 .lo_after_scan = buf_lo_after_scan,
769 .lo_name = "buf"
770};
771
772const struct gfs2_log_operations gfs2_revoke_lops = {
773 .lo_add = revoke_lo_add,
774 .lo_before_commit = revoke_lo_before_commit,
775 .lo_before_scan = revoke_lo_before_scan,
776 .lo_scan_elements = revoke_lo_scan_elements,
777 .lo_after_scan = revoke_lo_after_scan,
778 .lo_name = "revoke"
779};
780
781const struct gfs2_log_operations gfs2_rg_lops = {
782 .lo_add = rg_lo_add,
783 .lo_after_commit = rg_lo_after_commit,
784 .lo_name = "rg"
785};
786
787const struct gfs2_log_operations gfs2_databuf_lops = {
788 .lo_add = databuf_lo_add,
789 .lo_incore_commit = buf_lo_incore_commit,
790 .lo_before_commit = databuf_lo_before_commit,
791 .lo_after_commit = databuf_lo_after_commit,
792 .lo_scan_elements = databuf_lo_scan_elements,
793 .lo_after_scan = databuf_lo_after_scan,
794 .lo_name = "databuf"
795};
796
797const struct gfs2_log_operations *gfs2_log_ops[] = {
798 &gfs2_glock_lops,
799 &gfs2_buf_lops,
800 &gfs2_revoke_lops,
801 &gfs2_rg_lops,
802 &gfs2_databuf_lops,
803 NULL
804};
805
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..0c78d222d6f2
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,96 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LOPS_DOT_H__
11#define __LOPS_DOT_H__
12
13extern const struct gfs2_log_operations gfs2_glock_lops;
14extern const struct gfs2_log_operations gfs2_buf_lops;
15extern const struct gfs2_log_operations gfs2_revoke_lops;
16extern const struct gfs2_log_operations gfs2_rg_lops;
17extern const struct gfs2_log_operations gfs2_databuf_lops;
18
19extern const struct gfs2_log_operations *gfs2_log_ops[];
20
21static inline void lops_init_le(struct gfs2_log_element *le,
22 const struct gfs2_log_operations *lops)
23{
24 INIT_LIST_HEAD(&le->le_list);
25 le->le_ops = lops;
26}
27
28static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
29{
30 if (le->le_ops->lo_add)
31 le->le_ops->lo_add(sdp, le);
32}
33
34static inline void lops_incore_commit(struct gfs2_sbd *sdp,
35 struct gfs2_trans *tr)
36{
37 int x;
38 for (x = 0; gfs2_log_ops[x]; x++)
39 if (gfs2_log_ops[x]->lo_incore_commit)
40 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
41}
42
43static inline void lops_before_commit(struct gfs2_sbd *sdp)
44{
45 int x;
46 for (x = 0; gfs2_log_ops[x]; x++)
47 if (gfs2_log_ops[x]->lo_before_commit)
48 gfs2_log_ops[x]->lo_before_commit(sdp);
49}
50
51static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
52{
53 int x;
54 for (x = 0; gfs2_log_ops[x]; x++)
55 if (gfs2_log_ops[x]->lo_after_commit)
56 gfs2_log_ops[x]->lo_after_commit(sdp, ai);
57}
58
59static inline void lops_before_scan(struct gfs2_jdesc *jd,
60 struct gfs2_log_header *head,
61 unsigned int pass)
62{
63 int x;
64 for (x = 0; gfs2_log_ops[x]; x++)
65 if (gfs2_log_ops[x]->lo_before_scan)
66 gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
67}
68
69static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
70 struct gfs2_log_descriptor *ld,
71 __be64 *ptr,
72 unsigned int pass)
73{
74 int x, error;
75 for (x = 0; gfs2_log_ops[x]; x++)
76 if (gfs2_log_ops[x]->lo_scan_elements) {
77 error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
78 ld, ptr, pass);
79 if (error)
80 return error;
81 }
82
83 return 0;
84}
85
86static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
87 unsigned int pass)
88{
89 int x;
90 for (x = 0; gfs2_log_ops[x]; x++)
91		if (gfs2_log_ops[x]->lo_after_scan)
92 gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
93}
94
95#endif /* __LOPS_DOT_H__ */
96
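All of the inline helpers above share one shape: walk the NULL-terminated gfs2_log_ops[] table and invoke a hook only when the concrete log-element type provides it. A stripped-down userspace sketch of that optional-hook dispatch (names shortened; not the kernel structures):

#include <stdio.h>

struct log_element;

struct log_ops {
	void (*lo_add)(struct log_element *le);
	const char *lo_name;
};

struct log_element {
	const struct log_ops *le_ops;
};

static void buf_add(struct log_element *le)
{
	printf("adding a %s element\n", le->le_ops->lo_name);
}

static const struct log_ops buf_ops = { .lo_add = buf_add, .lo_name = "buf" };
static const struct log_ops rg_ops = { .lo_name = "rg" };	/* no lo_add hook */

static void lops_add(struct log_element *le)
{
	if (le->le_ops->lo_add)		/* hooks are optional, as in lops.h */
		le->le_ops->lo_add(le);
}

int main(void)
{
	struct log_element b = { &buf_ops }, r = { &rg_ops };

	lops_add(&b);	/* dispatches to buf_add() */
	lops_add(&r);	/* silently skipped */
	return 0;
}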
diff --git a/fs/gfs2/lvb.c b/fs/gfs2/lvb.c
new file mode 100644
index 000000000000..63b815dad8e7
--- /dev/null
+++ b/fs/gfs2/lvb.c
@@ -0,0 +1,53 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "lvb.h"
22
23#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
24 struct->member);
25
26void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb)
27{
28 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
29
30 qb->qb_magic = be32_to_cpu(str->qb_magic);
31 qb->qb_limit = be64_to_cpu(str->qb_limit);
32 qb->qb_warn = be64_to_cpu(str->qb_warn);
33 qb->qb_value = be64_to_cpu(str->qb_value);
34}
35
36void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb)
37{
38 struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb;
39
40 str->qb_magic = cpu_to_be32(qb->qb_magic);
41 str->qb_limit = cpu_to_be64(qb->qb_limit);
42 str->qb_warn = cpu_to_be64(qb->qb_warn);
43 str->qb_value = cpu_to_be64(qb->qb_value);
44}
45
46void gfs2_quota_lvb_print(struct gfs2_quota_lvb *qb)
47{
48 pv(qb, qb_magic, "%u");
49 pv(qb, qb_limit, "%llu");
50 pv(qb, qb_warn, "%llu");
51 pv(qb, qb_value, "%lld");
52}
53
diff --git a/fs/gfs2/lvb.h b/fs/gfs2/lvb.h
new file mode 100644
index 000000000000..1b9eb69b9534
--- /dev/null
+++ b/fs/gfs2/lvb.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __LVB_DOT_H__
11#define __LVB_DOT_H__
12
13#define GFS2_MIN_LVB_SIZE 32
14
15void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb);
16void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb);
17void gfs2_quota_lvb_print(struct gfs2_quota_lvb *qb);
18
19#endif /* __LVB_DOT_H__ */
20
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..c8d17b7ba60b
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/semaphore.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "ops_fstype.h"
24#include "sys.h"
25#include "util.h"
26
27/**
28 * init_gfs2_fs - Register GFS2 as a filesystem
29 *
30 * Returns: 0 on success, error code on failure
31 */
32
33static int __init init_gfs2_fs(void)
34{
35 int error;
36
37 gfs2_init_lmh();
38
39 error = gfs2_sys_init();
40 if (error)
41 return error;
42
43 error = -ENOMEM;
44
45 gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
46 sizeof(struct gfs2_glock),
47 0, 0, NULL, NULL);
48 if (!gfs2_glock_cachep)
49 goto fail;
50
51 gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
52 sizeof(struct gfs2_inode),
53 0, 0, NULL, NULL);
54 if (!gfs2_inode_cachep)
55 goto fail;
56
57 gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
58 sizeof(struct gfs2_bufdata),
59 0, 0, NULL, NULL);
60 if (!gfs2_bufdata_cachep)
61 goto fail;
62
63 error = register_filesystem(&gfs2_fs_type);
64 if (error)
65 goto fail;
66
67 error = register_filesystem(&gfs2meta_fs_type);
68 if (error)
69 goto fail_unregister;
70
71 printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
72
73 return 0;
74
75fail_unregister:
76 unregister_filesystem(&gfs2_fs_type);
77fail:
78 if (gfs2_bufdata_cachep)
79 kmem_cache_destroy(gfs2_bufdata_cachep);
80
81 if (gfs2_inode_cachep)
82 kmem_cache_destroy(gfs2_inode_cachep);
83
84 if (gfs2_glock_cachep)
85 kmem_cache_destroy(gfs2_glock_cachep);
86
87 gfs2_sys_uninit();
88 return error;
89}
90
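init_gfs2_fs() unwinds with the usual goto pattern: each failure jumps to a label that releases everything acquired so far, and the NULL checks make the "fail" label safe no matter which allocation died. A reduced userspace sketch of the same shape:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char *a = NULL, *b = NULL;
	int error = -1;

	a = malloc(16);
	if (!a)
		goto fail;

	b = malloc(16);
	if (!b)
		goto fail;

	printf("installed\n");
	return 0;

fail:
	free(b);	/* free(NULL) is a no-op; kmem_cache_destroy() needs the explicit checks */
	free(a);
	return error;
}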
91/**
92 * exit_gfs2_fs - Unregister the file system
93 *
94 */
95
96static void __exit exit_gfs2_fs(void)
97{
98 unregister_filesystem(&gfs2_fs_type);
99 unregister_filesystem(&gfs2meta_fs_type);
100
101 kmem_cache_destroy(gfs2_bufdata_cachep);
102 kmem_cache_destroy(gfs2_inode_cachep);
103 kmem_cache_destroy(gfs2_glock_cachep);
104
105 gfs2_sys_uninit();
106}
107
108MODULE_DESCRIPTION("Global File System");
109MODULE_AUTHOR("Red Hat, Inc.");
110MODULE_LICENSE("GPL");
111
112module_init(init_gfs2_fs);
113module_exit(exit_gfs2_fs);
114
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..da49973a90d1
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,889 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/writeback.h>
18#include <linux/swap.h>
19#include <linux/delay.h>
20#include <linux/gfs2_ondisk.h>
21#include <asm/semaphore.h>
22
23#include "gfs2.h"
24#include "lm_interface.h"
25#include "incore.h"
26#include "glock.h"
27#include "glops.h"
28#include "inode.h"
29#include "log.h"
30#include "lops.h"
31#include "meta_io.h"
32#include "rgrp.h"
33#include "trans.h"
34#include "util.h"
35
36#define buffer_busy(bh) \
37((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
38#define buffer_in_io(bh) \
39((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
40
41static int aspace_get_block(struct inode *inode, sector_t lblock,
42 struct buffer_head *bh_result, int create)
43{
44 gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
45 return -EOPNOTSUPP;
46}
47
48static int gfs2_aspace_writepage(struct page *page,
49 struct writeback_control *wbc)
50{
51 return block_write_full_page(page, aspace_get_block, wbc);
52}
53
54/**
55 * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
56 * @bh: the buffer we're stuck on
57 *
58 */
59
60static void stuck_releasepage(struct buffer_head *bh)
61{
62 struct inode *inode = bh->b_page->mapping->host;
63 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
64 struct gfs2_bufdata *bd = bh->b_private;
65 struct gfs2_glock *gl;
66
67 fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
68 fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
69 (uint64_t)bh->b_blocknr, atomic_read(&bh->b_count));
70 fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
71 fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
72
73 if (!bd)
74 return;
75
76 gl = bd->bd_gl;
77
78 fs_warn(sdp, "gl = (%u, %llu)\n",
79 gl->gl_name.ln_type, gl->gl_name.ln_number);
80
81 fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
82 (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
83 (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
84
85 if (gl->gl_ops == &gfs2_inode_glops) {
86 struct gfs2_inode *ip = gl->gl_object;
87 unsigned int x;
88
89 if (!ip)
90 return;
91
92 fs_warn(sdp, "ip = %llu %llu\n",
93 ip->i_num.no_formal_ino, ip->i_num.no_addr);
94 fs_warn(sdp, "ip->i_count = %d, ip->i_vnode = %s\n",
95 atomic_read(&ip->i_count),
96 (ip->i_vnode) ? "!NULL" : "NULL");
97
98 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
99 fs_warn(sdp, "ip->i_cache[%u] = %s\n",
100 x, (ip->i_cache[x]) ? "!NULL" : "NULL");
101 }
102}
103
104/**
105 * gfs2_aspace_releasepage - free the metadata associated with a page
106 * @page: the page that's being released
107 * @gfp_mask: passed from Linux VFS, ignored by us
108 *
109 * Call try_to_free_buffers() if the buffers in this page can be
110 * released.
111 *
112 * Returns: 0
113 */
114
115static int gfs2_aspace_releasepage(struct page *page, gfp_t gfp_mask)
116{
117 struct inode *aspace = page->mapping->host;
118 struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
119 struct buffer_head *bh, *head;
120 struct gfs2_bufdata *bd;
121 unsigned long t;
122
123 if (!page_has_buffers(page))
124 goto out;
125
126 head = bh = page_buffers(page);
127 do {
128 t = jiffies;
129
130 while (atomic_read(&bh->b_count)) {
131 if (atomic_read(&aspace->i_writecount)) {
132 if (time_after_eq(jiffies, t +
133 gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
134 stuck_releasepage(bh);
135 t = jiffies;
136 }
137
138 yield();
139 continue;
140 }
141
142 return 0;
143 }
144
145 gfs2_assert_warn(sdp, !buffer_pinned(bh));
146
147 bd = bh->b_private;
148 if (bd) {
149 gfs2_assert_warn(sdp, bd->bd_bh == bh);
150 gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
151 gfs2_assert_warn(sdp, list_empty(&bd->bd_le.le_list));
152 gfs2_assert_warn(sdp, !bd->bd_ail);
153 kmem_cache_free(gfs2_bufdata_cachep, bd);
154 bh->b_private = NULL;
155 }
156
157 bh = bh->b_this_page;
158 }
159 while (bh != head);
160
161 out:
162 return try_to_free_buffers(page);
163}
164
165static struct address_space_operations aspace_aops = {
166 .writepage = gfs2_aspace_writepage,
167 .releasepage = gfs2_aspace_releasepage,
168};
169
170/**
171 * gfs2_aspace_get - Create and initialize a struct inode structure
172 * @sdp: the filesystem the aspace is in
173 *
174 * Right now an aspace is just an ordinary struct inode. Maybe Linux
175 * will supply a more lightweight address space construct (that works)
176 * in the future.
177 *
178 * Make sure pages/buffers in this aspace aren't in high memory.
179 *
180 * Returns: the aspace
181 */
182
183struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
184{
185 struct inode *aspace;
186
187 aspace = new_inode(sdp->sd_vfs);
188 if (aspace) {
189 mapping_set_gfp_mask(aspace->i_mapping, GFP_KERNEL);
190 aspace->i_mapping->a_ops = &aspace_aops;
191 aspace->i_size = ~0ULL;
192 aspace->u.generic_ip = NULL;
193 insert_inode_hash(aspace);
194 }
195 return aspace;
196}
197
198void gfs2_aspace_put(struct inode *aspace)
199{
200 remove_inode_hash(aspace);
201 iput(aspace);
202}
203
204/**
205 * gfs2_ail1_start_one - Start I/O on a part of the AIL
206 * @sdp: the filesystem
207 * @ai: the AIL entry to start I/O on
208 *
209 */
210
211void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
212{
213 struct gfs2_bufdata *bd, *s;
214 struct buffer_head *bh;
215 int retry;
216
217 BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
218
219 do {
220 retry = 0;
221
222 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
223 bd_ail_st_list) {
224 bh = bd->bd_bh;
225
226 gfs2_assert(sdp, bd->bd_ail == ai);
227
228 if (!buffer_busy(bh)) {
229 if (!buffer_uptodate(bh))
230 gfs2_io_error_bh(sdp, bh);
231 list_move(&bd->bd_ail_st_list,
232 &ai->ai_ail2_list);
233 continue;
234 }
235
236 if (!buffer_dirty(bh))
237 continue;
238
239 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
240
241 gfs2_log_unlock(sdp);
242 wait_on_buffer(bh);
243 ll_rw_block(WRITE, 1, &bh);
244 gfs2_log_lock(sdp);
245
246 retry = 1;
247 break;
248 }
249 } while (retry);
250}
251
252/**
253 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
254 * @sdp: the filesystem
255 * @ai: the AIL entry
256 *
257 */
258
259int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
260{
261 struct gfs2_bufdata *bd, *s;
262 struct buffer_head *bh;
263
264 list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
265 bd_ail_st_list) {
266 bh = bd->bd_bh;
267
268 gfs2_assert(sdp, bd->bd_ail == ai);
269
270 if (buffer_busy(bh)) {
271 if (flags & DIO_ALL)
272 continue;
273 else
274 break;
275 }
276
277 if (!buffer_uptodate(bh))
278 gfs2_io_error_bh(sdp, bh);
279
280 list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
281 }
282
283 return list_empty(&ai->ai_ail1_list);
284}
285
286/**
287 * gfs2_ail2_empty_one - Remove all remaining buffers of a trans from the AIL
288 * @sdp: the filesystem
289 * @ai: the AIL entry
290 *
291 */
292
293void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
294{
295 struct list_head *head = &ai->ai_ail2_list;
296 struct gfs2_bufdata *bd;
297
298 while (!list_empty(head)) {
299 bd = list_entry(head->prev, struct gfs2_bufdata,
300 bd_ail_st_list);
301 gfs2_assert(sdp, bd->bd_ail == ai);
302 bd->bd_ail = NULL;
303 list_del(&bd->bd_ail_st_list);
304 list_del(&bd->bd_ail_gl_list);
305 atomic_dec(&bd->bd_gl->gl_ail_count);
306 brelse(bd->bd_bh);
307 }
308}
309
310/**
311 * gfs2_ail_empty_gl - remove all buffers for a given lock from the AIL
312 * @gl: the glock
313 *
314 * None of the buffers should be dirty, locked, or pinned.
315 */
316
317void gfs2_ail_empty_gl(struct gfs2_glock *gl)
318{
319 struct gfs2_sbd *sdp = gl->gl_sbd;
320 unsigned int blocks;
321 struct list_head *head = &gl->gl_ail_list;
322 struct gfs2_bufdata *bd;
323 struct buffer_head *bh;
324 uint64_t blkno;
325 int error;
326
327 blocks = atomic_read(&gl->gl_ail_count);
328 if (!blocks)
329 return;
330
331 error = gfs2_trans_begin(sdp, 0, blocks);
332 if (gfs2_assert_withdraw(sdp, !error))
333 return;
334
335 gfs2_log_lock(sdp);
336 while (!list_empty(head)) {
337 bd = list_entry(head->next, struct gfs2_bufdata,
338 bd_ail_gl_list);
339 bh = bd->bd_bh;
340 blkno = bh->b_blocknr;
341 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
342
343 bd->bd_ail = NULL;
344 list_del(&bd->bd_ail_st_list);
345 list_del(&bd->bd_ail_gl_list);
346 atomic_dec(&gl->gl_ail_count);
347 brelse(bh);
348 gfs2_log_unlock(sdp);
349
350 gfs2_trans_add_revoke(sdp, blkno);
351
352 gfs2_log_lock(sdp);
353 }
354 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
355 gfs2_log_unlock(sdp);
356
357 gfs2_trans_end(sdp);
358 gfs2_log_flush(sdp, NULL);
359}
360
361/**
362 * gfs2_meta_inval - Invalidate all buffers associated with a glock
363 * @gl: the glock
364 *
365 */
366
367void gfs2_meta_inval(struct gfs2_glock *gl)
368{
369 struct gfs2_sbd *sdp = gl->gl_sbd;
370 struct inode *aspace = gl->gl_aspace;
371 struct address_space *mapping = gl->gl_aspace->i_mapping;
372
373 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
374
375 atomic_inc(&aspace->i_writecount);
376 truncate_inode_pages(mapping, 0);
377 atomic_dec(&aspace->i_writecount);
378
379 gfs2_assert_withdraw(sdp, !mapping->nrpages);
380}
381
382/**
383 * gfs2_meta_sync - Sync all buffers associated with a glock
384 * @gl: The glock
385 * @flags: DIO_START | DIO_WAIT
386 *
387 */
388
389void gfs2_meta_sync(struct gfs2_glock *gl, int flags)
390{
391 struct address_space *mapping = gl->gl_aspace->i_mapping;
392 int error = 0;
393
394 if (flags & DIO_START)
395 filemap_fdatawrite(mapping);
396 if (!error && (flags & DIO_WAIT))
397 error = filemap_fdatawait(mapping);
398
399 if (error)
400 gfs2_io_error(gl->gl_sbd);
401}
402
403/**
404 * getbuf - Get a buffer with a given address space
405 * @sdp: the filesystem
406 * @aspace: the address space
407 * @blkno: the block number (filesystem scope)
408 * @create: 1 if the buffer should be created
409 *
410 * Returns: the buffer
411 */
412
413static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
414 uint64_t blkno, int create)
415{
416 struct page *page;
417 struct buffer_head *bh;
418 unsigned int shift;
419 unsigned long index;
420 unsigned int bufnum;
421
422 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
423 index = blkno >> shift; /* convert block to page */
424 bufnum = blkno - (index << shift); /* block buf index within page */
425
426 if (create) {
427 for (;;) {
428 page = grab_cache_page(aspace->i_mapping, index);
429 if (page)
430 break;
431 yield();
432 }
433 } else {
434 page = find_lock_page(aspace->i_mapping, index);
435 if (!page)
436 return NULL;
437 }
438
439 if (!page_has_buffers(page))
440 create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
441
442 /* Locate header for our buffer within our page */
443 for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
444 /* Do nothing */;
445 get_bh(bh);
446
447 if (!buffer_mapped(bh))
448 map_bh(bh, sdp->sd_vfs, blkno);
449
450 unlock_page(page);
451 mark_page_accessed(page);
452 page_cache_release(page);
453
454 return bh;
455}
456
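A worked example of the block-to-page arithmetic above, assuming 4096-byte pages (PAGE_CACHE_SHIFT == 12) and 1024-byte filesystem blocks (sb_bsize_shift == 10), so four block buffers share each page:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int shift = 12 - 10;			/* 4 blocks per page */
	uint64_t blkno = 11;

	unsigned long index = blkno >> shift;		/* page index 2 */
	unsigned int bufnum = blkno - (index << shift);	/* buffer 3 within the page */

	printf("page %lu, buffer %u\n", index, bufnum);
	return 0;
}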
457static void meta_prep_new(struct buffer_head *bh)
458{
459 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
460
461 lock_buffer(bh);
462 clear_buffer_dirty(bh);
463 set_buffer_uptodate(bh);
464 unlock_buffer(bh);
465
466 mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
467}
468
469/**
470 * gfs2_meta_new - Get a block
471 * @gl: The glock associated with this block
472 * @blkno: The block number
473 *
474 * Returns: The buffer
475 */
476
477struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno)
478{
479 struct buffer_head *bh;
480 bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
481 meta_prep_new(bh);
482 return bh;
483}
484
485/**
486 * gfs2_meta_read - Read a block from disk
487 * @gl: The glock covering the block
488 * @blkno: The block number
489 * @flags: flags passed to gfs2_meta_reread()
490 * @bhp: the place where the buffer is returned (NULL on failure)
491 *
492 * Returns: errno
493 */
494
495int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, int flags,
496 struct buffer_head **bhp)
497{
498 int error;
499
500 *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
501 error = gfs2_meta_reread(gl->gl_sbd, *bhp, flags);
502 if (error)
503 brelse(*bhp);
504
505 return error;
506}
507
508/**
509 * gfs2_meta_reread - Reread a block from disk
510 * @sdp: the filesystem
511 * @bh: The block to read
512 * @flags: Flags that control the read
513 *
514 * Returns: errno
515 */
516
517int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags)
518{
519 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
520 return -EIO;
521
522 if (flags & DIO_FORCE)
523 clear_buffer_uptodate(bh);
524
525 if ((flags & DIO_START) && !buffer_uptodate(bh))
526 ll_rw_block(READ, 1, &bh);
527
528 if (flags & DIO_WAIT) {
529 wait_on_buffer(bh);
530
531 if (!buffer_uptodate(bh)) {
532 struct gfs2_trans *tr = current->journal_info;
533 if (tr && tr->tr_touched)
534 gfs2_io_error_bh(sdp, bh);
535 return -EIO;
536 }
537 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
538 return -EIO;
539 }
540
541 return 0;
542}
543
544/**
545 * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
546 * @gl: the glock the buffer belongs to
547 * @bh: The buffer to be attached to
548 * @meta: Flag to indicate whether it is metadata or not
549 */
550
551void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
552 int meta)
553{
554 struct gfs2_bufdata *bd;
555
556 if (meta)
557 lock_page(bh->b_page);
558
559 if (bh->b_private) {
560 if (meta)
561 unlock_page(bh->b_page);
562 return;
563 }
564
565	bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
566 memset(bd, 0, sizeof(struct gfs2_bufdata));
567
568 bd->bd_bh = bh;
569 bd->bd_gl = gl;
570
571 INIT_LIST_HEAD(&bd->bd_list_tr);
572 if (meta) {
573 lops_init_le(&bd->bd_le, &gfs2_buf_lops);
574 } else {
575 lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
576 get_bh(bh);
577 }
578 bh->b_private = bd;
579
580 if (meta)
581 unlock_page(bh->b_page);
582}
583
584/**
585 * gfs2_pin - Pin a buffer in memory
586 * @sdp: the filesystem the buffer belongs to
587 * @bh: The buffer to be pinned
588 *
589 */
590
591void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
592{
593 struct gfs2_bufdata *bd = bh->b_private;
594
595 gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
596
597 if (test_set_buffer_pinned(bh))
598 gfs2_assert_withdraw(sdp, 0);
599
600 wait_on_buffer(bh);
601
602 /* If this buffer is in the AIL and it has already been written
603 to in-place disk block, remove it from the AIL. */
604
605 gfs2_log_lock(sdp);
606 if (bd->bd_ail && !buffer_in_io(bh))
607 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
608 gfs2_log_unlock(sdp);
609
610 clear_buffer_dirty(bh);
611 wait_on_buffer(bh);
612
613 if (!buffer_uptodate(bh))
614 gfs2_io_error_bh(sdp, bh);
615
616 get_bh(bh);
617}
618
619/**
620 * gfs2_unpin - Unpin a buffer
621 * @sdp: the filesystem the buffer belongs to
622 * @bh: The buffer to unpin
623 * @ai:
624 *
625 */
626
627void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
628 struct gfs2_ail *ai)
629{
630 struct gfs2_bufdata *bd = bh->b_private;
631
632 gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
633
634 if (!buffer_pinned(bh))
635 gfs2_assert_withdraw(sdp, 0);
636
637 mark_buffer_dirty(bh);
638 clear_buffer_pinned(bh);
639
640 gfs2_log_lock(sdp);
641 if (bd->bd_ail) {
642 list_del(&bd->bd_ail_st_list);
643 brelse(bh);
644 } else {
645 struct gfs2_glock *gl = bd->bd_gl;
646 list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
647 atomic_inc(&gl->gl_ail_count);
648 }
649 bd->bd_ail = ai;
650 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
651 gfs2_log_unlock(sdp);
652}
653
654/**
655 * gfs2_meta_wipe - clear an inode's buffers so they aren't dirty/pinned anymore
656 * @ip: the inode who owns the buffers
657 * @bstart: the first buffer in the run
658 * @blen: the number of buffers in the run
659 *
660 */
661
662void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
663{
664 struct gfs2_sbd *sdp = ip->i_sbd;
665 struct inode *aspace = ip->i_gl->gl_aspace;
666 struct buffer_head *bh;
667
668 while (blen) {
669 bh = getbuf(sdp, aspace, bstart, NO_CREATE);
670 if (bh) {
671 struct gfs2_bufdata *bd = bh->b_private;
672
673 if (test_clear_buffer_pinned(bh)) {
674 struct gfs2_trans *tr = current->journal_info;
675 gfs2_log_lock(sdp);
676 list_del_init(&bd->bd_le.le_list);
677 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
678 sdp->sd_log_num_buf--;
679 gfs2_log_unlock(sdp);
680 tr->tr_num_buf_rm++;
681 brelse(bh);
682 }
683 if (bd) {
684 gfs2_log_lock(sdp);
685 if (bd->bd_ail) {
686 uint64_t blkno = bh->b_blocknr;
687 bd->bd_ail = NULL;
688 list_del(&bd->bd_ail_st_list);
689 list_del(&bd->bd_ail_gl_list);
690 atomic_dec(&bd->bd_gl->gl_ail_count);
691 brelse(bh);
692 gfs2_log_unlock(sdp);
693 gfs2_trans_add_revoke(sdp, blkno);
694 } else
695 gfs2_log_unlock(sdp);
696 }
697
698 lock_buffer(bh);
699 clear_buffer_dirty(bh);
700 clear_buffer_uptodate(bh);
701 unlock_buffer(bh);
702
703 brelse(bh);
704 }
705
706 bstart++;
707 blen--;
708 }
709}
710
711/**
712 * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
713 * @ip: The GFS2 inode
714 *
715 * This releases buffers that are in the most-recently-used array of
716 * blocks used for indirect block addressing for this inode.
717 */
718
719void gfs2_meta_cache_flush(struct gfs2_inode *ip)
720{
721 struct buffer_head **bh_slot;
722 unsigned int x;
723
724 spin_lock(&ip->i_spin);
725
726 for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
727 bh_slot = &ip->i_cache[x];
728 if (!*bh_slot)
729 break;
730 brelse(*bh_slot);
731 *bh_slot = NULL;
732 }
733
734 spin_unlock(&ip->i_spin);
735}
736
737/**
738 * gfs2_meta_indirect_buffer - Get a metadata buffer
739 * @ip: The GFS2 inode
740 * @height: The level of this buf in the metadata (indir addr) tree (if any)
741 * @num: The block number (device relative) of the buffer
742 * @new: Non-zero if we may create a new buffer
743 * @bhp: the buffer is returned here
744 *
745 * Try to use the gfs2_inode's MRU metadata tree cache.
746 *
747 * Returns: errno
748 */
749
750int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
751 int new, struct buffer_head **bhp)
752{
753 struct buffer_head *bh, **bh_slot = ip->i_cache + height;
754 int error;
755
756 spin_lock(&ip->i_spin);
757 bh = *bh_slot;
758 if (bh) {
759 if (bh->b_blocknr == num)
760 get_bh(bh);
761 else
762 bh = NULL;
763 }
764 spin_unlock(&ip->i_spin);
765
766 if (bh) {
767 if (new)
768 meta_prep_new(bh);
769 else {
770 error = gfs2_meta_reread(ip->i_sbd, bh,
771 DIO_START | DIO_WAIT);
772 if (error) {
773 brelse(bh);
774 return error;
775 }
776 }
777 } else {
778 if (new)
779 bh = gfs2_meta_new(ip->i_gl, num);
780 else {
781 error = gfs2_meta_read(ip->i_gl, num,
782 DIO_START | DIO_WAIT, &bh);
783 if (error)
784 return error;
785 }
786
787 spin_lock(&ip->i_spin);
788 if (*bh_slot != bh) {
789 brelse(*bh_slot);
790 *bh_slot = bh;
791 get_bh(bh);
792 }
793 spin_unlock(&ip->i_spin);
794 }
795
796 if (new) {
797 if (gfs2_assert_warn(ip->i_sbd, height)) {
798 brelse(bh);
799 return -EIO;
800 }
801 gfs2_trans_add_bh(ip->i_gl, bh, 1);
802 gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
803 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
804
805 } else if (gfs2_metatype_check(ip->i_sbd, bh,
806 (height) ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)) {
807 brelse(bh);
808 return -EIO;
809 }
810
811 *bhp = bh;
812
813 return 0;
814}
815
816/**
817 * gfs2_meta_ra - start readahead on an extent of a file
818 * @gl: the glock the blocks belong to
819 * @dblock: the starting disk block
820 * @extlen: the number of blocks in the extent
821 *
822 */
823
824void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen)
825{
826 struct gfs2_sbd *sdp = gl->gl_sbd;
827 struct inode *aspace = gl->gl_aspace;
828 struct buffer_head *first_bh, *bh;
829 uint32_t max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
830 sdp->sd_sb.sb_bsize_shift;
831 int error;
832
833 if (!extlen || !max_ra)
834 return;
835 if (extlen > max_ra)
836 extlen = max_ra;
837
838 first_bh = getbuf(sdp, aspace, dblock, CREATE);
839
840 if (buffer_uptodate(first_bh))
841 goto out;
842 if (!buffer_locked(first_bh)) {
843 error = gfs2_meta_reread(sdp, first_bh, DIO_START);
844 if (error)
845 goto out;
846 }
847
848 dblock++;
849 extlen--;
850
851 while (extlen) {
852 bh = getbuf(sdp, aspace, dblock, CREATE);
853
854 if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
855 error = gfs2_meta_reread(sdp, bh, DIO_START);
856 brelse(bh);
857 if (error)
858 goto out;
859 } else
860 brelse(bh);
861
862 dblock++;
863 extlen--;
864
865 if (buffer_uptodate(first_bh))
866 break;
867 }
868
869 out:
870 brelse(first_bh);
871}
872
873/**
874 * gfs2_meta_syncfs - sync all the buffers in a filesystem
875 * @sdp: the filesystem
876 *
877 */
878
879void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
880{
881 gfs2_log_flush(sdp, NULL);
882 for (;;) {
883 gfs2_ail1_start(sdp, DIO_ALL);
884 if (gfs2_ail1_empty(sdp, DIO_ALL))
885 break;
886 msleep(10);
887 }
888}
889
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..d72144d5d727
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,89 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __DIO_DOT_H__
11#define __DIO_DOT_H__
12
13static inline void gfs2_buffer_clear(struct buffer_head *bh)
14{
15 memset(bh->b_data, 0, bh->b_size);
16}
17
18static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
19{
20 memset(bh->b_data + head, 0, bh->b_size - head);
21}
22
23static inline void gfs2_buffer_clear_ends(struct buffer_head *bh, int offset,
24 int amount, int journaled)
25{
26 int z_off1 = (journaled) ? sizeof(struct gfs2_meta_header) : 0;
27 int z_len1 = offset - z_off1;
28 int z_off2 = offset + amount;
29 int z_len2 = (bh)->b_size - z_off2;
30
31 if (z_len1)
32 memset(bh->b_data + z_off1, 0, z_len1);
33
34 if (z_len2)
35 memset(bh->b_data + z_off2, 0, z_len2);
36}
37
38static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
39 int to_head,
40 struct buffer_head *from_bh,
41 int from_head)
42{
43 memcpy(to_bh->b_data + to_head,
44 from_bh->b_data + from_head,
45 from_bh->b_size - from_head);
46 memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
47 0,
48 from_head - to_head);
49}
50
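gfs2_buffer_copy_tail() above is easy to misread: it copies everything past from_head into the destination at to_head, and since to_head <= from_head the copied tail stops from_head - to_head bytes short of the end of the destination, which is then zero-filled. The same arithmetic with small concrete numbers (8-byte "blocks"):

#include <assert.h>
#include <string.h>

int main(void)
{
	char from[8] = { 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B' };
	char to[8];
	int to_head = 1, from_head = 2, bsize = 8;

	memset(to, 'X', sizeof(to));
	memcpy(to + to_head, from + from_head, bsize - from_head);
	memset(to + bsize + to_head - from_head, 0, from_head - to_head);

	/* to is now X B B B B B B \0 */
	assert(to[0] == 'X' && to[1] == 'B' && to[6] == 'B' && to[7] == 0);
	return 0;
}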
51struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
52void gfs2_aspace_put(struct inode *aspace);
53
54void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
55int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags);
56void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai);
57void gfs2_ail_empty_gl(struct gfs2_glock *gl);
58
59void gfs2_meta_inval(struct gfs2_glock *gl);
60void gfs2_meta_sync(struct gfs2_glock *gl, int flags);
61
62struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno);
63int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno,
64 int flags, struct buffer_head **bhp);
65int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags);
66
67void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
68 int meta);
69void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
70void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
71 struct gfs2_ail *ai);
72
73void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
74
75void gfs2_meta_cache_flush(struct gfs2_inode *ip);
76int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num,
77 int new, struct buffer_head **bhp);
78
79static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
80 struct buffer_head **bhp)
81{
82 return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
83}
84
85void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen);
86void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
87
88#endif /* __DIO_DOT_H__ */
89
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..e90ea7d32f9e
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,215 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <asm/semaphore.h>
17
18#include "gfs2.h"
19#include "lm_interface.h"
20#include "incore.h"
21#include "mount.h"
22#include "sys.h"
23#include "util.h"
24
25/**
26 * gfs2_mount_args - Parse mount options
27 * @sdp: the filesystem
28 * @data_arg: the mount argument string
29 *
30 * Returns: errno
31 */
32
33int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
34{
35 struct gfs2_args *args = &sdp->sd_args;
36 char *data = data_arg;
37 char *options, *o, *v;
38 int error = 0;
39
40 if (!remount) {
41 /* If someone preloaded options, use those instead */
42 spin_lock(&gfs2_sys_margs_lock);
43 if (gfs2_sys_margs) {
44 data = gfs2_sys_margs;
45 gfs2_sys_margs = NULL;
46 }
47 spin_unlock(&gfs2_sys_margs_lock);
48
49 /* Set some defaults */
50 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
51 args->ar_quota = GFS2_QUOTA_DEFAULT;
52 args->ar_data = GFS2_DATA_DEFAULT;
53 }
54
55 /* Split the options into tokens with the "," character and
56 process them */
57
58 for (options = data; (o = strsep(&options, ",")); ) {
59 if (!*o)
60 continue;
61
62 v = strchr(o, '=');
63 if (v)
64 *v++ = 0;
65
66 if (!strcmp(o, "lockproto")) {
67 if (!v)
68 goto need_value;
69 if (remount && strcmp(v, args->ar_lockproto))
70 goto cant_remount;
71 strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
72 args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
73 }
74
75 else if (!strcmp(o, "locktable")) {
76 if (!v)
77 goto need_value;
78 if (remount && strcmp(v, args->ar_locktable))
79 goto cant_remount;
80 strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
81 args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
82 }
83
84 else if (!strcmp(o, "hostdata")) {
85 if (!v)
86 goto need_value;
87 if (remount && strcmp(v, args->ar_hostdata))
88 goto cant_remount;
89 strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
90 args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
91 }
92
93 else if (!strcmp(o, "spectator")) {
94 if (remount && !args->ar_spectator)
95 goto cant_remount;
96 args->ar_spectator = 1;
97 sdp->sd_vfs->s_flags |= MS_RDONLY;
98 }
99
100 else if (!strcmp(o, "ignore_local_fs")) {
101 if (remount && !args->ar_ignore_local_fs)
102 goto cant_remount;
103 args->ar_ignore_local_fs = 1;
104 }
105
106 else if (!strcmp(o, "localflocks")) {
107 if (remount && !args->ar_localflocks)
108 goto cant_remount;
109 args->ar_localflocks = 1;
110 }
111
112 else if (!strcmp(o, "localcaching")) {
113 if (remount && !args->ar_localcaching)
114 goto cant_remount;
115 args->ar_localcaching = 1;
116 }
117
118 else if (!strcmp(o, "debug"))
119 args->ar_debug = 1;
120
121 else if (!strcmp(o, "nodebug"))
122 args->ar_debug = 0;
123
124 else if (!strcmp(o, "upgrade")) {
125 if (remount && !args->ar_upgrade)
126 goto cant_remount;
127 args->ar_upgrade = 1;
128 }
129
130 else if (!strcmp(o, "num_glockd")) {
131 unsigned int x;
132 if (!v)
133 goto need_value;
134 sscanf(v, "%u", &x);
135 if (remount && x != args->ar_num_glockd)
136 goto cant_remount;
137 if (!x || x > GFS2_GLOCKD_MAX) {
138 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
139 GFS2_GLOCKD_MAX, x);
140 error = -EINVAL;
141 break;
142 }
143 args->ar_num_glockd = x;
144 }
145
146 else if (!strcmp(o, "acl")) {
147 args->ar_posix_acl = 1;
148 sdp->sd_vfs->s_flags |= MS_POSIXACL;
149 }
150
151 else if (!strcmp(o, "noacl")) {
152 args->ar_posix_acl = 0;
153 sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
154 }
155
156 else if (!strcmp(o, "quota")) {
157 if (!v)
158 goto need_value;
159 if (!strcmp(v, "off"))
160 args->ar_quota = GFS2_QUOTA_OFF;
161 else if (!strcmp(v, "account"))
162 args->ar_quota = GFS2_QUOTA_ACCOUNT;
163 else if (!strcmp(v, "on"))
164 args->ar_quota = GFS2_QUOTA_ON;
165 else {
166 fs_info(sdp, "invalid value for quota\n");
167 error = -EINVAL;
168 break;
169 }
170 }
171
172 else if (!strcmp(o, "suiddir"))
173 args->ar_suiddir = 1;
174
175 else if (!strcmp(o, "nosuiddir"))
176 args->ar_suiddir = 0;
177
178 else if (!strcmp(o, "data")) {
179 if (!v)
180 goto need_value;
181 if (!strcmp(v, "writeback"))
182 args->ar_data = GFS2_DATA_WRITEBACK;
183 else if (!strcmp(v, "ordered"))
184 args->ar_data = GFS2_DATA_ORDERED;
185 else {
186 fs_info(sdp, "invalid value for data\n");
187 error = -EINVAL;
188 break;
189 }
190 }
191
192 else {
193 fs_info(sdp, "unknown option: %s\n", o);
194 error = -EINVAL;
195 break;
196 }
197 }
198
199 if (error)
200 fs_info(sdp, "invalid mount option(s)\n");
201
202 if (data != data_arg)
203 kfree(data);
204
205 return error;
206
207 need_value:
208 fs_info(sdp, "need value for option %s\n", o);
209 return -EINVAL;
210
211 cant_remount:
212 fs_info(sdp, "can't remount with option %s\n", o);
213 return -EINVAL;
214}
215
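The loop above is plain strsep() tokenizing: split the option string on commas, skip empty tokens, and split each token at the first '=' into an option name and value. A self-contained sketch of just the parsing:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char data[] = "lockproto=lock_dlm,acl,quota=on";
	char *options = data, *o, *v;

	while ((o = strsep(&options, ","))) {
		if (!*o)
			continue;	/* skip empty tokens, e.g. ",," */

		v = strchr(o, '=');
		if (v)
			*v++ = 0;

		printf("option '%s', value '%s'\n", o, v ? v : "(none)");
	}
	return 0;
}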
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..bc8331cd7b2c
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __MOUNT_DOT_H__
11#define __MOUNT_DOT_H__
12
13int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
14
15#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..acfc944ce13e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,517 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <asm/semaphore.h>
16
17#include "gfs2.h"
18#include <linux/gfs2_ondisk.h>
19
20#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
21 struct->member);
22#define pa(struct, member, count) print_array(#member, struct->member, count);
23
24/**
25 * print_array - Print out an array of bytes
26 * @title: what to print before the array
27 * @buf: the array
28 * @count: the number of bytes
29 *
30 */
31
32static void print_array(char *title, char *buf, int count)
33{
34 int x;
35
36 printk(KERN_INFO " %s =\n" KERN_INFO, title);
37 for (x = 0; x < count; x++) {
38 printk("%.2X ", (unsigned char)buf[x]);
39 if (x % 16 == 15)
40 printk("\n" KERN_INFO);
41 }
42 if (x % 16)
43 printk("\n");
44}
45
46/*
47 * gfs2_xxx_in - read in an xxx struct
48 * first arg: the cpu-order structure
49 * buf: the disk-order buffer
50 *
51 * gfs2_xxx_out - write out an xxx struct
52 * first arg: the cpu-order structure
53 * buf: the disk-order buffer
54 *
55 * gfs2_xxx_print - print out an xxx struct
56 * first arg: the cpu-order structure
57 */
58
59void gfs2_inum_in(struct gfs2_inum *no, char *buf)
60{
61 struct gfs2_inum *str = (struct gfs2_inum *)buf;
62
63 no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
64 no->no_addr = be64_to_cpu(str->no_addr);
65}
66
67void gfs2_inum_out(const struct gfs2_inum *no, char *buf)
68{
69 struct gfs2_inum *str = (struct gfs2_inum *)buf;
70
71 str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
72 str->no_addr = cpu_to_be64(no->no_addr);
73}
74
75void gfs2_inum_print(struct gfs2_inum *no)
76{
77 pv(no, no_formal_ino, "%llu");
78 pv(no, no_addr, "%llu");
79}
80
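Each _in/_out pair in this file is a byte-order round trip, so _out followed by _in must reproduce the original values on any host. A userspace sketch with a one-field stand-in struct (htonl()/ntohl() playing the role of cpu_to_be32()/be32_to_cpu(); the real gfs2_inum fields are 64-bit):

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct inum { uint32_t no_addr; };	/* trimmed-down stand-in */

static void inum_out(const struct inum *no, char *buf)
{
	uint32_t be = htonl(no->no_addr);

	memcpy(buf, &be, sizeof(be));
}

static void inum_in(struct inum *no, const char *buf)
{
	uint32_t be;

	memcpy(&be, buf, sizeof(be));
	no->no_addr = ntohl(be);
}

int main(void)
{
	struct inum a = { 12345 }, b;
	char buf[sizeof(uint32_t)];

	inum_out(&a, buf);
	inum_in(&b, buf);
	assert(b.no_addr == a.no_addr);
	return 0;
}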
81static void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf)
82{
83 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
84
85 mh->mh_magic = be32_to_cpu(str->mh_magic);
86 mh->mh_type = be32_to_cpu(str->mh_type);
87 mh->mh_format = be32_to_cpu(str->mh_format);
88}
89
90static void gfs2_meta_header_out(struct gfs2_meta_header *mh, char *buf)
91{
92 struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf;
93
94 str->mh_magic = cpu_to_be32(mh->mh_magic);
95 str->mh_type = cpu_to_be32(mh->mh_type);
96 str->mh_format = cpu_to_be32(mh->mh_format);
97}
98
99void gfs2_meta_header_print(struct gfs2_meta_header *mh)
100{
101 pv(mh, mh_magic, "0x%.8X");
102 pv(mh, mh_type, "%u");
103 pv(mh, mh_format, "%u");
104}
105
106void gfs2_sb_in(struct gfs2_sb *sb, char *buf)
107{
108 struct gfs2_sb *str = (struct gfs2_sb *)buf;
109
110 gfs2_meta_header_in(&sb->sb_header, buf);
111
112 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
113 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
114 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
115 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
116
117 gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
118 gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
119
120 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
121 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
122}
123
124void gfs2_sb_print(struct gfs2_sb *sb)
125{
126 gfs2_meta_header_print(&sb->sb_header);
127
128 pv(sb, sb_fs_format, "%u");
129 pv(sb, sb_multihost_format, "%u");
130
131 pv(sb, sb_bsize, "%u");
132 pv(sb, sb_bsize_shift, "%u");
133
134 gfs2_inum_print(&sb->sb_master_dir);
135
136 pv(sb, sb_lockproto, "%s");
137 pv(sb, sb_locktable, "%s");
138}
139
140void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf)
141{
142 struct gfs2_rindex *str = (struct gfs2_rindex *)buf;
143
144 ri->ri_addr = be64_to_cpu(str->ri_addr);
145 ri->ri_length = be32_to_cpu(str->ri_length);
146 ri->ri_data0 = be64_to_cpu(str->ri_data0);
147 ri->ri_data = be32_to_cpu(str->ri_data);
148 ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
149
150}
151
152void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf)
153{
154 struct gfs2_rindex *str = (struct gfs2_rindex *)buf;
155
156 str->ri_addr = cpu_to_be64(ri->ri_addr);
157 str->ri_length = cpu_to_be32(ri->ri_length);
158 str->__pad = 0;
159
160 str->ri_data0 = cpu_to_be64(ri->ri_data0);
161 str->ri_data = cpu_to_be32(ri->ri_data);
162 str->ri_bitbytes = cpu_to_be32(ri->ri_bitbytes);
163 memset(str->ri_reserved, 0, sizeof(str->ri_reserved));
164}
165
166void gfs2_rindex_print(struct gfs2_rindex *ri)
167{
168 pv(ri, ri_addr, "%llu");
169 pv(ri, ri_length, "%u");
170
171 pv(ri, ri_data0, "%llu");
172 pv(ri, ri_data, "%u");
173
174 pv(ri, ri_bitbytes, "%u");
175}
176
177void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf)
178{
179 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
180
181 gfs2_meta_header_in(&rg->rg_header, buf);
182 rg->rg_flags = be32_to_cpu(str->rg_flags);
183 rg->rg_free = be32_to_cpu(str->rg_free);
184 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
185}
186
187void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf)
188{
189 struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf;
190
191 gfs2_meta_header_out(&rg->rg_header, buf);
192 str->rg_flags = cpu_to_be32(rg->rg_flags);
193 str->rg_free = cpu_to_be32(rg->rg_free);
194 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
195
196 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
197}
198
199void gfs2_rgrp_print(struct gfs2_rgrp *rg)
200{
201 gfs2_meta_header_print(&rg->rg_header);
202 pv(rg, rg_flags, "%u");
203 pv(rg, rg_free, "%u");
204 pv(rg, rg_dinodes, "%u");
205
206 pa(rg, rg_reserved, 36);
207}
208
209void gfs2_quota_in(struct gfs2_quota *qu, char *buf)
210{
211 struct gfs2_quota *str = (struct gfs2_quota *)buf;
212
213 qu->qu_limit = be64_to_cpu(str->qu_limit);
214 qu->qu_warn = be64_to_cpu(str->qu_warn);
215 qu->qu_value = be64_to_cpu(str->qu_value);
216}
217
218void gfs2_quota_out(struct gfs2_quota *qu, char *buf)
219{
220 struct gfs2_quota *str = (struct gfs2_quota *)buf;
221
222 str->qu_limit = cpu_to_be64(qu->qu_limit);
223 str->qu_warn = cpu_to_be64(qu->qu_warn);
224 str->qu_value = cpu_to_be64(qu->qu_value);
225}
226
227void gfs2_quota_print(struct gfs2_quota *qu)
228{
229 pv(qu, qu_limit, "%llu");
230 pv(qu, qu_warn, "%llu");
231 pv(qu, qu_value, "%lld");
232}
233
234void gfs2_dinode_in(struct gfs2_dinode *di, char *buf)
235{
236 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
237
238 gfs2_meta_header_in(&di->di_header, buf);
239 gfs2_inum_in(&di->di_num, (char *)&str->di_num);
240
241 di->di_mode = be32_to_cpu(str->di_mode);
242 di->di_uid = be32_to_cpu(str->di_uid);
243 di->di_gid = be32_to_cpu(str->di_gid);
244 di->di_nlink = be32_to_cpu(str->di_nlink);
245 di->di_size = be64_to_cpu(str->di_size);
246 di->di_blocks = be64_to_cpu(str->di_blocks);
247 di->di_atime = be64_to_cpu(str->di_atime);
248 di->di_mtime = be64_to_cpu(str->di_mtime);
249 di->di_ctime = be64_to_cpu(str->di_ctime);
250 di->di_major = be32_to_cpu(str->di_major);
251 di->di_minor = be32_to_cpu(str->di_minor);
252
253 di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
254 di->di_goal_data = be64_to_cpu(str->di_goal_data);
255
256 di->di_flags = be32_to_cpu(str->di_flags);
257 di->di_payload_format = be32_to_cpu(str->di_payload_format);
258 di->di_height = be16_to_cpu(str->di_height);
259
260 di->di_depth = be16_to_cpu(str->di_depth);
261 di->di_entries = be32_to_cpu(str->di_entries);
262
263 di->di_eattr = be64_to_cpu(str->di_eattr);
265}
266
267void gfs2_dinode_out(struct gfs2_dinode *di, char *buf)
268{
269 struct gfs2_dinode *str = (struct gfs2_dinode *)buf;
270
271 gfs2_meta_header_out(&di->di_header, buf);
272 gfs2_inum_out(&di->di_num, (char *)&str->di_num);
273
274 str->di_mode = cpu_to_be32(di->di_mode);
275 str->di_uid = cpu_to_be32(di->di_uid);
276 str->di_gid = cpu_to_be32(di->di_gid);
277 str->di_nlink = cpu_to_be32(di->di_nlink);
278 str->di_size = cpu_to_be64(di->di_size);
279 str->di_blocks = cpu_to_be64(di->di_blocks);
280 str->di_atime = cpu_to_be64(di->di_atime);
281 str->di_mtime = cpu_to_be64(di->di_mtime);
282 str->di_ctime = cpu_to_be64(di->di_ctime);
283 str->di_major = cpu_to_be32(di->di_major);
284 str->di_minor = cpu_to_be32(di->di_minor);
285
286 str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
287 str->di_goal_data = cpu_to_be64(di->di_goal_data);
288
289 str->di_flags = cpu_to_be32(di->di_flags);
290 str->di_payload_format = cpu_to_be32(di->di_payload_format);
291 str->di_height = cpu_to_be16(di->di_height);
292
293 str->di_depth = cpu_to_be16(di->di_depth);
294 str->di_entries = cpu_to_be32(di->di_entries);
295
296 str->di_eattr = cpu_to_be64(di->di_eattr);
298}
299
300void gfs2_dinode_print(struct gfs2_dinode *di)
301{
302 gfs2_meta_header_print(&di->di_header);
303 gfs2_inum_print(&di->di_num);
304
305 pv(di, di_mode, "0%o");
306 pv(di, di_uid, "%u");
307 pv(di, di_gid, "%u");
308 pv(di, di_nlink, "%u");
309 pv(di, di_size, "%llu");
310 pv(di, di_blocks, "%llu");
311 pv(di, di_atime, "%lld");
312 pv(di, di_mtime, "%lld");
313 pv(di, di_ctime, "%lld");
314 pv(di, di_major, "%u");
315 pv(di, di_minor, "%u");
316
317 pv(di, di_goal_meta, "%llu");
318 pv(di, di_goal_data, "%llu");
319
320 pv(di, di_flags, "0x%.8X");
321 pv(di, di_payload_format, "%u");
322 pv(di, di_height, "%u");
323
324 pv(di, di_depth, "%u");
325 pv(di, di_entries, "%u");
326
327 pv(di, di_eattr, "%llu");
328}
329
330void gfs2_dirent_print(struct gfs2_dirent *de, char *name)
331{
332 char buf[GFS2_FNAMESIZE + 1];
333
334 gfs2_inum_print(&de->de_inum);
335 pv(de, de_hash, "0x%.8X");
336 pv(de, de_rec_len, "%u");
337 pv(de, de_name_len, "%u");
338 pv(de, de_type, "%u");
339
340 memset(buf, 0, GFS2_FNAMESIZE + 1);
341 memcpy(buf, name, de->de_name_len);
342 printk(KERN_INFO " name = %s\n", buf);
343}
344
345void gfs2_leaf_print(struct gfs2_leaf *lf)
346{
347 gfs2_meta_header_print(&lf->lf_header);
348 pv(lf, lf_depth, "%u");
349 pv(lf, lf_entries, "%u");
350 pv(lf, lf_dirent_format, "%u");
351 pv(lf, lf_next, "%llu");
352
353 pa(lf, lf_reserved, 32);
354}
355
356void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf)
357{
358 struct gfs2_ea_header *str = (struct gfs2_ea_header *)buf;
359
360 ea->ea_rec_len = be32_to_cpu(str->ea_rec_len);
361 ea->ea_data_len = be32_to_cpu(str->ea_data_len);
362 ea->ea_name_len = str->ea_name_len;
363 ea->ea_type = str->ea_type;
364 ea->ea_flags = str->ea_flags;
365 ea->ea_num_ptrs = str->ea_num_ptrs;
366}
367
368void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf)
369{
370 struct gfs2_ea_header *str = (struct gfs2_ea_header *)buf;
371
372 str->ea_rec_len = cpu_to_be32(ea->ea_rec_len);
373 str->ea_data_len = cpu_to_be32(ea->ea_data_len);
374 str->ea_name_len = ea->ea_name_len;
375 str->ea_type = ea->ea_type;
376 str->ea_flags = ea->ea_flags;
377 str->ea_num_ptrs = ea->ea_num_ptrs;
378 str->__pad = 0;
379}
380
381void gfs2_ea_header_print(struct gfs2_ea_header *ea, char *name)
382{
383 char buf[GFS2_EA_MAX_NAME_LEN + 1];
384
385 pv(ea, ea_rec_len, "%u");
386 pv(ea, ea_data_len, "%u");
387 pv(ea, ea_name_len, "%u");
388 pv(ea, ea_type, "%u");
389 pv(ea, ea_flags, "%u");
390 pv(ea, ea_num_ptrs, "%u");
391
392 memset(buf, 0, GFS2_EA_MAX_NAME_LEN + 1);
393 memcpy(buf, name, ea->ea_name_len);
394 printk(KERN_INFO " name = %s\n", buf);
395}
396
397void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf)
398{
399 struct gfs2_log_header *str = (struct gfs2_log_header *)buf;
400
401 gfs2_meta_header_in(&lh->lh_header, buf);
402 lh->lh_sequence = be64_to_cpu(str->lh_sequence);
403 lh->lh_flags = be32_to_cpu(str->lh_flags);
404 lh->lh_tail = be32_to_cpu(str->lh_tail);
405 lh->lh_blkno = be32_to_cpu(str->lh_blkno);
406 lh->lh_hash = be32_to_cpu(str->lh_hash);
407}
408
409void gfs2_log_header_print(struct gfs2_log_header *lh)
410{
411 gfs2_meta_header_print(&lh->lh_header);
412 pv(lh, lh_sequence, "%llu");
413 pv(lh, lh_flags, "0x%.8X");
414 pv(lh, lh_tail, "%u");
415 pv(lh, lh_blkno, "%u");
416 pv(lh, lh_hash, "0x%.8X");
417}
418
419void gfs2_log_descriptor_print(struct gfs2_log_descriptor *ld)
420{
421 gfs2_meta_header_print(&ld->ld_header);
422 pv(ld, ld_type, "%u");
423 pv(ld, ld_length, "%u");
424 pv(ld, ld_data1, "%u");
425 pv(ld, ld_data2, "%u");
426
427 pa(ld, ld_reserved, 32);
428}
429
430void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf)
431{
432 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
433
434 ir->ir_start = be64_to_cpu(str->ir_start);
435 ir->ir_length = be64_to_cpu(str->ir_length);
436}
437
438void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf)
439{
440 struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf;
441
442 str->ir_start = cpu_to_be64(ir->ir_start);
443 str->ir_length = cpu_to_be64(ir->ir_length);
444}
445
446void gfs2_inum_range_print(struct gfs2_inum_range *ir)
447{
448 pv(ir, ir_start, "%llu");
449 pv(ir, ir_length, "%llu");
450}
451
452void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf)
453{
454 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
455
456 sc->sc_total = be64_to_cpu(str->sc_total);
457 sc->sc_free = be64_to_cpu(str->sc_free);
458 sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
459}
460
461void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf)
462{
463 struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf;
464
465 str->sc_total = cpu_to_be64(sc->sc_total);
466 str->sc_free = cpu_to_be64(sc->sc_free);
467 str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
468}
469
470void gfs2_statfs_change_print(struct gfs2_statfs_change *sc)
471{
472 pv(sc, sc_total, "%lld");
473 pv(sc, sc_free, "%lld");
474 pv(sc, sc_dinodes, "%lld");
475}
476
477void gfs2_unlinked_tag_in(struct gfs2_unlinked_tag *ut, char *buf)
478{
479 struct gfs2_unlinked_tag *str = (struct gfs2_unlinked_tag *)buf;
480
481 gfs2_inum_in(&ut->ut_inum, buf);
482 ut->ut_flags = be32_to_cpu(str->ut_flags);
483}
484
485void gfs2_unlinked_tag_out(struct gfs2_unlinked_tag *ut, char *buf)
486{
487 struct gfs2_unlinked_tag *str = (struct gfs2_unlinked_tag *)buf;
488
489 gfs2_inum_out(&ut->ut_inum, buf);
490 str->ut_flags = cpu_to_be32(ut->ut_flags);
491 str->__pad = 0;
492}
493
494void gfs2_unlinked_tag_print(struct gfs2_unlinked_tag *ut)
495{
496 gfs2_inum_print(&ut->ut_inum);
497 pv(ut, ut_flags, "%u");
498}
499
500void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf)
501{
502 struct gfs2_quota_change *str = (struct gfs2_quota_change *)buf;
503
504 qc->qc_change = be64_to_cpu(str->qc_change);
505 qc->qc_flags = be32_to_cpu(str->qc_flags);
506 qc->qc_id = be32_to_cpu(str->qc_id);
507}
508
509void gfs2_quota_change_print(struct gfs2_quota_change *qc)
510{
511 pv(qc, qc_change, "%lld");
512 pv(qc, qc_flags, "0x%.8X");
513 pv(qc, qc_id, "%u");
514}
515
516
517
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..005c2522a879
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,582 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/mpage.h>
17#include <linux/fs.h>
18#include <linux/gfs2_ondisk.h>
19#include <asm/semaphore.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "bmap.h"
25#include "glock.h"
26#include "inode.h"
27#include "log.h"
28#include "meta_io.h"
29#include "ops_address.h"
30#include "page.h"
31#include "quota.h"
32#include "trans.h"
33#include "rgrp.h"
34#include "ops_file.h"
35#include "util.h"
36
37/**
38 * gfs2_get_block - Fills in a buffer head with details about a block
39 * @inode: The inode
40 * @lblock: The block number to look up
41 * @bh_result: The buffer head to return the result in
42 * @create: Non-zero if we may add block to the file
43 *
44 * Returns: errno
45 */
46
47int gfs2_get_block(struct inode *inode, sector_t lblock,
48 struct buffer_head *bh_result, int create)
49{
50 struct gfs2_inode *ip = inode->u.generic_ip;
51 int new = create;
52 uint64_t dblock;
53 int error;
54
55 error = gfs2_block_map(ip, lblock, &new, &dblock, NULL);
56 if (error)
57 return error;
58
59 if (!dblock)
60 return 0;
61
62 map_bh(bh_result, inode->i_sb, dblock);
63 if (new)
64 set_buffer_new(bh_result);
65
66 return 0;
67}
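
/*
 * Note that this one get_block_t callback is reused by every generic
 * helper in this file: mpage_readpage(), block_prepare_write(),
 * generic_block_bmap() and __blockdev_direct_IO() all take it below.
 */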
68
69/**
70 * get_block_noalloc - Fills in a buffer head with details about a block
71 * @inode: The inode
72 * @lblock: The block number to look up
73 * @bh_result: The buffer head to return the result in
74 * @create: Non-zero if we may add block to the file
75 *
76 * Returns: errno
77 */
78
79static int get_block_noalloc(struct inode *inode, sector_t lblock,
80 struct buffer_head *bh_result, int create)
81{
82 struct gfs2_inode *ip = inode->u.generic_ip;
83 int new = 0;
84 uint64_t dblock;
85 int error;
86
87 error = gfs2_block_map(ip, lblock, &new, &dblock, NULL);
88 if (error)
89 return error;
90
91 if (dblock)
92 map_bh(bh_result, inode->i_sb, dblock);
93 else if (gfs2_assert_withdraw(ip->i_sbd, !create))
94 error = -EIO;
95
96 return error;
97}
98
99/**
100 * gfs2_writepage - Write complete page
101 * @page: Page to write
102 *
103 * Returns: errno
104 *
105 * Some of this is copied from block_write_full_page() although we still
106 * call it to do most of the work.
107 */
108
109static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
110{
111 struct inode *inode = page->mapping->host;
112 struct gfs2_inode *ip = page->mapping->host->u.generic_ip;
113 struct gfs2_sbd *sdp = ip->i_sbd;
114 loff_t i_size = i_size_read(inode);
115 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
116 unsigned offset;
117 int error;
118 int done_trans = 0;
119
120 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
121 unlock_page(page);
122 return -EIO;
123 }
124 if (current->journal_info)
125 goto out_ignore;
126
127 /* Is the page fully outside i_size? (truncate in progress) */
128 offset = i_size & (PAGE_CACHE_SIZE-1);
129	if (page->index > end_index || (page->index == end_index && !offset)) {
130 page->mapping->a_ops->invalidatepage(page, 0);
131 unlock_page(page);
132 return 0; /* don't care */
133 }
134
135 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
136 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
137 if (error)
138 goto out_ignore;
139 gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
140 done_trans = 1;
141 }
142 error = block_write_full_page(page, get_block_noalloc, wbc);
143 if (done_trans)
144 gfs2_trans_end(sdp);
145 gfs2_meta_cache_flush(ip);
146 return error;
147
148out_ignore:
149 redirty_page_for_writepage(wbc, page);
150 unlock_page(page);
151 return 0;
152}
153
154/**
155 * stuffed_readpage - Fill in a Linux page with stuffed file data
156 * @ip: the inode
157 * @page: the page
158 *
159 * Returns: errno
160 */
161
162static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
163{
164 struct buffer_head *dibh;
165 void *kaddr;
166 int error;
167
168 error = gfs2_meta_inode_buffer(ip, &dibh);
169 if (error)
170 return error;
171
172 kaddr = kmap_atomic(page, KM_USER0);
173 memcpy((char *)kaddr,
174 dibh->b_data + sizeof(struct gfs2_dinode),
175 ip->i_di.di_size);
176 memset((char *)kaddr + ip->i_di.di_size,
177 0,
178 PAGE_CACHE_SIZE - ip->i_di.di_size);
179	kunmap_atomic(kaddr, KM_USER0);
180
181 brelse(dibh);
182
183 SetPageUptodate(page);
184
185 return 0;
186}
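
gfs2_is_stuffed() is declared in inode.h, outside this hunk. For readers
of this patch, a sketch of the likely test: a "stuffed" inode keeps its
data directly in the dinode block, i.e. its metadata tree has height zero.

	/* Sketch; the real helper is declared in inode.h. */
	static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
	{
		return !ip->i_di.di_height;
	}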
187
188static int zero_readpage(struct page *page)
189{
190 void *kaddr;
191
192 kaddr = kmap_atomic(page, KM_USER0);
193 memset(kaddr, 0, PAGE_CACHE_SIZE);
194	kunmap_atomic(kaddr, KM_USER0);
195
196 SetPageUptodate(page);
197 unlock_page(page);
198
199 return 0;
200}
201
202/**
203 * gfs2_readpage - readpage with locking
204 * @file: The file to read a page for. N.B. For an internal file this is
205 *        the gfs2_internal_file_sentinal placeholder and no locking is done.
206 * @page: The page to read
207 *
208 * Returns: errno
209 */
210
211static int gfs2_readpage(struct file *file, struct page *page)
212{
213 struct gfs2_inode *ip = page->mapping->host->u.generic_ip;
214 struct gfs2_sbd *sdp = ip->i_sbd;
215 struct gfs2_holder gh;
216 int error;
217
218 if (file != &gfs2_internal_file_sentinal) {
219 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
220 error = gfs2_glock_nq_m_atime(1, &gh);
221 if (error)
222 goto out_unlock;
223 }
224
225 if (gfs2_is_stuffed(ip)) {
226 if (!page->index) {
227 error = stuffed_readpage(ip, page);
228 unlock_page(page);
229 } else
230 error = zero_readpage(page);
231 } else
232 error = mpage_readpage(page, gfs2_get_block);
233
234 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
235 error = -EIO;
236
237 if (file != &gfs2_internal_file_sentinal) {
238 gfs2_glock_dq_m(1, &gh);
239 gfs2_holder_uninit(&gh);
240 }
241out:
242 return error;
243out_unlock:
244	unlock_page(page);
	gfs2_holder_uninit(&gh);
245	goto out;
246}
247
248/**
249 * gfs2_prepare_write - Prepare to write a page to a file
250 * @file: The file to write to
251 * @page: The page which is to be prepared for writing
252 * @from: From (byte range within page)
253 * @to: To (byte range within page)
254 *
255 * Returns: errno
256 */
257
258static int gfs2_prepare_write(struct file *file, struct page *page,
259 unsigned from, unsigned to)
260{
261 struct gfs2_inode *ip = page->mapping->host->u.generic_ip;
262 struct gfs2_sbd *sdp = ip->i_sbd;
263 unsigned int data_blocks, ind_blocks, rblocks;
264 int alloc_required;
265 int error = 0;
266 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
267 loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
268 struct gfs2_alloc *al;
269
270 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
271 error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
272 if (error)
273 goto out_uninit;
274
275 gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks);
276
277	error = gfs2_write_alloc_required(ip, pos, end - pos, &alloc_required);
278 if (error)
279 goto out_unlock;
280
282 if (alloc_required) {
283 al = gfs2_alloc_get(ip);
284
285 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
286 if (error)
287 goto out_alloc_put;
288
289 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
290 if (error)
291 goto out_qunlock;
292
293 al->al_requested = data_blocks + ind_blocks;
294 error = gfs2_inplace_reserve(ip);
295 if (error)
296 goto out_qunlock;
297 }
298
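	/*
	 * Journal reservation: the dinode itself, any new indirect blocks,
	 * the data blocks when journaling data, plus statfs and quota
	 * changes whenever blocks are being allocated.
	 */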
299 rblocks = RES_DINODE + ind_blocks;
300 if (gfs2_is_jdata(ip))
301 rblocks += data_blocks ? data_blocks : 1;
302 if (ind_blocks || data_blocks)
303 rblocks += RES_STATFS + RES_QUOTA;
304
305 error = gfs2_trans_begin(sdp, rblocks, 0);
306 if (error)
307		goto out_trans_fail;
308
309 if (gfs2_is_stuffed(ip)) {
310 if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
311 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page,
312 page);
313 if (error == 0)
314 goto prepare_write;
315 } else if (!PageUptodate(page))
316 error = stuffed_readpage(ip, page);
317 goto out;
318 }
319
320prepare_write:
321 error = block_prepare_write(page, from, to, gfs2_get_block);
322
323out:
324 if (error) {
325		gfs2_trans_end(sdp);
out_trans_fail:
326 if (alloc_required) {
327 gfs2_inplace_release(ip);
328out_qunlock:
329 gfs2_quota_unlock(ip);
330out_alloc_put:
331 gfs2_alloc_put(ip);
332 }
333out_unlock:
334 gfs2_glock_dq_m(1, &ip->i_gh);
335out_uninit:
336 gfs2_holder_uninit(&ip->i_gh);
337 }
338
339 return error;
340}
341
342/**
343 * gfs2_commit_write - Commit write to a file
344 * @file: The file to write to
345 * @page: The page containing the data
346 * @from: From (byte range within page)
347 * @to: To (byte range within page)
348 *
349 * Returns: errno
350 */
351
352static int gfs2_commit_write(struct file *file, struct page *page,
353 unsigned from, unsigned to)
354{
355 struct inode *inode = page->mapping->host;
356 struct gfs2_inode *ip = inode->u.generic_ip;
357 struct gfs2_sbd *sdp = ip->i_sbd;
358 int error = -EOPNOTSUPP;
359 struct buffer_head *dibh;
360	struct gfs2_alloc *al = &ip->i_alloc;
361
362 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
363 goto fail_nounlock;
364
365 error = gfs2_meta_inode_buffer(ip, &dibh);
366 if (error)
367 goto fail_endtrans;
368
369 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
370
371 if (gfs2_is_stuffed(ip)) {
372 uint64_t file_size;
373 void *kaddr;
374
375 file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to;
376
377 kaddr = kmap_atomic(page, KM_USER0);
378 memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
379 (char *)kaddr + from, to - from);
380	kunmap_atomic(kaddr, KM_USER0);
381
382 SetPageUptodate(page);
383
384 if (inode->i_size < file_size)
385 i_size_write(inode, file_size);
386 } else {
387 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
388 gfs2_is_jdata(ip))
389 gfs2_page_add_databufs(ip, page, from, to);
390 error = generic_commit_write(file, page, from, to);
391 if (error)
392 goto fail;
393 }
394
395 if (ip->i_di.di_size < inode->i_size)
396 ip->i_di.di_size = inode->i_size;
397
398 gfs2_dinode_out(&ip->i_di, dibh->b_data);
399 brelse(dibh);
400 gfs2_trans_end(sdp);
401 if (al->al_requested) {
402 gfs2_inplace_release(ip);
403 gfs2_quota_unlock(ip);
404 gfs2_alloc_put(ip);
405 }
406 gfs2_glock_dq_m(1, &ip->i_gh);
407 gfs2_holder_uninit(&ip->i_gh);
408 return 0;
409
410fail:
411 brelse(dibh);
412fail_endtrans:
413 gfs2_trans_end(sdp);
414 if (al->al_requested) {
415 gfs2_inplace_release(ip);
416 gfs2_quota_unlock(ip);
417 gfs2_alloc_put(ip);
418 }
419 gfs2_glock_dq_m(1, &ip->i_gh);
420 gfs2_holder_uninit(&ip->i_gh);
421fail_nounlock:
422 ClearPageUptodate(page);
423 return error;
424}
425
426/**
427 * gfs2_bmap - Block map function
428 * @mapping: Address space info
429 * @lblock: The block to map
430 *
431 * Returns: The disk address for the block or 0 on hole or error
432 */
433
434static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
435{
436 struct gfs2_inode *ip = mapping->host->u.generic_ip;
437 struct gfs2_holder i_gh;
438 sector_t dblock = 0;
439 int error;
440
441 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
442 if (error)
443 return 0;
444
445 if (!gfs2_is_stuffed(ip))
446 dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
447
448 gfs2_glock_dq_uninit(&i_gh);
449
450 return dblock;
451}
452
453static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
454{
455 struct gfs2_bufdata *bd;
456
457 gfs2_log_lock(sdp);
458 bd = bh->b_private;
459 if (bd) {
460 bd->bd_bh = NULL;
461 bh->b_private = NULL;
462 gfs2_log_unlock(sdp);
463 brelse(bh);
464 } else
465 gfs2_log_unlock(sdp);
466
467 lock_buffer(bh);
468 clear_buffer_dirty(bh);
469 bh->b_bdev = NULL;
470 clear_buffer_mapped(bh);
471 clear_buffer_req(bh);
472 clear_buffer_new(bh);
473 clear_buffer_delay(bh);
474 unlock_buffer(bh);
475}
476
477static void gfs2_invalidatepage(struct page *page, unsigned long offset)
478{
479 struct gfs2_sbd *sdp = page->mapping->host->i_sb->s_fs_info;
480 struct buffer_head *head, *bh, *next;
481 unsigned int curr_off = 0;
482
483 BUG_ON(!PageLocked(page));
484 if (!page_has_buffers(page))
485 return;
486
487 bh = head = page_buffers(page);
488 do {
489 unsigned int next_off = curr_off + bh->b_size;
490 next = bh->b_this_page;
491
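		/* Only discard buffers lying wholly beyond the truncation
		   point; `offset' is the new end of valid data in this page. */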
492 if (offset <= curr_off)
493 discard_buffer(sdp, bh);
494
495 curr_off = next_off;
496 bh = next;
497 } while (bh != head);
498
499 if (!offset)
500 try_to_release_page(page, 0);
503}
504
505static ssize_t gfs2_direct_IO_write(struct kiocb *iocb, const struct iovec *iov,
506 loff_t offset, unsigned long nr_segs)
507{
508 struct file *file = iocb->ki_filp;
509 struct inode *inode = file->f_mapping->host;
510 struct gfs2_inode *ip = inode->u.generic_ip;
511 struct gfs2_holder gh;
512 int rv;
513
514	/*
515	 * Shared lock, even though it's a write, since we do no allocation
516	 * on this path. All we need to change is the atime.
517	 */
518 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
519 rv = gfs2_glock_nq_m_atime(1, &gh);
520 if (rv)
521 goto out;
522
523	/*
524	 * Should we return an error here? I can't see that O_DIRECT for a
525	 * journaled file makes any sense, so for now we silently fall back
526	 * to buffered I/O; likewise for stuffed files, since they are
527	 * (a) small and (b) unaligned.
528	 */
529 if (gfs2_is_jdata(ip))
530 goto out;
531
532 if (gfs2_is_stuffed(ip))
533 goto out;
534
535 rv = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
536 iov, offset, nr_segs, gfs2_get_block,
537 NULL, DIO_OWN_LOCKING);
538out:
539 gfs2_glock_dq_m(1, &gh);
540 gfs2_holder_uninit(&gh);
541
542 return rv;
543}
544
545/**
546 * gfs2_direct_IO
547 *
548 * This is called with a shared lock already held for the read path.
549 * Currently, no locks are held when the write path is called.
550 */
551static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
552 const struct iovec *iov, loff_t offset,
553 unsigned long nr_segs)
554{
555 struct file *file = iocb->ki_filp;
556 struct inode *inode = file->f_mapping->host;
557 struct gfs2_inode *ip = inode->u.generic_ip;
558 struct gfs2_sbd *sdp = ip->i_sbd;
559
560 if (rw == WRITE)
561 return gfs2_direct_IO_write(iocb, iov, offset, nr_segs);
562
563 if (gfs2_assert_warn(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)) ||
564 gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
565 return -EINVAL;
566
567 return __blockdev_direct_IO(READ, iocb, inode, inode->i_sb->s_bdev, iov,
568 offset, nr_segs, gfs2_get_block, NULL,
569 DIO_OWN_LOCKING);
570}
571
572struct address_space_operations gfs2_file_aops = {
573 .writepage = gfs2_writepage,
574 .readpage = gfs2_readpage,
575 .sync_page = block_sync_page,
576 .prepare_write = gfs2_prepare_write,
577 .commit_write = gfs2_commit_write,
578 .bmap = gfs2_bmap,
579 .invalidatepage = gfs2_invalidatepage,
580 .direct_IO = gfs2_direct_IO,
581};
582
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..f201a059fd91
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,17 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_ADDRESS_DOT_H__
11#define __OPS_ADDRESS_DOT_H__
12
13extern struct address_space_operations gfs2_file_aops;
14extern int gfs2_get_block(struct inode *inode, sector_t lblock,
15 struct buffer_head *bh_result, int create);
16
17#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..958371076093
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,124 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/smp_lock.h>
16#include <linux/gfs2_ondisk.h>
17#include <linux/crc32.h>
18#include <asm/semaphore.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "dir.h"
24#include "glock.h"
25#include "ops_dentry.h"
26#include "util.h"
27
28/**
29 * gfs2_drevalidate - Check directory lookup consistency
30 * @dentry: the mapping to check
31 * @nd: nameidata from the lookup (unused here)
32 *
33 * Check to make sure the lookup necessary to arrive at this inode from its
34 * parent is still good.
35 *
36 * Returns: 1 if the dentry is ok, 0 if it isn't
37 */
38
39static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
40{
41 struct dentry *parent = dget_parent(dentry);
42 struct gfs2_sbd *sdp = parent->d_inode->i_sb->s_fs_info;
43 struct gfs2_inode *dip = parent->d_inode->u.generic_ip;
44 struct inode *inode = dentry->d_inode;
45 struct gfs2_holder d_gh;
46 struct gfs2_inode *ip;
47 struct gfs2_inum inum;
48 unsigned int type;
49 int error;
50
51 if (inode && is_bad_inode(inode))
52 goto invalid;
53
54 if (sdp->sd_args.ar_localcaching)
55 goto valid;
56
57 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
58 if (error)
59 goto fail;
60
61 error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
62 switch (error) {
63 case 0:
64 if (!inode)
65 goto invalid_gunlock;
66 break;
67 case -ENOENT:
68 if (!inode)
69 goto valid_gunlock;
70 goto invalid_gunlock;
71 default:
72 goto fail_gunlock;
73 }
74
75 ip = inode->u.generic_ip;
76
77 if (!gfs2_inum_equal(&ip->i_num, &inum))
78 goto invalid_gunlock;
79
80 if (IF2DT(ip->i_di.di_mode) != type) {
81 gfs2_consist_inode(dip);
82 goto fail_gunlock;
83 }
84
85 valid_gunlock:
86 gfs2_glock_dq_uninit(&d_gh);
87
88 valid:
89 dput(parent);
90 return 1;
91
92 invalid_gunlock:
93 gfs2_glock_dq_uninit(&d_gh);
94
95 invalid:
96 if (inode && S_ISDIR(inode->i_mode)) {
97 if (have_submounts(dentry))
98 goto valid;
99 shrink_dcache_parent(dentry);
100 }
101 d_drop(dentry);
102
103 dput(parent);
104 return 0;
105
106 fail_gunlock:
107 gfs2_glock_dq_uninit(&d_gh);
108
109 fail:
110 dput(parent);
111 return 0;
112}
113
114static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
115{
116 str->hash = gfs2_disk_hash(str->name, str->len);
117 return 0;
118}
119
120struct dentry_operations gfs2_dops = {
121 .d_revalidate = gfs2_drevalidate,
122 .d_hash = gfs2_dhash,
123};
124
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..94e3ee170165
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13extern struct dentry_operations gfs2_dops;
14
15#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..be16c68263d1
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,298 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "dir.h"
23#include "glock.h"
24#include "glops.h"
25#include "inode.h"
26#include "ops_export.h"
27#include "rgrp.h"
28#include "util.h"
29
30static struct dentry *gfs2_decode_fh(struct super_block *sb,
31 __u32 *fh,
32 int fh_len,
33 int fh_type,
34 int (*acceptable)(void *context,
35 struct dentry *dentry),
36 void *context)
37{
38 struct gfs2_inum this, parent;
39
40 if (fh_type != fh_len)
41 return NULL;
42
43 memset(&parent, 0, sizeof(struct gfs2_inum));
44
45 switch (fh_type) {
46 case 8:
47 parent.no_formal_ino = ((uint64_t)be32_to_cpu(fh[4])) << 32;
48 parent.no_formal_ino |= be32_to_cpu(fh[5]);
49 parent.no_addr = ((uint64_t)be32_to_cpu(fh[6])) << 32;
50 parent.no_addr |= be32_to_cpu(fh[7]);
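		/* fall through: an 8-word handle also carries this inode's
		   own inum, decoded by the 4-word case */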
51 case 4:
52 this.no_formal_ino = ((uint64_t)be32_to_cpu(fh[0])) << 32;
53 this.no_formal_ino |= be32_to_cpu(fh[1]);
54 this.no_addr = ((uint64_t)be32_to_cpu(fh[2])) << 32;
55 this.no_addr |= be32_to_cpu(fh[3]);
56 break;
57 default:
58 return NULL;
59 }
60
61 return gfs2_export_ops.find_exported_dentry(sb, &this, &parent,
62 acceptable, context);
63}
64
65static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
66 int connectable)
67{
68 struct inode *inode = dentry->d_inode;
69 struct super_block *sb = inode->i_sb;
70 struct gfs2_inode *ip = inode->u.generic_ip;
71
72 if (*len < 4 || (connectable && *len < 8))
73 return 255;
74
75 fh[0] = ip->i_num.no_formal_ino >> 32;
76 fh[0] = cpu_to_be32(fh[0]);
77 fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
78 fh[1] = cpu_to_be32(fh[1]);
79 fh[2] = ip->i_num.no_addr >> 32;
80 fh[2] = cpu_to_be32(fh[2]);
81 fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
82 fh[3] = cpu_to_be32(fh[3]);
83 *len = 4;
84
85 if (!connectable || inode == sb->s_root->d_inode)
86 return *len;
87
88 spin_lock(&dentry->d_lock);
89 inode = dentry->d_parent->d_inode;
90 ip = inode->u.generic_ip;
91 gfs2_inode_hold(ip);
92 spin_unlock(&dentry->d_lock);
93
94 fh[4] = ip->i_num.no_formal_ino >> 32;
95 fh[4] = cpu_to_be32(fh[4]);
96 fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
97 fh[5] = cpu_to_be32(fh[5]);
98 fh[6] = ip->i_num.no_addr >> 32;
99 fh[6] = cpu_to_be32(fh[6]);
100 fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
101 fh[7] = cpu_to_be32(fh[7]);
102 *len = 8;
103
104 gfs2_inode_put(ip);
105
106 return *len;
107}
108
109struct get_name_filldir {
110 struct gfs2_inum inum;
111 char *name;
112};
113
114static int get_name_filldir(void *opaque, const char *name, unsigned int length,
115 uint64_t offset, struct gfs2_inum *inum,
116 unsigned int type)
117{
118 struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
119
120 if (!gfs2_inum_equal(inum, &gnfd->inum))
121 return 0;
122
123 memcpy(gnfd->name, name, length);
124 gnfd->name[length] = 0;
125
126 return 1;
127}
128
129static int gfs2_get_name(struct dentry *parent, char *name,
130 struct dentry *child)
131{
132 struct inode *dir = parent->d_inode;
133 struct inode *inode = child->d_inode;
134 struct gfs2_inode *dip, *ip;
135 struct get_name_filldir gnfd;
136 struct gfs2_holder gh;
137 uint64_t offset = 0;
138 int error;
139
140 if (!dir)
141 return -EINVAL;
142
143 if (!S_ISDIR(dir->i_mode) || !inode)
144 return -EINVAL;
145
146 dip = dir->u.generic_ip;
147 ip = inode->u.generic_ip;
148
149 *name = 0;
150 gnfd.inum = ip->i_num;
151 gnfd.name = name;
152
153 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
154 if (error)
155 return error;
156
157 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
158
159 gfs2_glock_dq_uninit(&gh);
160
161 if (!error && !*name)
162 error = -ENOENT;
163
164 return error;
165}
166
167static struct dentry *gfs2_get_parent(struct dentry *child)
168{
169 struct qstr dotdot;
170 struct inode *inode;
171 struct dentry *dentry;
172
173 gfs2_str2qstr(&dotdot, "..");
174 inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
175
176 if (!inode)
177 return ERR_PTR(-ENOENT);
178 if (IS_ERR(inode))
179 return ERR_PTR(PTR_ERR(inode));
180
181 dentry = d_alloc_anon(inode);
182 if (!dentry) {
183 iput(inode);
184 return ERR_PTR(-ENOMEM);
185 }
186
187 return dentry;
188}
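
gfs2_str2qstr() is defined elsewhere in GFS2, outside this hunk. A sketch
under the assumption that it simply fills in a qstr and hashes the name
the same way gfs2_dhash() does in ops_dentry.c:

	/* Sketch only; the real definition lives elsewhere in GFS2. */
	static inline void gfs2_str2qstr(struct qstr *qstr, const char *name)
	{
		qstr->name = name;
		qstr->len = strlen(name);
		qstr->hash = gfs2_disk_hash(name, qstr->len);
	}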
189
190static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_p)
191{
192 struct gfs2_sbd *sdp = sb->s_fs_info;
193 struct gfs2_inum *inum = (struct gfs2_inum *)inum_p;
194 struct gfs2_holder i_gh, ri_gh, rgd_gh;
195 struct gfs2_rgrpd *rgd;
196 struct gfs2_inode *ip;
197 struct inode *inode;
198 struct dentry *dentry;
199 int error;
200
201 /* System files? */
202
203 inode = gfs2_iget(sb, inum);
204 if (inode) {
205 ip = inode->u.generic_ip;
206 if (ip->i_num.no_formal_ino != inum->no_formal_ino) {
207 iput(inode);
208 return ERR_PTR(-ESTALE);
209 }
210 goto out_inode;
211 }
212
213 error = gfs2_glock_nq_num(sdp,
214 inum->no_addr, &gfs2_inode_glops,
215 LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
216 &i_gh);
217 if (error)
218 return ERR_PTR(error);
219
220 error = gfs2_inode_get(i_gh.gh_gl, inum, NO_CREATE, &ip);
221 if (error)
222 goto fail;
223 if (ip)
224 goto out_ip;
225
226 error = gfs2_rindex_hold(sdp, &ri_gh);
227 if (error)
228 goto fail;
229
230 error = -EINVAL;
231 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
232 if (!rgd)
233 goto fail_rindex;
234
235 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
236 if (error)
237 goto fail_rindex;
238
239 error = -ESTALE;
240 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
241 goto fail_rgd;
242
243 gfs2_glock_dq_uninit(&rgd_gh);
244 gfs2_glock_dq_uninit(&ri_gh);
245
246 error = gfs2_inode_get(i_gh.gh_gl, inum, CREATE, &ip);
247 if (error)
248 goto fail;
249
250 error = gfs2_inode_refresh(ip);
251 if (error) {
252 gfs2_inode_put(ip);
253 goto fail;
254 }
255
256 out_ip:
257 error = -EIO;
258 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) {
259 gfs2_inode_put(ip);
260 goto fail;
261 }
262
263 gfs2_glock_dq_uninit(&i_gh);
264
265 inode = gfs2_ip2v(ip);
266 gfs2_inode_put(ip);
267
268 if (!inode)
269 return ERR_PTR(-ENOMEM);
270
271 out_inode:
272 dentry = d_alloc_anon(inode);
273 if (!dentry) {
274 iput(inode);
275 return ERR_PTR(-ENOMEM);
276 }
277
278 return dentry;
279
280 fail_rgd:
281 gfs2_glock_dq_uninit(&rgd_gh);
282
283 fail_rindex:
284 gfs2_glock_dq_uninit(&ri_gh);
285
286 fail:
287 gfs2_glock_dq_uninit(&i_gh);
288 return ERR_PTR(error);
289}
290
291struct export_operations gfs2_export_ops = {
292 .decode_fh = gfs2_decode_fh,
293 .encode_fh = gfs2_encode_fh,
294 .get_name = gfs2_get_name,
295 .get_parent = gfs2_get_parent,
296 .get_dentry = gfs2_get_dentry,
297};
298
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..2f342f3d8755
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_EXPORT_DOT_H__
11#define __OPS_EXPORT_DOT_H__
12
13extern struct export_operations gfs2_export_ops;
14
15#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..3fb1a29f88a6
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,999 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/uio.h>
17#include <linux/blkdev.h>
18#include <linux/mm.h>
19#include <linux/smp_lock.h>
20#include <linux/fs.h>
21#include <linux/gfs2_ondisk.h>
22#include <linux/ext2_fs.h>
23#include <linux/crc32.h>
24#include <linux/iflags.h>
25#include <asm/semaphore.h>
26#include <asm/uaccess.h>
27
28#include "gfs2.h"
29#include "lm_interface.h"
30#include "incore.h"
31#include "bmap.h"
32#include "dir.h"
33#include "glock.h"
34#include "glops.h"
35#include "inode.h"
36#include "lm.h"
37#include "log.h"
38#include "meta_io.h"
39#include "ops_file.h"
40#include "ops_vm.h"
41#include "quota.h"
42#include "rgrp.h"
43#include "trans.h"
44#include "util.h"
45#include "eaops.h"
46
47/* "bad" is for NFS support */
48struct filldir_bad_entry {
49 char *fbe_name;
50 unsigned int fbe_length;
51 uint64_t fbe_offset;
52 struct gfs2_inum fbe_inum;
53 unsigned int fbe_type;
54};
55
56struct filldir_bad {
57 struct gfs2_sbd *fdb_sbd;
58
59 struct filldir_bad_entry *fdb_entry;
60 unsigned int fdb_entry_num;
61 unsigned int fdb_entry_off;
62
63 char *fdb_name;
64 unsigned int fdb_name_size;
65 unsigned int fdb_name_off;
66};
67
68/* For regular, non-NFS */
69struct filldir_reg {
70 struct gfs2_sbd *fdr_sbd;
71 int fdr_prefetch;
72
73 filldir_t fdr_filldir;
74 void *fdr_opaque;
75};
76
77/*
78 * Most fields left uninitialised to catch anybody who tries to
79 * use them. f_flags set to prevent file_accessed() from touching
80 * any other part of this. Its use is purely as a flag so that we
81 * know (in readpage()) whether or not to do locking.
82 */
83struct file gfs2_internal_file_sentinal = {
84 .f_flags = O_NOATIME|O_RDONLY,
85};
86
87static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
88 unsigned long offset, unsigned long size)
89{
90 char *kaddr;
91 unsigned long count = desc->count;
92
93 if (size > count)
94 size = count;
95
96 kaddr = kmap(page);
97 memcpy(desc->arg.buf, kaddr + offset, size);
98 kunmap(page);
99
100 desc->count = count - size;
101 desc->written += size;
102 desc->arg.buf += size;
103 return size;
104}
105
106int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
107 char *buf, loff_t *pos, unsigned size)
108{
109 struct inode *inode = ip->i_vnode;
110 read_descriptor_t desc;
111 desc.written = 0;
112 desc.arg.buf = buf;
113 desc.count = size;
114 desc.error = 0;
115 do_generic_mapping_read(inode->i_mapping, ra_state,
116 &gfs2_internal_file_sentinal, pos, &desc,
117 gfs2_read_actor);
118 return desc.written ? desc.written : desc.error;
119}
120
121/**
122 * gfs2_llseek - seek to a location in a file
123 * @file: the file
124 * @offset: the offset
125 * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
126 *
127 * SEEK_END requires the glock for the file because it references the
128 * file's size.
129 *
130 * Returns: The new offset, or errno
131 */
132
133static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
134{
135 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
136 struct gfs2_holder i_gh;
137 loff_t error;
138
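	/* origin == 2 is SEEK_END, which needs the glock because it
	   references the (cluster-wide) file size */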
139 if (origin == 2) {
140 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
141 &i_gh);
142 if (!error) {
143 error = remote_llseek(file, offset, origin);
144 gfs2_glock_dq_uninit(&i_gh);
145 }
146 } else
147 error = remote_llseek(file, offset, origin);
148
149 return error;
150}
151
153static ssize_t gfs2_direct_IO_read(struct kiocb *iocb, const struct iovec *iov,
154 loff_t offset, unsigned long nr_segs)
155{
156 struct file *file = iocb->ki_filp;
157 struct address_space *mapping = file->f_mapping;
158 ssize_t retval;
159
160 retval = filemap_write_and_wait(mapping);
161 if (retval == 0) {
162 retval = mapping->a_ops->direct_IO(READ, iocb, iov, offset,
163 nr_segs);
164 }
165 return retval;
166}
167
168/**
169 * __gfs2_file_aio_read - The main GFS2 read function
170 *
171 * N.B. This is almost, but not quite, the same as __generic_file_aio_read();
172 * the important subtle difference being that inode->i_size isn't valid
173 * unless we are holding a lock, and we do this _only_ on the O_DIRECT
174 * path since otherwise locking is done entirely at the page cache
175 * layer.
176 */
177static ssize_t __gfs2_file_aio_read(struct kiocb *iocb,
178 const struct iovec *iov,
179 unsigned long nr_segs, loff_t *ppos)
180{
181 struct file *filp = iocb->ki_filp;
182 struct gfs2_inode *ip = filp->f_mapping->host->u.generic_ip;
183 struct gfs2_holder gh;
184 ssize_t retval;
185 unsigned long seg;
186 size_t count;
187
188 count = 0;
189 for (seg = 0; seg < nr_segs; seg++) {
190 const struct iovec *iv = &iov[seg];
191
192 /*
193 * If any segment has a negative length, or the cumulative
194 * length ever wraps negative then return -EINVAL.
195 */
196 count += iv->iov_len;
197 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
198 return -EINVAL;
199 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
200 continue;
201 if (seg == 0)
202 return -EFAULT;
203 nr_segs = seg;
204 count -= iv->iov_len; /* This segment is no good */
205 break;
206 }
207
208 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
209 if (filp->f_flags & O_DIRECT) {
210 loff_t pos = *ppos, size;
211 struct address_space *mapping;
212 struct inode *inode;
213
214 mapping = filp->f_mapping;
215 inode = mapping->host;
216 retval = 0;
217 if (!count)
218 goto out; /* skip atime */
219
220 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
221 retval = gfs2_glock_nq_m_atime(1, &gh);
222 if (retval)
223 goto out;
224 if (gfs2_is_stuffed(ip)) {
225 gfs2_glock_dq_m(1, &gh);
226 gfs2_holder_uninit(&gh);
227 goto fallback_to_normal;
228 }
229 size = i_size_read(inode);
230 if (pos < size) {
231 retval = gfs2_direct_IO_read(iocb, iov, pos, nr_segs);
232 if (retval > 0 && !is_sync_kiocb(iocb))
233 retval = -EIOCBQUEUED;
234 if (retval > 0)
235 *ppos = pos + retval;
236 }
237 file_accessed(filp);
238 gfs2_glock_dq_m(1, &gh);
239 gfs2_holder_uninit(&gh);
240 goto out;
241 }
242
243fallback_to_normal:
244 retval = 0;
245 if (count) {
246 for (seg = 0; seg < nr_segs; seg++) {
247 read_descriptor_t desc;
248
249 desc.written = 0;
250 desc.arg.buf = iov[seg].iov_base;
251 desc.count = iov[seg].iov_len;
252 if (desc.count == 0)
253 continue;
254 desc.error = 0;
255 do_generic_file_read(filp,ppos,&desc,file_read_actor);
256 retval += desc.written;
257 if (desc.error) {
258 retval = retval ?: desc.error;
259 break;
260 }
261 }
262 }
263out:
264 return retval;
265}
266
267/**
268 * gfs2_read - Read bytes from a file
269 * @filp: The file to read from
270 * @buf: The buffer to copy into
271 * @size: The amount of data requested
272 * @offset: The current file offset
273 *
274 * Outputs: Offset - updated according to number of bytes read
275 *
276 * Returns: The number of bytes read, errno on failure
277 */
278
279static ssize_t gfs2_read(struct file *filp, char __user *buf, size_t size,
280 loff_t *offset)
281{
282 struct iovec local_iov = { .iov_base = buf, .iov_len = size };
283 struct kiocb kiocb;
284 ssize_t ret;
285
286 init_sync_kiocb(&kiocb, filp);
287 ret = __gfs2_file_aio_read(&kiocb, &local_iov, 1, offset);
288 if (-EIOCBQUEUED == ret)
289 ret = wait_on_sync_kiocb(&kiocb);
290 return ret;
291}
292
293static ssize_t gfs2_file_readv(struct file *filp, const struct iovec *iov,
294 unsigned long nr_segs, loff_t *ppos)
295{
296 struct kiocb kiocb;
297 ssize_t ret;
298
299 init_sync_kiocb(&kiocb, filp);
300 ret = __gfs2_file_aio_read(&kiocb, iov, nr_segs, ppos);
301 if (-EIOCBQUEUED == ret)
302 ret = wait_on_sync_kiocb(&kiocb);
303 return ret;
304}
305
306static ssize_t gfs2_file_aio_read(struct kiocb *iocb, char __user *buf,
307 size_t count, loff_t pos)
308{
309 struct iovec local_iov = { .iov_base = buf, .iov_len = count };
310
311 BUG_ON(iocb->ki_pos != pos);
312 return __gfs2_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
313}
314
315
316/**
317 * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read()
318 * @opaque: opaque data used by the function
319 * @name: the name of the directory entry
320 * @length: the length of the name
321 * @offset: the entry's offset in the directory
322 * @inum: the inode number the entry points to
323 * @type: the type of inode the entry points to
324 *
325 * Returns: 0 on success, 1 if buffer full
326 */
327
328static int filldir_reg_func(void *opaque, const char *name, unsigned int length,
329 uint64_t offset, struct gfs2_inum *inum,
330 unsigned int type)
331{
332 struct filldir_reg *fdr = (struct filldir_reg *)opaque;
333 struct gfs2_sbd *sdp = fdr->fdr_sbd;
334 int error;
335
336 error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
337 inum->no_formal_ino, type);
338 if (error)
339 return 1;
340
341 if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
342 gfs2_glock_prefetch_num(sdp,
343 inum->no_addr, &gfs2_inode_glops,
344 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
345 gfs2_glock_prefetch_num(sdp,
346 inum->no_addr, &gfs2_iopen_glops,
347 LM_ST_SHARED, LM_FLAG_TRY);
348 }
349
350 return 0;
351}
352
353/**
354 * readdir_reg - Read directory entries from a directory
355 * @file: The directory to read from
356 * @dirent: Buffer for dirents
357 * @filldir: Function used to do the copying
358 *
359 * Returns: errno
360 */
361
362static int readdir_reg(struct file *file, void *dirent, filldir_t filldir)
363{
364 struct inode *dir = file->f_mapping->host;
365 struct gfs2_inode *dip = dir->u.generic_ip;
366 struct filldir_reg fdr;
367 struct gfs2_holder d_gh;
368 uint64_t offset = file->f_pos;
369 int error;
370
371 fdr.fdr_sbd = dip->i_sbd;
372 fdr.fdr_prefetch = 1;
373 fdr.fdr_filldir = filldir;
374 fdr.fdr_opaque = dirent;
375
376 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
377 error = gfs2_glock_nq_atime(&d_gh);
378 if (error) {
379 gfs2_holder_uninit(&d_gh);
380 return error;
381 }
382
383 error = gfs2_dir_read(dir, &offset, &fdr, filldir_reg_func);
384
385 gfs2_glock_dq_uninit(&d_gh);
386
387 file->f_pos = offset;
388
389 return error;
390}
391
392/**
393 * filldir_bad_func - Report a directory entry to the caller of gfs2_dir_read()
394 * @opaque: opaque data used by the function
395 * @name: the name of the directory entry
396 * @length: the length of the name
397 * @offset: the entry's offset in the directory
398 * @inum: the inode number the entry points to
399 * @type: the type of inode the entry points to
400 *
401 * For supporting NFS; see the explanation at readdir_bad() below.
402 *
403 * Returns: 0 on success, 1 if buffer full
404 */
405
406static int filldir_bad_func(void *opaque, const char *name, unsigned int length,
407 uint64_t offset, struct gfs2_inum *inum,
408 unsigned int type)
409{
410 struct filldir_bad *fdb = (struct filldir_bad *)opaque;
411 struct gfs2_sbd *sdp = fdb->fdb_sbd;
412 struct filldir_bad_entry *fbe;
413
414 if (fdb->fdb_entry_off == fdb->fdb_entry_num ||
415 fdb->fdb_name_off + length > fdb->fdb_name_size)
416 return 1;
417
418 fbe = &fdb->fdb_entry[fdb->fdb_entry_off];
419 fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off;
420 memcpy(fbe->fbe_name, name, length);
421 fbe->fbe_length = length;
422 fbe->fbe_offset = offset;
423 fbe->fbe_inum = *inum;
424 fbe->fbe_type = type;
425
426 fdb->fdb_entry_off++;
427 fdb->fdb_name_off += length;
428
429 if (!(length == 1 && *name == '.')) {
430 gfs2_glock_prefetch_num(sdp,
431 inum->no_addr, &gfs2_inode_glops,
432 LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
433 gfs2_glock_prefetch_num(sdp,
434 inum->no_addr, &gfs2_iopen_glops,
435 LM_ST_SHARED, LM_FLAG_TRY);
436 }
437
438 return 0;
439}
440
441/**
442 * readdir_bad - Read directory entries from a directory
443 * @file: The directory to read from
444 * @dirent: Buffer for dirents
445 * @filldir: Function used to do the copying
446 *
447 * For supporting NFS: nfsd's filldir callback can call back into the
 * filesystem, so entries are first staged in a private buffer and handed
 * to the real filldir only after the directory glock has been dropped.
448 *
449 * Returns: errno
450 */
451
452static int readdir_bad(struct file *file, void *dirent, filldir_t filldir)
453{
454 struct inode *dir = file->f_mapping->host;
455 struct gfs2_inode *dip = dir->u.generic_ip;
456 struct gfs2_sbd *sdp = dip->i_sbd;
457 struct filldir_reg fdr;
458 unsigned int entries, size;
459 struct filldir_bad *fdb;
460 struct gfs2_holder d_gh;
461 uint64_t offset = file->f_pos;
462 unsigned int x;
463 struct filldir_bad_entry *fbe;
464 int error;
465
466 entries = gfs2_tune_get(sdp, gt_entries_per_readdir);
467 size = sizeof(struct filldir_bad) +
468 entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE);
469
470 fdb = kzalloc(size, GFP_KERNEL);
471 if (!fdb)
472 return -ENOMEM;
473
474 fdb->fdb_sbd = sdp;
475 fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1);
476 fdb->fdb_entry_num = entries;
477 fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) +
478 entries * sizeof(struct filldir_bad_entry);
479 fdb->fdb_name_size = entries * GFS2_FAST_NAME_SIZE;
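
	/*
	 * Layout of the single allocation set up above:
	 *
	 *   fdb            -> struct filldir_bad
	 *   fdb->fdb_entry -> array of `entries' filldir_bad_entry structs
	 *   fdb->fdb_name  -> name area, GFS2_FAST_NAME_SIZE bytes per entry
	 */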
480
481 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
482 error = gfs2_glock_nq_atime(&d_gh);
483 if (error) {
484 gfs2_holder_uninit(&d_gh);
485 goto out;
486 }
487
488 error = gfs2_dir_read(dir, &offset, fdb, filldir_bad_func);
489
490 gfs2_glock_dq_uninit(&d_gh);
491
492 fdr.fdr_sbd = sdp;
493 fdr.fdr_prefetch = 0;
494 fdr.fdr_filldir = filldir;
495 fdr.fdr_opaque = dirent;
496
497 for (x = 0; x < fdb->fdb_entry_off; x++) {
498 fbe = &fdb->fdb_entry[x];
499
500 error = filldir_reg_func(&fdr,
501 fbe->fbe_name, fbe->fbe_length,
502 fbe->fbe_offset,
503 &fbe->fbe_inum, fbe->fbe_type);
504 if (error) {
505 file->f_pos = fbe->fbe_offset;
506 error = 0;
507 goto out;
508 }
509 }
510
511 file->f_pos = offset;
512
513 out:
514 kfree(fdb);
515
516 return error;
517}
518
519/**
520 * gfs2_readdir - Read directory entries from a directory
521 * @file: The directory to read from
522 * @dirent: Buffer for dirents
523 * @filldir: Function used to do the copying
524 *
525 * Returns: errno
526 */
527
528static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
529{
530 int error;
531
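	/* nfsd's filldir callback may re-enter the filesystem, so route it
	   through the staging variant (see readdir_bad() above) */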
532 if (strcmp(current->comm, "nfsd") != 0)
533 error = readdir_reg(file, dirent, filldir);
534 else
535 error = readdir_bad(file, dirent, filldir);
536
537 return error;
538}
539
540static const u32 iflags_to_gfs2[32] = {
541 [iflag_Sync] = GFS2_DIF_SYNC,
542 [iflag_Immutable] = GFS2_DIF_IMMUTABLE,
543 [iflag_Append] = GFS2_DIF_APPENDONLY,
544 [iflag_NoAtime] = GFS2_DIF_NOATIME,
545 [iflag_Index] = GFS2_DIF_EXHASH,
546 [iflag_JournalData] = GFS2_DIF_JDATA,
547 [iflag_DirectIO] = GFS2_DIF_DIRECTIO,
548 [iflag_InheritDirectIO] = GFS2_DIF_INHERIT_DIRECTIO,
549 [iflag_InheritJdata] = GFS2_DIF_INHERIT_JDATA,
550};
551
552static const u32 gfs2_to_iflags[32] = {
553 [gfs2fl_Sync] = IFLAG_SYNC,
554 [gfs2fl_Immutable] = IFLAG_IMMUTABLE,
555 [gfs2fl_AppendOnly] = IFLAG_APPEND,
556 [gfs2fl_NoAtime] = IFLAG_NOATIME,
557 [gfs2fl_ExHash] = IFLAG_INDEX,
558 [gfs2fl_Jdata] = IFLAG_JOURNAL_DATA,
559 [gfs2fl_Directio] = IFLAG_DIRECTIO,
560 [gfs2fl_InheritDirectio] = IFLAG_INHERITDIRECTIO,
561 [gfs2fl_InheritJdata] = IFLAG_INHERITJDATA,
562};
563
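iflags_cvt() comes from linux/iflags.h (included above), outside this
hunk. A plausible sketch, given how the two tables above are used: walk
the 32 bit positions of val and OR in the table entry for every set bit.

	/* Sketch; the real helper lives in linux/iflags.h. */
	static inline u32 iflags_cvt(const u32 *table, u32 val)
	{
		u32 res = 0;
		int i;

		for (i = 0; i < 32; i++)
			if (val & (1U << i))
				res |= table[i];
		return res;
	}
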
564static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
565{
566 struct inode *inode = filp->f_dentry->d_inode;
567 struct gfs2_inode *ip = inode->u.generic_ip;
568 struct gfs2_holder gh;
569 int error;
570 u32 iflags;
571
572 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
573 error = gfs2_glock_nq_m_atime(1, &gh);
574 if (error)
575 return error;
576
577 iflags = iflags_cvt(gfs2_to_iflags, ip->i_di.di_flags);
578 if (put_user(iflags, ptr))
579 error = -EFAULT;
580
581 gfs2_glock_dq_m(1, &gh);
582 gfs2_holder_uninit(&gh);
583 return error;
584}
585
586/* Flags that can be set by user space */
587#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
588 GFS2_DIF_DIRECTIO| \
589 GFS2_DIF_IMMUTABLE| \
590 GFS2_DIF_APPENDONLY| \
591 GFS2_DIF_NOATIME| \
592 GFS2_DIF_SYNC| \
593 GFS2_DIF_SYSTEM| \
594 GFS2_DIF_INHERIT_DIRECTIO| \
595 GFS2_DIF_INHERIT_JDATA)
596
597/**
598 * do_gfs2_set_flags - set flags on an inode
599 * @filp: file pointer
600 * @reqflags: The flags to set
601 * @mask: Indicates which flags are valid
602 *
603 */
604static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
605{
606 struct inode *inode = filp->f_dentry->d_inode;
607 struct gfs2_inode *ip = inode->u.generic_ip;
608 struct gfs2_sbd *sdp = ip->i_sbd;
609 struct buffer_head *bh;
610 struct gfs2_holder gh;
611 int error;
612 u32 new_flags, flags;
613
614	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
616 if (error)
617 return error;
618
619 flags = ip->i_di.di_flags;
620 new_flags = (flags & ~mask) | (reqflags & mask);
621 if ((new_flags ^ flags) == 0)
622 goto out;
623
624 error = -EINVAL;
625 if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
626 goto out;
627
628 if (S_ISDIR(inode->i_mode)) {
629 if ((new_flags ^ flags) & (GFS2_DIF_JDATA | GFS2_DIF_DIRECTIO))
630 goto out;
631 } else if (S_ISREG(inode->i_mode)) {
632 if ((new_flags ^ flags) & (GFS2_DIF_INHERIT_DIRECTIO|
633 GFS2_DIF_INHERIT_JDATA))
634 goto out;
635 } else
636 goto out;
637
638 error = -EPERM;
639 if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
640 goto out;
641 if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
642 goto out;
643 error = gfs2_repermission(inode, MAY_WRITE, NULL);
644 if (error)
645 goto out;
646
647 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
648 if (error)
649 goto out;
650 error = gfs2_meta_inode_buffer(ip, &bh);
651 if (error)
652 goto out_trans_end;
653 gfs2_trans_add_bh(ip->i_gl, bh, 1);
654 ip->i_di.di_flags = new_flags;
655 gfs2_dinode_out(&ip->i_di, bh->b_data);
656 brelse(bh);
657out_trans_end:
658 gfs2_trans_end(sdp);
659out:
660 gfs2_glock_dq_uninit(&gh);
661 return error;
662}
663
664static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
665{
666 u32 iflags, gfsflags;
667 if (get_user(iflags, ptr))
668 return -EFAULT;
669 gfsflags = iflags_cvt(iflags_to_gfs2, iflags);
670 return do_gfs2_set_flags(filp, gfsflags, ~0);
671}
672
673static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
674{
675 switch(cmd) {
676 case IFLAGS_GET_IOC:
677 return gfs2_get_flags(filp, (u32 __user *)arg);
678 case IFLAGS_SET_IOC:
679 return gfs2_set_flags(filp, (u32 __user *)arg);
680 }
681 return -ENOTTY;
682}
683
685/**
686 * gfs2_mmap -
687 * @file: The file to map
688 * @vma: The VMA which described the mapping
689 *
690 * Returns: 0 or error code
691 */
692
693static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
694{
695 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
696 struct gfs2_holder i_gh;
697 int error;
698
699 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
700 error = gfs2_glock_nq_atime(&i_gh);
701 if (error) {
702 gfs2_holder_uninit(&i_gh);
703 return error;
704 }
705
706 /* This is VM_MAYWRITE instead of VM_WRITE because a call
707 to mprotect() can turn on VM_WRITE later. */
708
709 if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
710 (VM_MAYSHARE | VM_MAYWRITE))
711 vma->vm_ops = &gfs2_vm_ops_sharewrite;
712 else
713 vma->vm_ops = &gfs2_vm_ops_private;
714
715 gfs2_glock_dq_uninit(&i_gh);
716
717 return error;
718}
719
720/**
721 * gfs2_open - open a file
722 * @inode: the inode to open
723 * @file: the struct file for this opening
724 *
725 * Returns: errno
726 */
727
728static int gfs2_open(struct inode *inode, struct file *file)
729{
730 struct gfs2_inode *ip = inode->u.generic_ip;
731 struct gfs2_holder i_gh;
732 struct gfs2_file *fp;
733 int error;
734
735 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
736 if (!fp)
737 return -ENOMEM;
738
739 mutex_init(&fp->f_fl_mutex);
740
741 gfs2_assert_warn(ip->i_sbd, !file->private_data);
742 file->private_data = fp;
743
744 if (S_ISREG(ip->i_di.di_mode)) {
745 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
746 &i_gh);
747 if (error)
748 goto fail;
749
750 if (!(file->f_flags & O_LARGEFILE) &&
751 ip->i_di.di_size > MAX_NON_LFS) {
752 error = -EFBIG;
753 goto fail_gunlock;
754 }
755
756 /* Listen to the Direct I/O flag */
757
758 if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
759 file->f_flags |= O_DIRECT;
760
761 gfs2_glock_dq_uninit(&i_gh);
762 }
763
764 return 0;
765
766 fail_gunlock:
767 gfs2_glock_dq_uninit(&i_gh);
768
769 fail:
770 file->private_data = NULL;
771 kfree(fp);
772
773 return error;
774}
775
776/**
777 * gfs2_close - called to close a struct file
778 * @inode: the inode the struct file belongs to
779 * @file: the struct file being closed
780 *
781 * Returns: errno
782 */
783
784static int gfs2_close(struct inode *inode, struct file *file)
785{
786 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
787 struct gfs2_file *fp;
788
789 fp = file->private_data;
790 file->private_data = NULL;
791
792 if (gfs2_assert_warn(sdp, fp))
793 return -EIO;
794
795 kfree(fp);
796
797 return 0;
798}
799
800/**
801 * gfs2_fsync - sync the dirty data for a file (across the cluster)
802 * @file: the file that points to the dentry (we ignore this)
803 * @dentry: the dentry that points to the inode to sync
804 *
805 * Returns: errno
806 */
807
808static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
809{
810 struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
811
812 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
813
814 return 0;
815}
816
817/**
818 * gfs2_lock - acquire/release a posix lock on a file
819 * @file: the file pointer
820 * @cmd: either modify or retrieve lock state, possibly wait
821 * @fl: type and range of lock
822 *
823 * Returns: errno
824 */
825
826static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
827{
828 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
829 struct gfs2_sbd *sdp = ip->i_sbd;
830 struct lm_lockname name =
831 { .ln_number = ip->i_num.no_addr,
832 .ln_type = LM_TYPE_PLOCK };
833
834 if (!(fl->fl_flags & FL_POSIX))
835 return -ENOLCK;
836 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
837 return -ENOLCK;
838
839 if (sdp->sd_args.ar_localflocks) {
840 if (IS_GETLK(cmd)) {
841 struct file_lock tmp;
842 int ret;
843 ret = posix_test_lock(file, fl, &tmp);
844 fl->fl_type = F_UNLCK;
845 if (ret)
846 memcpy(fl, &tmp, sizeof(struct file_lock));
847 return 0;
848 } else {
849 return posix_lock_file_wait(file, fl);
850 }
851 }
852
853 if (IS_GETLK(cmd))
854 return gfs2_lm_plock_get(sdp, &name, file, fl);
855 else if (fl->fl_type == F_UNLCK)
856 return gfs2_lm_punlock(sdp, &name, file, fl);
857 else
858 return gfs2_lm_plock(sdp, &name, file, cmd, fl);
859}
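
For reference, the plock path above corresponds to ordinary fcntl(2) byte-range locking in userspace; a minimal sketch:

#include <fcntl.h>

/* Take a blocking write lock on the first byte of the file.  On a
 * cluster mount this reaches gfs2_lm_plock(); with -o localflocks it
 * stays node-local via posix_lock_file_wait(). */
static int lock_first_byte(int fd)
{
	struct flock fl = {
		.l_type   = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start  = 0,
		.l_len    = 1,
	};

	return fcntl(fd, F_SETLKW, &fl);
}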
860
861/**
 862 * gfs2_sendfile - Send bytes to a file or socket
 863 * @in_file: The file to read from
 864 * @offset: The beginning file offset; updated by the number of bytes read
 865 * @count: The amount of data
 866 * @actor: The function that actually sends the data
 867 * @target: The opaque destination passed to @actor
 868 *
 869 * Returns: The number of bytes sent, errno on failure
 870 */
872
873static ssize_t gfs2_sendfile(struct file *in_file, loff_t *offset, size_t count,
874 read_actor_t actor, void *target)
875{
876 return generic_file_sendfile(in_file, offset, count, actor, target);
877}
878
879static int do_flock(struct file *file, int cmd, struct file_lock *fl)
880{
881 struct gfs2_file *fp = file->private_data;
882 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
883 struct gfs2_inode *ip = file->f_dentry->d_inode->u.generic_ip;
884 struct gfs2_glock *gl;
885 unsigned int state;
886 int flags;
887 int error = 0;
888
889 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
890 flags = ((IS_SETLKW(cmd)) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
891
892 mutex_lock(&fp->f_fl_mutex);
893
894 gl = fl_gh->gh_gl;
895 if (gl) {
896 if (fl_gh->gh_state == state)
897 goto out;
898 gfs2_glock_hold(gl);
899 flock_lock_file_wait(file,
900 &(struct file_lock){.fl_type = F_UNLCK});
901 gfs2_glock_dq_uninit(fl_gh);
902 } else {
903 error = gfs2_glock_get(ip->i_sbd,
904 ip->i_num.no_addr, &gfs2_flock_glops,
905 CREATE, &gl);
906 if (error)
907 goto out;
908 }
909
910 gfs2_holder_init(gl, state, flags, fl_gh);
911 gfs2_glock_put(gl);
912
913 error = gfs2_glock_nq(fl_gh);
914 if (error) {
915 gfs2_holder_uninit(fl_gh);
916 if (error == GLR_TRYFAILED)
917 error = -EAGAIN;
918 } else {
919 error = flock_lock_file_wait(file, fl);
920 gfs2_assert_warn(ip->i_sbd, !error);
921 }
922
923 out:
924 mutex_unlock(&fp->f_fl_mutex);
925
926 return error;
927}
928
929static void do_unflock(struct file *file, struct file_lock *fl)
930{
931 struct gfs2_file *fp = file->private_data;
932 struct gfs2_holder *fl_gh = &fp->f_fl_gh;
933
934 mutex_lock(&fp->f_fl_mutex);
935 flock_lock_file_wait(file, fl);
936 if (fl_gh->gh_gl)
937 gfs2_glock_dq_uninit(fl_gh);
938 mutex_unlock(&fp->f_fl_mutex);
939}
940
941/**
942 * gfs2_flock - acquire/release a flock lock on a file
943 * @file: the file pointer
944 * @cmd: either modify or retrieve lock state, possibly wait
945 * @fl: type and range of lock
946 *
947 * Returns: errno
948 */
949
950static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
951{
952 struct gfs2_inode *ip = file->f_mapping->host->u.generic_ip;
953 struct gfs2_sbd *sdp = ip->i_sbd;
954
955 if (!(fl->fl_flags & FL_FLOCK))
956 return -ENOLCK;
957 if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
958 return -ENOLCK;
959
960 if (sdp->sd_args.ar_localflocks)
961 return flock_lock_file_wait(file, fl);
962
963 if (fl->fl_type == F_UNLCK) {
964 do_unflock(file, fl);
965 return 0;
966 } else
967 return do_flock(file, cmd, fl);
968}
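
And the corresponding flock(2) usage: a non-blocking request arrives with cmd F_SETLK, taking the LM_FLAG_TRY path in do_flock(), which surfaces GLR_TRYFAILED as -EAGAIN. Sketch:

#include <sys/file.h>

/* Non-blocking exclusive flock; EAGAIN here corresponds to
 * GLR_TRYFAILED in do_flock() above. */
static int try_flock(int fd)
{
	if (flock(fd, LOCK_EX | LOCK_NB) < 0)
		return -1;
	/* ... critical section ... */
	return flock(fd, LOCK_UN);
}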
969
970struct file_operations gfs2_file_fops = {
971 .llseek = gfs2_llseek,
972 .read = gfs2_read,
973 .readv = gfs2_file_readv,
974 .aio_read = gfs2_file_aio_read,
975 .write = generic_file_write,
976 .writev = generic_file_writev,
977 .aio_write = generic_file_aio_write,
978 .unlocked_ioctl = gfs2_ioctl,
979 .mmap = gfs2_mmap,
980 .open = gfs2_open,
981 .release = gfs2_close,
982 .fsync = gfs2_fsync,
983 .lock = gfs2_lock,
984 .sendfile = gfs2_sendfile,
985 .flock = gfs2_flock,
986 .splice_read = generic_file_splice_read,
987 .splice_write = generic_file_splice_write,
988};
989
990struct file_operations gfs2_dir_fops = {
991 .readdir = gfs2_readdir,
992 .unlocked_ioctl = gfs2_ioctl,
993 .open = gfs2_open,
994 .release = gfs2_close,
995 .fsync = gfs2_fsync,
996 .lock = gfs2_lock,
997 .flock = gfs2_flock,
998};
999
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..192577b411f0
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FILE_DOT_H__
11#define __OPS_FILE_DOT_H__
12extern struct file gfs2_internal_file_sentinal;
13extern int gfs2_internal_read(struct gfs2_inode *ip,
14 struct file_ra_state *ra_state,
15 char *buf, loff_t *pos, unsigned size);
16
17extern struct file_operations gfs2_file_fops;
18extern struct file_operations gfs2_dir_fops;
19
20#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..70745f3561a6
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,904 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/vmalloc.h>
16#include <linux/blkdev.h>
17#include <linux/kthread.h>
18#include <linux/gfs2_ondisk.h>
19#include <asm/semaphore.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "daemon.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "lm.h"
29#include "mount.h"
30#include "ops_export.h"
31#include "ops_fstype.h"
32#include "ops_super.h"
33#include "recovery.h"
34#include "rgrp.h"
35#include "super.h"
36#include "unlinked.h"
37#include "sys.h"
38#include "util.h"
39
40#define DO 0
41#define UNDO 1
42
43static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{
45 struct gfs2_sbd *sdp;
46 unsigned int x;
47
48 sdp = vmalloc(sizeof(struct gfs2_sbd));
49 if (!sdp)
50 return NULL;
51
52 memset(sdp, 0, sizeof(struct gfs2_sbd));
53
54 sb->s_fs_info = sdp;
55 sdp->sd_vfs = sb;
56
57 gfs2_tune_init(&sdp->sd_tune);
58
59 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
60 sdp->sd_gl_hash[x].hb_lock = RW_LOCK_UNLOCKED;
61 INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list);
62 }
63 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
64 spin_lock_init(&sdp->sd_reclaim_lock);
65 init_waitqueue_head(&sdp->sd_reclaim_wq);
66 mutex_init(&sdp->sd_invalidate_inodes_mutex);
67
68 mutex_init(&sdp->sd_inum_mutex);
69 spin_lock_init(&sdp->sd_statfs_spin);
70 mutex_init(&sdp->sd_statfs_mutex);
71
72 spin_lock_init(&sdp->sd_rindex_spin);
73 mutex_init(&sdp->sd_rindex_mutex);
74 INIT_LIST_HEAD(&sdp->sd_rindex_list);
75 INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
76 INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
77
78 INIT_LIST_HEAD(&sdp->sd_jindex_list);
79 spin_lock_init(&sdp->sd_jindex_spin);
80 mutex_init(&sdp->sd_jindex_mutex);
81
82 INIT_LIST_HEAD(&sdp->sd_unlinked_list);
83 spin_lock_init(&sdp->sd_unlinked_spin);
84 mutex_init(&sdp->sd_unlinked_mutex);
85
86 INIT_LIST_HEAD(&sdp->sd_quota_list);
87 spin_lock_init(&sdp->sd_quota_spin);
88 mutex_init(&sdp->sd_quota_mutex);
89
90 spin_lock_init(&sdp->sd_log_lock);
91
92 INIT_LIST_HEAD(&sdp->sd_log_le_gl);
93 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
94 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
95 INIT_LIST_HEAD(&sdp->sd_log_le_rg);
96 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
97
98 mutex_init(&sdp->sd_log_reserve_mutex);
99 INIT_LIST_HEAD(&sdp->sd_ail1_list);
100 INIT_LIST_HEAD(&sdp->sd_ail2_list);
101
102 init_rwsem(&sdp->sd_log_flush_lock);
103 INIT_LIST_HEAD(&sdp->sd_log_flush_list);
104
105 INIT_LIST_HEAD(&sdp->sd_revoke_list);
106
107 mutex_init(&sdp->sd_freeze_lock);
108
109 return sdp;
110}
111
112static void init_vfs(struct super_block *sb, unsigned noatime)
113{
114 struct gfs2_sbd *sdp = sb->s_fs_info;
115
116 sb->s_magic = GFS2_MAGIC;
117 sb->s_op = &gfs2_super_ops;
118 sb->s_export_op = &gfs2_export_ops;
119 sb->s_maxbytes = MAX_LFS_FILESIZE;
120
121 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
122 set_bit(noatime, &sdp->sd_flags);
123
124 /* Don't let the VFS update atimes. GFS2 handles this itself. */
125 sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
126}
127
128static int init_names(struct gfs2_sbd *sdp, int silent)
129{
130 struct gfs2_sb *sb = NULL;
131 char *proto, *table;
132 int error = 0;
133
134 proto = sdp->sd_args.ar_lockproto;
135 table = sdp->sd_args.ar_locktable;
136
137 /* Try to autodetect */
138
139 if (!proto[0] || !table[0]) {
140 struct buffer_head *bh;
141 bh = sb_getblk(sdp->sd_vfs,
142 GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
143 lock_buffer(bh);
144 clear_buffer_uptodate(bh);
145 clear_buffer_dirty(bh);
146 unlock_buffer(bh);
147 ll_rw_block(READ, 1, &bh);
148 wait_on_buffer(bh);
149
150 if (!buffer_uptodate(bh)) {
151 brelse(bh);
152 return -EIO;
153 }
154
155 sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL);
156 if (!sb) {
157 brelse(bh);
158 return -ENOMEM;
159 }
160 gfs2_sb_in(sb, bh->b_data);
161 brelse(bh);
162
163 error = gfs2_check_sb(sdp, sb, silent);
164 if (error)
165 goto out;
166
167 if (!proto[0])
168 proto = sb->sb_lockproto;
169 if (!table[0])
170 table = sb->sb_locktable;
171 }
172
173 if (!table[0])
174 table = sdp->sd_vfs->s_id;
175
176 snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
177 snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
178
179 out:
180 kfree(sb);
181
182 return error;
183}
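
In other words, the lock protocol and table names are taken from the mount options when given and autodetected from the on-disk superblock otherwise. An illustrative invocation overriding both (cluster and filesystem names are placeholders):

	mount -t gfs2 -o lockproto=lock_dlm,locktable=mycluster:myfs /dev/sdb1 /mnt/gfs2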
184
185static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
186 int undo)
187{
188 struct task_struct *p;
189 int error = 0;
190
191 if (undo)
192 goto fail_trans;
193
194 p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
 195 error = IS_ERR(p) ? PTR_ERR(p) : 0;
196 if (error) {
197 fs_err(sdp, "can't start scand thread: %d\n", error);
198 return error;
199 }
200 sdp->sd_scand_process = p;
201
202 for (sdp->sd_glockd_num = 0;
203 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
204 sdp->sd_glockd_num++) {
205 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
 206 error = IS_ERR(p) ? PTR_ERR(p) : 0;
207 if (error) {
208 fs_err(sdp, "can't start glockd thread: %d\n", error);
209 goto fail;
210 }
211 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
212 }
213
214 error = gfs2_glock_nq_num(sdp,
215 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
216 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
217 mount_gh);
218 if (error) {
219 fs_err(sdp, "can't acquire mount glock: %d\n", error);
220 goto fail;
221 }
222
223 error = gfs2_glock_nq_num(sdp,
224 GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
225 LM_ST_SHARED,
226 LM_FLAG_NOEXP | GL_EXACT | GL_NEVER_RECURSE,
227 &sdp->sd_live_gh);
228 if (error) {
229 fs_err(sdp, "can't acquire live glock: %d\n", error);
230 goto fail_mount;
231 }
232
233 error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
234 CREATE, &sdp->sd_rename_gl);
235 if (error) {
236 fs_err(sdp, "can't create rename glock: %d\n", error);
237 goto fail_live;
238 }
239
240 error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
241 CREATE, &sdp->sd_trans_gl);
242 if (error) {
243 fs_err(sdp, "can't create transaction glock: %d\n", error);
244 goto fail_rename;
245 }
246 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
247
248 return 0;
249
250 fail_trans:
251 gfs2_glock_put(sdp->sd_trans_gl);
252
253 fail_rename:
254 gfs2_glock_put(sdp->sd_rename_gl);
255
256 fail_live:
257 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
258
259 fail_mount:
260 gfs2_glock_dq_uninit(mount_gh);
261
262 fail:
263 while (sdp->sd_glockd_num--)
264 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
265
266 kthread_stop(sdp->sd_scand_process);
267
268 return error;
269}
270
271static struct inode *gfs2_lookup_root(struct gfs2_sbd *sdp,
272 const struct gfs2_inum *inum)
273{
274 int error;
275 struct gfs2_glock *gl;
276 struct gfs2_inode *ip;
277 struct inode *inode;
278
279 error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops,
280 CREATE, &gl);
281 if (!error) {
282 error = gfs2_inode_get(gl, inum, CREATE, &ip);
283 if (!error) {
284 gfs2_inode_min_init(ip, DT_DIR);
285 inode = gfs2_ip2v(ip);
286 gfs2_inode_put(ip);
287 gfs2_glock_put(gl);
288 return inode;
289 }
290 gfs2_glock_put(gl);
291 }
292 return ERR_PTR(error);
293}
294
295static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
296{
297 struct super_block *sb = sdp->sd_vfs;
298 struct gfs2_holder sb_gh;
299 struct gfs2_inum *inum;
300 struct inode *inode;
301 int error = 0;
302
 303 if (undo)
 304 return 0;
306
307 error = gfs2_glock_nq_num(sdp,
308 GFS2_SB_LOCK, &gfs2_meta_glops,
309 LM_ST_SHARED, 0, &sb_gh);
310 if (error) {
311 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
312 return error;
313 }
314
315 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
316 if (error) {
317 fs_err(sdp, "can't read superblock: %d\n", error);
318 goto out;
319 }
320
321 /* Set up the buffer cache and SB for real */
322 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
323 error = -EINVAL;
324 fs_err(sdp, "FS block size (%u) is too small for device "
325 "block size (%u)\n",
326 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
327 goto out;
328 }
329 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
330 error = -EINVAL;
331 fs_err(sdp, "FS block size (%u) is too big for machine "
332 "page size (%u)\n",
333 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
334 goto out;
335 }
336
337 /* Get rid of buffers from the original block size */
338 sb_gh.gh_gl->gl_ops->go_inval(sb_gh.gh_gl, DIO_METADATA | DIO_DATA);
339 sb_gh.gh_gl->gl_aspace->i_blkbits = sdp->sd_sb.sb_bsize_shift;
340
341 sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
342
343 /* Get the root inode */
344 inum = &sdp->sd_sb.sb_root_dir;
345 if (sb->s_type == &gfs2meta_fs_type)
346 inum = &sdp->sd_sb.sb_master_dir;
347 inode = gfs2_lookup_root(sdp, inum);
348 if (IS_ERR(inode)) {
349 error = PTR_ERR(inode);
350 fs_err(sdp, "can't read in root inode: %d\n", error);
351 goto out;
352 }
353
354 sb->s_root = d_alloc_root(inode);
355 if (!sb->s_root) {
356 fs_err(sdp, "can't get root dentry\n");
357 error = -ENOMEM;
358 iput(inode);
359 }
360out:
361 gfs2_glock_dq_uninit(&sb_gh);
362 return error;
363}
364
365static int init_journal(struct gfs2_sbd *sdp, int undo)
366{
367 struct gfs2_holder ji_gh;
368 struct task_struct *p;
369 struct gfs2_inode *ip;
370 int jindex = 1;
371 int error = 0;
372
373 if (undo) {
374 jindex = 0;
375 goto fail_recoverd;
376 }
377
378 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
379 if (IS_ERR(sdp->sd_jindex)) {
380 fs_err(sdp, "can't lookup journal index: %d\n", error);
381 return PTR_ERR(sdp->sd_jindex);
382 }
383 ip = sdp->sd_jindex->u.generic_ip;
384 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
385
386 /* Load in the journal index special file */
387
388 error = gfs2_jindex_hold(sdp, &ji_gh);
389 if (error) {
390 fs_err(sdp, "can't read journal index: %d\n", error);
391 goto fail;
392 }
393
394 error = -EINVAL;
395 if (!gfs2_jindex_size(sdp)) {
396 fs_err(sdp, "no journals!\n");
397 goto fail_jindex;
398 }
399
400 if (sdp->sd_args.ar_spectator) {
401 sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
402 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
403 } else {
404 if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
405 fs_err(sdp, "can't mount journal #%u\n",
406 sdp->sd_lockstruct.ls_jid);
407 fs_err(sdp, "there are only %u journals (0 - %u)\n",
408 gfs2_jindex_size(sdp),
409 gfs2_jindex_size(sdp) - 1);
410 goto fail_jindex;
411 }
412 sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
413
414 error = gfs2_glock_nq_num(sdp,
415 sdp->sd_lockstruct.ls_jid,
416 &gfs2_journal_glops,
417 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
418 &sdp->sd_journal_gh);
419 if (error) {
420 fs_err(sdp, "can't acquire journal glock: %d\n", error);
421 goto fail_jindex;
422 }
423
424 ip = sdp->sd_jdesc->jd_inode->u.generic_ip;
425 error = gfs2_glock_nq_init(ip->i_gl,
426 LM_ST_SHARED,
427 LM_FLAG_NOEXP | GL_EXACT,
428 &sdp->sd_jinode_gh);
429 if (error) {
430 fs_err(sdp, "can't acquire journal inode glock: %d\n",
431 error);
432 goto fail_journal_gh;
433 }
434
435 error = gfs2_jdesc_check(sdp->sd_jdesc);
436 if (error) {
437 fs_err(sdp, "my journal (%u) is bad: %d\n",
438 sdp->sd_jdesc->jd_jid, error);
439 goto fail_jinode_gh;
440 }
441 sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
442 }
443
444 if (sdp->sd_lockstruct.ls_first) {
445 unsigned int x;
446 for (x = 0; x < sdp->sd_journals; x++) {
447 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
448 if (error) {
449 fs_err(sdp, "error recovering journal %u: %d\n",
450 x, error);
451 goto fail_jinode_gh;
452 }
453 }
454
455 gfs2_lm_others_may_mount(sdp);
456 } else if (!sdp->sd_args.ar_spectator) {
457 error = gfs2_recover_journal(sdp->sd_jdesc);
458 if (error) {
459 fs_err(sdp, "error recovering my journal: %d\n", error);
460 goto fail_jinode_gh;
461 }
462 }
463
464 set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
465 gfs2_glock_dq_uninit(&ji_gh);
466 jindex = 0;
467
468 /* Disown my Journal glock */
469
470 sdp->sd_journal_gh.gh_owner = NULL;
471 sdp->sd_jinode_gh.gh_owner = NULL;
472
473 p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
 474 error = IS_ERR(p) ? PTR_ERR(p) : 0;
475 if (error) {
476 fs_err(sdp, "can't start recoverd thread: %d\n", error);
477 goto fail_jinode_gh;
478 }
479 sdp->sd_recoverd_process = p;
480
481 return 0;
482
483 fail_recoverd:
484 kthread_stop(sdp->sd_recoverd_process);
485
486 fail_jinode_gh:
487 if (!sdp->sd_args.ar_spectator)
488 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
489
490 fail_journal_gh:
491 if (!sdp->sd_args.ar_spectator)
492 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
493
494 fail_jindex:
495 gfs2_jindex_free(sdp);
496 if (jindex)
497 gfs2_glock_dq_uninit(&ji_gh);
498
499 fail:
500 iput(sdp->sd_jindex);
501
502 return error;
503}
504
505
506static int init_inodes(struct gfs2_sbd *sdp, int undo)
507{
508 int error = 0;
509 struct gfs2_inode *ip;
510 struct inode *inode;
511
512 if (undo)
513 goto fail_qinode;
514
515 inode = gfs2_lookup_root(sdp, &sdp->sd_sb.sb_master_dir);
516 if (IS_ERR(inode)) {
517 error = PTR_ERR(inode);
518 fs_err(sdp, "can't read in master directory: %d\n", error);
519 goto fail;
520 }
521 sdp->sd_master_dir = inode;
522
523 error = init_journal(sdp, undo);
524 if (error)
525 goto fail_master;
526
 527 /* Read in the master inode-number ("inum") allocation inode */
528 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
529 if (IS_ERR(sdp->sd_inum_inode)) {
530 error = PTR_ERR(sdp->sd_inum_inode);
531 fs_err(sdp, "can't read in inum inode: %d\n", error);
532 goto fail_journal;
533 }
534
535
536 /* Read in the master statfs inode */
537 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
538 if (IS_ERR(sdp->sd_statfs_inode)) {
539 error = PTR_ERR(sdp->sd_statfs_inode);
540 fs_err(sdp, "can't read in statfs inode: %d\n", error);
541 goto fail_inum;
542 }
543
544 /* Read in the resource index inode */
545 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
546 if (IS_ERR(sdp->sd_rindex)) {
547 error = PTR_ERR(sdp->sd_rindex);
548 fs_err(sdp, "can't get resource index inode: %d\n", error);
549 goto fail_statfs;
550 }
551 ip = sdp->sd_rindex->u.generic_ip;
552 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
553 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
554
555 /* Read in the quota inode */
556 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
557 if (IS_ERR(sdp->sd_quota_inode)) {
558 error = PTR_ERR(sdp->sd_quota_inode);
559 fs_err(sdp, "can't get quota file inode: %d\n", error);
560 goto fail_rindex;
561 }
562 return 0;
563
564fail_qinode:
565 iput(sdp->sd_quota_inode);
566
567fail_rindex:
568 gfs2_clear_rgrpd(sdp);
569 iput(sdp->sd_rindex);
570
571fail_statfs:
572 iput(sdp->sd_statfs_inode);
573
574fail_inum:
575 iput(sdp->sd_inum_inode);
576fail_journal:
577 init_journal(sdp, UNDO);
578fail_master:
579 iput(sdp->sd_master_dir);
580fail:
581 return error;
582}
583
584static int init_per_node(struct gfs2_sbd *sdp, int undo)
585{
586 struct inode *pn = NULL;
587 char buf[30];
588 int error = 0;
589 struct gfs2_inode *ip;
590
591 if (sdp->sd_args.ar_spectator)
592 return 0;
593
594 if (undo)
595 goto fail_qc_gh;
596
597 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
598 if (IS_ERR(pn)) {
599 error = PTR_ERR(pn);
600 fs_err(sdp, "can't find per_node directory: %d\n", error);
601 return error;
602 }
603
604 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
605 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
606 if (IS_ERR(sdp->sd_ir_inode)) {
607 error = PTR_ERR(sdp->sd_ir_inode);
608 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
609 goto fail;
610 }
611
612 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
613 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
614 if (IS_ERR(sdp->sd_sc_inode)) {
615 error = PTR_ERR(sdp->sd_sc_inode);
616 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
617 goto fail_ir_i;
618 }
619
620 sprintf(buf, "unlinked_tag%u", sdp->sd_jdesc->jd_jid);
621 sdp->sd_ut_inode = gfs2_lookup_simple(pn, buf);
622 if (IS_ERR(sdp->sd_ut_inode)) {
623 error = PTR_ERR(sdp->sd_ut_inode);
624 fs_err(sdp, "can't find local \"ut\" file: %d\n", error);
625 goto fail_sc_i;
626 }
627
628 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
629 sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
630 if (IS_ERR(sdp->sd_qc_inode)) {
631 error = PTR_ERR(sdp->sd_qc_inode);
632 fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
633 goto fail_ut_i;
634 }
635
636 iput(pn);
637 pn = NULL;
638
639 ip = sdp->sd_ir_inode->u.generic_ip;
640 error = gfs2_glock_nq_init(ip->i_gl,
641 LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
642 &sdp->sd_ir_gh);
643 if (error) {
644 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
645 goto fail_qc_i;
646 }
647
648 ip = sdp->sd_sc_inode->u.generic_ip;
649 error = gfs2_glock_nq_init(ip->i_gl,
650 LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
651 &sdp->sd_sc_gh);
652 if (error) {
653 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
654 goto fail_ir_gh;
655 }
656
657 ip = sdp->sd_ut_inode->u.generic_ip;
658 error = gfs2_glock_nq_init(ip->i_gl,
659 LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
660 &sdp->sd_ut_gh);
661 if (error) {
662 fs_err(sdp, "can't lock local \"ut\" file: %d\n", error);
663 goto fail_sc_gh;
664 }
665
666 ip = sdp->sd_qc_inode->u.generic_ip;
667 error = gfs2_glock_nq_init(ip->i_gl,
668 LM_ST_EXCLUSIVE, GL_NEVER_RECURSE,
669 &sdp->sd_qc_gh);
670 if (error) {
671 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
672 goto fail_ut_gh;
673 }
674
675 return 0;
676
677 fail_qc_gh:
678 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
679
680 fail_ut_gh:
681 gfs2_glock_dq_uninit(&sdp->sd_ut_gh);
682
683 fail_sc_gh:
684 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
685
686 fail_ir_gh:
687 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
688
689 fail_qc_i:
690 iput(sdp->sd_qc_inode);
691
692 fail_ut_i:
693 iput(sdp->sd_ut_inode);
694
695 fail_sc_i:
696 iput(sdp->sd_sc_inode);
697
698 fail_ir_i:
699 iput(sdp->sd_ir_inode);
700
701 fail:
702 if (pn)
703 iput(pn);
704 return error;
705}
706
707static int init_threads(struct gfs2_sbd *sdp, int undo)
708{
709 struct task_struct *p;
710 int error = 0;
711
712 if (undo)
713 goto fail_inoded;
714
715 sdp->sd_log_flush_time = jiffies;
716 sdp->sd_jindex_refresh_time = jiffies;
717
718 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 719 error = IS_ERR(p) ? PTR_ERR(p) : 0;
720 if (error) {
721 fs_err(sdp, "can't start logd thread: %d\n", error);
722 return error;
723 }
724 sdp->sd_logd_process = p;
725
726 sdp->sd_statfs_sync_time = jiffies;
727 sdp->sd_quota_sync_time = jiffies;
728
729 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
 730 error = IS_ERR(p) ? PTR_ERR(p) : 0;
731 if (error) {
732 fs_err(sdp, "can't start quotad thread: %d\n", error);
733 goto fail;
734 }
735 sdp->sd_quotad_process = p;
736
737 p = kthread_run(gfs2_inoded, sdp, "gfs2_inoded");
 738 error = IS_ERR(p) ? PTR_ERR(p) : 0;
739 if (error) {
740 fs_err(sdp, "can't start inoded thread: %d\n", error);
741 goto fail_quotad;
742 }
743 sdp->sd_inoded_process = p;
744
745 return 0;
746
747 fail_inoded:
748 kthread_stop(sdp->sd_inoded_process);
749
750 fail_quotad:
751 kthread_stop(sdp->sd_quotad_process);
752
753 fail:
754 kthread_stop(sdp->sd_logd_process);
755
756 return error;
757}
758
759/**
760 * fill_super - Read in superblock
761 * @sb: The VFS superblock
762 * @data: Mount options
763 * @silent: Don't complain if it's not a GFS2 filesystem
764 *
765 * Returns: errno
766 */
767
768static int fill_super(struct super_block *sb, void *data, int silent)
769{
770 struct gfs2_sbd *sdp;
771 struct gfs2_holder mount_gh;
772 int error;
773
774 sdp = init_sbd(sb);
775 if (!sdp) {
776 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
777 return -ENOMEM;
778 }
779
780 error = gfs2_mount_args(sdp, (char *)data, 0);
781 if (error) {
782 printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
783 goto fail;
784 }
785
786 init_vfs(sb, SDF_NOATIME);
787
 788 /* Set up the buffer cache and fill in some fake block size values
 789 to allow us to read in the on-disk superblock. */
790 sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
791 sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
792 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
793 GFS2_BASIC_BLOCK_SHIFT;
794 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
795
796 error = init_names(sdp, silent);
797 if (error)
798 goto fail;
799
800 error = gfs2_sys_fs_add(sdp);
801 if (error)
802 goto fail;
803
804 error = gfs2_lm_mount(sdp, silent);
805 if (error)
806 goto fail_sys;
807
808 error = init_locking(sdp, &mount_gh, DO);
809 if (error)
810 goto fail_lm;
811
812 error = init_sb(sdp, silent, DO);
813 if (error)
814 goto fail_locking;
815
816 error = init_inodes(sdp, DO);
817 if (error)
818 goto fail_sb;
819
820 error = init_per_node(sdp, DO);
821 if (error)
822 goto fail_inodes;
823
824 error = gfs2_statfs_init(sdp);
825 if (error) {
826 fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
827 goto fail_per_node;
828 }
829
830 error = init_threads(sdp, DO);
831 if (error)
832 goto fail_per_node;
833
834 if (!(sb->s_flags & MS_RDONLY)) {
835 error = gfs2_make_fs_rw(sdp);
836 if (error) {
837 fs_err(sdp, "can't make FS RW: %d\n", error);
838 goto fail_threads;
839 }
840 }
841
842 gfs2_glock_dq_uninit(&mount_gh);
843
844 return 0;
845
846 fail_threads:
847 init_threads(sdp, UNDO);
848
849 fail_per_node:
850 init_per_node(sdp, UNDO);
851
852 fail_inodes:
853 init_inodes(sdp, UNDO);
854
855 fail_sb:
856 init_sb(sdp, 0, UNDO);
857
858 fail_locking:
859 init_locking(sdp, &mount_gh, UNDO);
860
861 fail_lm:
862 gfs2_gl_hash_clear(sdp, WAIT);
863 gfs2_lm_unmount(sdp);
864 while (invalidate_inodes(sb))
865 yield();
866
867 fail_sys:
868 gfs2_sys_fs_del(sdp);
869
870 fail:
871 vfree(sdp);
872 sb->s_fs_info = NULL;
873
874 return error;
875}
876
877static struct super_block *gfs2_get_sb(struct file_system_type *fs_type,
878 int flags, const char *dev_name,
879 void *data)
880{
881 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super);
882}
883
884static void gfs2_kill_sb(struct super_block *sb)
885{
886 kill_block_super(sb);
887}
888
889struct file_system_type gfs2_fs_type = {
890 .name = "gfs2",
891 .fs_flags = FS_REQUIRES_DEV,
892 .get_sb = gfs2_get_sb,
893 .kill_sb = gfs2_kill_sb,
894 .owner = THIS_MODULE,
895};
896
897struct file_system_type gfs2meta_fs_type = {
898 .name = "gfs2meta",
899 .fs_flags = FS_REQUIRES_DEV,
900 .get_sb = gfs2_get_sb,
901 .kill_sb = gfs2_kill_sb,
902 .owner = THIS_MODULE,
903};
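
For completeness, these two types are registered with the VFS at module load; the real registration lives elsewhere in this patch (main.c). A minimal sketch, with an illustrative function name:

static int __init gfs2_register_fs_types(void)	/* illustrative name */
{
	int error;

	error = register_filesystem(&gfs2_fs_type);
	if (error)
		return error;
	error = register_filesystem(&gfs2meta_fs_type);
	if (error)
		unregister_filesystem(&gfs2_fs_type);
	return error;
}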
904
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..c6452874483d
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13extern struct file_system_type gfs2_fs_type;
14extern struct file_system_type gfs2meta_fs_type;
15
16#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..62a12a59d91b
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1196 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/namei.h>
16#include <linux/utsname.h>
17#include <linux/mm.h>
18#include <linux/xattr.h>
19#include <linux/posix_acl.h>
20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h>
22#include <asm/semaphore.h>
23#include <asm/uaccess.h>
24
25#include "gfs2.h"
26#include "lm_interface.h"
27#include "incore.h"
28#include "acl.h"
29#include "bmap.h"
30#include "dir.h"
31#include "eaops.h"
32#include "eattr.h"
33#include "glock.h"
34#include "inode.h"
35#include "meta_io.h"
36#include "ops_dentry.h"
37#include "ops_inode.h"
38#include "page.h"
39#include "quota.h"
40#include "rgrp.h"
41#include "trans.h"
42#include "unlinked.h"
43#include "util.h"
44
45/**
46 * gfs2_create - Create a file
47 * @dir: The directory in which to create the file
48 * @dentry: The dentry of the new file
49 * @mode: The mode of the new file
50 *
51 * Returns: errno
52 */
53
54static int gfs2_create(struct inode *dir, struct dentry *dentry,
55 int mode, struct nameidata *nd)
56{
57 struct gfs2_inode *dip = dir->u.generic_ip;
58 struct gfs2_sbd *sdp = dip->i_sbd;
59 struct gfs2_holder ghs[2];
60 struct inode *inode;
61 int new = 1;
62
63 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
64
65 for (;;) {
66 inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
67 if (!IS_ERR(inode)) {
68 gfs2_trans_end(sdp);
69 if (dip->i_alloc.al_rgd)
70 gfs2_inplace_release(dip);
71 gfs2_quota_unlock(dip);
72 gfs2_alloc_put(dip);
73 gfs2_glock_dq_uninit_m(2, ghs);
74 break;
75 } else if (PTR_ERR(inode) != -EEXIST ||
76 (nd->intent.open.flags & O_EXCL)) {
77 gfs2_holder_uninit(ghs);
78 return PTR_ERR(inode);
79 }
80
81 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
82 if (inode) {
83 if (!IS_ERR(inode)) {
84 new = 0;
85 gfs2_holder_uninit(ghs);
86 break;
87 } else {
88 gfs2_holder_uninit(ghs);
89 return PTR_ERR(inode);
90 }
91 }
92 }
93
94 d_instantiate(dentry, inode);
95 if (new)
96 mark_inode_dirty(inode);
97
98 return 0;
99}
100
101/**
102 * gfs2_lookup - Look up a filename in a directory and return its inode
103 * @dir: The directory inode
104 * @dentry: The dentry of the new inode
105 * @nd: passed from Linux VFS, ignored by us
106 *
107 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
108 *
 109 * Returns: the dentry, NULL, or an ERR_PTR on failure
110 */
111
112static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
113 struct nameidata *nd)
114{
115 struct inode *inode = NULL;
116
117 dentry->d_op = &gfs2_dops;
118
119 inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
120 if (inode && IS_ERR(inode))
121 return ERR_PTR(PTR_ERR(inode));
122
123 if (inode)
124 return d_splice_alias(inode, dentry);
125 d_add(dentry, inode);
126
127 return NULL;
128}
129
130/**
131 * gfs2_link - Link to a file
132 * @old_dentry: The inode to link
133 * @dir: Add link to this directory
134 * @dentry: The name of the link
135 *
136 * Link the inode in "old_dentry" into the directory "dir" with the
137 * name in "dentry".
138 *
139 * Returns: errno
140 */
141
142static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
143 struct dentry *dentry)
144{
145 struct gfs2_inode *dip = dir->u.generic_ip;
146 struct gfs2_sbd *sdp = dip->i_sbd;
147 struct inode *inode = old_dentry->d_inode;
148 struct gfs2_inode *ip = inode->u.generic_ip;
149 struct gfs2_holder ghs[2];
150 int alloc_required;
151 int error;
152
153 if (S_ISDIR(ip->i_di.di_mode))
154 return -EPERM;
155
156 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
157 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
158
159 error = gfs2_glock_nq_m(2, ghs);
160 if (error)
161 goto out;
162
163 error = gfs2_repermission(dir, MAY_WRITE | MAY_EXEC, NULL);
164 if (error)
165 goto out_gunlock;
166
167 error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
168 switch (error) {
169 case -ENOENT:
170 break;
171 case 0:
172 error = -EEXIST;
173 default:
174 goto out_gunlock;
175 }
176
177 error = -EINVAL;
178 if (!dip->i_di.di_nlink)
179 goto out_gunlock;
180 error = -EFBIG;
181 if (dip->i_di.di_entries == (uint32_t)-1)
182 goto out_gunlock;
183 error = -EPERM;
184 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
185 goto out_gunlock;
186 error = -EINVAL;
187 if (!ip->i_di.di_nlink)
188 goto out_gunlock;
189 error = -EMLINK;
190 if (ip->i_di.di_nlink == (uint32_t)-1)
191 goto out_gunlock;
192
193 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
194 if (error < 0)
195 goto out_gunlock;
196 error = 0;
197
198 if (alloc_required) {
199 struct gfs2_alloc *al = gfs2_alloc_get(dip);
200
201 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
202 if (error)
203 goto out_alloc;
204
205 error = gfs2_quota_check(dip, dip->i_di.di_uid,
206 dip->i_di.di_gid);
207 if (error)
208 goto out_gunlock_q;
209
210 al->al_requested = sdp->sd_max_dirres;
211
212 error = gfs2_inplace_reserve(dip);
213 if (error)
214 goto out_gunlock_q;
215
216 error = gfs2_trans_begin(sdp,
217 sdp->sd_max_dirres +
218 al->al_rgd->rd_ri.ri_length +
219 2 * RES_DINODE + RES_STATFS +
220 RES_QUOTA, 0);
221 if (error)
222 goto out_ipres;
223 } else {
224 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
225 if (error)
226 goto out_ipres;
227 }
228
229 error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
230 IF2DT(ip->i_di.di_mode));
231 if (error)
232 goto out_end_trans;
233
234 error = gfs2_change_nlink(ip, +1);
235
236 out_end_trans:
237 gfs2_trans_end(sdp);
238
239 out_ipres:
240 if (alloc_required)
241 gfs2_inplace_release(dip);
242
243 out_gunlock_q:
244 if (alloc_required)
245 gfs2_quota_unlock(dip);
246
247 out_alloc:
248 if (alloc_required)
249 gfs2_alloc_put(dip);
250
251 out_gunlock:
252 gfs2_glock_dq_m(2, ghs);
253
254 out:
255 gfs2_holder_uninit(ghs);
256 gfs2_holder_uninit(ghs + 1);
257
258 if (!error) {
259 atomic_inc(&inode->i_count);
260 d_instantiate(dentry, inode);
261 mark_inode_dirty(inode);
262 }
263
264 return error;
265}
266
267/**
268 * gfs2_unlink - Unlink a file
269 * @dir: The inode of the directory containing the file to unlink
270 * @dentry: The file itself
271 *
272 * Unlink a file. Call gfs2_unlinki()
273 *
274 * Returns: errno
275 */
276
277static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
278{
279 struct gfs2_inode *dip = dir->u.generic_ip;
280 struct gfs2_sbd *sdp = dip->i_sbd;
281 struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
282 struct gfs2_unlinked *ul;
283 struct gfs2_holder ghs[2];
284 int error;
285
286 error = gfs2_unlinked_get(sdp, &ul);
287 if (error)
288 return error;
289
290 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
291 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
292
293 error = gfs2_glock_nq_m(2, ghs);
294 if (error)
295 goto out;
296
297 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
298 if (error)
299 goto out_gunlock;
300
301 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF +
302 RES_UNLINKED, 0);
303 if (error)
304 goto out_gunlock;
305
 306 error = gfs2_unlinki(dip, &dentry->d_name, ip, ul);
307
308 gfs2_trans_end(sdp);
309
310 out_gunlock:
311 gfs2_glock_dq_m(2, ghs);
312
313 out:
314 gfs2_holder_uninit(ghs);
315 gfs2_holder_uninit(ghs + 1);
316
317 gfs2_unlinked_put(sdp, ul);
318
319 return error;
320}
321
322/**
323 * gfs2_symlink - Create a symlink
324 * @dir: The directory to create the symlink in
325 * @dentry: The dentry to put the symlink in
326 * @symname: The thing which the link points to
327 *
328 * Returns: errno
329 */
330
331static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
332 const char *symname)
333{
334 struct gfs2_inode *dip = dir->u.generic_ip, *ip;
335 struct gfs2_sbd *sdp = dip->i_sbd;
336 struct gfs2_holder ghs[2];
337 struct inode *inode;
338 struct buffer_head *dibh;
339 int size;
340 int error;
341
342 /* Must be stuffed with a null terminator for gfs2_follow_link() */
343 size = strlen(symname);
344 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
345 return -ENAMETOOLONG;
346
347 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
348
349 inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
350 if (IS_ERR(inode)) {
351 gfs2_holder_uninit(ghs);
352 return PTR_ERR(inode);
353 }
354
355 ip = ghs[1].gh_gl->gl_object;
356
357 ip->i_di.di_size = size;
358
359 error = gfs2_meta_inode_buffer(ip, &dibh);
360
361 if (!gfs2_assert_withdraw(sdp, !error)) {
362 gfs2_dinode_out(&ip->i_di, dibh->b_data);
363 memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
364 size);
365 brelse(dibh);
366 }
367
368 gfs2_trans_end(sdp);
369 if (dip->i_alloc.al_rgd)
370 gfs2_inplace_release(dip);
371 gfs2_quota_unlock(dip);
372 gfs2_alloc_put(dip);
373
374 gfs2_glock_dq_uninit_m(2, ghs);
375
376 d_instantiate(dentry, inode);
377 mark_inode_dirty(inode);
378
379 return 0;
380}
381
382/**
383 * gfs2_mkdir - Make a directory
384 * @dir: The parent directory of the new one
385 * @dentry: The dentry of the new directory
386 * @mode: The mode of the new directory
387 *
388 * Returns: errno
389 */
390
391static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
392{
393 struct gfs2_inode *dip = dir->u.generic_ip, *ip;
394 struct gfs2_sbd *sdp = dip->i_sbd;
395 struct gfs2_holder ghs[2];
396 struct inode *inode;
397 struct buffer_head *dibh;
398 int error;
399
400 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
401
402 inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
403 if (IS_ERR(inode)) {
404 gfs2_holder_uninit(ghs);
405 return PTR_ERR(inode);
406 }
407
408 ip = ghs[1].gh_gl->gl_object;
409
410 ip->i_di.di_nlink = 2;
411 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
412 ip->i_di.di_flags |= GFS2_DIF_JDATA;
413 ip->i_di.di_payload_format = GFS2_FORMAT_DE;
414 ip->i_di.di_entries = 2;
415
416 error = gfs2_meta_inode_buffer(ip, &dibh);
417
418 if (!gfs2_assert_withdraw(sdp, !error)) {
419 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
420 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
421 struct qstr str;
422
423 gfs2_str2qstr(&str, ".");
424 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
425 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
426 dent->de_inum = di->di_num; /* already GFS2 endian */
427 dent->de_type = DT_DIR;
428 di->di_entries = cpu_to_be32(1);
429
430 gfs2_str2qstr(&str, "..");
431 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
432 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
433
434 gfs2_inum_out(&dip->i_num, (char *) &dent->de_inum);
435 dent->de_type = DT_DIR;
436
437 gfs2_dinode_out(&ip->i_di, (char *)di);
438
439 brelse(dibh);
440 }
441
442 error = gfs2_change_nlink(dip, +1);
443 gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
444
445 gfs2_trans_end(sdp);
446 if (dip->i_alloc.al_rgd)
447 gfs2_inplace_release(dip);
448 gfs2_quota_unlock(dip);
449 gfs2_alloc_put(dip);
450
451 gfs2_glock_dq_uninit_m(2, ghs);
452
453 d_instantiate(dentry, inode);
454 mark_inode_dirty(inode);
455
456 return 0;
457}
458
459/**
460 * gfs2_rmdir - Remove a directory
461 * @dir: The parent directory of the directory to be removed
462 * @dentry: The dentry of the directory to remove
463 *
464 * Remove a directory. Call gfs2_rmdiri()
465 *
466 * Returns: errno
467 */
468
469static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
470{
471 struct gfs2_inode *dip = dir->u.generic_ip;
472 struct gfs2_sbd *sdp = dip->i_sbd;
473 struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
474 struct gfs2_unlinked *ul;
475 struct gfs2_holder ghs[2];
476 int error;
477
478 error = gfs2_unlinked_get(sdp, &ul);
479 if (error)
480 return error;
481
482 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
483 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
484
485 error = gfs2_glock_nq_m(2, ghs);
486 if (error)
487 goto out;
488
489 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
490 if (error)
491 goto out_gunlock;
492
493 if (ip->i_di.di_entries < 2) {
494 if (gfs2_consist_inode(ip))
495 gfs2_dinode_print(&ip->i_di);
496 error = -EIO;
497 goto out_gunlock;
498 }
499 if (ip->i_di.di_entries > 2) {
500 error = -ENOTEMPTY;
501 goto out_gunlock;
502 }
503
504 error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF +
505 RES_UNLINKED, 0);
506 if (error)
507 goto out_gunlock;
508
509 error = gfs2_rmdiri(dip, &dentry->d_name, ip, ul);
510
511 gfs2_trans_end(sdp);
512
513 out_gunlock:
514 gfs2_glock_dq_m(2, ghs);
515
516 out:
517 gfs2_holder_uninit(ghs);
518 gfs2_holder_uninit(ghs + 1);
519
520 gfs2_unlinked_put(sdp, ul);
521
522 return error;
523}
524
525/**
526 * gfs2_mknod - Make a special file
527 * @dir: The directory in which the special file will reside
528 * @dentry: The dentry of the special file
529 * @mode: The mode of the special file
 530 * @dev: The device specification of the special file
531 *
532 */
533
534static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
535 dev_t dev)
536{
537 struct gfs2_inode *dip = dir->u.generic_ip, *ip;
538 struct gfs2_sbd *sdp = dip->i_sbd;
539 struct gfs2_holder ghs[2];
540 struct inode *inode;
541 struct buffer_head *dibh;
542 uint32_t major = 0, minor = 0;
543 int error;
544
545 switch (mode & S_IFMT) {
546 case S_IFBLK:
547 case S_IFCHR:
548 major = MAJOR(dev);
549 minor = MINOR(dev);
550 break;
551 case S_IFIFO:
552 case S_IFSOCK:
553 break;
554 default:
555 return -EOPNOTSUPP;
 556 }
557
558 gfs2_holder_init(dip->i_gl, 0, 0, ghs);
559
560 inode = gfs2_createi(ghs, &dentry->d_name, mode);
561 if (IS_ERR(inode)) {
562 gfs2_holder_uninit(ghs);
563 return PTR_ERR(inode);
564 }
565
566 ip = ghs[1].gh_gl->gl_object;
567
568 ip->i_di.di_major = major;
569 ip->i_di.di_minor = minor;
570
571 error = gfs2_meta_inode_buffer(ip, &dibh);
572
573 if (!gfs2_assert_withdraw(sdp, !error)) {
574 gfs2_dinode_out(&ip->i_di, dibh->b_data);
575 brelse(dibh);
576 }
577
578 gfs2_trans_end(sdp);
579 if (dip->i_alloc.al_rgd)
580 gfs2_inplace_release(dip);
581 gfs2_quota_unlock(dip);
582 gfs2_alloc_put(dip);
583
584 gfs2_glock_dq_uninit_m(2, ghs);
585
586 d_instantiate(dentry, inode);
587 mark_inode_dirty(inode);
588
589 return 0;
590}
591
592/**
593 * gfs2_rename - Rename a file
594 * @odir: Parent directory of old file name
595 * @odentry: The old dentry of the file
596 * @ndir: Parent directory of new file name
597 * @ndentry: The new dentry of the file
598 *
599 * Returns: errno
600 */
601
602static int gfs2_rename(struct inode *odir, struct dentry *odentry,
603 struct inode *ndir, struct dentry *ndentry)
604{
605 struct gfs2_inode *odip = odir->u.generic_ip;
606 struct gfs2_inode *ndip = ndir->u.generic_ip;
607 struct gfs2_inode *ip = odentry->d_inode->u.generic_ip;
608 struct gfs2_inode *nip = NULL;
609 struct gfs2_sbd *sdp = odip->i_sbd;
610 struct gfs2_unlinked *ul;
611 struct gfs2_holder ghs[4], r_gh;
612 unsigned int num_gh;
613 int dir_rename = 0;
614 int alloc_required;
615 unsigned int x;
616 int error;
617
618 if (ndentry->d_inode) {
619 nip = ndentry->d_inode->u.generic_ip;
620 if (ip == nip)
621 return 0;
622 }
623
624 error = gfs2_unlinked_get(sdp, &ul);
625 if (error)
626 return error;
627
 628 /* Make sure we aren't trying to move a directory into its subdirectory */
629
630 if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
631 dir_rename = 1;
632
633 error = gfs2_glock_nq_init(sdp->sd_rename_gl,
634 LM_ST_EXCLUSIVE, 0,
635 &r_gh);
636 if (error)
637 goto out;
638
639 error = gfs2_ok_to_move(ip, ndip);
640 if (error)
641 goto out_gunlock_r;
642 }
643
644 gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
645 gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
646 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
647 num_gh = 3;
648
649 if (nip)
650 gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
651
652 error = gfs2_glock_nq_m(num_gh, ghs);
653 if (error)
654 goto out_uninit;
655
656 /* Check out the old directory */
657
658 error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
659 if (error)
660 goto out_gunlock;
661
662 /* Check out the new directory */
663
664 if (nip) {
665 error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
666 if (error)
667 goto out_gunlock;
668
669 if (S_ISDIR(nip->i_di.di_mode)) {
670 if (nip->i_di.di_entries < 2) {
671 if (gfs2_consist_inode(nip))
672 gfs2_dinode_print(&nip->i_di);
673 error = -EIO;
674 goto out_gunlock;
675 }
676 if (nip->i_di.di_entries > 2) {
677 error = -ENOTEMPTY;
678 goto out_gunlock;
679 }
680 }
681 } else {
682 error = gfs2_repermission(ndir, MAY_WRITE | MAY_EXEC, NULL);
683 if (error)
684 goto out_gunlock;
685
686 error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
687 switch (error) {
688 case -ENOENT:
689 error = 0;
690 break;
691 case 0:
692 error = -EEXIST;
693 default:
694 goto out_gunlock;
 695 }
696
697 if (odip != ndip) {
698 if (!ndip->i_di.di_nlink) {
699 error = -EINVAL;
700 goto out_gunlock;
701 }
702 if (ndip->i_di.di_entries == (uint32_t)-1) {
703 error = -EFBIG;
704 goto out_gunlock;
705 }
706 if (S_ISDIR(ip->i_di.di_mode) &&
707 ndip->i_di.di_nlink == (uint32_t)-1) {
708 error = -EMLINK;
709 goto out_gunlock;
710 }
711 }
712 }
713
714 /* Check out the dir to be renamed */
715
716 if (dir_rename) {
717 error = gfs2_repermission(odentry->d_inode, MAY_WRITE, NULL);
718 if (error)
719 goto out_gunlock;
720 }
721
722 alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
723 if (error < 0)
724 goto out_gunlock;
725 error = 0;
726
727 if (alloc_required) {
728 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
729
730 error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
731 if (error)
732 goto out_alloc;
733
734 error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
735 ndip->i_di.di_gid);
736 if (error)
737 goto out_gunlock_q;
738
739 al->al_requested = sdp->sd_max_dirres;
740
741 error = gfs2_inplace_reserve(ndip);
742 if (error)
743 goto out_gunlock_q;
744
745 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
746 al->al_rgd->rd_ri.ri_length +
747 4 * RES_DINODE + 4 * RES_LEAF +
748 RES_UNLINKED + RES_STATFS +
749 RES_QUOTA, 0);
750 if (error)
751 goto out_ipreserv;
752 } else {
753 error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
754 5 * RES_LEAF +
755 RES_UNLINKED, 0);
756 if (error)
757 goto out_gunlock;
758 }
759
760 /* Remove the target file, if it exists */
761
762 if (nip) {
763 if (S_ISDIR(nip->i_di.di_mode))
764 error = gfs2_rmdiri(ndip, &ndentry->d_name, nip, ul);
765 else
766 error = gfs2_unlinki(ndip, &ndentry->d_name, nip, ul);
767 if (error)
768 goto out_end_trans;
769 }
770
771 if (dir_rename) {
772 struct qstr name;
773 gfs2_str2qstr(&name, "..");
774
775 error = gfs2_change_nlink(ndip, +1);
776 if (error)
777 goto out_end_trans;
778 error = gfs2_change_nlink(odip, -1);
779 if (error)
780 goto out_end_trans;
781
782 error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
783 if (error)
784 goto out_end_trans;
785 } else {
786 struct buffer_head *dibh;
787 error = gfs2_meta_inode_buffer(ip, &dibh);
788 if (error)
789 goto out_end_trans;
790 ip->i_di.di_ctime = get_seconds();
791 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
792 gfs2_dinode_out(&ip->i_di, dibh->b_data);
793 brelse(dibh);
794 }
795
796 error = gfs2_dir_del(odip, &odentry->d_name);
797 if (error)
798 goto out_end_trans;
799
800 error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
801 IF2DT(ip->i_di.di_mode));
802 if (error)
803 goto out_end_trans;
804
805 out_end_trans:
806 gfs2_trans_end(sdp);
807
808 out_ipreserv:
809 if (alloc_required)
810 gfs2_inplace_release(ndip);
811
812 out_gunlock_q:
813 if (alloc_required)
814 gfs2_quota_unlock(ndip);
815
816 out_alloc:
817 if (alloc_required)
818 gfs2_alloc_put(ndip);
819
820 out_gunlock:
821 gfs2_glock_dq_m(num_gh, ghs);
822
823 out_uninit:
824 for (x = 0; x < num_gh; x++)
825 gfs2_holder_uninit(ghs + x);
826
827 out_gunlock_r:
828 if (dir_rename)
829 gfs2_glock_dq_uninit(&r_gh);
830
831 out:
832 gfs2_unlinked_put(sdp, ul);
833
834 return error;
835}
836
837/**
838 * gfs2_readlink - Read the value of a symlink
839 * @dentry: the symlink
840 * @buf: the buffer to read the symlink data into
841 * @size: the size of the buffer
842 *
843 * Returns: errno
844 */
845
846static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
847 int user_size)
848{
849 struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
850 char array[GFS2_FAST_NAME_SIZE], *buf = array;
851 unsigned int len = GFS2_FAST_NAME_SIZE;
852 int error;
853
854 error = gfs2_readlinki(ip, &buf, &len);
855 if (error)
856 return error;
857
858 if (user_size > len - 1)
859 user_size = len - 1;
860
861 if (copy_to_user(user_buf, buf, user_size))
862 error = -EFAULT;
863 else
864 error = user_size;
865
866 if (buf != array)
867 kfree(buf);
868
869 return error;
870}
871
872/**
873 * gfs2_follow_link - Follow a symbolic link
874 * @dentry: The dentry of the link
875 * @nd: Data that we pass to vfs_follow_link()
876 *
877 * This can handle symlinks of any size. It is optimised for symlinks
878 * under GFS2_FAST_NAME_SIZE.
879 *
 880 * Returns: NULL on success, or ERR_PTR(errno) on failure
881 */
882
883static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
884{
885 struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
886 char array[GFS2_FAST_NAME_SIZE], *buf = array;
887 unsigned int len = GFS2_FAST_NAME_SIZE;
888 int error;
889
890 error = gfs2_readlinki(ip, &buf, &len);
891 if (!error) {
892 error = vfs_follow_link(nd, buf);
893 if (buf != array)
894 kfree(buf);
895 }
896
897 return ERR_PTR(error);
898}
899
900/**
 901 * gfs2_permission - Check if access to the inode is permitted
 902 * @inode: The inode to check
 903 * @mask: The access requested (MAY_READ, MAY_WRITE and/or MAY_EXEC)
904 * @nd: passed from Linux VFS, ignored by us
905 *
906 * Returns: errno
907 */
908
909static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
910{
911 struct gfs2_inode *ip = inode->u.generic_ip;
912 struct gfs2_holder i_gh;
913 int error;
914
915 if (ip->i_vn == ip->i_gl->gl_vn)
916 return generic_permission(inode, mask, gfs2_check_acl);
917
918 error = gfs2_glock_nq_init(ip->i_gl,
919 LM_ST_SHARED, LM_FLAG_ANY,
920 &i_gh);
921 if (!error) {
922 error = generic_permission(inode, mask, gfs2_check_acl_locked);
923 gfs2_glock_dq_uninit(&i_gh);
924 }
925
926 return error;
927}
928
929static int setattr_size(struct inode *inode, struct iattr *attr)
930{
931 struct gfs2_inode *ip = inode->u.generic_ip;
932 int error;
933
934 if (attr->ia_size != ip->i_di.di_size) {
935 error = vmtruncate(inode, attr->ia_size);
936 if (error)
937 return error;
938 }
939
 940 return gfs2_truncatei(ip, attr->ia_size);
945}
946
947static int setattr_chown(struct inode *inode, struct iattr *attr)
948{
949 struct gfs2_inode *ip = inode->u.generic_ip;
950 struct gfs2_sbd *sdp = ip->i_sbd;
951 struct buffer_head *dibh;
952 uint32_t ouid, ogid, nuid, ngid;
953 int error;
954
955 ouid = ip->i_di.di_uid;
956 ogid = ip->i_di.di_gid;
957 nuid = attr->ia_uid;
958 ngid = attr->ia_gid;
959
960 if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
961 ouid = nuid = NO_QUOTA_CHANGE;
962 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
963 ogid = ngid = NO_QUOTA_CHANGE;
964
965 gfs2_alloc_get(ip);
966
967 error = gfs2_quota_lock(ip, nuid, ngid);
968 if (error)
969 goto out_alloc;
970
971 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
972 error = gfs2_quota_check(ip, nuid, ngid);
973 if (error)
974 goto out_gunlock_q;
975 }
976
977 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
978 if (error)
979 goto out_gunlock_q;
980
981 error = gfs2_meta_inode_buffer(ip, &dibh);
982 if (error)
983 goto out_end_trans;
984
985 error = inode_setattr(inode, attr);
986 gfs2_assert_warn(sdp, !error);
987 gfs2_inode_attr_out(ip);
988
989 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
990 gfs2_dinode_out(&ip->i_di, dibh->b_data);
991 brelse(dibh);
992
993 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
994 gfs2_quota_change(ip, -ip->i_di.di_blocks,
995 ouid, ogid);
996 gfs2_quota_change(ip, ip->i_di.di_blocks,
997 nuid, ngid);
998 }
999
1000 out_end_trans:
1001 gfs2_trans_end(sdp);
1002
1003 out_gunlock_q:
1004 gfs2_quota_unlock(ip);
1005
1006 out_alloc:
1007 gfs2_alloc_put(ip);
1008
1009 return error;
1010}
1011
1012/**
1013 * gfs2_setattr - Change attributes on an inode
1014 * @dentry: The dentry which is changing
1015 * @attr: The structure describing the change
1016 *
 1017 * The VFS layer wants to change one or more of an inode's attributes. Write
1018 * that change out to disk.
1019 *
1020 * Returns: errno
1021 */
1022
1023static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1024{
1025 struct inode *inode = dentry->d_inode;
1026 struct gfs2_inode *ip = inode->u.generic_ip;
1027 struct gfs2_holder i_gh;
1028 int error;
1029
1030 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1031 if (error)
1032 return error;
1033
1034 error = -EPERM;
1035 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1036 goto out;
1037
1038 error = inode_change_ok(inode, attr);
1039 if (error)
1040 goto out;
1041
1042 if (attr->ia_valid & ATTR_SIZE)
1043 error = setattr_size(inode, attr);
1044 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1045 error = setattr_chown(inode, attr);
1046 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
1047 error = gfs2_acl_chmod(ip, attr);
1048 else
1049 error = gfs2_setattr_simple(ip, attr);
1050
1051 out:
1052 gfs2_glock_dq_uninit(&i_gh);
1053
1054 if (!error)
1055 mark_inode_dirty(inode);
1056
1057 return error;
1058}
1059
1060/**
1061 * gfs2_getattr - Read out an inode's attributes
1062 * @mnt: the vfsmount the dentry was looked up on (unused here)
1063 * @dentry: The dentry to stat
1064 * @stat: The inode's stats
1065 *
1066 * Returns: errno
1067 */
1068
1069static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1070 struct kstat *stat)
1071{
1072 struct inode *inode = dentry->d_inode;
1073 struct gfs2_inode *ip = inode->u.generic_ip;
1074 struct gfs2_holder gh;
1075 int error;
1076
1077 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1078 if (!error) {
1079 generic_fillattr(inode, stat);
1080 gfs2_glock_dq_uninit(&gh);
1081 }
1082
1083 return error;
1084}
1085
1086static int gfs2_setxattr(struct dentry *dentry, const char *name,
1087 const void *data, size_t size, int flags)
1088{
1089 struct gfs2_inode *ip = dentry->d_inode->u.generic_ip;
1090 struct gfs2_ea_request er;
1091
1092 memset(&er, 0, sizeof(struct gfs2_ea_request));
1093 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1094 if (er.er_type == GFS2_EATYPE_UNUSED)
1095 return -EOPNOTSUPP;
1096 er.er_data = (char *)data;
1097 er.er_name_len = strlen(er.er_name);
1098 er.er_data_len = size;
1099 er.er_flags = flags;
1100
1101 gfs2_assert_warn(ip->i_sbd, !(er.er_flags & GFS2_ERF_MODE));
1102
1103 return gfs2_ea_set(ip, &er);
1104}
1105
1106static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1107 void *data, size_t size)
1108{
1109 struct gfs2_ea_request er;
1110
1111 memset(&er, 0, sizeof(struct gfs2_ea_request));
1112 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1113 if (er.er_type == GFS2_EATYPE_UNUSED)
1114 return -EOPNOTSUPP;
1115 er.er_data = data;
1116 er.er_name_len = strlen(er.er_name);
1117 er.er_data_len = size;
1118
1119 return gfs2_ea_get(dentry->d_inode->u.generic_ip, &er);
1120}
1121
1122static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1123{
1124 struct gfs2_ea_request er;
1125
1126 memset(&er, 0, sizeof(struct gfs2_ea_request));
1127 er.er_data = (size) ? buffer : NULL;
1128 er.er_data_len = size;
1129
1130 return gfs2_ea_list(dentry->d_inode->u.generic_ip, &er);
1131}
1132
1133static int gfs2_removexattr(struct dentry *dentry, const char *name)
1134{
1135 struct gfs2_ea_request er;
1136
1137 memset(&er, 0, sizeof(struct gfs2_ea_request));
1138 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1139 if (er.er_type == GFS2_EATYPE_UNUSED)
1140 return -EOPNOTSUPP;
1141 er.er_name_len = strlen(er.er_name);
1142
1143 return gfs2_ea_remove(dentry->d_inode->u.generic_ip, &er);
1144}
1145
1146struct inode_operations gfs2_file_iops = {
1147 .permission = gfs2_permission,
1148 .setattr = gfs2_setattr,
1149 .getattr = gfs2_getattr,
1150 .setxattr = gfs2_setxattr,
1151 .getxattr = gfs2_getxattr,
1152 .listxattr = gfs2_listxattr,
1153 .removexattr = gfs2_removexattr,
1154};
1155
1156struct inode_operations gfs2_dev_iops = {
1157 .permission = gfs2_permission,
1158 .setattr = gfs2_setattr,
1159 .getattr = gfs2_getattr,
1160 .setxattr = gfs2_setxattr,
1161 .getxattr = gfs2_getxattr,
1162 .listxattr = gfs2_listxattr,
1163 .removexattr = gfs2_removexattr,
1164};
1165
1166struct inode_operations gfs2_dir_iops = {
1167 .create = gfs2_create,
1168 .lookup = gfs2_lookup,
1169 .link = gfs2_link,
1170 .unlink = gfs2_unlink,
1171 .symlink = gfs2_symlink,
1172 .mkdir = gfs2_mkdir,
1173 .rmdir = gfs2_rmdir,
1174 .mknod = gfs2_mknod,
1175 .rename = gfs2_rename,
1176 .permission = gfs2_permission,
1177 .setattr = gfs2_setattr,
1178 .getattr = gfs2_getattr,
1179 .setxattr = gfs2_setxattr,
1180 .getxattr = gfs2_getxattr,
1181 .listxattr = gfs2_listxattr,
1182 .removexattr = gfs2_removexattr,
1183};
1184
1185struct inode_operations gfs2_symlink_iops = {
1186 .readlink = gfs2_readlink,
1187 .follow_link = gfs2_follow_link,
1188 .permission = gfs2_permission,
1189 .setattr = gfs2_setattr,
1190 .getattr = gfs2_getattr,
1191 .setxattr = gfs2_setxattr,
1192 .getxattr = gfs2_getxattr,
1193 .listxattr = gfs2_listxattr,
1194 .removexattr = gfs2_removexattr,
1195};
1196
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..5fafd87c8d7b
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,18 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13extern struct inode_operations gfs2_file_iops;
14extern struct inode_operations gfs2_dir_iops;
15extern struct inode_operations gfs2_symlink_iops;
16extern struct inode_operations gfs2_dev_iops;
17
18#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..60bf2563c7b4
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,379 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/statfs.h>
16#include <linux/vmalloc.h>
17#include <linux/seq_file.h>
18#include <linux/mount.h>
19#include <linux/kthread.h>
20#include <linux/delay.h>
21#include <linux/gfs2_ondisk.h>
22#include <asm/semaphore.h>
23
24#include "gfs2.h"
25#include "lm_interface.h"
26#include "incore.h"
27#include "glock.h"
28#include "inode.h"
29#include "lm.h"
30#include "log.h"
31#include "mount.h"
32#include "ops_super.h"
33#include "page.h"
34#include "quota.h"
35#include "recovery.h"
36#include "rgrp.h"
37#include "super.h"
38#include "sys.h"
39#include "util.h"
40
41/**
42 * gfs2_write_inode - Make sure the inode is stable on the disk
43 * @inode: The inode
44 * @sync: synchronous write flag
45 *
46 * Returns: errno
47 */
48
49static int gfs2_write_inode(struct inode *inode, int sync)
50{
51 struct gfs2_inode *ip = inode->u.generic_ip;
52
53 if (current->flags & PF_MEMALLOC)
54 return 0;
55 if (ip && sync)
56 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
57
58 return 0;
59}
60
61/**
62 * gfs2_put_super - Unmount the filesystem
63 * @sb: The VFS superblock
64 *
65 */
66
67static void gfs2_put_super(struct super_block *sb)
68{
69 struct gfs2_sbd *sdp = sb->s_fs_info;
70 int error;
71
72 if (!sdp)
73 return;
74
75 /* Unfreeze the filesystem, if we need to */
76
77 mutex_lock(&sdp->sd_freeze_lock);
78 if (sdp->sd_freeze_count)
79 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
80 mutex_unlock(&sdp->sd_freeze_lock);
81
82 kthread_stop(sdp->sd_inoded_process);
83 kthread_stop(sdp->sd_quotad_process);
84 kthread_stop(sdp->sd_logd_process);
85 kthread_stop(sdp->sd_recoverd_process);
86 while (sdp->sd_glockd_num--)
87 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
88 kthread_stop(sdp->sd_scand_process);
89
90 if (!(sb->s_flags & MS_RDONLY)) {
91 error = gfs2_make_fs_ro(sdp);
92 if (error)
93 gfs2_io_error(sdp);
94 }
95
96 /* At this point, we're through modifying the disk */
97
98 /* Release stuff */
99
100 iput(sdp->sd_master_dir);
101 iput(sdp->sd_jindex);
102 iput(sdp->sd_inum_inode);
103 iput(sdp->sd_statfs_inode);
104 iput(sdp->sd_rindex);
105 iput(sdp->sd_quota_inode);
106
107 gfs2_glock_put(sdp->sd_rename_gl);
108 gfs2_glock_put(sdp->sd_trans_gl);
109
110 if (!sdp->sd_args.ar_spectator) {
111 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
112 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
113 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
114 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
115 gfs2_glock_dq_uninit(&sdp->sd_ut_gh);
116 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
117 iput(sdp->sd_ir_inode);
118 iput(sdp->sd_sc_inode);
119 iput(sdp->sd_ut_inode);
120 iput(sdp->sd_qc_inode);
121 }
122
123 gfs2_glock_dq_uninit(&sdp->sd_live_gh);
124 gfs2_clear_rgrpd(sdp);
125 gfs2_jindex_free(sdp);
126 /* Take apart glock structures and buffer lists */
127 gfs2_gl_hash_clear(sdp, WAIT);
128
129 /* Unmount the locking protocol */
130 gfs2_lm_unmount(sdp);
131
132 /* At this point, we're through participating in the lockspace */
133
134 gfs2_sys_fs_del(sdp);
135 vfree(sdp);
136 sb->s_fs_info = NULL;
137}
138
139/**
140 * gfs2_write_super - commit all in-core transactions to disk
141 * @sb: the filesystem
142 *
143 * This function is called every time sync(2) is called.
144 * After this exits, all dirty buffers are synced.
145 */
146
147static void gfs2_write_super(struct super_block *sb)
148{
149 struct gfs2_sbd *sdp = sb->s_fs_info;
150 gfs2_log_flush(sdp, NULL);
151}
152
153/**
154 * gfs2_write_super_lockfs - prevent further writes to the filesystem
155 * @sb: the VFS structure for the filesystem
156 *
157 */
158
159static void gfs2_write_super_lockfs(struct super_block *sb)
160{
161 struct gfs2_sbd *sdp = sb->s_fs_info;
162 int error;
163
164 for (;;) {
165 error = gfs2_freeze_fs(sdp);
166 if (!error)
167 break;
168
169 switch (error) {
170 case -EBUSY:
171 fs_err(sdp, "waiting for recovery before freeze\n");
172 break;
173
174 default:
175 fs_err(sdp, "error freezing FS: %d\n", error);
176 break;
177 }
178
179 fs_err(sdp, "retrying...\n");
180 msleep(1000);
181 }
182}
183
184/**
185 * gfs2_unlockfs - reallow writes to the filesystem
186 * @sb: the VFS structure for the filesystem
187 *
188 */
189
190static void gfs2_unlockfs(struct super_block *sb)
191{
192 struct gfs2_sbd *sdp = sb->s_fs_info;
193 gfs2_unfreeze_fs(sdp);
194}
195
196/**
197 * gfs2_statfs - Gather and return stats about the filesystem
198 * @sb: The superblock
199 * @buf: the kstatfs buffer to fill in
200 *
201 * Returns: 0 on success or error code
202 */
203
204static int gfs2_statfs(struct super_block *sb, struct kstatfs *buf)
205{
206 struct gfs2_sbd *sdp = sb->s_fs_info;
207 struct gfs2_statfs_change sc;
208 int error;
209
210 if (gfs2_tune_get(sdp, gt_statfs_slow))
211 error = gfs2_statfs_slow(sdp, &sc);
212 else
213 error = gfs2_statfs_i(sdp, &sc);
214
215 if (error)
216 return error;
217
218 memset(buf, 0, sizeof(struct kstatfs));
219
220 buf->f_type = GFS2_MAGIC;
221 buf->f_bsize = sdp->sd_sb.sb_bsize;
222 buf->f_blocks = sc.sc_total;
223 buf->f_bfree = sc.sc_free;
224 buf->f_bavail = sc.sc_free;
225 buf->f_files = sc.sc_dinodes + sc.sc_free;
226 buf->f_ffree = sc.sc_free;
227 buf->f_namelen = GFS2_FNAMESIZE;
228
229 return 0;
230}
231
232/**
233 * gfs2_remount_fs - called when the FS is remounted
234 * @sb: the filesystem
235 * @flags: the remount flags
236 * @data: extra data passed in (not used right now)
237 *
238 * Returns: errno
239 */
240
241static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
242{
243 struct gfs2_sbd *sdp = sb->s_fs_info;
244 int error;
245
246 error = gfs2_mount_args(sdp, data, 1);
247 if (error)
248 return error;
249
250 if (sdp->sd_args.ar_spectator)
251 *flags |= MS_RDONLY;
252 else {
253 if (*flags & MS_RDONLY) {
254 if (!(sb->s_flags & MS_RDONLY))
255 error = gfs2_make_fs_ro(sdp);
256 } else if (!(*flags & MS_RDONLY) &&
257 (sb->s_flags & MS_RDONLY)) {
258 error = gfs2_make_fs_rw(sdp);
259 }
260 }
261
262 if (*flags & (MS_NOATIME | MS_NODIRATIME))
263 set_bit(SDF_NOATIME, &sdp->sd_flags);
264 else
265 clear_bit(SDF_NOATIME, &sdp->sd_flags);
266
267 /* Don't let the VFS update atimes. GFS2 handles this itself. */
268 *flags |= MS_NOATIME | MS_NODIRATIME;
269
270 return error;
271}
272
273/**
274 * gfs2_clear_inode - Deallocate an inode when VFS is done with it
275 * @inode: The VFS inode
276 *
277 */
278
279static void gfs2_clear_inode(struct inode *inode)
280{
281 struct gfs2_inode *ip = inode->u.generic_ip;
282
283 if (ip) {
284 spin_lock(&ip->i_spin);
285 ip->i_vnode = NULL;
286 inode->u.generic_ip = NULL;
287 spin_unlock(&ip->i_spin);
288
289 gfs2_glock_schedule_for_reclaim(ip->i_gl);
290 gfs2_inode_put(ip);
291 }
292}
293
294/**
295 * gfs2_show_options - Show mount options for /proc/mounts
296 * @s: seq_file structure
297 * @mnt: vfsmount
298 *
299 * Returns: 0 on success or error code
300 */
301
302static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
303{
304 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
305 struct gfs2_args *args = &sdp->sd_args;
306
307 if (args->ar_lockproto[0])
308 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
309 if (args->ar_locktable[0])
310 seq_printf(s, ",locktable=%s", args->ar_locktable);
311 if (args->ar_hostdata[0])
312 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
313 if (args->ar_spectator)
314 seq_printf(s, ",spectator");
315 if (args->ar_ignore_local_fs)
316 seq_printf(s, ",ignore_local_fs");
317 if (args->ar_localflocks)
318 seq_printf(s, ",localflocks");
319 if (args->ar_localcaching)
320 seq_printf(s, ",localcaching");
321 if (args->ar_debug)
322 seq_printf(s, ",debug");
323 if (args->ar_upgrade)
324 seq_printf(s, ",upgrade");
325 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
326 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
327 if (args->ar_posix_acl)
328 seq_printf(s, ",acl");
329 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
330 char *state;
331 switch (args->ar_quota) {
332 case GFS2_QUOTA_OFF:
333 state = "off";
334 break;
335 case GFS2_QUOTA_ACCOUNT:
336 state = "account";
337 break;
338 case GFS2_QUOTA_ON:
339 state = "on";
340 break;
341 default:
342 state = "unknown";
343 break;
344 }
345 seq_printf(s, ",quota=%s", state);
346 }
347 if (args->ar_suiddir)
348 seq_printf(s, ",suiddir");
349 if (args->ar_data != GFS2_DATA_DEFAULT) {
350 char *state;
351 switch (args->ar_data) {
352 case GFS2_DATA_WRITEBACK:
353 state = "writeback";
354 break;
355 case GFS2_DATA_ORDERED:
356 state = "ordered";
357 break;
358 default:
359 state = "unknown";
360 break;
361 }
362 seq_printf(s, ",data=%s", state);
363 }
364
365 return 0;
366}
367
368struct super_operations gfs2_super_ops = {
369 .write_inode = gfs2_write_inode,
370 .put_super = gfs2_put_super,
371 .write_super = gfs2_write_super,
372 .write_super_lockfs = gfs2_write_super_lockfs,
373 .unlockfs = gfs2_unlockfs,
374 .statfs = gfs2_statfs,
375 .remount_fs = gfs2_remount_fs,
376 .clear_inode = gfs2_clear_inode,
377 .show_options = gfs2_show_options,
378};
379
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..a41d208dc558
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,15 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13extern struct super_operations gfs2_super_ops;
14
15#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..dbc57071e7bb
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,198 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/mm.h>
16#include <linux/pagemap.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/semaphore.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "bmap.h"
24#include "glock.h"
25#include "inode.h"
26#include "ops_vm.h"
27#include "page.h"
28#include "quota.h"
29#include "rgrp.h"
30#include "trans.h"
31#include "util.h"
32
33static void pfault_be_greedy(struct gfs2_inode *ip)
34{
35 unsigned int time;
36
37 spin_lock(&ip->i_spin);
38 time = ip->i_greedy;
39 ip->i_last_pfault = jiffies;
40 spin_unlock(&ip->i_spin);
41
42 gfs2_inode_hold(ip);
43 if (gfs2_glock_be_greedy(ip->i_gl, time))
44 gfs2_inode_put(ip);
45}
46
47static struct page *gfs2_private_nopage(struct vm_area_struct *area,
48 unsigned long address, int *type)
49{
50 struct gfs2_inode *ip = area->vm_file->f_mapping->host->u.generic_ip;
51 struct gfs2_holder i_gh;
52 struct page *result;
53 int error;
54
55 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
56 if (error)
57 return NULL;
58
59 set_bit(GIF_PAGED, &ip->i_flags);
60
61 result = filemap_nopage(area, address, type);
62
63 if (result && result != NOPAGE_OOM)
64 pfault_be_greedy(ip);
65
66 gfs2_glock_dq_uninit(&i_gh);
67
68 return result;
69}
70
71static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
72{
73 struct gfs2_sbd *sdp = ip->i_sbd;
74 unsigned long index = page->index;
75 uint64_t lblock = index << (PAGE_CACHE_SHIFT -
76 sdp->sd_sb.sb_bsize_shift);
77 unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
78 struct gfs2_alloc *al;
79 unsigned int data_blocks, ind_blocks;
80 unsigned int x;
81 int error;
82
83 al = gfs2_alloc_get(ip);
84
85 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
86 if (error)
87 goto out;
88
89 error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
90 if (error)
91 goto out_gunlock_q;
92
93 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE,
94 &data_blocks, &ind_blocks);
95
96 al->al_requested = data_blocks + ind_blocks;
97
98 error = gfs2_inplace_reserve(ip);
99 if (error)
100 goto out_gunlock_q;
101
102 error = gfs2_trans_begin(sdp,
103 al->al_rgd->rd_ri.ri_length +
104 ind_blocks + RES_DINODE +
105 RES_STATFS + RES_QUOTA, 0);
106 if (error)
107 goto out_ipres;
108
109 if (gfs2_is_stuffed(ip)) {
110 error = gfs2_unstuff_dinode(ip, gfs2_unstuffer_page, NULL);
111 if (error)
112 goto out_trans;
113 }
114
115 for (x = 0; x < blocks; ) {
116 uint64_t dblock;
117 unsigned int extlen;
118 int new = 1;
119
120 error = gfs2_block_map(ip, lblock, &new, &dblock, &extlen);
121 if (error)
122 goto out_trans;
123
124 lblock += extlen;
125 x += extlen;
126 }
127
128 gfs2_assert_warn(sdp, al->al_alloced);
129
130 out_trans:
131 gfs2_trans_end(sdp);
132
133 out_ipres:
134 gfs2_inplace_release(ip);
135
136 out_gunlock_q:
137 gfs2_quota_unlock(ip);
138
139 out:
140 gfs2_alloc_put(ip);
141
142 return error;
143}
144
145static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
146 unsigned long address, int *type)
147{
148 struct gfs2_inode *ip = area->vm_file->f_mapping->host->u.generic_ip;
149 struct gfs2_holder i_gh;
150 struct page *result = NULL;
151 unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
152 area->vm_pgoff;
153 int alloc_required;
154 int error;
155
156 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
157 if (error)
158 return NULL;
159
160 set_bit(GIF_PAGED, &ip->i_flags);
161 set_bit(GIF_SW_PAGED, &ip->i_flags);
162
163 error = gfs2_write_alloc_required(ip,
164 (uint64_t)index << PAGE_CACHE_SHIFT,
165 PAGE_CACHE_SIZE, &alloc_required);
166 if (error)
167 goto out;
168
169 result = filemap_nopage(area, address, type);
170 if (!result || result == NOPAGE_OOM)
171 goto out;
172
173 if (alloc_required) {
174 error = alloc_page_backing(ip, result);
175 if (error) {
176 page_cache_release(result);
177 result = NULL;
178 goto out;
179 }
180 set_page_dirty(result);
181 }
182
183 pfault_be_greedy(ip);
184
185 out:
186 gfs2_glock_dq_uninit(&i_gh);
187
188 return result;
189}
190
191struct vm_operations_struct gfs2_vm_ops_private = {
192 .nopage = gfs2_private_nopage,
193};
194
195struct vm_operations_struct gfs2_vm_ops_sharewrite = {
196 .nopage = gfs2_sharewrite_nopage,
197};
198
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..54e3a8769cbb
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,16 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __OPS_VM_DOT_H__
11#define __OPS_VM_DOT_H__
12
13extern struct vm_operations_struct gfs2_vm_ops_private;
14extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
15
16#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/page.c b/fs/gfs2/page.c
new file mode 100644
index 000000000000..a2c9e93c7c39
--- /dev/null
+++ b/fs/gfs2/page.c
@@ -0,0 +1,283 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/pagemap.h>
16#include <linux/mm.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/semaphore.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "bmap.h"
24#include "inode.h"
25#include "page.h"
26#include "trans.h"
27#include "ops_address.h"
28#include "util.h"
29
30/**
31 * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
32 * @gl: the glock
33 *
34 */
35
36void gfs2_pte_inval(struct gfs2_glock *gl)
37{
38 struct gfs2_inode *ip;
39 struct inode *inode;
40
41 ip = gl->gl_object;
42 if (!ip || !S_ISREG(ip->i_di.di_mode))
43 return;
44
45 if (!test_bit(GIF_PAGED, &ip->i_flags))
46 return;
47
48 inode = gfs2_ip2v_lookup(ip);
49 if (inode) {
50 unmap_shared_mapping_range(inode->i_mapping, 0, 0);
51 iput(inode);
52
53 if (test_bit(GIF_SW_PAGED, &ip->i_flags))
54 set_bit(GLF_DIRTY, &gl->gl_flags);
55 }
56
57 clear_bit(GIF_SW_PAGED, &ip->i_flags);
58}
59
60/**
61 * gfs2_page_inval - Invalidate all pages associated with a glock
62 * @gl: the glock
63 *
64 */
65
66void gfs2_page_inval(struct gfs2_glock *gl)
67{
68 struct gfs2_inode *ip;
69 struct inode *inode;
70
71 ip = gl->gl_object;
72 if (!ip || !S_ISREG(ip->i_di.di_mode))
73 return;
74
75 inode = gfs2_ip2v_lookup(ip);
76 if (inode) {
77 struct address_space *mapping = inode->i_mapping;
78
79 truncate_inode_pages(mapping, 0);
80 gfs2_assert_withdraw(ip->i_sbd, !mapping->nrpages);
81
82 iput(inode);
83 }
84
85 clear_bit(GIF_PAGED, &ip->i_flags);
86}
87
88/**
89 * gfs2_page_sync - Sync the data pages (not metadata) associated with a glock
90 * @gl: the glock
91 * @flags: DIO_START | DIO_WAIT
92 *
93 * Syncs data (not metadata) for a regular file.
94 * No-op for all other types.
95 */
96
97void gfs2_page_sync(struct gfs2_glock *gl, int flags)
98{
99 struct gfs2_inode *ip;
100 struct inode *inode;
101
102 ip = gl->gl_object;
103 if (!ip || !S_ISREG(ip->i_di.di_mode))
104 return;
105
106 inode = gfs2_ip2v_lookup(ip);
107 if (inode) {
108 struct address_space *mapping = inode->i_mapping;
109 int error = 0;
110
111 if (flags & DIO_START)
112 filemap_fdatawrite(mapping);
113 if (!error && (flags & DIO_WAIT))
114 error = filemap_fdatawait(mapping);
115
116 /* Put back any errors cleared by filemap_fdatawait()
117 so they can be caught by someone who can pass them
118 up to user space. */
119
120 if (error == -ENOSPC)
121 set_bit(AS_ENOSPC, &mapping->flags);
122 else if (error)
123 set_bit(AS_EIO, &mapping->flags);
124
125 iput(inode);
126 }
127}
128
129/**
130 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
131 * @ip: the inode
132 * @dibh: the dinode buffer
133 * @block: the block number that was allocated
134 * @private: any locked page held by the caller process
135 *
136 * Returns: errno
137 */
138
139int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
140 uint64_t block, void *private)
141{
142 struct gfs2_sbd *sdp = ip->i_sbd;
143 struct inode *inode = ip->i_vnode;
144 struct page *page = (struct page *)private;
145 struct buffer_head *bh;
146 int release = 0;
147
148 if (!page || page->index) {
149 page = grab_cache_page(inode->i_mapping, 0);
150 if (!page)
151 return -ENOMEM;
152 release = 1;
153 }
154
155 if (!PageUptodate(page)) {
156 void *kaddr = kmap(page);
157
158 memcpy(kaddr,
159 dibh->b_data + sizeof(struct gfs2_dinode),
160 ip->i_di.di_size);
161 memset(kaddr + ip->i_di.di_size,
162 0,
163 PAGE_CACHE_SIZE - ip->i_di.di_size);
164 kunmap(page);
165
166 SetPageUptodate(page);
167 }
168
169 if (!page_has_buffers(page))
170 create_empty_buffers(page, 1 << inode->i_blkbits,
171 (1 << BH_Uptodate));
172
173 bh = page_buffers(page);
174
175 if (!buffer_mapped(bh))
176 map_bh(bh, inode->i_sb, block);
177
178 set_buffer_uptodate(bh);
179 if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip))
180 gfs2_trans_add_bh(ip->i_gl, bh, 0);
181 mark_buffer_dirty(bh);
182
183 if (release) {
184 unlock_page(page);
185 page_cache_release(page);
186 }
187
188 return 0;
189}
190
191/**
192 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
193 * @mapping: the address space of the inode being truncated
194 * This is partly borrowed from ext3.
195 */
196int gfs2_block_truncate_page(struct address_space *mapping)
197{
198 struct inode *inode = mapping->host;
199 struct gfs2_inode *ip = inode->u.generic_ip;
200 struct gfs2_sbd *sdp = ip->i_sbd;
201 loff_t from = inode->i_size;
202 unsigned long index = from >> PAGE_CACHE_SHIFT;
203 unsigned offset = from & (PAGE_CACHE_SIZE-1);
204 unsigned blocksize, iblock, length, pos;
205 struct buffer_head *bh;
206 struct page *page;
207 void *kaddr;
208 int err;
209
210 page = grab_cache_page(mapping, index);
211 if (!page)
212 return 0;
213
214 blocksize = inode->i_sb->s_blocksize;
215 length = blocksize - (offset & (blocksize - 1));
216 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
217
218 if (!page_has_buffers(page))
219 create_empty_buffers(page, blocksize, 0);
220
221 /* Find the buffer that contains "offset" */
222 bh = page_buffers(page);
223 pos = blocksize;
224 while (offset >= pos) {
225 bh = bh->b_this_page;
226 iblock++;
227 pos += blocksize;
228 }
229
230 err = 0;
231
232 if (!buffer_mapped(bh)) {
233 gfs2_get_block(inode, iblock, bh, 0);
234 /* unmapped? It's a hole - nothing to do */
235 if (!buffer_mapped(bh))
236 goto unlock;
237 }
238
239 /* Ok, it's mapped. Make sure it's up-to-date */
240 if (PageUptodate(page))
241 set_buffer_uptodate(bh);
242
243 if (!buffer_uptodate(bh)) {
244 err = -EIO;
245 ll_rw_block(READ, 1, &bh);
246 wait_on_buffer(bh);
247 /* Uhhuh. Read error. Complain and punt. */
248 if (!buffer_uptodate(bh))
249 goto unlock;
250 }
251
252 if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
253 gfs2_trans_add_bh(ip->i_gl, bh, 0);
254
255 kaddr = kmap_atomic(page, KM_USER0);
256 memset(kaddr + offset, 0, length);
257 flush_dcache_page(page);
258 kunmap_atomic(kaddr, KM_USER0);
259
260unlock:
261 unlock_page(page);
262 page_cache_release(page);
263 return err;
264}
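
The zeroing above hinges on two pieces of arithmetic: "offset" locates i_size within its page, and "length" is the distance from i_size to the end of the containing block. A minimal worked example, using illustrative sizes (4096-byte pages and 1024-byte blocks are assumptions for the example, not values taken from this commit):

	/* Illustrative numbers only: PAGE_CACHE_SIZE = 4096,
	 * blocksize = 1024, i_size = 5500.
	 *   index  = 5500 >> 12           = 1     (second page of the file)
	 *   offset = 5500 & 4095          = 1404  (byte within that page)
	 *   length = 1024 - (1404 & 1023) = 644   (bytes zeroed to block end)
	 * The while loop then advances past the first 1024-byte buffer
	 * (1404 >= 1024) and lands on the second, the one holding "offset".
	 */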
265
266void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
267 unsigned int from, unsigned int to)
268{
269 struct buffer_head *head = page_buffers(page);
270 unsigned int bsize = head->b_size;
271 struct buffer_head *bh;
272 unsigned int start, end;
273
274 for (bh = head, start = 0;
275 bh != head || !start;
276 bh = bh->b_this_page, start = end) {
277 end = start + bsize;
278 if (end <= from || start >= to)
279 continue;
280 gfs2_trans_add_bh(ip->i_gl, bh, 0);
281 }
282}
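
gfs2_page_add_databufs() adds to the current transaction only those page buffers that overlap the byte range [from, to). A sketch of the overlap test with illustrative numbers:

	/* Illustrative numbers only: a 4096-byte page with four
	 * 1024-byte buffers, from = 1500, to = 2600.
	 *   buffer 0: bytes [   0, 1024)  end <= from  -> skipped
	 *   buffer 1: bytes [1024, 2048)  overlaps     -> gfs2_trans_add_bh()
	 *   buffer 2: bytes [2048, 3072)  overlaps     -> gfs2_trans_add_bh()
	 *   buffer 3: bytes [3072, 4096)  start >= to  -> skipped
	 */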
283
diff --git a/fs/gfs2/page.h b/fs/gfs2/page.h
new file mode 100644
index 000000000000..346e296420c6
--- /dev/null
+++ b/fs/gfs2/page.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __PAGE_DOT_H__
11#define __PAGE_DOT_H__
12
13void gfs2_pte_inval(struct gfs2_glock *gl);
14void gfs2_page_inval(struct gfs2_glock *gl);
15void gfs2_page_sync(struct gfs2_glock *gl, int flags);
16
17int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
18 uint64_t block, void *private);
19int gfs2_block_truncate_page(struct address_space *mapping);
20void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
21 unsigned int from, unsigned int to);
22
23#endif /* __PAGE_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..90e32a3dc50d
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1303 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10/*
11 * Quota change tags are associated with each transaction that allocates or
12 * deallocates space. Those changes are accumulated locally on each node (in a
13 * per-node file) and then are periodically synced to the quota file. This
14 * avoids the bottleneck of constantly touching the quota file, but introduces
15 * fuzziness in the current usage value of IDs that are being used on different
16 * nodes in the cluster simultaneously. So, it is possible for a user on
17 * multiple nodes to overrun their quota, but that overrun is controllable.
18 * Since quota tags are part of transactions, there is no need for a quota-check
19 * program to be run after node crashes or anything like that.
20 *
21 * There are a couple of knobs that let the administrator manage the quota
22 * fuzziness. "quota_quantum" sets the maximum time a quota change can be
23 * sitting on one node before being synced to the quota file. (The default is
24 * 60 seconds.) Another knob, "quota_scale", controls how quickly the frequency
25 * of quota file syncs increases as the user moves closer to their limit. The
26 * more frequent the syncs, the more accurate the quota enforcement, but that
27 * means that there is more contention between the nodes for the quota file.
28 * The default value is one. This sets the maximum theoretical quota overrun
29 * (with infinitely many nodes of infinite bandwidth) to twice the user's limit. (In
30 * practice, the maximum overrun you see should be much less.) A "quota_scale"
31 * number greater than one makes quota syncs more frequent and reduces the
32 * maximum overrun. Numbers less than one (but greater than zero) make quota
33 * syncs less frequent.
34 *
35 * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
36 * the quota file, so it is not being constantly read.
37 */
38
39#include <linux/sched.h>
40#include <linux/slab.h>
41#include <linux/spinlock.h>
42#include <linux/completion.h>
43#include <linux/buffer_head.h>
44#include <linux/tty.h>
45#include <linux/sort.h>
46#include <linux/fs.h>
47#include <linux/gfs2_ondisk.h>
48#include <asm/semaphore.h>
49
50#include "gfs2.h"
51#include "lm_interface.h"
52#include "incore.h"
53#include "bmap.h"
54#include "glock.h"
55#include "glops.h"
56#include "log.h"
57#include "lvb.h"
58#include "meta_io.h"
59#include "quota.h"
60#include "rgrp.h"
61#include "super.h"
62#include "trans.h"
63#include "inode.h"
64#include "ops_file.h"
65#include "ops_address.h"
66#include "util.h"
67
68#define QUOTA_USER 1
69#define QUOTA_GROUP 0
70
71static uint64_t qd2offset(struct gfs2_quota_data *qd)
72{
73 uint64_t offset;
74
75 offset = 2 * (uint64_t)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
76 offset *= sizeof(struct gfs2_quota);
77
78 return offset;
79}
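
qd2offset() encodes the layout of the quota file: user and group records for the same ID are interleaved, with users at even record numbers and groups at odd ones. The record size below is an illustrative assumption; the real value is sizeof(struct gfs2_quota) from the on-disk format:

	/* Illustrative numbers only, pretending
	 * sizeof(struct gfs2_quota) == 32:
	 *   user  id 0 -> record 0 -> offset   0
	 *   group id 0 -> record 1 -> offset  32
	 *   user  id 1 -> record 2 -> offset  64
	 *   group id 1 -> record 3 -> offset  96
	 */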
80
81static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id,
82 struct gfs2_quota_data **qdp)
83{
84 struct gfs2_quota_data *qd;
85 int error;
86
87 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
88 if (!qd)
89 return -ENOMEM;
90
91 qd->qd_count = 1;
92 qd->qd_id = id;
93 if (user)
94 set_bit(QDF_USER, &qd->qd_flags);
95 qd->qd_slot = -1;
96
97 error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user,
98 &gfs2_quota_glops, CREATE, &qd->qd_gl);
99 if (error)
100 goto fail;
101
102 error = gfs2_lvb_hold(qd->qd_gl);
103 gfs2_glock_put(qd->qd_gl);
104 if (error)
105 goto fail;
106
107 *qdp = qd;
108
109 return 0;
110
111 fail:
112 kfree(qd);
113 return error;
114}
115
116static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
117 struct gfs2_quota_data **qdp)
118{
119 struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
120 int error, found;
121
122 *qdp = NULL;
123
124 for (;;) {
125 found = 0;
126 spin_lock(&sdp->sd_quota_spin);
127 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
128 if (qd->qd_id == id &&
129 !test_bit(QDF_USER, &qd->qd_flags) == !user) {
130 qd->qd_count++;
131 found = 1;
132 break;
133 }
134 }
135
136 if (!found)
137 qd = NULL;
138
139 if (!qd && new_qd) {
140 qd = new_qd;
141 list_add(&qd->qd_list, &sdp->sd_quota_list);
142 atomic_inc(&sdp->sd_quota_count);
143 new_qd = NULL;
144 }
145
146 spin_unlock(&sdp->sd_quota_spin);
147
148 if (qd || !create) {
149 if (new_qd) {
150 gfs2_lvb_unhold(new_qd->qd_gl);
151 kfree(new_qd);
152 }
153 *qdp = qd;
154 return 0;
155 }
156
157 error = qd_alloc(sdp, user, id, &new_qd);
158 if (error)
159 return error;
160 }
161}
162
163static void qd_hold(struct gfs2_quota_data *qd)
164{
165 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
166
167 spin_lock(&sdp->sd_quota_spin);
168 gfs2_assert(sdp, qd->qd_count);
169 qd->qd_count++;
170 spin_unlock(&sdp->sd_quota_spin);
171}
172
173static void qd_put(struct gfs2_quota_data *qd)
174{
175 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
176 spin_lock(&sdp->sd_quota_spin);
177 gfs2_assert(sdp, qd->qd_count);
178 if (!--qd->qd_count)
179 qd->qd_last_touched = jiffies;
180 spin_unlock(&sdp->sd_quota_spin);
181}
182
183static int slot_get(struct gfs2_quota_data *qd)
184{
185 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
186 unsigned int c, o = 0, b;
187 unsigned char byte = 0;
188
189 spin_lock(&sdp->sd_quota_spin);
190
191 if (qd->qd_slot_count++) {
192 spin_unlock(&sdp->sd_quota_spin);
193 return 0;
194 }
195
196 for (c = 0; c < sdp->sd_quota_chunks; c++)
197 for (o = 0; o < PAGE_SIZE; o++) {
198 byte = sdp->sd_quota_bitmap[c][o];
199 if (byte != 0xFF)
200 goto found;
201 }
202
203 goto fail;
204
205 found:
206 for (b = 0; b < 8; b++)
207 if (!(byte & (1 << b)))
208 break;
209 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
210
211 if (qd->qd_slot >= sdp->sd_quota_slots)
212 goto fail;
213
214 sdp->sd_quota_bitmap[c][o] |= 1 << b;
215
216 spin_unlock(&sdp->sd_quota_spin);
217
218 return 0;
219
220 fail:
221 qd->qd_slot_count--;
222 spin_unlock(&sdp->sd_quota_spin);
223 return -ENOSPC;
224}
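
slot_get() scans the bitmap one byte at a time and converts the first clear bit it finds into a slot number. With illustrative values (PAGE_SIZE = 4096 is an assumption for the example):

	/* Illustrative numbers only, PAGE_SIZE = 4096:
	 *   first clear bit found at chunk c = 1, byte o = 2, bit b = 5
	 *   slot = 1 * (8 * 4096) + 2 * 8 + 5 = 32789
	 * Each chunk thus covers 32768 slots and each byte within it
	 * covers 8; the slot is rejected if it exceeds sd_quota_slots.
	 */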
225
226static void slot_hold(struct gfs2_quota_data *qd)
227{
228 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
229
230 spin_lock(&sdp->sd_quota_spin);
231 gfs2_assert(sdp, qd->qd_slot_count);
232 qd->qd_slot_count++;
233 spin_unlock(&sdp->sd_quota_spin);
234}
235
236static void slot_put(struct gfs2_quota_data *qd)
237{
238 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
239
240 spin_lock(&sdp->sd_quota_spin);
241 gfs2_assert(sdp, qd->qd_slot_count);
242 if (!--qd->qd_slot_count) {
243 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
244 qd->qd_slot = -1;
245 }
246 spin_unlock(&sdp->sd_quota_spin);
247}
248
249static int bh_get(struct gfs2_quota_data *qd)
250{
251 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
252 struct gfs2_inode *ip = sdp->sd_qc_inode->u.generic_ip;
253 unsigned int block, offset;
254 uint64_t dblock;
255 int new = 0;
256 struct buffer_head *bh;
257 int error;
258
259 mutex_lock(&sdp->sd_quota_mutex);
260
261 if (qd->qd_bh_count++) {
262 mutex_unlock(&sdp->sd_quota_mutex);
263 return 0;
264 }
265
266 block = qd->qd_slot / sdp->sd_qc_per_block;
267	offset = qd->qd_slot % sdp->sd_qc_per_block;
268
269 error = gfs2_block_map(ip, block, &new, &dblock, NULL);
270 if (error)
271 goto fail;
272 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
273 if (error)
274 goto fail;
275 error = -EIO;
276 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
277 goto fail_brelse;
278
279 qd->qd_bh = bh;
280 qd->qd_bh_qc = (struct gfs2_quota_change *)
281 (bh->b_data + sizeof(struct gfs2_meta_header) +
282 offset * sizeof(struct gfs2_quota_change));
283
284	mutex_unlock(&sdp->sd_quota_mutex);
285
286 return 0;
287
288 fail_brelse:
289 brelse(bh);
290
291 fail:
292 qd->qd_bh_count--;
293 mutex_unlock(&sdp->sd_quota_mutex);
294 return error;
295}
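
bh_get() maps a quota slot back to its location in the quota-change file by simple division: the quotient picks the file block and the remainder picks the record within it. The per-block record count below is a stand-in, not a value computed in this hunk:

	/* Illustrative numbers only, pretending sd_qc_per_block == 100:
	 *   qd_slot = 347 -> block  = 347 / 100 = 3
	 *                    offset = 347 % 100 = 47
	 * so the change record is the 48th entry in the 4th quota-change
	 * block, just past the gfs2_meta_header at the block's start.
	 */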
296
297static void bh_put(struct gfs2_quota_data *qd)
298{
299 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
300
301 mutex_lock(&sdp->sd_quota_mutex);
302 gfs2_assert(sdp, qd->qd_bh_count);
303 if (!--qd->qd_bh_count) {
304 brelse(qd->qd_bh);
305 qd->qd_bh = NULL;
306 qd->qd_bh_qc = NULL;
307 }
308 mutex_unlock(&sdp->sd_quota_mutex);
309}
310
311static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
312{
313 struct gfs2_quota_data *qd = NULL;
314 int error;
315 int found = 0;
316
317 *qdp = NULL;
318
319 if (sdp->sd_vfs->s_flags & MS_RDONLY)
320 return 0;
321
322 spin_lock(&sdp->sd_quota_spin);
323
324 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
325 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
326 !test_bit(QDF_CHANGE, &qd->qd_flags) ||
327 qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
328 continue;
329
330 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
331
332 set_bit(QDF_LOCKED, &qd->qd_flags);
333 gfs2_assert_warn(sdp, qd->qd_count);
334 qd->qd_count++;
335 qd->qd_change_sync = qd->qd_change;
336 gfs2_assert_warn(sdp, qd->qd_slot_count);
337 qd->qd_slot_count++;
338 found = 1;
339
340 break;
341 }
342
343 if (!found)
344 qd = NULL;
345
346 spin_unlock(&sdp->sd_quota_spin);
347
348 if (qd) {
349 gfs2_assert_warn(sdp, qd->qd_change_sync);
350 error = bh_get(qd);
351 if (error) {
352 clear_bit(QDF_LOCKED, &qd->qd_flags);
353 slot_put(qd);
354 qd_put(qd);
355 return error;
356 }
357 }
358
359 *qdp = qd;
360
361 return 0;
362}
363
364static int qd_trylock(struct gfs2_quota_data *qd)
365{
366 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
367
368 if (sdp->sd_vfs->s_flags & MS_RDONLY)
369 return 0;
370
371 spin_lock(&sdp->sd_quota_spin);
372
373 if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
374 !test_bit(QDF_CHANGE, &qd->qd_flags)) {
375 spin_unlock(&sdp->sd_quota_spin);
376 return 0;
377 }
378
379 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
380
381 set_bit(QDF_LOCKED, &qd->qd_flags);
382 gfs2_assert_warn(sdp, qd->qd_count);
383 qd->qd_count++;
384 qd->qd_change_sync = qd->qd_change;
385 gfs2_assert_warn(sdp, qd->qd_slot_count);
386 qd->qd_slot_count++;
387
388 spin_unlock(&sdp->sd_quota_spin);
389
390 gfs2_assert_warn(sdp, qd->qd_change_sync);
391 if (bh_get(qd)) {
392 clear_bit(QDF_LOCKED, &qd->qd_flags);
393 slot_put(qd);
394 qd_put(qd);
395 return 0;
396 }
397
398 return 1;
399}
400
401static void qd_unlock(struct gfs2_quota_data *qd)
402{
403 gfs2_assert_warn(qd->qd_gl->gl_sbd,
404 test_bit(QDF_LOCKED, &qd->qd_flags));
405 clear_bit(QDF_LOCKED, &qd->qd_flags);
406 bh_put(qd);
407 slot_put(qd);
408 qd_put(qd);
409}
410
411static int qdsb_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create,
412 struct gfs2_quota_data **qdp)
413{
414 int error;
415
416 error = qd_get(sdp, user, id, create, qdp);
417 if (error)
418 return error;
419
420 error = slot_get(*qdp);
421 if (error)
422 goto fail;
423
424 error = bh_get(*qdp);
425 if (error)
426 goto fail_slot;
427
428 return 0;
429
430 fail_slot:
431 slot_put(*qdp);
432
433 fail:
434 qd_put(*qdp);
435 return error;
436}
437
438static void qdsb_put(struct gfs2_quota_data *qd)
439{
440 bh_put(qd);
441 slot_put(qd);
442 qd_put(qd);
443}
444
445int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
446{
447 struct gfs2_sbd *sdp = ip->i_sbd;
448 struct gfs2_alloc *al = &ip->i_alloc;
449 struct gfs2_quota_data **qd = al->al_qd;
450 int error;
451
452 if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
453 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
454 return -EIO;
455
456 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
457 return 0;
458
459 error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
460 if (error)
461 goto out;
462 al->al_qd_num++;
463 qd++;
464
465 error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
466 if (error)
467 goto out;
468 al->al_qd_num++;
469 qd++;
470
471 if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
472 error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
473 if (error)
474 goto out;
475 al->al_qd_num++;
476 qd++;
477 }
478
479 if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
480 error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
481 if (error)
482 goto out;
483 al->al_qd_num++;
484 qd++;
485 }
486
487 out:
488 if (error)
489 gfs2_quota_unhold(ip);
490
491 return error;
492}
493
494void gfs2_quota_unhold(struct gfs2_inode *ip)
495{
496 struct gfs2_sbd *sdp = ip->i_sbd;
497 struct gfs2_alloc *al = &ip->i_alloc;
498 unsigned int x;
499
500 gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
501
502 for (x = 0; x < al->al_qd_num; x++) {
503 qdsb_put(al->al_qd[x]);
504 al->al_qd[x] = NULL;
505 }
506 al->al_qd_num = 0;
507}
508
509static int sort_qd(const void *a, const void *b)
510{
511 struct gfs2_quota_data *qd_a = *(struct gfs2_quota_data **)a;
512 struct gfs2_quota_data *qd_b = *(struct gfs2_quota_data **)b;
513 int ret = 0;
514
515 if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
516 !test_bit(QDF_USER, &qd_b->qd_flags)) {
517 if (test_bit(QDF_USER, &qd_a->qd_flags))
518 ret = -1;
519 else
520 ret = 1;
521 } else {
522 if (qd_a->qd_id < qd_b->qd_id)
523 ret = -1;
524 else if (qd_a->qd_id > qd_b->qd_id)
525 ret = 1;
526 }
527
528 return ret;
529}
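
sort_qd() orders quota structures with all user IDs before all group IDs, ascending by ID within each class. Both gfs2_quota_lock() and do_sync() sort with this comparator before acquiring the per-ID glocks, which looks like the standard lock-ordering discipline for avoiding ABBA deadlocks between concurrent lockers. An illustration of the resulting order:

	/* Illustrative result of sorting four entries with sort_qd():
	 *   user 100, user 205, group 7, group 100
	 * (users first, then groups; ascending IDs within each class)
	 */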
530
531static void do_qc(struct gfs2_quota_data *qd, int64_t change)
532{
533 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
534 struct gfs2_inode *ip = sdp->sd_qc_inode->u.generic_ip;
535 struct gfs2_quota_change *qc = qd->qd_bh_qc;
536 int64_t x;
537
538 mutex_lock(&sdp->sd_quota_mutex);
539 gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
540
541 if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
542 qc->qc_change = 0;
543 qc->qc_flags = 0;
544 if (test_bit(QDF_USER, &qd->qd_flags))
545 qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
546 qc->qc_id = cpu_to_be32(qd->qd_id);
547 }
548
549 x = qc->qc_change;
550 x = be64_to_cpu(x) + change;
551 qc->qc_change = cpu_to_be64(x);
552
553 spin_lock(&sdp->sd_quota_spin);
554 qd->qd_change = x;
555 spin_unlock(&sdp->sd_quota_spin);
556
557 if (!x) {
558 gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
559 clear_bit(QDF_CHANGE, &qd->qd_flags);
560 qc->qc_flags = 0;
561 qc->qc_id = 0;
562 slot_put(qd);
563 qd_put(qd);
564 } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
565 qd_hold(qd);
566 slot_hold(qd);
567 }
568
569 mutex_unlock(&sdp->sd_quota_mutex);
570}
571
572/**
573 * gfs2_adjust_quota - adjust an on-disk quota record by @change
574 *
575 * This function was mostly borrowed from gfs2_block_truncate_page, which was
576 * in turn mostly borrowed from ext3.
577 */
578static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
579 int64_t change, struct gfs2_quota_data *qd)
580{
581 struct inode *inode = ip->i_vnode;
582 struct address_space *mapping = inode->i_mapping;
583 unsigned long index = loc >> PAGE_CACHE_SHIFT;
584	unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
585 unsigned blocksize, iblock, pos;
586 struct buffer_head *bh;
587 struct page *page;
588 void *kaddr;
589 __be64 *ptr;
590 u64 value;
591 int err = -EIO;
592
593 page = grab_cache_page(mapping, index);
594 if (!page)
595 return -ENOMEM;
596
597 blocksize = inode->i_sb->s_blocksize;
598 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
599
600 if (!page_has_buffers(page))
601 create_empty_buffers(page, blocksize, 0);
602
603 bh = page_buffers(page);
604 pos = blocksize;
605 while (offset >= pos) {
606 bh = bh->b_this_page;
607 iblock++;
608 pos += blocksize;
609 }
610
611 if (!buffer_mapped(bh)) {
612 gfs2_get_block(inode, iblock, bh, 1);
613 if (!buffer_mapped(bh))
614 goto unlock;
615 }
616
617 if (PageUptodate(page))
618 set_buffer_uptodate(bh);
619
620 if (!buffer_uptodate(bh)) {
621 ll_rw_block(READ, 1, &bh);
622 wait_on_buffer(bh);
623 if (!buffer_uptodate(bh))
624 goto unlock;
625 }
626
627 gfs2_trans_add_bh(ip->i_gl, bh, 0);
628
629 kaddr = kmap_atomic(page, KM_USER0);
630 ptr = (__be64 *)(kaddr + offset);
631 value = *ptr = cpu_to_be64(be64_to_cpu(*ptr) + change);
632 flush_dcache_page(page);
633 kunmap_atomic(kaddr, KM_USER0);
634 err = 0;
635 qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
636#if 0
637 qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit);
638 qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn);
639#endif
640 qd->qd_qb.qb_value = cpu_to_be64(value);
641unlock:
642 unlock_page(page);
643 page_cache_release(page);
644 return err;
645}
646
647static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
648{
649 struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
650 struct gfs2_inode *ip = sdp->sd_quota_inode->u.generic_ip;
651 unsigned int data_blocks, ind_blocks;
652 struct file_ra_state ra_state;
653 struct gfs2_holder *ghs, i_gh;
654 unsigned int qx, x;
655 struct gfs2_quota_data *qd;
656 loff_t offset;
657 unsigned int nalloc = 0;
658 struct gfs2_alloc *al = NULL;
659 int error;
660
661 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
662 &data_blocks, &ind_blocks);
663
664 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
665 if (!ghs)
666 return -ENOMEM;
667
668 sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
669 for (qx = 0; qx < num_qd; qx++) {
670 error = gfs2_glock_nq_init(qda[qx]->qd_gl,
671 LM_ST_EXCLUSIVE,
672 GL_NOCACHE, &ghs[qx]);
673 if (error)
674 goto out;
675 }
676
677 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
678 if (error)
679 goto out;
680
681 for (x = 0; x < num_qd; x++) {
682 int alloc_required;
683
684 offset = qd2offset(qda[x]);
685 error = gfs2_write_alloc_required(ip, offset,
686 sizeof(struct gfs2_quota),
687 &alloc_required);
688 if (error)
689 goto out_gunlock;
690 if (alloc_required)
691 nalloc++;
692 }
693
694 if (nalloc) {
695 al = gfs2_alloc_get(ip);
696
697 al->al_requested = nalloc * (data_blocks + ind_blocks);
698
699 error = gfs2_inplace_reserve(ip);
700 if (error)
701 goto out_alloc;
702
703 error = gfs2_trans_begin(sdp,
704 al->al_rgd->rd_ri.ri_length +
705 num_qd * data_blocks +
706 nalloc * ind_blocks +
707 RES_DINODE + num_qd +
708 RES_STATFS, 0);
709 if (error)
710 goto out_ipres;
711 } else {
712 error = gfs2_trans_begin(sdp,
713 num_qd * data_blocks +
714 RES_DINODE + num_qd, 0);
715 if (error)
716 goto out_gunlock;
717 }
718
719 file_ra_state_init(&ra_state, ip->i_vnode->i_mapping);
720 for (x = 0; x < num_qd; x++) {
721 qd = qda[x];
722 offset = qd2offset(qd);
723 error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
724 (struct gfs2_quota_data *)
725 qd->qd_gl->gl_lvb);
726 if (error)
727 goto out_end_trans;
728
729 do_qc(qd, -qd->qd_change_sync);
730 }
731
732 error = 0;
733
734 out_end_trans:
735 gfs2_trans_end(sdp);
736
737 out_ipres:
738 if (nalloc)
739 gfs2_inplace_release(ip);
740
741 out_alloc:
742 if (nalloc)
743 gfs2_alloc_put(ip);
744
745 out_gunlock:
746 gfs2_glock_dq_uninit(&i_gh);
747
748 out:
749 while (qx--)
750 gfs2_glock_dq_uninit(&ghs[qx]);
751 kfree(ghs);
752 gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
753
754 return error;
755}
756
757static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
758 struct gfs2_holder *q_gh)
759{
760 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
761 struct gfs2_inode *ip = sdp->sd_quota_inode->u.generic_ip;
762 struct gfs2_holder i_gh;
763 struct gfs2_quota q;
764 char buf[sizeof(struct gfs2_quota)];
765 struct file_ra_state ra_state;
766 int error;
767
768 file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
769 restart:
770 error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
771 if (error)
772 return error;
773
774 gfs2_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb);
775
776 if (force_refresh || qd->qd_qb.qb_magic != GFS2_MAGIC) {
777 loff_t pos;
778 gfs2_glock_dq_uninit(q_gh);
779 error = gfs2_glock_nq_init(qd->qd_gl,
780 LM_ST_EXCLUSIVE, GL_NOCACHE,
781 q_gh);
782 if (error)
783 return error;
784
785 error = gfs2_glock_nq_init(ip->i_gl,
786 LM_ST_SHARED, 0,
787 &i_gh);
788 if (error)
789 goto fail;
790
791 memset(buf, 0, sizeof(struct gfs2_quota));
792 pos = qd2offset(qd);
793 error = gfs2_internal_read(ip,
794 &ra_state, buf,
795 &pos,
796 sizeof(struct gfs2_quota));
797 if (error < 0)
798 goto fail_gunlock;
799
800 gfs2_glock_dq_uninit(&i_gh);
801
802 gfs2_quota_in(&q, buf);
803
804 memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb));
805 qd->qd_qb.qb_magic = GFS2_MAGIC;
806 qd->qd_qb.qb_limit = q.qu_limit;
807 qd->qd_qb.qb_warn = q.qu_warn;
808 qd->qd_qb.qb_value = q.qu_value;
809
810 gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb);
811
812 if (gfs2_glock_is_blocking(qd->qd_gl)) {
813 gfs2_glock_dq_uninit(q_gh);
814 force_refresh = 0;
815 goto restart;
816 }
817 }
818
819 return 0;
820
821 fail_gunlock:
822 gfs2_glock_dq_uninit(&i_gh);
823
824 fail:
825 gfs2_glock_dq_uninit(q_gh);
826
827 return error;
828}
829
830int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
831{
832 struct gfs2_sbd *sdp = ip->i_sbd;
833 struct gfs2_alloc *al = &ip->i_alloc;
834 unsigned int x;
835 int error = 0;
836
837 gfs2_quota_hold(ip, uid, gid);
838
839 if (capable(CAP_SYS_RESOURCE) ||
840 sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
841 return 0;
842
843 sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
844 sort_qd, NULL);
845
846 for (x = 0; x < al->al_qd_num; x++) {
847 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
848 if (error)
849 break;
850 }
851
852 if (!error)
853 set_bit(GIF_QD_LOCKED, &ip->i_flags);
854 else {
855 while (x--)
856 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
857 gfs2_quota_unhold(ip);
858 }
859
860 return error;
861}
862
863static int need_sync(struct gfs2_quota_data *qd)
864{
865 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
866 struct gfs2_tune *gt = &sdp->sd_tune;
867 int64_t value;
868 unsigned int num, den;
869 int do_sync = 1;
870
871 if (!qd->qd_qb.qb_limit)
872 return 0;
873
874 spin_lock(&sdp->sd_quota_spin);
875 value = qd->qd_change;
876 spin_unlock(&sdp->sd_quota_spin);
877
878 spin_lock(&gt->gt_spin);
879 num = gt->gt_quota_scale_num;
880 den = gt->gt_quota_scale_den;
881 spin_unlock(&gt->gt_spin);
882
883 if (value < 0)
884 do_sync = 0;
885 else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit)
886 do_sync = 0;
887 else {
888 value *= gfs2_jindex_size(sdp) * num;
889 do_div(value, den);
890 value += qd->qd_qb.qb_value;
891 if (value < (int64_t)qd->qd_qb.qb_limit)
892 do_sync = 0;
893 }
894
895 return do_sync;
896}
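
need_sync() turns the "quota_scale" knob described at the top of this file into a concrete test: the local change is scaled by the journal count times num/den, added to the last synced value, and compared against the limit. A worked example with illustrative numbers:

	/* Illustrative numbers only:
	 *   qb_value = 900 (last synced usage), qb_limit = 1000,
	 *   local qd_change = 30, 4 journals, scale num/den = 1/1.
	 *   projected = 900 + 30 * 4 * 1/1 = 1020 >= 1000  -> sync now
	 * With num/den = 2/1 the sync would already trigger at
	 * qd_change = 13 (900 + 13 * 4 * 2 = 1004), tightening
	 * enforcement at the cost of more quota-file contention,
	 * as the comment at the top of this file describes.
	 */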
897
898void gfs2_quota_unlock(struct gfs2_inode *ip)
899{
900 struct gfs2_alloc *al = &ip->i_alloc;
901 struct gfs2_quota_data *qda[4];
902 unsigned int count = 0;
903 unsigned int x;
904
905 if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
906 goto out;
907
908 for (x = 0; x < al->al_qd_num; x++) {
909 struct gfs2_quota_data *qd;
910 int sync;
911
912 qd = al->al_qd[x];
913 sync = need_sync(qd);
914
915 gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
916
917 if (sync && qd_trylock(qd))
918 qda[count++] = qd;
919 }
920
921 if (count) {
922 do_sync(count, qda);
923 for (x = 0; x < count; x++)
924 qd_unlock(qda[x]);
925 }
926
927 out:
928 gfs2_quota_unhold(ip);
929}
930
931#define MAX_LINE 256
932
933static int print_message(struct gfs2_quota_data *qd, char *type)
934{
935 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
936 char *line;
937 int len;
938
939 line = kmalloc(MAX_LINE, GFP_KERNEL);
940 if (!line)
941 return -ENOMEM;
942
943 len = snprintf(line, MAX_LINE-1,
944 "GFS2: fsid=%s: quota %s for %s %u\r\n",
945 sdp->sd_fsname, type,
946 (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
947 qd->qd_id);
948 line[MAX_LINE-1] = 0;
949
950 if (current->signal) { /* Is this test still required? */
951 tty_write_message(current->signal->tty, line);
952 }
953
954 kfree(line);
955
956 return 0;
957}
958
959int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid)
960{
961 struct gfs2_sbd *sdp = ip->i_sbd;
962 struct gfs2_alloc *al = &ip->i_alloc;
963 struct gfs2_quota_data *qd;
964 int64_t value;
965 unsigned int x;
966 int error = 0;
967
968 if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
969 return 0;
970
971 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
972 return 0;
973
974 for (x = 0; x < al->al_qd_num; x++) {
975 qd = al->al_qd[x];
976
977 if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
978 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
979 continue;
980
981 value = qd->qd_qb.qb_value;
982 spin_lock(&sdp->sd_quota_spin);
983 value += qd->qd_change;
984 spin_unlock(&sdp->sd_quota_spin);
985
986 if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) {
987 print_message(qd, "exceeded");
988 error = -EDQUOT;
989 break;
990 } else if (qd->qd_qb.qb_warn &&
991 (int64_t)qd->qd_qb.qb_warn < value &&
992 time_after_eq(jiffies, qd->qd_last_warn +
993 gfs2_tune_get(sdp,
994 gt_quota_warn_period) * HZ)) {
995 error = print_message(qd, "warning");
996 qd->qd_last_warn = jiffies;
997 }
998 }
999
1000 return error;
1001}
1002
1003void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
1004 uint32_t uid, uint32_t gid)
1005{
1006 struct gfs2_alloc *al = &ip->i_alloc;
1007 struct gfs2_quota_data *qd;
1008 unsigned int x;
1009 unsigned int found = 0;
1010
1011 if (gfs2_assert_warn(ip->i_sbd, change))
1012 return;
1013 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
1014 return;
1015
1016 for (x = 0; x < al->al_qd_num; x++) {
1017 qd = al->al_qd[x];
1018
1019 if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
1020 (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
1021 do_qc(qd, change);
1022 found++;
1023 }
1024 }
1025}
1026
1027int gfs2_quota_sync(struct gfs2_sbd *sdp)
1028{
1029 struct gfs2_quota_data **qda;
1030 unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
1031 unsigned int num_qd;
1032 unsigned int x;
1033 int error = 0;
1034
1035 sdp->sd_quota_sync_gen++;
1036
1037 qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
1038 if (!qda)
1039 return -ENOMEM;
1040
1041 do {
1042 num_qd = 0;
1043
1044 for (;;) {
1045 error = qd_fish(sdp, qda + num_qd);
1046 if (error || !qda[num_qd])
1047 break;
1048 if (++num_qd == max_qd)
1049 break;
1050 }
1051
1052 if (num_qd) {
1053 if (!error)
1054 error = do_sync(num_qd, qda);
1055 if (!error)
1056 for (x = 0; x < num_qd; x++)
1057 qda[x]->qd_sync_gen =
1058 sdp->sd_quota_sync_gen;
1059
1060 for (x = 0; x < num_qd; x++)
1061 qd_unlock(qda[x]);
1062 }
1063 } while (!error && num_qd == max_qd);
1064
1065 kfree(qda);
1066
1067 return error;
1068}
1069
1070int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id)
1071{
1072 struct gfs2_quota_data *qd;
1073 struct gfs2_holder q_gh;
1074 int error;
1075
1076 error = qd_get(sdp, user, id, CREATE, &qd);
1077 if (error)
1078 return error;
1079
1080 error = do_glock(qd, FORCE, &q_gh);
1081 if (!error)
1082 gfs2_glock_dq_uninit(&q_gh);
1083
1084 qd_put(qd);
1085
1086 return error;
1087}
1088
1089int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
1090 struct gfs2_quota *q)
1091{
1092 struct gfs2_quota_data *qd;
1093 struct gfs2_holder q_gh;
1094 int error;
1095
1096 if (((user) ? (id != current->fsuid) : (!in_group_p(id))) &&
1097 !capable(CAP_SYS_ADMIN))
1098 return -EACCES;
1099
1100 error = qd_get(sdp, user, id, CREATE, &qd);
1101 if (error)
1102 return error;
1103
1104 error = do_glock(qd, NO_FORCE, &q_gh);
1105 if (error)
1106 goto out;
1107
1108 memset(q, 0, sizeof(struct gfs2_quota));
1109 q->qu_limit = qd->qd_qb.qb_limit;
1110 q->qu_warn = qd->qd_qb.qb_warn;
1111 q->qu_value = qd->qd_qb.qb_value;
1112
1113 spin_lock(&sdp->sd_quota_spin);
1114 q->qu_value += qd->qd_change;
1115 spin_unlock(&sdp->sd_quota_spin);
1116
1117 gfs2_glock_dq_uninit(&q_gh);
1118
1119 out:
1120 qd_put(qd);
1121
1122 return error;
1123}
1124
1125int gfs2_quota_init(struct gfs2_sbd *sdp)
1126{
1127 struct gfs2_inode *ip = sdp->sd_qc_inode->u.generic_ip;
1128 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
1129 unsigned int x, slot = 0;
1130 unsigned int found = 0;
1131 uint64_t dblock;
1132 uint32_t extlen = 0;
1133 int error;
1134
1135 if (!ip->i_di.di_size ||
1136 ip->i_di.di_size > (64 << 20) ||
1137 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
1138 gfs2_consist_inode(ip);
1139 return -EIO;
1140 }
1141 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1142 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1143
1144 error = -ENOMEM;
1145
1146 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1147 sizeof(unsigned char *), GFP_KERNEL);
1148 if (!sdp->sd_quota_bitmap)
1149 return error;
1150
1151 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1152 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
1153 if (!sdp->sd_quota_bitmap[x])
1154 goto fail;
1155 }
1156
1157 for (x = 0; x < blocks; x++) {
1158 struct buffer_head *bh;
1159 unsigned int y;
1160
1161 if (!extlen) {
1162 int new = 0;
1163 error = gfs2_block_map(ip, x, &new, &dblock, &extlen);
1164 if (error)
1165 goto fail;
1166 }
1167 gfs2_meta_ra(ip->i_gl, dblock, extlen);
1168 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
1169 &bh);
1170 if (error)
1171 goto fail;
1172 error = -EIO;
1173 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
1174 brelse(bh);
1175 goto fail;
1176 }
1177
1178 for (y = 0;
1179 y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1180 y++, slot++) {
1181 struct gfs2_quota_change qc;
1182 struct gfs2_quota_data *qd;
1183
1184 gfs2_quota_change_in(&qc, bh->b_data +
1185 sizeof(struct gfs2_meta_header) +
1186 y * sizeof(struct gfs2_quota_change));
1187 if (!qc.qc_change)
1188 continue;
1189
1190 error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
1191 qc.qc_id, &qd);
1192 if (error) {
1193 brelse(bh);
1194 goto fail;
1195 }
1196
1197 set_bit(QDF_CHANGE, &qd->qd_flags);
1198 qd->qd_change = qc.qc_change;
1199 qd->qd_slot = slot;
1200 qd->qd_slot_count = 1;
1201 qd->qd_last_touched = jiffies;
1202
1203 spin_lock(&sdp->sd_quota_spin);
1204 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
1205 list_add(&qd->qd_list, &sdp->sd_quota_list);
1206 atomic_inc(&sdp->sd_quota_count);
1207 spin_unlock(&sdp->sd_quota_spin);
1208
1209 found++;
1210 }
1211
1212 brelse(bh);
1213 dblock++;
1214 extlen--;
1215 }
1216
1217 if (found)
1218 fs_info(sdp, "found %u quota changes\n", found);
1219
1220 return 0;
1221
1222 fail:
1223 gfs2_quota_cleanup(sdp);
1224 return error;
1225}
1226
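gfs2_quota_init() above sizes one slot per struct gfs2_quota_change record in the quota-change file, and one bitmap bit per slot, grouped into PAGE_SIZE chunks. The same arithmetic in isolation; the block, header and record sizes below are illustrative stand-ins, not the on-disk constants:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Hypothetical geometry: 4 KiB blocks, a 24-byte meta header,
	 * 16-byte quota-change records, and a 16-block qc file. */
	unsigned int bsize = 4096, hdr = 24, rec = 16, blocks = 16;
	unsigned int page_size = 4096;

	unsigned int qc_per_block = (bsize - hdr) / rec;	/* 254 */
	unsigned int slots = blocks * qc_per_block;		/* 4064 */
	/* One page of bitmap covers 8 * PAGE_SIZE slots. */
	unsigned int chunks = DIV_ROUND_UP(slots, 8 * page_size); /* 1 */

	printf("%u slots in %u bitmap chunk(s)\n", slots, chunks);
	return 0;
}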
1227void gfs2_quota_scan(struct gfs2_sbd *sdp)
1228{
1229 struct gfs2_quota_data *qd, *safe;
1230 LIST_HEAD(dead);
1231
1232 spin_lock(&sdp->sd_quota_spin);
1233 list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
1234 if (!qd->qd_count &&
1235 time_after_eq(jiffies, qd->qd_last_touched +
1236 gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
1237 list_move(&qd->qd_list, &dead);
1238 gfs2_assert_warn(sdp,
1239 atomic_read(&sdp->sd_quota_count) > 0);
1240 atomic_dec(&sdp->sd_quota_count);
1241 }
1242 }
1243 spin_unlock(&sdp->sd_quota_spin);
1244
1245 while (!list_empty(&dead)) {
1246 qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
1247 list_del(&qd->qd_list);
1248
1249 gfs2_assert_warn(sdp, !qd->qd_change);
1250 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1251 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1252
1253 gfs2_lvb_unhold(qd->qd_gl);
1254 kfree(qd);
1255 }
1256}
1257
1258void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1259{
1260 struct list_head *head = &sdp->sd_quota_list;
1261 struct gfs2_quota_data *qd;
1262 unsigned int x;
1263
1264 spin_lock(&sdp->sd_quota_spin);
1265 while (!list_empty(head)) {
1266 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1267
1268 if (qd->qd_count > 1 ||
1269 (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1270 list_move(&qd->qd_list, head);
1271 spin_unlock(&sdp->sd_quota_spin);
1272 schedule();
1273 spin_lock(&sdp->sd_quota_spin);
1274 continue;
1275 }
1276
1277 list_del(&qd->qd_list);
1278 atomic_dec(&sdp->sd_quota_count);
1279 spin_unlock(&sdp->sd_quota_spin);
1280
1281 if (!qd->qd_count) {
1282 gfs2_assert_warn(sdp, !qd->qd_change);
1283 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1284 } else
1285 gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
1286 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1287
1288 gfs2_lvb_unhold(qd->qd_gl);
1289 kfree(qd);
1290
1291 spin_lock(&sdp->sd_quota_spin);
1292 }
1293 spin_unlock(&sdp->sd_quota_spin);
1294
1295 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1296
1297 if (sdp->sd_quota_bitmap) {
1298 for (x = 0; x < sdp->sd_quota_chunks; x++)
1299 kfree(sdp->sd_quota_bitmap[x]);
1300 kfree(sdp->sd_quota_bitmap);
1301 }
1302}
1303
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..005529f6895d
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __QUOTA_DOT_H__
11#define __QUOTA_DOT_H__
12
13#define NO_QUOTA_CHANGE ((uint32_t)-1)
14
15int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
16void gfs2_quota_unhold(struct gfs2_inode *ip);
17
18int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
19void gfs2_quota_unlock(struct gfs2_inode *ip);
20
21int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid);
22void gfs2_quota_change(struct gfs2_inode *ip, int64_t change,
23 uint32_t uid, uint32_t gid);
24
25int gfs2_quota_sync(struct gfs2_sbd *sdp);
26int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id);
27int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id,
28 struct gfs2_quota *q);
29
30int gfs2_quota_init(struct gfs2_sbd *sdp);
31void gfs2_quota_scan(struct gfs2_sbd *sdp);
32void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
33
34#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..e91c2bda6c32
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,577 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "glock.h"
24#include "glops.h"
25#include "lm.h"
26#include "lops.h"
27#include "meta_io.h"
28#include "recovery.h"
29#include "super.h"
30#include "util.h"
31#include "dir.h"
32
33int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
34 struct buffer_head **bh)
35{
36 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
37 struct gfs2_glock *gl = ip->i_gl;
38 int new = 0;
39 uint64_t dblock;
40 uint32_t extlen;
41 int error;
42
43 error = gfs2_block_map(ip, blk, &new, &dblock,
44 &extlen);
45 if (error)
46 return error;
47 if (!dblock) {
48 gfs2_consist_inode(ip);
49 return -EIO;
50 }
51
52 gfs2_meta_ra(gl, dblock, extlen);
53 error = gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bh);
54
55 return error;
56}
57
58int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
59{
60 struct list_head *head = &sdp->sd_revoke_list;
61 struct gfs2_revoke_replay *rr;
62 int found = 0;
63
64 list_for_each_entry(rr, head, rr_list) {
65 if (rr->rr_blkno == blkno) {
66 found = 1;
67 break;
68 }
69 }
70
71 if (found) {
72 rr->rr_where = where;
73 return 0;
74 }
75
76 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
77 if (!rr)
78 return -ENOMEM;
79
80 rr->rr_blkno = blkno;
81 rr->rr_where = where;
82 list_add(&rr->rr_list, head);
83
84 return 1;
85}
86
87int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where)
88{
89 struct gfs2_revoke_replay *rr;
90 int wrap, a, b, revoke;
91 int found = 0;
92
93 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
94 if (rr->rr_blkno == blkno) {
95 found = 1;
96 break;
97 }
98 }
99
100 if (!found)
101 return 0;
102
103 wrap = (rr->rr_where < sdp->sd_replay_tail);
104 a = (sdp->sd_replay_tail < where);
105 b = (where < rr->rr_where);
106 revoke = (wrap) ? (a || b) : (a && b);
107
108 return revoke;
109}
110
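The wrap/a/b computation above is an interval-membership test on a circular journal: a revoke recorded at rr_where covers a metadata entry at `where` only if the entry lies between the replay tail and the revoke, taking wrap-around into account. The same predicate as a standalone sketch with sample block numbers:

#include <stdio.h>

static int revoked(unsigned int tail, unsigned int rr_where,
		   unsigned int where)
{
	int wrap = rr_where < tail;
	int a = tail < where;
	int b = where < rr_where;

	return wrap ? (a || b) : (a && b);
}

int main(void)
{
	/* No wrap: tail 10, revoke at 50; an entry at 30 is covered. */
	printf("%d\n", revoked(10, 50, 30));	/* 1 */
	/* Wrapped: tail 90, revoke at 20; an entry at 95 is covered. */
	printf("%d\n", revoked(90, 20, 95));	/* 1 */
	/* An entry at 50 lies outside the wrapped interval. */
	printf("%d\n", revoked(90, 20, 50));	/* 0 */
	return 0;
}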
111void gfs2_revoke_clean(struct gfs2_sbd *sdp)
112{
113 struct list_head *head = &sdp->sd_revoke_list;
114 struct gfs2_revoke_replay *rr;
115
116 while (!list_empty(head)) {
117 rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
118 list_del(&rr->rr_list);
119 kfree(rr);
120 }
121}
122
123/**
124 * get_log_header - read the log header for a given segment
125 * @jd: the journal
126 * @blk: the block to look at
127 * @head: the log header to return
128 *
129 * Read the log header for a given segment in a given journal. Do a few
130 * sanity checks on it.
131 *
132 * Returns: 0 on success,
133 * 1 if the header was invalid or incomplete,
134 * errno on error
135 */
136
137static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
138 struct gfs2_log_header *head)
139{
140 struct buffer_head *bh;
141 struct gfs2_log_header lh;
142 uint32_t hash;
143 int error;
144
145 error = gfs2_replay_read_block(jd, blk, &bh);
146 if (error)
147 return error;
148
149 memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
150 lh.lh_hash = 0;
151 hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
152 gfs2_log_header_in(&lh, bh->b_data);
153
154 brelse(bh);
155
156 if (lh.lh_header.mh_magic != GFS2_MAGIC ||
157 lh.lh_header.mh_type != GFS2_METATYPE_LH ||
158 lh.lh_blkno != blk ||
159 lh.lh_hash != hash)
160 return 1;
161
162 *head = lh;
163
164 return 0;
165}
166
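get_log_header() uses the usual self-checksum pattern: the stored hash was computed over the header with the hash field itself zeroed, so verification zeroes that field in a copy before rehashing. A minimal sketch of the pattern; the FNV-1a digest below is a toy stand-in for gfs2_disk_hash():

#include <stdint.h>
#include <stdio.h>

struct hdr {
	uint32_t magic;
	uint32_t seq;
	uint32_t hash;		/* zeroed while hashing */
};

/* Toy digest standing in for gfs2_disk_hash(); any deterministic
 * function demonstrates the pattern. */
static uint32_t digest(const unsigned char *p, size_t len)
{
	uint32_t h = 0x811c9dc5u;

	while (len--)
		h = (h ^ *p++) * 0x01000193u;
	return h;
}

static uint32_t hdr_digest(struct hdr h)
{
	h.hash = 0;		/* the field is excluded from its own hash */
	return digest((unsigned char *)&h, sizeof(h));
}

int main(void)
{
	struct hdr h = { 0x01161970, 42, 0 };

	h.hash = hdr_digest(h);				/* writer side */
	printf("valid=%d\n", hdr_digest(h) == h.hash);	/* reader side: 1 */
	return 0;
}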
167/**
168 * find_good_lh - find a good log header
169 * @jd: the journal
170 * @blk: the segment to start searching from
171 * @head: the log header to fill in
172 * (the search runs forward only, wrapping at the end of the journal)
173 *
174 * Call get_log_header() to get a log header for a segment, but if the
175 * segment is bad, scan forward until we find a good one.
176 *
177 * Returns: errno
178 */
179
180static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
181 struct gfs2_log_header *head)
182{
183 unsigned int orig_blk = *blk;
184 int error;
185
186 for (;;) {
187 error = get_log_header(jd, *blk, head);
188 if (error <= 0)
189 return error;
190
191 if (++*blk == jd->jd_blocks)
192 *blk = 0;
193
194 if (*blk == orig_blk) {
195 gfs2_consist_inode(jd->jd_inode->u.generic_ip);
196 return -EIO;
197 }
198 }
199}
200
201/**
202 * jhead_scan - make sure we've found the head of the log
203 * @jd: the journal
204 * @head: this is filled in with the log descriptor of the head
205 *
206 * At this point, @head should be either the head of the log or just
207 * before it. Scan forward until we find the actual head.
208 *
209 * Returns: errno
210 */
211
212static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
213{
214 unsigned int blk = head->lh_blkno;
215 struct gfs2_log_header lh;
216 int error;
217
218 for (;;) {
219 if (++blk == jd->jd_blocks)
220 blk = 0;
221
222 error = get_log_header(jd, blk, &lh);
223 if (error < 0)
224 return error;
225 if (error == 1)
226 continue;
227
228 if (lh.lh_sequence == head->lh_sequence) {
229 gfs2_consist_inode(jd->jd_inode->u.generic_ip);
230 return -EIO;
231 }
232 if (lh.lh_sequence < head->lh_sequence)
233 break;
234
235 *head = lh;
236 }
237
238 return 0;
239}
240
241/**
242 * gfs2_find_jhead - find the head of a log
243 * @jd: the journal
244 * @head: the log descriptor for the head of the log is returned here
245 *
246 * Do a binary search of a journal and find the valid log entry with the
247 * highest sequence number. (i.e. the log head)
248 *
249 * Returns: errno
250 */
251
252int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
253{
254 struct gfs2_log_header lh_1, lh_m;
255 uint32_t blk_1, blk_2, blk_m;
256 int error;
257
258 blk_1 = 0;
259 blk_2 = jd->jd_blocks - 1;
260
261 for (;;) {
262 blk_m = (blk_1 + blk_2) / 2;
263
264 error = find_good_lh(jd, &blk_1, &lh_1);
265 if (error)
266 return error;
267
268 error = find_good_lh(jd, &blk_m, &lh_m);
269 if (error)
270 return error;
271
272 if (blk_1 == blk_m || blk_m == blk_2)
273 break;
274
275 if (lh_1.lh_sequence <= lh_m.lh_sequence)
276 blk_1 = blk_m;
277 else
278 blk_2 = blk_m;
279 }
280
281 error = jhead_scan(jd, &lh_1);
282 if (error)
283 return error;
284
285 *head = lh_1;
286
287 return error;
288}
289
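Sequence numbers increase with block number until the log wraps, so the head (the highest sequence) sits just before the drop; the bisection above narrows in on that drop and jhead_scan() then walks forward to be exact. The same idea over a plain array, without the bad-header handling:

#include <stdio.h>

static unsigned int find_head(const unsigned int *seq, unsigned int n)
{
	unsigned int lo = 0, hi = n - 1;

	for (;;) {
		unsigned int mid = (lo + hi) / 2;

		if (lo == mid || mid == hi)
			break;
		if (seq[lo] <= seq[mid])
			lo = mid;	/* the drop is right of mid */
		else
			hi = mid;	/* the drop is left of mid */
	}
	/* Finish linearly, as jhead_scan() does. */
	while (lo + 1 < n && seq[lo + 1] > seq[lo])
		lo++;
	return lo;
}

int main(void)
{
	/* A log that wrapped after index 4: the head is seq 25. */
	unsigned int seq[] = { 21, 22, 23, 24, 25, 18, 19, 20 };

	printf("head at block %u\n", find_head(seq, 8));	/* 4 */
	return 0;
}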
290/**
291 * foreach_descriptor - go through the active part of the log
292 * @jd: the journal
293 * @start: the first log header in the active region
294 * @end: the last log header (don't process the contents of this entry)
295 *
296 * Walk the active portion of the log, passing each log descriptor's
297 * elements to lops_scan_elements() for the given pass.
298 *
299 * Returns: errno
300 */
301
302static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
303 unsigned int end, int pass)
304{
305 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
306 struct gfs2_sbd *sdp = ip->i_sbd;
307 struct buffer_head *bh;
308 struct gfs2_log_descriptor *ld;
309 int error = 0;
310 u32 length;
311 __be64 *ptr;
312 unsigned int offset = sizeof(struct gfs2_log_descriptor);
313 offset += (sizeof(__be64)-1);
314 offset &= ~(sizeof(__be64)-1);
315
316 while (start != end) {
317 error = gfs2_replay_read_block(jd, start, &bh);
318 if (error)
319 return error;
320 if (gfs2_meta_check(sdp, bh)) {
321 brelse(bh);
322 return -EIO;
323 }
324 ld = (struct gfs2_log_descriptor *)bh->b_data;
325 length = be32_to_cpu(ld->ld_length);
326
327 if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
328 struct gfs2_log_header lh;
329 error = get_log_header(jd, start, &lh);
330 if (!error) {
331 gfs2_replay_incr_blk(sdp, &start);
332 continue;
333 }
334 if (error == 1) {
335 gfs2_consist_inode(jd->jd_inode->u.generic_ip);
336 error = -EIO;
337 }
338 brelse(bh);
339 return error;
340 } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
341 brelse(bh);
342 return -EIO;
343 }
344 ptr = (__be64 *)(bh->b_data + offset);
345 error = lops_scan_elements(jd, start, ld, ptr, pass);
346 if (error) {
347 brelse(bh);
348 return error;
349 }
350
351 while (length--)
352 gfs2_replay_incr_blk(sdp, &start);
353
354 brelse(bh);
355 }
356
357 return 0;
358}
359
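The two statements computing `offset` above are the standard power-of-two align-up idiom: add (alignment - 1), then mask off the low bits. In isolation:

#include <stdio.h>

static unsigned int align_up(unsigned int x, unsigned int a) /* a: power of 2 */
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	/* The descriptor payload is rounded up to the next multiple of
	 * sizeof(__be64) so the __be64 pointers are naturally aligned. */
	printf("%u %u %u\n",
	       align_up(30, 8), align_up(32, 8), align_up(33, 8));
	/* prints: 32 32 40 */
	return 0;
}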
360/**
361 * clean_journal - mark a dirty journal as being clean
362 * @jd: the journal
363 * @head: the current head of the journal
364 *
365 * Writes a new unmount log header past @head to mark the journal clean.
366 *
367 * Returns: errno
368 */
369
370static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
371{
372 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
373 struct gfs2_sbd *sdp = ip->i_sbd;
374 unsigned int lblock;
375 int new = 0;
376 uint64_t dblock;
377 struct gfs2_log_header *lh;
378 uint32_t hash;
379 struct buffer_head *bh;
380 int error;
381
382 lblock = head->lh_blkno;
383 gfs2_replay_incr_blk(sdp, &lblock);
384 error = gfs2_block_map(ip, lblock, &new, &dblock, NULL);
385 if (error)
386 return error;
387 if (!dblock) {
388 gfs2_consist_inode(ip);
389 return -EIO;
390 }
391
392 bh = sb_getblk(sdp->sd_vfs, dblock);
393 lock_buffer(bh);
394 memset(bh->b_data, 0, bh->b_size);
395 set_buffer_uptodate(bh);
396 clear_buffer_dirty(bh);
397 unlock_buffer(bh);
398
399 lh = (struct gfs2_log_header *)bh->b_data;
400 memset(lh, 0, sizeof(struct gfs2_log_header));
401 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
402 lh->lh_header.mh_type = cpu_to_be16(GFS2_METATYPE_LH);
403 lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
404 lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
405 lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
406 lh->lh_blkno = cpu_to_be32(lblock);
407 hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
408 lh->lh_hash = cpu_to_be32(hash);
409
410 set_buffer_dirty(bh);
411 if (sync_dirty_buffer(bh))
412 gfs2_io_error_bh(sdp, bh);
413 brelse(bh);
414
415 return error;
416}
417
418/**
419 * gfs2_recover_journal - recover a given journal
420 * @jd: the struct gfs2_jdesc describing the journal
421 *
422 * Acquire the journal's lock, check to see if the journal is clean, and
423 * do recovery if necessary.
424 *
425 * Returns: errno
426 */
427
428int gfs2_recover_journal(struct gfs2_jdesc *jd)
429{
430 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
431 struct gfs2_sbd *sdp = ip->i_sbd;
432 struct gfs2_log_header head;
433 struct gfs2_holder j_gh, ji_gh, t_gh;
434 unsigned long t;
435 int ro = 0;
436 unsigned int pass;
437 int error;
438
439 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", jd->jd_jid);
440
441 /* Acquire the journal lock so we can do recovery */
442
443 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
444 LM_ST_EXCLUSIVE,
445 LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
446 &j_gh);
447 switch (error) {
448 case 0:
449 break;
450
451 case GLR_TRYFAILED:
452 fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
453 error = 0;
454
455 default:
456 goto fail;
457 }
458
459 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
460 LM_FLAG_NOEXP, &ji_gh);
461 if (error)
462 goto fail_gunlock_j;
463
464 fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
465
466 error = gfs2_jdesc_check(jd);
467 if (error)
468 goto fail_gunlock_ji;
469
470 error = gfs2_find_jhead(jd, &head);
471 if (error)
472 goto fail_gunlock_ji;
473
474 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
475 fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
476 jd->jd_jid);
477
478 t = jiffies;
479
480 /* Acquire a shared hold on the transaction lock */
481
482 error = gfs2_glock_nq_init(sdp->sd_trans_gl,
483 LM_ST_SHARED,
484 LM_FLAG_NOEXP |
485 LM_FLAG_PRIORITY |
486 GL_NEVER_RECURSE |
487 GL_NOCANCEL |
488 GL_NOCACHE,
489 &t_gh);
490 if (error)
491 goto fail_gunlock_ji;
492
493 if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
494 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
495 ro = 1;
496 } else {
497 if (sdp->sd_vfs->s_flags & MS_RDONLY)
498 ro = 1;
499 }
500
501 if (ro) {
502 fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
503 jd->jd_jid);
504 error = -EROFS;
505 goto fail_gunlock_tr;
506 }
507
508 fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
509
510 for (pass = 0; pass < 2; pass++) {
511 lops_before_scan(jd, &head, pass);
512 error = foreach_descriptor(jd, head.lh_tail,
513 head.lh_blkno, pass);
514 lops_after_scan(jd, error, pass);
515 if (error)
516 goto fail_gunlock_tr;
517 }
518
519 error = clean_journal(jd, &head);
520 if (error)
521 goto fail_gunlock_tr;
522
523 gfs2_glock_dq_uninit(&t_gh);
524
525 t = DIV_ROUND_UP(jiffies - t, HZ);
526
527 fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
528 jd->jd_jid, t);
529 }
530
531 gfs2_glock_dq_uninit(&ji_gh);
532
533 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
534
535 gfs2_glock_dq_uninit(&j_gh);
536
537 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
538
539 return 0;
540
541 fail_gunlock_tr:
542 gfs2_glock_dq_uninit(&t_gh);
543
544 fail_gunlock_ji:
545 gfs2_glock_dq_uninit(&ji_gh);
546
547 fail_gunlock_j:
548 gfs2_glock_dq_uninit(&j_gh);
549
550 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
551
552 fail:
553 gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
554
555 return error;
556}
557
558/**
559 * gfs2_check_journals - Recover any dirty journals
560 * @sdp: the filesystem
561 *
562 */
563
564void gfs2_check_journals(struct gfs2_sbd *sdp)
565{
566 struct gfs2_jdesc *jd;
567
568 for (;;) {
569 jd = gfs2_jdesc_find_dirty(sdp);
570 if (!jd)
571 break;
572
573 if (jd != sdp->sd_jdesc)
574 gfs2_recover_journal(jd);
575 }
576}
577
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..248481189300
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,32 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RECOVERY_DOT_H__
11#define __RECOVERY_DOT_H__
12
13static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
14{
15 if (++*blk == sdp->sd_jdesc->jd_blocks)
16 *blk = 0;
17}
18
19int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
20 struct buffer_head **bh);
21
22int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
23int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where);
24void gfs2_revoke_clean(struct gfs2_sbd *sdp);
25
26int gfs2_find_jhead(struct gfs2_jdesc *jd,
27 struct gfs2_log_header *head);
28int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
29void gfs2_check_journals(struct gfs2_sbd *sdp);
30
31#endif /* __RECOVERY_DOT_H__ */
32
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..4ae559694396
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1369 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/fs.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bits.h"
23#include "glock.h"
24#include "glops.h"
25#include "lops.h"
26#include "meta_io.h"
27#include "quota.h"
28#include "rgrp.h"
29#include "super.h"
30#include "trans.h"
31#include "ops_file.h"
32#include "util.h"
33
34/**
35 * gfs2_rgrp_verify - Verify that a resource group is consistent
36 * @rgd: the rgrp
37 *
38 * Counts blocks in each allocation state and checks the rgrp header totals.
39 */
40
41void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
42{
43 struct gfs2_sbd *sdp = rgd->rd_sbd;
44 struct gfs2_bitmap *bi = NULL;
45 uint32_t length = rgd->rd_ri.ri_length;
46 uint32_t count[4], tmp;
47 int buf, x;
48
49 memset(count, 0, 4 * sizeof(uint32_t));
50
51 /* Count # blocks in each of 4 possible allocation states */
52 for (buf = 0; buf < length; buf++) {
53 bi = rgd->rd_bits + buf;
54 for (x = 0; x < 4; x++)
55 count[x] += gfs2_bitcount(rgd,
56 bi->bi_bh->b_data +
57 bi->bi_offset,
58 bi->bi_len, x);
59 }
60
61 if (count[0] != rgd->rd_rg.rg_free) {
62 if (gfs2_consist_rgrpd(rgd))
63 fs_err(sdp, "free data mismatch: %u != %u\n",
64 count[0], rgd->rd_rg.rg_free);
65 return;
66 }
67
68 tmp = rgd->rd_ri.ri_data -
69 rgd->rd_rg.rg_free -
70 rgd->rd_rg.rg_dinodes;
71 if (count[1] != tmp) {
72 if (gfs2_consist_rgrpd(rgd))
73 fs_err(sdp, "used data mismatch: %u != %u\n",
74 count[1], tmp);
75 return;
76 }
77
78 if (count[2]) {
79 if (gfs2_consist_rgrpd(rgd))
80 fs_err(sdp, "free metadata mismatch: %u != 0\n",
81 count[2]);
82 return;
83 }
84
85 if (count[3] != rgd->rd_rg.rg_dinodes) {
86 if (gfs2_consist_rgrpd(rgd))
87 fs_err(sdp, "used metadata mismatch: %u != %u\n",
88 count[3], rgd->rd_rg.rg_dinodes);
89 return;
90 }
91}
92
93static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block)
94{
95 uint64_t first = ri->ri_data0;
96 uint64_t last = first + ri->ri_data;
97 return !!(first <= block && block < last);
98}
99
100/**
101 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
102 * @sdp: The GFS2 superblock
103 * @blk: The data block number
104 *
105 * Returns: The resource group, or NULL if not found
106 */
107
108struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk)
109{
110 struct gfs2_rgrpd *rgd;
111
112 spin_lock(&sdp->sd_rindex_spin);
113
114 list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
115 if (rgrp_contains_block(&rgd->rd_ri, blk)) {
116 list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
117 spin_unlock(&sdp->sd_rindex_spin);
118 return rgd;
119 }
120 }
121
122 spin_unlock(&sdp->sd_rindex_spin);
123
124 return NULL;
125}
126
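The lookup above is a move-to-front cache: a hit is moved to the head of the MRU list, so recently used resource groups are found early on the next call. The same pattern on a plain singly linked list:

#include <stdio.h>

struct node {
	int key;
	struct node *next;
};

/* On a hit, unlink the node and relink it at the head. */
static struct node *lookup_mru(struct node **head, int key)
{
	struct node **pp, *n;

	for (pp = head; (n = *pp) != NULL; pp = &n->next) {
		if (n->key != key)
			continue;
		*pp = n->next;
		n->next = *head;
		*head = n;
		return n;
	}
	return NULL;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a, *n;

	lookup_mru(&head, 3);			/* list becomes 3, 1, 2 */
	for (n = head; n; n = n->next)
		printf("%d ", n->key);
	printf("\n");
	return 0;
}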
127/**
128 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
129 * @sdp: The GFS2 superblock
130 *
131 * Returns: The first rgrp in the filesystem
132 */
133
134struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
135{
136 gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
137 return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
138}
139
140/**
141 * gfs2_rgrpd_get_next - get the next RG
142 * @rgd: A RG
143 *
144 * Returns: The next rgrp
145 */
146
147struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
148{
149 if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
150 return NULL;
151 return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
152}
153
154static void clear_rgrpdi(struct gfs2_sbd *sdp)
155{
156 struct list_head *head;
157 struct gfs2_rgrpd *rgd;
158 struct gfs2_glock *gl;
159
160 spin_lock(&sdp->sd_rindex_spin);
161 sdp->sd_rindex_forward = NULL;
162 head = &sdp->sd_rindex_recent_list;
163 while (!list_empty(head)) {
164 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
165 list_del(&rgd->rd_recent);
166 }
167 spin_unlock(&sdp->sd_rindex_spin);
168
169 head = &sdp->sd_rindex_list;
170 while (!list_empty(head)) {
171 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
172 gl = rgd->rd_gl;
173
174 list_del(&rgd->rd_list);
175 list_del(&rgd->rd_list_mru);
176
177 if (gl) {
178 gl->gl_object = NULL;
179 gfs2_glock_put(gl);
180 }
181
182 kfree(rgd->rd_bits);
183 kfree(rgd);
184 }
185}
186
187void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
188{
189 mutex_lock(&sdp->sd_rindex_mutex);
190 clear_rgrpdi(sdp);
191 mutex_unlock(&sdp->sd_rindex_mutex);
192}
193
194/**
195 * compute_bitstructs - Compute the bitmap sizes
196 * @rgd: The resource group descriptor
197 *
198 * Calculates bitmap descriptors, one for each block that contains bitmap data
199 *
200 * Returns: errno
201 */
202
203static int compute_bitstructs(struct gfs2_rgrpd *rgd)
204{
205 struct gfs2_sbd *sdp = rgd->rd_sbd;
206 struct gfs2_bitmap *bi;
207 uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
208 uint32_t bytes_left, bytes;
209 int x;
210
211 rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_KERNEL);
212 if (!rgd->rd_bits)
213 return -ENOMEM;
214
215 bytes_left = rgd->rd_ri.ri_bitbytes;
216
217 for (x = 0; x < length; x++) {
218 bi = rgd->rd_bits + x;
219
220 /* small rgrp; bitmap stored completely in header block */
221 if (length == 1) {
222 bytes = bytes_left;
223 bi->bi_offset = sizeof(struct gfs2_rgrp);
224 bi->bi_start = 0;
225 bi->bi_len = bytes;
226 /* header block */
227 } else if (x == 0) {
228 bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
229 bi->bi_offset = sizeof(struct gfs2_rgrp);
230 bi->bi_start = 0;
231 bi->bi_len = bytes;
232 /* last block */
233 } else if (x + 1 == length) {
234 bytes = bytes_left;
235 bi->bi_offset = sizeof(struct gfs2_meta_header);
236 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
237 bi->bi_len = bytes;
238 /* other blocks */
239 } else {
240 bytes = sdp->sd_sb.sb_bsize -
241 sizeof(struct gfs2_meta_header);
242 bi->bi_offset = sizeof(struct gfs2_meta_header);
243 bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
244 bi->bi_len = bytes;
245 }
246
247 bytes_left -= bytes;
248 }
249
250 if (bytes_left) {
251 gfs2_consist_rgrpd(rgd);
252 return -EIO;
253 }
254 bi = rgd->rd_bits + (length - 1);
255 if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
256 if (gfs2_consist_rgrpd(rgd)) {
257 gfs2_rindex_print(&rgd->rd_ri);
258 fs_err(sdp, "start=%u len=%u offset=%u\n",
259 bi->bi_start, bi->bi_len, bi->bi_offset);
260 }
261 return -EIO;
262 }
263
264 return 0;
265}
266
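A worked pass through the layout above with illustrative sizes (the real offsets are sizeof(struct gfs2_rgrp) and sizeof(struct gfs2_meta_header); the values below are stand-ins): the header block's bitmap starts after the rgrp header, middle blocks after a meta header, and the last block takes the remaining bytes, which must come out exactly to zero:

#include <stdio.h>

int main(void)
{
	unsigned int bsize = 4096, rgrp_hdr = 128, meta_hdr = 24;
	unsigned int length = 3;	/* ri_length: header + 2 blocks */
	unsigned int bitbytes = 12000;	/* ri_bitbytes */
	unsigned int left = bitbytes, x;

	for (x = 0; x < length; x++) {
		unsigned int offset, bytes;

		if (x == 0) {			/* header block */
			offset = rgrp_hdr;
			bytes = bsize - rgrp_hdr;
		} else if (x + 1 == length) {	/* last block: remainder */
			offset = meta_hdr;
			bytes = left;
		} else {			/* middle blocks */
			offset = meta_hdr;
			bytes = bsize - meta_hdr;
		}
		printf("block %u: offset=%u start=%u len=%u\n",
		       x, offset, bitbytes - left, bytes);
		left -= bytes;
	}
	printf("left=%u\n", left);	/* must be exactly 0 */
	return 0;
}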
267/**
268 * gfs2_ri_update - Pull in a new resource index from the disk
269 * @ip: the rindex inode
270 *
271 * Returns: 0 on successful update, error code otherwise
272 */
273
274static int gfs2_ri_update(struct gfs2_inode *ip)
275{
276 struct gfs2_sbd *sdp = ip->i_sbd;
277 struct inode *inode = ip->i_vnode;
278 struct gfs2_rgrpd *rgd;
279 char buf[sizeof(struct gfs2_rindex)];
280 struct file_ra_state ra_state;
281 uint64_t junk = ip->i_di.di_size;
282 int error;
283
284 if (do_div(junk, sizeof(struct gfs2_rindex))) {
285 gfs2_consist_inode(ip);
286 return -EIO;
287 }
288
289 clear_rgrpdi(sdp);
290
291 file_ra_state_init(&ra_state, inode->i_mapping);
292 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
293 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
294 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
295 sizeof(struct gfs2_rindex));
296 if (!error)
297 break;
298 if (error != sizeof(struct gfs2_rindex)) {
299 if (error > 0)
300 error = -EIO;
301 goto fail;
302 }
303
304 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_KERNEL);
305 error = -ENOMEM;
306 if (!rgd)
307 goto fail;
308
309 mutex_init(&rgd->rd_mutex);
310 lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
311 rgd->rd_sbd = sdp;
312
313 list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
314 list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
315
316 gfs2_rindex_in(&rgd->rd_ri, buf);
317
318 error = compute_bitstructs(rgd);
319 if (error)
320 goto fail;
321
322 error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
323 &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
324 if (error)
325 goto fail;
326
327 rgd->rd_gl->gl_object = rgd;
328 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
329 }
330
331 sdp->sd_rindex_vn = ip->i_gl->gl_vn;
332
333 return 0;
334
335 fail:
336 clear_rgrpdi(sdp);
337
338 return error;
339}
340
341/**
342 * gfs2_rindex_hold - Grab a lock on the rindex
343 * @sdp: The GFS2 superblock
344 * @ri_gh: the glock holder
345 *
346 * We grab a lock on the rindex inode to make sure that it doesn't
347 * change whilst we are performing an operation. We keep this lock
348 * for quite long periods of time compared to other locks. This
349 * doesn't matter, since it is shared and it is very, very rarely
350 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
351 *
352 * This makes sure that we're using the latest copy of the resource index
353 * special file, which might have been updated if someone expanded the
354 * filesystem (via gfs2_grow utility), which adds new resource groups.
355 *
356 * Returns: 0 on success, error code otherwise
357 */
358
359int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
360{
361 struct gfs2_inode *ip = sdp->sd_rindex->u.generic_ip;
362 struct gfs2_glock *gl = ip->i_gl;
363 int error;
364
365 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
366 if (error)
367 return error;
368
369 /* Read new copy from disk if we don't have the latest */
370 if (sdp->sd_rindex_vn != gl->gl_vn) {
371 mutex_lock(&sdp->sd_rindex_mutex);
372 if (sdp->sd_rindex_vn != gl->gl_vn) {
373 error = gfs2_ri_update(ip);
374 if (error)
375 gfs2_glock_dq_uninit(ri_gh);
376 }
377 mutex_unlock(&sdp->sd_rindex_mutex);
378 }
379
380 return error;
381}
382
383/**
384 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
385 * @rgd: the struct gfs2_rgrpd describing the RG to read in
386 *
387 * Read in all of a Resource Group's header and bitmap blocks.
388 * Caller must eventually call gfs2_rgrp_bh_put() to free the bitmaps.
389 *
390 * Returns: errno
391 */
392
393int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
394{
395 struct gfs2_sbd *sdp = rgd->rd_sbd;
396 struct gfs2_glock *gl = rgd->rd_gl;
397 unsigned int length = rgd->rd_ri.ri_length;
398 struct gfs2_bitmap *bi;
399 unsigned int x, y;
400 int error;
401
402 mutex_lock(&rgd->rd_mutex);
403
404 spin_lock(&sdp->sd_rindex_spin);
405 if (rgd->rd_bh_count) {
406 rgd->rd_bh_count++;
407 spin_unlock(&sdp->sd_rindex_spin);
408 mutex_unlock(&rgd->rd_mutex);
409 return 0;
410 }
411 spin_unlock(&sdp->sd_rindex_spin);
412
413 for (x = 0; x < length; x++) {
414 bi = rgd->rd_bits + x;
415 error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, DIO_START,
416 &bi->bi_bh);
417 if (error)
418 goto fail;
419 }
420
421 for (y = length; y--;) {
422 bi = rgd->rd_bits + y;
423 error = gfs2_meta_reread(sdp, bi->bi_bh, DIO_WAIT);
424 if (error)
425 goto fail;
426 if (gfs2_metatype_check(sdp, bi->bi_bh,
427 (y) ? GFS2_METATYPE_RB :
428 GFS2_METATYPE_RG)) {
429 error = -EIO;
430 goto fail;
431 }
432 }
433
434 if (rgd->rd_rg_vn != gl->gl_vn) {
435 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
436 rgd->rd_rg_vn = gl->gl_vn;
437 }
438
439 spin_lock(&sdp->sd_rindex_spin);
440 rgd->rd_free_clone = rgd->rd_rg.rg_free;
441 rgd->rd_bh_count++;
442 spin_unlock(&sdp->sd_rindex_spin);
443
444 mutex_unlock(&rgd->rd_mutex);
445
446 return 0;
447
448 fail:
449 while (x--) {
450 bi = rgd->rd_bits + x;
451 brelse(bi->bi_bh);
452 bi->bi_bh = NULL;
453 gfs2_assert_warn(sdp, !bi->bi_clone);
454 }
455 mutex_unlock(&rgd->rd_mutex);
456
457 return error;
458}
459
460void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
461{
462 struct gfs2_sbd *sdp = rgd->rd_sbd;
463
464 spin_lock(&sdp->sd_rindex_spin);
465 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
466 rgd->rd_bh_count++;
467 spin_unlock(&sdp->sd_rindex_spin);
468}
469
470/**
471 * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
472 * @rgd: the struct gfs2_rgrpd describing the RG to release
473 *
474 */
475
476void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
477{
478 struct gfs2_sbd *sdp = rgd->rd_sbd;
479 int x, length = rgd->rd_ri.ri_length;
480
481 spin_lock(&sdp->sd_rindex_spin);
482 gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
483 if (--rgd->rd_bh_count) {
484 spin_unlock(&sdp->sd_rindex_spin);
485 return;
486 }
487
488 for (x = 0; x < length; x++) {
489 struct gfs2_bitmap *bi = rgd->rd_bits + x;
490 kfree(bi->bi_clone);
491 bi->bi_clone = NULL;
492 brelse(bi->bi_bh);
493 bi->bi_bh = NULL;
494 }
495
496 spin_unlock(&sdp->sd_rindex_spin);
497}
498
499void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
500{
501 struct gfs2_sbd *sdp = rgd->rd_sbd;
502 unsigned int length = rgd->rd_ri.ri_length;
503 unsigned int x;
504
505 for (x = 0; x < length; x++) {
506 struct gfs2_bitmap *bi = rgd->rd_bits + x;
507 if (!bi->bi_clone)
508 continue;
509 memcpy(bi->bi_clone + bi->bi_offset,
510 bi->bi_bh->b_data + bi->bi_offset,
511 bi->bi_len);
512 }
513
514 spin_lock(&sdp->sd_rindex_spin);
515 rgd->rd_free_clone = rgd->rd_rg.rg_free;
516 spin_unlock(&sdp->sd_rindex_spin);
517}
518
519/**
520 * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
521 * @ip: the incore GFS2 inode structure
522 *
523 * Returns: the struct gfs2_alloc
524 */
525
526struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
527{
528 struct gfs2_alloc *al = &ip->i_alloc;
529
530 /* FIXME: Should assert that the correct locks are held here... */
531 memset(al, 0, sizeof(*al));
532 return al;
533}
534
535/**
536 * gfs2_alloc_put - throw away the struct gfs2_alloc for an inode
537 * @ip: the inode
538 *
539 */
540
541void gfs2_alloc_put(struct gfs2_inode *ip)
542{
543 return;
544}
545
546/**
547 * try_rgrp_fit - See if a given reservation will fit in a given RG
548 * @rgd: the RG data
549 * @al: the struct gfs2_alloc structure describing the reservation
550 *
551 * If there's room for the requested blocks to be allocated from the RG:
552 * Sets the @al_rgd field in @al, recording which rgrp the
553 * reservation will be satisfied from; the fit test compares the
554 * request against the rgrp's cloned free-block count.
555 *
556 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
557 */
558
559static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
560{
561 struct gfs2_sbd *sdp = rgd->rd_sbd;
562 int ret = 0;
563
564 spin_lock(&sdp->sd_rindex_spin);
565 if (rgd->rd_free_clone >= al->al_requested) {
566 al->al_rgd = rgd;
567 ret = 1;
568 }
569 spin_unlock(&sdp->sd_rindex_spin);
570
571 return ret;
572}
573
574/**
575 * recent_rgrp_first - get first RG from "recent" list
576 * @sdp: The GFS2 superblock
577 * @rglast: address of the rgrp used last
578 *
579 * Returns: The first rgrp in the recent list
580 */
581
582static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
583 uint64_t rglast)
584{
585 struct gfs2_rgrpd *rgd = NULL;
586
587 spin_lock(&sdp->sd_rindex_spin);
588
589 if (list_empty(&sdp->sd_rindex_recent_list))
590 goto out;
591
592 if (!rglast)
593 goto first;
594
595 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
596 if (rgd->rd_ri.ri_addr == rglast)
597 goto out;
598 }
599
600 first:
601 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
602 rd_recent);
603
604 out:
605 spin_unlock(&sdp->sd_rindex_spin);
606
607 return rgd;
608}
609
610/**
611 * recent_rgrp_next - get next RG from "recent" list
612 * @cur_rgd: current rgrp
613 * @remove: if true, remove @cur_rgd from the recent list
614 *
615 * Returns: The next rgrp in the recent list
616 */
617
618static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
619 int remove)
620{
621 struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
622 struct list_head *head;
623 struct gfs2_rgrpd *rgd;
624
625 spin_lock(&sdp->sd_rindex_spin);
626
627 head = &sdp->sd_rindex_recent_list;
628
629 list_for_each_entry(rgd, head, rd_recent) {
630 if (rgd == cur_rgd) {
631 if (cur_rgd->rd_recent.next != head)
632 rgd = list_entry(cur_rgd->rd_recent.next,
633 struct gfs2_rgrpd, rd_recent);
634 else
635 rgd = NULL;
636
637 if (remove)
638 list_del(&cur_rgd->rd_recent);
639
640 goto out;
641 }
642 }
643
644 rgd = NULL;
645 if (!list_empty(head))
646 rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
647
648 out:
649 spin_unlock(&sdp->sd_rindex_spin);
650
651 return rgd;
652}
653
654/**
655 * recent_rgrp_add - add an RG to tail of "recent" list
656 * @new_rgd: The rgrp to add
657 *
658 */
659
660static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
661{
662 struct gfs2_sbd *sdp = new_rgd->rd_sbd;
663 struct gfs2_rgrpd *rgd;
664 unsigned int count = 0;
665 unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
666
667 spin_lock(&sdp->sd_rindex_spin);
668
669 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
670 if (rgd == new_rgd)
671 goto out;
672
673 if (++count >= max)
674 goto out;
675 }
676 list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
677
678 out:
679 spin_unlock(&sdp->sd_rindex_spin);
680}
681
682/**
683 * forward_rgrp_get - get an rgrp to try next from full list
684 * @sdp: The GFS2 superblock
685 *
686 * Returns: The rgrp to try next
687 */
688
689static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
690{
691 struct gfs2_rgrpd *rgd;
692 unsigned int journals = gfs2_jindex_size(sdp);
693 unsigned int rg = 0, x;
694
695 spin_lock(&sdp->sd_rindex_spin);
696
697 rgd = sdp->sd_rindex_forward;
698 if (!rgd) {
699 if (sdp->sd_rgrps >= journals)
700 rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
701
702 for (x = 0, rgd = gfs2_rgrpd_get_first(sdp);
703 x < rg;
704 x++, rgd = gfs2_rgrpd_get_next(rgd))
705 /* Do Nothing */;
706
707 sdp->sd_rindex_forward = rgd;
708 }
709
710 spin_unlock(&sdp->sd_rindex_spin);
711
712 return rgd;
713}
714
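The starting point sd_rgrps * jd_jid / journals spreads each node's first-choice resource group evenly across the filesystem, so nodes with different journal ids tend not to contend for the same rgrps. The arithmetic for a hypothetical 100-rgrp, four-journal filesystem:

#include <stdio.h>

int main(void)
{
	unsigned int rgrps = 100, journals = 4, jid;

	for (jid = 0; jid < journals; jid++)
		printf("jid %u starts at rgrp %u\n",
		       jid, rgrps * jid / journals);	/* 0, 25, 50, 75 */
	return 0;
}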
715/**
716 * forward_rgrp_set - set the forward rgrp pointer
717 * @sdp: the filesystem
718 * @rgd: The new forward rgrp
719 *
720 */
721
722static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
723{
724 spin_lock(&sdp->sd_rindex_spin);
725 sdp->sd_rindex_forward = rgd;
726 spin_unlock(&sdp->sd_rindex_spin);
727}
728
729/**
730 * get_local_rgrp - Choose and lock a rgrp for allocation
731 * @ip: the inode to reserve space for
732 * (the chosen rgrp is recorded in the inode's struct gfs2_alloc)
733 *
734 * Try to acquire an rgrp in a way which avoids contending with others.
735 *
736 * Returns: errno
737 */
738
739static int get_local_rgrp(struct gfs2_inode *ip)
740{
741 struct gfs2_sbd *sdp = ip->i_sbd;
742 struct gfs2_rgrpd *rgd, *begin = NULL;
743 struct gfs2_alloc *al = &ip->i_alloc;
744 int flags = LM_FLAG_TRY;
745 int skipped = 0;
746 int loops = 0;
747 int error;
748
749 /* Try recently successful rgrps */
750
751 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
752
753 while (rgd) {
754 error = gfs2_glock_nq_init(rgd->rd_gl,
755 LM_ST_EXCLUSIVE, LM_FLAG_TRY,
756 &al->al_rgd_gh);
757 switch (error) {
758 case 0:
759 if (try_rgrp_fit(rgd, al))
760 goto out;
761 gfs2_glock_dq_uninit(&al->al_rgd_gh);
762 rgd = recent_rgrp_next(rgd, 1);
763 break;
764
765 case GLR_TRYFAILED:
766 rgd = recent_rgrp_next(rgd, 0);
767 break;
768
769 default:
770 return error;
771 }
772 }
773
774 /* Go through full list of rgrps */
775
776 begin = rgd = forward_rgrp_get(sdp);
777
778 for (;;) {
779 error = gfs2_glock_nq_init(rgd->rd_gl,
780 LM_ST_EXCLUSIVE, flags,
781 &al->al_rgd_gh);
782 switch (error) {
783 case 0:
784 if (try_rgrp_fit(rgd, al))
785 goto out;
786 gfs2_glock_dq_uninit(&al->al_rgd_gh);
787 break;
788
789 case GLR_TRYFAILED:
790 skipped++;
791 break;
792
793 default:
794 return error;
795 }
796
797 rgd = gfs2_rgrpd_get_next(rgd);
798 if (!rgd)
799 rgd = gfs2_rgrpd_get_first(sdp);
800
801 if (rgd == begin) {
802 if (++loops >= 2 || !skipped)
803 return -ENOSPC;
804 flags = 0;
805 }
806 }
807
808 out:
809 ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
810
811 if (begin) {
812 recent_rgrp_add(rgd);
813 rgd = gfs2_rgrpd_get_next(rgd);
814 if (!rgd)
815 rgd = gfs2_rgrpd_get_first(sdp);
816 forward_rgrp_set(sdp, rgd);
817 }
818
819 return 0;
820}
821
822/**
823 * gfs2_inplace_reserve_i - Reserve space in the filesystem
824 * @ip: the inode to reserve space for
825 *
826 * Returns: errno
827 */
828
829int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
830{
831 struct gfs2_sbd *sdp = ip->i_sbd;
832 struct gfs2_alloc *al = &ip->i_alloc;
833 int error;
834
835 if (gfs2_assert_warn(sdp, al->al_requested))
836 return -EINVAL;
837
838 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
839 if (error)
840 return error;
841
842 error = get_local_rgrp(ip);
843 if (error) {
844 gfs2_glock_dq_uninit(&al->al_ri_gh);
845 return error;
846 }
847
848 al->al_file = file;
849 al->al_line = line;
850
851 return 0;
852}
853
854/**
855 * gfs2_inplace_release - release an inplace reservation
856 * @ip: the inode the reservation was taken out on
857 *
858 * Release a reservation made by gfs2_inplace_reserve().
859 */
860
861void gfs2_inplace_release(struct gfs2_inode *ip)
862{
863 struct gfs2_sbd *sdp = ip->i_sbd;
864 struct gfs2_alloc *al = &ip->i_alloc;
865
866 if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
867 fs_warn(sdp, "al_alloced = %u, al_requested = %u "
868 "al_file = %s, al_line = %u\n",
869 al->al_alloced, al->al_requested, al->al_file,
870 al->al_line);
871
872 al->al_rgd = NULL;
873 gfs2_glock_dq_uninit(&al->al_rgd_gh);
874 gfs2_glock_dq_uninit(&al->al_ri_gh);
875}
876
877/**
878 * gfs2_get_block_type - Get the type of a block in a RG
879 * @rgd: the resource group holding the block
880 * @block: the block number
881 *
882 * Returns: The block type (GFS2_BLKST_*)
883 */
884
885unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block)
886{
887 struct gfs2_bitmap *bi = NULL;
888 uint32_t length, rgrp_block, buf_block;
889 unsigned int buf;
890 unsigned char type;
891
892 length = rgd->rd_ri.ri_length;
893 rgrp_block = block - rgd->rd_ri.ri_data0;
894
895 for (buf = 0; buf < length; buf++) {
896 bi = rgd->rd_bits + buf;
897 if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
898 break;
899 }
900
901 gfs2_assert(rgd->rd_sbd, buf < length);
902 buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
903
904 type = gfs2_testbit(rgd,
905 bi->bi_bh->b_data + bi->bi_offset,
906 bi->bi_len, buf_block);
907
908 return type;
909}
910
911/**
912 * rgblk_search - find a block in @old_state, change allocation
913 * state to @new_state
914 * @rgd: the resource group descriptor
915 * @goal: the goal block within the RG (start here to search for avail block)
916 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
917 * @new_state: GFS2_BLKST_XXX the after-allocation block state
918 *
919 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
920 * Add the found bitmap buffer to the transaction.
921 * Set the found bits to @new_state to change block's allocation state.
922 *
923 * This function never fails, because we wouldn't call it unless we
924 * know (from reservation results, etc.) that a block is available.
925 *
926 * Scope of @goal and returned block is just within rgrp, not the whole
927 * filesystem.
928 *
929 * Returns: the block number allocated
930 */
931
932static uint32_t rgblk_search(struct gfs2_rgrpd *rgd, uint32_t goal,
933 unsigned char old_state, unsigned char new_state)
934{
935 struct gfs2_bitmap *bi = NULL;
936 uint32_t length = rgd->rd_ri.ri_length;
937 uint32_t blk = 0;
938 unsigned int buf, x;
939
940 /* Find bitmap block that contains bits for goal block */
941 for (buf = 0; buf < length; buf++) {
942 bi = rgd->rd_bits + buf;
943 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
944 break;
945 }
946
947 gfs2_assert(rgd->rd_sbd, buf < length);
948
949 /* Convert scope of "goal" from rgrp-wide to within found bit block */
950 goal -= bi->bi_start * GFS2_NBBY;
951
952 /* Search (up to entire) bitmap in this rgrp for allocatable block.
953 "x <= length", instead of "x < length", because we typically start
954 the search in the middle of a bit block, but if we can't find an
955 * allocatable block anywhere else, we want to be able to wrap around and
956 search in the first part of our first-searched bit block. */
957 for (x = 0; x <= length; x++) {
958 if (bi->bi_clone)
959 blk = gfs2_bitfit(rgd,
960 bi->bi_clone + bi->bi_offset,
961 bi->bi_len, goal, old_state);
962 else
963 blk = gfs2_bitfit(rgd,
964 bi->bi_bh->b_data + bi->bi_offset,
965 bi->bi_len, goal, old_state);
966 if (blk != BFITNOENT)
967 break;
968
969 /* Try next bitmap block (wrap back to rgrp header if at end) */
970 buf = (buf + 1) % length;
971 bi = rgd->rd_bits + buf;
972 goal = 0;
973 }
974
975 if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
976 blk = 0;
977
978 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
979 gfs2_setbit(rgd,
980 bi->bi_bh->b_data + bi->bi_offset,
981 bi->bi_len, blk, new_state);
982 if (bi->bi_clone)
983 gfs2_setbit(rgd,
984 bi->bi_clone + bi->bi_offset,
985 bi->bi_len, blk, new_state);
986
987 return bi->bi_start * GFS2_NBBY + blk;
988}
989
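Each block's allocation state is two bits, so one bitmap byte covers GFS2_NBBY (four) blocks, and the scan starts from the goal block rather than from zero. A standalone sketch of what a gfs2_bitfit()-style scan does; the wrap-around to other bitmap blocks is handled by rgblk_search()'s outer loop and is omitted here:

#include <stdio.h>

#define NBBY		4		/* block states per byte */
#define BFITNOENT	0xffffffffu

static unsigned int bitfit(const unsigned char *buf, unsigned int len,
			   unsigned int goal, unsigned char state)
{
	unsigned int blk;

	for (blk = goal; blk < len * NBBY; blk++) {
		unsigned int bit = (blk % NBBY) * 2;

		if (((buf[blk / NBBY] >> bit) & 0x3) == state)
			return blk;
	}
	return BFITNOENT;
}

int main(void)
{
	/* States, lowest bits first: 1,1,0,1 then 1,1,1,1. */
	unsigned char bitmap[2] = { 0x45, 0x55 };

	/* First free (state 0) block at or after block 1. */
	printf("free at block %u\n", bitfit(bitmap, 2, 1, 0));	/* 2 */
	return 0;
}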
990/**
991 * rgblk_free - Change alloc state of given block(s)
992 * @sdp: the filesystem
993 * @bstart: the start of a run of blocks to free
994 * @blen: the length of the block run (all must lie within ONE RG!)
995 * @new_state: GFS2_BLKST_XXX the after-allocation block state
996 *
997 * Returns: Resource group containing the block(s)
998 */
999
1000static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, uint64_t bstart,
1001 uint32_t blen, unsigned char new_state)
1002{
1003 struct gfs2_rgrpd *rgd;
1004 struct gfs2_bitmap *bi = NULL;
1005 uint32_t length, rgrp_blk, buf_blk;
1006 unsigned int buf;
1007
1008 rgd = gfs2_blk2rgrpd(sdp, bstart);
1009 if (!rgd) {
1010 if (gfs2_consist(sdp))
1011 fs_err(sdp, "block = %llu\n", bstart);
1012 return NULL;
1013 }
1014
1015 length = rgd->rd_ri.ri_length;
1016
1017 rgrp_blk = bstart - rgd->rd_ri.ri_data0;
1018
1019 while (blen--) {
1020 for (buf = 0; buf < length; buf++) {
1021 bi = rgd->rd_bits + buf;
1022 if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
1023 break;
1024 }
1025
1026 gfs2_assert(rgd->rd_sbd, buf < length);
1027
1028 buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
1029 rgrp_blk++;
1030
1031 if (!bi->bi_clone) {
1032 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
1033 GFP_KERNEL | __GFP_NOFAIL);
1034 memcpy(bi->bi_clone + bi->bi_offset,
1035 bi->bi_bh->b_data + bi->bi_offset,
1036 bi->bi_len);
1037 }
1038 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1039 gfs2_setbit(rgd,
1040 bi->bi_bh->b_data + bi->bi_offset,
1041 bi->bi_len, buf_blk, new_state);
1042 }
1043
1044 return rgd;
1045}
1046
1047/**
1048 * gfs2_alloc_data - Allocate a data block
1049 * @ip: the inode to allocate the data block for
1050 *
1051 * Returns: the allocated block
1052 */
1053
1054uint64_t gfs2_alloc_data(struct gfs2_inode *ip)
1055{
1056 struct gfs2_sbd *sdp = ip->i_sbd;
1057 struct gfs2_alloc *al = &ip->i_alloc;
1058 struct gfs2_rgrpd *rgd = al->al_rgd;
1059 uint32_t goal, blk;
1060 uint64_t block;
1061
1062 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
1063 goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
1064 else
1065 goal = rgd->rd_last_alloc_data;
1066
1067 blk = rgblk_search(rgd, goal,
1068 GFS2_BLKST_FREE, GFS2_BLKST_USED);
1069 rgd->rd_last_alloc_data = blk;
1070
1071 block = rgd->rd_ri.ri_data0 + blk;
1072 ip->i_di.di_goal_data = block;
1073
1074 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1075 rgd->rd_rg.rg_free--;
1076
1077 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1078 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1079
1080 al->al_alloced++;
1081
1082 gfs2_statfs_change(sdp, 0, -1, 0);
1083 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1084
1085 spin_lock(&sdp->sd_rindex_spin);
1086 rgd->rd_free_clone--;
1087 spin_unlock(&sdp->sd_rindex_spin);
1088
1089 return block;
1090}
1091
1092/**
1093 * gfs2_alloc_meta - Allocate a metadata block
1094 * @ip: the inode to allocate the metadata block for
1095 *
1096 * Returns: the allocated block
1097 */
1098
1099uint64_t gfs2_alloc_meta(struct gfs2_inode *ip)
1100{
1101 struct gfs2_sbd *sdp = ip->i_sbd;
1102 struct gfs2_alloc *al = &ip->i_alloc;
1103 struct gfs2_rgrpd *rgd = al->al_rgd;
1104 uint32_t goal, blk;
1105 uint64_t block;
1106
1107 if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
1108 goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
1109 else
1110 goal = rgd->rd_last_alloc_meta;
1111
1112 blk = rgblk_search(rgd, goal,
1113 GFS2_BLKST_FREE, GFS2_BLKST_USED);
1114 rgd->rd_last_alloc_meta = blk;
1115
1116 block = rgd->rd_ri.ri_data0 + blk;
1117 ip->i_di.di_goal_meta = block;
1118
1119 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1120 rgd->rd_rg.rg_free--;
1121
1122 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1123 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1124
1125 al->al_alloced++;
1126
1127 gfs2_statfs_change(sdp, 0, -1, 0);
1128 gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
1129 gfs2_trans_add_unrevoke(sdp, block);
1130
1131 spin_lock(&sdp->sd_rindex_spin);
1132 rgd->rd_free_clone--;
1133 spin_unlock(&sdp->sd_rindex_spin);
1134
1135 return block;
1136}
1137
1138/**
1139 * gfs2_alloc_di - Allocate a dinode
1140 * @dip: the directory that the inode is going in
1141 *
1142 * Returns: the block allocated
1143 */
1144
1145uint64_t gfs2_alloc_di(struct gfs2_inode *dip)
1146{
1147 struct gfs2_sbd *sdp = dip->i_sbd;
1148 struct gfs2_alloc *al = &dip->i_alloc;
1149 struct gfs2_rgrpd *rgd = al->al_rgd;
1150 uint32_t blk;
1151 uint64_t block;
1152
1153 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
1154 GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
1155
1156 rgd->rd_last_alloc_meta = blk;
1157
1158 block = rgd->rd_ri.ri_data0 + blk;
1159
1160 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1161 rgd->rd_rg.rg_free--;
1162 rgd->rd_rg.rg_dinodes++;
1163
1164 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1165 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1166
1167 al->al_alloced++;
1168
1169 gfs2_statfs_change(sdp, 0, -1, +1);
1170 gfs2_trans_add_unrevoke(sdp, block);
1171
1172 spin_lock(&sdp->sd_rindex_spin);
1173 rgd->rd_free_clone--;
1174 spin_unlock(&sdp->sd_rindex_spin);
1175
1176 return block;
1177}
1178
1179/**
1180 * gfs2_free_data - free a contiguous run of data block(s)
1181 * @ip: the inode these blocks are being freed from
1182 * @bstart: first block of a run of contiguous blocks
1183 * @blen: the length of the block run
1184 *
1185 */
1186
1187void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1188{
1189 struct gfs2_sbd *sdp = ip->i_sbd;
1190 struct gfs2_rgrpd *rgd;
1191
1192 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1193 if (!rgd)
1194 return;
1195
1196 rgd->rd_rg.rg_free += blen;
1197
1198 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1199 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1200
1201 gfs2_trans_add_rg(rgd);
1202
1203 gfs2_statfs_change(sdp, 0, +blen, 0);
1204 gfs2_quota_change(ip, -(int64_t)blen,
1205 ip->i_di.di_uid, ip->i_di.di_gid);
1206}
1207
1208/**
1209 * gfs2_free_meta - free a contiguous run of metadata block(s)
1210 * @ip: the inode these blocks are being freed from
1211 * @bstart: first block of a run of contiguous blocks
1212 * @blen: the length of the block run
1213 *
1214 */
1215
1216void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen)
1217{
1218 struct gfs2_sbd *sdp = ip->i_sbd;
1219 struct gfs2_rgrpd *rgd;
1220
1221 rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
1222 if (!rgd)
1223 return;
1224
1225 rgd->rd_rg.rg_free += blen;
1226
1227 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1228 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1229
1230 gfs2_trans_add_rg(rgd);
1231
1232 gfs2_statfs_change(sdp, 0, +blen, 0);
1233 gfs2_quota_change(ip, -(int64_t)blen,
1234 ip->i_di.di_uid, ip->i_di.di_gid);
1235 gfs2_meta_wipe(ip, bstart, blen);
1236}
1237
1238void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno)
1239{
1240 struct gfs2_sbd *sdp = rgd->rd_sbd;
1241 struct gfs2_rgrpd *tmp_rgd;
1242
1243 tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
1244 if (!tmp_rgd)
1245 return;
1246 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1247
1248 if (!rgd->rd_rg.rg_dinodes)
1249 gfs2_consist_rgrpd(rgd);
1250 rgd->rd_rg.rg_dinodes--;
1251 rgd->rd_rg.rg_free++;
1252
1253 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1254 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1255
1256 gfs2_statfs_change(sdp, 0, +1, -1);
1257 gfs2_trans_add_rg(rgd);
1258}
1259
1260/**
1261 * gfs2_free_di - free a dinode block
1262 * @rgd: the resource group that contains the dinode
1263 * @ip: the inode
1264 *
1265 */
1266
1267void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1268{
1269 gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
1270 gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
1271 gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
1272}
1273
1274/**
1275 * gfs2_rlist_add - add a RG to a list of RGs
1276 * @sdp: the filesystem
1277 * @rlist: the list of resource groups
1278 * @block: the block
1279 *
1280 * Figure out what RG a block belongs to and add that RG to the list
1281 *
1282 * FIXME: Don't use NOFAIL
1283 *
1284 */
1285
1286void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1287 uint64_t block)
1288{
1289 struct gfs2_rgrpd *rgd;
1290 struct gfs2_rgrpd **tmp;
1291 unsigned int new_space;
1292 unsigned int x;
1293
1294 if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
1295 return;
1296
1297 rgd = gfs2_blk2rgrpd(sdp, block);
1298 if (!rgd) {
1299 if (gfs2_consist(sdp))
1300 fs_err(sdp, "block = %llu\n", block);
1301 return;
1302 }
1303
1304 for (x = 0; x < rlist->rl_rgrps; x++)
1305 if (rlist->rl_rgd[x] == rgd)
1306 return;
1307
1308 if (rlist->rl_rgrps == rlist->rl_space) {
1309 new_space = rlist->rl_space + 10;
1310
1311 tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
1312 GFP_KERNEL | __GFP_NOFAIL);
1313
1314 if (rlist->rl_rgd) {
1315 memcpy(tmp, rlist->rl_rgd,
1316 rlist->rl_space * sizeof(struct gfs2_rgrpd *));
1317 kfree(rlist->rl_rgd);
1318 }
1319
1320 rlist->rl_space = new_space;
1321 rlist->rl_rgd = tmp;
1322 }
1323
1324 rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
1325}
1326
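gfs2_rlist_add() above grows the pointer array in fixed steps of ten entries and skips duplicates, amortizing the reallocations over many added blocks. The same grow-in-chunks pattern in isolation:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ilist {
	unsigned int used, space;
	int *slots;
};

static int ilist_add(struct ilist *l, int v)
{
	unsigned int x;

	for (x = 0; x < l->used; x++)		/* duplicates are skipped */
		if (l->slots[x] == v)
			return 0;

	if (l->used == l->space) {		/* grow by a fixed step */
		int *tmp = calloc(l->space + 10, sizeof(*tmp));

		if (!tmp)
			return -1;
		memcpy(tmp, l->slots, l->space * sizeof(*tmp));
		free(l->slots);
		l->slots = tmp;
		l->space += 10;
	}
	l->slots[l->used++] = v;
	return 0;
}

int main(void)
{
	struct ilist l = { 0, 0, NULL };
	int i;

	for (i = 0; i < 25; i++)
		ilist_add(&l, i % 12);
	printf("%u used, %u allocated\n", l.used, l.space);	/* 12, 20 */
	free(l.slots);
	return 0;
}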
1327/**
1328 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
1329 * and initialize an array of glock holders for them
1330 * @rlist: the list of resource groups
1331 * @state: the lock state to acquire the RG lock in
1332 * @flags: the modifier flags for the holder structures
1333 *
1334 * FIXME: Don't use NOFAIL
1335 *
1336 */
1337
1338void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1339 int flags)
1340{
1341 unsigned int x;
1342
1343 rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
1344 GFP_KERNEL | __GFP_NOFAIL);
1345 for (x = 0; x < rlist->rl_rgrps; x++)
1346 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1347 state, flags,
1348 &rlist->rl_ghs[x]);
1349}
1350
1351/**
1352 * gfs2_rlist_free - free a resource group list
1353 * @list: the list of resource groups
1354 *
1355 */
1356
1357void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1358{
1359 unsigned int x;
1360
1361 kfree(rlist->rl_rgd);
1362
1363 if (rlist->rl_ghs) {
1364 for (x = 0; x < rlist->rl_rgrps; x++)
1365 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1366 kfree(rlist->rl_ghs);
1367 }
1368}
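/*
 * Illustrative sketch (not part of the patch): the three rlist helpers
 * above are used together.  A hypothetical caller that must lock every
 * resource group touched by a set of blocks might look roughly like:
 *
 *	struct gfs2_rgrp_list rlist;
 *	unsigned int x;
 *
 *	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 *	for (x = 0; x < nblocks; x++)
 *		gfs2_rlist_add(sdp, &rlist, blocks[x]);
 *	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
 *	...enqueue rlist.rl_ghs[0..rl_rgrps - 1], do the work, dequeue...
 *	gfs2_rlist_free(&rlist);
 *
 * "nblocks" and "blocks" are assumed names used for illustration only;
 * note that gfs2_rlist_alloc() only initializes the holders, it does
 * not acquire the glocks.
 */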
1369
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..4c44a191b1c1
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __RGRP_DOT_H__
11#define __RGRP_DOT_H__
12
13void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
14
15struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk);
16struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
17struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
18
19void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
20int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
21
22int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
23void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
24void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
25
26void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
27
28struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
29void gfs2_alloc_put(struct gfs2_inode *ip);
30
31int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
32 char *file, unsigned int line);
33#define gfs2_inplace_reserve(ip) \
34gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
35
36void gfs2_inplace_release(struct gfs2_inode *ip);
37
38unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block);
39
40uint64_t gfs2_alloc_data(struct gfs2_inode *ip);
41uint64_t gfs2_alloc_meta(struct gfs2_inode *ip);
42uint64_t gfs2_alloc_di(struct gfs2_inode *ip);
43
44void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
45void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen);
46void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno);
47void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
48
49struct gfs2_rgrp_list {
50 unsigned int rl_rgrps;
51 unsigned int rl_space;
52 struct gfs2_rgrpd **rl_rgd;
53 struct gfs2_holder *rl_ghs;
54};
55
56void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
57 uint64_t block);
58void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
59 int flags);
60void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
61
62#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..a4da649d086f
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,950 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "dir.h"
24#include "format.h"
25#include "glock.h"
26#include "glops.h"
27#include "inode.h"
28#include "log.h"
29#include "meta_io.h"
30#include "quota.h"
31#include "recovery.h"
32#include "rgrp.h"
33#include "super.h"
34#include "trans.h"
35#include "unlinked.h"
36#include "util.h"
37
38/**
39 * gfs2_tune_init - Fill a gfs2_tune structure with default values
40 * @gt: tune
41 *
42 */
43
44void gfs2_tune_init(struct gfs2_tune *gt)
45{
46 spin_lock_init(&gt->gt_spin);
47
48 gt->gt_ilimit = 100;
49 gt->gt_ilimit_tries = 3;
50 gt->gt_ilimit_min = 1;
51 gt->gt_demote_secs = 300;
52 gt->gt_incore_log_blocks = 1024;
53 gt->gt_log_flush_secs = 60;
54 gt->gt_jindex_refresh_secs = 60;
55 gt->gt_scand_secs = 15;
56 gt->gt_recoverd_secs = 60;
57 gt->gt_logd_secs = 1;
58 gt->gt_quotad_secs = 5;
59 gt->gt_inoded_secs = 15;
60 gt->gt_quota_simul_sync = 64;
61 gt->gt_quota_warn_period = 10;
62 gt->gt_quota_scale_num = 1;
63 gt->gt_quota_scale_den = 1;
64 gt->gt_quota_cache_secs = 300;
65 gt->gt_quota_quantum = 60;
66 gt->gt_atime_quantum = 3600;
67 gt->gt_new_files_jdata = 0;
68 gt->gt_new_files_directio = 0;
69 gt->gt_max_atomic_write = 4 << 20;
70 gt->gt_max_readahead = 1 << 18;
71 gt->gt_lockdump_size = 131072;
72 gt->gt_stall_secs = 600;
73 gt->gt_complain_secs = 10;
74 gt->gt_reclaim_limit = 5000;
75 gt->gt_entries_per_readdir = 32;
76 gt->gt_prefetch_secs = 10;
77 gt->gt_greedy_default = HZ / 10;
78 gt->gt_greedy_quantum = HZ / 40;
79 gt->gt_greedy_max = HZ / 4;
80 gt->gt_statfs_quantum = 30;
81 gt->gt_statfs_slow = 0;
82}
83
84/**
85 * gfs2_check_sb - Check superblock
86 * @sdp: the filesystem
87 * @sb: The superblock
88 * @silent: Don't print a message if the check fails
89 *
 90 * Checks that the version code of the FS is one that we understand how to
91 * read and that the sizes of the various on-disk structures have not
92 * changed.
93 */
94
95int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
96{
97 unsigned int x;
98
99 if (sb->sb_header.mh_magic != GFS2_MAGIC ||
100 sb->sb_header.mh_type != GFS2_METATYPE_SB) {
101 if (!silent)
102 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
103 return -EINVAL;
104 }
105
106 /* If format numbers match exactly, we're done. */
107
108 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
109 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
110 return 0;
111
112 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
113 for (x = 0; gfs2_old_fs_formats[x]; x++)
114 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
115 break;
116
117 if (!gfs2_old_fs_formats[x]) {
118 printk(KERN_WARNING
119 "GFS2: code version (%u, %u) is incompatible "
120 "with ondisk format (%u, %u)\n",
121 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
122 sb->sb_fs_format, sb->sb_multihost_format);
123 printk(KERN_WARNING
124 "GFS2: I don't know how to upgrade this FS\n");
125 return -EINVAL;
126 }
127 }
128
129 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
130 for (x = 0; gfs2_old_multihost_formats[x]; x++)
131 if (gfs2_old_multihost_formats[x] ==
132 sb->sb_multihost_format)
133 break;
134
135 if (!gfs2_old_multihost_formats[x]) {
136 printk(KERN_WARNING
137 "GFS2: code version (%u, %u) is incompatible "
138 "with ondisk format (%u, %u)\n",
139 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
140 sb->sb_fs_format, sb->sb_multihost_format);
141 printk(KERN_WARNING
142 "GFS2: I don't know how to upgrade this FS\n");
143 return -EINVAL;
144 }
145 }
146
147 if (!sdp->sd_args.ar_upgrade) {
148 printk(KERN_WARNING
149 "GFS2: code version (%u, %u) is incompatible "
150 "with ondisk format (%u, %u)\n",
151 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
152 sb->sb_fs_format, sb->sb_multihost_format);
153 printk(KERN_INFO
154 "GFS2: Use the \"upgrade\" mount option to upgrade "
155 "the FS\n");
156 printk(KERN_INFO "GFS2: See the manual for more details\n");
157 return -EINVAL;
158 }
159
160 return 0;
161}
162
163/**
164 * gfs2_read_sb - Read super block
165 * @sdp: The GFS2 superblock
166 * @gl: the glock for the superblock (assumed to be held)
167 * @silent: Don't print message if mount fails
168 *
169 */
170
171int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
172{
173 struct buffer_head *bh;
174 uint32_t hash_blocks, ind_blocks, leaf_blocks;
175 uint32_t tmp_blocks;
176 unsigned int x;
177 int error;
178
179 error = gfs2_meta_read(gl, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift,
180 DIO_FORCE | DIO_START | DIO_WAIT, &bh);
181 if (error) {
182 if (!silent)
183 fs_err(sdp, "can't read superblock\n");
184 return error;
185 }
186
187 gfs2_assert(sdp, sizeof(struct gfs2_sb) <= bh->b_size);
188 gfs2_sb_in(&sdp->sd_sb, bh->b_data);
189 brelse(bh);
190
191 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
192 if (error)
193 return error;
194
195 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
196 GFS2_BASIC_BLOCK_SHIFT;
197 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
198 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
199 sizeof(struct gfs2_dinode)) / sizeof(uint64_t);
200 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
201 sizeof(struct gfs2_meta_header)) / sizeof(uint64_t);
202 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
203 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
204 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
205 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t);
206 sdp->sd_ut_per_block = (sdp->sd_sb.sb_bsize -
207 sizeof(struct gfs2_meta_header)) /
208 sizeof(struct gfs2_unlinked_tag);
209 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
210 sizeof(struct gfs2_meta_header)) /
211 sizeof(struct gfs2_quota_change);
212
 213 /* Compute maximum reservation required to add an entry to a directory */
214
215 hash_blocks = DIV_ROUND_UP(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH),
216 sdp->sd_jbsize);
217
218 ind_blocks = 0;
219 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
220 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
221 ind_blocks += tmp_blocks;
222 }
223
224 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
225
226 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
227
228 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
229 sizeof(struct gfs2_dinode);
230 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
231 for (x = 2;; x++) {
232 uint64_t space, d;
233 uint32_t m;
234
235 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
236 d = space;
237 m = do_div(d, sdp->sd_inptrs);
238
239 if (d != sdp->sd_heightsize[x - 1] || m)
240 break;
241 sdp->sd_heightsize[x] = space;
242 }
243 sdp->sd_max_height = x;
244 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
245
246 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
247 sizeof(struct gfs2_dinode);
248 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
249 for (x = 2;; x++) {
250 uint64_t space, d;
251 uint32_t m;
252
253 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
254 d = space;
255 m = do_div(d, sdp->sd_inptrs);
256
257 if (d != sdp->sd_jheightsize[x - 1] || m)
258 break;
259 sdp->sd_jheightsize[x] = space;
260 }
261 sdp->sd_max_jheight = x;
262 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
263
264 return 0;
265}
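/*
 * Illustrative note: the two loops above grow sd_heightsize[] (and
 * sd_jheightsize[]) by a factor of sd_inptrs per level of metadata
 * indirection.  Multiplying, then dividing back with do_div() and
 * comparing against the original operand is a portable 64-bit overflow
 * test: the loop stops at the first height whose capacity no longer
 * fits in a uint64_t, so sd_max_height is the deepest usable tree.
 */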
266
267int gfs2_do_upgrade(struct gfs2_sbd *sdp, struct gfs2_glock *sb_gl)
268{
269 return 0;
270}
271
272/**
273 * gfs2_jindex_hold - Grab a lock on the jindex
274 * @sdp: The GFS2 superblock
275 * @ji_gh: the holder for the jindex glock
276 *
277 * This is very similar to the gfs2_rindex_hold() function, except that
 278 * in general we hold the jindex lock for longer periods of time and
 279 * grab it far less frequently than the rgrp lock.
280 *
281 * Returns: errno
282 */
283
284int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
285{
286 struct gfs2_inode *dip = sdp->sd_jindex->u.generic_ip;
287 struct qstr name;
288 char buf[20];
289 struct gfs2_jdesc *jd;
290 int error;
291
292 name.name = buf;
293
294 mutex_lock(&sdp->sd_jindex_mutex);
295
296 for (;;) {
297 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
298 GL_LOCAL_EXCL, ji_gh);
299 if (error)
300 break;
301
302 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
303 name.hash = gfs2_disk_hash(name.name, name.len);
304
305 error = gfs2_dir_search(sdp->sd_jindex,
306 &name, NULL, NULL);
307 if (error == -ENOENT) {
308 error = 0;
309 break;
310 }
311
312 gfs2_glock_dq_uninit(ji_gh);
313
314 if (error)
315 break;
316
317 error = -ENOMEM;
318 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
319 if (!jd)
320 break;
321
322 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
323 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
324 if (!jd->jd_inode)
325 error = -ENOENT;
326 else
327 error = PTR_ERR(jd->jd_inode);
328 kfree(jd);
329 break;
330 }
331
332 spin_lock(&sdp->sd_jindex_spin);
333 jd->jd_jid = sdp->sd_journals++;
334 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
335 spin_unlock(&sdp->sd_jindex_spin);
336 }
337
338 mutex_unlock(&sdp->sd_jindex_mutex);
339
340 return error;
341}
342
343/**
344 * gfs2_jindex_free - Clear all the journal index information
345 * @sdp: The GFS2 superblock
346 *
347 */
348
349void gfs2_jindex_free(struct gfs2_sbd *sdp)
350{
351 struct list_head list;
352 struct gfs2_jdesc *jd;
353
354 spin_lock(&sdp->sd_jindex_spin);
355 list_add(&list, &sdp->sd_jindex_list);
356 list_del_init(&sdp->sd_jindex_list);
357 sdp->sd_journals = 0;
358 spin_unlock(&sdp->sd_jindex_spin);
359
360 while (!list_empty(&list)) {
361 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
362 list_del(&jd->jd_list);
363 iput(jd->jd_inode);
364 kfree(jd);
365 }
366}
367
368static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
369{
370 struct gfs2_jdesc *jd;
371 int found = 0;
372
373 list_for_each_entry(jd, head, jd_list) {
374 if (jd->jd_jid == jid) {
375 found = 1;
376 break;
377 }
378 }
379
380 if (!found)
381 jd = NULL;
382
383 return jd;
384}
385
386struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
387{
388 struct gfs2_jdesc *jd;
389
390 spin_lock(&sdp->sd_jindex_spin);
391 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
392 spin_unlock(&sdp->sd_jindex_spin);
393
394 return jd;
395}
396
397void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
398{
399 struct gfs2_jdesc *jd;
400
401 spin_lock(&sdp->sd_jindex_spin);
402 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
403 if (jd)
404 jd->jd_dirty = 1;
405 spin_unlock(&sdp->sd_jindex_spin);
406}
407
408struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
409{
410 struct gfs2_jdesc *jd;
411 int found = 0;
412
413 spin_lock(&sdp->sd_jindex_spin);
414
415 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
416 if (jd->jd_dirty) {
417 jd->jd_dirty = 0;
418 found = 1;
419 break;
420 }
421 }
422 spin_unlock(&sdp->sd_jindex_spin);
423
424 if (!found)
425 jd = NULL;
426
427 return jd;
428}
429
430int gfs2_jdesc_check(struct gfs2_jdesc *jd)
431{
432 struct gfs2_inode *ip = jd->jd_inode->u.generic_ip;
433 struct gfs2_sbd *sdp = ip->i_sbd;
434 int ar;
435 int error;
436
437 if (ip->i_di.di_size < (8 << 20) ||
438 ip->i_di.di_size > (1 << 30) ||
439 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
440 gfs2_consist_inode(ip);
441 return -EIO;
442 }
443 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
444
445 error = gfs2_write_alloc_required(ip,
446 0, ip->i_di.di_size,
447 &ar);
448 if (!error && ar) {
449 gfs2_consist_inode(ip);
450 error = -EIO;
451 }
452
453 return error;
454}
455
456/**
457 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
458 * @sdp: the filesystem
459 *
460 * Returns: errno
461 */
462
463int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
464{
465 struct gfs2_inode *ip = sdp->sd_jdesc->jd_inode->u.generic_ip;
466 struct gfs2_glock *j_gl = ip->i_gl;
467 struct gfs2_holder t_gh;
468 struct gfs2_log_header head;
469 int error;
470
471 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
472 GL_LOCAL_EXCL | GL_NEVER_RECURSE, &t_gh);
473 if (error)
474 return error;
475
476 gfs2_meta_cache_flush(ip);
477 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
478
479 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
480 if (error)
481 goto fail;
482
483 if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
484 gfs2_consist(sdp);
485 error = -EIO;
486 goto fail;
487 }
488
 489 /* Initialize the head of the log from the on-disk log header */
490 sdp->sd_log_sequence = head.lh_sequence + 1;
491 gfs2_log_pointers_init(sdp, head.lh_blkno);
492
493 error = gfs2_unlinked_init(sdp);
494 if (error)
495 goto fail;
496 error = gfs2_quota_init(sdp);
497 if (error)
498 goto fail_unlinked;
499
500 set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
501
502 gfs2_glock_dq_uninit(&t_gh);
503
504 return 0;
505
506 fail_unlinked:
507 gfs2_unlinked_cleanup(sdp);
508
509 fail:
510 t_gh.gh_flags |= GL_NOCACHE;
511 gfs2_glock_dq_uninit(&t_gh);
512
513 return error;
514}
515
516/**
517 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
518 * @sdp: the filesystem
519 *
520 * Returns: errno
521 */
522
523int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
524{
525 struct gfs2_holder t_gh;
526 int error;
527
528 gfs2_unlinked_dealloc(sdp);
529 gfs2_quota_sync(sdp);
530 gfs2_statfs_sync(sdp);
531
532 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
533 GL_LOCAL_EXCL | GL_NEVER_RECURSE | GL_NOCACHE,
534 &t_gh);
535 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
536 return error;
537
538 gfs2_meta_syncfs(sdp);
539 gfs2_log_shutdown(sdp);
540
541 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
542
543 if (t_gh.gh_gl)
544 gfs2_glock_dq_uninit(&t_gh);
545
546 gfs2_unlinked_cleanup(sdp);
547 gfs2_quota_cleanup(sdp);
548
549 return error;
550}
551
552int gfs2_statfs_init(struct gfs2_sbd *sdp)
553{
554 struct gfs2_inode *m_ip = sdp->sd_statfs_inode->u.generic_ip;
555 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
556 struct gfs2_inode *l_ip = sdp->sd_sc_inode->u.generic_ip;
557 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
558 struct buffer_head *m_bh, *l_bh;
559 struct gfs2_holder gh;
560 int error;
561
562 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
563 &gh);
564 if (error)
565 return error;
566
567 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
568 if (error)
569 goto out;
570
571 if (sdp->sd_args.ar_spectator) {
572 spin_lock(&sdp->sd_statfs_spin);
573 gfs2_statfs_change_in(m_sc, m_bh->b_data +
574 sizeof(struct gfs2_dinode));
575 spin_unlock(&sdp->sd_statfs_spin);
576 } else {
577 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
578 if (error)
579 goto out_m_bh;
580
581 spin_lock(&sdp->sd_statfs_spin);
582 gfs2_statfs_change_in(m_sc, m_bh->b_data +
583 sizeof(struct gfs2_dinode));
584 gfs2_statfs_change_in(l_sc, l_bh->b_data +
585 sizeof(struct gfs2_dinode));
586 spin_unlock(&sdp->sd_statfs_spin);
587
588 brelse(l_bh);
589 }
590
591 out_m_bh:
592 brelse(m_bh);
593
594 out:
595 gfs2_glock_dq_uninit(&gh);
596
 597 return error;
598}
599
600void gfs2_statfs_change(struct gfs2_sbd *sdp, int64_t total, int64_t free,
601 int64_t dinodes)
602{
603 struct gfs2_inode *l_ip = sdp->sd_sc_inode->u.generic_ip;
604 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
605 struct buffer_head *l_bh;
606 int error;
607
608 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
609 if (error)
610 return;
611
612 mutex_lock(&sdp->sd_statfs_mutex);
613 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
614 mutex_unlock(&sdp->sd_statfs_mutex);
615
616 spin_lock(&sdp->sd_statfs_spin);
617 l_sc->sc_total += total;
618 l_sc->sc_free += free;
619 l_sc->sc_dinodes += dinodes;
620 gfs2_statfs_change_out(l_sc, l_bh->b_data +
621 sizeof(struct gfs2_dinode));
622 spin_unlock(&sdp->sd_statfs_spin);
623
624 brelse(l_bh);
625}
626
627int gfs2_statfs_sync(struct gfs2_sbd *sdp)
628{
629 struct gfs2_inode *m_ip = sdp->sd_statfs_inode->u.generic_ip;
630 struct gfs2_inode *l_ip = sdp->sd_sc_inode->u.generic_ip;
631 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
632 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
633 struct gfs2_holder gh;
634 struct buffer_head *m_bh, *l_bh;
635 int error;
636
637 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
638 &gh);
639 if (error)
640 return error;
641
642 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
643 if (error)
644 goto out;
645
646 spin_lock(&sdp->sd_statfs_spin);
647 gfs2_statfs_change_in(m_sc, m_bh->b_data +
648 sizeof(struct gfs2_dinode));
649 if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
650 spin_unlock(&sdp->sd_statfs_spin);
651 goto out_bh;
652 }
653 spin_unlock(&sdp->sd_statfs_spin);
654
655 error = gfs2_meta_inode_buffer(l_ip, &l_bh);
656 if (error)
657 goto out_bh;
658
659 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
660 if (error)
661 goto out_bh2;
662
663 mutex_lock(&sdp->sd_statfs_mutex);
664 gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
665 mutex_unlock(&sdp->sd_statfs_mutex);
666
667 spin_lock(&sdp->sd_statfs_spin);
668 m_sc->sc_total += l_sc->sc_total;
669 m_sc->sc_free += l_sc->sc_free;
670 m_sc->sc_dinodes += l_sc->sc_dinodes;
671 memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
672 memset(l_bh->b_data + sizeof(struct gfs2_dinode),
673 0, sizeof(struct gfs2_statfs_change));
674 spin_unlock(&sdp->sd_statfs_spin);
675
676 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
677 gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
678
679 gfs2_trans_end(sdp);
680
681 out_bh2:
682 brelse(l_bh);
683
684 out_bh:
685 brelse(m_bh);
686
687 out:
688 gfs2_glock_dq_uninit(&gh);
689
690 return error;
691}
692
693/**
694 * gfs2_statfs_i - Do a statfs
695 * @sdp: the filesystem
 696 * @sc: the statfs_change structure
697 *
698 * Returns: errno
699 */
700
701int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
702{
703 struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
704 struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
705
706 spin_lock(&sdp->sd_statfs_spin);
707
708 *sc = *m_sc;
709 sc->sc_total += l_sc->sc_total;
710 sc->sc_free += l_sc->sc_free;
711 sc->sc_dinodes += l_sc->sc_dinodes;
712
713 spin_unlock(&sdp->sd_statfs_spin);
714
715 if (sc->sc_free < 0)
716 sc->sc_free = 0;
717 if (sc->sc_free > sc->sc_total)
718 sc->sc_free = sc->sc_total;
719 if (sc->sc_dinodes < 0)
720 sc->sc_dinodes = 0;
721
722 return 0;
723}
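/*
 * Note on the scheme above (illustrative summary, not part of the
 * patch): gfs2_statfs_change() only touches this node's "local"
 * statfs_change file, so ordinary allocation and deallocation never
 * need a cluster-wide lock just to update the free-space counters.
 * gfs2_statfs_sync() periodically folds the local deltas into the
 * shared "master" file under an exclusive glock, and gfs2_statfs_i()
 * reports master plus local, which is why its result is clamped and
 * only approximately current.
 */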
724
725/**
 726 * statfs_slow_fill - fill in the sc for a given RG
 727 * @rgd: the RG
 728 * @sc: the statfs_change structure
 729 *
 730 * Returns: 0 on success
731 */
732
733static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
734 struct gfs2_statfs_change *sc)
735{
736 gfs2_rgrp_verify(rgd);
737 sc->sc_total += rgd->rd_ri.ri_data;
738 sc->sc_free += rgd->rd_rg.rg_free;
739 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
740 return 0;
741}
742
743/**
744 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
745 * @sdp: the filesystem
746 * @sc: the sc info that will be returned
747 *
748 * Any error (other than a signal) will cause this routine to fall back
749 * to the synchronous version.
750 *
751 * FIXME: This really shouldn't busy wait like this.
752 *
753 * Returns: errno
754 */
755
756int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
757{
758 struct gfs2_holder ri_gh;
759 struct gfs2_rgrpd *rgd_next;
760 struct gfs2_holder *gha, *gh;
761 unsigned int slots = 64;
762 unsigned int x;
763 int done;
764 int error = 0, err;
765
766 memset(sc, 0, sizeof(struct gfs2_statfs_change));
767 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
768 if (!gha)
769 return -ENOMEM;
770
771 error = gfs2_rindex_hold(sdp, &ri_gh);
772 if (error)
773 goto out;
774
775 rgd_next = gfs2_rgrpd_get_first(sdp);
776
777 for (;;) {
778 done = 1;
779
780 for (x = 0; x < slots; x++) {
781 gh = gha + x;
782
783 if (gh->gh_gl && gfs2_glock_poll(gh)) {
784 err = gfs2_glock_wait(gh);
785 if (err) {
786 gfs2_holder_uninit(gh);
787 error = err;
788 } else {
789 if (!error)
790 error = statfs_slow_fill(
791 gh->gh_gl->gl_object, sc);
792 gfs2_glock_dq_uninit(gh);
793 }
794 }
795
796 if (gh->gh_gl)
797 done = 0;
798 else if (rgd_next && !error) {
799 error = gfs2_glock_nq_init(rgd_next->rd_gl,
800 LM_ST_SHARED,
801 GL_ASYNC,
802 gh);
803 rgd_next = gfs2_rgrpd_get_next(rgd_next);
804 done = 0;
805 }
806
807 if (signal_pending(current))
808 error = -ERESTARTSYS;
809 }
810
811 if (done)
812 break;
813
814 yield();
815 }
816
817 gfs2_glock_dq_uninit(&ri_gh);
818
819 out:
820 kfree(gha);
821
822 return error;
823}
824
825struct lfcc {
826 struct list_head list;
827 struct gfs2_holder gh;
828};
829
830/**
831 * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
832 * journals are clean
833 * @sdp: the file system
835 * @t_gh: the hold on the transaction lock
836 *
837 * Returns: errno
838 */
839
840int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct gfs2_holder *t_gh)
841{
842 struct gfs2_inode *ip;
843 struct gfs2_holder ji_gh;
844 struct gfs2_jdesc *jd;
845 struct lfcc *lfcc;
846 LIST_HEAD(list);
847 struct gfs2_log_header lh;
848 int error;
849
850 error = gfs2_jindex_hold(sdp, &ji_gh);
851 if (error)
852 return error;
853
854 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
855 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
856 if (!lfcc) {
857 error = -ENOMEM;
858 goto out;
859 }
860 ip = jd->jd_inode->u.generic_ip;
861 error = gfs2_glock_nq_init(ip->i_gl,
862 LM_ST_SHARED, 0,
863 &lfcc->gh);
864 if (error) {
865 kfree(lfcc);
866 goto out;
867 }
868 list_add(&lfcc->list, &list);
869 }
870
871 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
872 LM_FLAG_PRIORITY | GL_NEVER_RECURSE | GL_NOCACHE,
873 t_gh);
874
875 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
876 error = gfs2_jdesc_check(jd);
877 if (error)
878 break;
879 error = gfs2_find_jhead(jd, &lh);
880 if (error)
881 break;
882 if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
883 error = -EBUSY;
884 break;
885 }
886 }
887
888 if (error)
889 gfs2_glock_dq_uninit(t_gh);
890
891 out:
892 while (!list_empty(&list)) {
893 lfcc = list_entry(list.next, struct lfcc, list);
894 list_del(&lfcc->list);
895 gfs2_glock_dq_uninit(&lfcc->gh);
896 kfree(lfcc);
897 }
898 gfs2_glock_dq_uninit(&ji_gh);
899
900 return error;
901}
902
903/**
904 * gfs2_freeze_fs - freezes the file system
905 * @sdp: the file system
906 *
 907 * This function flushes data and metadata for all machines by
 908 * acquiring the transaction lock exclusively. All journals are
909 * ensured to be in a clean state as well.
910 *
911 * Returns: errno
912 */
913
914int gfs2_freeze_fs(struct gfs2_sbd *sdp)
915{
916 int error = 0;
917
918 mutex_lock(&sdp->sd_freeze_lock);
919
920 if (!sdp->sd_freeze_count++) {
921 error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
922 if (error)
923 sdp->sd_freeze_count--;
924 }
925
926 mutex_unlock(&sdp->sd_freeze_lock);
927
928 return error;
929}
930
931/**
932 * gfs2_unfreeze_fs - unfreezes the file system
933 * @sdp: the file system
934 *
935 * This function allows the file system to proceed by unlocking
936 * the exclusively held transaction lock. Other GFS2 nodes are
937 * now free to acquire the lock shared and go on with their lives.
938 *
939 */
940
941void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
942{
943 mutex_lock(&sdp->sd_freeze_lock);
944
945 if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
946 gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
947
948 mutex_unlock(&sdp->sd_freeze_lock);
949}
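/*
 * Illustrative usage (hypothetical caller): freezes nest by way of
 * sd_freeze_count, so a critical section can be bracketed like this:
 *
 *	int error = gfs2_freeze_fs(sdp);
 *	if (error)
 *		return error;
 *	...fs is quiesced and all journals have been verified clean...
 *	gfs2_unfreeze_fs(sdp);
 *
 * Only the first freeze acquires the transaction glock and only the
 * last unfreeze releases it.
 */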
950
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..6abb7b5c8828
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__
12
13void gfs2_tune_init(struct gfs2_tune *gt);
14
15int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
16int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
17int gfs2_do_upgrade(struct gfs2_sbd *sdp, struct gfs2_glock *gl_sb);
18
19static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
20{
21 unsigned int x;
22 spin_lock(&sdp->sd_jindex_spin);
23 x = sdp->sd_journals;
24 spin_unlock(&sdp->sd_jindex_spin);
25 return x;
26}
27
28int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
29void gfs2_jindex_free(struct gfs2_sbd *sdp);
30
31struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
32void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
33struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
34int gfs2_jdesc_check(struct gfs2_jdesc *jd);
35
36int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
37 struct gfs2_inode **ipp);
38
39int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
40int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
41
42int gfs2_statfs_init(struct gfs2_sbd *sdp);
43void gfs2_statfs_change(struct gfs2_sbd *sdp,
44 int64_t total, int64_t free, int64_t dinodes);
45int gfs2_statfs_sync(struct gfs2_sbd *sdp);
46int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
47int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
48
49int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct gfs2_holder *t_gh);
50int gfs2_freeze_fs(struct gfs2_sbd *sdp);
51void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
52
53#endif /* __SUPER_DOT_H__ */
54
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..f05ba8f69132
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,582 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/module.h>
16#include <linux/kobject.h>
17#include <linux/gfs2_ondisk.h>
18#include <asm/semaphore.h>
19#include <asm/uaccess.h>
20
21#include "gfs2.h"
22#include "lm_interface.h"
23#include "incore.h"
24#include "lm.h"
25#include "sys.h"
26#include "super.h"
27#include "glock.h"
28#include "quota.h"
29#include "util.h"
30
31char *gfs2_sys_margs;
32spinlock_t gfs2_sys_margs_lock;
33
34static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
35{
36 return sprintf(buf, "%s\n", sdp->sd_vfs->s_id);
37}
38
39static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
40{
41 return sprintf(buf, "%s\n", sdp->sd_fsname);
42}
43
44static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
45{
46 unsigned int count;
47
48 mutex_lock(&sdp->sd_freeze_lock);
49 count = sdp->sd_freeze_count;
50 mutex_unlock(&sdp->sd_freeze_lock);
51
52 return sprintf(buf, "%u\n", count);
53}
54
55static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
56{
57 ssize_t ret = len;
58 int error = 0;
59 int n = simple_strtol(buf, NULL, 0);
60
61 if (!capable(CAP_SYS_ADMIN))
62 return -EACCES;
63
64 switch (n) {
65 case 0:
66 gfs2_unfreeze_fs(sdp);
67 break;
68 case 1:
69 error = gfs2_freeze_fs(sdp);
70 break;
71 default:
72 ret = -EINVAL;
73 }
74
75 if (error)
 76 fs_warn(sdp, "freeze %d error %d\n", n, error);
77
78 return ret;
79}
80
81static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
82{
83 unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
84 return sprintf(buf, "%u\n", b);
85}
86
87static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
88{
89 if (!capable(CAP_SYS_ADMIN))
90 return -EACCES;
91
92 if (simple_strtol(buf, NULL, 0) != 1)
93 return -EINVAL;
94
95 gfs2_lm_withdraw(sdp,
96 "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
97 sdp->sd_fsname);
98 return len;
99}
100
101static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
102 size_t len)
103{
104 if (!capable(CAP_SYS_ADMIN))
105 return -EACCES;
106
107 if (simple_strtol(buf, NULL, 0) != 1)
108 return -EINVAL;
109
110 gfs2_statfs_sync(sdp);
111 return len;
112}
113
114static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
115{
116 if (!capable(CAP_SYS_ADMIN))
117 return -EACCES;
118
119 if (simple_strtol(buf, NULL, 0) != 1)
120 return -EINVAL;
121
122 gfs2_gl_hash_clear(sdp, NO_WAIT);
123 return len;
124}
125
126static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
127 size_t len)
128{
129 if (!capable(CAP_SYS_ADMIN))
130 return -EACCES;
131
132 if (simple_strtol(buf, NULL, 0) != 1)
133 return -EINVAL;
134
135 gfs2_quota_sync(sdp);
136 return len;
137}
138
139static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
140 size_t len)
141{
142 uint32_t id;
143
144 if (!capable(CAP_SYS_ADMIN))
145 return -EACCES;
146
147 id = simple_strtoul(buf, NULL, 0);
148
149 gfs2_quota_refresh(sdp, 1, id);
150 return len;
151}
152
153static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
154 size_t len)
155{
156 uint32_t id;
157
158 if (!capable(CAP_SYS_ADMIN))
159 return -EACCES;
160
161 id = simple_strtoul(buf, NULL, 0);
162
163 gfs2_quota_refresh(sdp, 0, id);
164 return len;
165}
166
167struct gfs2_attr {
168 struct attribute attr;
169 ssize_t (*show)(struct gfs2_sbd *, char *);
170 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
171};
172
173#define GFS2_ATTR(name, mode, show, store) \
174static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
175
176GFS2_ATTR(id, 0444, id_show, NULL);
177GFS2_ATTR(fsname, 0444, fsname_show, NULL);
178GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
179GFS2_ATTR(shrink, 0200, NULL, shrink_store);
180GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
181GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
182GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
183GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
184GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
185
186static struct attribute *gfs2_attrs[] = {
187 &gfs2_attr_id.attr,
188 &gfs2_attr_fsname.attr,
189 &gfs2_attr_freeze.attr,
190 &gfs2_attr_shrink.attr,
191 &gfs2_attr_withdraw.attr,
192 &gfs2_attr_statfs_sync.attr,
193 &gfs2_attr_quota_sync.attr,
194 &gfs2_attr_quota_refresh_user.attr,
195 &gfs2_attr_quota_refresh_group.attr,
196 NULL,
197};
198
199static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
200 char *buf)
201{
202 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
203 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
204 return a->show ? a->show(sdp, buf) : 0;
205}
206
207static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
208 const char *buf, size_t len)
209{
210 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
211 struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
212 return a->store ? a->store(sdp, buf, len) : len;
213}
214
215static struct sysfs_ops gfs2_attr_ops = {
216 .show = gfs2_attr_show,
217 .store = gfs2_attr_store,
218};
219
220static struct kobj_type gfs2_ktype = {
221 .default_attrs = gfs2_attrs,
222 .sysfs_ops = &gfs2_attr_ops,
223};
224
225static struct kset gfs2_kset = {
226 .subsys = &fs_subsys,
227 .kobj = {.name = "gfs2",},
228 .ktype = &gfs2_ktype,
229};
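/*
 * With the kset above, each mount registers a kobject named after its
 * lock table, so a filesystem should appear under sysfs roughly as
 * /sys/fs/gfs2/<table_name>/ containing the attributes defined earlier
 * (id, fsname, freeze, withdraw, ...) plus the lockstruct/, counters/,
 * args/ and tune/ groups created in gfs2_sys_fs_add() below.  (The
 * path is inferred from fs_subsys and the "gfs2" kset name, and is
 * shown here only for illustration.)
 */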
230
231/*
232 * display struct lm_lockstruct fields
233 */
234
235struct lockstruct_attr {
236 struct attribute attr;
237 ssize_t (*show)(struct gfs2_sbd *, char *);
238};
239
240#define LOCKSTRUCT_ATTR(name, fmt) \
241static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
242{ \
243 return sprintf(buf, fmt, sdp->sd_lockstruct.ls_##name); \
244} \
245static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
246
247LOCKSTRUCT_ATTR(jid, "%u\n");
248LOCKSTRUCT_ATTR(first, "%u\n");
249LOCKSTRUCT_ATTR(lvb_size, "%u\n");
250LOCKSTRUCT_ATTR(flags, "%d\n");
251
252static struct attribute *lockstruct_attrs[] = {
253 &lockstruct_attr_jid.attr,
254 &lockstruct_attr_first.attr,
255 &lockstruct_attr_lvb_size.attr,
256 &lockstruct_attr_flags.attr,
257 NULL
258};
259
260/*
261 * display struct gfs2_args fields
262 */
263
264struct args_attr {
265 struct attribute attr;
266 ssize_t (*show)(struct gfs2_sbd *, char *);
267};
268
269#define ARGS_ATTR(name, fmt) \
270static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
271{ \
272 return sprintf(buf, fmt, sdp->sd_args.ar_##name); \
273} \
274static struct args_attr args_attr_##name = __ATTR_RO(name)
275
276ARGS_ATTR(lockproto, "%s\n");
277ARGS_ATTR(locktable, "%s\n");
278ARGS_ATTR(hostdata, "%s\n");
279ARGS_ATTR(spectator, "%d\n");
280ARGS_ATTR(ignore_local_fs, "%d\n");
281ARGS_ATTR(localcaching, "%d\n");
282ARGS_ATTR(localflocks, "%d\n");
283ARGS_ATTR(debug, "%d\n");
284ARGS_ATTR(upgrade, "%d\n");
285ARGS_ATTR(num_glockd, "%u\n");
286ARGS_ATTR(posix_acl, "%d\n");
287ARGS_ATTR(quota, "%u\n");
288ARGS_ATTR(suiddir, "%d\n");
289ARGS_ATTR(data, "%d\n");
290
291/* one oddball doesn't fit the macro mold */
292static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
293{
294 return sprintf(buf, "%d\n", !!test_bit(SDF_NOATIME, &sdp->sd_flags));
295}
296static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
297
298static struct attribute *args_attrs[] = {
299 &args_attr_lockproto.attr,
300 &args_attr_locktable.attr,
301 &args_attr_hostdata.attr,
302 &args_attr_spectator.attr,
303 &args_attr_ignore_local_fs.attr,
304 &args_attr_localcaching.attr,
305 &args_attr_localflocks.attr,
306 &args_attr_debug.attr,
307 &args_attr_upgrade.attr,
308 &args_attr_num_glockd.attr,
309 &args_attr_posix_acl.attr,
310 &args_attr_quota.attr,
311 &args_attr_suiddir.attr,
312 &args_attr_data.attr,
313 &args_attr_noatime.attr,
314 NULL
315};
316
317/*
318 * display counters from superblock
319 */
320
321struct counters_attr {
322 struct attribute attr;
323 ssize_t (*show)(struct gfs2_sbd *, char *);
324};
325
326#define COUNTERS_ATTR(name, fmt) \
327static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
328{ \
329 return sprintf(buf, fmt, (unsigned int)atomic_read(&sdp->sd_##name)); \
330} \
331static struct counters_attr counters_attr_##name = __ATTR_RO(name)
332
333COUNTERS_ATTR(glock_count, "%u\n");
334COUNTERS_ATTR(glock_held_count, "%u\n");
335COUNTERS_ATTR(inode_count, "%u\n");
336COUNTERS_ATTR(reclaimed, "%u\n");
337
338static struct attribute *counters_attrs[] = {
339 &counters_attr_glock_count.attr,
340 &counters_attr_glock_held_count.attr,
341 &counters_attr_inode_count.attr,
342 &counters_attr_reclaimed.attr,
343 NULL
344};
345
346/*
347 * get and set struct gfs2_tune fields
348 */
349
350static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
351{
352 return sprintf(buf, "%u %u\n", sdp->sd_tune.gt_quota_scale_num,
353 sdp->sd_tune.gt_quota_scale_den);
354}
355
356static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
357 size_t len)
358{
359 struct gfs2_tune *gt = &sdp->sd_tune;
360 unsigned int x, y;
361
362 if (!capable(CAP_SYS_ADMIN))
363 return -EACCES;
364
365 if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
366 return -EINVAL;
367
368 spin_lock(&gt->gt_spin);
369 gt->gt_quota_scale_num = x;
370 gt->gt_quota_scale_den = y;
371 spin_unlock(&gt->gt_spin);
372 return len;
373}
374
375static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
376 int check_zero, const char *buf, size_t len)
377{
378 struct gfs2_tune *gt = &sdp->sd_tune;
379 unsigned int x;
380
381 if (!capable(CAP_SYS_ADMIN))
382 return -EACCES;
383
384 x = simple_strtoul(buf, NULL, 0);
385
386 if (check_zero && !x)
387 return -EINVAL;
388
389 spin_lock(&gt->gt_spin);
390 *field = x;
391 spin_unlock(&gt->gt_spin);
392 return len;
393}
394
395struct tune_attr {
396 struct attribute attr;
397 ssize_t (*show)(struct gfs2_sbd *, char *);
398 ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
399};
400
401#define TUNE_ATTR_3(name, show, store) \
402static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
403
404#define TUNE_ATTR_2(name, store) \
405static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
406{ \
407 return sprintf(buf, "%u\n", sdp->sd_tune.gt_##name); \
408} \
409TUNE_ATTR_3(name, name##_show, store)
410
411#define TUNE_ATTR(name, check_zero) \
412static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
413{ \
414 return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
415} \
416TUNE_ATTR_2(name, name##_store)
417
418#define TUNE_ATTR_DAEMON(name, process) \
419static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
420{ \
421 ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
422 wake_up_process(sdp->sd_##process); \
423 return r; \
424} \
425TUNE_ATTR_2(name, name##_store)
426
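/*
 * For illustration, TUNE_ATTR(demote_secs, 0) below expands to
 * approximately:
 *
 *	static ssize_t demote_secs_store(struct gfs2_sbd *sdp,
 *					 const char *buf, size_t len)
 *	{
 *		return tune_set(sdp, &sdp->sd_tune.gt_demote_secs, 0,
 *				buf, len);
 *	}
 *	static ssize_t demote_secs_show(struct gfs2_sbd *sdp, char *buf)
 *	{
 *		return sprintf(buf, "%u\n", sdp->sd_tune.gt_demote_secs);
 *	}
 *	static struct tune_attr tune_attr_demote_secs =
 *		__ATTR(demote_secs, 0644, demote_secs_show,
 *		       demote_secs_store);
 */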
427TUNE_ATTR(ilimit, 0);
428TUNE_ATTR(ilimit_tries, 0);
429TUNE_ATTR(ilimit_min, 0);
430TUNE_ATTR(demote_secs, 0);
431TUNE_ATTR(incore_log_blocks, 0);
432TUNE_ATTR(log_flush_secs, 0);
433TUNE_ATTR(jindex_refresh_secs, 0);
434TUNE_ATTR(quota_warn_period, 0);
435TUNE_ATTR(quota_quantum, 0);
436TUNE_ATTR(atime_quantum, 0);
437TUNE_ATTR(max_readahead, 0);
438TUNE_ATTR(complain_secs, 0);
439TUNE_ATTR(reclaim_limit, 0);
440TUNE_ATTR(prefetch_secs, 0);
441TUNE_ATTR(statfs_slow, 0);
442TUNE_ATTR(new_files_jdata, 0);
443TUNE_ATTR(new_files_directio, 0);
444TUNE_ATTR(quota_simul_sync, 1);
445TUNE_ATTR(quota_cache_secs, 1);
446TUNE_ATTR(max_atomic_write, 1);
447TUNE_ATTR(stall_secs, 1);
448TUNE_ATTR(entries_per_readdir, 1);
449TUNE_ATTR(greedy_default, 1);
450TUNE_ATTR(greedy_quantum, 1);
451TUNE_ATTR(greedy_max, 1);
452TUNE_ATTR(statfs_quantum, 1);
453TUNE_ATTR_DAEMON(scand_secs, scand_process);
454TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
455TUNE_ATTR_DAEMON(logd_secs, logd_process);
456TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
457TUNE_ATTR_DAEMON(inoded_secs, inoded_process);
458TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
459
460static struct attribute *tune_attrs[] = {
461 &tune_attr_ilimit.attr,
462 &tune_attr_ilimit_tries.attr,
463 &tune_attr_ilimit_min.attr,
464 &tune_attr_demote_secs.attr,
465 &tune_attr_incore_log_blocks.attr,
466 &tune_attr_log_flush_secs.attr,
467 &tune_attr_jindex_refresh_secs.attr,
468 &tune_attr_quota_warn_period.attr,
469 &tune_attr_quota_quantum.attr,
470 &tune_attr_atime_quantum.attr,
471 &tune_attr_max_readahead.attr,
472 &tune_attr_complain_secs.attr,
473 &tune_attr_reclaim_limit.attr,
474 &tune_attr_prefetch_secs.attr,
475 &tune_attr_statfs_slow.attr,
476 &tune_attr_quota_simul_sync.attr,
477 &tune_attr_quota_cache_secs.attr,
478 &tune_attr_max_atomic_write.attr,
479 &tune_attr_stall_secs.attr,
480 &tune_attr_entries_per_readdir.attr,
481 &tune_attr_greedy_default.attr,
482 &tune_attr_greedy_quantum.attr,
483 &tune_attr_greedy_max.attr,
484 &tune_attr_statfs_quantum.attr,
485 &tune_attr_scand_secs.attr,
486 &tune_attr_recoverd_secs.attr,
487 &tune_attr_logd_secs.attr,
488 &tune_attr_quotad_secs.attr,
489 &tune_attr_inoded_secs.attr,
490 &tune_attr_quota_scale.attr,
491 &tune_attr_new_files_jdata.attr,
492 &tune_attr_new_files_directio.attr,
493 NULL
494};
495
496static struct attribute_group lockstruct_group = {
497 .name = "lockstruct",
498 .attrs = lockstruct_attrs
499};
500
501static struct attribute_group counters_group = {
502 .name = "counters",
503 .attrs = counters_attrs
504};
505
506static struct attribute_group args_group = {
507 .name = "args",
508 .attrs = args_attrs
509};
510
511static struct attribute_group tune_group = {
512 .name = "tune",
513 .attrs = tune_attrs
514};
515
516int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
517{
518 int error;
519
520 sdp->sd_kobj.kset = &gfs2_kset;
521 sdp->sd_kobj.ktype = &gfs2_ktype;
522
523 error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
524 if (error)
525 goto fail;
526
527 error = kobject_register(&sdp->sd_kobj);
528 if (error)
529 goto fail;
530
531 error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
532 if (error)
533 goto fail_reg;
534
535 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
536 if (error)
537 goto fail_lockstruct;
538
539 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
540 if (error)
541 goto fail_counters;
542
543 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
544 if (error)
545 goto fail_args;
546
547 return 0;
548
549 fail_args:
550 sysfs_remove_group(&sdp->sd_kobj, &args_group);
551 fail_counters:
552 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
553 fail_lockstruct:
554 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
555 fail_reg:
556 kobject_unregister(&sdp->sd_kobj);
557 fail:
558 return error;
559}
560
561void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
562{
563 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
564 sysfs_remove_group(&sdp->sd_kobj, &args_group);
565 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
566 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
567 kobject_unregister(&sdp->sd_kobj);
568}
569
570int gfs2_sys_init(void)
571{
572 gfs2_sys_margs = NULL;
573 spin_lock_init(&gfs2_sys_margs_lock);
574 return kset_register(&gfs2_kset);
575}
576
577void gfs2_sys_uninit(void)
578{
579 kfree(gfs2_sys_margs);
580 kset_unregister(&gfs2_kset);
581}
582
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..62c8ed89ab9c
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,24 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __SYS_DOT_H__
11#define __SYS_DOT_H__
12
13/* Allow args to be passed to GFS2 when using an initial ram disk */
14extern char *gfs2_sys_margs;
15extern spinlock_t gfs2_sys_margs_lock;
16
17int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
18void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
19
20int gfs2_sys_init(void);
21void gfs2_sys_uninit(void);
22
23#endif /* __SYS_DOT_H__ */
24
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..6b02d8c38f0f
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,186 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/kallsyms.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "glock.h"
23#include "log.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "trans.h"
27#include "util.h"
28
29int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
30 unsigned int revokes)
31{
32 struct gfs2_trans *tr;
33 int error;
34
35 BUG_ON(current->journal_info);
36 BUG_ON(blocks == 0 && revokes == 0);
37
38 tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
39 if (!tr)
40 return -ENOMEM;
41
42 tr->tr_ip = (unsigned long)__builtin_return_address(0);
43 tr->tr_blocks = blocks;
44 tr->tr_revokes = revokes;
45 tr->tr_reserved = 1;
46 if (blocks)
47 tr->tr_reserved += 6 + blocks;
48 if (revokes)
49 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
50 sizeof(uint64_t));
51 INIT_LIST_HEAD(&tr->tr_list_buf);
52
53 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED,
54 GL_NEVER_RECURSE, &tr->tr_t_gh);
55
56 error = gfs2_glock_nq(&tr->tr_t_gh);
57 if (error)
58 goto fail_holder_uninit;
59
60 if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
61 tr->tr_t_gh.gh_flags |= GL_NOCACHE;
62 error = -EROFS;
63 goto fail_gunlock;
64 }
65
66 error = gfs2_log_reserve(sdp, tr->tr_reserved);
67 if (error)
68 goto fail_gunlock;
69
70 current->journal_info = tr;
71
72 return 0;
73
74fail_gunlock:
75 gfs2_glock_dq(&tr->tr_t_gh);
76
77fail_holder_uninit:
78 gfs2_holder_uninit(&tr->tr_t_gh);
79 kfree(tr);
80
81 return error;
82}
83
84void gfs2_trans_end(struct gfs2_sbd *sdp)
85{
86 struct gfs2_trans *tr = current->journal_info;
87
88 BUG_ON(!tr);
89 current->journal_info = NULL;
90
91 if (!tr->tr_touched) {
92 gfs2_log_release(sdp, tr->tr_reserved);
93 gfs2_glock_dq(&tr->tr_t_gh);
94 gfs2_holder_uninit(&tr->tr_t_gh);
95 kfree(tr);
96 return;
97 }
98
99 if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
100 fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
101 tr->tr_num_buf, tr->tr_blocks);
102 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
103 }
104 if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
105 fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
106 tr->tr_num_revoke, tr->tr_revokes);
107 print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
108 }
109
110 gfs2_log_commit(sdp, tr);
111 gfs2_glock_dq(&tr->tr_t_gh);
112 gfs2_holder_uninit(&tr->tr_t_gh);
113 kfree(tr);
114
115 if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
116 gfs2_log_flush(sdp, NULL);
117}
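/*
 * Illustrative usage (hypothetical caller): a metadata update brackets
 * its buffer modifications with a transaction, for example:
 *
 *	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
 *	if (error)
 *		return error;
 *	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 *	...modify dibh->b_data...
 *	gfs2_trans_end(sdp);
 *
 * "ip" and "dibh" are assumed names used only for this sketch; the
 * RES_* reservation constants come from trans.h.
 */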
118
119void gfs2_trans_add_gl(struct gfs2_glock *gl)
120{
121 lops_add(gl->gl_sbd, &gl->gl_le);
122}
123
124/**
125 * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
126 * @gl: the glock the buffer belongs to
127 * @bh: The buffer to add
128 * @meta: True in the case of adding metadata
129 *
130 */
131
132void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
133{
134 struct gfs2_sbd *sdp = gl->gl_sbd;
135 struct gfs2_bufdata *bd;
136
137 bd = bh->b_private;
138 if (bd)
139 gfs2_assert(sdp, bd->bd_gl == gl);
140 else {
141 gfs2_attach_bufdata(gl, bh, meta);
142 bd = bh->b_private;
143 }
144 lops_add(sdp, &bd->bd_le);
145}
146
147void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno)
148{
149 struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
150 GFP_NOFS | __GFP_NOFAIL);
151 lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
152 rv->rv_blkno = blkno;
153 lops_add(sdp, &rv->rv_le);
154}
155
156void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno)
157{
158 struct gfs2_revoke *rv;
159 int found = 0;
160
161 gfs2_log_lock(sdp);
162
163 list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
164 if (rv->rv_blkno == blkno) {
165 list_del(&rv->rv_le.le_list);
166 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
167 sdp->sd_log_num_revoke--;
168 found = 1;
169 break;
170 }
171 }
172
173 gfs2_log_unlock(sdp);
174
175 if (found) {
176 struct gfs2_trans *tr = current->journal_info;
177 kfree(rv);
178 tr->tr_num_revoke_rm++;
179 }
180}
181
182void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
183{
184 lops_add(rgd->rd_sbd, &rgd->rd_le);
185}
186
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..6b5e9e8bf561
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __TRANS_DOT_H__
11#define __TRANS_DOT_H__
12
13#define RES_DINODE 1
14#define RES_INDIRECT 1
15#define RES_JDATA 1
16#define RES_DATA 1
17#define RES_LEAF 1
18#define RES_RG_BIT 2
19#define RES_EATTR 1
20#define RES_UNLINKED 1
21#define RES_STATFS 1
22#define RES_QUOTA 2
23
24int gfs2_trans_begin(struct gfs2_sbd *sdp,
25 unsigned int blocks, unsigned int revokes);
26
27void gfs2_trans_end(struct gfs2_sbd *sdp);
28
29void gfs2_trans_add_gl(struct gfs2_glock *gl);
30void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
31void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno);
32void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno);
33void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
34
35#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/unlinked.c b/fs/gfs2/unlinked.c
new file mode 100644
index 000000000000..24b91c23bc2d
--- /dev/null
+++ b/fs/gfs2/unlinked.c
@@ -0,0 +1,458 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/semaphore.h>
18
19#include "gfs2.h"
20#include "lm_interface.h"
21#include "incore.h"
22#include "bmap.h"
23#include "inode.h"
24#include "meta_io.h"
25#include "trans.h"
26#include "unlinked.h"
27#include "util.h"
28
29static int munge_ondisk(struct gfs2_sbd *sdp, unsigned int slot,
30 struct gfs2_unlinked_tag *ut)
31{
32 struct gfs2_inode *ip = sdp->sd_ut_inode->u.generic_ip;
33 unsigned int block, offset;
34 uint64_t dblock;
35 int new = 0;
36 struct buffer_head *bh;
37 int error;
38
39 block = slot / sdp->sd_ut_per_block;
40 offset = slot % sdp->sd_ut_per_block;
41
42 error = gfs2_block_map(ip, block, &new, &dblock, NULL);
43 if (error)
44 return error;
45 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh);
46 if (error)
47 return error;
48 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_UT)) {
49 error = -EIO;
50 goto out;
51 }
52
53 mutex_lock(&sdp->sd_unlinked_mutex);
54 gfs2_trans_add_bh(ip->i_gl, bh, 1);
55 gfs2_unlinked_tag_out(ut, bh->b_data +
56 sizeof(struct gfs2_meta_header) +
57 offset * sizeof(struct gfs2_unlinked_tag));
58 mutex_unlock(&sdp->sd_unlinked_mutex);
59
60 out:
61 brelse(bh);
62
63 return error;
64}
65
66static void ul_hash(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
67{
68 spin_lock(&sdp->sd_unlinked_spin);
69 list_add(&ul->ul_list, &sdp->sd_unlinked_list);
70 gfs2_assert(sdp, ul->ul_count);
71 ul->ul_count++;
72 atomic_inc(&sdp->sd_unlinked_count);
73 spin_unlock(&sdp->sd_unlinked_spin);
74}
75
76static void ul_unhash(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
77{
78 spin_lock(&sdp->sd_unlinked_spin);
79 list_del_init(&ul->ul_list);
80 gfs2_assert(sdp, ul->ul_count > 1);
81 ul->ul_count--;
82 gfs2_assert_warn(sdp, atomic_read(&sdp->sd_unlinked_count) > 0);
83 atomic_dec(&sdp->sd_unlinked_count);
84 spin_unlock(&sdp->sd_unlinked_spin);
85}
86
87static struct gfs2_unlinked *ul_fish(struct gfs2_sbd *sdp)
88{
89 struct list_head *head;
90 struct gfs2_unlinked *ul;
91 int found = 0;
92
93 if (sdp->sd_vfs->s_flags & MS_RDONLY)
94 return NULL;
95
96 spin_lock(&sdp->sd_unlinked_spin);
97
98 head = &sdp->sd_unlinked_list;
99
100 list_for_each_entry(ul, head, ul_list) {
101 if (test_bit(ULF_LOCKED, &ul->ul_flags))
102 continue;
103
104 list_move_tail(&ul->ul_list, head);
105 ul->ul_count++;
106 set_bit(ULF_LOCKED, &ul->ul_flags);
107 found = 1;
108
109 break;
110 }
111
112 if (!found)
113 ul = NULL;
114
115 spin_unlock(&sdp->sd_unlinked_spin);
116
117 return ul;
118}
119
120/**
121 * enforce_limit - limit the number of inodes waiting to be deallocated
122 * @sdp: the filesystem
 123 */
126
127static void enforce_limit(struct gfs2_sbd *sdp)
128{
129 unsigned int tries = 0, min = 0;
130 int error;
131
132 if (atomic_read(&sdp->sd_unlinked_count) >=
133 gfs2_tune_get(sdp, gt_ilimit)) {
134 tries = gfs2_tune_get(sdp, gt_ilimit_tries);
135 min = gfs2_tune_get(sdp, gt_ilimit_min);
136 }
137
138 while (tries--) {
139 struct gfs2_unlinked *ul = ul_fish(sdp);
140 if (!ul)
141 break;
142 error = gfs2_inode_dealloc(sdp, ul);
143 gfs2_unlinked_put(sdp, ul);
144
145 if (!error) {
146 if (!--min)
147 break;
148 } else if (error != 1)
149 break;
150 }
151}
152
153static struct gfs2_unlinked *ul_alloc(struct gfs2_sbd *sdp)
154{
155 struct gfs2_unlinked *ul;
156
157 ul = kzalloc(sizeof(struct gfs2_unlinked), GFP_KERNEL);
158 if (ul) {
159 INIT_LIST_HEAD(&ul->ul_list);
160 ul->ul_count = 1;
161 set_bit(ULF_LOCKED, &ul->ul_flags);
162 }
163
164 return ul;
165}
166
167int gfs2_unlinked_get(struct gfs2_sbd *sdp, struct gfs2_unlinked **ul)
168{
169 unsigned int c, o = 0, b;
170 unsigned char byte = 0;
171
172 enforce_limit(sdp);
173
174 *ul = ul_alloc(sdp);
175 if (!*ul)
176 return -ENOMEM;
177
178 spin_lock(&sdp->sd_unlinked_spin);
179
180 for (c = 0; c < sdp->sd_unlinked_chunks; c++)
181 for (o = 0; o < PAGE_SIZE; o++) {
182 byte = sdp->sd_unlinked_bitmap[c][o];
183 if (byte != 0xFF)
184 goto found;
185 }
186
187 goto fail;
188
189 found:
190 for (b = 0; b < 8; b++)
191 if (!(byte & (1 << b)))
192 break;
193 (*ul)->ul_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
194
195 if ((*ul)->ul_slot >= sdp->sd_unlinked_slots)
196 goto fail;
197
198 sdp->sd_unlinked_bitmap[c][o] |= 1 << b;
199
200 spin_unlock(&sdp->sd_unlinked_spin);
201
202 return 0;
203
204 fail:
205 spin_unlock(&sdp->sd_unlinked_spin);
206 kfree(*ul);
207 return -ENOSPC;
208}
209
210void gfs2_unlinked_put(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
211{
212 gfs2_assert_warn(sdp, test_and_clear_bit(ULF_LOCKED, &ul->ul_flags));
213
214 spin_lock(&sdp->sd_unlinked_spin);
215 gfs2_assert(sdp, ul->ul_count);
216 ul->ul_count--;
217 if (!ul->ul_count) {
218 gfs2_icbit_munge(sdp, sdp->sd_unlinked_bitmap, ul->ul_slot, 0);
219 spin_unlock(&sdp->sd_unlinked_spin);
220 kfree(ul);
221 } else
222 spin_unlock(&sdp->sd_unlinked_spin);
223}
224
225int gfs2_unlinked_ondisk_add(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
226{
227 int error;
228
229 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
230 gfs2_assert_warn(sdp, list_empty(&ul->ul_list));
231
232 error = munge_ondisk(sdp, ul->ul_slot, &ul->ul_ut);
233 if (!error)
234 ul_hash(sdp, ul);
235
236 return error;
237}
238
239int gfs2_unlinked_ondisk_munge(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
240{
241 int error;
242
243 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
244 gfs2_assert_warn(sdp, !list_empty(&ul->ul_list));
245
246 error = munge_ondisk(sdp, ul->ul_slot, &ul->ul_ut);
247
248 return error;
249}
250
251int gfs2_unlinked_ondisk_rm(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul)
252{
253 struct gfs2_unlinked_tag ut;
254 int error;
255
256 gfs2_assert_warn(sdp, test_bit(ULF_LOCKED, &ul->ul_flags));
257 gfs2_assert_warn(sdp, !list_empty(&ul->ul_list));
258
259 memset(&ut, 0, sizeof(struct gfs2_unlinked_tag));
260
261 error = munge_ondisk(sdp, ul->ul_slot, &ut);
262 if (error)
263 return error;
264
265 ul_unhash(sdp, ul);
266
267 return 0;
268}
269
270/**
271 * gfs2_unlinked_dealloc - Go through the list of inodes to be deallocated
272 * @sdp: the filesystem
273 *
274 * Returns: errno
275 */
276
277int gfs2_unlinked_dealloc(struct gfs2_sbd *sdp)
278{
279 unsigned int hits, strikes;
280 int error;
281
282 for (;;) {
283 hits = 0;
284 strikes = 0;
285
286 for (;;) {
287 struct gfs2_unlinked *ul = ul_fish(sdp);
288 if (!ul)
289 return 0;
290 error = gfs2_inode_dealloc(sdp, ul);
291 gfs2_unlinked_put(sdp, ul);
292
293 if (!error) {
294 hits++;
295 if (strikes)
296 strikes--;
297 } else if (error == 1) {
298 strikes++;
299 if (strikes >=
300 atomic_read(&sdp->sd_unlinked_count)) {
301 error = 0;
302 break;
303 }
304 } else
305 return error;
306 }
307
308 if (!hits || kthread_should_stop())
309 break;
310
311 cond_resched();
312 }
313
314 return 0;
315}
316
317int gfs2_unlinked_init(struct gfs2_sbd *sdp)
318{
319 struct gfs2_inode *ip = sdp->sd_ut_inode->u.generic_ip;
320 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
321 unsigned int x, slot = 0;
322 unsigned int found = 0;
323 uint64_t dblock;
324 uint32_t extlen = 0;
325 int error;
326
327 if (!ip->i_di.di_size ||
328 ip->i_di.di_size > (64 << 20) ||
329 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
330 gfs2_consist_inode(ip);
331 return -EIO;
332 }
333 sdp->sd_unlinked_slots = blocks * sdp->sd_ut_per_block;
334 sdp->sd_unlinked_chunks = DIV_ROUND_UP(sdp->sd_unlinked_slots,
335 8 * PAGE_SIZE);
336
337 error = -ENOMEM;
338
339 sdp->sd_unlinked_bitmap = kcalloc(sdp->sd_unlinked_chunks,
340 sizeof(unsigned char *),
341 GFP_KERNEL);
342 if (!sdp->sd_unlinked_bitmap)
343 return error;
344
345 for (x = 0; x < sdp->sd_unlinked_chunks; x++) {
346 sdp->sd_unlinked_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
347 if (!sdp->sd_unlinked_bitmap[x])
348 goto fail;
349 }
350
351 for (x = 0; x < blocks; x++) {
352 struct buffer_head *bh;
353 unsigned int y;
354
355 if (!extlen) {
356 int new = 0;
357 error = gfs2_block_map(ip, x, &new, &dblock, &extlen);
358 if (error)
359 goto fail;
360 }
361 gfs2_meta_ra(ip->i_gl, dblock, extlen);
362 error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT,
363 &bh);
364 if (error)
365 goto fail;
366 error = -EIO;
367 if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_UT)) {
368 brelse(bh);
369 goto fail;
370 }
371
372 for (y = 0;
373 y < sdp->sd_ut_per_block && slot < sdp->sd_unlinked_slots;
374 y++, slot++) {
375 struct gfs2_unlinked_tag ut;
376 struct gfs2_unlinked *ul;
377
378 gfs2_unlinked_tag_in(&ut, bh->b_data +
379 sizeof(struct gfs2_meta_header) +
380 y * sizeof(struct gfs2_unlinked_tag));
381 if (!ut.ut_inum.no_addr)
382 continue;
383
384 error = -ENOMEM;
385 ul = ul_alloc(sdp);
386 if (!ul) {
387 brelse(bh);
388 goto fail;
389 }
390 ul->ul_ut = ut;
391 ul->ul_slot = slot;
392
393 spin_lock(&sdp->sd_unlinked_spin);
394 gfs2_icbit_munge(sdp, sdp->sd_unlinked_bitmap, slot, 1);
395 spin_unlock(&sdp->sd_unlinked_spin);
396 ul_hash(sdp, ul);
397
398 gfs2_unlinked_put(sdp, ul);
399 found++;
400 }
401
402 brelse(bh);
403 dblock++;
404 extlen--;
405 }
406
407 if (found)
408 fs_info(sdp, "found %u unlinked inodes\n", found);
409
410 return 0;
411
412 fail:
413 gfs2_unlinked_cleanup(sdp);
414 return error;
415}
416
417/**
418 * gfs2_unlinked_cleanup - get rid of any extra struct gfs2_unlinked structures
419 * @sdp: the filesystem
420 *
421 */
422
423void gfs2_unlinked_cleanup(struct gfs2_sbd *sdp)
424{
425 struct list_head *head = &sdp->sd_unlinked_list;
426 struct gfs2_unlinked *ul;
427 unsigned int x;
428
429 spin_lock(&sdp->sd_unlinked_spin);
430 while (!list_empty(head)) {
431 ul = list_entry(head->next, struct gfs2_unlinked, ul_list);
432
433 if (ul->ul_count > 1) {
434 list_move_tail(&ul->ul_list, head);
435 spin_unlock(&sdp->sd_unlinked_spin);
436 schedule();
437 spin_lock(&sdp->sd_unlinked_spin);
438 continue;
439 }
440
441 list_del_init(&ul->ul_list);
442 atomic_dec(&sdp->sd_unlinked_count);
443
444 gfs2_assert_warn(sdp, ul->ul_count == 1);
445 gfs2_assert_warn(sdp, !test_bit(ULF_LOCKED, &ul->ul_flags));
446 kfree(ul);
447 }
448 spin_unlock(&sdp->sd_unlinked_spin);
449
450 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_unlinked_count));
451
452 if (sdp->sd_unlinked_bitmap) {
453 for (x = 0; x < sdp->sd_unlinked_chunks; x++)
454 kfree(sdp->sd_unlinked_bitmap[x]);
455 kfree(sdp->sd_unlinked_bitmap);
456 }
457}
458
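The slot arithmetic above appears twice: munge_ondisk() turns a slot into an on-disk (block, offset) pair, while gfs2_unlinked_get() and gfs2_icbit_munge() turn the same slot into a (chunk, byte, bit) triple in the in-core bitmap, one PAGE_SIZE page of bits per chunk. A minimal standalone sketch of both mappings, using a hypothetical tags-per-block value:

#include <stdio.h>

#define EX_PAGE_SIZE 4096u			/* stand-in for PAGE_SIZE */

int main(void)
{
	unsigned int ut_per_block = 253;	/* hypothetical sd_ut_per_block */
	unsigned int slot = 1000;

	/* on-disk placement, as in munge_ondisk() */
	unsigned int block = slot / ut_per_block;
	unsigned int offset = slot % ut_per_block;

	/* in-core bitmap placement, as in gfs2_icbit_munge() */
	unsigned int c = slot / (8 * EX_PAGE_SIZE);
	unsigned int o = (slot % (8 * EX_PAGE_SIZE)) / 8;
	unsigned int b = slot % 8;

	printf("slot %u -> block %u, offset %u; chunk %u, byte %u, bit %u\n",
	       slot, block, offset, c, o, b);
	return 0;
}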
diff --git a/fs/gfs2/unlinked.h b/fs/gfs2/unlinked.h
new file mode 100644
index 000000000000..51e77f88d74f
--- /dev/null
+++ b/fs/gfs2/unlinked.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UNLINKED_DOT_H__
11#define __UNLINKED_DOT_H__
12
13int gfs2_unlinked_get(struct gfs2_sbd *sdp, struct gfs2_unlinked **ul);
14void gfs2_unlinked_put(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
15
16int gfs2_unlinked_ondisk_add(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
17int gfs2_unlinked_ondisk_munge(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
18int gfs2_unlinked_ondisk_rm(struct gfs2_sbd *sdp, struct gfs2_unlinked *ul);
19
20int gfs2_unlinked_dealloc(struct gfs2_sbd *sdp);
21
22int gfs2_unlinked_init(struct gfs2_sbd *sdp);
23void gfs2_unlinked_cleanup(struct gfs2_sbd *sdp);
24
25#endif /* __UNLINKED_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..7cd9e25639c4
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/crc32.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/semaphore.h>
18#include <asm/uaccess.h>
19
20#include "gfs2.h"
21#include "lm_interface.h"
22#include "incore.h"
23#include "glock.h"
24#include "lm.h"
25#include "util.h"
26
27kmem_cache_t *gfs2_glock_cachep __read_mostly;
28kmem_cache_t *gfs2_inode_cachep __read_mostly;
29kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
30
31void gfs2_assert_i(struct gfs2_sbd *sdp)
32{
33 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
34 sdp->sd_fsname);
35}
36
37/**
38 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
39 * Returns: -1 if this call withdrew the machine,
40 * -2 if it was already withdrawn
41 */
42
43int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
44 const char *function, char *file, unsigned int line)
45{
46 int me;
47 me = gfs2_lm_withdraw(sdp,
48 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
49 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
50 sdp->sd_fsname, assertion,
51 sdp->sd_fsname, function, file, line);
52 dump_stack();
53 return (me) ? -1 : -2;
54}
55
56/**
57 * gfs2_assert_warn_i - Print a message to the console if @assertion is false
58 * Returns: -1 if we printed something
59 * -2 if we didn't
60 */
61
62int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
63 const char *function, char *file, unsigned int line)
64{
65 if (time_before(jiffies,
66 sdp->sd_last_warning +
67 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
68 return -2;
69
70 printk(KERN_WARNING
71 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
72 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
73 sdp->sd_fsname, assertion,
74 sdp->sd_fsname, function, file, line);
75
76 if (sdp->sd_args.ar_debug)
77 BUG();
78 else
79 dump_stack();
80
81 sdp->sd_last_warning = jiffies;
82
83 return -1;
84}
85
86/**
87 * gfs2_consist_i - Flag a filesystem consistency error and withdraw
88 * Returns: -1 if this call withdrew the machine,
89 * 0 if it was already withdrawn
90 */
91
92int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
93 char *file, unsigned int line)
94{
95 int rv;
96 rv = gfs2_lm_withdraw(sdp,
97 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
98 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
99 sdp->sd_fsname,
100 sdp->sd_fsname, function, file, line);
101 return rv;
102}
103
104/**
105 * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
106 * Returns: -1 if this call withdrew the machine,
107 * 0 if it was already withdrawn
108 */
109
110int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
111 const char *function, char *file, unsigned int line)
112{
113 struct gfs2_sbd *sdp = ip->i_sbd;
114 int rv;
115 rv = gfs2_lm_withdraw(sdp,
116 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
117 "GFS2: fsid=%s: inode = %llu %llu\n"
118 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
119 sdp->sd_fsname,
120 sdp->sd_fsname, ip->i_num.no_formal_ino, ip->i_num.no_addr,
121 sdp->sd_fsname, function, file, line);
122 return rv;
123}
124
125/**
126 * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
127 * Returns: -1 if this call withdrew the machine,
128 * 0 if it was already withdrawn
129 */
130
131int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
132 const char *function, char *file, unsigned int line)
133{
134 struct gfs2_sbd *sdp = rgd->rd_sbd;
135 int rv;
136 rv = gfs2_lm_withdraw(sdp,
137 "GFS2: fsid=%s: fatal: filesystem consistency error\n"
138 "GFS2: fsid=%s: RG = %llu\n"
139 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
140 sdp->sd_fsname,
141 sdp->sd_fsname, rgd->rd_ri.ri_addr,
142 sdp->sd_fsname, function, file, line);
143 return rv;
144}
145
146/**
147 * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
148 * Returns: -1 if this call withdrew the machine,
149 * -2 if it was already withdrawn
150 */
151
152int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
153 const char *type, const char *function, char *file,
154 unsigned int line)
155{
156 int me;
157 me = gfs2_lm_withdraw(sdp,
158 "GFS2: fsid=%s: fatal: invalid metadata block\n"
159 "GFS2: fsid=%s: bh = %llu (%s)\n"
160 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
161 sdp->sd_fsname,
162 sdp->sd_fsname, (uint64_t)bh->b_blocknr, type,
163 sdp->sd_fsname, function, file, line);
164 return (me) ? -1 : -2;
165}
166
167/**
168 * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
169 * Returns: -1 if this call withdrew the machine,
170 * -2 if it was already withdrawn
171 */
172
173int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
174 uint16_t type, uint16_t t, const char *function,
175 char *file, unsigned int line)
176{
177 int me;
178 me = gfs2_lm_withdraw(sdp,
179 "GFS2: fsid=%s: fatal: invalid metadata block\n"
180 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
181 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
182 sdp->sd_fsname,
183 sdp->sd_fsname, (uint64_t)bh->b_blocknr, type, t,
184 sdp->sd_fsname, function, file, line);
185 return (me) ? -1 : -2;
186}
187
188/**
189 * gfs2_io_error_i - Flag an I/O error and withdraw
190 * Returns: -1 if this call withdrew the machine,
191 * 0 if it was already withdrawn
192 */
193
194int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
195 unsigned int line)
196{
197 int rv;
198 rv = gfs2_lm_withdraw(sdp,
199 "GFS2: fsid=%s: fatal: I/O error\n"
200 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
201 sdp->sd_fsname,
202 sdp->sd_fsname, function, file, line);
203 return rv;
204}
205
206/**
207 * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
208 * Returns: -1 if this call withdrew the machine,
209 * 0 if it was already withdrawn
210 */
211
212int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
213 const char *function, char *file, unsigned int line)
214{
215 int rv;
216 rv = gfs2_lm_withdraw(sdp,
217 "GFS2: fsid=%s: fatal: I/O error\n"
218 "GFS2: fsid=%s: block = %llu\n"
219 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
220 sdp->sd_fsname,
221 sdp->sd_fsname, (uint64_t)bh->b_blocknr,
222 sdp->sd_fsname, function, file, line);
223 return rv;
224}
225
226void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
227 unsigned int bit, int new_value)
228{
229 unsigned int c, o, b = bit;
230 int old_value;
231
232 c = b / (8 * PAGE_SIZE);
233 b %= 8 * PAGE_SIZE;
234 o = b / 8;
235 b %= 8;
236
237 old_value = (bitmap[c][o] & (1 << b));
238 gfs2_assert_withdraw(sdp, !old_value != !new_value);
239
240 if (new_value)
241 bitmap[c][o] |= 1 << b;
242 else
243 bitmap[c][o] &= ~(1 << b);
244}
245
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..4532dbab0a2c
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,169 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License v.2.
8 */
9
10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__
12
13
14#define fs_printk(level, fs, fmt, arg...) \
15 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
16
17#define fs_info(fs, fmt, arg...) \
18 fs_printk(KERN_INFO , fs , fmt , ## arg)
19
20#define fs_warn(fs, fmt, arg...) \
21 fs_printk(KERN_WARNING , fs , fmt , ## arg)
22
23#define fs_err(fs, fmt, arg...) \
24 fs_printk(KERN_ERR, fs , fmt , ## arg)
25
26
27void gfs2_assert_i(struct gfs2_sbd *sdp);
28
29#define gfs2_assert(sdp, assertion) \
30do { \
31 if (unlikely(!(assertion))) { \
32 gfs2_assert_i(sdp); \
33 BUG(); \
34 } \
35} while (0)
36
37
38int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
39 const char *function, char *file, unsigned int line);
40
41#define gfs2_assert_withdraw(sdp, assertion) \
42((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
43 __FUNCTION__, __FILE__, __LINE__))
44
45
46int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
47 const char *function, char *file, unsigned int line);
48
49#define gfs2_assert_warn(sdp, assertion) \
50((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
51 __FUNCTION__, __FILE__, __LINE__))
52
53
54int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
55 const char *function, char *file, unsigned int line);
56
57#define gfs2_consist(sdp) \
58gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
59
60
61int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
62 const char *function, char *file, unsigned int line);
63
64#define gfs2_consist_inode(ip) \
65gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
66
67
68int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
69 const char *function, char *file, unsigned int line);
70
71#define gfs2_consist_rgrpd(rgd) \
72gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
73
74
75int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
76 const char *type, const char *function,
77 char *file, unsigned int line);
78
79static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
80 struct buffer_head *bh,
81 const char *function,
82 char *file, unsigned int line)
83{
84 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
85 uint32_t magic = mh->mh_magic;
86 magic = be32_to_cpu(magic);
87 if (unlikely(magic != GFS2_MAGIC))
88 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
89 file, line);
90 return 0;
91}
92
93#define gfs2_meta_check(sdp, bh) \
94gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
95
96
97int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
98 uint16_t type, uint16_t t,
99 const char *function,
100 char *file, unsigned int line);
101
102static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
103 struct buffer_head *bh,
104 uint16_t type,
105 const char *function,
106 char *file, unsigned int line)
107{
108 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
109 uint32_t magic = mh->mh_magic;
110 uint16_t t = be32_to_cpu(mh->mh_type);
111 magic = be32_to_cpu(magic);
112 if (unlikely(magic != GFS2_MAGIC))
113 return gfs2_meta_check_ii(sdp, bh, "magic number", function,
114 file, line);
115 if (unlikely(t != type))
116 return gfs2_metatype_check_ii(sdp, bh, type, t, function,
117 file, line);
118 return 0;
119}
120
121#define gfs2_metatype_check(sdp, bh, type) \
122gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
123
124static inline void gfs2_metatype_set(struct buffer_head *bh, uint16_t type,
125 uint16_t format)
126{
127 struct gfs2_meta_header *mh;
128 mh = (struct gfs2_meta_header *)bh->b_data;
129 mh->mh_type = cpu_to_be32(type);
130 mh->mh_format = cpu_to_be32(format);
131}
132
133
134int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
135 char *file, unsigned int line);
136
137#define gfs2_io_error(sdp) \
138gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__)
139
140
141int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
142 const char *function, char *file, unsigned int line);
143
144#define gfs2_io_error_bh(sdp, bh) \
145gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
146
147
148extern kmem_cache_t *gfs2_glock_cachep;
149extern kmem_cache_t *gfs2_inode_cachep;
150extern kmem_cache_t *gfs2_bufdata_cachep;
151
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p)
154{
155 unsigned int x;
156 spin_lock(&gt->gt_spin);
157 x = *p;
158 spin_unlock(&gt->gt_spin);
159 return x;
160}
161
162#define gfs2_tune_get(sdp, field) \
163gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
164
165void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
166 unsigned int bit, int new_value);
167
168#endif /* __UTIL_DOT_H__ */
169
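The *_i() helpers and their wrapper macros above share one pattern: the tested condition is stringified with the # operator and the call site is captured through __FUNCTION__/__FILE__/__LINE__, so a single out-of-line function can report exactly what failed and where. A standalone sketch of the same pattern (not part of this patch, and using standard C's __func__ in place of the gcc-specific __FUNCTION__):

#include <stdio.h>

static int report_failed(const char *assertion, const char *function,
			 const char *file, unsigned int line)
{
	fprintf(stderr, "fatal: assertion \"%s\" failed\n", assertion);
	fprintf(stderr, "function = %s, file = %s, line = %u\n",
		function, file, line);
	return -1;
}

#define example_assert_withdraw(assertion) \
	((assertion) ? 0 : report_failed(#assertion, __func__, \
					 __FILE__, __LINE__))

int main(void)
{
	int count = 0;
	/* prints the failed condition and the call site, then returns -1 */
	return example_assert_withdraw(count > 0) ? 1 : 0;
}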
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
new file mode 100644
index 000000000000..1b1dcb9a40bb
--- /dev/null
+++ b/include/linux/dlm.h
@@ -0,0 +1,302 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14#ifndef __DLM_DOT_H__
15#define __DLM_DOT_H__
16
17/*
18 * Interface to Distributed Lock Manager (DLM)
19 * routines and structures to use DLM lockspaces
20 */
21
22/*
23 * Lock Modes
24 */
25
26#define DLM_LOCK_IV -1 /* invalid */
27#define DLM_LOCK_NL 0 /* null */
28#define DLM_LOCK_CR 1 /* concurrent read */
29#define DLM_LOCK_CW 2 /* concurrent write */
30#define DLM_LOCK_PR 3 /* protected read */
31#define DLM_LOCK_PW 4 /* protected write */
32#define DLM_LOCK_EX 5 /* exclusive */
33
34/*
35 * Maximum size in bytes of a dlm_lock name
36 */
37
38#define DLM_RESNAME_MAXLEN 64
39
40/*
41 * Flags to dlm_lock
42 *
43 * DLM_LKF_NOQUEUE
44 *
45 * Do not queue the lock request on the wait queue if it cannot be granted
46 * immediately. If the lock cannot be granted because of this flag, DLM will
47 * either return -EAGAIN from the dlm_lock call or will return 0 from
48 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
49 *
50 * DLM_LKF_CANCEL
51 *
52 * Used to cancel a pending lock request or conversion. A converting lock is
53 * returned to its previously granted mode.
54 *
55 * DLM_LKF_CONVERT
56 *
57 * Indicates a lock conversion request. For conversions the name and namelen
58 * are ignored and the lock ID in the LKSB is used to identify the lock.
59 *
60 * DLM_LKF_VALBLK
61 *
62 * Requests DLM to return the current contents of the lock value block in the
63 * lock status block. When this flag is set in a lock conversion from PW or EX
64 * modes, DLM assigns the value specified in the lock status block to the lock
65 * value block of the lock resource. The LVB is a DLM_LVB_LEN size array
66 * containing application-specific information.
67 *
68 * DLM_LKF_QUECVT
69 *
70 * Force a conversion request to be queued, even if it is compatible with
71 * the granted modes of other locks on the same resource.
72 *
73 * DLM_LKF_IVVALBLK
74 *
75 * Invalidate the lock value block.
76 *
77 * DLM_LKF_CONVDEADLK
78 *
79 * Allows the dlm to resolve conversion deadlocks internally by demoting the
80 * granted mode of a converting lock to NL. The DLM_SBF_DEMOTED flag is
81 * returned for a conversion that's been effected by this.
82 *
83 * DLM_LKF_PERSISTENT
84 *
85 * Only relevant to locks originating in userspace. A persistent lock will not
86 * be removed if the process holding the lock exits.
87 *
88 * DLM_LKF_NODLCKWT
89 * DLM_LKF_NODLCKBLK
90 *
91 * not yet implemented
92 *
93 * DLM_LKF_EXPEDITE
94 *
95 * Used only with new requests for NL mode locks. Tells the lock manager
96 * to grant the lock, ignoring other locks in convert and wait queues.
97 *
98 * DLM_LKF_NOQUEUEBAST
99 *
100 * Send blocking ASTs before returning -EAGAIN to the caller. Only used
101 * along with the NOQUEUE flag; otherwise, blocking ASTs are not sent for
102 * failed NOQUEUE requests.
103 *
104 * DLM_LKF_HEADQUE
105 *
106 * Add a lock to the head of the convert or wait queue rather than the tail.
107 *
108 * DLM_LKF_NOORDER
109 *
110 * Disregard the standard grant order rules and grant a lock as soon as it
111 * is compatible with other granted locks.
112 *
113 * DLM_LKF_ORPHAN
114 *
115 * not yet implemented
116 *
117 * DLM_LKF_ALTPR
118 *
119 * If the requested mode cannot be granted immediately, try to grant the lock
120 * in PR mode instead. If this alternate mode is granted instead of the
121 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
122 *
123 * DLM_LKF_ALTCW
124 *
125 * The same as ALTPR, but the alternate mode is CW.
126 *
127 * DLM_LKF_FORCEUNLOCK
128 *
129 * Unlock the lock even if it is converting or waiting or has sublocks.
130 * Only really for use by the userland device.c code.
131 *
132 */
133
134#define DLM_LKF_NOQUEUE 0x00000001
135#define DLM_LKF_CANCEL 0x00000002
136#define DLM_LKF_CONVERT 0x00000004
137#define DLM_LKF_VALBLK 0x00000008
138#define DLM_LKF_QUECVT 0x00000010
139#define DLM_LKF_IVVALBLK 0x00000020
140#define DLM_LKF_CONVDEADLK 0x00000040
141#define DLM_LKF_PERSISTENT 0x00000080
142#define DLM_LKF_NODLCKWT 0x00000100
143#define DLM_LKF_NODLCKBLK 0x00000200
144#define DLM_LKF_EXPEDITE 0x00000400
145#define DLM_LKF_NOQUEUEBAST 0x00000800
146#define DLM_LKF_HEADQUE 0x00001000
147#define DLM_LKF_NOORDER 0x00002000
148#define DLM_LKF_ORPHAN 0x00004000
149#define DLM_LKF_ALTPR 0x00008000
150#define DLM_LKF_ALTCW 0x00010000
151#define DLM_LKF_FORCEUNLOCK 0x00020000
152
153/*
154 * Some return codes that are not in errno.h
155 */
156
157#define DLM_ECANCEL 0x10001
158#define DLM_EUNLOCK 0x10002
159
160typedef void dlm_lockspace_t;
161
162/*
163 * Lock status block
164 *
165 * Use this structure to specify the contents of the lock value block. For a
166 * conversion request, this structure is used to specify the lock ID of the
167 * lock. DLM writes the status of the lock request and the lock ID assigned
168 * to the request in the lock status block.
169 *
170 * sb_lkid: the returned lock ID. It is set on new (non-conversion) requests.
171 * It is available when dlm_lock returns.
172 *
173 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
174 * shown for the DLM_LKF_VALBLK flag.
175 *
176 * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
177 * it was first demoted to NL to avoid conversion deadlock.
178 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
179 *
180 * sb_status: the returned status of the lock request set prior to AST
181 * execution. Possible return values:
182 *
183 * 0 if lock request was successful
184 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
185 * -ENOMEM if there is no memory to process request
186 * -EINVAL if there are invalid parameters
187 * -DLM_EUNLOCK if unlock request was successful
188 * -DLM_ECANCEL if a cancel completed successfully
189 */
190
191#define DLM_SBF_DEMOTED 0x01
192#define DLM_SBF_VALNOTVALID 0x02
193#define DLM_SBF_ALTMODE 0x04
194
195struct dlm_lksb {
196 int sb_status;
197 uint32_t sb_lkid;
198 char sb_flags;
199 char * sb_lvbptr;
200};
201
202
203#ifdef __KERNEL__
204
205#define DLM_LSFL_NODIR 0x00000001
206
207/*
208 * dlm_new_lockspace
209 *
210 * Starts a lockspace with the given name. If the named lockspace exists in
211 * the cluster, the calling node joins it.
212 */
213
214int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
215 uint32_t flags, int lvblen);
216
217/*
218 * dlm_release_lockspace
219 *
220 * Stop a lockspace.
221 */
222
223int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);
224
225/*
226 * dlm_lock
227 *
228 * Make an asynchronous request to acquire or convert a lock on a named
229 * resource.
230 *
231 * lockspace: context for the request
232 * mode: the requested mode of the lock (DLM_LOCK_)
233 * lksb: lock status block for input and async return values
234 * flags: input flags (DLM_LKF_)
235 * name: name of the resource to lock, can be binary
236 * namelen: the length in bytes of the resource name (up to DLM_RESNAME_MAXLEN)
237 * parent: the lock ID of a parent lock or 0 if none
238 * lockast: function DLM executes when it completes processing the request
239 * astarg: argument passed to lockast and bast functions
240 * bast: function DLM executes when this lock later blocks another request
241 *
242 * Returns:
243 * 0 if request is successfully queued for processing
244 * -EINVAL if any input parameters are invalid
245 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
246 * -ENOMEM if there is no memory to process request
247 * -ENOTCONN if there is a communication error
248 *
249 * If the call to dlm_lock returns an error then the operation has failed and
250 * the AST routine will not be called. If dlm_lock returns 0 it is still
251 * possible that the lock operation will fail. The AST routine will be called
252 * when the locking is complete and the status is returned in the lksb.
253 *
254 * If AST routines or their argument are passed to a conversion operation,
255 * they will overwrite the values that were passed to a previous dlm_lock
256 * call.
257 *
258 * AST routines should not block (at least not for long), but may make
259 * any locking calls they please.
260 */
261
262int dlm_lock(dlm_lockspace_t *lockspace,
263 int mode,
264 struct dlm_lksb *lksb,
265 uint32_t flags,
266 void *name,
267 unsigned int namelen,
268 uint32_t parent_lkid,
269 void (*lockast) (void *astarg),
270 void *astarg,
271 void (*bast) (void *astarg, int mode));
272
273/*
274 * dlm_unlock
275 *
276 * Asynchronously release a lock on a resource. The AST routine is called
277 * when the resource is successfully unlocked.
278 *
279 * lockspace: context for the request
280 * lkid: the lock ID as returned in the lksb
281 * flags: input flags (DLM_LKF_)
282 * lksb: if NULL the lksb parameter passed to last lock request is used
283 * astarg: the arg used with the completion ast for the unlock
284 *
285 * Returns:
286 * 0 if request is successfully queued for processing
287 * -EINVAL if any input parameters are invalid
288 * -ENOTEMPTY if the lock still has sublocks
289 * -EBUSY if the lock is waiting for a remote lock operation
290 * -ENOTCONN if there is a communication error
291 */
292
293int dlm_unlock(dlm_lockspace_t *lockspace,
294 uint32_t lkid,
295 uint32_t flags,
296 struct dlm_lksb *lksb,
297 void *astarg);
298
299#endif /* __KERNEL__ */
300
301#endif /* __DLM_DOT_H__ */
302
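A minimal kernel-side sketch of the calls documented above (not part of this patch): join a lockspace, take an EX lock on a named resource, wait for the completion AST, then unlock and wait for the unlock AST. The lockspace name, resource name, and lvblen of 32 are made up, error handling is abbreviated, and a real caller would normally supply a bast as well:

#include <linux/dlm.h>
#include <linux/completion.h>
#include <linux/string.h>

static struct dlm_lksb example_lksb;
static DECLARE_COMPLETION(example_ast_done);

static void example_ast(void *astarg)
{
	/* example_lksb.sb_status now holds the request's result */
	complete(&example_ast_done);
}

static int example_take_and_drop(void)
{
	dlm_lockspace_t *ls;
	char *res = "example-resource";
	int error;

	error = dlm_new_lockspace("example-ls", strlen("example-ls"),
				  &ls, 0, 32);
	if (error)
		return error;

	error = dlm_lock(ls, DLM_LOCK_EX, &example_lksb, DLM_LKF_NOQUEUE,
			 res, strlen(res), 0, example_ast, NULL, NULL);
	if (!error) {
		wait_for_completion(&example_ast_done);
		error = example_lksb.sb_status;	/* -EAGAIN if contended */
	}

	if (!error) {
		error = dlm_unlock(ls, example_lksb.sb_lkid, 0, NULL, NULL);
		if (!error) {
			wait_for_completion(&example_ast_done);
			if (example_lksb.sb_status == -DLM_EUNLOCK)
				error = 0;	/* successful unlock */
		}
	}

	dlm_release_lockspace(ls, 0);
	return error;
}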
diff --git a/include/linux/dlm_device.h b/include/linux/dlm_device.h
new file mode 100644
index 000000000000..f8ba1981aa96
--- /dev/null
+++ b/include/linux/dlm_device.h
@@ -0,0 +1,83 @@
1/******************************************************************************
2*******************************************************************************
3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
6**
7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions
9** of the GNU General Public License v.2.
10**
11*******************************************************************************
12******************************************************************************/
13
14/* This is the device interface for dlm; most users will use a library
15 * interface.
16 */
17
18#define DLM_USER_LVB_LEN 32
19
20/* Version of the device interface */
21#define DLM_DEVICE_VERSION_MAJOR 4
22#define DLM_DEVICE_VERSION_MINOR 0
23#define DLM_DEVICE_VERSION_PATCH 0
24
25/* struct passed to the lock write */
26struct dlm_lock_params {
27 __u8 mode;
28 __u16 flags;
29 __u32 lkid;
30 __u32 parent;
31 __u8 namelen;
32 void __user *castparam;
33 void __user *castaddr;
34 void __user *bastparam;
35 void __user *bastaddr;
36 struct dlm_lksb __user *lksb;
37 char lvb[DLM_USER_LVB_LEN];
38 char name[1];
39};
40
41struct dlm_lspace_params {
42 __u32 flags;
43 __u32 minor;
44 char name[1];
45};
46
47struct dlm_write_request {
48 __u32 version[3];
49 __u8 cmd;
50
51 union {
52 struct dlm_lock_params lock;
53 struct dlm_lspace_params lspace;
54 } i;
55};
56
57/* struct read from the "device" fd,
58 consists mainly of userspace pointers for the library to use */
59struct dlm_lock_result {
60 __u32 length;
61 void __user * user_astaddr;
62 void __user * user_astparam;
63 struct dlm_lksb __user * user_lksb;
64 struct dlm_lksb lksb;
65 __u8 bast_mode;
66 /* Offsets may be zero if no data is present */
67 __u32 lvb_offset;
68};
69
70/* Commands passed to the device */
71#define DLM_USER_LOCK 1
72#define DLM_USER_UNLOCK 2
73#define DLM_USER_QUERY 3
74#define DLM_USER_CREATE_LOCKSPACE 4
75#define DLM_USER_REMOVE_LOCKSPACE 5
76
77/* Arbitrary length restriction */
78#define MAX_LS_NAME_LEN 64
79
80/* Lockspace flags */
81#define DLM_USER_LSFLG_AUTOFREE 1
82#define DLM_USER_LSFLG_FORCEFREE 2
83
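A userspace sketch of driving the device interface directly (not part of this patch): size a dlm_write_request to carry the lockspace name in the name[1] flexible tail, then write it to the control device. The device path is hypothetical and the headers are assumed to be usable from userspace, as a library such as libdlm would arrange; real applications should use the library interface mentioned above:

#include <linux/dlm.h>
#include <linux/dlm_device.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *name = "example-ls";
	/* name[1] is a flexible tail; size the request to hold the name */
	size_t len = sizeof(struct dlm_write_request) + strlen(name);
	struct dlm_write_request *req = calloc(1, len);
	int fd;

	req->version[0] = DLM_DEVICE_VERSION_MAJOR;
	req->version[1] = DLM_DEVICE_VERSION_MINOR;
	req->version[2] = DLM_DEVICE_VERSION_PATCH;
	req->cmd = DLM_USER_CREATE_LOCKSPACE;
	req->i.lspace.flags = DLM_USER_LSFLG_AUTOFREE;
	memcpy(req->i.lspace.name, name, strlen(name));

	fd = open("/dev/dlm-control", O_RDWR);	/* hypothetical path */
	if (fd < 0 || write(fd, req, len) < 0)
		perror("dlm-control");

	free(req);
	return 0;
}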
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3de2bfb2410f..ff56c0bec43c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1329,6 +1329,9 @@ extern struct subsystem fs_subsys;
 #define FLOCK_VERIFY_READ 1
 #define FLOCK_VERIFY_WRITE 2
 
+/* /sys/fs */
+extern struct subsystem fs_subsys;
+
 extern int locks_mandatory_locked(struct inode *);
 extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
 
diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h
new file mode 100644
index 000000000000..3ab40917383f
--- /dev/null
+++ b/include/linux/gfs2_ondisk.h
@@ -0,0 +1,472 @@
1/*
2* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4*
5* This copyrighted material is made available to anyone wishing to use,
6* modify, copy, or redistribute it subject to the terms and conditions
7* of the GNU General Public License v.2.
8*/
9
10#ifndef __GFS2_ONDISK_DOT_H__
11#define __GFS2_ONDISK_DOT_H__
12
13#define GFS2_MAGIC 0x01161970
14#define GFS2_BASIC_BLOCK 512
15#define GFS2_BASIC_BLOCK_SHIFT 9
16
17/* Lock numbers of the LM_TYPE_NONDISK type */
18
19#define GFS2_MOUNT_LOCK 0
20#define GFS2_LIVE_LOCK 1
21#define GFS2_TRANS_LOCK 2
22#define GFS2_RENAME_LOCK 3
23
24/* Format numbers for various metadata types */
25
26#define GFS2_FORMAT_NONE 0
27#define GFS2_FORMAT_SB 100
28#define GFS2_FORMAT_RG 200
29#define GFS2_FORMAT_RB 300
30#define GFS2_FORMAT_DI 400
31#define GFS2_FORMAT_IN 500
32#define GFS2_FORMAT_LF 600
33#define GFS2_FORMAT_JD 700
34#define GFS2_FORMAT_LH 800
35#define GFS2_FORMAT_LD 900
36#define GFS2_FORMAT_LB 1000
37#define GFS2_FORMAT_EA 1600
38#define GFS2_FORMAT_ED 1700
39#define GFS2_FORMAT_UT 1300
40#define GFS2_FORMAT_QC 1400
41/* These are format numbers for entities contained in files */
42#define GFS2_FORMAT_RI 1100
43#define GFS2_FORMAT_DE 1200
44#define GFS2_FORMAT_QU 1500
45/* These are part of the superblock */
46#define GFS2_FORMAT_FS 1801
47#define GFS2_FORMAT_MULTI 1900
48
49/*
50 * An on-disk inode number
51 */
52
53struct gfs2_inum {
54 __be64 no_formal_ino;
55 __be64 no_addr;
56};
57
58static inline int gfs2_inum_equal(const struct gfs2_inum *ino1,
59 const struct gfs2_inum *ino2)
60{
61 return ino1->no_formal_ino == ino2->no_formal_ino &&
62 ino1->no_addr == ino2->no_addr;
63}
64
65/*
66 * Generic metadata head structure
67 * Every inplace buffer logged in the journal must start with this.
68 */
69
70#define GFS2_METATYPE_NONE 0
71#define GFS2_METATYPE_SB 1
72#define GFS2_METATYPE_RG 2
73#define GFS2_METATYPE_RB 3
74#define GFS2_METATYPE_DI 4
75#define GFS2_METATYPE_IN 5
76#define GFS2_METATYPE_LF 6
77#define GFS2_METATYPE_JD 7
78#define GFS2_METATYPE_LH 8
79#define GFS2_METATYPE_LD 9
80#define GFS2_METATYPE_LB 12
81#define GFS2_METATYPE_EA 10
82#define GFS2_METATYPE_ED 11
83#define GFS2_METATYPE_UT 13
84#define GFS2_METATYPE_QC 14
85
86struct gfs2_meta_header {
87 __be32 mh_magic;
88 __be32 mh_type;
89 __be64 __pad0; /* Was generation number in gfs1 */
90 __be32 mh_format;
91 __be32 __pad1; /* Was incarnation number in gfs1 */
92};
93
94/*
95 * super-block structure
96 *
97 * It's probably good if SIZEOF_SB <= GFS2_BASIC_BLOCK (512 bytes)
98 *
99 * Order is important; we need to be able to read old superblocks to do on-disk
100 * version upgrades.
101 */
102
103/* Address of superblock in GFS2 basic blocks */
104#define GFS2_SB_ADDR 128
105
106/* The lock number for the superblock (must be zero) */
107#define GFS2_SB_LOCK 0
108
109/* Requirement: GFS2_LOCKNAME_LEN % 8 == 0
110 The length includes the terminating zero byte */
111#define GFS2_LOCKNAME_LEN 64
112
113struct gfs2_sb {
114 struct gfs2_meta_header sb_header;
115
116 __be32 sb_fs_format;
117 __be32 sb_multihost_format;
118 __u32 __pad0; /* Was superblock flags in gfs1 */
119
120 __be32 sb_bsize;
121 __be32 sb_bsize_shift;
122 __u32 __pad1; /* Was journal segment size in gfs1 */
123
124 struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */
125 struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */
126 struct gfs2_inum sb_root_dir;
127
128 char sb_lockproto[GFS2_LOCKNAME_LEN];
129 char sb_locktable[GFS2_LOCKNAME_LEN];
130 /* In gfs1, quota and license dinodes followed */
131};
132
133/*
134 * resource index structure
135 */
136
137struct gfs2_rindex {
138 __be64 ri_addr; /* grp block disk address */
139 __be32 ri_length; /* length of rgrp header in fs blocks */
140 __u32 __pad;
141
142 __be64 ri_data0; /* first data location */
143 __be32 ri_data; /* num of data blocks in rgrp */
144
145 __be32 ri_bitbytes; /* number of bytes in data bitmaps */
146
147 __u8 ri_reserved[64];
148};
149
150/*
151 * resource group header structure
152 */
153
154/* Number of blocks per byte in rgrp */
155#define GFS2_NBBY 4
156#define GFS2_BIT_SIZE 2
157#define GFS2_BIT_MASK 0x00000003
158
159#define GFS2_BLKST_FREE 0
160#define GFS2_BLKST_USED 1
161#define GFS2_BLKST_INVALID 2
162#define GFS2_BLKST_DINODE 3
163
164#define GFS2_RGF_JOURNAL 0x00000001
165#define GFS2_RGF_METAONLY 0x00000002
166#define GFS2_RGF_DATAONLY 0x00000004
167#define GFS2_RGF_NOALLOC 0x00000008
168
169struct gfs2_rgrp {
170 struct gfs2_meta_header rg_header;
171
172 __be32 rg_flags;
173 __be32 rg_free;
174 __be32 rg_dinodes;
175
176 __u8 rg_reserved[92]; /* Several fields from gfs1 now reserved */
177};
178
179/*
180 * quota structure
181 */
182
183struct gfs2_quota {
184 __be64 qu_limit;
185 __be64 qu_warn;
186 __be64 qu_value;
187 __u8 qu_reserved[64];
188};
189
190/*
191 * dinode structure
192 */
193
194#define GFS2_MAX_META_HEIGHT 10
195#define GFS2_DIR_MAX_DEPTH 17
196
197#define DT2IF(dt) (((dt) << 12) & S_IFMT)
198#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
199
200enum {
201 gfs2fl_Jdata = 0,
202 gfs2fl_ExHash = 1,
203 gfs2fl_Unused = 2,
204 gfs2fl_EaIndirect = 3,
205 gfs2fl_Directio = 4,
206 gfs2fl_Immutable = 5,
207 gfs2fl_AppendOnly = 6,
208 gfs2fl_NoAtime = 7,
209 gfs2fl_Sync = 8,
210 gfs2fl_System = 9,
211 gfs2fl_TruncInProg = 29,
212 gfs2fl_InheritDirectio = 30,
213 gfs2fl_InheritJdata = 31,
214};
215
216/* Dinode flags */
217#define GFS2_DIF_JDATA 0x00000001
218#define GFS2_DIF_EXHASH 0x00000002
219#define GFS2_DIF_UNUSED 0x00000004 /* only in gfs1 */
220#define GFS2_DIF_EA_INDIRECT 0x00000008
221#define GFS2_DIF_DIRECTIO 0x00000010
222#define GFS2_DIF_IMMUTABLE 0x00000020
223#define GFS2_DIF_APPENDONLY 0x00000040
224#define GFS2_DIF_NOATIME 0x00000080
225#define GFS2_DIF_SYNC 0x00000100
226#define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */
227#define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */
228#define GFS2_DIF_INHERIT_DIRECTIO 0x40000000
229#define GFS2_DIF_INHERIT_JDATA 0x80000000
230
231struct gfs2_dinode {
232 struct gfs2_meta_header di_header;
233
234 struct gfs2_inum di_num;
235
236 __be32 di_mode; /* mode of file */
237 __be32 di_uid; /* owner's user id */
238 __be32 di_gid; /* owner's group id */
239 __be32 di_nlink; /* number of links to this file */
240 __be64 di_size; /* number of bytes in file */
241 __be64 di_blocks; /* number of blocks in file */
242 __be64 di_atime; /* time last accessed */
243 __be64 di_mtime; /* time last modified */
244 __be64 di_ctime; /* time last changed */
245 __be32 di_major; /* device major number */
246 __be32 di_minor; /* device minor number */
247
248 /* This section varies from gfs1. Padding added to align with
249 * remainder of dinode
250 */
251 __be64 di_goal_meta; /* rgrp to alloc from next */
252 __be64 di_goal_data; /* data block goal */
253 __u32 __pad[2];
254
255 __be32 di_flags; /* GFS2_DIF_... */
256 __be32 di_payload_format; /* GFS2_FORMAT_... */
257 __u16 __pad1; /* Was ditype in gfs1 */
258 __be16 di_height; /* height of metadata */
259 __u32 __pad2; /* Unused incarnation number from gfs1 */
260
261 /* These only apply to directories */
262 __u16 __pad3; /* Padding */
263 __be16 di_depth; /* Number of bits in the table */
264 __be32 di_entries; /* The number of entries in the directory */
265
266 struct gfs2_inum __pad4; /* Unused even in current gfs1 */
267
268 __be64 di_eattr; /* extended attribute block number */
269
270 __u8 di_reserved[56];
271};
272
273/*
274 * directory structure - many of these per directory file
275 */
276
277#define GFS2_FNAMESIZE 255
278#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
279
280struct gfs2_dirent {
281 struct gfs2_inum de_inum;
282 __be32 de_hash;
283 __be16 de_rec_len;
284 __be16 de_name_len;
285 __be16 de_type;
286 __u8 __pad[14];
287};
288
289/*
290 * Header of leaf directory nodes
291 */
292
293struct gfs2_leaf {
294 struct gfs2_meta_header lf_header;
295
296 __be16 lf_depth; /* Depth of leaf */
297 __be16 lf_entries; /* Number of dirents in leaf */
298 __be32 lf_dirent_format; /* Format of the dirents */
299 __be64 lf_next; /* Next leaf, if overflow */
300
301 __u8 lf_reserved[64];
302};
303
304/*
305 * Extended attribute header format
306 */
307
308#define GFS2_EA_MAX_NAME_LEN 255
309#define GFS2_EA_MAX_DATA_LEN 65536
310
311#define GFS2_EATYPE_UNUSED 0
312#define GFS2_EATYPE_USR 1
313#define GFS2_EATYPE_SYS 2
314
315#define GFS2_EATYPE_LAST 2
316#define GFS2_EATYPE_VALID(x) ((x) <= GFS2_EATYPE_LAST)
317
318#define GFS2_EAFLAG_LAST 0x01 /* last ea in block */
319
320struct gfs2_ea_header {
321 __be32 ea_rec_len;
322 __be32 ea_data_len;
323 __u8 ea_name_len; /* name is not NUL-terminated */
324 __u8 ea_type; /* GFS2_EATYPE_... */
325 __u8 ea_flags; /* GFS2_EAFLAG_... */
326 __u8 ea_num_ptrs;
327 __u32 __pad;
328};
329
330/*
331 * Log header structure
332 */
333
334#define GFS2_LOG_HEAD_UNMOUNT 0x00000001 /* log is clean */
335
336struct gfs2_log_header {
337 struct gfs2_meta_header lh_header;
338
339 __be64 lh_sequence; /* Sequence number of this transaction */
340 __be32 lh_flags; /* GFS2_LOG_HEAD_... */
341 __be32 lh_tail; /* Block number of log tail */
342 __be32 lh_blkno;
343 __be32 lh_hash;
344};
345
346/*
347 * Log type descriptor
348 */
349
350#define GFS2_LOG_DESC_METADATA 300
351/* ld_data1 is the number of metadata blocks in the descriptor.
352 ld_data2 is unused. */
353
354#define GFS2_LOG_DESC_REVOKE 301
355/* ld_data1 is the number of revoke blocks in the descriptor.
356 ld_data2 is unused. */
357
358#define GFS2_LOG_DESC_JDATA 302
359/* ld_data1 is the number of data blocks in the descriptor.
360 ld_data2 is unused. */
361
362struct gfs2_log_descriptor {
363 struct gfs2_meta_header ld_header;
364
365 __be32 ld_type; /* GFS2_LOG_DESC_... */
366 __be32 ld_length; /* Number of buffers in this chunk */
367 __be32 ld_data1; /* descriptor-specific field */
368 __be32 ld_data2; /* descriptor-specific field */
369
370 __u8 ld_reserved[32];
371};
372
373/*
374 * Inum Range
375 * Describes a range of formal inode numbers allocated to
376 * one machine for assignment to new inodes.
377 */
378
379#define GFS2_INUM_QUANTUM 1048576
380
381struct gfs2_inum_range {
382 __be64 ir_start;
383 __be64 ir_length;
384};
385
386/*
387 * Statfs change
388 * Describes a change to the pool of free and allocated
389 * blocks.
390 */
391
392struct gfs2_statfs_change {
393 __be64 sc_total;
394 __be64 sc_free;
395 __be64 sc_dinodes;
396};
397
398/*
399 * Unlinked Tag
400 * Describes an allocated inode that isn't linked into
401 * the directory tree and might need to be deallocated.
402 */
403
404#define GFS2_UTF_UNINIT 0x00000001
405
406struct gfs2_unlinked_tag {
407 struct gfs2_inum ut_inum;
408 __be32 ut_flags; /* GFS2_UTF_... */
409 __u32 __pad;
410};
411
412/*
413 * Quota change
414 * Describes an allocation change for a particular
415 * user or group.
416 */
417
418#define GFS2_QCF_USER 0x00000001
419
420struct gfs2_quota_change {
421 __be64 qc_change;
422 __be32 qc_flags; /* GFS2_QCF_... */
423 __be32 qc_id;
424};
425
426#ifdef __KERNEL__
427/* Translation functions */
428
429extern void gfs2_inum_in(struct gfs2_inum *no, char *buf);
430extern void gfs2_inum_out(const struct gfs2_inum *no, char *buf);
431extern void gfs2_sb_in(struct gfs2_sb *sb, char *buf);
432extern void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf);
433extern void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf);
434extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf);
435extern void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf);
436extern void gfs2_quota_in(struct gfs2_quota *qu, char *buf);
437extern void gfs2_quota_out(struct gfs2_quota *qu, char *buf);
438extern void gfs2_dinode_in(struct gfs2_dinode *di, char *buf);
439extern void gfs2_dinode_out(struct gfs2_dinode *di, char *buf);
440extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf);
441extern void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf);
442extern void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf);
443extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf);
444extern void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf);
445extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf);
446extern void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf);
447extern void gfs2_unlinked_tag_in(struct gfs2_unlinked_tag *ut, char *buf);
448extern void gfs2_unlinked_tag_out(struct gfs2_unlinked_tag *ut, char *buf);
449extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf);
450
451/* Printing functions */
452
453extern void gfs2_inum_print(struct gfs2_inum *no);
454extern void gfs2_meta_header_print(struct gfs2_meta_header *mh);
455extern void gfs2_sb_print(struct gfs2_sb *sb);
456extern void gfs2_rindex_print(struct gfs2_rindex *ri);
457extern void gfs2_rgrp_print(struct gfs2_rgrp *rg);
458extern void gfs2_quota_print(struct gfs2_quota *qu);
459extern void gfs2_dinode_print(struct gfs2_dinode *di);
460extern void gfs2_dirent_print(struct gfs2_dirent *de, char *name);
461extern void gfs2_leaf_print(struct gfs2_leaf *lf);
462extern void gfs2_ea_header_print(struct gfs2_ea_header *ea, char *name);
463extern void gfs2_log_header_print(struct gfs2_log_header *lh);
464extern void gfs2_log_descriptor_print(struct gfs2_log_descriptor *ld);
465extern void gfs2_inum_range_print(struct gfs2_inum_range *ir);
466extern void gfs2_statfs_change_print(struct gfs2_statfs_change *sc);
467extern void gfs2_unlinked_tag_print(struct gfs2_unlinked_tag *ut);
468extern void gfs2_quota_change_print(struct gfs2_quota_change *qc);
469
470#endif /* __KERNEL__ */
471
472#endif /* __GFS2_ONDISK_DOT_H__ */
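The GFS2_DIRENT_SIZE macro above rounds the fixed 40-byte dirent header plus the name up to an 8-byte boundary, which is the minimum value de_rec_len may take. A standalone sketch (not part of this patch) with a mirror struct whose field sizes are copied from struct gfs2_dirent:

#include <stdint.h>
#include <stdio.h>

struct ex_dirent {			/* field sizes mirror struct gfs2_dirent */
	uint64_t de_inum[2];
	uint32_t de_hash;
	uint16_t de_rec_len;
	uint16_t de_name_len;
	uint16_t de_type;
	uint8_t  de_pad[14];
};

#define EX_DIRENT_SIZE(name_len) \
	((sizeof(struct ex_dirent) + (name_len) + 7) & ~7)

int main(void)
{
	/* 40-byte header + 1-byte name, padded up: 48 bytes */
	printf("1-byte name: %zu bytes\n", EX_DIRENT_SIZE(1));
	/* 40 + 8 = 48, already aligned: still 48 bytes */
	printf("8-byte name: %zu bytes\n", EX_DIRENT_SIZE(8));
	return 0;
}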
diff --git a/include/linux/iflags.h b/include/linux/iflags.h
new file mode 100644
index 000000000000..1b4d9ef5d62b
--- /dev/null
+++ b/include/linux/iflags.h
@@ -0,0 +1,104 @@
1#ifndef _LINUX_IFLAGS_H
2#define _LINUX_IFLAGS_H
3
4/*
5 * A universal set of inode flags.
6 *
7 * Originally taken from ext2/3 with additions for other filesystems.
8 * Filesystems supporting this interface should interoperate with
9 * the lsattr and chattr command line tools.
10 *
11 * This interface is supported in whole or in part by:
12 * ext2
13 * ext3
14 * xfs
15 * jfs
16 * gfs2
17 *
18 */
19
20#define IFLAGS_GET_IOC _IOR('f', 1, long)
21#define IFLAGS_SET_IOC _IOW('f', 2, long)
22
23/*
24 * These values are indices into the 32-entry conversion tables
25 * used with the iflags_cvt function below
26 */
27enum {
28 iflag_SecureRm = 0, /* Secure deletion */
29 iflag_Unrm = 1, /* Undelete */
30 iflag_Compress = 2, /* Compress file */
31 iflag_Sync = 3, /* Synchronous updates */
32 iflag_Immutable = 4, /* Immutable */
33 iflag_Append = 5, /* Append */
34 iflag_NoDump = 6, /* Don't dump file */
35 iflag_NoAtime = 7, /* No atime updates */
36 /* Reserved for compression usage */
37 iflag_Dirty = 8,
38 iflag_ComprBlk = 9, /* One or more compressed clusters */
39 iflag_NoComp = 10, /* Don't compress */
40 iflag_Ecompr = 11, /* Compression error */
41 /* End of compression flags */
42 iflag_Btree = 12, /* btree format dir */
43 iflag_Index = 12, /* hash-indexed directory */
44 iflag_Imagic = 13, /* AFS directory */
45 iflag_JournalData = 14, /* file data should be journaled */
46 iflag_NoTail = 15, /* file tail should not be merged */
47 iflag_DirSync = 16, /* dirsync behaviour */
48 iflag_TopDir = 17, /* Top of directory hierarchies */
49 iflag_DirectIO = 18, /* Always use direct I/O on this file */
50 iflag_InheritDirectIO = 19, /* Set DirectIO on new files in dir */
51 iflag_InheritJdata = 20, /* Set JournalData on create in dir */
52 iflag_Reserved = 31 /* reserved for ext2/3 lib */
53};
54
55#define __IFL(x) (1<<(iflag_##x))
56#define IFLAG_SECRM __IFL(SecureRm) /* 0x00000001 */
57#define IFLAG_UNRM __IFL(Unrm) /* 0x00000002 */
58#define IFLAG_COMPR __IFL(Compress) /* 0x00000004 */
59#define IFLAG_SYNC __IFL(Sync) /* 0x00000008 */
60#define IFLAG_IMMUTABLE __IFL(Immutable) /* 0x00000010 */
61#define IFLAG_APPEND __IFL(Append) /* 0x00000020 */
62#define IFLAG_NODUMP __IFL(NoDump) /* 0x00000040 */
63#define IFLAG_NOATIME __IFL(NoAtime) /* 0x00000080 */
64#define IFLAG_DIRTY __IFL(Dirty) /* 0x00000100 */
65#define IFLAG_COMPRBLK __IFL(ComprBlk) /* 0x00000200 */
66#define IFLAG_NOCOMP __IFL(NoComp) /* 0x00000400 */
67#define IFLAG_ECOMPR __IFL(Ecompr) /* 0x00000800 */
68#define IFLAG_BTREE __IFL(Btree) /* 0x00001000 */
69#define IFLAG_INDEX __IFL(Index) /* 0x00001000 */
70#define IFLAG_IMAGIC __IFL(Imagic) /* 0x00002000 */
71#define IFLAG_JOURNAL_DATA __IFL(JournalData) /* 0x00004000 */
72#define IFLAG_NOTAIL __IFL(NoTail) /* 0x00008000 */
73#define IFLAG_DIRSYNC __IFL(DirSync) /* 0x00010000 */
74#define IFLAG_TOPDIR __IFL(TopDir) /* 0x00020000 */
75#define IFLAG_DIRECTIO __IFL(DirectIO) /* 0x00040000 */
76#define IFLAG_INHERITDIRECTIO __IFL(InheritDirectIO) /* 0x00080000 */
77#define IFLAG_INHERITJDATA __IFL(InheritJdata) /* 0x00100000 */
78#define IFLAG_RESERVED __IFL(Reserved) /* 0x80000000 */
79
80#ifdef __KERNEL__
81/**
82 * iflags_cvt
83 * @table: A table of 32 u32 flags
84 * @val: a 32 bit value to convert
85 *
86 * This function can be used to convert between IFLAGS values and
87 * the filesystem's own flags values.
88 *
89 * Returns: the converted flags
90 */
91static inline u32 iflags_cvt(const u32 *table, u32 val)
92{
93 u32 res = 0;
94 while(val) {
95 if (val & 1)
96 res |= *table;
97 table++;
98 val >>= 1;
99 }
100 return res;
101}
102#endif /* __KERNEL__ */
103
104#endif /* _LINUX_IFLAGS_H */
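A sketch of the intended use of iflags_cvt() (not part of this patch): the table is indexed by the filesystem's own bit positions and holds the corresponding IFLAG_ values. The gfs2 mapping below is illustrative, assembled from the gfs2fl_ enum in gfs2_ondisk.h, and assumes both headers are included:

#include <linux/gfs2_ondisk.h>
#include <linux/iflags.h>

static const u32 gfs2_to_iflags[32] = {
	[gfs2fl_Jdata]           = IFLAG_JOURNAL_DATA,
	[gfs2fl_Directio]        = IFLAG_DIRECTIO,
	[gfs2fl_Immutable]       = IFLAG_IMMUTABLE,
	[gfs2fl_AppendOnly]      = IFLAG_APPEND,
	[gfs2fl_NoAtime]         = IFLAG_NOATIME,
	[gfs2fl_Sync]            = IFLAG_SYNC,
	[gfs2fl_InheritDirectio] = IFLAG_INHERITDIRECTIO,
	[gfs2fl_InheritJdata]    = IFLAG_INHERITJDATA,
};

/* e.g.: u32 user_flags = iflags_cvt(gfs2_to_iflags, di_flags); */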
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e1bd0842f6a1..2ae50277f581 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -29,6 +29,7 @@ extern const char linux_banner[];
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 
 #define KERN_EMERG "<0>" /* system is unusable */
 #define KERN_ALERT "<1>" /* action must be taken immediately */
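A quick illustration of the new helper (not from this patch), matching its use in gfs2_unlinked_init() above to size the unlinked bitmap, where plain integer division would round down to zero chunks:

/*
 * With 9000 slots and 8 * PAGE_SIZE == 32768 bits per chunk (4K pages):
 *
 *	DIV_ROUND_UP(9000, 32768) == 1	chunk
 *	9000 / 32768              == 0	-- would allocate no bitmap
 */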
diff --git a/include/linux/lock_dlm_plock.h b/include/linux/lock_dlm_plock.h
new file mode 100644
index 000000000000..007b07a178ab
--- /dev/null
+++ b/include/linux/lock_dlm_plock.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
3 *
4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License v.2.
7 */
8
9#ifndef __LOCK_DLM_PLOCK_DOT_H__
10#define __LOCK_DLM_PLOCK_DOT_H__
11
12#define GDLM_PLOCK_MISC_NAME "lock_dlm_plock"
13
14#define GDLM_PLOCK_VERSION_MAJOR 1
15#define GDLM_PLOCK_VERSION_MINOR 0
16#define GDLM_PLOCK_VERSION_PATCH 0
17
18enum {
19 GDLM_PLOCK_OP_LOCK = 1,
20 GDLM_PLOCK_OP_UNLOCK,
21 GDLM_PLOCK_OP_GET,
22};
23
24struct gdlm_plock_info {
25 __u32 version[3];
26 __u8 optype;
27 __u8 ex;
28 __u8 wait;
29 __u8 pad;
30 __u32 pid;
31 __s32 nodeid;
32 __s32 rv;
33 __u32 fsid;
34 __u64 number;
35 __u64 start;
36 __u64 end;
37};
38
39#endif
40
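A sketch of how the lock_dlm plock code might fill this structure for a blocking write lock before handing it to the userspace daemon (not part of this patch); field usage is inferred from the names, and the misc-device transport code is omitted:

#include <linux/lock_dlm_plock.h>
#include <linux/string.h>

static void example_fill(struct gdlm_plock_info *info, __u32 fsid,
			 __u64 number, __u64 start, __u64 end, __u32 pid)
{
	memset(info, 0, sizeof(*info));
	info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
	info->version[1] = GDLM_PLOCK_VERSION_MINOR;
	info->version[2] = GDLM_PLOCK_VERSION_PATCH;
	info->optype = GDLM_PLOCK_OP_LOCK;
	info->ex = 1;			/* exclusive (write) lock */
	info->wait = 1;			/* block until granted */
	info->fsid = fsid;		/* which filesystem */
	info->number = number;		/* inode number being locked */
	info->start = start;		/* byte range, inclusive */
	info->end = end;
	info->pid = pid;		/* owner, for F_GETLK reporting */
}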
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..a33f342b31b7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -997,6 +997,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 	tty->driver->write(tty, msg, strlen(msg));
 	return;
 }
+EXPORT_SYMBOL_GPL(tty_write_message);
 
 /*
  * printk rate limiting, lifted from the networking subsystem.
diff --git a/mm/filemap.c b/mm/filemap.c
index 3ef20739e725..1120338a5d0f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1012,6 +1012,7 @@ success:
 	desc->arg.buf += size;
 	return size;
 }
+EXPORT_SYMBOL(file_read_actor);
 
 /*
  * This is the "read()" routine for all filesystems
diff --git a/mm/readahead.c b/mm/readahead.c
index 0f142a40984b..ba7db816f4c8 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
 	ra->prev_page = -1;
 }
+EXPORT_SYMBOL_GPL(file_ra_state_init);
 
 /*
  * Return max readahead size for this inode in number-of-pages.