Diffstat (limited to 'fs/jfs')
-rw-r--r--  fs/jfs/Makefile          |   15
-rw-r--r--  fs/jfs/acl.c             |  234
-rw-r--r--  fs/jfs/endian24.h        |   49
-rw-r--r--  fs/jfs/file.c            |  119
-rw-r--r--  fs/jfs/inode.c           |  384
-rw-r--r--  fs/jfs/jfs_acl.h         |   30
-rw-r--r--  fs/jfs/jfs_btree.h       |  172
-rw-r--r--  fs/jfs/jfs_debug.c       |  154
-rw-r--r--  fs/jfs/jfs_debug.h       |  122
-rw-r--r--  fs/jfs/jfs_dinode.h      |  151
-rw-r--r--  fs/jfs/jfs_dmap.c        | 4272
-rw-r--r--  fs/jfs/jfs_dmap.h        |  314
-rw-r--r--  fs/jfs/jfs_dtree.c       | 4752
-rw-r--r--  fs/jfs/jfs_dtree.h       |  279
-rw-r--r--  fs/jfs/jfs_extent.c      |  668
-rw-r--r--  fs/jfs/jfs_extent.h      |   31
-rw-r--r--  fs/jfs/jfs_filsys.h      |  280
-rw-r--r--  fs/jfs/jfs_imap.c        | 3270
-rw-r--r--  fs/jfs/jfs_imap.h        |  175
-rw-r--r--  fs/jfs/jfs_incore.h      |  197
-rw-r--r--  fs/jfs/jfs_inode.c       |  104
-rw-r--r--  fs/jfs/jfs_inode.h       |   23
-rw-r--r--  fs/jfs/jfs_lock.h        |   51
-rw-r--r--  fs/jfs/jfs_logmgr.c      | 2524
-rw-r--r--  fs/jfs/jfs_logmgr.h      |  510
-rw-r--r--  fs/jfs/jfs_metapage.c    |  580
-rw-r--r--  fs/jfs/jfs_metapage.h    |  115
-rw-r--r--  fs/jfs/jfs_mount.c       |  512
-rw-r--r--  fs/jfs/jfs_superblock.h  |  113
-rw-r--r--  fs/jfs/jfs_txnmgr.c      | 3131
-rw-r--r--  fs/jfs/jfs_txnmgr.h      |  318
-rw-r--r--  fs/jfs/jfs_types.h       |  192
-rw-r--r--  fs/jfs/jfs_umount.c      |  178
-rw-r--r--  fs/jfs/jfs_unicode.c     |  137
-rw-r--r--  fs/jfs/jfs_unicode.h     |  155
-rw-r--r--  fs/jfs/jfs_uniupr.c      |  134
-rw-r--r--  fs/jfs/jfs_xattr.h       |   64
-rw-r--r--  fs/jfs/jfs_xtree.c       | 4485
-rw-r--r--  fs/jfs/jfs_xtree.h       |  140
-rw-r--r--  fs/jfs/namei.c           | 1540
-rw-r--r--  fs/jfs/resize.c          |  537
-rw-r--r--  fs/jfs/super.c           |  700
-rw-r--r--  fs/jfs/symlink.c         |   39
-rw-r--r--  fs/jfs/xattr.c           | 1127
44 files changed, 33077 insertions, 0 deletions
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
new file mode 100644
index 000000000000..6f1e0e95587a
--- /dev/null
+++ b/fs/jfs/Makefile
@@ -0,0 +1,15 @@
1#
2# Makefile for the Linux JFS filesystem routines.
3#
4
5obj-$(CONFIG_JFS_FS) += jfs.o
6
7jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
8 jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \
9 jfs_unicode.o jfs_dtree.o jfs_inode.o \
10 jfs_extent.o symlink.o jfs_metapage.o \
11 jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o resize.o xattr.o
12
13jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
14
15EXTRA_CFLAGS += -D_JFS_4K
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
new file mode 100644
index 000000000000..8d2a9ab981d4
--- /dev/null
+++ b/fs/jfs/acl.c
@@ -0,0 +1,234 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2002-2004
3 * Copyright (C) Andreas Gruenbacher, 2001
4 * Copyright (C) Linus Torvalds, 1991, 1992
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <linux/sched.h>
22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include "jfs_incore.h"
25#include "jfs_xattr.h"
26#include "jfs_acl.h"
27
28static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
29{
30 struct posix_acl *acl;
31 char *ea_name;
32 struct jfs_inode_info *ji = JFS_IP(inode);
33 struct posix_acl **p_acl;
34 int size;
35 char *value = NULL;
36
37 switch(type) {
38 case ACL_TYPE_ACCESS:
39 ea_name = XATTR_NAME_ACL_ACCESS;
40 p_acl = &ji->i_acl;
41 break;
42 case ACL_TYPE_DEFAULT:
43 ea_name = XATTR_NAME_ACL_DEFAULT;
44 p_acl = &ji->i_default_acl;
45 break;
46 default:
47 return ERR_PTR(-EINVAL);
48 }
49
50 if (*p_acl != JFS_ACL_NOT_CACHED)
51 return posix_acl_dup(*p_acl);
52
53 size = __jfs_getxattr(inode, ea_name, NULL, 0);
54
55 if (size > 0) {
56 value = kmalloc(size, GFP_KERNEL);
57 if (!value)
58 return ERR_PTR(-ENOMEM);
59 size = __jfs_getxattr(inode, ea_name, value, size);
60 }
61
62 if (size < 0) {
63 if (size == -ENODATA) {
64 *p_acl = NULL;
65 acl = NULL;
66 } else
67 acl = ERR_PTR(size);
68 } else {
69 acl = posix_acl_from_xattr(value, size);
70 if (!IS_ERR(acl))
71 *p_acl = posix_acl_dup(acl);
72 }
73 if (value)
74 kfree(value);
75 return acl;
76}
77
78static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
79{
80 char *ea_name;
81 struct jfs_inode_info *ji = JFS_IP(inode);
82 struct posix_acl **p_acl;
83 int rc;
84 int size = 0;
85 char *value = NULL;
86
87 if (S_ISLNK(inode->i_mode))
88 return -EOPNOTSUPP;
89
90 switch(type) {
91 case ACL_TYPE_ACCESS:
92 ea_name = XATTR_NAME_ACL_ACCESS;
93 p_acl = &ji->i_acl;
94 break;
95 case ACL_TYPE_DEFAULT:
96 ea_name = XATTR_NAME_ACL_DEFAULT;
97 p_acl = &ji->i_default_acl;
98 if (!S_ISDIR(inode->i_mode))
99 return acl ? -EACCES : 0;
100 break;
101 default:
102 return -EINVAL;
103 }
104 if (acl) {
105 size = xattr_acl_size(acl->a_count);
106 value = kmalloc(size, GFP_KERNEL);
107 if (!value)
108 return -ENOMEM;
109 rc = posix_acl_to_xattr(acl, value, size);
110 if (rc < 0)
111 goto out;
112 }
113 rc = __jfs_setxattr(inode, ea_name, value, size, 0);
114out:
115 if (value)
116 kfree(value);
117
118 if (!rc) {
119 if (*p_acl && (*p_acl != JFS_ACL_NOT_CACHED))
120 posix_acl_release(*p_acl);
121 *p_acl = posix_acl_dup(acl);
122 }
123 return rc;
124}
125
126static int jfs_check_acl(struct inode *inode, int mask)
127{
128 struct jfs_inode_info *ji = JFS_IP(inode);
129
130 if (ji->i_acl == JFS_ACL_NOT_CACHED) {
131 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
132 if (IS_ERR(acl))
133 return PTR_ERR(acl);
134 posix_acl_release(acl);
135 }
136
137 if (ji->i_acl)
138 return posix_acl_permission(inode, ji->i_acl, mask);
139 return -EAGAIN;
140}
141
142int jfs_permission(struct inode *inode, int mask, struct nameidata *nd)
143{
144 return generic_permission(inode, mask, jfs_check_acl);
145}
146
147int jfs_init_acl(struct inode *inode, struct inode *dir)
148{
149 struct posix_acl *acl = NULL;
150 struct posix_acl *clone;
151 mode_t mode;
152 int rc = 0;
153
154 if (S_ISLNK(inode->i_mode))
155 return 0;
156
157 acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT);
158 if (IS_ERR(acl))
159 return PTR_ERR(acl);
160
161 if (acl) {
162 if (S_ISDIR(inode->i_mode)) {
163 rc = jfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
164 if (rc)
165 goto cleanup;
166 }
167 clone = posix_acl_clone(acl, GFP_KERNEL);
168 if (!clone) {
169 rc = -ENOMEM;
170 goto cleanup;
171 }
172 mode = inode->i_mode;
173 rc = posix_acl_create_masq(clone, &mode);
174 if (rc >= 0) {
175 inode->i_mode = mode;
176 if (rc > 0)
177 rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
178 }
179 posix_acl_release(clone);
180cleanup:
181 posix_acl_release(acl);
182 } else
183 inode->i_mode &= ~current->fs->umask;
184
185 return rc;
186}
187
188static int jfs_acl_chmod(struct inode *inode)
189{
190 struct posix_acl *acl, *clone;
191 int rc;
192
193 if (S_ISLNK(inode->i_mode))
194 return -EOPNOTSUPP;
195
196 acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
197 if (IS_ERR(acl) || !acl)
198 return PTR_ERR(acl);
199
200 clone = posix_acl_clone(acl, GFP_KERNEL);
201 posix_acl_release(acl);
202 if (!clone)
203 return -ENOMEM;
204
205 rc = posix_acl_chmod_masq(clone, inode->i_mode);
206 if (!rc)
207 rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
208
209 posix_acl_release(clone);
210 return rc;
211}
212
213int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
214{
215 struct inode *inode = dentry->d_inode;
216 int rc;
217
218 rc = inode_change_ok(inode, iattr);
219 if (rc)
220 return rc;
221
222 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
223 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
224 if (DQUOT_TRANSFER(inode, iattr))
225 return -EDQUOT;
226 }
227
228 rc = inode_setattr(inode, iattr);
229
230 if (!rc && (iattr->ia_valid & ATTR_MODE))
231 rc = jfs_acl_chmod(inode);
232
233 return rc;
234}
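
The i_acl / i_default_acl fields above are a small read cache: JFS_ACL_NOT_CACHED marks a slot that has never been filled, while a cached NULL records that the extended attribute was looked up and found absent (-ENODATA). A minimal user-space sketch of that sentinel idiom, with hypothetical names, not JFS code:

    #include <stdio.h>

    /* Sketch of the sentinel-pointer cache idiom used by jfs_get_acl().
     * A cache slot can be in one of three states:
     *   NOT_CACHED    - nothing cached yet, must go to backing store
     *   NULL          - backing store was consulted and had no entry
     *   valid pointer - cached object
     * Names here are hypothetical; this is not JFS code. */
    #define NOT_CACHED ((void *)-1)

    struct cache_slot {
        void *obj;
    };

    static void *load_from_store(void)
    {
        return NULL;    /* pretend the EA does not exist (-ENODATA) */
    }

    static void *get_cached(struct cache_slot *slot)
    {
        if (slot->obj != NOT_CACHED)
            return slot->obj;           /* hit: may legitimately be NULL */
        slot->obj = load_from_store();  /* miss: fill and remember */
        return slot->obj;
    }

    int main(void)
    {
        struct cache_slot slot = { .obj = NOT_CACHED };

        get_cached(&slot);  /* first call consults the store */
        get_cached(&slot);  /* second call is answered from cache */
        printf("cached: %p\n", slot.obj);
        return 0;
    }

The sentinel is what lets the cache distinguish "never asked" from "asked, and there is no ACL", so only the first lookup pays for the xattr read.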
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
new file mode 100644
index 000000000000..ab7cd0567c95
--- /dev/null
+++ b/fs/jfs/endian24.h
@@ -0,0 +1,49 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_ENDIAN24
19#define _H_ENDIAN24
20
21/*
22 * endian24.h:
23 *
24 * Endian conversion for 24-bit data
25 *
26 */
27#define __swab24(x) \
28({ \
29 __u32 __x = (x); \
30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34})
35
36#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
37 #define __cpu_to_le24(x) ((__u32)(x))
38 #define __le24_to_cpu(x) ((__u32)(x))
39#else
40 #define __cpu_to_le24(x) __swab24(x)
41 #define __le24_to_cpu(x) __swab24(x)
42#endif
43
44#ifdef __KERNEL__
45 #define cpu_to_le24 __cpu_to_le24
46 #define le24_to_cpu __le24_to_cpu
47#endif
48
49#endif /* !_H_ENDIAN24 */
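
On-disk 24-bit fields are kept little-endian; __swab24() reverses the three low-order bytes of the 32-bit container they are widened into. A quick user-space check of the shuffle, with the values worked out from the macro above:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* User-space copy of the __swab24() logic above: reverse the order
     * of the three low-order bytes of a 32-bit container. */
    static uint32_t swab24(uint32_t x)
    {
        return ((x & 0x000000ffu) << 16) |
                (x & 0x0000ff00u)        |
               ((x & 0x00ff0000u) >> 16);
    }

    int main(void)
    {
        /* 0x123456 stored little-endian on disk reads back as 0x563412
         * on a big-endian CPU, and vice versa. */
        assert(swab24(0x123456) == 0x563412);
        assert(swab24(swab24(0xabcdef)) == 0xabcdef); /* involution */
        printf("swab24 ok\n");
        return 0;
    }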
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
new file mode 100644
index 000000000000..a87b06fa8ff8
--- /dev/null
+++ b/fs/jfs/file.c
@@ -0,0 +1,119 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 * Portions Copyright (c) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/fs.h>
21#include "jfs_incore.h"
22#include "jfs_dmap.h"
23#include "jfs_txnmgr.h"
24#include "jfs_xattr.h"
25#include "jfs_acl.h"
26#include "jfs_debug.h"
27
28
29extern int jfs_commit_inode(struct inode *, int);
30extern void jfs_truncate(struct inode *);
31
32int jfs_fsync(struct file *file, struct dentry *dentry, int datasync)
33{
34 struct inode *inode = dentry->d_inode;
35 int rc = 0;
36
37 if (!(inode->i_state & I_DIRTY) ||
38 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
39 /* Make sure committed changes hit the disk */
40 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
41 return rc;
42 }
43
44 rc |= jfs_commit_inode(inode, 1);
45
46 return rc ? -EIO : 0;
47}
48
49static int jfs_open(struct inode *inode, struct file *file)
50{
51 int rc;
52
53 if ((rc = generic_file_open(inode, file)))
54 return rc;
55
56 /*
57 * We attempt to allow only one "active" file open per allocation
58 * group (AG). Otherwise, appending to files in parallel can cause
59 * fragmentation within the files.
60 *
61 * If the file is empty, it was probably just created and going
62 * to be written to. If it has a size, we'll hold off until the
63 * file is actually grown.
64 */
65 if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE &&
66 (inode->i_size == 0)) {
67 struct jfs_inode_info *ji = JFS_IP(inode);
68 spin_lock_irq(&ji->ag_lock);
69 if (ji->active_ag == -1) {
70 ji->active_ag = ji->agno;
71 atomic_inc(
72 &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]);
73 }
74 spin_unlock_irq(&ji->ag_lock);
75 }
76
77 return 0;
78}
79static int jfs_release(struct inode *inode, struct file *file)
80{
81 struct jfs_inode_info *ji = JFS_IP(inode);
82
83 spin_lock_irq(&ji->ag_lock);
84 if (ji->active_ag != -1) {
85 struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
86 atomic_dec(&bmap->db_active[ji->active_ag]);
87 ji->active_ag = -1;
88 }
89 spin_unlock_irq(&ji->ag_lock);
90
91 return 0;
92}
93
94struct inode_operations jfs_file_inode_operations = {
95 .truncate = jfs_truncate,
96 .setxattr = jfs_setxattr,
97 .getxattr = jfs_getxattr,
98 .listxattr = jfs_listxattr,
99 .removexattr = jfs_removexattr,
100#ifdef CONFIG_JFS_POSIX_ACL
101 .setattr = jfs_setattr,
102 .permission = jfs_permission,
103#endif
104};
105
106struct file_operations jfs_file_operations = {
107 .open = jfs_open,
108 .llseek = generic_file_llseek,
109 .write = generic_file_write,
110 .read = generic_file_read,
111 .aio_read = generic_file_aio_read,
112 .aio_write = generic_file_aio_write,
113 .mmap = generic_file_mmap,
114 .readv = generic_file_readv,
115 .writev = generic_file_writev,
116 .sendfile = generic_file_sendfile,
117 .fsync = jfs_fsync,
118 .release = jfs_release,
119};
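
jfs_open() and jfs_release() bracket the "active allocation group" accounting: the first writable open of an empty regular file claims the inode's AG and bumps the bmap's db_active counter for it, and release drops the claim. A simplified user-space model of that pairing; a pthread mutex stands in for the per-inode ag_lock spinlock, and the names are illustrative:

    #include <pthread.h>
    #include <stdio.h>

    /* Simplified model of the active_ag accounting in jfs_open()/
     * jfs_release(): a per-file "claimed AG" plus a per-AG open
     * counter. In the kernel the lock is per-inode, not global. */
    #define NAG 4

    static int db_active[NAG];          /* per-AG active opens */
    static pthread_mutex_t ag_lock = PTHREAD_MUTEX_INITIALIZER;

    struct file_state {
        int agno;       /* AG the file's data lives in */
        int active_ag;  /* AG claimed at open, or -1 */
    };

    static void model_open(struct file_state *f)
    {
        pthread_mutex_lock(&ag_lock);
        if (f->active_ag == -1) {       /* claim once per open cycle */
            f->active_ag = f->agno;
            db_active[f->agno]++;
        }
        pthread_mutex_unlock(&ag_lock);
    }

    static void model_release(struct file_state *f)
    {
        pthread_mutex_lock(&ag_lock);
        if (f->active_ag != -1) {
            db_active[f->active_ag]--;
            f->active_ag = -1;
        }
        pthread_mutex_unlock(&ag_lock);
    }

    int main(void)
    {
        struct file_state f = { .agno = 2, .active_ag = -1 };

        model_open(&f);
        printf("db_active[2] = %d\n", db_active[2]);    /* 1 */
        model_release(&f);
        printf("db_active[2] = %d\n", db_active[2]);    /* 0 */
        return 0;
    }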
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
new file mode 100644
index 000000000000..7bc906677b0d
--- /dev/null
+++ b/fs/jfs/inode.c
@@ -0,0 +1,384 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/fs.h>
21#include <linux/mpage.h>
22#include <linux/buffer_head.h>
23#include <linux/pagemap.h>
24#include <linux/quotaops.h>
25#include "jfs_incore.h"
26#include "jfs_filsys.h"
27#include "jfs_imap.h"
28#include "jfs_extent.h"
29#include "jfs_unicode.h"
30#include "jfs_debug.h"
31
32
33extern struct inode_operations jfs_dir_inode_operations;
34extern struct inode_operations jfs_file_inode_operations;
35extern struct inode_operations jfs_symlink_inode_operations;
36extern struct file_operations jfs_dir_operations;
37extern struct file_operations jfs_file_operations;
38struct address_space_operations jfs_aops;
39extern int freeZeroLink(struct inode *);
40
41void jfs_read_inode(struct inode *inode)
42{
43 if (diRead(inode)) {
44 make_bad_inode(inode);
45 return;
46 }
47
48 if (S_ISREG(inode->i_mode)) {
49 inode->i_op = &jfs_file_inode_operations;
50 inode->i_fop = &jfs_file_operations;
51 inode->i_mapping->a_ops = &jfs_aops;
52 } else if (S_ISDIR(inode->i_mode)) {
53 inode->i_op = &jfs_dir_inode_operations;
54 inode->i_fop = &jfs_dir_operations;
55 } else if (S_ISLNK(inode->i_mode)) {
56 if (inode->i_size >= IDATASIZE) {
57 inode->i_op = &page_symlink_inode_operations;
58 inode->i_mapping->a_ops = &jfs_aops;
59 } else
60 inode->i_op = &jfs_symlink_inode_operations;
61 } else {
62 inode->i_op = &jfs_file_inode_operations;
63 init_special_inode(inode, inode->i_mode, inode->i_rdev);
64 }
65}
66
67/*
68 * Workhorse of both fsync & write_inode
69 */
70int jfs_commit_inode(struct inode *inode, int wait)
71{
72 int rc = 0;
73 tid_t tid;
74 static int noisy = 5;
75
76 jfs_info("In jfs_commit_inode, inode = 0x%p", inode);
77
78 /*
79 * Don't commit if inode has been committed since last being
80 * marked dirty, or if it has been deleted.
81 */
82 if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode))
83 return 0;
84
85 if (isReadOnly(inode)) {
86 /* kernel allows writes to devices on read-only
87 * partitions and may think inode is dirty
88 */
89 if (!special_file(inode->i_mode) && noisy) {
90 jfs_err("jfs_commit_inode(0x%p) called on "
91 "read-only volume", inode);
92 jfs_err("Is remount racy?");
93 noisy--;
94 }
95 return 0;
96 }
97
98 tid = txBegin(inode->i_sb, COMMIT_INODE);
99 down(&JFS_IP(inode)->commit_sem);
100
101 /*
102 * Retest inode state after taking commit_sem
103 */
104 if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode))
105 rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0);
106
107 txEnd(tid);
108 up(&JFS_IP(inode)->commit_sem);
109 return rc;
110}
111
112int jfs_write_inode(struct inode *inode, int wait)
113{
114 if (test_cflag(COMMIT_Nolink, inode))
115 return 0;
116 /*
117 * If COMMIT_DIRTY is not set, the inode isn't really dirty.
118 * It has been committed since the last change, but was still
119 * on the dirty inode list.
120 */
121 if (!test_cflag(COMMIT_Dirty, inode)) {
122 /* Make sure committed changes hit the disk */
123 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait);
124 return 0;
125 }
126
127 if (jfs_commit_inode(inode, wait)) {
128 jfs_err("jfs_write_inode: jfs_commit_inode failed!");
129 return -EIO;
130 } else
131 return 0;
132}
133
134void jfs_delete_inode(struct inode *inode)
135{
136 jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
137
138 if (test_cflag(COMMIT_Freewmap, inode))
139 freeZeroLink(inode);
140
141 diFree(inode);
142
143 /*
144 * Free the inode from the quota allocation.
145 */
146 DQUOT_INIT(inode);
147 DQUOT_FREE_INODE(inode);
148 DQUOT_DROP(inode);
149
150 clear_inode(inode);
151}
152
153void jfs_dirty_inode(struct inode *inode)
154{
155 static int noisy = 5;
156
157 if (isReadOnly(inode)) {
158 if (!special_file(inode->i_mode) && noisy) {
159 /* kernel allows writes to devices on read-only
160 * partitions and may try to mark inode dirty
161 */
162 jfs_err("jfs_dirty_inode called on read-only volume");
163 jfs_err("Is remount racy?");
164 noisy--;
165 }
166 return;
167 }
168
169 set_cflag(COMMIT_Dirty, inode);
170}
171
172static int
173jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks,
174 struct buffer_head *bh_result, int create)
175{
176 s64 lblock64 = lblock;
177 int rc = 0;
178 int take_locks;
179 xad_t xad;
180 s64 xaddr;
181 int xflag;
182 s32 xlen;
183
184 /*
185 * If this is a special inode (imap, dmap)
186 * the lock should already be taken
187 */
188 take_locks = (JFS_IP(ip)->fileset != AGGREGATE_I);
189
190 /*
191 * Take appropriate lock on inode
192 */
193 if (take_locks) {
194 if (create)
195 IWRITE_LOCK(ip);
196 else
197 IREAD_LOCK(ip);
198 }
199
200 if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
201 (xtLookup(ip, lblock64, max_blocks, &xflag, &xaddr, &xlen, 0)
202 == 0) && xlen) {
203 if (xflag & XAD_NOTRECORDED) {
204 if (!create)
205 /*
206 * Allocated but not recorded, read treats
207 * this as a hole
208 */
209 goto unlock;
210#ifdef _JFS_4K
211 XADoffset(&xad, lblock64);
212 XADlength(&xad, xlen);
213 XADaddress(&xad, xaddr);
214#else /* _JFS_4K */
215 /*
216 * As long as block size = 4K, this isn't a problem.
217 * We should mark the whole page not ABNR, but how
218 * will we know to mark the other blocks BH_New?
219 */
220 BUG();
221#endif /* _JFS_4K */
222 rc = extRecord(ip, &xad);
223 if (rc)
224 goto unlock;
225 set_buffer_new(bh_result);
226 }
227
228 map_bh(bh_result, ip->i_sb, xaddr);
229 bh_result->b_size = xlen << ip->i_blkbits;
230 goto unlock;
231 }
232 if (!create)
233 goto unlock;
234
235 /*
236 * Allocate a new block
237 */
238#ifdef _JFS_4K
239 if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
240 goto unlock;
241 rc = extAlloc(ip, max_blocks, lblock64, &xad, FALSE);
242 if (rc)
243 goto unlock;
244
245 set_buffer_new(bh_result);
246 map_bh(bh_result, ip->i_sb, addressXAD(&xad));
247 bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits;
248
249#else /* _JFS_4K */
250 /*
251 * We need to do whatever it takes to keep all but the last buffers
252 * in 4K pages - see jfs_write.c
253 */
254 BUG();
255#endif /* _JFS_4K */
256
257 unlock:
258 /*
259 * Release lock on inode
260 */
261 if (take_locks) {
262 if (create)
263 IWRITE_UNLOCK(ip);
264 else
265 IREAD_UNLOCK(ip);
266 }
267 return rc;
268}
269
270static int jfs_get_block(struct inode *ip, sector_t lblock,
271 struct buffer_head *bh_result, int create)
272{
273 return jfs_get_blocks(ip, lblock, 1, bh_result, create);
274}
275
276static int jfs_writepage(struct page *page, struct writeback_control *wbc)
277{
278 return nobh_writepage(page, jfs_get_block, wbc);
279}
280
281static int jfs_writepages(struct address_space *mapping,
282 struct writeback_control *wbc)
283{
284 return mpage_writepages(mapping, wbc, jfs_get_block);
285}
286
287static int jfs_readpage(struct file *file, struct page *page)
288{
289 return mpage_readpage(page, jfs_get_block);
290}
291
292static int jfs_readpages(struct file *file, struct address_space *mapping,
293 struct list_head *pages, unsigned nr_pages)
294{
295 return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
296}
297
298static int jfs_prepare_write(struct file *file,
299 struct page *page, unsigned from, unsigned to)
300{
301 return nobh_prepare_write(page, from, to, jfs_get_block);
302}
303
304static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
305{
306 return generic_block_bmap(mapping, block, jfs_get_block);
307}
308
309static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
310 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
311{
312 struct file *file = iocb->ki_filp;
313 struct inode *inode = file->f_mapping->host;
314
315 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
316 offset, nr_segs, jfs_get_blocks, NULL);
317}
318
319struct address_space_operations jfs_aops = {
320 .readpage = jfs_readpage,
321 .readpages = jfs_readpages,
322 .writepage = jfs_writepage,
323 .writepages = jfs_writepages,
324 .sync_page = block_sync_page,
325 .prepare_write = jfs_prepare_write,
326 .commit_write = nobh_commit_write,
327 .bmap = jfs_bmap,
328 .direct_IO = jfs_direct_IO,
329};
330
331/*
332 * Guts of jfs_truncate. Called with locks already held. Can be called
333 * with directory for truncating directory index table.
334 */
335void jfs_truncate_nolock(struct inode *ip, loff_t length)
336{
337 loff_t newsize;
338 tid_t tid;
339
340 ASSERT(length >= 0);
341
342 if (test_cflag(COMMIT_Nolink, ip)) {
343 xtTruncate(0, ip, length, COMMIT_WMAP);
344 return;
345 }
346
347 do {
348 tid = txBegin(ip->i_sb, 0);
349
350 /*
351 * The commit_sem cannot be taken before txBegin.
352 * txBegin may block and there is a chance the inode
353 * could be marked dirty and need to be committed
354 * before txBegin unblocks
355 */
356 down(&JFS_IP(ip)->commit_sem);
357
358 newsize = xtTruncate(tid, ip, length,
359 COMMIT_TRUNCATE | COMMIT_PWMAP);
360 if (newsize < 0) {
361 txEnd(tid);
362 up(&JFS_IP(ip)->commit_sem);
363 break;
364 }
365
366 ip->i_mtime = ip->i_ctime = CURRENT_TIME;
367 mark_inode_dirty(ip);
368
369 txCommit(tid, 1, &ip, 0);
370 txEnd(tid);
371 up(&JFS_IP(ip)->commit_sem);
372 } while (newsize > length); /* Truncate isn't always atomic */
373}
374
375void jfs_truncate(struct inode *ip)
376{
377 jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size);
378
379 nobh_truncate_page(ip->i_mapping, ip->i_size);
380
381 IWRITE_LOCK(ip);
382 jfs_truncate_nolock(ip, ip->i_size);
383 IWRITE_UNLOCK(ip);
384}
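
In jfs_get_blocks(), the branch between mapping an existing extent and allocating a new one hinges on converting the logical block to a byte offset (lblock << s_blocksize_bits) and comparing it against i_size. A small sketch of that arithmetic for a 4K block size; the numbers are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    /* Offset arithmetic used by jfs_get_blocks(): a logical block
     * number shifted by s_blocksize_bits gives the byte offset of the
     * block, which is compared against i_size to tell "inside the
     * file" (lookup) from "past EOF" (allocate on create). */
    int main(void)
    {
        unsigned int blkbits = 12;      /* 4K blocks: 1 << 12 */
        uint64_t i_size = 10000;        /* 10000-byte file */
        uint64_t lblock;

        for (lblock = 0; lblock < 4; lblock++) {
            uint64_t off = lblock << blkbits;
            printf("lblock %llu -> offset %llu: %s\n",
                   (unsigned long long)lblock,
                   (unsigned long long)off,
                   off < i_size ? "lookup existing extent"
                                : "hole/EOF: allocate if create");
        }
        return 0;   /* blocks 0-2 are inside the file, block 3 is not */
    }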
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
new file mode 100644
index 000000000000..d2ae430adecf
--- /dev/null
+++ b/fs/jfs/jfs_acl.h
@@ -0,0 +1,30 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_ACL
19#define _H_JFS_ACL
20
21#ifdef CONFIG_JFS_POSIX_ACL
22
23#include <linux/xattr_acl.h>
24
25int jfs_permission(struct inode *, int, struct nameidata *);
26int jfs_init_acl(struct inode *, struct inode *);
27int jfs_setattr(struct dentry *, struct iattr *);
28
29#endif /* CONFIG_JFS_POSIX_ACL */
30#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h
new file mode 100644
index 000000000000..7f3e9ac454ff
--- /dev/null
+++ b/fs/jfs/jfs_btree.h
@@ -0,0 +1,172 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_BTREE
19#define _H_JFS_BTREE
20
21/*
22 * jfs_btree.h: B+-tree
23 *
24 * JFS B+-tree (dtree and xtree) common definitions
25 */
26
27/*
28 * basic btree page - btpage
29 *
30struct btpage {
31 s64 next; right sibling bn
32 s64 prev; left sibling bn
33
34 u8 flag;
35 u8 rsrvd[7]; type specific
36 s64 self; self address
37
38 u8 entry[4064];
39}; */
40
41/* btpage flag */
42#define BT_TYPE 0x07 /* B+-tree index */
43#define BT_ROOT 0x01 /* root page */
44#define BT_LEAF 0x02 /* leaf page */
45#define BT_INTERNAL 0x04 /* internal page */
46#define BT_RIGHTMOST 0x10 /* rightmost page */
47#define BT_LEFTMOST 0x20 /* leftmost page */
48#define BT_SWAPPED 0x80 /* used by fsck for endian swapping */
49
50/* btorder (in inode) */
51#define BT_RANDOM 0x0000
52#define BT_SEQUENTIAL 0x0001
53#define BT_LOOKUP 0x0010
54#define BT_INSERT 0x0020
55#define BT_DELETE 0x0040
56
57/*
58 * btree page buffer cache access
59 */
60#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0)
61
62/* get page from buffer page */
63#define BT_PAGE(IP, MP, TYPE, ROOT)\
64 (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data)
65
66/* get the page buffer and the page for specified block address */
67#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\
68{\
69 if ((BN) == 0)\
70 {\
71 MP = (struct metapage *)&JFS_IP(IP)->bxflag;\
72 P = (TYPE *)&JFS_IP(IP)->ROOT;\
73 RC = 0;\
74 }\
75 else\
76 {\
77 MP = read_metapage((IP), BN, SIZE, 1);\
78 if (MP) {\
79 RC = 0;\
80 P = (MP)->data;\
81 } else {\
82 P = NULL;\
83 jfs_err("bread failed!");\
84 RC = -EIO;\
85 }\
86 }\
87}
88
89#define BT_MARK_DIRTY(MP, IP)\
90{\
91 if (BT_IS_ROOT(MP))\
92 mark_inode_dirty(IP);\
93 else\
94 mark_metapage_dirty(MP);\
95}
96
97/* put the page buffer */
98#define BT_PUTPAGE(MP)\
99{\
100 if (! BT_IS_ROOT(MP)) \
101 release_metapage(MP); \
102}
103
104
105/*
106 * btree traversal stack
107 *
108 * record the path traversed during the search;
109 * top frame record the leaf page/entry selected.
110 */
111struct btframe { /* stack frame */
112 s64 bn; /* 8: */
113 s16 index; /* 2: */
114 s16 lastindex; /* 2: unused */
115 struct metapage *mp; /* 4/8: */
116}; /* (16/24) */
117
118struct btstack {
119 struct btframe *top;
120 int nsplit;
121 struct btframe stack[MAXTREEHEIGHT];
122};
123
124#define BT_CLR(btstack)\
125 (btstack)->top = (btstack)->stack
126
127#define BT_STACK_FULL(btstack)\
128 ( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1]))
129
130#define BT_PUSH(BTSTACK, BN, INDEX)\
131{\
132 assert(!BT_STACK_FULL(BTSTACK));\
133 (BTSTACK)->top->bn = BN;\
134 (BTSTACK)->top->index = INDEX;\
135 ++(BTSTACK)->top;\
136}
137
138#define BT_POP(btstack)\
139 ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top )
140
141#define BT_STACK(btstack)\
142 ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top )
143
144static inline void BT_STACK_DUMP(struct btstack *btstack)
145{
146 int i;
147 printk("btstack dump:\n");
148 for (i = 0; i < MAXTREEHEIGHT; i++)
149 printk(KERN_ERR "bn = %Lx, index = %d\n",
150 (long long)btstack->stack[i].bn,
151 btstack->stack[i].index);
152}
153
154/* retrieve search results */
155#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\
156{\
157 BN = (LEAF)->bn;\
158 MP = (LEAF)->mp;\
159 if (BN)\
160 P = (TYPE *)MP->data;\
161 else\
162 P = (TYPE *)&JFS_IP(IP)->ROOT;\
163 INDEX = (LEAF)->index;\
164}
165
166/* put the page buffer of search */
167#define BT_PUTSEARCH(BTSTACK)\
168{\
169 if (! BT_IS_ROOT((BTSTACK)->top->mp))\
170 release_metapage((BTSTACK)->top->mp);\
171}
172#endif /* _H_JFS_BTREE */
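
The btstack records one frame per level during the descent so that operations such as splits can revisit the parents in reverse order on the way back up. A self-contained sketch of that push-down/pop-up discipline, using simplified frames without metapage handling:

    #include <assert.h>
    #include <stdio.h>

    /* Sketch of the btstack discipline: push one frame per level on
     * the way down, pop frames to revisit parents on the way back up.
     * Mirrors BT_CLR/BT_PUSH/BT_POP above, minus metapage handling. */
    #define MAXTREEHEIGHT 8

    struct btframe {
        long long bn;   /* block number of the page at this level */
        int index;      /* entry followed within that page */
    };

    struct btstack {
        struct btframe *top;
        struct btframe stack[MAXTREEHEIGHT];
    };

    #define BT_CLR(bts)   ((bts)->top = (bts)->stack)
    #define BT_PUSH(bts, BN, IDX) \
        do { (bts)->top->bn = (BN); (bts)->top->index = (IDX); \
             ++(bts)->top; } while (0)
    #define BT_POP(bts) \
        ((bts)->top == (bts)->stack ? NULL : --(bts)->top)

    int main(void)
    {
        struct btstack bts;
        struct btframe *parent;

        BT_CLR(&bts);
        BT_PUSH(&bts, 100, 3);  /* root:     followed entry 3 */
        BT_PUSH(&bts, 250, 7);  /* internal: followed entry 7 */

        /* a leaf split would now pop to adjust the parents in order */
        parent = BT_POP(&bts);
        assert(parent->bn == 250 && parent->index == 7);
        parent = BT_POP(&bts);
        assert(parent->bn == 100 && parent->index == 3);
        assert(BT_POP(&bts) == NULL);   /* stack exhausted at the root */
        printf("btstack walk ok\n");
        return 0;
    }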
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
new file mode 100644
index 000000000000..91a0a889ebc5
--- /dev/null
+++ b/fs/jfs/jfs_debug.c
@@ -0,0 +1,154 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/fs.h>
21#include <linux/ctype.h>
22#include <linux/module.h>
23#include <linux/proc_fs.h>
24#include <asm/uaccess.h>
25#include "jfs_incore.h"
26#include "jfs_filsys.h"
27#include "jfs_debug.h"
28
29#ifdef CONFIG_JFS_DEBUG
30void dump_mem(char *label, void *data, int length)
31{
32 int i, j;
33 int *intptr = data;
34 char *charptr = data;
35 char buf[10], line[80];
36
37 printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length,
38 data);
39 for (i = 0; i < length; i += 16) {
40 line[0] = 0;
41 for (j = 0; (j < 4) && (i + j * 4 < length); j++) {
42 sprintf(buf, " %08x", intptr[i / 4 + j]);
43 strcat(line, buf);
44 }
45 buf[0] = ' ';
46 buf[2] = 0;
47 for (j = 0; (j < 16) && (i + j < length); j++) {
48 buf[1] =
49 isprint(charptr[i + j]) ? charptr[i + j] : '.';
50 strcat(line, buf);
51 }
52 printk("%s\n", line);
53 }
54}
55#endif
56
57#ifdef PROC_FS_JFS /* see jfs_debug.h */
58
59static struct proc_dir_entry *base;
60#ifdef CONFIG_JFS_DEBUG
61extern read_proc_t jfs_txanchor_read;
62
63static int loglevel_read(char *page, char **start, off_t off,
64 int count, int *eof, void *data)
65{
66 int len;
67
68 len = sprintf(page, "%d\n", jfsloglevel);
69
70 len -= off;
71 *start = page + off;
72
73 if (len > count)
74 len = count;
75 else
76 *eof = 1;
77
78 if (len < 0)
79 len = 0;
80
81 return len;
82}
83
84static int loglevel_write(struct file *file, const char __user *buffer,
85 unsigned long count, void *data)
86{
87 char c;
88
89 if (get_user(c, buffer))
90 return -EFAULT;
91
92 /* yes, I know this is an ASCIIism. --hch */
93 if (c < '0' || c > '9')
94 return -EINVAL;
95 jfsloglevel = c - '0';
96 return count;
97}
98#endif
99
100
101#ifdef CONFIG_JFS_STATISTICS
102extern read_proc_t jfs_lmstats_read;
103extern read_proc_t jfs_txstats_read;
104extern read_proc_t jfs_xtstat_read;
105extern read_proc_t jfs_mpstat_read;
106#endif
107
108static struct {
109 const char *name;
110 read_proc_t *read_fn;
111 write_proc_t *write_fn;
112} Entries[] = {
113#ifdef CONFIG_JFS_STATISTICS
114 { "lmstats", jfs_lmstats_read, },
115 { "txstats", jfs_txstats_read, },
116 { "xtstat", jfs_xtstat_read, },
117 { "mpstat", jfs_mpstat_read, },
118#endif
119#ifdef CONFIG_JFS_DEBUG
120 { "TxAnchor", jfs_txanchor_read, },
121 { "loglevel", loglevel_read, loglevel_write }
122#endif
123};
124#define NPROCENT (sizeof(Entries)/sizeof(Entries[0]))
125
126void jfs_proc_init(void)
127{
128 int i;
129
130 if (!(base = proc_mkdir("jfs", proc_root_fs)))
131 return;
132 base->owner = THIS_MODULE;
133
134 for (i = 0; i < NPROCENT; i++) {
135 struct proc_dir_entry *p;
136 if ((p = create_proc_entry(Entries[i].name, 0, base))) {
137 p->read_proc = Entries[i].read_fn;
138 p->write_proc = Entries[i].write_fn;
139 }
140 }
141}
142
143void jfs_proc_clean(void)
144{
145 int i;
146
147 if (base) {
148 for (i = 0; i < NPROCENT; i++)
149 remove_proc_entry(Entries[i].name, base);
150 remove_proc_entry("jfs", proc_root_fs);
151 }
152}
153
154#endif /* PROC_FS_JFS */
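
loglevel_read() follows the classic read_proc contract: format the whole answer into the page, then window it by off/count and raise *eof once the reader has seen everything. The same bookkeeping, modeled in user space with the kernel types stubbed out:

    #include <stdio.h>

    /* User-space model of the read_proc windowing in loglevel_read():
     * format the full answer, then return only the [off, off+count)
     * slice and flag EOF once the slice reaches the end. */
    static int model_read(char *page, char **start, long off,
                          int count, int *eof, int jfsloglevel)
    {
        int len = sprintf(page, "%d\n", jfsloglevel);

        len -= off;         /* drop what the reader already has */
        *start = page + off;

        if (len > count)    /* more remains than fits this read */
            len = count;
        else
            *eof = 1;       /* reader will have seen everything */

        if (len < 0)        /* off was already past the end */
            len = 0;

        return len;
    }

    int main(void)
    {
        char page[64], *start;
        int eof = 0;
        int n = model_read(page, &start, 0, sizeof(page), &eof, 3);

        printf("read %d byte(s), eof=%d: %.*s", n, eof, n, start);
        return 0;
    }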
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
new file mode 100644
index 000000000000..a38079ae1e00
--- /dev/null
+++ b/fs/jfs/jfs_debug.h
@@ -0,0 +1,122 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 * Portions Copyright (c) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef _H_JFS_DEBUG
20#define _H_JFS_DEBUG
21
22/*
23 * jfs_debug.h
24 *
25 * global debug message, data structure/macro definitions
26 * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS;
27 */
28
29/*
30 * Create /proc/fs/jfs if procfs is enabled and either
31 * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined
32 */
33#if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS))
34 #define PROC_FS_JFS
35#endif
36
37/*
38 * assert with traditional printf/panic
39 */
40#ifdef CONFIG_KERNEL_ASSERTS
41/* kgdb stuff */
42#define assert(p) KERNEL_ASSERT(#p, p)
43#else
44#define assert(p) do { \
45 if (!(p)) { \
46 printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \
47 __FILE__, __LINE__, #p); \
48 BUG(); \
49 } \
50} while (0)
51#endif
52
53/*
54 * debug ON
55 * --------
56 */
57#ifdef CONFIG_JFS_DEBUG
58#define ASSERT(p) assert(p)
59
60/* printk verbosity */
61#define JFS_LOGLEVEL_ERR 1
62#define JFS_LOGLEVEL_WARN 2
63#define JFS_LOGLEVEL_DEBUG 3
64#define JFS_LOGLEVEL_INFO 4
65
66extern int jfsloglevel;
67
68/* dump memory contents */
69extern void dump_mem(char *label, void *data, int length);
70
71/* information message: e.g., configuration, major event */
72#define jfs_info(fmt, arg...) do { \
73 if (jfsloglevel >= JFS_LOGLEVEL_INFO) \
74 printk(KERN_INFO fmt "\n", ## arg); \
75} while (0)
76
77/* debug message: ad hoc */
78#define jfs_debug(fmt, arg...) do { \
79 if (jfsloglevel >= JFS_LOGLEVEL_DEBUG) \
80 printk(KERN_DEBUG fmt "\n", ## arg); \
81} while (0)
82
83/* warn message: */
84#define jfs_warn(fmt, arg...) do { \
85 if (jfsloglevel >= JFS_LOGLEVEL_WARN) \
86 printk(KERN_WARNING fmt "\n", ## arg); \
87} while (0)
88
89/* error event message: e.g., i/o error */
90#define jfs_err(fmt, arg...) do { \
91 if (jfsloglevel >= JFS_LOGLEVEL_ERR) \
92 printk(KERN_ERR fmt "\n", ## arg); \
93} while (0)
94
95/*
96 * debug OFF
97 * ---------
98 */
99#else /* CONFIG_JFS_DEBUG */
100#define dump_mem(label,data,length) do {} while (0)
101#define ASSERT(p) do {} while (0)
102#define jfs_info(fmt, arg...) do {} while (0)
103#define jfs_debug(fmt, arg...) do {} while (0)
104#define jfs_warn(fmt, arg...) do {} while (0)
105#define jfs_err(fmt, arg...) do {} while (0)
106#endif /* CONFIG_JFS_DEBUG */
107
108/*
109 * statistics
110 * ----------
111 */
112#ifdef CONFIG_JFS_STATISTICS
113#define INCREMENT(x) ((x)++)
114#define DECREMENT(x) ((x)--)
115#define HIGHWATERMARK(x,y) ((x) = max((x), (y)))
116#else
117#define INCREMENT(x)
118#define DECREMENT(x)
119#define HIGHWATERMARK(x,y)
120#endif /* CONFIG_JFS_STATISTICS */
121
122#endif /* _H_JFS_DEBUG */
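
The logging macros rely on two idioms: a do { } while (0) wrapper so each call expands to a single statement (safe in unbraced if/else), and empty-bodied fallbacks so call sites compile away when CONFIG_JFS_DEBUG is off. A stand-alone illustration, with printf standing in for printk and DEBUG for the config option:

    #include <stdio.h>

    /* Stand-alone version of the level-gated logging idiom above. */
    #define LOGLEVEL_ERR  1
    #define LOGLEVEL_INFO 4

    static int loglevel = LOGLEVEL_INFO;

    #ifdef DEBUG
    #define log_info(fmt, ...) do { \
        if (loglevel >= LOGLEVEL_INFO) \
            printf("info: " fmt "\n", ##__VA_ARGS__); \
    } while (0)
    #else
    /* compiles away entirely, yet still parses as one statement */
    #define log_info(fmt, ...) do {} while (0)
    #endif

    int main(void)
    {
        /* the do/while(0) wrapper keeps this if/else well-formed even
         * when log_info() expands to an empty body */
        if (loglevel)
            log_info("mounted volume %s", "/dev/sda1");
        else
            puts("logging disabled");
        return 0;
    }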
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
new file mode 100644
index 000000000000..580a3258449b
--- /dev/null
+++ b/fs/jfs/jfs_dinode.h
@@ -0,0 +1,151 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_DINODE
19#define _H_JFS_DINODE
20
21/*
22 * jfs_dinode.h: on-disk inode manager
23 */
24
25#define INODESLOTSIZE 128
26#define L2INODESLOTSIZE 7
27#define log2INODESIZE 9 /* log2(bytes per dinode) */
28
29
30/*
31 * on-disk inode : 512 bytes
32 *
33 * note: align 64-bit fields on 8-byte boundary.
34 */
35struct dinode {
36 /*
37 * I. base area (128 bytes)
38 * ------------------------
39 *
40 * define generic/POSIX attributes
41 */
42 __le32 di_inostamp; /* 4: stamp to show inode belongs to fileset */
43 __le32 di_fileset; /* 4: fileset number */
44 __le32 di_number; /* 4: inode number, aka file serial number */
45 __le32 di_gen; /* 4: inode generation number */
46
47 pxd_t di_ixpxd; /* 8: inode extent descriptor */
48
49 __le64 di_size; /* 8: size */
50 __le64 di_nblocks; /* 8: number of blocks allocated */
51
52 __le32 di_nlink; /* 4: number of links to the object */
53
54 __le32 di_uid; /* 4: user id of owner */
55 __le32 di_gid; /* 4: group id of owner */
56
57 __le32 di_mode; /* 4: attribute, format and permission */
58
59 struct timestruc_t di_atime; /* 8: time last data accessed */
60 struct timestruc_t di_ctime; /* 8: time last status changed */
61 struct timestruc_t di_mtime; /* 8: time last data modified */
62 struct timestruc_t di_otime; /* 8: time created */
63
64 dxd_t di_acl; /* 16: acl descriptor */
65
66 dxd_t di_ea; /* 16: ea descriptor */
67
68 __le32 di_next_index; /* 4: Next available dir_table index */
69
70 __le32 di_acltype; /* 4: Type of ACL */
71
72 /*
73 * Extension Areas.
74 *
75 * Historically, the inode was partitioned into 4 128-byte areas,
76 * the last 3 being defined as unions which could have multiple
77 * uses. The first 96 bytes had been completely unused until
78 * an index table was added to the directory. It is now more
79 * useful to describe the last 3/4 of the inode as a single
80 * union. We would probably be better off redesigning the
81 * entire structure from scratch, but we don't want to break
82 * commonality with OS/2's JFS at this time.
83 */
84 union {
85 struct {
86 /*
87 * This table contains the information needed to
88 * find a directory entry from a 32-bit index.
89 * If the index is small enough, the table is inline,
90 * otherwise, an x-tree root overlays this table
91 */
92 struct dir_table_slot _table[12]; /* 96: inline */
93
94 dtroot_t _dtroot; /* 288: dtree root */
95 } _dir; /* (384) */
96#define di_dirtable u._dir._table
97#define di_dtroot u._dir._dtroot
98#define di_parent di_dtroot.header.idotdot
99#define di_DASD di_dtroot.header.DASD
100
101 struct {
102 union {
103 u8 _data[96]; /* 96: unused */
104 struct {
105 void *_imap; /* 4: unused */
106 __le32 _gengen; /* 4: generator */
107 } _imap;
108 } _u1; /* 96: */
109#define di_gengen u._file._u1._imap._gengen
110
111 union {
112 xtpage_t _xtroot;
113 struct {
114 u8 unused[16]; /* 16: */
115 dxd_t _dxd; /* 16: */
116 union {
117 __le32 _rdev; /* 4: */
118 u8 _fastsymlink[128];
119 } _u;
120 u8 _inlineea[128];
121 } _special;
122 } _u2;
123 } _file;
124#define di_xtroot u._file._u2._xtroot
125#define di_dxd u._file._u2._special._dxd
126#define di_btroot di_xtroot
127#define di_inlinedata u._file._u2._special._u
128#define di_rdev u._file._u2._special._u._rdev
129#define di_fastsymlink u._file._u2._special._u._fastsymlink
130#define di_inlineea u._file._u2._special._inlineea
131 } u;
132};
133
134/* extended mode bits (on-disk inode di_mode) */
135#define IFJOURNAL 0x00010000 /* journalled file */
136#define ISPARSE 0x00020000 /* sparse file enabled */
137#define INLINEEA 0x00040000 /* inline EA area free */
138#define ISWAPFILE 0x00800000 /* file open for pager swap space */
139
140/* more extended mode bits: attributes for OS/2 */
141#define IREADONLY 0x02000000 /* no write access to file */
142#define IARCHIVE 0x40000000 /* file archive bit */
143#define ISYSTEM 0x08000000 /* system file */
144#define IHIDDEN 0x04000000 /* hidden file */
145#define IRASH 0x4E000000 /* mask for changeable attributes */
146#define INEWNAME 0x80000000 /* non-8.3 filename format */
147#define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */
148#define ATTRSHIFT 25 /* bits to shift to move attribute
149 specification to mode position */
150
151#endif /*_H_JFS_DINODE */
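
The layout contract here is a 128-byte base area followed by a 384-byte union, 512 bytes in all (matching log2INODESIZE == 9). A reduced sketch of how such invariants can be pinned down at compile time; the struct below is a stand-in, not the real dinode:

    #include <stdint.h>
    #include <stdio.h>

    /* Reduced stand-in for struct dinode: a 128-byte base area
     * followed by a 384-byte union, 512 bytes total. The
     * _Static_asserts pin the layout the comments above promise. */
    struct toy_dinode {
        uint8_t base[128];          /* generic/POSIX attributes */
        union {
            uint8_t dir[384];       /* dir table + dtree root */
            uint8_t file[384];      /* xtree root / inline data */
        } u;
    };

    _Static_assert(sizeof(struct toy_dinode) == 512,
                   "on-disk inode must be 512 bytes");
    _Static_assert(sizeof(struct toy_dinode) == 1 << 9,
                   "log2INODESIZE is 9");

    int main(void)
    {
        printf("dinode size: %zu bytes\n", sizeof(struct toy_dinode));
        return 0;
    }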
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
new file mode 100644
index 000000000000..d86e467c6e42
--- /dev/null
+++ b/fs/jfs/jfs_dmap.c
@@ -0,0 +1,4272 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/fs.h>
20#include "jfs_incore.h"
21#include "jfs_superblock.h"
22#include "jfs_dmap.h"
23#include "jfs_imap.h"
24#include "jfs_lock.h"
25#include "jfs_metapage.h"
26#include "jfs_debug.h"
27
28/*
29 * Debug code for double-checking block map
30 */
31/* #define _JFS_DEBUG_DMAP 1 */
32
33#ifdef _JFS_DEBUG_DMAP
34#define DBINITMAP(size,ipbmap,results) \
35 DBinitmap(size,ipbmap,results)
36#define DBALLOC(dbmap,mapsize,blkno,nblocks) \
37 DBAlloc(dbmap,mapsize,blkno,nblocks)
38#define DBFREE(dbmap,mapsize,blkno,nblocks) \
39 DBFree(dbmap,mapsize,blkno,nblocks)
40#define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \
41 DBAllocCK(dbmap,mapsize,blkno,nblocks)
42#define DBFREECK(dbmap,mapsize,blkno,nblocks) \
43 DBFreeCK(dbmap,mapsize,blkno,nblocks)
44
45static void DBinitmap(s64, struct inode *, u32 **);
46static void DBAlloc(uint *, s64, s64, s64);
47static void DBFree(uint *, s64, s64, s64);
48static void DBAllocCK(uint *, s64, s64, s64);
49static void DBFreeCK(uint *, s64, s64, s64);
50#else
51#define DBINITMAP(size,ipbmap,results)
52#define DBALLOC(dbmap, mapsize, blkno, nblocks)
53#define DBFREE(dbmap, mapsize, blkno, nblocks)
54#define DBALLOCCK(dbmap, mapsize, blkno, nblocks)
55#define DBFREECK(dbmap, mapsize, blkno, nblocks)
56#endif /* _JFS_DEBUG_DMAP */
57
58/*
59 * SERIALIZATION of the Block Allocation Map.
60 *
61 * the working state of the block allocation map is accessed in
62 * two directions:
63 *
64 * 1) allocation and free requests that start at the dmap
65 * level and move up through the dmap control pages (i.e.
66 * the vast majority of requests).
67 *
68 * 2) allocation requests that start at dmap control page
69 * level and work down towards the dmaps.
70 *
71 * the serialization scheme used here is as follows.
72 *
73 * requests which start at the bottom are serialized against each
74 * other through buffers, and each request holds onto its buffers
75 * as it works its way up from a single dmap to the required level
76 * of dmap control page.
77 * requests that start at the top are serialized against each other,
78 * and against requests that start from the bottom, by the multiple
79 * read/single write inode lock of the bmap inode. requests starting
80 * at the top take this lock in write mode while requests starting at
81 * the bottom take the lock in read mode. a single top-down request
82 * may proceed exclusively while multiple bottom-up requests may
83 * proceed simultaneously (under the protection of busy buffers).
84 *
85 * in addition to information found in dmaps and dmap control pages,
86 * the working state of the block allocation map also includes read/
87 * write information maintained in the bmap descriptor (i.e. total
88 * free block count, allocation group level free block counts).
89 * a single exclusive lock (BMAP_LOCK) is used to guard this information
90 * in the face of multiple bottom-up requests.
91 * (lock ordering: IREAD_LOCK, BMAP_LOCK);
92 *
93 * accesses to the persistent state of the block allocation map (limited
94 * to the persistent bitmaps in dmaps) are guarded by (busy) buffers.
95 */
96
97#define BMAP_LOCK_INIT(bmp) init_MUTEX(&bmp->db_bmaplock)
98#define BMAP_LOCK(bmp) down(&bmp->db_bmaplock)
99#define BMAP_UNLOCK(bmp) up(&bmp->db_bmaplock)
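
The scheme described above maps onto a readers/writer lock plus a small mutex: bottom-up requests take the map lock shared, top-down requests take it exclusive, and BMAP_LOCK serializes updates to the summary counters. A user-space model with POSIX primitives (illustrative only; the kernel code uses the bmap inode's read/write lock and the semaphore defined just above):

    #include <pthread.h>
    #include <stdio.h>

    /* User-space model of the bmap serialization: bottom-up requests
     * take the map lock shared (many at once), top-down requests take
     * it exclusive, and a separate mutex (BMAP_LOCK's role) guards the
     * summary free-block counters. Lock order: map lock, then mutex. */
    static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_mutex_t bmap_lock = PTHREAD_MUTEX_INITIALIZER;
    static long db_nfree = 1000;

    static void bottom_up_free(long nblocks)
    {
        pthread_rwlock_rdlock(&map_lock);   /* shared: many freers */
        /* ... walk dmap -> dmap control pages via busy buffers ... */
        pthread_mutex_lock(&bmap_lock);     /* counters need exclusion */
        db_nfree += nblocks;
        pthread_mutex_unlock(&bmap_lock);
        pthread_rwlock_unlock(&map_lock);
    }

    static void top_down_alloc(void)
    {
        pthread_rwlock_wrlock(&map_lock);   /* exclusive: sole walker */
        /* ... descend from dmap control pages to a dmap ... */
        pthread_rwlock_unlock(&map_lock);
    }

    int main(void)
    {
        bottom_up_free(16);
        top_down_alloc();
        printf("free blocks: %ld\n", db_nfree);
        return 0;
    }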
100
101/*
102 * forward references
103 */
104static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
105 int nblocks);
106static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
107static void dbBackSplit(dmtree_t * tp, int leafno);
108static void dbJoin(dmtree_t * tp, int leafno, int newval);
109static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
110static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
111 int level);
112static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
113static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
114 int nblocks);
115static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
116 int nblocks,
117 int l2nb, s64 * results);
118static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
119 int nblocks);
120static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
121 int l2nb,
122 s64 * results);
123static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
124 s64 * results);
125static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
126 s64 * results);
127static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
128static int dbFindBits(u32 word, int l2nb);
129static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
130static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
131static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
132 int nblocks);
133static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
134 int nblocks);
135static int dbMaxBud(u8 * cp);
136s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
137static int blkstol2(s64 nb);
138
139static int cntlz(u32 value);
140static int cnttz(u32 word);
141
142static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
143 int nblocks);
144static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
145static int dbInitDmapTree(struct dmap * dp);
146static int dbInitTree(struct dmaptree * dtp);
147static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
148static int dbGetL2AGSize(s64 nblocks);
149
150/*
151 * buddy table
152 *
153 * table used for determining buddy sizes within characters of
154 * dmap bitmap words. the characters themselves serve as indexes
155 * into the table, with the table elements yielding the maximum
156 * binary buddy of free bits within the character.
157 */
158static s8 budtab[256] = {
159 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
160 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
162 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
163 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
164 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
165 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
166 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
167 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
168 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
169 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
170 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
171 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
172 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
173 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
174 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
175};
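
Each budtab index is one byte of dmap bitmap, a set bit meaning an allocated block; the entry is the log2 of the largest naturally aligned run of free bits, or -1 when no bit is free (so budtab[0x00] == 3 and budtab[0xff] == -1). A brute-force cross-check of a few entries:

    #include <assert.h>
    #include <stdio.h>

    /* Brute-force check of budtab semantics: for one byte of dmap
     * bitmap (set bit = allocated block), find the largest naturally
     * aligned power-of-two run of clear bits and return its log2, or
     * -1 if none. Bit 7 is the leftmost block, matching the table. */
    static int max_buddy(unsigned char byte)
    {
        int l2sz, off, best = -1;

        for (l2sz = 0; l2sz <= 3; l2sz++) {
            int sz = 1 << l2sz;
            for (off = 0; off < 8; off += sz) {
                /* mask of sz bits starting at aligned offset off,
                 * counted from the most significant bit */
                unsigned char mask = (unsigned char)
                    (((1 << sz) - 1) << (8 - sz - off));
                if ((byte & mask) == 0 && l2sz > best)
                    best = l2sz;
            }
        }
        return best;
    }

    int main(void)
    {
        assert(max_buddy(0x00) == 3);  /* budtab[0x00]: byte all free */
        assert(max_buddy(0xff) == -1); /* budtab[0xff]: nothing free */
        assert(max_buddy(0x0f) == 2);  /* high nibble free: buddy of 4 */
        assert(max_buddy(0x80) == 2);  /* aligned low 4 bits free */
        printf("budtab spot checks ok\n");
        return 0;
    }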
176
177
178/*
179 * NAME: dbMount()
180 *
181 * FUNCTION: initialize the block allocation map.
182 *
183 * memory is allocated for the in-core bmap descriptor and
184 * the in-core descriptor is initialized from disk.
185 *
186 * PARAMETERS:
187 * ipbmap - pointer to in-core inode for the block map.
188 *
189 * RETURN VALUES:
190 * 0 - success
191 * -ENOMEM - insufficient memory
192 * -EIO - i/o error
193 */
194int dbMount(struct inode *ipbmap)
195{
196 struct bmap *bmp;
197 struct dbmap_disk *dbmp_le;
198 struct metapage *mp;
199 int i;
200
201 /*
202 * allocate/initialize the in-memory bmap descriptor
203 */
204 /* allocate memory for the in-memory bmap descriptor */
205 bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
206 if (bmp == NULL)
207 return -ENOMEM;
208
209 /* read the on-disk bmap descriptor. */
210 mp = read_metapage(ipbmap,
211 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
212 PSIZE, 0);
213 if (mp == NULL) {
214 kfree(bmp);
215 return -EIO;
216 }
217
218 /* copy the on-disk bmap descriptor to its in-memory version. */
219 dbmp_le = (struct dbmap_disk *) mp->data;
220 bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
221 bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
222 bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
223 bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
224 bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
225 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
226 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
227 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
228 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
229 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
230 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
231 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
232 for (i = 0; i < MAXAG; i++)
233 bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
234 bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
235 bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
236
237 /* release the buffer. */
238 release_metapage(mp);
239
240 /* bind the bmap inode and the bmap descriptor to each other. */
241 bmp->db_ipbmap = ipbmap;
242 JFS_SBI(ipbmap->i_sb)->bmap = bmp;
243
244 memset(bmp->db_active, 0, sizeof(bmp->db_active));
245 DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap);
246
247 /*
248 * allocate/initialize the bmap lock
249 */
250 BMAP_LOCK_INIT(bmp);
251
252 return (0);
253}
254
255
256/*
257 * NAME: dbUnmount()
258 *
259 * FUNCTION: terminate the block allocation map in preparation for
260 * file system unmount.
261 *
262 * the in-core bmap descriptor is written to disk and
263 * the memory for this descriptor is freed.
264 *
265 * PARAMETERS:
266 * ipbmap - pointer to in-core inode for the block map.
267 *
268 * RETURN VALUES:
269 * 0 - success
270 * -EIO - i/o error
271 */
272int dbUnmount(struct inode *ipbmap, int mounterror)
273{
274 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
275 int i;
276
277 if (!(mounterror || isReadOnly(ipbmap)))
278 dbSync(ipbmap);
279
280 /*
281 * Invalidate the page cache buffers
282 */
283 truncate_inode_pages(ipbmap->i_mapping, 0);
284
285 /*
286 * Sanity Check
287 */
288 for (i = 0; i < bmp->db_numag; i++)
289 if (atomic_read(&bmp->db_active[i]))
290 printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n",
291 i, atomic_read(&bmp->db_active[i]));
292
293 /* free the memory for the in-memory bmap. */
294 kfree(bmp);
295
296 return (0);
297}
298
299/*
300 * dbSync()
301 */
302int dbSync(struct inode *ipbmap)
303{
304 struct dbmap_disk *dbmp_le;
305 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
306 struct metapage *mp;
307 int i;
308
309 /*
310 * write bmap global control page
311 */
312 /* get the buffer for the on-disk bmap descriptor. */
313 mp = read_metapage(ipbmap,
314 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
315 PSIZE, 0);
316 if (mp == NULL) {
317 jfs_err("dbSync: read_metapage failed!");
318 return -EIO;
319 }
320 /* copy the in-memory version of the bmap to the on-disk version */
321 dbmp_le = (struct dbmap_disk *) mp->data;
322 dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
323 dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
324 dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
325 dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
326 dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
327 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
328 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
329 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
330 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
331 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
332 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
333 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
334 for (i = 0; i < MAXAG; i++)
335 dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
336 dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
337 dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
338
339 /* write the buffer */
340 write_metapage(mp);
341
342 /*
343 * write out dirty pages of bmap
344 */
345 filemap_fdatawrite(ipbmap->i_mapping);
346 filemap_fdatawait(ipbmap->i_mapping);
347
348 ipbmap->i_state |= I_DIRTY;
349 diWriteSpecial(ipbmap, 0);
350
351 return (0);
352}
353
354
355/*
356 * NAME: dbFree()
357 *
358 * FUNCTION: free the specified block range from the working block
359 * allocation map.
360 *
361 * the blocks will be free from the working map one dmap
362 * at a time.
363 *
364 * PARAMETERS:
365 * ip - pointer to in-core inode;
366 * blkno - starting block number to be freed.
367 * nblocks - number of blocks to be freed.
368 *
369 * RETURN VALUES:
370 * 0 - success
371 * -EIO - i/o error
372 */
373int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
374{
375 struct metapage *mp;
376 struct dmap *dp;
377 int nb, rc;
378 s64 lblkno, rem;
379 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
380 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
381
382 IREAD_LOCK(ipbmap);
383
384 /* block to be freed better be within the mapsize. */
385 if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
386 IREAD_UNLOCK(ipbmap);
387 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
388 (unsigned long long) blkno,
389 (unsigned long long) nblocks);
390 jfs_error(ip->i_sb,
391 "dbFree: block to be freed is outside the map");
392 return -EIO;
393 }
394
395 /*
396 * free the blocks a dmap at a time.
397 */
398 mp = NULL;
399 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
400 /* release previous dmap if any */
401 if (mp) {
402 write_metapage(mp);
403 }
404
405 /* get the buffer for the current dmap. */
406 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
407 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
408 if (mp == NULL) {
409 IREAD_UNLOCK(ipbmap);
410 return -EIO;
411 }
412 dp = (struct dmap *) mp->data;
413
414 /* determine the number of blocks to be freed from
415 * this dmap.
416 */
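		/* e.g., assuming BPERDMAP = 8192: freeing rem = 10 blocks
		 * starting at blkno = 8190 gives nb = min(10, 8192 - 8190)
		 * = 2 up to the dmap boundary, then nb = 8 in the next
		 * dmap on the following pass.
		 */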
417 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
418
419 DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
420
421 /* free the blocks. */
422 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
423 release_metapage(mp);
424 IREAD_UNLOCK(ipbmap);
425 return (rc);
426 }
427
428 DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
429 }
430
431 /* write the last buffer. */
432 write_metapage(mp);
433
434 IREAD_UNLOCK(ipbmap);
435
436 return (0);
437}
438
439
440/*
441 * NAME: dbUpdatePMap()
442 *
443 * FUNCTION: update the allocation state (free or allocate) of the
444 * specified block range in the persistent block allocation map.
445 *
446 * the blocks will be updated in the persistent map one
447 * dmap at a time.
448 *
449 * PARAMETERS:
450 * ipbmap - pointer to in-core inode for the block map.
451 * free - TRUE if block range is to be freed from the persistent
452 * map; FALSE if it is to be allocated.
453 * blkno - starting block number of the range.
454 * nblocks - number of contiguous blocks in the range.
455 * tblk - transaction block;
456 *
457 * RETURN VALUES:
458 * 0 - success
459 * -EIO - i/o error
460 */
461int
462dbUpdatePMap(struct inode *ipbmap,
463 int free, s64 blkno, s64 nblocks, struct tblock * tblk)
464{
465 int nblks, dbitno, wbitno, rbits;
466 int word, nbits, nwords;
467 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
468 s64 lblkno, rem, lastlblkno;
469 u32 mask;
470 struct dmap *dp;
471 struct metapage *mp;
472 struct jfs_log *log;
473 int lsn, difft, diffp;
474
475 /* the blocks better be within the mapsize. */
476 if (blkno + nblocks > bmp->db_mapsize) {
477 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
478 (unsigned long long) blkno,
479 (unsigned long long) nblocks);
480 jfs_error(ipbmap->i_sb,
481 "dbUpdatePMap: blocks are outside the map");
482 return -EIO;
483 }
484
485 /* compute delta of transaction lsn from log syncpt */
486 lsn = tblk->lsn;
487 log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
488 logdiff(difft, lsn, log);
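	/* logdiff() presumably yields the distance of an lsn from the
	 * log sync point modulo the log size, so smaller deltas mean
	 * older records even after the log wraps.
	 */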
489
490 /*
491 * update the block state a dmap at a time.
492 */
493 mp = NULL;
494 lastlblkno = 0;
495 for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
496 /* get the buffer for the current dmap. */
497 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
498 if (lblkno != lastlblkno) {
499 if (mp) {
500 write_metapage(mp);
501 }
502
503 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
504 0);
505 if (mp == NULL)
506 return -EIO;
507 }
508 dp = (struct dmap *) mp->data;
509
510 /* determine the bit number and word within the dmap of
511 * the starting block. also determine how many blocks
512 * are to be updated within this dmap.
513 */
514 dbitno = blkno & (BPERDMAP - 1);
515 word = dbitno >> L2DBWORD;
516 nblks = min(rem, (s64)BPERDMAP - dbitno);
517
518 /* update the bits of the dmap words. the first and last
519 * words may only have a subset of their bits updated. if
520 * this is the case, we'll work against that word (i.e.
521 * partial first and/or last) only in a single pass. a
522 * single pass will also be used to update all words that
523 * are to have all their bits updated.
524 */
525 for (rbits = nblks; rbits > 0;
526 rbits -= nbits, dbitno += nbits) {
527 /* determine the bit number within the word and
528 * the number of bits within the word.
529 */
530 wbitno = dbitno & (DBWORD - 1);
531 nbits = min(rbits, DBWORD - wbitno);
532
533 /* check if only part of the word is to be updated. */
534 if (nbits < DBWORD) {
535 /* update (free or allocate) the bits
536 * in this word.
537 */
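				/* e.g., with wbitno = 4 and nbits = 8:
				 * mask = 0xffffffff << 24 >> 4 =
				 * 0x0ff00000, covering (MSB-numbered)
				 * bits 4..11 of the pmap word.
				 */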
538 mask =
539 (ONES << (DBWORD - nbits) >> wbitno);
540 if (free)
541 dp->pmap[word] &=
542 cpu_to_le32(~mask);
543 else
544 dp->pmap[word] |=
545 cpu_to_le32(mask);
546
547 word += 1;
548 } else {
549 /* one or more words are to have all
550 * their bits updated. determine how
551 * many words and how many bits.
552 */
553 nwords = rbits >> L2DBWORD;
554 nbits = nwords << L2DBWORD;
555
556 /* update (free or allocate) the bits
557 * in these words.
558 */
559 if (free)
560 memset(&dp->pmap[word], 0,
561 nwords * 4);
562 else
563 memset(&dp->pmap[word], (int) ONES,
564 nwords * 4);
565
566 word += nwords;
567 }
568 }
569
570 /*
571 * update dmap lsn
572 */
573 if (lblkno == lastlblkno)
574 continue;
575
576 lastlblkno = lblkno;
577
578 if (mp->lsn != 0) {
579 /* inherit older/smaller lsn */
580 logdiff(diffp, mp->lsn, log);
581 if (difft < diffp) {
582 mp->lsn = lsn;
583
584 /* move bp after tblock in logsync list */
585 LOGSYNC_LOCK(log);
586 list_move(&mp->synclist, &tblk->synclist);
587 LOGSYNC_UNLOCK(log);
588 }
589
590 /* inherit younger/larger clsn */
591 LOGSYNC_LOCK(log);
592 logdiff(difft, tblk->clsn, log);
593 logdiff(diffp, mp->clsn, log);
594 if (difft > diffp)
595 mp->clsn = tblk->clsn;
596 LOGSYNC_UNLOCK(log);
597 } else {
598 mp->log = log;
599 mp->lsn = lsn;
600
601 /* insert bp after tblock in logsync list */
602 LOGSYNC_LOCK(log);
603
604 log->count++;
605 list_add(&mp->synclist, &tblk->synclist);
606
607 mp->clsn = tblk->clsn;
608 LOGSYNC_UNLOCK(log);
609 }
610 }
611
612 /* write the last buffer. */
613 if (mp) {
614 write_metapage(mp);
615 }
616
617 return (0);
618}
619
620
621/*
622 * NAME: dbNextAG()
623 *
624 * FUNCTION: find the preferred allocation group for new allocations.
625 *
626 * Within the allocation groups, we maintain a preferred
627 * allocation group which consists of a group with at least
628 * average free space. It is the preferred group that we target
629 * new inode allocation towards. The tie-in between inode
630 * allocation and block allocation occurs as we allocate the
631 * first (data) block of an inode and specify the inode (block)
632 * as the allocation hint for this block.
633 *
634 * We try to avoid having more than one open file growing in
635 * an allocation group, as this will lead to fragmentation.
636 * This differs from the old OS/2 method of trying to keep
637 * empty ags around for large allocations.
638 *
639 * PARAMETERS:
640 * ipbmap - pointer to in-core inode for the block map.
641 *
642 * RETURN VALUES:
643 * the preferred allocation group number.
644 */
645int dbNextAG(struct inode *ipbmap)
646{
647 s64 avgfree;
648 int agpref;
649 s64 hwm = 0;
650 int i;
651 int next_best = -1;
652 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
653
654 BMAP_LOCK(bmp);
655
656 /* determine the average number of free blocks within the ags. */
657 avgfree = (u32)bmp->db_nfree / bmp->db_numag;
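	/* e.g., 1,000,000 free blocks spread over 16 ags gives an
	 * avgfree of 62,500; an ag is preferred below only if it is
	 * inactive and holds at least that many free blocks.
	 */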
658
659 /*
660 * if the current preferred ag does not have an active allocator
661 * and has at least average freespace, return it
662 */
663 agpref = bmp->db_agpref;
664 if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
665 (bmp->db_agfree[agpref] >= avgfree))
666 goto unlock;
667
668 /* From the last preferred ag, find the next one with at least
669 * average free space.
670 */
671	for (i = 0; i < bmp->db_numag; i++, agpref++) {
672 if (agpref == bmp->db_numag)
673 agpref = 0;
674
675 if (atomic_read(&bmp->db_active[agpref]))
676 /* open file is currently growing in this ag */
677 continue;
678 if (bmp->db_agfree[agpref] >= avgfree) {
679 /* Return this one */
680 bmp->db_agpref = agpref;
681 goto unlock;
682 } else if (bmp->db_agfree[agpref] > hwm) {
683 /* Less than avg. freespace, but best so far */
684 hwm = bmp->db_agfree[agpref];
685 next_best = agpref;
686 }
687 }
688
689 /*
690 * If no inactive ag was found with average freespace, use the
691 * next best
692 */
693 if (next_best != -1)
694 bmp->db_agpref = next_best;
695 /* else leave db_agpref unchanged */
696unlock:
697 BMAP_UNLOCK(bmp);
698
699 /* return the preferred group.
700 */
701 return (bmp->db_agpref);
702}
703
704/*
705 * NAME: dbAlloc()
706 *
707 * FUNCTION: attempt to allocate a specified number of contiguous free
708 * blocks from the working allocation block map.
709 *
710 * the block allocation policy uses hints and a multi-step
711 * approach.
712 *
713 * for allocation requests smaller than the number of blocks
714 * per dmap, we first try to allocate the new blocks
715 * immediately following the hint. if these blocks are not
716 * available, we try to allocate blocks near the hint. if
717 * no blocks near the hint are available, we next try to
718 * allocate within the same dmap as contains the hint.
719 *
720 * if no blocks are available in the dmap or the allocation
721 * request is larger than the dmap size, we try to allocate
722 * within the same allocation group as contains the hint. if
723 * this does not succeed, we finally try to allocate anywhere
724 * within the aggregate.
725 *
726 * we also try to allocate anywhere within the aggregate
727 * for allocation requests larger than the allocation group
728 * size or requests that specify no hint value.
729 *
730 * PARAMETERS:
731 * ip - pointer to in-core inode;
732 * hint - allocation hint.
733 * nblocks - number of contiguous blocks in the range.
734 * results - on successful return, set to the starting block number
735 * of the newly allocated contiguous range.
736 *
737 * RETURN VALUES:
738 * 0 - success
739 * -ENOSPC - insufficient disk resources
740 * -EIO - i/o error
741 */
742int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
743{
744 int rc, agno;
745 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
746 struct bmap *bmp;
747 struct metapage *mp;
748 s64 lblkno, blkno;
749 struct dmap *dp;
750 int l2nb;
751 s64 mapSize;
752 int writers;
753
754 /* assert that nblocks is valid */
755 assert(nblocks > 0);
756
757#ifdef _STILL_TO_PORT
758 /* DASD limit check F226941 */
759 if (OVER_LIMIT(ip, nblocks))
760 return -ENOSPC;
761#endif /* _STILL_TO_PORT */
762
763 /* get the log2 number of blocks to be allocated.
764 * if the number of blocks is not a log2 multiple,
765 * it will be rounded up to the next log2 multiple.
766 */
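	/* e.g., nblocks = 5 rounds up to l2nb = 3, since 2^3 = 8 is
	 * the smallest power of two covering 5 blocks.
	 */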
767 l2nb = BLKSTOL2(nblocks);
768
769 bmp = JFS_SBI(ip->i_sb)->bmap;
770
771//retry:	/* serialize w.r.t. extendfs() */
772 mapSize = bmp->db_mapsize;
773
774 /* the hint should be within the map */
775 if (hint >= mapSize) {
776 jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
777 return -EIO;
778 }
779
780 /* if the number of blocks to be allocated is greater than the
781 * allocation group size, try to allocate anywhere.
782 */
783 if (l2nb > bmp->db_agl2size) {
784 IWRITE_LOCK(ipbmap);
785
786 rc = dbAllocAny(bmp, nblocks, l2nb, results);
787 if (rc == 0) {
788 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results,
789 nblocks);
790 }
791
792 goto write_unlock;
793 }
794
795 /*
796 * If no hint, let dbNextAG recommend an allocation group
797 */
798 if (hint == 0)
799 goto pref_ag;
800
801 /* we would like to allocate close to the hint. adjust the
802 * hint to the block following the hint since the allocators
803 * will start looking for free space starting at this point.
804 */
805 blkno = hint + 1;
806
807 if (blkno >= bmp->db_mapsize)
808 goto pref_ag;
809
810 agno = blkno >> bmp->db_agl2size;
811
812 /* check if blkno crosses over into a new allocation group.
813 * if so, check if we should allow allocations within this
814 * allocation group.
815 */
816 if ((blkno & (bmp->db_agsize - 1)) == 0)
817 * check if the AG is currently being written to.
818 * if so, call dbNextAG() to find a non-busy
819 * AG with sufficient free space.
820 */
821 if (atomic_read(&bmp->db_active[agno]))
822 goto pref_ag;
823
824 /* check if the allocation request size can be satisfied from a
825 * single dmap. if so, try to allocate from the dmap containing
826 * the hint using a tiered strategy.
827 */
828 if (nblocks <= BPERDMAP) {
829 IREAD_LOCK(ipbmap);
830
831 /* get the buffer for the dmap containing the hint.
832 */
833 rc = -EIO;
834 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
835 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
836 if (mp == NULL)
837 goto read_unlock;
838
839 dp = (struct dmap *) mp->data;
840
841 /* first, try to satisfy the allocation request with the
842 * blocks beginning at the hint.
843 */
844 if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
845 != -ENOSPC) {
846 if (rc == 0) {
847 *results = blkno;
848 DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
849 *results, nblocks);
850 mark_metapage_dirty(mp);
851 }
852
853 release_metapage(mp);
854 goto read_unlock;
855 }
856
857 writers = atomic_read(&bmp->db_active[agno]);
858 if ((writers > 1) ||
859 ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
860 /*
861 * Someone else is writing in this allocation
862 * group. To avoid fragmenting, try another ag
863 */
864 release_metapage(mp);
865 IREAD_UNLOCK(ipbmap);
866 goto pref_ag;
867 }
868
869 /* next, try to satisfy the allocation request with blocks
870 * near the hint.
871 */
872 if ((rc =
873 dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
874 != -ENOSPC) {
875 if (rc == 0) {
876 DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
877 *results, nblocks);
878 mark_metapage_dirty(mp);
879 }
880
881 release_metapage(mp);
882 goto read_unlock;
883 }
884
885 /* try to satisfy the allocation request with blocks within
886 * the same dmap as the hint.
887 */
888 if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
889 != -ENOSPC) {
890 if (rc == 0) {
891 DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
892 *results, nblocks);
893 mark_metapage_dirty(mp);
894 }
895
896 release_metapage(mp);
897 goto read_unlock;
898 }
899
900 release_metapage(mp);
901 IREAD_UNLOCK(ipbmap);
902 }
903
904 /* try to satisfy the allocation request with blocks within
905 * the same allocation group as the hint.
906 */
907 IWRITE_LOCK(ipbmap);
908 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results))
909 != -ENOSPC) {
910 if (rc == 0)
911 DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
912 *results, nblocks);
913 goto write_unlock;
914 }
915 IWRITE_UNLOCK(ipbmap);
916
917
918 pref_ag:
919 /*
920 * Let dbNextAG recommend a preferred allocation group
921 */
922 agno = dbNextAG(ipbmap);
923 IWRITE_LOCK(ipbmap);
924
925 /* Try to allocate within this allocation group. if that fails, try to
926 * allocate anywhere in the map.
927 */
928 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
929 rc = dbAllocAny(bmp, nblocks, l2nb, results);
930 if (rc == 0) {
931 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks);
932 }
933
934 write_unlock:
935 IWRITE_UNLOCK(ipbmap);
936
937 return (rc);
938
939 read_unlock:
940 IREAD_UNLOCK(ipbmap);
941
942 return (rc);
943}
944
945#ifdef _NOTYET
946/*
947 * NAME: dbAllocExact()
948 *
949 * FUNCTION: try to allocate the requested extent;
950 *
951 * PARAMETERS:
952 * ip - pointer to in-core inode;
953 * blkno - extent address;
954 * nblocks - extent length;
955 *
956 * RETURN VALUES:
957 * 0 - success
958 * -ENOSPC - insufficient disk resources
959 * -EIO - i/o error
960 */
961int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
962{
963 int rc;
964 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
965 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
966 struct dmap *dp;
967 s64 lblkno;
968 struct metapage *mp;
969
970 IREAD_LOCK(ipbmap);
971
972 /*
973 * validate extent request:
974 *
975 * note: defragfs policy:
976 * max 64 blocks will be moved.
977 * allocation request size must be satisfied from a single dmap.
978 */
979 if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
980 IREAD_UNLOCK(ipbmap);
981 return -EINVAL;
982 }
983
984 if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
985 /* the free space is no longer available */
986 IREAD_UNLOCK(ipbmap);
987 return -ENOSPC;
988 }
989
990 /* read in the dmap covering the extent */
991 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
992 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
993 if (mp == NULL) {
994 IREAD_UNLOCK(ipbmap);
995 return -EIO;
996 }
997 dp = (struct dmap *) mp->data;
998
999 /* try to allocate the requested extent */
1000 rc = dbAllocNext(bmp, dp, blkno, nblocks);
1001
1002 IREAD_UNLOCK(ipbmap);
1003
1004 if (rc == 0) {
1005 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
1006 mark_metapage_dirty(mp);
1007 }
1008 release_metapage(mp);
1009
1010 return (rc);
1011}
1012#endif /* _NOTYET */
1013
1014/*
1015 * NAME: dbReAlloc()
1016 *
1017 * FUNCTION: attempt to extend a current allocation by a specified
1018 * number of blocks.
1019 *
1020 * this routine attempts to satisfy the allocation request
1021 * by first trying to extend the existing allocation in
1022 * place by allocating the additional blocks as the blocks
1023 * immediately following the current allocation. if these
1024 * blocks are not available, this routine will attempt to
1025 * allocate a new set of contiguous blocks large enough
1026 * to cover the existing allocation plus the additional
1027 * number of blocks required.
1028 *
1029 * PARAMETERS:
1030 * ip - pointer to in-core inode requiring allocation.
1031 * blkno - starting block of the current allocation.
1032 * nblocks - number of contiguous blocks within the current
1033 * allocation.
1034 * addnblocks - number of blocks to add to the allocation.
1035 * results - on successful return, set to the starting block number
1036 * of the existing allocation if the existing allocation
1037 * was extended in place or to a newly allocated contiguous
1038 * range if the existing allocation could not be extended
1039 * in place.
1040 *
1041 * RETURN VALUES:
1042 * 0 - success
1043 * -ENOSPC - insufficient disk resources
1044 * -EIO - i/o error
1045 */
1046int
1047dbReAlloc(struct inode *ip,
1048 s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
1049{
1050 int rc;
1051
1052 /* try to extend the allocation in place.
1053 */
1054 if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
1055 *results = blkno;
1056 return (0);
1057 } else {
1058 if (rc != -ENOSPC)
1059 return (rc);
1060 }
1061
1062 /* could not extend the allocation in place, so allocate a
1063 * new set of blocks for the entire request (i.e. try to get
1064 * a range of contiguous blocks large enough to cover the
1065 * existing allocation plus the additional blocks.)
1066 */
1067 return (dbAlloc
1068 (ip, blkno + nblocks - 1, addnblocks + nblocks, results));
1069}
1070
1071
1072/*
1073 * NAME: dbExtend()
1074 *
1075 * FUNCTION: attempt to extend a current allocation by a specified
1076 * number of blocks.
1077 *
1078 * this routine attempts to satisfy the allocation request
1079 * by first trying to extend the existing allocation in
1080 * place by allocating the additional blocks as the blocks
1081 * immediately following the current allocation.
1082 *
1083 * PARAMETERS:
1084 * ip - pointer to in-core inode requiring allocation.
1085 * blkno - starting block of the current allocation.
1086 * nblocks - number of contiguous blocks within the current
1087 * allocation.
1088 * addnblocks - number of blocks to add to the allocation.
1089 *
1090 * RETURN VALUES:
1091 * 0 - success
1092 * -ENOSPC - insufficient disk resources
1093 * -EIO - i/o error
1094 */
1095static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1096{
1097 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
1098 s64 lblkno, lastblkno, extblkno;
1099 uint rel_block;
1100 struct metapage *mp;
1101 struct dmap *dp;
1102 int rc;
1103 struct inode *ipbmap = sbi->ipbmap;
1104 struct bmap *bmp;
1105
1106 /*
1107 * We don't want a non-aligned extent to cross a page boundary
1108 */
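	/* e.g., with nbperpage = 8, an extent starting at rel_block = 5
	 * may grow to at most 3 blocks in total (through the end of the
	 * page); a larger request is refused here with -ENOSPC.
	 */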
1109 if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
1110 (rel_block + nblocks + addnblocks > sbi->nbperpage))
1111 return -ENOSPC;
1112
1113 /* get the last block of the current allocation */
1114 lastblkno = blkno + nblocks - 1;
1115
1116 /* determine the block number of the block following
1117 * the existing allocation.
1118 */
1119 extblkno = lastblkno + 1;
1120
1121 IREAD_LOCK(ipbmap);
1122
1123 /* better be within the file system */
1124 bmp = sbi->bmap;
1125 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1126 IREAD_UNLOCK(ipbmap);
1127 jfs_error(ip->i_sb,
1128 "dbExtend: the block is outside the filesystem");
1129 return -EIO;
1130 }
1131
1132 /* we'll attempt to extend the current allocation in place by
1133 * allocating the additional blocks as the blocks immediately
1134 * following the current allocation. we only try to extend the
1135 * current allocation in place if the number of additional blocks
1136 * can fit into a dmap, the last block of the current allocation
1137 * is not the last block of the file system, and the start of the
1138 * inplace extension is not on an allocation group boundary.
1139 */
1140 if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
1141 (extblkno & (bmp->db_agsize - 1)) == 0) {
1142 IREAD_UNLOCK(ipbmap);
1143 return -ENOSPC;
1144 }
1145
1146 /* get the buffer for the dmap containing the first block
1147 * of the extension.
1148 */
1149 lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
1150 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
1151 if (mp == NULL) {
1152 IREAD_UNLOCK(ipbmap);
1153 return -EIO;
1154 }
1155
1156 DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
1157 dp = (struct dmap *) mp->data;
1158
1159 /* try to allocate the blocks immediately following the
1160 * current allocation.
1161 */
1162 rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
1163
1164 IREAD_UNLOCK(ipbmap);
1165
1166 /* were we successful ? */
1167 if (rc == 0) {
1168 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, extblkno,
1169 addnblocks);
1170 write_metapage(mp);
1171 } else
1172 /* we were not successful */
1173 release_metapage(mp);
1174
1175
1176 return (rc);
1177}
1178
1179
1180/*
1181 * NAME: dbAllocNext()
1182 *
1183 * FUNCTION: attempt to allocate the blocks of the specified block
1184 * range within a dmap.
1185 *
1186 * PARAMETERS:
1187 * bmp - pointer to bmap descriptor
1188 * dp - pointer to dmap.
1189 * blkno - starting block number of the range.
1190 * nblocks - number of contiguous free blocks of the range.
1191 *
1192 * RETURN VALUES:
1193 * 0 - success
1194 * -ENOSPC - insufficient disk resources
1195 * -EIO - i/o error
1196 *
1197 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1198 */
1199static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1200 int nblocks)
1201{
1202 int dbitno, word, rembits, nb, nwords, wbitno, nw;
1203 int l2size;
1204 s8 *leaf;
1205 u32 mask;
1206
1207 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1208 jfs_error(bmp->db_ipbmap->i_sb,
1209 "dbAllocNext: Corrupt dmap page");
1210 return -EIO;
1211 }
1212
1213 /* pick up a pointer to the leaves of the dmap tree.
1214 */
1215 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1216
1217 /* determine the bit number and word within the dmap of the
1218 * starting block.
1219 */
1220 dbitno = blkno & (BPERDMAP - 1);
1221 word = dbitno >> L2DBWORD;
1222
1223 /* check if the specified block range is contained within
1224 * this dmap.
1225 */
1226 if (dbitno + nblocks > BPERDMAP)
1227 return -ENOSPC;
1228
1229 /* check if the starting leaf indicates that anything
1230 * is free.
1231 */
1232 if (leaf[word] == NOFREE)
1233 return -ENOSPC;
1234
1235 * check the dmap's words corresponding to the block range to see
1236 * if the block range is free. not all bits of the first and
1237 * last words may be contained within the block range. if this
1238 * is the case, we'll work against those words (i.e. partial first
1239 * and/or last) on an individual basis (a single pass) and examine
1240 * the actual bits to determine if they are free. a single pass
1241 * will be used for all dmap words fully contained within the
1242 * specified range. within this pass, the leaves of the dmap
1243 * tree will be examined to determine if the blocks are free. a
1244 * single leaf may describe the free space of multiple dmap
1245 * words, so we may visit only a subset of the actual leaves
1246 * corresponding to the dmap words of the block range.
1247 */
1248 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
1249 /* determine the bit number within the word and
1250 * the number of bits within the word.
1251 */
1252 wbitno = dbitno & (DBWORD - 1);
1253 nb = min(rembits, DBWORD - wbitno);
1254
1255 /* check if only part of the word is to be examined.
1256 */
1257 if (nb < DBWORD) {
1258 /* check if the bits are free.
1259 */
1260 mask = (ONES << (DBWORD - nb) >> wbitno);
1261 if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
1262 return -ENOSPC;
1263
1264 word += 1;
1265 } else {
1266 /* one or more dmap words are fully contained
1267 * within the block range. determine how many
1268 * words and how many bits.
1269 */
1270 nwords = rembits >> L2DBWORD;
1271 nb = nwords << L2DBWORD;
1272
1273 /* now examine the appropriate leaves to determine
1274 * if the blocks are free.
1275 */
1276 while (nwords > 0) {
1277 /* does the leaf describe any free space ?
1278 */
1279 if (leaf[word] < BUDMIN)
1280 return -ENOSPC;
1281
1282 /* determine the l2 number of bits provided
1283 * by this leaf.
1284 */
1285 l2size =
1286 min((int)leaf[word], NLSTOL2BSZ(nwords));
1287
1288 /* determine how many words were handled.
1289 */
1290 nw = BUDSIZE(l2size, BUDMIN);
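				/* e.g., assuming BUDMIN = 5 (one 32-bit
				 * word) and BUDSIZE(s, m) = 1 << (s - m):
				 * a leaf of l2size 7 accounts for
				 * 1 << (7 - 5) = 4 dmap words at once.
				 */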
1291
1292 nwords -= nw;
1293 word += nw;
1294 }
1295 }
1296 }
1297
1298 /* allocate the blocks.
1299 */
1300 return (dbAllocDmap(bmp, dp, blkno, nblocks));
1301}
1302
1303
1304/*
1305 * NAME: dbAllocNear()
1306 *
1307 * FUNCTION: attempt to allocate a number of contiguous free blocks near
1308 * a specified block (hint) within a dmap.
1309 *
1310 * starting with the dmap leaf that covers the hint, we'll
1311 * check the next four contiguous leaves for sufficient free
1312 * space. if sufficient free space is found, we'll allocate
1313 * the desired free space.
1314 *
1315 * PARAMETERS:
1316 * bmp - pointer to bmap descriptor
1317 * dp - pointer to dmap.
1318 * blkno - block number to allocate near.
1319 * nblocks - actual number of contiguous free blocks desired.
1320 * l2nb - log2 number of contiguous free blocks desired.
1321 * results - on successful return, set to the starting block number
1322 * of the newly allocated range.
1323 *
1324 * RETURN VALUES:
1325 * 0 - success
1326 * -ENOSPC - insufficient disk resources
1327 * -EIO - i/o error
1328 *
1329 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1330 */
1331static int
1332dbAllocNear(struct bmap * bmp,
1333 struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
1334{
1335 int word, lword, rc;
1336 s8 *leaf;
1337
1338 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1339 jfs_error(bmp->db_ipbmap->i_sb,
1340 "dbAllocNear: Corrupt dmap page");
1341 return -EIO;
1342 }
1343
1344 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1345
1346 /* determine the word within the dmap that holds the hint
1347 * (i.e. blkno). also, determine the last word in the dmap
1348 * that we'll include in our examination.
1349 */
1350 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
1351 lword = min(word + 4, LPERDMAP);
1352
1353 /* examine the leaves for sufficient free space.
1354 */
1355 for (; word < lword; word++) {
1356 /* does the leaf describe sufficient free space ?
1357 */
1358 if (leaf[word] < l2nb)
1359 continue;
1360
1361 /* determine the block number within the file system
1362 * of the first block described by this dmap word.
1363 */
1364 blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
1365
1366 /* if not all bits of the dmap word are free, get the
1367 * starting bit number within the dmap word of the required
1368 * string of free bits and adjust the block number with the
1369 * value.
1370 */
1371 if (leaf[word] < BUDMIN)
1372 blkno +=
1373 dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
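		/* e.g., if the dmap word is 0xf0000000 (first four bits
		 * allocated) and l2nb = 2, dbFindBits() presumably
		 * returns 4, the MSB-relative offset of the first free
		 * string of 2^2 bits, advancing blkno by 4.
		 */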
1374
1375 /* allocate the blocks.
1376 */
1377 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1378 *results = blkno;
1379
1380 return (rc);
1381 }
1382
1383 return -ENOSPC;
1384}
1385
1386
1387/*
1388 * NAME: dbAllocAG()
1389 *
1390 * FUNCTION: attempt to allocate the specified number of contiguous
1391 * free blocks within the specified allocation group.
1392 *
1393 * unless the allocation group size is equal to the number
1394 * of blocks per dmap, the dmap control pages will be used to
1395 * find the required free space, if available. we start the
1396 * search at the highest dmap control page level which
1397 * distinctly describes the allocation group's free space
1398 * (i.e. the highest level at which the allocation group's
1399 * free space is not mixed in with that of any other group).
1400 * in addition, we start the search within this level at a
1401 * height of the dmapctl dmtree at which the nodes distinctly
1402 * describe the allocation group's free space. at this height,
1403 * the allocation group's free space may be represented by one
1404 * or two subtrees, depending on the allocation group size.
1405 * we search the top nodes of these subtrees left to right for
1406 * sufficient free space. if sufficient free space is found,
1407 * the subtree is searched to find the leftmost leaf that
1408 * has free space. once we have made it to the leaf, we
1409 * move the search to the next lower level dmap control page
1410 * corresponding to this leaf. we continue down the dmap control
1411 * pages until we find the dmap that contains or starts the
1412 * sufficient free space and we allocate at this dmap.
1413 *
1414 * if the allocation group size is equal to the dmap size,
1415 * we'll start at the dmap corresponding to the allocation
1416 * group and attempt the allocation at this level.
1417 *
1418 * the dmap control page search is also not performed if the
1419 * allocation group is completely free and we go to the first
1420 * dmap of the allocation group to do the allocation. this is
1421 * done because the allocation group may be part (not the first
1422 * part) of a larger binary buddy system, causing the dmap
1423 * control pages to indicate no free space (NOFREE) within
1424 * the allocation group.
1425 *
1426 * PARAMETERS:
1427 * bmp - pointer to bmap descriptor
1428 * agno - allocation group number.
1429 * nblocks - actual number of contiguous free blocks desired.
1430 * l2nb - log2 number of contiguous free blocks desired.
1431 * results - on successful return, set to the starting block number
1432 * of the newly allocated range.
1433 *
1434 * RETURN VALUES:
1435 * 0 - success
1436 * -ENOSPC - insufficient disk resources
1437 * -EIO - i/o error
1438 *
1439 * note: IWRITE_LOCK(ipbmap) held on entry/exit;
1440 */
1441static int
1442dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1443{
1444 struct metapage *mp;
1445 struct dmapctl *dcp;
1446 int rc, ti, i, k, m, n, agperlev;
1447 s64 blkno, lblkno;
1448 int budmin;
1449
1450 /* allocation request should not be for more than the
1451 * allocation group size.
1452 */
1453 if (l2nb > bmp->db_agl2size) {
1454 jfs_error(bmp->db_ipbmap->i_sb,
1455 "dbAllocAG: allocation request is larger than the "
1456 "allocation group size");
1457 return -EIO;
1458 }
1459
1460 /* determine the starting block number of the allocation
1461 * group.
1462 */
1463 blkno = (s64) agno << bmp->db_agl2size;
1464
1465 /* check if the allocation group size is the minimum allocation
1466 * group size or if the allocation group is completely free. if
1467 * the allocation group size is the minimum size of BPERDMAP (i.e.
1468 * 1 dmap), there is no need to search the dmap control page (below)
1469 * that fully describes the allocation group since the allocation
1470 * group is already fully described by a dmap. in this case, we
1471 * just call dbAllocCtl() to search the dmap tree and allocate the
1472 * required space if available.
1473 *
1474 * if the allocation group is completely free, dbAllocCtl() is
1475 * also called to allocate the required space. this is done for
1476 * two reasons. first, it makes no sense searching the dmap control
1477 * pages for free space when we know that free space exists. second,
1478 * the dmap control pages may indicate that the allocation group
1479 * has no free space if the allocation group is part (not the first
1480 * part) of a larger binary buddy system.
1481 */
1482 if (bmp->db_agsize == BPERDMAP
1483 || bmp->db_agfree[agno] == bmp->db_agsize) {
1484 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1485 if ((rc == -ENOSPC) &&
1486 (bmp->db_agfree[agno] == bmp->db_agsize)) {
1487 printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
1488 (unsigned long long) blkno,
1489 (unsigned long long) nblocks);
1490 jfs_error(bmp->db_ipbmap->i_sb,
1491 "dbAllocAG: dbAllocCtl failed in free AG");
1492 }
1493 return (rc);
1494 }
1495
1496 /* the buffer for the dmap control page that fully describes the
1497 * allocation group.
1498 */
1499 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
1500 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1501 if (mp == NULL)
1502 return -EIO;
1503 dcp = (struct dmapctl *) mp->data;
1504 budmin = dcp->budmin;
1505
1506 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1507 jfs_error(bmp->db_ipbmap->i_sb,
1508 "dbAllocAG: Corrupt dmapctl page");
1509 release_metapage(mp);
1510 return -EIO;
1511 }
1512
1513 /* search the subtree(s) of the dmap control page that describes
1514 * the allocation group, looking for sufficient free space. to begin,
1515 * determine how many allocation groups are represented in a dmap
1516 * control page at the control page level (i.e. L0, L1, L2) that
1517 * fully describes an allocation group. next, determine the starting
1518 * tree index of this allocation group within the control page.
1519 */
1520 agperlev =
1521 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
1522 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1523
1524 /* dmap control page trees fan-out by 4 and a single allocation
1525 * group may be described by 1 or 2 subtrees within the ag level
1526 * dmap control page, depending upon the ag size. examine the ag's
1527 * subtrees for sufficient free space, starting with the leftmost
1528 * subtree.
1529 */
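	/* in the stree array the four children of node ti live at
	 * (ti << 2) + 1 through (ti << 2) + 4, so each pass of the
	 * descent below replaces ti with its leftmost child that
	 * still describes 2^l2nb contiguous free blocks.
	 */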
1530 for (i = 0; i < bmp->db_agwidth; i++, ti++) {
1531 /* is there sufficient free space ?
1532 */
1533 if (l2nb > dcp->stree[ti])
1534 continue;
1535
1536 /* sufficient free space found in a subtree. now search down
1537 * the subtree to find the leftmost leaf that describes this
1538 * free space.
1539 */
1540 for (k = bmp->db_agheigth; k > 0; k--) {
1541 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1542 if (l2nb <= dcp->stree[m + n]) {
1543 ti = m + n;
1544 break;
1545 }
1546 }
1547 if (n == 4) {
1548 jfs_error(bmp->db_ipbmap->i_sb,
1549 "dbAllocAG: failed descending stree");
1550 release_metapage(mp);
1551 return -EIO;
1552 }
1553 }
1554
1555 /* determine the block number within the file system
1556 * that corresponds to this leaf.
1557 */
1558 if (bmp->db_aglevel == 2)
1559 blkno = 0;
1560 else if (bmp->db_aglevel == 1)
1561 blkno &= ~(MAXL1SIZE - 1);
1562 else /* bmp->db_aglevel == 0 */
1563 blkno &= ~(MAXL0SIZE - 1);
1564
1565 blkno +=
1566 ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
1567
1568 /* release the buffer in preparation for going down
1569 * the next level of dmap control pages.
1570 */
1571 release_metapage(mp);
1572
1573 /* check if we need to continue to search down the lower
1574 * level dmap control pages. we need to if the number of
1575 * blocks required is less than the maximum number of blocks
1576 * described at the next lower level.
1577 */
1578 if (l2nb < budmin) {
1579
1580 /* search the lower level dmap control pages to get
1581 * the starting block number of the dmap that
1582 * contains or starts off the free space.
1583 */
1584 if ((rc =
1585 dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
1586 &blkno))) {
1587 if (rc == -ENOSPC) {
1588 jfs_error(bmp->db_ipbmap->i_sb,
1589 "dbAllocAG: control page "
1590 "inconsistent");
1591 return -EIO;
1592 }
1593 return (rc);
1594 }
1595 }
1596
1597 /* allocate the blocks.
1598 */
1599 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1600 if (rc == -ENOSPC) {
1601 jfs_error(bmp->db_ipbmap->i_sb,
1602 "dbAllocAG: unable to allocate blocks");
1603 rc = -EIO;
1604 }
1605 return (rc);
1606 }
1607
1608 /* no space in the allocation group. release the buffer and
1609 * return -ENOSPC.
1610 */
1611 release_metapage(mp);
1612
1613 return -ENOSPC;
1614}
1615
1616
1617/*
1618 * NAME: dbAllocAny()
1619 *
1620 * FUNCTION: attempt to allocate the specified number of contiguous
1621 * free blocks anywhere in the file system.
1622 *
1623 * dbAllocAny() attempts to find the sufficient free space by
1624 * searching down the dmap control pages, starting with the
1625 * highest level (i.e. L0, L1, L2) control page. if free space
1626 * large enough to satisfy the desired free space is found, the
1627 * desired free space is allocated.
1628 *
1629 * PARAMETERS:
1630 * bmp - pointer to bmap descriptor
1631 * nblocks - actual number of contiguous free blocks desired.
1632 * l2nb - log2 number of contiguous free blocks desired.
1633 * results - on successful return, set to the starting block number
1634 * of the newly allocated range.
1635 *
1636 * RETURN VALUES:
1637 * 0 - success
1638 * -ENOSPC - insufficient disk resources
1639 * -EIO - i/o error
1640 *
1641 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1642 */
1643static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1644{
1645 int rc;
1646 s64 blkno = 0;
1647
1648 /* starting with the top level dmap control page, search
1649 * down the dmap control levels for sufficient free space.
1650 * if free space is found, dbFindCtl() returns the starting
1651 * block number of the dmap that contains or starts off the
1652 * range of free space.
1653 */
1654 if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
1655 return (rc);
1656
1657 /* allocate the blocks.
1658 */
1659 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1660 if (rc == -ENOSPC) {
1661 jfs_error(bmp->db_ipbmap->i_sb,
1662 "dbAllocAny: unable to allocate blocks");
1663 return -EIO;
1664 }
1665 return (rc);
1666}
1667
1668
1669/*
1670 * NAME: dbFindCtl()
1671 *
1672 * FUNCTION: starting at a specified dmap control page level and block
1673 * number, search down the dmap control levels for a range of
1674 * contiguous free blocks large enough to satisfy an allocation
1675 * request for the specified number of free blocks.
1676 *
1677 * if sufficient contiguous free blocks are found, this routine
1678 * returns the starting block number within a dmap page that
1679 * contains or starts a range of contiguous free blocks that
1680 * is sufficient in size.
1681 *
1682 * PARAMETERS:
1683 * bmp - pointer to bmap descriptor
1684 * level - starting dmap control page level.
1685 * l2nb - log2 number of contiguous free blocks desired.
1686 * *blkno - on entry, starting block number for conducting the search.
1687 * on successful return, the first block within a dmap page
1688 * that contains or starts a range of contiguous free blocks.
1689 *
1690 * RETURN VALUES:
1691 * 0 - success
1692 * -ENOSPC - insufficient disk resources
1693 * -EIO - i/o error
1694 *
1695 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1696 */
1697static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1698{
1699 int rc, leafidx, lev;
1700 s64 b, lblkno;
1701 struct dmapctl *dcp;
1702 int budmin;
1703 struct metapage *mp;
1704
1705 /* starting at the specified dmap control page level and block
1706 * number, search down the dmap control levels for the starting
1707 * block number of a dmap page that contains or starts off
1708 * sufficient free blocks.
1709 */
1710 for (lev = level, b = *blkno; lev >= 0; lev--) {
1711 /* get the buffer of the dmap control page for the block
1712 * number and level (i.e. L0, L1, L2).
1713 */
1714 lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
1715 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1716 if (mp == NULL)
1717 return -EIO;
1718 dcp = (struct dmapctl *) mp->data;
1719 budmin = dcp->budmin;
1720
1721 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1722 jfs_error(bmp->db_ipbmap->i_sb,
1723 "dbFindCtl: Corrupt dmapctl page");
1724 release_metapage(mp);
1725 return -EIO;
1726 }
1727
1728 /* search the tree within the dmap control page for
1729 * sufficient free space. if sufficient free space is found,
1730 * dbFindLeaf() returns the index of the leaf at which
1731 * free space was found.
1732 */
1733 rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
1734
1735 /* release the buffer.
1736 */
1737 release_metapage(mp);
1738
1739 /* space found ?
1740 */
1741 if (rc) {
1742 if (lev != level) {
1743 jfs_error(bmp->db_ipbmap->i_sb,
1744 "dbFindCtl: dmap inconsistent");
1745 return -EIO;
1746 }
1747 return -ENOSPC;
1748 }
1749
1750 /* adjust the block number to reflect the location within
1751 * the dmap control page (i.e. the leaf) at which free
1752 * space was found.
1753 */
1754 b += (((s64) leafidx) << budmin);
1755
1756 /* we stop the search at this dmap control page level if
1757 * the number of blocks required is greater than or equal
1758 * to the maximum number of blocks described at the next
1759 * (lower) level.
1760 */
1761 if (l2nb >= budmin)
1762 break;
1763 }
1764
1765 *blkno = b;
1766 return (0);
1767}
1768
1769
1770/*
1771 * NAME: dbAllocCtl()
1772 *
1773 * FUNCTION: attempt to allocate a specified number of contiguous
1774 * blocks starting within a specific dmap.
1775 *
1776 * this routine is called by higher level routines that search
1777 * the dmap control pages above the actual dmaps for contiguous
1778 * free space. the result of successful searches by these
1779 * routines are the starting block numbers within dmaps, with
1780 * the dmaps themselves containing the desired contiguous free
1781 * space or starting a contiguous free space of desired size
1782 * that is made up of the blocks of one or more dmaps. these
1783 * calls should not fail due to insufficient resources.
1784 *
1785 * this routine is called in some cases where it is not known
1786 * whether it will fail due to insufficient resources. more
1787 * specifically, this occurs when allocating from an allocation
1788 * group whose size is equal to the number of blocks per dmap.
1789 * in this case, the dmap control pages are not examined prior
1790 * to calling this routine (to save pathlength) and the call
1791 * might fail.
1792 *
1793 * for a request size that fits within a dmap, this routine relies
1794 * upon the dmap's dmtree to find the requested contiguous free
1795 * space. for request sizes that are larger than a dmap, the
1796 * requested free space will start at the first block of the
1797 * first dmap (i.e. blkno).
1798 *
1799 * PARAMETERS:
1800 * bmp - pointer to bmap descriptor
1801 * nblocks - actual number of contiguous free blocks to allocate.
1802 * l2nb - log2 number of contiguous free blocks to allocate.
1803 * blkno - starting block number of the dmap to start the allocation
1804 * from.
1805 * results - on successful return, set to the starting block number
1806 * of the newly allocated range.
1807 *
1808 * RETURN VALUES:
1809 * 0 - success
1810 * -ENOSPC - insufficient disk resources
1811 * -EIO - i/o error
1812 *
1813 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1814 */
1815static int
1816dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1817{
1818 int rc, nb;
1819 s64 b, lblkno, n;
1820 struct metapage *mp;
1821 struct dmap *dp;
1822
1823 /* check if the allocation request is confined to a single dmap.
1824 */
1825 if (l2nb <= L2BPERDMAP) {
1826 /* get the buffer for the dmap.
1827 */
1828 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
1829 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1830 if (mp == NULL)
1831 return -EIO;
1832 dp = (struct dmap *) mp->data;
1833
1834 /* try to allocate the blocks.
1835 */
1836 rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
1837 if (rc == 0)
1838 mark_metapage_dirty(mp);
1839
1840 release_metapage(mp);
1841
1842 return (rc);
1843 }
1844
1845 /* allocation request involving multiple dmaps. it must start on
1846 * a dmap boundary.
1847 */
1848 assert((blkno & (BPERDMAP - 1)) == 0);
1849
1850 /* allocate the blocks dmap by dmap.
1851 */
1852 for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
1853 /* get the buffer for the dmap.
1854 */
1855 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1856 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1857 if (mp == NULL) {
1858 rc = -EIO;
1859 goto backout;
1860 }
1861 dp = (struct dmap *) mp->data;
1862
1863 /* the dmap better be all free.
1864 */
1865 if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1866 release_metapage(mp);
1867 jfs_error(bmp->db_ipbmap->i_sb,
1868 "dbAllocCtl: the dmap is not all free");
1869 rc = -EIO;
1870 goto backout;
1871 }
1872
1873 /* determine how many blocks to allocate from this dmap.
1874 */
1875 nb = min(n, (s64)BPERDMAP);
1876
1877 /* allocate the blocks from the dmap.
1878 */
1879 if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
1880 release_metapage(mp);
1881 goto backout;
1882 }
1883
1884 /* write the buffer.
1885 */
1886 write_metapage(mp);
1887 }
1888
1889 /* set the results (starting block number) and return.
1890 */
1891 *results = blkno;
1892 return (0);
1893
1894 /* something failed in handling an allocation request involving
1895 * multiple dmaps. we'll try to clean up by backing out any
1896 * allocation that has already happened for this request. if
1897 * we fail in backing out the allocation, we'll mark the file
1898 * system to indicate that blocks have been leaked.
1899 */
1900 backout:
1901
1902 /* try to backout the allocations dmap by dmap.
1903 */
1904 for (n = nblocks - n, b = blkno; n > 0;
1905 n -= BPERDMAP, b += BPERDMAP) {
1906 /* get the buffer for this dmap.
1907 */
1908 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1909 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1910 if (mp == NULL) {
1911 /* could not back out. mark the file system
1912 * to indicate that we have leaked blocks.
1913 */
1914 jfs_error(bmp->db_ipbmap->i_sb,
1915 "dbAllocCtl: I/O Error: Block Leakage.");
1916 continue;
1917 }
1918 dp = (struct dmap *) mp->data;
1919
1920		/* free the blocks in this dmap.
1921 */
1922 if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
1923 /* could not back out. mark the file system
1924 * to indicate that we have leaked blocks.
1925 */
1926 release_metapage(mp);
1927 jfs_error(bmp->db_ipbmap->i_sb,
1928 "dbAllocCtl: Block Leakage.");
1929 continue;
1930 }
1931
1932 /* write the buffer.
1933 */
1934 write_metapage(mp);
1935 }
1936
1937 return (rc);
1938}
1939
1940
1941/*
1942 * NAME: dbAllocDmapLev()
1943 *
1944 * FUNCTION: attempt to allocate a specified number of contiguous blocks
1945 * from a specified dmap.
1946 *
1947 * this routine checks if the contiguous blocks are available.
1948 * if so, nblocks blocks are allocated; otherwise, -ENOSPC is
1949 * returned.
1950 *
1951 * PARAMETERS:
1952 * bmp - pointer to bmap descriptor
1953 * dp - pointer to dmap to attempt to allocate blocks from.
1954 * l2nb - log2 number of contiguous block desired.
1955 * nblocks - actual number of contiguous block desired.
1956 * results - on successful return, set to the starting block number
1957 * of the newly allocated range.
1958 *
1959 * RETURN VALUES:
1960 * 0 - success
1961 * -ENOSPC - insufficient disk resources
1962 * -EIO - i/o error
1963 *
1964 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1965 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
1966 */
1967static int
1968dbAllocDmapLev(struct bmap * bmp,
1969 struct dmap * dp, int nblocks, int l2nb, s64 * results)
1970{
1971 s64 blkno;
1972 int leafidx, rc;
1973
1974 /* can't be more than a dmaps worth of blocks */
1975 assert(l2nb <= L2BPERDMAP);
1976
1977 /* search the tree within the dmap page for sufficient
1978 * free space. if sufficient free space is found, dbFindLeaf()
1979 * returns the index of the leaf at which free space was found.
1980 */
1981 if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
1982 return -ENOSPC;
1983
1984 /* determine the block number within the file system corresponding
1985 * to the leaf at which free space was found.
1986 */
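	/* each leaf covers one 32-bit dmap word, i.e. 32 blocks, so
	 * e.g. leafidx = 3 puts the candidate range at an offset of
	 * 3 << 5 = 96 blocks from the start of the dmap.
	 */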
1987 blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
1988
1989 /* if not all bits of the dmap word are free, get the starting
1990 * bit number within the dmap word of the required string of free
1991 * bits and adjust the block number with this value.
1992 */
1993 if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
1994 blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
1995
1996 /* allocate the blocks */
1997 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1998 *results = blkno;
1999
2000 return (rc);
2001}
2002
2003
2004/*
2005 * NAME: dbAllocDmap()
2006 *
2007 * FUNCTION: adjust the disk allocation map to reflect the allocation
2008 * of a specified block range within a dmap.
2009 *
2010 * this routine allocates the specified blocks from the dmap
2011 * through a call to dbAllocBits(). if the allocation of the
2012 * block range causes the maximum string of free blocks within
2013 * the dmap to change (i.e. the value of the root of the dmap's
2014 * dmtree), this routine will cause this change to be reflected
2015 * up through the appropriate levels of the dmap control pages
2016 * by a call to dbAdjCtl() for the L0 dmap control page that
2017 * covers this dmap.
2018 *
2019 * PARAMETERS:
2020 * bmp - pointer to bmap descriptor
2021 * dp - pointer to dmap to allocate the block range from.
2022 * blkno - starting block number of the block to be allocated.
2023 * nblocks - number of blocks to be allocated.
2024 *
2025 * RETURN VALUES:
2026 * 0 - success
2027 * -EIO - i/o error
2028 *
2029 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2030 */
2031static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2032 int nblocks)
2033{
2034 s8 oldroot;
2035 int rc;
2036
2037 /* save the current value of the root (i.e. maximum free string)
2038 * of the dmap tree.
2039 */
2040 oldroot = dp->tree.stree[ROOT];
2041
2042 /* allocate the specified (blocks) bits */
2043 dbAllocBits(bmp, dp, blkno, nblocks);
2044
2045 /* if the root has not changed, done. */
2046 if (dp->tree.stree[ROOT] == oldroot)
2047 return (0);
2048
2049 /* root changed. bubble the change up to the dmap control pages.
2050 * if the adjustment of the upper level control pages fails,
2051 * backout the bit allocation (thus making everything consistent).
2052 */
2053 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
2054 dbFreeBits(bmp, dp, blkno, nblocks);
2055
2056 return (rc);
2057}
2058
2059
2060/*
2061 * NAME: dbFreeDmap()
2062 *
2063 * FUNCTION: adjust the disk allocation map to reflect the freeing
2064 * of a specified block range within a dmap.
2065 *
2066 * this routine frees the specified blocks from the dmap through
2067 * a call to dbFreeBits(). if the deallocation of the block range
2068 * causes the maximum string of free blocks within the dmap to
2069 * change (i.e. the value of the root of the dmap's dmtree), this
2070 * routine will cause this change to be reflected up through the
2071 * appropriate levels of the dmap control pages by a call to
2072 * dbAdjCtl() for the L0 dmap control page that covers this dmap.
2073 *
2074 * PARAMETERS:
2075 * bmp - pointer to bmap descriptor
2076 * dp - pointer to dmap to free the block range from.
2077 * blkno - starting block number of the block to be freed.
2078 * nblocks - number of blocks to be freed.
2079 *
2080 * RETURN VALUES:
2081 * 0 - success
2082 * -EIO - i/o error
2083 *
2084 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2085 */
2086static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2087 int nblocks)
2088{
2089 s8 oldroot;
2090 int rc, word;
2091
2092 /* save the current value of the root (i.e. maximum free string)
2093 * of the dmap tree.
2094 */
2095 oldroot = dp->tree.stree[ROOT];
2096
2097 /* free the specified (blocks) bits */
2098 dbFreeBits(bmp, dp, blkno, nblocks);
2099
2100 /* if the root has not changed, done. */
2101 if (dp->tree.stree[ROOT] == oldroot)
2102 return (0);
2103
2104 /* root changed. bubble the change up to the dmap control pages.
2105 * if the adjustment of the upper level control pages fails,
2106 * backout the deallocation.
2107 */
2108 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
2109 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
2110
2111 /* as part of backing out the deallocation, we will have
2112 * to back split the dmap tree if the deallocation caused
2113 * the freed blocks to become part of a larger binary buddy
2114 * system.
2115 */
2116 if (dp->tree.stree[word] == NOFREE)
2117 dbBackSplit((dmtree_t *) & dp->tree, word);
2118
2119 dbAllocBits(bmp, dp, blkno, nblocks);
2120 }
2121
2122 return (rc);
2123}
2124
2125
2126/*
2127 * NAME: dbAllocBits()
2128 *
2129 * FUNCTION: allocate a specified block range from a dmap.
2130 *
2131 * this routine updates the dmap to reflect the working
2132 * state allocation of the specified block range. it directly
2133 * updates the bits of the working map and causes the adjustment
2134 * of the binary buddy system described by the dmap's dmtree
2135 * leaves to reflect the bits allocated. it also causes the
2136 * dmap's dmtree, as a whole, to reflect the allocated range.
2137 *
2138 * PARAMETERS:
2139 * bmp - pointer to bmap descriptor
2140 * dp - pointer to dmap to allocate bits from.
2141 * blkno - starting block number of the bits to be allocated.
2142 * nblocks - number of bits to be allocated.
2143 *
2144 * RETURN VALUES: none
2145 *
2146 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2147 */
2148static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2149 int nblocks)
2150{
2151 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2152 dmtree_t *tp = (dmtree_t *) & dp->tree;
2153 int size;
2154 s8 *leaf;
2155
2156 /* pick up a pointer to the leaves of the dmap tree */
2157 leaf = dp->tree.stree + LEAFIND;
2158
2159 /* determine the bit number and word within the dmap of the
2160 * starting block.
2161 */
2162 dbitno = blkno & (BPERDMAP - 1);
2163 word = dbitno >> L2DBWORD;
2164
2165 /* block range better be within the dmap */
2166 assert(dbitno + nblocks <= BPERDMAP);
2167
2168 /* allocate the bits of the dmap's words corresponding to the block
2169 * range. not all bits of the first and last words may be contained
2170 * within the block range. if this is the case, we'll work against
2171 * those words (i.e. partial first and/or last) on an individual basis
2172 * (a single pass), allocating the bits of interest by hand and
2173 * updating the leaf corresponding to the dmap word. a single pass
2174 * will be used for all dmap words fully contained within the
2175 * specified range. within this pass, the bits of all fully contained
2176 * dmap words will be marked as allocated in a single shot and the leaves
2177 * will be updated. a single leaf may describe the free space of
2178 * multiple dmap words, so we may update only a subset of the actual
2179 * leaves corresponding to the dmap words of the block range.
2180 */
2181 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2182 /* determine the bit number within the word and
2183 * the number of bits within the word.
2184 */
2185 wbitno = dbitno & (DBWORD - 1);
2186 nb = min(rembits, DBWORD - wbitno);
2187
2188 /* check if only part of a word is to be allocated.
2189 */
2190 if (nb < DBWORD) {
2191 /* allocate (set to 1) the appropriate bits within
2192 * this dmap word.
2193 */
2194 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
2195 >> wbitno);
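			/* e.g., wbitno = 0 and nb = 4 builds the mask
			 * 0xffffffff << 28 = 0xf0000000, marking the
			 * word's first four (MSB-numbered) blocks as
			 * allocated.
			 */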
2196
2197 /* update the leaf for this dmap word. in addition
2198 * to setting the leaf value to the binary buddy max
2199 * of the updated dmap word, dbSplit() will split
2200 * the binary system of the leaves if need be.
2201 */
2202 dbSplit(tp, word, BUDMIN,
2203 dbMaxBud((u8 *) & dp->wmap[word]));
2204
2205 word += 1;
2206 } else {
2207 /* one or more dmap words are fully contained
2208 * within the block range. determine how many
2209 * words and allocate (set to 1) the bits of these
2210 * words.
2211 */
2212 nwords = rembits >> L2DBWORD;
2213 memset(&dp->wmap[word], (int) ONES, nwords * 4);
2214
2215 /* determine how many bits.
2216 */
2217 nb = nwords << L2DBWORD;
2218
2219 /* now update the appropriate leaves to reflect
2220 * the allocated words.
2221 */
2222 for (; nwords > 0; nwords -= nw) {
2223 if (leaf[word] < BUDMIN) {
2224 jfs_error(bmp->db_ipbmap->i_sb,
2225 "dbAllocBits: leaf page "
2226 "corrupt");
2227 break;
2228 }
2229
2230 /* determine what the leaf value should be
2231 * updated to as the minimum of the l2 number
2232 * of bits being allocated and the l2 number
2233 * of bits currently described by this leaf.
2234 */
2235 size = min((int)leaf[word], NLSTOL2BSZ(nwords));
2236
2237 /* update the leaf to reflect the allocation.
2238 * in addition to setting the leaf value to
2239 * NOFREE, dbSplit() will split the binary
2240 * system of the leaves to reflect the current
2241 * allocation (size).
2242 */
2243 dbSplit(tp, word, size, NOFREE);
2244
2245 /* get the number of dmap words handled */
2246 nw = BUDSIZE(size, BUDMIN);
2247 word += nw;
2248 }
2249 }
2250 }
2251
2252 /* update the free count for this dmap */
2253 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
2254
2255 BMAP_LOCK(bmp);
2256
2257	/* if this allocation group was completely free before this
2258	 * allocation, it may lie beyond the current rightmost active
2259	 * group; if so, update the maximum allocation group number.
2260	 */
2261 agno = blkno >> bmp->db_agl2size;
2262 if (agno > bmp->db_maxag)
2263 bmp->db_maxag = agno;
2264
2265 /* update the free count for the allocation group and map */
2266 bmp->db_agfree[agno] -= nblocks;
2267 bmp->db_nfree -= nblocks;
2268
2269 BMAP_UNLOCK(bmp);
2270}
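
The partial-word path above builds its mask with two unsigned shifts: ONES << (DBWORD - nb) leaves nb one-bits at the top of the word, and the right shift by wbitno slides that run to the starting bit (bit 0 being the most significant bit of a dmap word). A minimal user-space sketch of the same arithmetic, assuming only the DBWORD/ONES values from jfs_dmap.h:

    #include <stdio.h>
    #include <stdint.h>

    #define DBWORD 32               /* blocks covered by one map word */
    #define ONES   0xffffffffu      /* a map word with every bit set */

    /* mask with nb one-bits starting at bit wbitno (msb = bit 0) */
    static uint32_t alloc_mask(int wbitno, int nb)
    {
            return ONES << (DBWORD - nb) >> wbitno;
    }

    int main(void)
    {
            printf("%08x\n", alloc_mask(5, 4));     /* prints 07800000 */
            return 0;
    }
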
2271
2272
2273/*
2274 * NAME: dbFreeBits()
2275 *
2276 * FUNCTION: free a specified block range from a dmap.
2277 *
2278 * this routine updates the dmap to reflect the working
2279 * state allocation of the specified block range. it directly
2280 * updates the bits of the working map and causes the adjustment
2281 * of the binary buddy system described by the dmap's dmtree
2282 * leaves to reflect the bits freed. it also causes the dmap's
2283 * dmtree, as a whole, to reflect the deallocated range.
2284 *
2285 * PARAMETERS:
2286 * bmp - pointer to bmap descriptor
2287 * dp - pointer to dmap to free bits from.
2288 * blkno - starting block number of the bits to be freed.
2289 * nblocks - number of bits to be freed.
2290 *
2291 * RETURN VALUES: none
2292 *
2293 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2294 */
2295static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2296 int nblocks)
2297{
2298 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2299 dmtree_t *tp = (dmtree_t *) & dp->tree;
2300 int size;
2301
2302 /* determine the bit number and word within the dmap of the
2303 * starting block.
2304 */
2305 dbitno = blkno & (BPERDMAP - 1);
2306 word = dbitno >> L2DBWORD;
2307
2308 /* block range better be within the dmap.
2309 */
2310 assert(dbitno + nblocks <= BPERDMAP);
2311
2312 /* free the bits of the dmaps words corresponding to the block range.
2313 * not all bits of the first and last words may be contained within
2314 * the block range. if this is the case, we'll work against those
2315 * words (i.e. partial first and/or last) on an individual basis
2316 * (a single pass), freeing the bits of interest by hand and updating
2317 * the leaf corresponding to the dmap word. a single pass will be used
2318 * for all dmap words fully contained within the specified range.
2319 * within this pass, the bits of all fully contained dmap words will
2320 * be marked as free in a single shot and the leaves will be updated. a
2321 * single leaf may describe the free space of multiple dmap words,
2322 * so we may update only a subset of the actual leaves corresponding
2323 * to the dmap words of the block range.
2324 *
2325 * dbJoin() is used to update leaf values and will join the binary
2326 * buddy system of the leaves if the new leaf values indicate this
2327 * should be done.
2328 */
2329 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2330 /* determine the bit number within the word and
2331 * the number of bits within the word.
2332 */
2333 wbitno = dbitno & (DBWORD - 1);
2334 nb = min(rembits, DBWORD - wbitno);
2335
2336 /* check if only part of a word is to be freed.
2337 */
2338 if (nb < DBWORD) {
2339 /* free (zero) the appropriate bits within this
2340 * dmap word.
2341 */
2342 dp->wmap[word] &=
2343 cpu_to_le32(~(ONES << (DBWORD - nb)
2344 >> wbitno));
2345
2346 /* update the leaf for this dmap word.
2347 */
2348 dbJoin(tp, word,
2349 dbMaxBud((u8 *) & dp->wmap[word]));
2350
2351 word += 1;
2352 } else {
2353 /* one or more dmap words are fully contained
2354 * within the block range. determine how many
2355 * words and free (zero) the bits of these words.
2356 */
2357 nwords = rembits >> L2DBWORD;
2358 memset(&dp->wmap[word], 0, nwords * 4);
2359
2360 /* determine how many bits.
2361 */
2362 nb = nwords << L2DBWORD;
2363
2364 /* now update the appropriate leaves to reflect
2365 * the freed words.
2366 */
2367 for (; nwords > 0; nwords -= nw) {
2368 /* determine what the leaf value should be
2369 * updated to as the minimum of the l2 number
2370 * of bits being freed and the l2 (max) number
2371 * of bits that can be described by this leaf.
2372 */
2373 size =
2374 min(LITOL2BSZ
2375 (word, L2LPERDMAP, BUDMIN),
2376 NLSTOL2BSZ(nwords));
2377
2378 /* update the leaf.
2379 */
2380 dbJoin(tp, word, size);
2381
2382 /* get the number of dmap words handled.
2383 */
2384 nw = BUDSIZE(size, BUDMIN);
2385 word += nw;
2386 }
2387 }
2388 }
2389
2390 /* update the free count for this dmap.
2391 */
2392 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
2393
2394 BMAP_LOCK(bmp);
2395
2396 /* update the free count for the allocation group and
2397 * map.
2398 */
2399 agno = blkno >> bmp->db_agl2size;
2400 bmp->db_nfree += nblocks;
2401 bmp->db_agfree[agno] += nblocks;
2402
2403	/* check if this allocation group is now completely free and
2404	 * if it is currently the maximum (rightmost) active allocation
2405	 * group. if so, establish the new maximum allocation group
2406	 * number by searching left for the first group with allocation.
2407 */
2408 if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
2409 (agno == bmp->db_numag - 1 &&
2410	     bmp->db_agfree[agno] == (bmp->db_mapsize & (BPERDMAP - 1)))) {
2411 while (bmp->db_maxag > 0) {
2412 bmp->db_maxag -= 1;
2413 if (bmp->db_agfree[bmp->db_maxag] !=
2414 bmp->db_agsize)
2415 break;
2416 }
2417
2418 /* re-establish the allocation group preference if the
2419 * current preference is right of the maximum allocation
2420 * group.
2421 */
2422 if (bmp->db_agpref > bmp->db_maxag)
2423 bmp->db_agpref = bmp->db_maxag;
2424 }
2425
2426 BMAP_UNLOCK(bmp);
2427}
2428
2429
2430/*
2431 * NAME: dbAdjCtl()
2432 *
2433 * FUNCTION: adjust a dmap control page at a specified level to reflect
2434 * the change in a lower level dmap or dmap control page's
2435 * maximum string of free blocks (i.e. a change in the root
2436 * of the lower level object's dmtree) due to the allocation
2437 * or deallocation of a range of blocks with a single dmap.
2438 *
2439 * on entry, this routine is provided with the new value of
2440 * the lower level dmap or dmap control page root and the
2441 * starting block number of the block range whose allocation
2442 * or deallocation resulted in the root change. this range
2443 *	is represented by a single leaf of the current dmapctl
2444 * and the leaf will be updated with this value, possibly
2445 * causing a binary buddy system within the leaves to be
2446 * split or joined. the update may also cause the dmapctl's
2447 * dmtree to be updated.
2448 *
2449 * if the adjustment of the dmap control page, itself, causes its
2450 * root to change, this change will be bubbled up to the next dmap
2451 * control level by a recursive call to this routine, specifying
2452 * the new root value and the next dmap control page level to
2453 * be adjusted.
2454 * PARAMETERS:
2455 * bmp - pointer to bmap descriptor
2456 * blkno - the first block of a block range within a dmap. it is
2457 * the allocation or deallocation of this block range that
2458 * requires the dmap control page to be adjusted.
2459 * newval - the new value of the lower level dmap or dmap control
2460 * page root.
2461 * alloc - TRUE if adjustment is due to an allocation.
2462 * level - current level of dmap control page (i.e. L0, L1, L2) to
2463 * be adjusted.
2464 *
2465 * RETURN VALUES:
2466 * 0 - success
2467 * -EIO - i/o error
2468 *
2469 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2470 */
2471static int
2472dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2473{
2474 struct metapage *mp;
2475 s8 oldroot;
2476 int oldval;
2477 s64 lblkno;
2478 struct dmapctl *dcp;
2479 int rc, leafno, ti;
2480
2481 /* get the buffer for the dmap control page for the specified
2482 * block number and control page level.
2483 */
2484 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
2485 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
2486 if (mp == NULL)
2487 return -EIO;
2488 dcp = (struct dmapctl *) mp->data;
2489
2490 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2491 jfs_error(bmp->db_ipbmap->i_sb,
2492 "dbAdjCtl: Corrupt dmapctl page");
2493 release_metapage(mp);
2494 return -EIO;
2495 }
2496
2497 /* determine the leaf number corresponding to the block and
2498 * the index within the dmap control tree.
2499 */
2500 leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
2501 ti = leafno + le32_to_cpu(dcp->leafidx);
2502
2503 /* save the current leaf value and the current root level (i.e.
2504 * maximum l2 free string described by this dmapctl).
2505 */
2506 oldval = dcp->stree[ti];
2507 oldroot = dcp->stree[ROOT];
2508
2509 /* check if this is a control page update for an allocation.
2510 * if so, update the leaf to reflect the new leaf value using
2511	 * dbSplit(); otherwise (deallocation), use dbJoin() to update
2512 * the leaf with the new value. in addition to updating the
2513 * leaf, dbSplit() will also split the binary buddy system of
2514 * the leaves, if required, and bubble new values within the
2515 * dmapctl tree, if required. similarly, dbJoin() will join
2516 * the binary buddy system of leaves and bubble new values up
2517 * the dmapctl tree as required by the new leaf value.
2518 */
2519 if (alloc) {
2520 /* check if we are in the middle of a binary buddy
2521 * system. this happens when we are performing the
2522 * first allocation out of an allocation group that
2523 * is part (not the first part) of a larger binary
2524 * buddy system. if we are in the middle, back split
2525 * the system prior to calling dbSplit() which assumes
2526 * that it is at the front of a binary buddy system.
2527 */
2528 if (oldval == NOFREE) {
2529 dbBackSplit((dmtree_t *) dcp, leafno);
2530 oldval = dcp->stree[ti];
2531 }
2532 dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
2533 } else {
2534 dbJoin((dmtree_t *) dcp, leafno, newval);
2535 }
2536
2537 /* check if the root of the current dmap control page changed due
2538 * to the update and if the current dmap control page is not at
2539 * the current top level (i.e. L0, L1, L2) of the map. if so (i.e.
2540 * root changed and this is not the top level), call this routine
2541 * again (recursion) for the next higher level of the mapping to
2542 * reflect the change in root for the current dmap control page.
2543 */
2544 if (dcp->stree[ROOT] != oldroot) {
2545 /* are we below the top level of the map. if so,
2546 * bubble the root up to the next higher level.
2547 */
2548 if (level < bmp->db_maxlevel) {
2549 /* bubble up the new root of this dmap control page to
2550 * the next level.
2551 */
2552 if ((rc =
2553 dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
2554 level + 1))) {
2555 /* something went wrong in bubbling up the new
2556 * root value, so backout the changes to the
2557 * current dmap control page.
2558 */
2559 if (alloc) {
2560 dbJoin((dmtree_t *) dcp, leafno,
2561 oldval);
2562 } else {
2563 /* the dbJoin() above might have
2564 * caused a larger binary buddy system
2565 * to form and we may now be in the
2566 * middle of it. if this is the case,
2567 * back split the buddies.
2568 */
2569 if (dcp->stree[ti] == NOFREE)
2570 dbBackSplit((dmtree_t *)
2571 dcp, leafno);
2572 dbSplit((dmtree_t *) dcp, leafno,
2573 dcp->budmin, oldval);
2574 }
2575
2576 /* release the buffer and return the error.
2577 */
2578 release_metapage(mp);
2579 return (rc);
2580 }
2581 } else {
2582 /* we're at the top level of the map. update
2583 * the bmap control page to reflect the size
2584 * of the maximum free buddy system.
2585 */
2586 assert(level == bmp->db_maxlevel);
2587 if (bmp->db_maxfreebud != oldroot) {
2588 jfs_error(bmp->db_ipbmap->i_sb,
2589 "dbAdjCtl: the maximum free buddy is "
2590 "not the old root");
2591 }
2592 bmp->db_maxfreebud = dcp->stree[ROOT];
2593 }
2594 }
2595
2596 /* write the buffer.
2597 */
2598 write_metapage(mp);
2599
2600 return (0);
2601}
2602
2603
2604/*
2605 * NAME: dbSplit()
2606 *
2607 * FUNCTION: update the leaf of a dmtree with a new value, splitting
2608 * the leaf from the binary buddy system of the dmtree's
2609 * leaves, as required.
2610 *
2611 * PARAMETERS:
2612 * tp - pointer to the tree containing the leaf.
2613 * leafno - the number of the leaf to be updated.
2614 * splitsz - the size the binary buddy system starting at the leaf
2615 * must be split to, specified as the log2 number of blocks.
2616 * newval - the new value for the leaf.
2617 *
2618 * RETURN VALUES: none
2619 *
2620 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2621 */
2622static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2623{
2624 int budsz;
2625 int cursz;
2626 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2627
2628 /* check if the leaf needs to be split.
2629 */
2630 if (leaf[leafno] > tp->dmt_budmin) {
2631 /* the split occurs by cutting the buddy system in half
2632 * at the specified leaf until we reach the specified
2633 * size. pick up the starting split size (current size
2634 * - 1 in l2) and the corresponding buddy size.
2635 */
2636 cursz = leaf[leafno] - 1;
2637 budsz = BUDSIZE(cursz, tp->dmt_budmin);
2638
2639 /* split until we reach the specified size.
2640 */
2641 while (cursz >= splitsz) {
2642 /* update the buddy's leaf with its new value.
2643 */
2644 dbAdjTree(tp, leafno ^ budsz, cursz);
2645
2646 /* on to the next size and buddy.
2647 */
2648 cursz -= 1;
2649 budsz >>= 1;
2650 }
2651 }
2652
2653 /* adjust the dmap tree to reflect the specified leaf's new
2654 * value.
2655 */
2656 dbAdjTree(tp, leafno, newval);
2657}
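
Because a buddy system of size budsz is budsz-aligned, the buddy of leaf leafno is always leafno ^ budsz; each pass of the loop above hands the right half of the current system to that buddy and halves the size. A user-space trace of the leaf numbers the split touches (budmin assumed 0 for brevity, so a system of l2 size s spans 1 << s leaves):

    #include <stdio.h>

    /* leaves visited when splitting a buddy system headed by leafno,
     * of l2 size cursz + 1, down to l2 size splitsz */
    static void trace_split(int leafno, int cursz, int budsz, int splitsz)
    {
            while (cursz >= splitsz) {
                    printf("buddy %d gets value %d\n", leafno ^ budsz, cursz);
                    cursz -= 1;
                    budsz >>= 1;
            }
    }

    int main(void)
    {
            trace_split(0, 2, 4, 0);        /* buddies 4, 2, 1 */
            return 0;
    }
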
2658
2659
2660/*
2661 * NAME: dbBackSplit()
2662 *
2663 * FUNCTION: back split the binary buddy system of dmtree leaves
2664 * that hold a specified leaf until the specified leaf
2665 * starts its own binary buddy system.
2666 *
2667 * the allocators typically perform allocations at the start
2668 * of binary buddy systems and dbSplit() is used to accomplish
2669 * any required splits. in some cases, however, allocation
2670 * may occur in the middle of a binary system and requires a
2671 * back split, with the split proceeding out from the middle of
2672 * the system (less efficient) rather than the start of the
2673 * system (more efficient). the cases in which a back split
2674 * is required are rare and are limited to the first allocation
2675 * within an allocation group which is a part (not first part)
2676 * of a larger binary buddy system and a few exception cases
2677 * in which a previous join operation must be backed out.
2678 *
2679 * PARAMETERS:
2680 * tp - pointer to the tree containing the leaf.
2681 * leafno - the number of the leaf to be updated.
2682 *
2683 * RETURN VALUES: none
2684 *
2685 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2686 */
2687static void dbBackSplit(dmtree_t * tp, int leafno)
2688{
2689 int budsz, bud, w, bsz, size;
2690 int cursz;
2691 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2692
2693 /* leaf should be part (not first part) of a binary
2694 * buddy system.
2695 */
2696 assert(leaf[leafno] == NOFREE);
2697
2698 /* the back split is accomplished by iteratively finding the leaf
2699 * that starts the buddy system that contains the specified leaf and
2700 * splitting that system in two. this iteration continues until
2701 * the specified leaf becomes the start of a buddy system.
2702 *
2703 * determine maximum possible l2 size for the specified leaf.
2704 */
2705 size =
2706 LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
2707 tp->dmt_budmin);
2708
2709 /* determine the number of leaves covered by this size. this
2710 * is the buddy size that we will start with as we search for
2711 * the buddy system that contains the specified leaf.
2712 */
2713 budsz = BUDSIZE(size, tp->dmt_budmin);
2714
2715 /* back split.
2716 */
2717 while (leaf[leafno] == NOFREE) {
2718 /* find the leftmost buddy leaf.
2719 */
2720 for (w = leafno, bsz = budsz;; bsz <<= 1,
2721 w = (w < bud) ? w : bud) {
2722 assert(bsz < le32_to_cpu(tp->dmt_nleafs));
2723
2724 /* determine the buddy.
2725 */
2726 bud = w ^ bsz;
2727
2728 /* check if this buddy is the start of the system.
2729 */
2730 if (leaf[bud] != NOFREE) {
2731 /* split the leaf at the start of the
2732 * system in two.
2733 */
2734 cursz = leaf[bud] - 1;
2735 dbSplit(tp, bud, cursz, cursz);
2736 break;
2737 }
2738 }
2739 }
2740
2741 assert(leaf[leafno] == size);
2742}
2743
2744
2745/*
2746 * NAME: dbJoin()
2747 *
2748 * FUNCTION: update the leaf of a dmtree with a new value, joining
2749 * the leaf with other leaves of the dmtree into a multi-leaf
2750 * binary buddy system, as required.
2751 *
2752 * PARAMETERS:
2753 * tp - pointer to the tree containing the leaf.
2754 * leafno - the number of the leaf to be updated.
2755 * newval - the new value for the leaf.
2756 *
2757 * RETURN VALUES: none
2758 */
2759static void dbJoin(dmtree_t * tp, int leafno, int newval)
2760{
2761 int budsz, buddy;
2762 s8 *leaf;
2763
2764 /* can the new leaf value require a join with other leaves ?
2765 */
2766 if (newval >= tp->dmt_budmin) {
2767		/* pick up a pointer to the leaves of the tree.
2768 */
2769 leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2770
2771 /* try to join the specified leaf into a large binary
2772 * buddy system. the join proceeds by attempting to join
2773 * the specified leafno with its buddy (leaf) at new value.
2774 * if the join occurs, we attempt to join the left leaf
2775 * of the joined buddies with its buddy at new value + 1.
2776 * we continue to join until we find a buddy that cannot be
2777 * joined (does not have a value equal to the size of the
2778 * last join) or until all leaves have been joined into a
2779 * single system.
2780 *
2781 * get the buddy size (number of words covered) of
2782 * the new value.
2783 */
2784 budsz = BUDSIZE(newval, tp->dmt_budmin);
2785
2786 /* try to join.
2787 */
2788 while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
2789 /* get the buddy leaf.
2790 */
2791 buddy = leafno ^ budsz;
2792
2793 /* if the leaf's new value is greater than its
2794 * buddy's value, we join no more.
2795 */
2796 if (newval > leaf[buddy])
2797 break;
2798
2799 assert(newval == leaf[buddy]);
2800
2801 /* check which (leafno or buddy) is the left buddy.
2802 * the left buddy gets to claim the blocks resulting
2803 * from the join while the right gets to claim none.
2804			 * the left buddy is also eligible to participate in
2805 * a join at the next higher level while the right
2806 * is not.
2807 *
2808 */
2809 if (leafno < buddy) {
2810 /* leafno is the left buddy.
2811 */
2812 dbAdjTree(tp, buddy, NOFREE);
2813 } else {
2814 /* buddy is the left buddy and becomes
2815 * leafno.
2816 */
2817 dbAdjTree(tp, leafno, NOFREE);
2818 leafno = buddy;
2819 }
2820
2821 /* on to try the next join.
2822 */
2823 newval += 1;
2824 budsz <<= 1;
2825 }
2826 }
2827
2828 /* update the leaf value.
2829 */
2830 dbAdjTree(tp, leafno, newval);
2831}
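
A self-contained restatement of the walk above on a toy eight-leaf tree (budmin assumed 0, so BUDSIZE(v, 0) == 1 << v, and dbAdjTree() replaced by direct stores): leaf 0 heads a size-4 system, leaf 6 a size-2 system, and setting leaf 4 to l2 size 1 joins everything into one size-8 system headed by leaf 0:

    #include <stdio.h>

    #define NOFREE (-1)

    int main(void)
    {
            signed char leaf[8] = { 2, NOFREE, NOFREE, NOFREE,
                                    0, NOFREE, 1, NOFREE };
            int leafno = 4, newval = 1, i;
            int budsz = 1 << newval;        /* BUDSIZE(newval, 0) */

            while (budsz < 8) {
                    int buddy = leafno ^ budsz;

                    if (newval > leaf[buddy])
                            break;          /* buddy too small: stop joining */
                    if (leafno < buddy)
                            leaf[buddy] = NOFREE;   /* right buddy gives way */
                    else {
                            leaf[leafno] = NOFREE;  /* left buddy takes over */
                            leafno = buddy;
                    }
                    newval += 1;
                    budsz <<= 1;
            }
            leaf[leafno] = newval;

            for (i = 0; i < 8; i++)
                    printf("%d ", leaf[i]); /* prints: 3 -1 -1 -1 -1 -1 -1 -1 */
            printf("\n");
            return 0;
    }
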
2832
2833
2834/*
2835 * NAME: dbAdjTree()
2836 *
2837 * FUNCTION: update a leaf of a dmtree with a new value, adjusting
2838 * the dmtree, as required, to reflect the new leaf value.
2839 * the combination of any buddies must already be done before
2840 * this is called.
2841 *
2842 * PARAMETERS:
2843 * tp - pointer to the tree to be adjusted.
2844 * leafno - the number of the leaf to be updated.
2845 * newval - the new value for the leaf.
2846 *
2847 * RETURN VALUES: none
2848 */
2849static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2850{
2851 int lp, pp, k;
2852 int max;
2853
2854 /* pick up the index of the leaf for this leafno.
2855 */
2856 lp = leafno + le32_to_cpu(tp->dmt_leafidx);
2857
2858 /* is the current value the same as the old value ? if so,
2859 * there is nothing to do.
2860 */
2861 if (tp->dmt_stree[lp] == newval)
2862 return;
2863
2864 /* set the new value.
2865 */
2866 tp->dmt_stree[lp] = newval;
2867
2868 /* bubble the new value up the tree as required.
2869 */
2870 for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
2871 /* get the index of the first leaf of the 4 leaf
2872 * group containing the specified leaf (leafno).
2873 */
2874 lp = ((lp - 1) & ~0x03) + 1;
2875
2876 /* get the index of the parent of this 4 leaf group.
2877 */
2878 pp = (lp - 1) >> 2;
2879
2880 /* determine the maximum of the 4 leaves.
2881 */
2882 max = TREEMAX(&tp->dmt_stree[lp]);
2883
2884 /* if the maximum of the 4 is the same as the
2885 * parent's value, we're done.
2886 */
2887 if (tp->dmt_stree[pp] == max)
2888 break;
2889
2890 /* parent gets new value.
2891 */
2892 tp->dmt_stree[pp] = max;
2893
2894 /* parent becomes leaf for next go-round.
2895 */
2896 lp = pp;
2897 }
2898}
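
The index arithmetic above follows from storing a complete 4-ary tree in a flat array with the root at index 0: the children of node p sit at indices 4p+1 .. 4p+4, so (lp - 1) >> 2 recovers the parent and ((lp - 1) & ~0x03) + 1 the first node of the 4-node group. A quick check of that layout:

    #include <stdio.h>

    int main(void)
    {
            int lp = 11;                            /* arbitrary node index */
            int first = ((lp - 1) & ~0x03) + 1;     /* start of its group */
            int pp = (lp - 1) >> 2;                 /* its parent */

            printf("group %d..%d, parent %d\n", first, first + 3, pp);
            printf("children of %d: %d..%d\n", pp, 4 * pp + 1, 4 * pp + 4);
            /* prints: group 9..12, parent 2 / children of 2: 9..12 */
            return 0;
    }
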
2899
2900
2901/*
2902 * NAME: dbFindLeaf()
2903 *
2904 * FUNCTION: search a dmtree_t for sufficient free blocks, returning
2905 * the index of a leaf describing the free blocks if
2906 * sufficient free blocks are found.
2907 *
2908 * the search starts at the top of the dmtree_t tree and
2909 * proceeds down the tree to the leftmost leaf with sufficient
2910 * free space.
2911 *
2912 * PARAMETERS:
2913 * tp - pointer to the tree to be searched.
2914 * l2nb - log2 number of free blocks to search for.
2915 * leafidx - return pointer to be set to the index of the leaf
2916 * describing at least l2nb free blocks if sufficient
2917 * free blocks are found.
2918 *
2919 * RETURN VALUES:
2920 * 0 - success
2921 * -ENOSPC - insufficient free blocks.
2922 */
2923static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2924{
2925 int ti, n = 0, k, x = 0;
2926
2927 /* first check the root of the tree to see if there is
2928 * sufficient free space.
2929 */
2930 if (l2nb > tp->dmt_stree[ROOT])
2931 return -ENOSPC;
2932
2933 /* sufficient free space available. now search down the tree
2934 * starting at the next level for the leftmost leaf that
2935 * describes sufficient free space.
2936 */
2937 for (k = le32_to_cpu(tp->dmt_height), ti = 1;
2938 k > 0; k--, ti = ((ti + n) << 2) + 1) {
2939 /* search the four nodes at this level, starting from
2940 * the left.
2941 */
2942 for (x = ti, n = 0; n < 4; n++) {
2943 /* sufficient free space found. move to the next
2944 * level (or quit if this is the last level).
2945 */
2946 if (l2nb <= tp->dmt_stree[x + n])
2947 break;
2948 }
2949
2950 /* better have found something since the higher
2951 * levels of the tree said it was here.
2952 */
2953 assert(n < 4);
2954 }
2955
2956 /* set the return to the leftmost leaf describing sufficient
2957 * free space.
2958 */
2959 *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
2960
2961 return (0);
2962}
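
A user-space sketch of the same descent on a toy two-level tree: root at index 0, four inner nodes at 1..4, sixteen leaves at 5..20, so height = 2 and leafidx = 5 (real dmtrees are deeper and carry this metadata in their headers). The maxima are bubbled up by hand, then the ((ti + n) << 2) + 1 step walks down to the leftmost adequate leaf:

    #include <stdio.h>

    static signed char stree[21];   /* 1 root + 4 inner + 16 leaves */

    static int find_leaf(int l2nb, int *leafidx)
    {
            int ti = 1, n = 0, x = 0, k;

            if (l2nb > stree[0])
                    return -1;      /* -ENOSPC in the kernel */
            for (k = 2; k > 0; k--, ti = ((ti + n) << 2) + 1)
                    for (x = ti, n = 0; n < 4; n++)
                            if (l2nb <= stree[x + n])
                                    break;
            *leafidx = x + n - 5;   /* leaves start at index 5 */
            return 0;
    }

    int main(void)
    {
            signed char leaves[16] = { 0, 1, 0, 2, 0, 0, 3, 1,
                                       0, 0, 0, 0, 2, 2, 1, 0 };
            int i, j, leaf;

            for (i = 0; i < 16; i++)
                    stree[5 + i] = leaves[i];
            for (i = 0; i < 4; i++)         /* bubble maxima up */
                    for (stree[1 + i] = -1, j = 0; j < 4; j++)
                            if (stree[5 + 4 * i + j] > stree[1 + i])
                                    stree[1 + i] = stree[5 + 4 * i + j];
            for (stree[0] = -1, i = 1; i <= 4; i++)
                    if (stree[i] > stree[0])
                            stree[0] = stree[i];

            if (find_leaf(3, &leaf) == 0)
                    printf("leaf %d\n", leaf);      /* prints: leaf 6 */
            return 0;
    }
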
2963
2964
2965/*
2966 * NAME: dbFindBits()
2967 *
2968 * FUNCTION: find a specified number of binary buddy free bits within a
2969 * dmap bitmap word value.
2970 *
2971 * this routine searches the bitmap value for (1 << l2nb) free
2972 * bits at (1 << l2nb) alignments within the value.
2973 *
2974 * PARAMETERS:
2975 * word - dmap bitmap word value.
2976 * l2nb - number of free bits specified as a log2 number.
2977 *
2978 * RETURN VALUES:
2979 * starting bit number of free bits.
2980 */
2981static int dbFindBits(u32 word, int l2nb)
2982{
2983 int bitno, nb;
2984 u32 mask;
2985
2986 /* get the number of bits.
2987 */
2988 nb = 1 << l2nb;
2989 assert(nb <= DBWORD);
2990
2991 /* complement the word so we can use a mask (i.e. 0s represent
2992 * free bits) and compute the mask.
2993 */
2994 word = ~word;
2995 mask = ONES << (DBWORD - nb);
2996
2997 /* scan the word for nb free bits at nb alignments.
2998 */
2999 for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
3000 if ((mask & word) == mask)
3001 break;
3002 }
3003
3004 ASSERT(bitno < 32);
3005
3006 /* return the bit number.
3007 */
3008 return (bitno);
3009}
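
Complementing the word turns free (zero) bits into ones, so an aligned run of nb free bits is exactly a position where (mask & word) == mask. The same scan in user space, assuming the DBWORD/ONES values from jfs_dmap.h:

    #include <stdio.h>
    #include <stdint.h>

    #define DBWORD 32
    #define ONES   0xffffffffu

    static int find_bits(uint32_t word, int l2nb)
    {
            int bitno, nb = 1 << l2nb;
            uint32_t mask;

            word = ~word;                   /* ones now mean "free" */
            mask = ONES << (DBWORD - nb);   /* nb bits at the top */
            for (bitno = 0; mask != 0; bitno += nb, mask >>= nb)
                    if ((mask & word) == mask)
                            break;
            return bitno;
    }

    int main(void)
    {
            /* top byte allocated, rest free: the first aligned free run
             * of 2^3 = 8 bits starts at bit 8 */
            printf("%d\n", find_bits(0xff000000u, 3));      /* prints 8 */
            return 0;
    }
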
3010
3011
3012/*
3013 * NAME: dbMaxBud(u8 *cp)
3014 *
3015 * FUNCTION: determine the largest binary buddy string of free
3016 * bits within 32-bits of the map.
3017 *
3018 * PARAMETERS:
3019 * cp - pointer to the 32-bit value.
3020 *
3021 * RETURN VALUES:
3022 * largest binary buddy of free bits within a dmap word.
3023 */
3024static int dbMaxBud(u8 * cp)
3025{
3026 signed char tmp1, tmp2;
3027
3028 /* check if the wmap word is all free. if so, the
3029 * free buddy size is BUDMIN.
3030 */
3031 if (*((uint *) cp) == 0)
3032 return (BUDMIN);
3033
3034 /* check if the wmap word is half free. if so, the
3035 * free buddy size is BUDMIN-1.
3036 */
3037 if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
3038 return (BUDMIN - 1);
3039
3040 /* not all free or half free. determine the free buddy
3041	 * size through table lookup using quarters of the wmap word.
3042 */
3043 tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
3044 tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
3045 return (max(tmp1, tmp2));
3046}
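
budtab (defined near the top of this file, outside this hunk) maps each possible byte of a wmap word to the largest aligned run of free bits it contains, expressed as an l2 size, with -1 (NOFREE) for a fully allocated byte. A sketch of how such an entry can be derived, to make the table's contract concrete (the generator below is illustrative, not the kernel's actual table):

    #include <stdio.h>

    /* l2 size of the largest aligned run of zero bits in byte v,
     * or -1 (NOFREE) if no aligned run is free */
    static signed char maxbud_byte(unsigned int v)
    {
            int l2;

            for (l2 = 3; l2 >= 0; l2--) {
                    int nb = 1 << l2, off;

                    for (off = 0; off + nb <= 8; off += nb) {
                            unsigned int mask = ((1u << nb) - 1)
                                                << (8 - nb - off);
                            if ((v & mask) == 0)
                                    return l2;
                    }
            }
            return -1;
    }

    int main(void)
    {
            printf("%d %d %d\n", maxbud_byte(0x00), maxbud_byte(0x01),
                   maxbud_byte(0xff));      /* prints: 3 2 -1 */
            return 0;
    }
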
3047
3048
3049/*
3050 * NAME: cnttz(uint word)
3051 *
3052 * FUNCTION: determine the number of trailing zeros within a 32-bit
3053 * value.
3054 *
3055 * PARAMETERS:
3056 *	word	- 32-bit value to be examined.
3057 *
3058 * RETURN VALUES:
3059 * count of trailing zeros
3060 */
3061static int cnttz(u32 word)
3062{
3063 int n;
3064
3065 for (n = 0; n < 32; n++, word >>= 1) {
3066 if (word & 0x01)
3067 break;
3068 }
3069
3070 return (n);
3071}
3072
3073
3074/*
3075 * NAME: cntlz(u32 value)
3076 *
3077 * FUNCTION: determine the number of leading zeros within a 32-bit
3078 * value.
3079 *
3080 * PARAMETERS:
3081 * value - 32-bit value to be examined.
3082 *
3083 * RETURN VALUES:
3084 * count of leading zeros
3085 */
3086static int cntlz(u32 value)
3087{
3088 int n;
3089
3090 for (n = 0; n < 32; n++, value <<= 1) {
3091 if (value & HIGHORDER)
3092 break;
3093 }
3094 return (n);
3095}
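
Both helpers are plain linear scans. For non-zero inputs they agree with the intrinsics most toolchains provide (__builtin_ctz/__builtin_clz on gcc and clang, whose result is undefined for 0, whereas cnttz(0) and cntlz(0) return 32). A quick user-space check of that equivalence:

    #include <assert.h>
    #include <stdio.h>
    #include <stdint.h>

    static int cnttz(uint32_t word)
    {
            int n;

            for (n = 0; n < 32; n++, word >>= 1)
                    if (word & 0x01)
                            break;
            return n;
    }

    int main(void)
    {
            uint32_t v = 0x000000a0;        /* ...10100000: 5 trailing zeros */

            assert(cnttz(v) == __builtin_ctz(v));   /* gcc/clang builtin */
            printf("%d\n", cnttz(v));               /* prints 5 */
            return 0;
    }
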
3096
3097
3098/*
3099 * NAME: blkstol2(s64 nb)
3100 *
3101 * FUNCTION:	convert a block count to its log2 value. if the block
3102 *	count is not a power of 2, it is rounded up to the next
3103 *	larger power of 2.
3104 *
3105 * PARAMETERS:
3106 * nb - number of blocks
3107 *
3108 * RETURN VALUES:
3109 * log2 number of blocks
3110 */
3111int blkstol2(s64 nb)
3112{
3113 int l2nb;
3114 s64 mask; /* meant to be signed */
3115
3116 mask = (s64) 1 << (64 - 1);
3117
3118 /* count the leading bits.
3119 */
3120 for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
3121 /* leading bit found.
3122 */
3123 if (nb & mask) {
3124 /* determine the l2 value.
3125 */
3126 l2nb = (64 - 1) - l2nb;
3127
3128 /* check if we need to round up.
3129 */
3130 if (~mask & nb)
3131 l2nb++;
3132
3133 return (l2nb);
3134 }
3135 }
3136 assert(0);
3137 return 0; /* fix compiler warning */
3138}
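
The "meant to be signed" comment is load-bearing: because right-shifting a negative s64 is arithmetic on the compilers the kernel targets, mask accumulates high bits as it moves down, so once the leading bit of nb has been found, ~mask & nb tests whether any lower bit is set and the count must round up. A small demonstration of that accumulating shift (technically implementation-defined in C, but arithmetic on gcc and clang):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int64_t mask = (int64_t) 1 << 63;

            /* the sign bit is dragged along, widening the mask */
            printf("%016llx\n", (unsigned long long) mask);         /* 8000... */
            printf("%016llx\n", (unsigned long long) (mask >> 1));  /* c000... */
            printf("%016llx\n", (unsigned long long) (mask >> 3));  /* f000... */
            return 0;
    }
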
3139
3140
3141/*
3142 * NAME: dbAllocBottomUp()
3143 *
3144 * FUNCTION:	allocate the specified block range from the working block
3145 *		allocation map.
3146 *
3147 *		the blocks will be allocated from the working map one dmap
3148 *		at a time.
3149 *
3150 * PARAMETERS:
3151 * ip - pointer to in-core inode;
3152 *	blkno	- starting block number to be allocated.
3153 *	nblocks	- number of blocks to be allocated.
3154 *
3155 * RETURN VALUES:
3156 * 0 - success
3157 * -EIO - i/o error
3158 */
3159int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3160{
3161 struct metapage *mp;
3162 struct dmap *dp;
3163 int nb, rc;
3164 s64 lblkno, rem;
3165 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
3166 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
3167
3168 IREAD_LOCK(ipbmap);
3169
3170 /* block to be allocated better be within the mapsize. */
3171 ASSERT(nblocks <= bmp->db_mapsize - blkno);
3172
3173 /*
3174 * allocate the blocks a dmap at a time.
3175 */
3176 mp = NULL;
3177 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
3178 /* release previous dmap if any */
3179 if (mp) {
3180 write_metapage(mp);
3181 }
3182
3183 /* get the buffer for the current dmap. */
3184 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
3185 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
3186 if (mp == NULL) {
3187 IREAD_UNLOCK(ipbmap);
3188 return -EIO;
3189 }
3190 dp = (struct dmap *) mp->data;
3191
3192 /* determine the number of blocks to be allocated from
3193 * this dmap.
3194 */
3195 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
3196
3197 DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
3198
3199 /* allocate the blocks. */
3200 if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
3201 release_metapage(mp);
3202 IREAD_UNLOCK(ipbmap);
3203 return (rc);
3204 }
3205
3206 DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
3207 }
3208
3209 /* write the last buffer. */
3210 write_metapage(mp);
3211
3212 IREAD_UNLOCK(ipbmap);
3213
3214 return (0);
3215}
3216
3217
3218static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3219 int nblocks)
3220{
3221 int rc;
3222 int dbitno, word, rembits, nb, nwords, wbitno, agno;
3223 s8 oldroot, *leaf;
3224 struct dmaptree *tp = (struct dmaptree *) & dp->tree;
3225
3226 /* save the current value of the root (i.e. maximum free string)
3227 * of the dmap tree.
3228 */
3229 oldroot = tp->stree[ROOT];
3230
3231 /* pick up a pointer to the leaves of the dmap tree */
3232 leaf = tp->stree + LEAFIND;
3233
3234 /* determine the bit number and word within the dmap of the
3235 * starting block.
3236 */
3237 dbitno = blkno & (BPERDMAP - 1);
3238 word = dbitno >> L2DBWORD;
3239
3240 /* block range better be within the dmap */
3241 assert(dbitno + nblocks <= BPERDMAP);
3242
3243 /* allocate the bits of the dmap's words corresponding to the block
3244 * range. not all bits of the first and last words may be contained
3245 * within the block range. if this is the case, we'll work against
3246 * those words (i.e. partial first and/or last) on an individual basis
3247 * (a single pass), allocating the bits of interest by hand and
3248 * updating the leaf corresponding to the dmap word. a single pass
3249 * will be used for all dmap words fully contained within the
3250 * specified range. within this pass, the bits of all fully contained
3251	 * dmap words will be marked as allocated in a single shot and the leaves
3252 * will be updated. a single leaf may describe the free space of
3253 * multiple dmap words, so we may update only a subset of the actual
3254 * leaves corresponding to the dmap words of the block range.
3255 */
3256 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
3257 /* determine the bit number within the word and
3258 * the number of bits within the word.
3259 */
3260 wbitno = dbitno & (DBWORD - 1);
3261 nb = min(rembits, DBWORD - wbitno);
3262
3263 /* check if only part of a word is to be allocated.
3264 */
3265 if (nb < DBWORD) {
3266 /* allocate (set to 1) the appropriate bits within
3267 * this dmap word.
3268 */
3269 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
3270 >> wbitno);
3271
3272 word++;
3273 } else {
3274 /* one or more dmap words are fully contained
3275 * within the block range. determine how many
3276 * words and allocate (set to 1) the bits of these
3277 * words.
3278 */
3279 nwords = rembits >> L2DBWORD;
3280 memset(&dp->wmap[word], (int) ONES, nwords * 4);
3281
3282 /* determine how many bits */
3283 nb = nwords << L2DBWORD;
3284 word += nwords;
3285 }
3286 }
3287
3288 /* update the free count for this dmap */
3289 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
3290
3291 /* reconstruct summary tree */
3292 dbInitDmapTree(dp);
3293
3294 BMAP_LOCK(bmp);
3295
3296	/* if this allocation group was completely free before this
3297	 * allocation, it may lie beyond the current rightmost active
3298	 * group; if so, update the highest active allocation group number.
3299	 */
3300 agno = blkno >> bmp->db_agl2size;
3301 if (agno > bmp->db_maxag)
3302 bmp->db_maxag = agno;
3303
3304 /* update the free count for the allocation group and map */
3305 bmp->db_agfree[agno] -= nblocks;
3306 bmp->db_nfree -= nblocks;
3307
3308 BMAP_UNLOCK(bmp);
3309
3310 /* if the root has not changed, done. */
3311 if (tp->stree[ROOT] == oldroot)
3312 return (0);
3313
3314 /* root changed. bubble the change up to the dmap control pages.
3315 * if the adjustment of the upper level control pages fails,
3316 * backout the bit allocation (thus making everything consistent).
3317 */
3318 if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
3319 dbFreeBits(bmp, dp, blkno, nblocks);
3320
3321 return (rc);
3322}
3323
3324
3325/*
3326 * NAME: dbExtendFS()
3327 *
3328 * FUNCTION: extend bmap from blkno for nblocks;
3329 * dbExtendFS() updates bmap ready for dbAllocBottomUp();
3330 *
3331 * L2
3332 * |
3333 * L1---------------------------------L1
3334 * | |
3335 * L0---------L0---------L0 L0---------L0---------L0
3336 * | | | | | |
3337 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm;
3338 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3339 *
3340 * <---old---><----------------------------extend----------------------->
3341 */
3342int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3343{
3344 struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
3345 int nbperpage = sbi->nbperpage;
3346 int i, i0 = TRUE, j, j0 = TRUE, k, n;
3347 s64 newsize;
3348 s64 p;
3349 struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
3350 struct dmapctl *l2dcp, *l1dcp, *l0dcp;
3351 struct dmap *dp;
3352 s8 *l0leaf, *l1leaf, *l2leaf;
3353 struct bmap *bmp = sbi->bmap;
3354 int agno, l2agsize, oldl2agsize;
3355 s64 ag_rem;
3356
3357 newsize = blkno + nblocks;
3358
3359 jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
3360 (long long) blkno, (long long) nblocks, (long long) newsize);
3361
3362 /*
3363 * initialize bmap control page.
3364 *
3365 * all the data in bmap control page should exclude
3366 * the mkfs hidden dmap page.
3367 */
3368
3369 /* update mapsize */
3370 bmp->db_mapsize = newsize;
3371 bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
3372
3373 /* compute new AG size */
3374 l2agsize = dbGetL2AGSize(newsize);
3375 oldl2agsize = bmp->db_agl2size;
3376
3377 bmp->db_agl2size = l2agsize;
3378 bmp->db_agsize = 1 << l2agsize;
3379
3380 /* compute new number of AG */
3381 agno = bmp->db_numag;
3382 bmp->db_numag = newsize >> l2agsize;
3383 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
3384
3385 /*
3386 * reconfigure db_agfree[]
3387 * from old AG configuration to new AG configuration;
3388 *
3389 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
3390 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
3391 * note: new AG size = old AG size * (2**x).
3392 */
3393 if (l2agsize == oldl2agsize)
3394 goto extend;
3395 k = 1 << (l2agsize - oldl2agsize);
3396 ag_rem = bmp->db_agfree[0]; /* save agfree[0] */
3397 for (i = 0, n = 0; i < agno; n++) {
3398 bmp->db_agfree[n] = 0; /* init collection point */
3399
3400		/* coalesce contiguous k AGs; */
3401 for (j = 0; j < k && i < agno; j++, i++) {
3402 /* merge AGi to AGn */
3403 bmp->db_agfree[n] += bmp->db_agfree[i];
3404 }
3405 }
3406 bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */
3407
3408 for (; n < MAXAG; n++)
3409 bmp->db_agfree[n] = 0;
3410
3411 /*
3412 * update highest active ag number
3413 */
3414
3415 bmp->db_maxag = bmp->db_maxag / k;
3416
3417 /*
3418 * extend bmap
3419 *
3420 * update bit maps and corresponding level control pages;
3421 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
3422 */
3423 extend:
3424 /* get L2 page */
3425 p = BMAPBLKNO + nbperpage; /* L2 page */
3426 l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3427 if (!l2mp) {
3428 jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
3429 return -EIO;
3430 }
3431 l2dcp = (struct dmapctl *) l2mp->data;
3432
3433 /* compute start L1 */
3434 k = blkno >> L2MAXL1SIZE;
3435 l2leaf = l2dcp->stree + CTLLEAFIND + k;
3436 p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */
3437
3438 /*
3439 * extend each L1 in L2
3440 */
3441 for (; k < LPERCTL; k++, p += nbperpage) {
3442 /* get L1 page */
3443 if (j0) {
3444 /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
3445 l1mp = read_metapage(ipbmap, p, PSIZE, 0);
3446 if (l1mp == NULL)
3447 goto errout;
3448 l1dcp = (struct dmapctl *) l1mp->data;
3449
3450 /* compute start L0 */
3451 j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
3452 l1leaf = l1dcp->stree + CTLLEAFIND + j;
3453 p = BLKTOL0(blkno, sbi->l2nbperpage);
3454 j0 = FALSE;
3455 } else {
3456 /* assign/init L1 page */
3457 l1mp = get_metapage(ipbmap, p, PSIZE, 0);
3458 if (l1mp == NULL)
3459 goto errout;
3460
3461 l1dcp = (struct dmapctl *) l1mp->data;
3462
3463 /* compute start L0 */
3464 j = 0;
3465 l1leaf = l1dcp->stree + CTLLEAFIND;
3466 p += nbperpage; /* 1st L0 of L1.k */
3467 }
3468
3469 /*
3470 * extend each L0 in L1
3471 */
3472 for (; j < LPERCTL; j++) {
3473 /* get L0 page */
3474 if (i0) {
3475 /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
3476
3477 l0mp = read_metapage(ipbmap, p, PSIZE, 0);
3478 if (l0mp == NULL)
3479 goto errout;
3480 l0dcp = (struct dmapctl *) l0mp->data;
3481
3482 /* compute start dmap */
3483 i = (blkno & (MAXL0SIZE - 1)) >>
3484 L2BPERDMAP;
3485 l0leaf = l0dcp->stree + CTLLEAFIND + i;
3486 p = BLKTODMAP(blkno,
3487 sbi->l2nbperpage);
3488 i0 = FALSE;
3489 } else {
3490 /* assign/init L0 page */
3491 l0mp = get_metapage(ipbmap, p, PSIZE, 0);
3492 if (l0mp == NULL)
3493 goto errout;
3494
3495 l0dcp = (struct dmapctl *) l0mp->data;
3496
3497 /* compute start dmap */
3498 i = 0;
3499 l0leaf = l0dcp->stree + CTLLEAFIND;
3500 p += nbperpage; /* 1st dmap of L0.j */
3501 }
3502
3503 /*
3504 * extend each dmap in L0
3505 */
3506 for (; i < LPERCTL; i++) {
3507 /*
3508 * reconstruct the dmap page, and
3509 * initialize corresponding parent L0 leaf
3510 */
3511 if ((n = blkno & (BPERDMAP - 1))) {
3512 /* read in dmap page: */
3513 mp = read_metapage(ipbmap, p,
3514 PSIZE, 0);
3515 if (mp == NULL)
3516 goto errout;
3517 n = min(nblocks, (s64)BPERDMAP - n);
3518 } else {
3519 /* assign/init dmap page */
3520 mp = read_metapage(ipbmap, p,
3521 PSIZE, 0);
3522 if (mp == NULL)
3523 goto errout;
3524
3525 n = min(nblocks, (s64)BPERDMAP);
3526 }
3527
3528 dp = (struct dmap *) mp->data;
3529 *l0leaf = dbInitDmap(dp, blkno, n);
3530
3531 bmp->db_nfree += n;
3532 agno = le64_to_cpu(dp->start) >> l2agsize;
3533 bmp->db_agfree[agno] += n;
3534
3535 write_metapage(mp);
3536
3537 l0leaf++;
3538 p += nbperpage;
3539
3540 blkno += n;
3541 nblocks -= n;
3542 if (nblocks == 0)
3543 break;
3544 } /* for each dmap in a L0 */
3545
3546 /*
3547 * build current L0 page from its leaves, and
3548 * initialize corresponding parent L1 leaf
3549 */
3550 *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
3551 write_metapage(l0mp);
3552 l0mp = NULL;
3553
3554 if (nblocks)
3555 l1leaf++; /* continue for next L0 */
3556 else {
3557 /* more than 1 L0 ? */
3558 if (j > 0)
3559 break; /* build L1 page */
3560 else {
3561 /* summarize in global bmap page */
3562 bmp->db_maxfreebud = *l1leaf;
3563 release_metapage(l1mp);
3564 release_metapage(l2mp);
3565 goto finalize;
3566 }
3567 }
3568 } /* for each L0 in a L1 */
3569
3570 /*
3571 * build current L1 page from its leaves, and
3572 * initialize corresponding parent L2 leaf
3573 */
3574 *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
3575 write_metapage(l1mp);
3576 l1mp = NULL;
3577
3578 if (nblocks)
3579 l2leaf++; /* continue for next L1 */
3580 else {
3581 /* more than 1 L1 ? */
3582 if (k > 0)
3583 break; /* build L2 page */
3584 else {
3585 /* summarize in global bmap page */
3586 bmp->db_maxfreebud = *l2leaf;
3587 release_metapage(l2mp);
3588 goto finalize;
3589 }
3590 }
3591 } /* for each L1 in a L2 */
3592
3593 jfs_error(ipbmap->i_sb,
3594 "dbExtendFS: function has not returned as expected");
3595errout:
3596 if (l0mp)
3597 release_metapage(l0mp);
3598 if (l1mp)
3599 release_metapage(l1mp);
3600 release_metapage(l2mp);
3601 return -EIO;
3602
3603 /*
3604 * finalize bmap control page
3605 */
3606finalize:
3607
3608 return 0;
3609}
3610
3611
3612/*
3613 * dbFinalizeBmap()
3614 */
3615void dbFinalizeBmap(struct inode *ipbmap)
3616{
3617 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
3618 int actags, inactags, l2nl;
3619 s64 ag_rem, actfree, inactfree, avgfree;
3620 int i, n;
3621
3622 /*
3623 * finalize bmap control page
3624 */
3625//finalize:
3626 /*
3627 * compute db_agpref: preferred ag to allocate from
3628 * (the leftmost ag with average free space in it);
3629 */
3630//agpref:
3631	/* get the number of active ags and inactive ags */
3632 actags = bmp->db_maxag + 1;
3633 inactags = bmp->db_numag - actags;
3634	ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* blocks in a partial last ag */
3635
3636 /* determine how many blocks are in the inactive allocation
3637 * groups. in doing this, we must account for the fact that
3638 * the rightmost group might be a partial group (i.e. file
3639 * system size is not a multiple of the group size).
3640 */
3641 inactfree = (inactags && ag_rem) ?
3642 ((inactags - 1) << bmp->db_agl2size) + ag_rem
3643 : inactags << bmp->db_agl2size;
3644
3645 /* determine how many free blocks are in the active
3646 * allocation groups plus the average number of free blocks
3647 * within the active ags.
3648 */
3649 actfree = bmp->db_nfree - inactfree;
3650 avgfree = (u32) actfree / (u32) actags;
3651
3652	/* if the preferred allocation group does not have average free
3653	 * space, re-establish the preferred group as the leftmost
3654	 * group with average free space.
3655	 */
3656 if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
3657 for (bmp->db_agpref = 0; bmp->db_agpref < actags;
3658 bmp->db_agpref++) {
3659 if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
3660 break;
3661 }
3662 if (bmp->db_agpref >= bmp->db_numag) {
3663 jfs_error(ipbmap->i_sb,
3664 "cannot find ag with average freespace");
3665 }
3666 }
3667
3668 /*
3669	 * compute db_aglevel, db_agheigth, db_agwidth, db_agstart:
3670	 * an ag is covered by the level-aglevel dmapctl summary tree,
3671	 * at height agheigth (counted from the leaves) with agwidth nodes
3672	 * at that height, starting at index agstart of the summary tree
3673	 * node array;
3674 */
3675 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3676 l2nl =
3677 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3678 bmp->db_agheigth = l2nl >> 1;
3679 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
3680 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
3681 i--) {
3682 bmp->db_agstart += n;
3683 n <<= 2;
3684 }
3685
3686}
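
Worked example of the geometry above, with the jfs_dmap.h values L2BPERDMAP = 13 and L2LPERCTL = 10 assumed, for hypothetical 2^25-block allocation groups (covered by a level-1 dmapctl): l2nl = 25 - (13 + 10) = 2, so each ag is summarized agheigth = 1 level above the leaves by agwidth = 1 node, and agstart = 85 is where that level begins in the flattened height-5 tree (the field really is spelled "agheigth" in the source):

    #include <stdio.h>

    #define L2BPERDMAP 13   /* log2 blocks covered by one dmap */
    #define L2LPERCTL  10   /* log2 leaves per dmapctl */

    int main(void)
    {
            int agl2size = 25, aglevel = 1;
            int l2nl, agheigth, agwidth, agstart, i, n;

            l2nl = agl2size - (L2BPERDMAP + aglevel * L2LPERCTL);
            agheigth = l2nl >> 1;                    /* levels above the leaves */
            agwidth = 1 << (l2nl - (agheigth << 1)); /* nodes at that level */

            /* first array index of tree level 5 - agheigth */
            for (i = 5 - agheigth, agstart = 0, n = 1; i > 0; i--) {
                    agstart += n;
                    n <<= 2;
            }
            printf("l2nl=%d agheigth=%d agwidth=%d agstart=%d\n",
                   l2nl, agheigth, agwidth, agstart);
            /* prints: l2nl=2 agheigth=1 agwidth=1 agstart=85 */
            return 0;
    }
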
3687
3688
3689/*
3690 * NAME: dbInitDmap()/ujfs_idmap_page()
3691 *
3692 * FUNCTION: initialize working/persistent bitmap of the dmap page
3693 * for the specified number of blocks:
3694 *
3695 *	at entry, the bitmaps have been initialized as free (ZEROS);
3696 * The number of blocks will only account for the actually
3697 * existing blocks. Blocks which don't actually exist in
3698 * the aggregate will be marked as allocated (ONES);
3699 *
3700 * PARAMETERS:
3701 *	dp	- pointer to page of map; Blkno - starting block number
3702 *	nblocks	- number of blocks covered by this page
3703 *
3704 * RETURNS: max free string at the root of the dmap's summary tree
3705 */
3706static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
3707{
3708 int blkno, w, b, r, nw, nb, i;
3709
3710 /* starting block number within the dmap */
3711 blkno = Blkno & (BPERDMAP - 1);
3712
3713 if (blkno == 0) {
3714 dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
3715 dp->start = cpu_to_le64(Blkno);
3716
3717 if (nblocks == BPERDMAP) {
3718 memset(&dp->wmap[0], 0, LPERDMAP * 4);
3719 memset(&dp->pmap[0], 0, LPERDMAP * 4);
3720 goto initTree;
3721 }
3722 } else {
3723 dp->nblocks =
3724 cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
3725 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
3726 }
3727
3728 /* word number containing start block number */
3729 w = blkno >> L2DBWORD;
3730
3731 /*
3732 * free the bits corresponding to the block range (ZEROS):
3733 * note: not all bits of the first and last words may be contained
3734 * within the block range.
3735 */
3736 for (r = nblocks; r > 0; r -= nb, blkno += nb) {
3737 /* number of bits preceding range to be freed in the word */
3738 b = blkno & (DBWORD - 1);
3739 /* number of bits to free in the word */
3740 nb = min(r, DBWORD - b);
3741
3742 /* is partial word to be freed ? */
3743 if (nb < DBWORD) {
3744 /* free (set to 0) from the bitmap word */
3745 dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3746 >> b));
3747 dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3748 >> b));
3749
3750 /* skip the word freed */
3751 w++;
3752 } else {
3753 /* free (set to 0) contiguous bitmap words */
3754 nw = r >> L2DBWORD;
3755 memset(&dp->wmap[w], 0, nw * 4);
3756 memset(&dp->pmap[w], 0, nw * 4);
3757
3758 /* skip the words freed */
3759 nb = nw << L2DBWORD;
3760 w += nw;
3761 }
3762 }
3763
3764 /*
3765 * mark bits following the range to be freed (non-existing
3766 * blocks) as allocated (ONES)
3767 */
3768
3769 if (blkno == BPERDMAP)
3770 goto initTree;
3771
3772 /* the first word beyond the end of existing blocks */
3773 w = blkno >> L2DBWORD;
3774
3775 /* does nblocks fall on a 32-bit boundary ? */
3776 b = blkno & (DBWORD - 1);
3777 if (b) {
3778 /* mark a partial word allocated */
3779 dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
3780 w++;
3781 }
3782
3783 /* set the rest of the words in the page to allocated (ONES) */
3784 for (i = w; i < LPERDMAP; i++)
3785 dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
3786
3787 /*
3788 * init tree
3789 */
3790 initTree:
3791 return (dbInitDmapTree(dp));
3792}
3793
3794
3795/*
3796 * NAME: dbInitDmapTree()/ujfs_complete_dmap()
3797 *
3798 * FUNCTION: initialize summary tree of the specified dmap:
3799 *
3800 * at entry, bitmap of the dmap has been initialized;
3801 *
3802 * PARAMETERS:
3803 *	dp	- dmap whose summary tree is to be initialized; its
3804 *		  bitmap (wmap) must already be set up, since each leaf
3805 *		  is derived from the corresponding wmap word
3806 *
3807 * RETURNS: max free string at the root of the tree
3808 */
3809static int dbInitDmapTree(struct dmap * dp)
3810{
3811 struct dmaptree *tp;
3812 s8 *cp;
3813 int i;
3814
3815 /* init fixed info of tree */
3816 tp = &dp->tree;
3817 tp->nleafs = cpu_to_le32(LPERDMAP);
3818 tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
3819 tp->leafidx = cpu_to_le32(LEAFIND);
3820 tp->height = cpu_to_le32(4);
3821 tp->budmin = BUDMIN;
3822
3823 /* init each leaf from corresponding wmap word:
3824 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
3825 * bitmap word are allocated.
3826 */
3827 cp = tp->stree + le32_to_cpu(tp->leafidx);
3828 for (i = 0; i < LPERDMAP; i++)
3829 *cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
3830
3831 /* build the dmap's binary buddy summary tree */
3832 return (dbInitTree(tp));
3833}
3834
3835
3836/*
3837 * NAME: dbInitTree()/ujfs_adjtree()
3838 *
3839 * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl.
3840 *
3841 *	at entry, the leaves of the tree have been initialized
3842 * from corresponding bitmap word or root of summary tree
3843 * of the child control page;
3844 * configure binary buddy system at the leaf level, then
3845 * bubble up the values of the leaf nodes up the tree.
3846 *
3847 * PARAMETERS:
3848 *	dtp	- pointer to the dmap or dmapctl tree to be initialized;
3849 *		  the number of leaves (l2nleafs) and the minimum l2
3850 *		  number of blocks covered by a leaf (budmin) are taken
3851 *		  from the tree header itself
3852 *
3853 * RETURNS: max free string at the root of the tree
3854 */
3855static int dbInitTree(struct dmaptree * dtp)
3856{
3857 int l2max, l2free, bsize, nextb, i;
3858 int child, parent, nparent;
3859 s8 *tp, *cp, *cp1;
3860
3861 tp = dtp->stree;
3862
3863 /* Determine the maximum free string possible for the leaves */
3864 l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
3865
3866 /*
3867	 * configure the leaf level into a binary buddy system
3868 *
3869 * Try to combine buddies starting with a buddy size of 1
3870 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
3871 * can be combined if both buddies have a maximum free of l2min;
3872 * the combination will result in the left-most buddy leaf having
3873 * a maximum free of l2min+1.
3874 * After processing all buddies for a given size, process buddies
3875 * at the next higher buddy size (i.e. current size * 2) and
3876 * the next maximum free (current free + 1).
3877 * This continues until the maximum possible buddy combination
3878 * yields maximum free.
3879 */
3880 for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
3881 l2free++, bsize = nextb) {
3882 /* get next buddy size == current buddy pair size */
3883 nextb = bsize << 1;
3884
3885 /* scan each adjacent buddy pair at current buddy size */
3886 for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
3887 i < le32_to_cpu(dtp->nleafs);
3888 i += nextb, cp += nextb) {
3889 /* coalesce if both adjacent buddies are max free */
3890 if (*cp == l2free && *(cp + bsize) == l2free) {
3891 *cp = l2free + 1; /* left take right */
3892 *(cp + bsize) = -1; /* right give left */
3893 }
3894 }
3895 }
3896
3897 /*
3898 * bubble summary information of leaves up the tree.
3899 *
3900 * Starting at the leaf node level, the four nodes described by
3901 * the higher level parent node are compared for a maximum free and
3902 * this maximum becomes the value of the parent node.
3903 * when all lower level nodes are processed in this fashion then
3904 * move up to the next level (parent becomes a lower level node) and
3905 * continue the process for that level.
3906 */
3907 for (child = le32_to_cpu(dtp->leafidx),
3908 nparent = le32_to_cpu(dtp->nleafs) >> 2;
3909 nparent > 0; nparent >>= 2, child = parent) {
3910 /* get index of 1st node of parent level */
3911 parent = (child - 1) >> 2;
3912
3913 /* set the value of the parent node as the maximum
3914 * of the four nodes of the current level.
3915 */
3916 for (i = 0, cp = tp + child, cp1 = tp + parent;
3917 i < nparent; i++, cp += 4, cp1++)
3918 *cp1 = TREEMAX(cp);
3919 }
3920
3921 return (*tp);
3922}
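
A user-space run of just the coalescing pass on eight fully free leaves (budmin assumed 0, so l2max = 3): each pass merges adjacent equal buddies, the left leaf absorbing the right, until one leaf describes the whole run:

    #include <stdio.h>

    int main(void)
    {
            signed char leaf[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
            int l2free, bsize, nextb, i;

            for (l2free = 0, bsize = 1; l2free < 3; l2free++, bsize = nextb) {
                    nextb = bsize << 1;
                    for (i = 0; i < 8; i += nextb)
                            if (leaf[i] == l2free && leaf[i + bsize] == l2free) {
                                    leaf[i] = l2free + 1;   /* left takes right */
                                    leaf[i + bsize] = -1;   /* NOFREE */
                            }
            }
            for (i = 0; i < 8; i++)
                    printf("%d ", leaf[i]);
            printf("\n");   /* prints: 3 -1 -1 -1 -1 -1 -1 -1 */
            return 0;
    }
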
3923
3924
3925/*
3926 * dbInitDmapCtl()
3927 *
3928 * function: initialize dmapctl page
3929 */
3930static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
3931{ /* start leaf index not covered by range */
3932 s8 *cp;
3933
3934 dcp->nleafs = cpu_to_le32(LPERCTL);
3935 dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
3936 dcp->leafidx = cpu_to_le32(CTLLEAFIND);
3937 dcp->height = cpu_to_le32(5);
3938 dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
3939
3940 /*
3941 * initialize the leaves of current level that were not covered
3942 * by the specified input block range (i.e. the leaves have no
3943 * low level dmapctl or dmap).
3944 */
3945 cp = &dcp->stree[CTLLEAFIND + i];
3946 for (; i < LPERCTL; i++)
3947 *cp++ = NOFREE;
3948
3949 /* build the dmap's binary buddy summary tree */
3950 return (dbInitTree((struct dmaptree *) dcp));
3951}
3952
3953
3954/*
3955 * NAME: dbGetL2AGSize()/ujfs_getagl2size()
3956 *
3957 * FUNCTION: Determine log2(allocation group size) from aggregate size
3958 *
3959 * PARAMETERS:
3960 * nblocks - Number of blocks in aggregate
3961 *
3962 * RETURNS: log2(allocation group size) in aggregate blocks
3963 */
3964static int dbGetL2AGSize(s64 nblocks)
3965{
3966 s64 sz;
3967 s64 m;
3968 int l2sz;
3969
3970 if (nblocks < BPERDMAP * MAXAG)
3971 return (L2BPERDMAP);
3972
3973 /* round up aggregate size to power of 2 */
3974 m = ((u64) 1 << (64 - 1));
3975 for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
3976 if (m & nblocks)
3977 break;
3978 }
3979
3980 sz = (s64) 1 << l2sz;
3981 if (sz < nblocks)
3982 l2sz += 1;
3983
3984 /* agsize = roundupSize/max_number_of_ag */
3985 return (l2sz - L2MAXAG);
3986}
3987
3988
3989/*
3990 * NAME: dbMapFileSizeToMapSize()
3991 *
3992 * FUNCTION: compute number of blocks the block allocation map file
3993 * can cover from the map file size;
3994 *
3995 * RETURNS: Number of blocks which can be covered by this block map file;
3996 */
3997
3998/*
3999 * maximum number of map pages at each level including control pages
4000 */
4001#define MAXL0PAGES (1 + LPERCTL)
4002#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES)
4003#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES)
4004
4005/*
4006 * convert number of map pages to the zero origin top dmapctl level
4007 */
4008#define BMAPPGTOLEV(npages) \
4009 (((npages) <= 3 + MAXL0PAGES) ? 0 \
4010 : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
4011
4012s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
4013{
4014 struct super_block *sb = ipbmap->i_sb;
4015 s64 nblocks;
4016 s64 npages, ndmaps;
4017 int level, i;
4018 int complete, factor;
4019
4020 nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
4021 npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
4022 level = BMAPPGTOLEV(npages);
4023
4024 /* At each level, accumulate the number of dmap pages covered by
4025 * the number of full child levels below it;
4026 * repeat for the last incomplete child level.
4027 */
4028 ndmaps = 0;
4029 npages--; /* skip the first global control page */
4030 /* skip higher level control pages above top level covered by map */
4031 npages -= (2 - level);
4032 npages--; /* skip top level's control page */
4033 for (i = level; i >= 0; i--) {
4034 factor =
4035 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
4036 complete = (u32) npages / factor;
4037 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL
4038 : ((i == 1) ? LPERCTL : 1));
4039
4040 /* pages in last/incomplete child */
4041 npages = (u32) npages % factor;
4042 /* skip incomplete child's level control page */
4043 npages--;
4044 }
4045
4046 /* convert the number of dmaps into the number of blocks
4047 * which can be covered by the dmaps;
4048 */
4049 nblocks = ndmaps << L2BPERDMAP;
4050
4051 return (nblocks);
4052}
4053
4054
4055#ifdef _JFS_DEBUG_DMAP
4056/*
4057 * DBinitmap()
4058 */
4059static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results)
4060{
4061 int npages;
4062 u32 *dbmap, *d;
4063 int n;
4064 s64 lblkno, cur_block;
4065 struct dmap *dp;
4066 struct metapage *mp;
4067
4068 npages = size / 32768;
4069 npages += (size % 32768) ? 1 : 0;
4070
4071 dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap);
4072 if (dbmap == NULL)
4073 BUG(); /* Not robust since this is only unused debug code */
4074
4075 for (n = 0, d = dbmap; n < npages; n++, d += 1024)
4076		memset(d, 0, 4096);
4077
4078 /* Need to initialize from disk map pages
4079 */
4080 for (d = dbmap, cur_block = 0; cur_block < size;
4081 cur_block += BPERDMAP, d += LPERDMAP) {
4082 lblkno = BLKTODMAP(cur_block,
4083 JFS_SBI(ipbmap->i_sb)->bmap->
4084 db_l2nbperpage);
4085 mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
4086 if (mp == NULL) {
4087 jfs_error(ipbmap->i_sb,
4088 "DBinitmap: could not read disk map page");
4089 continue;
4090 }
4091 dp = (struct dmap *) mp->data;
4092
4093 for (n = 0; n < LPERDMAP; n++)
4094 d[n] = le32_to_cpu(dp->wmap[n]);
4095
4096 release_metapage(mp);
4097 }
4098
4099 *results = dbmap;
4100}
4101
4102
4103/*
4104 * DBAlloc()
4105 */
4106void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4107{
4108 int word, nb, bitno;
4109 u32 mask;
4110
4111 assert(blkno > 0 && blkno < mapsize);
4112 assert(nblocks > 0 && nblocks <= mapsize);
4113
4114 assert(blkno + nblocks <= mapsize);
4115
4116 dbmap += (blkno / 32);
4117 while (nblocks > 0) {
4118 bitno = blkno & (32 - 1);
4119 nb = min(nblocks, 32 - bitno);
4120
4121 mask = (0xffffffff << (32 - nb) >> bitno);
4122 assert((mask & *dbmap) == 0);
4123 *dbmap |= mask;
4124
4125 dbmap++;
4126 blkno += nb;
4127 nblocks -= nb;
4128 }
4129}
4130
4131
4132/*
4133 * DBFree()
4134 */
4135static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4136{
4137 int word, nb, bitno;
4138 u32 mask;
4139
4140 assert(blkno > 0 && blkno < mapsize);
4141 assert(nblocks > 0 && nblocks <= mapsize);
4142
4143 assert(blkno + nblocks <= mapsize);
4144
4145 dbmap += (blkno / 32);
4146 while (nblocks > 0) {
4147 bitno = blkno & (32 - 1);
4148 nb = min(nblocks, 32 - bitno);
4149
4150 mask = (0xffffffff << (32 - nb) >> bitno);
4151 assert((mask & *dbmap) == mask);
4152 *dbmap &= ~mask;
4153
4154 dbmap++;
4155 blkno += nb;
4156 nblocks -= nb;
4157 }
4158}
4159
4160
4161/*
4162 * DBAllocCK()
4163 */
4164static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4165{
4166 int word, nb, bitno;
4167 u32 mask;
4168
4169 assert(blkno > 0 && blkno < mapsize);
4170 assert(nblocks > 0 && nblocks <= mapsize);
4171
4172 assert(blkno + nblocks <= mapsize);
4173
4174 dbmap += (blkno / 32);
4175 while (nblocks > 0) {
4176 bitno = blkno & (32 - 1);
4177 nb = min(nblocks, 32 - bitno);
4178
4179 mask = (0xffffffff << (32 - nb) >> bitno);
4180 assert((mask & *dbmap) == mask);
4181
4182 dbmap++;
4183 blkno += nb;
4184 nblocks -= nb;
4185 }
4186}
4187
4188
4189/*
4190 * DBFreeCK()
4191 */
4192static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4193{
4194 int word, nb, bitno;
4195 u32 mask;
4196
4197 assert(blkno > 0 && blkno < mapsize);
4198 assert(nblocks > 0 && nblocks <= mapsize);
4199
4200 assert(blkno + nblocks <= mapsize);
4201
4202 dbmap += (blkno / 32);
4203 while (nblocks > 0) {
4204 bitno = blkno & (32 - 1);
4205 nb = min(nblocks, 32 - bitno);
4206
4207 mask = (0xffffffff << (32 - nb) >> bitno);
4208 assert((mask & *dbmap) == 0);
4209
4210 dbmap++;
4211 blkno += nb;
4212 nblocks -= nb;
4213 }
4214}
4215
4216
4217/*
4218 * dbPrtMap()
4219 */
4220static void dbPrtMap(struct bmap * bmp)
4221{
4222 printk(" mapsize: %lld\n", (long long) bmp->db_mapsize);
4223 printk(" nfree: %lld\n", (long long) bmp->db_nfree);
4224 printk(" numag: %d\n", bmp->db_numag);
4225 printk(" agsize: %lld\n", (long long) bmp->db_agsize);
4226 printk(" agl2size: %d\n", bmp->db_agl2size);
4227 printk(" agwidth: %d\n", bmp->db_agwidth);
4228 printk(" agstart: %d\n", bmp->db_agstart);
4229 printk(" agheigth: %d\n", bmp->db_agheigth);
4230 printk(" aglevel: %d\n", bmp->db_aglevel);
4231 printk(" maxlevel: %d\n", bmp->db_maxlevel);
4232 printk(" maxag: %d\n", bmp->db_maxag);
4233 printk(" agpref: %d\n", bmp->db_agpref);
4234 printk(" l2nbppg: %d\n", bmp->db_l2nbperpage);
4236
4237
4238/*
4239 * dbPrtCtl()
4240 */
4241static void dbPrtCtl(struct dmapctl * dcp)
4242{
4243 int i, j, n;
4244
4245 printk(" height: %08x\n", le32_to_cpu(dcp->height));
4246 printk(" leafidx: %08x\n", le32_to_cpu(dcp->leafidx));
4247 printk(" budmin: %08x\n", dcp->budmin);
4248 printk(" nleafs: %08x\n", le32_to_cpu(dcp->nleafs));
4249 printk(" l2nleafs: %08x\n", le32_to_cpu(dcp->l2nleafs));
4250
4251 printk("\n Tree:\n");
4252 for (i = 0; i < CTLLEAFIND; i += 8) {
4253 n = min(8, CTLLEAFIND - i);
4254
4255 for (j = 0; j < n; j++)
4256 printk(" [%03x]: %02x", i + j,
4257 (char) dcp->stree[i + j]);
4258 printk("\n");
4259 }
4260
4261 printk("\n Tree Leaves:\n");
4262 for (i = 0; i < LPERCTL; i += 8) {
4263 n = min(8, LPERCTL - i);
4264
4265 for (j = 0; j < n; j++)
4266 printk(" [%03x]: %02x",
4267 i + j,
4268 (char) dcp->stree[i + j + CTLLEAFIND]);
4269 printk("\n");
4270 }
4271}
4272#endif /* _JFS_DEBUG_DMAP */
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
new file mode 100644
index 000000000000..32e25884e7e8
--- /dev/null
+++ b/fs/jfs/jfs_dmap.h
@@ -0,0 +1,314 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_DMAP
19#define _H_JFS_DMAP
20
21#include "jfs_txnmgr.h"
22
23#define BMAPVERSION 1 /* version number */
24#define TREESIZE (256+64+16+4+1) /* size of a dmap tree */
25#define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */
26#define LPERDMAP 256 /* num leaves per dmap tree */
27#define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */
28#define DBWORD 32 /* # of blks covered by a map word */
29#define L2DBWORD 5 /* l2 # of blks covered by a mword */
30#define BUDMIN L2DBWORD /* max free string in a map word */
31#define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */
32#define L2BPERDMAP 13 /* l2 num of blks per dmap */
33#define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */
34#define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */
35#define LPERCTL 1024 /* num of leaves per dmapctl tree */
36#define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */
37#define ROOT 0 /* index of the root of a tree */
38#define NOFREE ((s8) -1) /* no blocks free */
39#define MAXAG 128 /* max number of allocation groups */
40#define L2MAXAG 7 /* l2 max num of AG */
41#define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */
42#define BMAPBLKNO 0 /* lblkno of bmap within the map */
43
44/*
45 * maximum l2 number of disk blocks at the various dmapctl levels.
46 */
47#define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL)
48#define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL)
49#define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL)
50
51/*
52 * maximum number of disk blocks at the various dmapctl levels.
53 */
54#define MAXL0SIZE ((s64)1 << L2MAXL0SIZE)
55#define MAXL1SIZE ((s64)1 << L2MAXL1SIZE)
56#define MAXL2SIZE ((s64)1 << L2MAXL2SIZE)
57
58#define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */
59
60/*
61 * determine the maximum free string for four (lower level) nodes
62 * of the tree.
63 */
64static __inline signed char TREEMAX(signed char *cp)
65{
66 signed char tmp1, tmp2;
67
68 tmp1 = max(*(cp+2), *(cp+3));
69 tmp2 = max(*(cp), *(cp+1));
70
71 return max(tmp1, tmp2);
72}
73
74/*
75 * convert disk block number to the logical block number of the dmap
76 * describing the disk block. s is the log2(number of logical blocks per page)
77 *
78 * The calculation figures out how many logical pages are in front of the dmap.
79 * - the number of dmaps preceding it
80 * - the number of L0 pages preceding its L0 page
81 * - the number of L1 pages preceding its L1 page
82 * - 3 is added to account for the L2, L1, and L0 page for this dmap
83 * - 1 is added to account for the control page of the map.
84 */
85#define BLKTODMAP(b,s) \
86 ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))
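/*
 * e.g., assuming 4 KByte blocks (s == 0): BLKTODMAP(0,0) == 4, so the
 * dmap covering blocks 0..8191 is logical page 4, right after the bmap
 * control page and the L2, L1 and L0 pages; BLKTODMAP(8192,0) == 5
 * places the second dmap immediately after it.
 */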
87
88/*
89 * convert disk block number to the logical block number of the LEVEL 0
90 * dmapctl describing the disk block. s is the log2(number of logical blocks
91 * per page)
92 *
93 * The calculation figures out how many logical pages are in front of the L0.
94 * - the number of dmap pages preceding it
95 * - the number of L0 pages preceding it
96 * - the number of L1 pages preceding its L1 page
97 * - 2 is added to account for the L2, and L1 page for this L0
98 * - 1 is added to account for the control page of the map.
99 */
100#define BLKTOL0(b,s) \
101 (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))
102
103/*
104 * convert disk block number to the logical block number of the LEVEL 1
105 * dmapctl describing the disk block. s is the log2(number of logical blocks
106 * per page)
107 *
108 * The calculation figures out how many logical pages are in front of the L1.
109 * - the number of dmap pages preceding it
110 * - the number of L0 pages preceding it
111 * - the number of L1 pages preceding it
112 * - 1 is added to account for the L2 page
113 * - 1 is added to account for the control page of the map.
114 */
115#define BLKTOL1(b,s) \
116 (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s))
117
118/*
119 * convert disk block number to the logical block number of the dmapctl
120 * at the specified level which describes the disk block.
121 */
122#define BLKTOCTL(b,s,l) \
123 (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
124
125/*
126 * convert aggregate map size to the zero origin dmapctl level of the
127 * top dmapctl.
128 */
129#define BMAPSZTOLEV(size) \
130 (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2)
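/*
 * e.g., an aggregate of up to MAXL0SIZE (2^23) blocks is summarized by
 * a single L0 dmapctl (level 0), up to MAXL1SIZE (2^33) blocks by an L1
 * (level 1), and anything larger by the single L2 (level 2).
 */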
131
132/* convert disk block number to allocation group number.
133 */
134#define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size))
135
136/* convert allocation group number to starting disk block
137 * number.
138 */
139#define AGTOBLK(a,ip) \
140 ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size))
141
142/*
143 * dmap summary tree
144 *
145 * dmaptree must be consistent with dmapctl.
146 */
147struct dmaptree {
148 __le32 nleafs; /* 4: number of tree leafs */
149 __le32 l2nleafs; /* 4: l2 number of tree leafs */
150 __le32 leafidx; /* 4: index of first tree leaf */
151 __le32 height; /* 4: height of the tree */
152 s8 budmin; /* 1: min l2 tree leaf value to combine */
153 s8 stree[TREESIZE]; /* TREESIZE: tree */
154 u8 pad[2]; /* 2: pad to word boundary */
155}; /* - 360 - */
156
157/*
158 * dmap page per 8K blocks bitmap
159 */
160struct dmap {
161 __le32 nblocks; /* 4: num blks covered by this dmap */
162 __le32 nfree; /* 4: num of free blks in this dmap */
163 __le64 start; /* 8: starting blkno for this dmap */
164 struct dmaptree tree; /* 360: dmap tree */
165 u8 pad[1672]; /* 1672: pad to 2048 bytes */
166 __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */
167 __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */
168}; /* - 4096 - */
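/*
 * size check: 4 + 4 + 8 + 360 + 1672 + 1024 + 1024 == 4096, i.e. a dmap
 * fills exactly one 4 KByte page.
 */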
169
170/*
171 * disk map control page per level.
172 *
173 * dmapctl must be consistent with dmaptree.
174 */
175struct dmapctl {
176 __le32 nleafs; /* 4: number of tree leafs */
177 __le32 l2nleafs; /* 4: l2 number of tree leafs */
178 __le32 leafidx; /* 4: index of the first tree leaf */
179 __le32 height; /* 4: height of tree */
180 s8 budmin; /* 1: minimum l2 tree leaf value */
181 s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */
182 u8 pad[2714]; /* 2714: pad to 4096 */
183}; /* - 4096 - */
184
185/*
186 * common definition for dmaptree within dmap and dmapctl
187 */
188typedef union dmtree {
189 struct dmaptree t1;
190 struct dmapctl t2;
191} dmtree_t;
192
193/* macros for accessing fields within dmtree */
194#define dmt_nleafs t1.nleafs
195#define dmt_l2nleafs t1.l2nleafs
196#define dmt_leafidx t1.leafidx
197#define dmt_height t1.height
198#define dmt_budmin t1.budmin
199#define dmt_stree t1.stree
200
201/*
202 * on-disk aggregate disk allocation map descriptor.
203 */
204struct dbmap_disk {
205 __le64 dn_mapsize; /* 8: number of blocks in aggregate */
206 __le64 dn_nfree; /* 8: num free blks in aggregate map */
207 __le32 dn_l2nbperpage; /* 4: number of blks per page */
208 __le32 dn_numag; /* 4: total number of ags */
209 __le32 dn_maxlevel; /* 4: dmapctl level of the top dmapctl */
210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
217 __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */
218 __le64 dn_agsize; /* 8: num of blks per alloc group */
219 s8 dn_maxfreebud; /* 1: max free buddy system */
220 u8 pad[3007]; /* 3007: pad to 4096 */
221}; /* - 4096 - */
222
223struct dbmap {
224 s64 dn_mapsize; /* number of blocks in aggregate */
225 s64 dn_nfree; /* num free blks in aggregate map */
226 int dn_l2nbperpage; /* number of blks per page */
227 int dn_numag; /* total number of ags */
228 int dn_maxlevel; /* dmapctl level of the top dmapctl */
229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */
236 s64 dn_agfree[MAXAG]; /* per AG free count */
237 s64 dn_agsize; /* num of blks per alloc group */
238 signed char dn_maxfreebud; /* max free buddy system */
239};
240/*
241 * in-memory aggregate disk allocation map descriptor.
242 */
243struct bmap {
244 struct dbmap db_bmap; /* on-disk aggregate map descriptor */
245 struct inode *db_ipbmap; /* ptr to aggregate map incore inode */
246 struct semaphore db_bmaplock; /* aggregate map lock */
247 atomic_t db_active[MAXAG]; /* count of active, open files in AG */
248 u32 *db_DBmap;
249};
250
251/* macros for accessing fields within in-memory aggregate map descriptor */
252#define db_mapsize db_bmap.dn_mapsize
253#define db_nfree db_bmap.dn_nfree
254#define db_agfree db_bmap.dn_agfree
255#define db_agsize db_bmap.dn_agsize
256#define db_agl2size db_bmap.dn_agl2size
257#define db_agwidth db_bmap.dn_agwidth
258#define db_agheigth db_bmap.dn_agheigth
259#define db_agstart db_bmap.dn_agstart
260#define db_numag db_bmap.dn_numag
261#define db_maxlevel db_bmap.dn_maxlevel
262#define db_aglevel db_bmap.dn_aglevel
263#define db_agpref db_bmap.dn_agpref
264#define db_maxag db_bmap.dn_maxag
265#define db_maxfreebud db_bmap.dn_maxfreebud
266#define db_l2nbperpage db_bmap.dn_l2nbperpage
267
268/*
269 * macros for various conversions needed by the allocators.
270 * blkstol2(), cntlz(), and cnttz() are operating system dependent functions.
271 */
272/* convert number of blocks to log2 number of blocks, rounding up to
273 * the next log2 value if blocks is not a l2 multiple.
274 */
275#define BLKSTOL2(d) (blkstol2(d))
276
277/* convert number of leafs to log2 leaf value */
278#define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN)
279
280/* convert leaf index to log2 leaf value */
281#define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b))
282
283/* convert a block number to a dmap control leaf index */
284#define BLKTOCTLLEAF(b,m) \
285 (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m))
286
287/* convert log2 leaf value to buddy size */
288#define BUDSIZE(s,m) (1 << ((s) - (m)))
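/*
 * e.g., NLSTOL2BSZ(LPERDMAP) == 31 - cntlz(256) + BUDMIN == 8 + 5 == 13
 * == L2BPERDMAP: the 256 leaves of a full dmap cover 2^13 blocks.
 * conversely, at the dmap level (budmin == BUDMIN) a free string of l2
 * size 8 spans BUDSIZE(8, BUDMIN) == 8 leaves.
 */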
289
290/*
291 * external references.
292 */
293extern int dbMount(struct inode *ipbmap);
294
295extern int dbUnmount(struct inode *ipbmap, int mounterror);
296
297extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks);
298
299extern int dbUpdatePMap(struct inode *ipbmap,
300 int free, s64 blkno, s64 nblocks, struct tblock * tblk);
301
302extern int dbNextAG(struct inode *ipbmap);
303
304extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results);
305
306extern int dbReAlloc(struct inode *ipbmap,
307 s64 blkno, s64 nblocks, s64 addnblocks, s64 * results);
308
309extern int dbSync(struct inode *ipbmap);
310extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks);
311extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks);
312extern void dbFinalizeBmap(struct inode *ipbmap);
313extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
314#endif /* _H_JFS_DMAP */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
new file mode 100644
index 000000000000..e357890adfb2
--- /dev/null
+++ b/fs/jfs/jfs_dtree.c
@@ -0,0 +1,4752 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19/*
20 * jfs_dtree.c: directory B+-tree manager
21 *
22 * B+-tree with variable length key directory:
23 *
24 * each directory page is structured as an array of 32-byte
25 * directory entry slots initialized as a freelist
26 * to avoid search/compaction of free space at insertion.
27 * when an entry is inserted, a number of slots are allocated
28 * from the freelist as required to store variable length data
29 * of the entry; when the entry is deleted, slots of the entry
30 * are returned to the freelist.
31 *
32 * leaf entry stores full name as key and file serial number
33 * (aka inode number) as data.
34 * internal/router entry stores suffix compressed name
35 * as key and simple extent descriptor as data.
36 *
37 * each directory page maintains a sorted entry index table
38 * which stores the start slot index of sorted entries
39 * to allow binary search on the table.
40 *
41 * directory starts as a root/leaf page in on-disk inode
42 * inline data area.
43 * when it becomes full, it starts a leaf of an external extent
44 * of length 1 block. each time the first leaf becomes full,
45 * it is extended rather than split (its size is doubled),
46 * until its length becomes 4 KBytes; from then on the extent is split
47 * with a new 4 KByte extent when it becomes full
48 * to reduce external fragmentation of small directories.
49 *
50 * a persistent entry index table (see add_index() below) supports
51 * linear scan of the directory in pieces by readdir().
52 *
53 *
54 * case-insensitive directory file system
55 *
56 * names are stored in a case-sensitive way in the leaf entry,
57 * but ordered, searched and compared in case-insensitive (uppercase) order
58 * (i.e., both search key and entry key are folded for search/compare):
59 * (note that case-sensitive order is BROKEN in storage, e.g.,
60 * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad)
61 *
62 * entries which fold to the same key make up an equivalence class
63 * whose members are stored as a contiguous cluster (may cross page boundary)
64 * but whose order is arbitrary and act as duplicates, e.g.,
65 * (abc, Abc, aBc, abC)
66 *
67 * once a match is found at a leaf, a forward/backward scan is required:
68 * in a case-insensitive search, for duplicates;
69 * in a case-sensitive search, for an exact match
70 *
71 * router entry must be created/stored in case-insensitive way
72 * in internal entry:
73 * (the rightmost key of the left page and the leftmost key of the
74 * right page are folded, and the suffix-compressed result is
75 * propagated as the router key in the parent)
76 * (e.g., if a split occurs between <abc> and <aBd>, <ABD> rather than <aB>
77 * should be made the router key for the split)
78 *
79 * case-insensitive search:
80 *
81 * fold search key;
82 *
83 * case-insensitive search of B-tree:
84 * for internal entry, router key is already folded;
85 * for leaf entry, fold the entry key before comparison.
86 *
87 * if (leaf entry case-insensitive match found)
88 * if (next entry satisfies case-insensitive match)
89 * return EDUPLICATE;
90 * if (prev entry satisfies case-insensitive match)
91 * return EDUPLICATE;
92 * return match;
93 * else
94 * return no match;
95 *
96 * serialization:
97 * target directory inode lock is being held on entry/exit
98 * of all main directory service routines.
99 *
100 * log based recovery:
101 */
102
103#include <linux/fs.h>
104#include <linux/quotaops.h>
105#include "jfs_incore.h"
106#include "jfs_superblock.h"
107#include "jfs_filsys.h"
108#include "jfs_metapage.h"
109#include "jfs_dmap.h"
110#include "jfs_unicode.h"
111#include "jfs_debug.h"
112
113/* dtree split parameter */
114struct dtsplit {
115 struct metapage *mp;
116 s16 index;
117 s16 nslot;
118 struct component_name *key;
119 ddata_t *data;
120 struct pxdlist *pxdlist;
121};
122
123#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
124
125/* get page buffer for specified block address */
126#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
127{\
128 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
129 if (!(RC))\
130 {\
131 if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
132 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
133 {\
134 BT_PUTPAGE(MP);\
135 jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\
136 MP = NULL;\
137 RC = -EIO;\
138 }\
139 }\
140}
141
142/* for consistency */
143#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
144
145#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
146 BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot)
147
148/*
149 * forward references
150 */
151static int dtSplitUp(tid_t tid, struct inode *ip,
152 struct dtsplit * split, struct btstack * btstack);
153
154static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
155 struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
156
157static int dtExtendPage(tid_t tid, struct inode *ip,
158 struct dtsplit * split, struct btstack * btstack);
159
160static int dtSplitRoot(tid_t tid, struct inode *ip,
161 struct dtsplit * split, struct metapage ** rmpp);
162
163static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
164 dtpage_t * fp, struct btstack * btstack);
165
166static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
167
168static int dtReadFirst(struct inode *ip, struct btstack * btstack);
169
170static int dtReadNext(struct inode *ip,
171 loff_t * offset, struct btstack * btstack);
172
173static int dtCompare(struct component_name * key, dtpage_t * p, int si);
174
175static int ciCompare(struct component_name * key, dtpage_t * p, int si,
176 int flag);
177
178static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
179 int flag);
180
181static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
182 int ri, struct component_name * key, int flag);
183
184static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
185 ddata_t * data, struct dt_lock **);
186
187static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
188 struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
189 int do_index);
190
191static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
192
193static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
194
195static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
196
197#define ciToUpper(c) UniStrupr((c)->name)
198
199/*
200 * read_index_page()
201 *
202 * Reads a page of a directory's index table.
203 * Having metadata mapped into the directory inode's address space
204 * presents a multitude of problems. We avoid this by mapping to
205 * the absolute address space outside of the *_metapage routines
206 */
207static struct metapage *read_index_page(struct inode *inode, s64 blkno)
208{
209 int rc;
210 s64 xaddr;
211 int xflag;
212 s32 xlen;
213
214 rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
215 if (rc || (xlen == 0))
216 return NULL;
217
218 return read_metapage(inode, xaddr, PSIZE, 1);
219}
220
221/*
222 * get_index_page()
223 *
224 * Same as read_index_page(), but gets a new page without reading
225 */
226static struct metapage *get_index_page(struct inode *inode, s64 blkno)
227{
228 int rc;
229 s64 xaddr;
230 int xflag;
231 s32 xlen;
232
233 rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1);
234 if (rc || (xlen == 0))
235 return NULL;
236
237 return get_metapage(inode, xaddr, PSIZE, 1);
238}
239
240/*
241 * find_index()
242 *
243 * Returns dtree page containing directory table entry for specified
244 * index and pointer to its entry.
245 *
246 * mp must be released by caller.
247 */
248static struct dir_table_slot *find_index(struct inode *ip, u32 index,
249 struct metapage ** mp, s64 *lblock)
250{
251 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
252 s64 blkno;
253 s64 offset;
254 int page_offset;
255 struct dir_table_slot *slot;
256 static int maxWarnings = 10;
257
258 if (index < 2) {
259 if (maxWarnings) {
260 jfs_warn("find_index called with index = %d", index);
261 maxWarnings--;
262 }
263 return NULL;
264 }
265
266 if (index >= jfs_ip->next_index) {
267 jfs_warn("find_index called with index >= next_index");
268 return NULL;
269 }
270
271 if (jfs_dirtable_inline(ip)) {
272 /*
273 * Inline directory table
274 */
275 *mp = NULL;
276 slot = &jfs_ip->i_dirtable[index - 2];
277 } else {
278 offset = (index - 2) * sizeof(struct dir_table_slot);
279 page_offset = offset & (PSIZE - 1);
280 blkno = ((offset + 1) >> L2PSIZE) <<
281 JFS_SBI(ip->i_sb)->l2nbperpage;
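/* e.g., with 8-byte directory table slots and 4 KByte metapages,
 * indexes 2..513 land in the first page (blkno 0) and index 514
 * starts the next page
 */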
282
283 if (*mp && (*lblock != blkno)) {
284 release_metapage(*mp);
285 *mp = NULL;
286 }
287 if (*mp == NULL) {
288 *lblock = blkno;
289 *mp = read_index_page(ip, blkno);
290 }
291 if (*mp == NULL) {
292 jfs_err("find_index: error reading directory table");
293 return NULL;
294 }
295
296 slot =
297 (struct dir_table_slot *) ((char *) (*mp)->data +
298 page_offset);
299 }
300 return slot;
301}
302
303static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
304 u32 index)
305{
306 struct tlock *tlck;
307 struct linelock *llck;
308 struct lv *lv;
309
310 tlck = txLock(tid, ip, mp, tlckDATA);
311 llck = (struct linelock *) tlck->lock;
312
313 if (llck->index >= llck->maxcnt)
314 llck = txLinelock(llck);
315 lv = &llck->lv[llck->index];
316
317 /*
318 * Linelock slot size is twice the size of directory table
319 * slot size. 512 entries per page.
320 */
321 lv->offset = ((index - 2) & 511) >> 1;
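/* e.g., indexes 2 and 3 share lv offset 0, indexes 4 and 5 share
 * lv offset 1
 */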
322 lv->length = 1;
323 llck->index++;
324}
325
326/*
327 * add_index()
328 *
329 * Adds an entry to the directory index table. This is used to provide
330 * each directory entry with a persistent index in which to resume
331 * directory traversals
332 */
333static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
334{
335 struct super_block *sb = ip->i_sb;
336 struct jfs_sb_info *sbi = JFS_SBI(sb);
337 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
338 u64 blkno;
339 struct dir_table_slot *dirtab_slot;
340 u32 index;
341 struct linelock *llck;
342 struct lv *lv;
343 struct metapage *mp;
344 s64 offset;
345 uint page_offset;
346 struct tlock *tlck;
347 s64 xaddr;
348
349 ASSERT(DO_INDEX(ip));
350
351 if (jfs_ip->next_index < 2) {
352 jfs_warn("add_index: next_index = %d. Resetting!",
353 jfs_ip->next_index);
354 jfs_ip->next_index = 2;
355 }
356
357 index = jfs_ip->next_index++;
358
359 if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
360 /*
361 * i_size reflects size of index table, or 8 bytes per entry.
362 */
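/* e.g., the first entry (index 2) makes the table 8 bytes long */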
363 ip->i_size = (loff_t) (index - 1) << 3;
364
365 /*
366 * dir table fits inline within inode
367 */
368 dirtab_slot = &jfs_ip->i_dirtable[index-2];
369 dirtab_slot->flag = DIR_INDEX_VALID;
370 dirtab_slot->slot = slot;
371 DTSaddress(dirtab_slot, bn);
372
373 set_cflag(COMMIT_Dirtable, ip);
374
375 return index;
376 }
377 if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
378 struct dir_table_slot temp_table[12];
379
380 /*
381 * It's time to move the inline table to an external
382 * page and begin to build the xtree
383 */
384 if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage) ||
385 dbAlloc(ip, 0, sbi->nbperpage, &xaddr))
386 goto clean_up; /* No space */
387
388 /*
389 * Save the table, we're going to overwrite it with the
390 * xtree root
391 */
392 memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
393
394 /*
395 * Initialize empty x-tree
396 */
397 xtInitRoot(tid, ip);
398
399 /*
400 * Allocate the first block & add it to the xtree
401 */
402 if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
403 /* This really shouldn't fail */
404 jfs_warn("add_index: xtInsert failed!");
405 memcpy(&jfs_ip->i_dirtable, temp_table,
406 sizeof (temp_table));
407 goto clean_up;
408 }
409 ip->i_size = PSIZE;
410
411 if ((mp = get_index_page(ip, 0)) == NULL) {
412 jfs_err("add_index: get_metapage failed!");
413 xtTruncate(tid, ip, 0, COMMIT_PWMAP);
414 memcpy(&jfs_ip->i_dirtable, temp_table,
415 sizeof (temp_table));
416 goto clean_up;
417 }
418 tlck = txLock(tid, ip, mp, tlckDATA);
419 llck = (struct linelock *) & tlck->lock;
420 ASSERT(llck->index == 0);
421 lv = &llck->lv[0];
422
423 lv->offset = 0;
424 lv->length = 6; /* tlckDATA slot size is 16 bytes */
425 llck->index++;
426
427 memcpy(mp->data, temp_table, sizeof(temp_table));
428
429 mark_metapage_dirty(mp);
430 release_metapage(mp);
431
432 /*
433 * Logging is now directed by xtree tlocks
434 */
435 clear_cflag(COMMIT_Dirtable, ip);
436 }
437
438 offset = (index - 2) * sizeof(struct dir_table_slot);
439 page_offset = offset & (PSIZE - 1);
440 blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
441 if (page_offset == 0) {
442 /*
443 * This will be the beginning of a new page
444 */
445 xaddr = 0;
446 if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
447 jfs_warn("add_index: xtInsert failed!");
448 goto clean_up;
449 }
450 ip->i_size += PSIZE;
451
452 if ((mp = get_index_page(ip, blkno)))
453 memset(mp->data, 0, PSIZE); /* Just looks better */
454 else
455 xtTruncate(tid, ip, offset, COMMIT_PWMAP);
456 } else
457 mp = read_index_page(ip, blkno);
458
459 if (mp == NULL) {
460 jfs_err("add_index: get/read_metapage failed!");
461 goto clean_up;
462 }
463
464 lock_index(tid, ip, mp, index);
465
466 dirtab_slot =
467 (struct dir_table_slot *) ((char *) mp->data + page_offset);
468 dirtab_slot->flag = DIR_INDEX_VALID;
469 dirtab_slot->slot = slot;
470 DTSaddress(dirtab_slot, bn);
471
472 mark_metapage_dirty(mp);
473 release_metapage(mp);
474
475 return index;
476
477 clean_up:
478
479 jfs_ip->next_index--;
480
481 return 0;
482}
483
484/*
485 * free_index()
486 *
487 * Marks an entry to the directory index table as free.
488 */
489static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
490{
491 struct dir_table_slot *dirtab_slot;
492 s64 lblock;
493 struct metapage *mp = NULL;
494
495 dirtab_slot = find_index(ip, index, &mp, &lblock);
496
497 if (dirtab_slot == NULL)
498 return;
499
500 dirtab_slot->flag = DIR_INDEX_FREE;
501 dirtab_slot->slot = dirtab_slot->addr1 = 0;
502 dirtab_slot->addr2 = cpu_to_le32(next);
503
504 if (mp) {
505 lock_index(tid, ip, mp, index);
506 mark_metapage_dirty(mp);
507 release_metapage(mp);
508 } else
509 set_cflag(COMMIT_Dirtable, ip);
510}
511
512/*
513 * modify_index()
514 *
515 * Changes an entry in the directory index table
516 */
517static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
518 int slot, struct metapage ** mp, u64 *lblock)
519{
520 struct dir_table_slot *dirtab_slot;
521
522 dirtab_slot = find_index(ip, index, mp, lblock);
523
524 if (dirtab_slot == NULL)
525 return;
526
527 DTSaddress(dirtab_slot, bn);
528 dirtab_slot->slot = slot;
529
530 if (*mp) {
531 lock_index(tid, ip, *mp, index);
532 mark_metapage_dirty(*mp);
533 } else
534 set_cflag(COMMIT_Dirtable, ip);
535}
536
537/*
538 * read_index()
539 *
540 * reads a directory table slot
541 */
542static int read_index(struct inode *ip, u32 index,
543 struct dir_table_slot * dirtab_slot)
544{
545 s64 lblock;
546 struct metapage *mp = NULL;
547 struct dir_table_slot *slot;
548
549 slot = find_index(ip, index, &mp, &lblock);
550 if (slot == NULL) {
551 return -EIO;
552 }
553
554 memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot));
555
556 if (mp)
557 release_metapage(mp);
558
559 return 0;
560}
561
562/*
563 * dtSearch()
564 *
565 * function:
566 * Search for the entry with specified key
567 *
568 * parameter:
569 *
570 * return: 0 - search result on stack, leaf page pinned;
571 * errno - I/O error
572 */
573int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
574 struct btstack * btstack, int flag)
575{
576 int rc = 0;
577 int cmp = 1; /* init for empty page */
578 s64 bn;
579 struct metapage *mp;
580 dtpage_t *p;
581 s8 *stbl;
582 int base, index, lim;
583 struct btframe *btsp;
584 pxd_t *pxd;
585 int psize = 288; /* initial in-line directory */
586 ino_t inumber;
587 struct component_name ciKey;
588 struct super_block *sb = ip->i_sb;
589
590 ciKey.name =
591 (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
592 GFP_NOFS);
593 if (ciKey.name == NULL) {
594 rc = -ENOMEM;
595 goto dtSearch_Exit2;
596 }
597
598
599 /* uppercase search key for c-i directory */
600 UniStrcpy(ciKey.name, key->name);
601 ciKey.namlen = key->namlen;
602
603 /* only uppercase if case-insensitive support is on */
604 if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
605 ciToUpper(&ciKey);
606 }
607 BT_CLR(btstack); /* reset stack */
608
609 /* init level count for max pages to split */
610 btstack->nsplit = 1;
611
612 /*
613 * search down tree from root:
614 *
615 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
616 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
617 *
618 * if entry with search key K is not found
619 * internal page search find the entry with largest key Ki
620 * less than K which point to the child page to search;
621 * leaf page search find the entry with smallest key Kj
622 * greater than K so that the returned index is the position of
623 * the entry to be shifted right for insertion of new entry.
624 * for empty tree, search key is greater than any key of the tree.
625 *
626 * by convention, root bn = 0.
627 */
628 for (bn = 0;;) {
629 /* get/pin the page to search */
630 DT_GETPAGE(ip, bn, mp, psize, p, rc);
631 if (rc)
632 goto dtSearch_Exit1;
633
634 /* get sorted entry table of the page */
635 stbl = DT_GETSTBL(p);
636
637 /*
638 * binary search with search key K on the current page.
639 */
640 for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
641 index = base + (lim >> 1);
642
643 if (p->header.flag & BT_LEAF) {
644 /* uppercase leaf name to compare */
645 cmp =
646 ciCompare(&ciKey, p, stbl[index],
647 JFS_SBI(sb)->mntflag);
648 } else {
649 /* router key is in uppercase */
650
651 cmp = dtCompare(&ciKey, p, stbl[index]);
652
653
654 }
655 if (cmp == 0) {
656 /*
657 * search hit
658 */
659 /* search hit - leaf page:
660 * return the entry found
661 */
662 if (p->header.flag & BT_LEAF) {
663 inumber = le32_to_cpu(
664 ((struct ldtentry *) & p->slot[stbl[index]])->inumber);
665
666 /*
667 * search for JFS_LOOKUP
668 */
669 if (flag == JFS_LOOKUP) {
670 *data = inumber;
671 rc = 0;
672 goto out;
673 }
674
675 /*
676 * search for JFS_CREATE
677 */
678 if (flag == JFS_CREATE) {
679 *data = inumber;
680 rc = -EEXIST;
681 goto out;
682 }
683
684 /*
685 * search for JFS_REMOVE or JFS_RENAME
686 */
687 if ((flag == JFS_REMOVE ||
688 flag == JFS_RENAME) &&
689 *data != inumber) {
690 rc = -ESTALE;
691 goto out;
692 }
693
694 /*
695 * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
696 */
697 /* save search result */
698 *data = inumber;
699 btsp = btstack->top;
700 btsp->bn = bn;
701 btsp->index = index;
702 btsp->mp = mp;
703
704 rc = 0;
705 goto dtSearch_Exit1;
706 }
707
708 /* search hit - internal page:
709 * descend/search its child page
710 */
711 goto getChild;
712 }
713
714 if (cmp > 0) {
715 base = index + 1;
716 --lim;
717 }
718 }
719
720 /*
721 * search miss
722 *
723 * base is the smallest index with key (Kj) greater than
724 * search key (K) and may be zero or (maxindex + 1) index.
725 */
726 /*
727 * search miss - leaf page
728 *
729 * return location of entry (base) where new entry with
730 * search key K is to be inserted.
731 */
732 if (p->header.flag & BT_LEAF) {
733 /*
734 * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
735 */
736 if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
737 flag == JFS_RENAME) {
738 rc = -ENOENT;
739 goto out;
740 }
741
742 /*
743 * search for JFS_CREATE|JFS_FINDDIR:
744 *
745 * save search result
746 */
747 *data = 0;
748 btsp = btstack->top;
749 btsp->bn = bn;
750 btsp->index = base;
751 btsp->mp = mp;
752
753 rc = 0;
754 goto dtSearch_Exit1;
755 }
756
757 /*
758 * search miss - internal page
759 *
760 * if base is non-zero, decrement base by one to get the parent
761 * entry of the child page to search.
762 */
763 index = base ? base - 1 : base;
764
765 /*
766 * go down to child page
767 */
768 getChild:
769 /* update max. number of pages to split */
770 if (BT_STACK_FULL(btstack)) {
771 /* Something's corrupted, mark filesystem dirty so
772 * chkdsk will fix it.
773 */
774 jfs_error(sb, "stack overrun in dtSearch!");
775 BT_STACK_DUMP(btstack);
776 rc = -EIO;
777 goto out;
778 }
779 btstack->nsplit++;
780
781 /* push (bn, index) of the parent page/entry */
782 BT_PUSH(btstack, bn, index);
783
784 /* get the child page block number */
785 pxd = (pxd_t *) & p->slot[stbl[index]];
786 bn = addressPXD(pxd);
787 psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
788
789 /* unpin the parent page */
790 DT_PUTPAGE(mp);
791 }
792
793 out:
794 DT_PUTPAGE(mp);
795
796 dtSearch_Exit1:
797
798 kfree(ciKey.name);
799
800 dtSearch_Exit2:
801
802 return rc;
803}
804
805
806/*
807 * dtInsert()
808 *
809 * function: insert an entry to directory tree
810 *
811 * parameter:
812 *
813 * return: 0 - success;
814 * errno - failure;
815 */
816int dtInsert(tid_t tid, struct inode *ip,
817 struct component_name * name, ino_t * fsn, struct btstack * btstack)
818{
819 int rc = 0;
820 struct metapage *mp; /* meta-page buffer */
821 dtpage_t *p; /* base B+-tree index page */
822 s64 bn;
823 int index;
824 struct dtsplit split; /* split information */
825 ddata_t data;
826 struct dt_lock *dtlck;
827 int n;
828 struct tlock *tlck;
829 struct lv *lv;
830
831 /*
832 * retrieve search result
833 *
834 * dtSearch() returns (leaf page pinned, index at which to insert).
835 * n.b. dtSearch() may return index of (maxindex + 1) of
836 * the full page.
837 */
838 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
839
840 /*
841 * insert entry for new key
842 */
843 if (DO_INDEX(ip)) {
844 if (JFS_IP(ip)->next_index == DIREND) {
845 DT_PUTPAGE(mp);
846 return -EMLINK;
847 }
848 n = NDTLEAF(name->namlen);
849 data.leaf.tid = tid;
850 data.leaf.ip = ip;
851 } else {
852 n = NDTLEAF_LEGACY(name->namlen);
853 data.leaf.ip = NULL; /* signifies legacy directory format */
854 }
855 data.leaf.ino = *fsn;
856
857 /*
858 * leaf page does not have enough room for new entry:
859 *
860 * extend/split the leaf page;
861 *
862 * dtSplitUp() will insert the entry and unpin the leaf page.
863 */
864 if (n > p->header.freecnt) {
865 split.mp = mp;
866 split.index = index;
867 split.nslot = n;
868 split.key = name;
869 split.data = &data;
870 rc = dtSplitUp(tid, ip, &split, btstack);
871 return rc;
872 }
873
874 /*
875 * leaf page does have enough room for new entry:
876 *
877 * insert the new data entry into the leaf page;
878 */
879 BT_MARK_DIRTY(mp, ip);
880 /*
881 * acquire a transaction lock on the leaf page
882 */
883 tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
884 dtlck = (struct dt_lock *) & tlck->lock;
885 ASSERT(dtlck->index == 0);
886 lv = & dtlck->lv[0];
887
888 /* linelock header */
889 lv->offset = 0;
890 lv->length = 1;
891 dtlck->index++;
892
893 dtInsertEntry(p, index, name, &data, &dtlck);
894
895 /* linelock stbl of non-root leaf page */
896 if (!(p->header.flag & BT_ROOT)) {
897 if (dtlck->index >= dtlck->maxcnt)
898 dtlck = (struct dt_lock *) txLinelock(dtlck);
899 lv = & dtlck->lv[dtlck->index];
900 n = index >> L2DTSLOTSIZE;
901 lv->offset = p->header.stblindex + n;
902 lv->length =
903 ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
904 dtlck->index++;
905 }
906
907 /* unpin the leaf page */
908 DT_PUTPAGE(mp);
909
910 return 0;
911}
912
913
914/*
915 * dtSplitUp()
916 *
917 * function: propagate insertion bottom up;
918 *
919 * parameter:
920 *
921 * return: 0 - success;
922 * errno - failure;
923 * leaf page unpinned;
924 */
925static int dtSplitUp(tid_t tid,
926 struct inode *ip, struct dtsplit * split, struct btstack * btstack)
927{
928 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
929 int rc = 0;
930 struct metapage *smp;
931 dtpage_t *sp; /* split page */
932 struct metapage *rmp;
933 dtpage_t *rp; /* new right page split from sp */
934 pxd_t rpxd; /* new right page extent descriptor */
935 struct metapage *lmp;
936 dtpage_t *lp; /* left child page */
937 int skip; /* index of entry of insertion */
938 struct btframe *parent; /* parent page entry on traverse stack */
939 s64 xaddr, nxaddr;
940 int xlen, xsize;
941 struct pxdlist pxdlist;
942 pxd_t *pxd;
943 struct component_name key = { 0, NULL };
944 ddata_t *data = split->data;
945 int n;
946 struct dt_lock *dtlck;
947 struct tlock *tlck;
948 struct lv *lv;
949 int quota_allocation = 0;
950
951 /* get split page */
952 smp = split->mp;
953 sp = DT_PAGE(ip, smp);
954
955 key.name =
956 (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
957 GFP_NOFS);
958 if (key.name == NULL) {
959 DT_PUTPAGE(smp);
960 rc = -ENOMEM;
961 goto dtSplitUp_Exit;
962 }
963
964 /*
965 * split leaf page
966 *
967 * The split routines insert the new entry, and
968 * acquire txLock as appropriate.
969 */
970 /*
971 * split root leaf page:
972 */
973 if (sp->header.flag & BT_ROOT) {
974 /*
975 * allocate a single extent child page
976 */
977 xlen = 1;
978 n = sbi->bsize >> L2DTSLOTSIZE;
979 n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */
980 n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
981 if (n <= split->nslot)
982 xlen++;
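/* e.g., with a 4 KByte block size: 128 slots in a one-block
 * child, minus 4 stbl slots, minus the slots taken by the
 * root's current entries; if what remains cannot also hold
 * the new entry, allocate a second block
 */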
983 if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
984 DT_PUTPAGE(smp);
985 goto freeKeyName;
986 }
987
988 pxdlist.maxnpxd = 1;
989 pxdlist.npxd = 0;
990 pxd = &pxdlist.pxd[0];
991 PXDaddress(pxd, xaddr);
992 PXDlength(pxd, xlen);
993 split->pxdlist = &pxdlist;
994 rc = dtSplitRoot(tid, ip, split, &rmp);
995
996 if (rc)
997 dbFree(ip, xaddr, xlen);
998 else
999 DT_PUTPAGE(rmp);
1000
1001 DT_PUTPAGE(smp);
1002
1003 goto freeKeyName;
1004 }
1005
1006 /*
1007 * extend first leaf page
1008 *
1009 * extend the 1st extent if less than buffer page size
1010 * (dtExtendPage() returns leaf page unpinned)
1011 */
1012 pxd = &sp->header.self;
1013 xlen = lengthPXD(pxd);
1014 xsize = xlen << sbi->l2bsize;
1015 if (xsize < PSIZE) {
1016 xaddr = addressPXD(pxd);
1017 n = xsize >> L2DTSLOTSIZE;
1018 n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */
1019 if ((n + sp->header.freecnt) <= split->nslot)
1020 n = xlen + (xlen << 1);
1021 else
1022 n = xlen;
1023
1024 /* Allocate blocks to quota. */
1025 if (DQUOT_ALLOC_BLOCK(ip, n)) {
1026 rc = -EDQUOT;
1027 goto extendOut;
1028 }
1029 quota_allocation += n;
1030
1031 if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
1032 (s64) n, &nxaddr)))
1033 goto extendOut;
1034
1035 pxdlist.maxnpxd = 1;
1036 pxdlist.npxd = 0;
1037 pxd = &pxdlist.pxd[0];
1038 PXDaddress(pxd, nxaddr);
1039 PXDlength(pxd, xlen + n);
1040 split->pxdlist = &pxdlist;
1041 if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1042 nxaddr = addressPXD(pxd);
1043 if (xaddr != nxaddr) {
1044 /* free relocated extent */
1045 xlen = lengthPXD(pxd);
1046 dbFree(ip, nxaddr, (s64) xlen);
1047 } else {
1048 /* free extended delta */
1049 xlen = lengthPXD(pxd) - n;
1050 xaddr = addressPXD(pxd) + xlen;
1051 dbFree(ip, xaddr, (s64) n);
1052 }
1053 }
1054
1055 extendOut:
1056 DT_PUTPAGE(smp);
1057 goto freeKeyName;
1058 }
1059
1060 /*
1061 * split leaf page <sp> into <sp> and a new right page <rp>.
1062 *
1063 * return <rp> pinned and its extent descriptor <rpxd>
1064 */
1065 /*
1066 * allocate new directory page extent and
1067 * new index page(s) to cover page split(s)
1068 *
1069 * allocation hint: ?
1070 */
1071 n = btstack->nsplit;
1072 pxdlist.maxnpxd = pxdlist.npxd = 0;
1073 xlen = sbi->nbperpage;
1074 for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
1075 if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
1076 PXDaddress(pxd, xaddr);
1077 PXDlength(pxd, xlen);
1078 pxdlist.maxnpxd++;
1079 continue;
1080 }
1081
1082 DT_PUTPAGE(smp);
1083
1084 /* undo allocation */
1085 goto splitOut;
1086 }
1087
1088 split->pxdlist = &pxdlist;
1089 if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
1090 DT_PUTPAGE(smp);
1091
1092 /* undo allocation */
1093 goto splitOut;
1094 }
1095
1096 /*
1097 * propagate up the router entry for the leaf page just split
1098 *
1099 * insert a router entry for the new page into the parent page,
1100 * propagate the insert/split up the tree by walking back the stack
1101 * of (bn of parent page, index of child page entry in parent page)
1102 * that were traversed during the search for the page that split.
1103 *
1104 * the propagation of insert/split up the tree stops if the root
1105 * splits or the page inserted into doesn't have to split to hold
1106 * the new entry.
1107 *
1108 * the parent entry for the split page remains the same, and
1109 * a new entry is inserted at its right with the first key and
1110 * block number of the new right page.
1111 *
1112 * There are a maximum of 4 pages pinned at any time:
1113 * two children, left parent and right parent (when the parent splits).
1114 * keep the child pages pinned while working on the parent.
1115 * make sure that all pins are released at exit.
1116 */
1117 while ((parent = BT_POP(btstack)) != NULL) {
1118 /* parent page specified by stack frame <parent> */
1119
1120 /* keep current child pages (<lp>, <rp>) pinned */
1121 lmp = smp;
1122 lp = sp;
1123
1124 /*
1125 * insert router entry in parent for new right child page <rp>
1126 */
1127 /* get the parent page <sp> */
1128 DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1129 if (rc) {
1130 DT_PUTPAGE(lmp);
1131 DT_PUTPAGE(rmp);
1132 goto splitOut;
1133 }
1134
1135 /*
1136 * The new key entry goes ONE AFTER the index of parent entry,
1137 * because the split was to the right.
1138 */
1139 skip = parent->index + 1;
1140
1141 /*
1142 * compute the key for the router entry
1143 *
1144 * key suffix compression:
1145 * for internal pages that have leaf pages as children,
1146 * retain only what's needed to distinguish between
1147 * the new entry and the entry on the page to its left.
1148 * If the keys compare equal, retain the entire key.
1149 *
1150 * note that compression is performed only at computing
1151 * router key at the lowest internal level.
1152 * further compression of the key between pairs of higher
1153 * level internal pages loses too much information and
1154 * the search may fail.
1155 * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
1156 * results in two adjacent parent entries (a)(xx).
1157 * if split occurs between these two entries, and
1158 * if compression is applied, the router key of parent entry
1159 * of right page (x) will divert search for x into right
1160 * subtree and miss x in the left subtree.)
1161 *
1162 * the entire key must be retained for the next-to-leftmost
1163 * internal key at any level of the tree, or search may fail
1164 * (e.g., ?)
1165 */
1166 switch (rp->header.flag & BT_TYPE) {
1167 case BT_LEAF:
1168 /*
1169 * compute the length of prefix for suffix compression
1170 * between last entry of left page and first entry
1171 * of right page
1172 */
1173 if ((sp->header.flag & BT_ROOT && skip > 1) ||
1174 sp->header.prev != 0 || skip > 1) {
1175 /* compute uppercase router prefix key */
1176 rc = ciGetLeafPrefixKey(lp,
1177 lp->header.nextindex-1,
1178 rp, 0, &key,
1179 sbi->mntflag);
1180 if (rc) {
1181 DT_PUTPAGE(lmp);
1182 DT_PUTPAGE(rmp);
1183 DT_PUTPAGE(smp);
1184 goto splitOut;
1185 }
1186 } else {
1187 /* next to leftmost entry of
1188 lowest internal level */
1189
1190 /* compute uppercase router key */
1191 dtGetKey(rp, 0, &key, sbi->mntflag);
1192 key.name[key.namlen] = 0;
1193
1194 if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
1195 ciToUpper(&key);
1196 }
1197
1198 n = NDTINTERNAL(key.namlen);
1199 break;
1200
1201 case BT_INTERNAL:
1202 dtGetKey(rp, 0, &key, sbi->mntflag);
1203 n = NDTINTERNAL(key.namlen);
1204 break;
1205
1206 default:
1207 jfs_err("dtSplitUp(): UFO!");
1208 break;
1209 }
1210
1211 /* unpin left child page */
1212 DT_PUTPAGE(lmp);
1213
1214 /*
1215 * compute the data for the router entry
1216 */
1217 data->xd = rpxd; /* child page xd */
1218
1219 /*
1220 * parent page is full - split the parent page
1221 */
1222 if (n > sp->header.freecnt) {
1223 /* init for parent page split */
1224 split->mp = smp;
1225 split->index = skip; /* index at insert */
1226 split->nslot = n;
1227 split->key = &key;
1228 /* split->data = data; */
1229
1230 /* unpin right child page */
1231 DT_PUTPAGE(rmp);
1232
1233 /* The split routines insert the new entry,
1234 * acquire txLock as appropriate.
1235 * return <rp> pinned and its block number <rbn>.
1236 */
1237 rc = (sp->header.flag & BT_ROOT) ?
1238 dtSplitRoot(tid, ip, split, &rmp) :
1239 dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd);
1240 if (rc) {
1241 DT_PUTPAGE(smp);
1242 goto splitOut;
1243 }
1244
1245 /* smp and rmp are pinned */
1246 }
1247 /*
1248 * parent page is not full - insert router entry in parent page
1249 */
1250 else {
1251 BT_MARK_DIRTY(smp, ip);
1252 /*
1253 * acquire a transaction lock on the parent page
1254 */
1255 tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1256 dtlck = (struct dt_lock *) & tlck->lock;
1257 ASSERT(dtlck->index == 0);
1258 lv = & dtlck->lv[0];
1259
1260 /* linelock header */
1261 lv->offset = 0;
1262 lv->length = 1;
1263 dtlck->index++;
1264
1265 /* linelock stbl of non-root parent page */
1266 if (!(sp->header.flag & BT_ROOT)) {
1267 lv++;
1268 n = skip >> L2DTSLOTSIZE;
1269 lv->offset = sp->header.stblindex + n;
1270 lv->length =
1271 ((sp->header.nextindex -
1272 1) >> L2DTSLOTSIZE) - n + 1;
1273 dtlck->index++;
1274 }
1275
1276 dtInsertEntry(sp, skip, &key, data, &dtlck);
1277
1278 /* exit propagate up */
1279 break;
1280 }
1281 }
1282
1283 /* unpin current split and its right page */
1284 DT_PUTPAGE(smp);
1285 DT_PUTPAGE(rmp);
1286
1287 /*
1288 * free remaining extents allocated for split
1289 */
1290 splitOut:
1291 n = pxdlist.npxd;
1292 pxd = &pxdlist.pxd[n];
1293 for (; n < pxdlist.maxnpxd; n++, pxd++)
1294 dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd));
1295
1296 freeKeyName:
1297 kfree(key.name);
1298
1299 /* Rollback quota allocation */
1300 if (rc && quota_allocation)
1301 DQUOT_FREE_BLOCK(ip, quota_allocation);
1302
1303 dtSplitUp_Exit:
1304
1305 return rc;
1306}
1307
1308
1309/*
1310 * dtSplitPage()
1311 *
1312 * function: Split a non-root page of a btree.
1313 *
1314 * parameter:
1315 *
1316 * return: 0 - success;
1317 * errno - failure;
1318 * return split and new page pinned;
1319 */
1320static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
1321 struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp)
1322{
1323 int rc = 0;
1324 struct metapage *smp;
1325 dtpage_t *sp;
1326 struct metapage *rmp;
1327 dtpage_t *rp; /* new right page allocated */
1328 s64 rbn; /* new right page block number */
1329 struct metapage *mp;
1330 dtpage_t *p;
1331 s64 nextbn;
1332 struct pxdlist *pxdlist;
1333 pxd_t *pxd;
1334 int skip, nextindex, half, left, nxt, off, si;
1335 struct ldtentry *ldtentry;
1336 struct idtentry *idtentry;
1337 u8 *stbl;
1338 struct dtslot *f;
1339 int fsi, stblsize;
1340 int n;
1341 struct dt_lock *sdtlck, *rdtlck;
1342 struct tlock *tlck;
1343 struct dt_lock *dtlck;
1344 struct lv *slv, *rlv, *lv;
1345
1346 /* get split page */
1347 smp = split->mp;
1348 sp = DT_PAGE(ip, smp);
1349
1350 /*
1351 * allocate the new right page for the split
1352 */
1353 pxdlist = split->pxdlist;
1354 pxd = &pxdlist->pxd[pxdlist->npxd];
1355 pxdlist->npxd++;
1356 rbn = addressPXD(pxd);
1357 rmp = get_metapage(ip, rbn, PSIZE, 1);
1358 if (rmp == NULL)
1359 return -EIO;
1360
1361 /* Allocate blocks to quota. */
1362 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
1363 release_metapage(rmp);
1364 return -EDQUOT;
1365 }
1366
1367 jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
1368
1369 BT_MARK_DIRTY(rmp, ip);
1370 /*
1371 * acquire a transaction lock on the new right page
1372 */
1373 tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1374 rdtlck = (struct dt_lock *) & tlck->lock;
1375
1376 rp = (dtpage_t *) rmp->data;
1377 *rpp = rp;
1378 rp->header.self = *pxd;
1379
1380 BT_MARK_DIRTY(smp, ip);
1381 /*
1382 * acquire a transaction lock on the split page
1383 *
1384 * action:
1385 */
1386 tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY);
1387 sdtlck = (struct dt_lock *) & tlck->lock;
1388
1389 /* linelock header of split page */
1390 ASSERT(sdtlck->index == 0);
1391 slv = & sdtlck->lv[0];
1392 slv->offset = 0;
1393 slv->length = 1;
1394 sdtlck->index++;
1395
1396 /*
1397 * initialize/update sibling pointers between sp and rp
1398 */
1399 nextbn = le64_to_cpu(sp->header.next);
1400 rp->header.next = cpu_to_le64(nextbn);
1401 rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1402 sp->header.next = cpu_to_le64(rbn);
1403
1404 /*
1405 * initialize new right page
1406 */
1407 rp->header.flag = sp->header.flag;
1408
1409 /* compute sorted entry table at start of extent data area */
1410 rp->header.nextindex = 0;
1411 rp->header.stblindex = 1;
1412
1413 n = PSIZE >> L2DTSLOTSIZE;
1414 rp->header.maxslot = n;
1415 stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */
1416
1417 /* init freelist */
1418 fsi = rp->header.stblindex + stblsize;
1419 rp->header.freelist = fsi;
1420 rp->header.freecnt = rp->header.maxslot - fsi;
1421
1422 /*
1423 * sequential append at tail: append without split
1424 *
1425 * If splitting the last page on a level because of appending
1426 * an entry to it (skip is maxentry), it's likely that the access is
1427 * sequential. Adding an empty page on the side of the level is less
1428 * work and can push the fill factor much higher than normal.
1429 * If we're wrong it's no big deal, we'll just do the split the right
1430 * way next time.
1431 * (It may look like it's equally easy to do a similar hack for
1432 * reverse sorted data, that is, split the tree left,
1433 * but it's not. Be my guest.)
1434 */
1435 if (nextbn == 0 && split->index == sp->header.nextindex) {
1436 /* linelock header + stbl (first slot) of new page */
1437 rlv = & rdtlck->lv[rdtlck->index];
1438 rlv->offset = 0;
1439 rlv->length = 2;
1440 rdtlck->index++;
1441
1442 /*
1443 * initialize freelist of new right page
1444 */
1445 f = &rp->slot[fsi];
1446 for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1447 f->next = fsi;
1448 f->next = -1;
1449
1450 /* insert entry at the first entry of the new right page */
1451 dtInsertEntry(rp, 0, split->key, split->data, &rdtlck);
1452
1453 goto out;
1454 }
1455
1456 /*
1457 * non-sequential insert (at possibly middle page)
1458 */
1459
1460 /*
1461 * update prev pointer of previous right sibling page;
1462 */
1463 if (nextbn != 0) {
1464 DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1465 if (rc) {
1466 discard_metapage(rmp);
1467 return rc;
1468 }
1469
1470 BT_MARK_DIRTY(mp, ip);
1471 /*
1472 * acquire a transaction lock on the next page
1473 */
1474 tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
1475 jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p",
1476 tlck, ip, mp);
1477 dtlck = (struct dt_lock *) & tlck->lock;
1478
1479 /* linelock header of previous right sibling page */
1480 lv = & dtlck->lv[dtlck->index];
1481 lv->offset = 0;
1482 lv->length = 1;
1483 dtlck->index++;
1484
1485 p->header.prev = cpu_to_le64(rbn);
1486
1487 DT_PUTPAGE(mp);
1488 }
1489
1490 /*
1491 * split the data between the split and right pages.
1492 */
1493 skip = split->index;
1494 half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */
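/* i.e., 64 of the 128 slots of a 4 KByte page */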
1495 left = 0;
1496
1497 /*
1498 * compute fill factor for split pages
1499 *
1500 * <nxt> traces the next entry to move to rp
1501 * <off> traces the next entry to stay in sp
1502 */
1503 stbl = (u8 *) & sp->slot[sp->header.stblindex];
1504 nextindex = sp->header.nextindex;
1505 for (nxt = off = 0; nxt < nextindex; ++off) {
1506 if (off == skip)
1507 /* check for fill factor with new entry size */
1508 n = split->nslot;
1509 else {
1510 si = stbl[nxt];
1511 switch (sp->header.flag & BT_TYPE) {
1512 case BT_LEAF:
1513 ldtentry = (struct ldtentry *) & sp->slot[si];
1514 if (DO_INDEX(ip))
1515 n = NDTLEAF(ldtentry->namlen);
1516 else
1517 n = NDTLEAF_LEGACY(ldtentry->
1518 namlen);
1519 break;
1520
1521 case BT_INTERNAL:
1522 idtentry = (struct idtentry *) & sp->slot[si];
1523 n = NDTINTERNAL(idtentry->namlen);
1524 break;
1525
1526 default:
1527 break;
1528 }
1529
1530 ++nxt; /* advance to next entry to move in sp */
1531 }
1532
1533 left += n;
1534 if (left >= half)
1535 break;
1536 }
1537
1538 /* <nxt> points to the 1st entry to move */
1539
1540 /*
1541 * move entries to right page
1542 *
1543 * dtMoveEntry() initializes rp and reserves entry for insertion
1544 *
1545 * split page moved out entries are linelocked;
1546 * new/right page moved in entries are linelocked;
1547 */
1548 /* linelock header + stbl of new right page */
1549 rlv = & rdtlck->lv[rdtlck->index];
1550 rlv->offset = 0;
1551 rlv->length = 5;
1552 rdtlck->index++;
1553
1554 dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip));
1555
1556 sp->header.nextindex = nxt;
1557
1558 /*
1559 * finalize freelist of new right page
1560 */
1561 fsi = rp->header.freelist;
1562 f = &rp->slot[fsi];
1563 for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1564 f->next = fsi;
1565 f->next = -1;
1566
1567 /*
1568 * Update directory index table for entries now in right page
1569 */
1570 if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1571 s64 lblock;
1572
1573 mp = NULL;
1574 stbl = DT_GETSTBL(rp);
1575 for (n = 0; n < rp->header.nextindex; n++) {
1576 ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1577 modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1578 rbn, n, &mp, &lblock);
1579 }
1580 if (mp)
1581 release_metapage(mp);
1582 }
1583
1584 /*
1585 * the skipped index was on the left page,
1586 */
1587 if (skip <= off) {
1588 /* insert the new entry in the split page */
1589 dtInsertEntry(sp, skip, split->key, split->data, &sdtlck);
1590
1591 /* linelock stbl of split page */
1592 if (sdtlck->index >= sdtlck->maxcnt)
1593 sdtlck = (struct dt_lock *) txLinelock(sdtlck);
1594 slv = & sdtlck->lv[sdtlck->index];
1595 n = skip >> L2DTSLOTSIZE;
1596 slv->offset = sp->header.stblindex + n;
1597 slv->length =
1598 ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1;
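/* e.g. (illustrative, L2DTSLOTSIZE = 5): skip = 40 and nextindex = 70
 * give n = 40 >> 5 = 1 and length = (69 >> 5) - 1 + 1 = 2, i.e.
 * stbl slots 1..2 of the split page are linelocked.
 */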
1599 sdtlck->index++;
1600 }
1601 /*
1602 * the skipped index was on the right page,
1603 */
1604 else {
1605 /* adjust the skip index to reflect the new position */
1606 skip -= nxt;
1607
1608 /* insert the new entry in the right page */
1609 dtInsertEntry(rp, skip, split->key, split->data, &rdtlck);
1610 }
1611
1612 out:
1613 *rmpp = rmp;
1614 *rpxdp = *pxd;
1615
1616 return rc;
1617}
1618
1619
1620/*
1621 * dtExtendPage()
1622 *
1623 * function: extend 1st/only directory leaf page
1624 *
1625 * parameter:
1626 *
1627 * return: 0 - success;
1628 * errno - failure;
1629 * return extended page pinned;
1630 */
1631static int dtExtendPage(tid_t tid,
1632 struct inode *ip, struct dtsplit * split, struct btstack * btstack)
1633{
1634 struct super_block *sb = ip->i_sb;
1635 int rc;
1636 struct metapage *smp, *pmp, *mp;
1637 dtpage_t *sp, *pp;
1638 struct pxdlist *pxdlist;
1639 pxd_t *pxd, *tpxd;
1640 int xlen, xsize;
1641 int newstblindex, newstblsize;
1642 int oldstblindex, oldstblsize;
1643 int fsi, last;
1644 struct dtslot *f;
1645 struct btframe *parent;
1646 int n;
1647 struct dt_lock *dtlck;
1648 s64 xaddr, txaddr;
1649 struct tlock *tlck;
1650 struct pxd_lock *pxdlock;
1651 struct lv *lv;
1652 uint type;
1653 struct ldtentry *ldtentry;
1654 u8 *stbl;
1655
1656 /* get page to extend */
1657 smp = split->mp;
1658 sp = DT_PAGE(ip, smp);
1659
1660 /* get parent/root page */
1661 parent = BT_POP(btstack);
1662 DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc);
1663 if (rc)
1664 return (rc);
1665
1666 /*
1667 * extend the extent
1668 */
1669 pxdlist = split->pxdlist;
1670 pxd = &pxdlist->pxd[pxdlist->npxd];
1671 pxdlist->npxd++;
1672
1673 xaddr = addressPXD(pxd);
1674 tpxd = &sp->header.self;
1675 txaddr = addressPXD(tpxd);
1676 /* in-place extension */
1677 if (xaddr == txaddr) {
1678 type = tlckEXTEND;
1679 }
1680 /* relocation */
1681 else {
1682 type = tlckNEW;
1683
1684 /* save moved extent descriptor for later free */
1685 tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE);
1686 pxdlock = (struct pxd_lock *) & tlck->lock;
1687 pxdlock->flag = mlckFREEPXD;
1688 pxdlock->pxd = sp->header.self;
1689 pxdlock->index = 1;
1690
1691 /*
1692 * Update directory index table to reflect new page address
1693 */
1694 if (DO_INDEX(ip)) {
1695 s64 lblock;
1696
1697 mp = NULL;
1698 stbl = DT_GETSTBL(sp);
1699 for (n = 0; n < sp->header.nextindex; n++) {
1700 ldtentry =
1701 (struct ldtentry *) & sp->slot[stbl[n]];
1702 modify_index(tid, ip,
1703 le32_to_cpu(ldtentry->index),
1704 xaddr, n, &mp, &lblock);
1705 }
1706 if (mp)
1707 release_metapage(mp);
1708 }
1709 }
1710
1711 /*
1712 * extend the page
1713 */
1714 sp->header.self = *pxd;
1715
1716 jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp);
1717
1718 BT_MARK_DIRTY(smp, ip);
1719 /*
1720 * acquire a transaction lock on the extended/leaf page
1721 */
1722 tlck = txLock(tid, ip, smp, tlckDTREE | type);
1723 dtlck = (struct dt_lock *) & tlck->lock;
1724 lv = & dtlck->lv[0];
1725
1726 /* update buffer extent descriptor of extended page */
1727 xlen = lengthPXD(pxd);
1728 xsize = xlen << JFS_SBI(sb)->l2bsize;
1729#ifdef _STILL_TO_PORT
1730 bmSetXD(smp, xaddr, xsize);
1731#endif /* _STILL_TO_PORT */
1732
1733 /*
1734 * copy old stbl to new stbl at start of extended area
1735 */
1736 oldstblindex = sp->header.stblindex;
1737 oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE;
1738 newstblindex = sp->header.maxslot;
1739 n = xsize >> L2DTSLOTSIZE;
1740 newstblsize = (n + 31) >> L2DTSLOTSIZE;
1741 memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex],
1742 sp->header.nextindex);
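/* the stbl holds one byte per entry, so copying header.nextindex
 * bytes moves the entire sort table.
 */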
1743
1744 /*
1745 * in-line extension: linelock old area of extended page
1746 */
1747 if (type == tlckEXTEND) {
1748 /* linelock header */
1749 lv->offset = 0;
1750 lv->length = 1;
1751 dtlck->index++;
1752 lv++;
1753
1754 /* linelock new stbl of extended page */
1755 lv->offset = newstblindex;
1756 lv->length = newstblsize;
1757 }
1758 /*
1759 * relocation: linelock whole relocated area
1760 */
1761 else {
1762 lv->offset = 0;
1763 lv->length = sp->header.maxslot + newstblsize;
1764 }
1765
1766 dtlck->index++;
1767
1768 sp->header.maxslot = n;
1769 sp->header.stblindex = newstblindex;
1770 /* sp->header.nextindex remains the same */
1771
1772 /*
1773 * add old stbl region at head of freelist
1774 */
1775 fsi = oldstblindex;
1776 f = &sp->slot[fsi];
1777 last = sp->header.freelist;
1778 for (n = 0; n < oldstblsize; n++, fsi++, f++) {
1779 f->next = last;
1780 last = fsi;
1781 }
1782 sp->header.freelist = last;
1783 sp->header.freecnt += oldstblsize;
1784
1785 /*
1786 * append free region of newly extended area at tail of freelist
1787 */
1788 /* init free region of newly extended area */
1789 fsi = n = newstblindex + newstblsize;
1790 f = &sp->slot[fsi];
1791 for (fsi++; fsi < sp->header.maxslot; f++, fsi++)
1792 f->next = fsi;
1793 f->next = -1;
1794
1795 /* append new free region at tail of old freelist */
1796 fsi = sp->header.freelist;
1797 if (fsi == -1)
1798 sp->header.freelist = n;
1799 else {
1800 do {
1801 f = &sp->slot[fsi];
1802 fsi = f->next;
1803 } while (fsi != -1);
1804
1805 f->next = n;
1806 }
1807
1808 sp->header.freecnt += sp->header.maxslot - n;
1809
1810 /*
1811 * insert the new entry
1812 */
1813 dtInsertEntry(sp, split->index, split->key, split->data, &dtlck);
1814
1815 BT_MARK_DIRTY(pmp, ip);
1816 /*
1817 * linelock any freeslots residing in old extent
1818 */
1819 if (type == tlckEXTEND) {
1820 n = sp->header.maxslot >> 2;
1821 if (sp->header.freelist < n)
1822 dtLinelockFreelist(sp, n, &dtlck);
1823 }
1824
1825 /*
1826 * update parent entry on the parent/root page
1827 */
1828 /*
1829 * acquire a transaction lock on the parent/root page
1830 */
1831 tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
1832 dtlck = (struct dt_lock *) & tlck->lock;
1833 lv = & dtlck->lv[dtlck->index];
1834
1835 /* linelock parent entry - 1st slot */
1836 lv->offset = 1;
1837 lv->length = 1;
1838 dtlck->index++;
1839
1840 /* update the parent pxd for page extension */
1841 tpxd = (pxd_t *) & pp->slot[1];
1842 *tpxd = *pxd;
1843
1844 DT_PUTPAGE(pmp);
1845 return 0;
1846}
1847
1848
1849/*
1850 * dtSplitRoot()
1851 *
1852 * function:
1853 * split the full root page into
1854 * original/root/split page and new right page
1855 * i.e., root remains fixed in tree anchor (inode) and
1856 * the root is copied to a single new right child page
1857 * since root page << non-root page, and
1858 * the split root page contains a single entry for the
1859 * new right child page.
1860 *
1861 * parameter:
1862 *
1863 * return: 0 - success;
1864 * errno - failure;
1865 * return new page pinned;
1866 */
1867static int dtSplitRoot(tid_t tid,
1868 struct inode *ip, struct dtsplit * split, struct metapage ** rmpp)
1869{
1870 struct super_block *sb = ip->i_sb;
1871 struct metapage *smp;
1872 dtroot_t *sp;
1873 struct metapage *rmp;
1874 dtpage_t *rp;
1875 s64 rbn;
1876 int xlen;
1877 int xsize;
1878 struct dtslot *f;
1879 s8 *stbl;
1880 int fsi, stblsize, n;
1881 struct idtentry *s;
1882 pxd_t *ppxd;
1883 struct pxdlist *pxdlist;
1884 pxd_t *pxd;
1885 struct dt_lock *dtlck;
1886 struct tlock *tlck;
1887 struct lv *lv;
1888
1889 /* get split root page */
1890 smp = split->mp;
1891 sp = &JFS_IP(ip)->i_dtroot;
1892
1893 /*
1894 * allocate/initialize a single (right) child page
1895 *
1896 * N.B. at the first split, an extent of one (or two) blocks, just
1897 * big enough to fit the new entry, is allocated; at subsequent splits, a full page is allocated;
1898 */
1899 pxdlist = split->pxdlist;
1900 pxd = &pxdlist->pxd[pxdlist->npxd];
1901 pxdlist->npxd++;
1902 rbn = addressPXD(pxd);
1903 xlen = lengthPXD(pxd);
1904 xsize = xlen << JFS_SBI(sb)->l2bsize;
1905 rmp = get_metapage(ip, rbn, xsize, 1);
1906 if (!rmp)
1907 return -EIO;
1908
1909 rp = rmp->data;
1910
1911 /* Allocate blocks to quota. */
1912 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
1913 release_metapage(rmp);
1914 return -EDQUOT;
1915 }
1916
1917 BT_MARK_DIRTY(rmp, ip);
1918 /*
1919 * acquire a transaction lock on the new right page
1920 */
1921 tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW);
1922 dtlck = (struct dt_lock *) & tlck->lock;
1923
1924 rp->header.flag =
1925 (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1926 rp->header.self = *pxd;
1927
1928 /* initialize sibling pointers */
1929 rp->header.next = 0;
1930 rp->header.prev = 0;
1931
1932 /*
1933 * move in-line root page into new right page extent
1934 */
1935 /* linelock header + copied entries + new stbl (1st slot) in new page */
1936 ASSERT(dtlck->index == 0);
1937 lv = & dtlck->lv[0];
1938 lv->offset = 0;
1939 lv->length = 10; /* 1 header slot + 8 data slots + 1 stbl slot */
1940 dtlck->index++;
1941
1942 n = xsize >> L2DTSLOTSIZE;
1943 rp->header.maxslot = n;
1944 stblsize = (n + 31) >> L2DTSLOTSIZE;
1945
1946 /* copy old stbl to new stbl at start of extended area */
1947 rp->header.stblindex = DTROOTMAXSLOT;
1948 stbl = (s8 *) & rp->slot[DTROOTMAXSLOT];
1949 memcpy(stbl, sp->header.stbl, sp->header.nextindex);
1950 rp->header.nextindex = sp->header.nextindex;
1951
1952 /* copy old data area to start of new data area */
1953 memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE);
1954
1955 /*
1956 * append free region of newly extended area at tail of freelist
1957 */
1958 /* init free region of newly extended area */
1959 fsi = n = DTROOTMAXSLOT + stblsize;
1960 f = &rp->slot[fsi];
1961 for (fsi++; fsi < rp->header.maxslot; f++, fsi++)
1962 f->next = fsi;
1963 f->next = -1;
1964
1965 /* append new free region at tail of old freelist */
1966 fsi = sp->header.freelist;
1967 if (fsi == -1)
1968 rp->header.freelist = n;
1969 else {
1970 rp->header.freelist = fsi;
1971
1972 do {
1973 f = &rp->slot[fsi];
1974 fsi = f->next;
1975 } while (fsi != -1);
1976
1977 f->next = n;
1978 }
1979
1980 rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n;
1981
1982 /*
1983 * Update directory index table for entries now in right page
1984 */
1985 if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) {
1986 s64 lblock;
1987 struct metapage *mp = NULL;
1988 struct ldtentry *ldtentry;
1989
1990 stbl = DT_GETSTBL(rp);
1991 for (n = 0; n < rp->header.nextindex; n++) {
1992 ldtentry = (struct ldtentry *) & rp->slot[stbl[n]];
1993 modify_index(tid, ip, le32_to_cpu(ldtentry->index),
1994 rbn, n, &mp, &lblock);
1995 }
1996 if (mp)
1997 release_metapage(mp);
1998 }
1999 /*
2000 * insert the new entry into the new right/child page
2001 * (skip index in the new right page will not change)
2002 */
2003 dtInsertEntry(rp, split->index, split->key, split->data, &dtlck);
2004
2005 /*
2006 * reset parent/root page
2007 *
2008 * set the 1st entry offset to 0, which forces the left-most key
2009 * at any level of the tree to be less than any search key.
2010 *
2011 * The btree comparison code guarantees that the left-most key on any
2012 * level of the tree is never used, so it doesn't need to be filled in.
2013 */
2014 BT_MARK_DIRTY(smp, ip);
2015 /*
2016 * acquire a transaction lock on the root page (in-memory inode)
2017 */
2018 tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT);
2019 dtlck = (struct dt_lock *) & tlck->lock;
2020
2021 /* linelock root */
2022 ASSERT(dtlck->index == 0);
2023 lv = & dtlck->lv[0];
2024 lv->offset = 0;
2025 lv->length = DTROOTMAXSLOT;
2026 dtlck->index++;
2027
2028 /* update page header of root */
2029 if (sp->header.flag & BT_LEAF) {
2030 sp->header.flag &= ~BT_LEAF;
2031 sp->header.flag |= BT_INTERNAL;
2032 }
2033
2034 /* init the first entry */
2035 s = (struct idtentry *) & sp->slot[DTENTRYSTART];
2036 ppxd = (pxd_t *) s;
2037 *ppxd = *pxd;
2038 s->next = -1;
2039 s->namlen = 0;
2040
2041 stbl = sp->header.stbl;
2042 stbl[0] = DTENTRYSTART;
2043 sp->header.nextindex = 1;
2044
2045 /* init freelist */
2046 fsi = DTENTRYSTART + 1;
2047 f = &sp->slot[fsi];
2048
2049 /* init free region of remaining area */
2050 for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2051 f->next = fsi;
2052 f->next = -1;
2053
2054 sp->header.freelist = DTENTRYSTART + 1;
2055 sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1);
2056
2057 *rmpp = rmp;
2058
2059 return 0;
2060}
2061
2062
2063/*
2064 * dtDelete()
2065 *
2066 * function: delete the entry(s) referenced by a key.
2067 *
2068 * parameter:
2069 *
2070 * return:
2071 */
2072int dtDelete(tid_t tid,
2073 struct inode *ip, struct component_name * key, ino_t * ino, int flag)
2074{
2075 int rc = 0;
2076 s64 bn;
2077 struct metapage *mp, *imp;
2078 dtpage_t *p;
2079 int index;
2080 struct btstack btstack;
2081 struct dt_lock *dtlck;
2082 struct tlock *tlck;
2083 struct lv *lv;
2084 int i;
2085 struct ldtentry *ldtentry;
2086 u8 *stbl;
2087 u32 table_index, next_index;
2088 struct metapage *nmp;
2089 dtpage_t *np;
2090
2091 /*
2092 * search for the entry to delete:
2093 *
2094 * dtSearch() returns (leaf page pinned, index at which to delete).
2095 */
2096 if ((rc = dtSearch(ip, key, ino, &btstack, flag)))
2097 return rc;
2098
2099 /* retrieve search result */
2100 DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2101
2102 /*
2103 * We need to put the index of the next entry into the
2104 * directory index table in order to resume a readdir from this
2105 * entry.
2106 */
2107 if (DO_INDEX(ip)) {
2108 stbl = DT_GETSTBL(p);
2109 ldtentry = (struct ldtentry *) & p->slot[stbl[index]];
2110 table_index = le32_to_cpu(ldtentry->index);
2111 if (index == (p->header.nextindex - 1)) {
2112 /*
2113 * Last entry in this leaf page
2114 */
2115 if ((p->header.flag & BT_ROOT)
2116 || (p->header.next == 0))
2117 next_index = -1;
2118 else {
2119 /* Read next leaf page */
2120 DT_GETPAGE(ip, le64_to_cpu(p->header.next),
2121 nmp, PSIZE, np, rc);
2122 if (rc)
2123 next_index = -1;
2124 else {
2125 stbl = DT_GETSTBL(np);
2126 ldtentry =
2127 (struct ldtentry *) & np->
2128 slot[stbl[0]];
2129 next_index =
2130 le32_to_cpu(ldtentry->index);
2131 DT_PUTPAGE(nmp);
2132 }
2133 }
2134 } else {
2135 ldtentry =
2136 (struct ldtentry *) & p->slot[stbl[index + 1]];
2137 next_index = le32_to_cpu(ldtentry->index);
2138 }
2139 free_index(tid, ip, table_index, next_index);
2140 }
2141 /*
2142 * if the leaf page becomes empty, delete the page
2143 */
2144 if (p->header.nextindex == 1) {
2145 /* delete empty page */
2146 rc = dtDeleteUp(tid, ip, mp, p, &btstack);
2147 }
2148 /*
2149 * the leaf page has other entries remaining:
2150 *
2151 * delete the entry from the leaf page.
2152 */
2153 else {
2154 BT_MARK_DIRTY(mp, ip);
2155 /*
2156 * acquire a transaction lock on the leaf page
2157 */
2158 tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2159 dtlck = (struct dt_lock *) & tlck->lock;
2160
2161 /*
2162 * Do not assume that dtlck->index will be zero. During a
2163 * rename within a directory, this transaction may have
2164 * modified this page already when adding the new entry.
2165 */
2166
2167 /* linelock header */
2168 if (dtlck->index >= dtlck->maxcnt)
2169 dtlck = (struct dt_lock *) txLinelock(dtlck);
2170 lv = & dtlck->lv[dtlck->index];
2171 lv->offset = 0;
2172 lv->length = 1;
2173 dtlck->index++;
2174
2175 /* linelock stbl of non-root leaf page */
2176 if (!(p->header.flag & BT_ROOT)) {
2177 if (dtlck->index >= dtlck->maxcnt)
2178 dtlck = (struct dt_lock *) txLinelock(dtlck);
2179 lv = & dtlck->lv[dtlck->index];
2180 i = index >> L2DTSLOTSIZE;
2181 lv->offset = p->header.stblindex + i;
2182 lv->length =
2183 ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2184 i + 1;
2185 dtlck->index++;
2186 }
2187
2188 /* free the leaf entry */
2189 dtDeleteEntry(p, index, &dtlck);
2190
2191 /*
2192 * Update directory index table for entries moved in stbl
2193 */
2194 if (DO_INDEX(ip) && index < p->header.nextindex) {
2195 s64 lblock;
2196
2197 imp = NULL;
2198 stbl = DT_GETSTBL(p);
2199 for (i = index; i < p->header.nextindex; i++) {
2200 ldtentry =
2201 (struct ldtentry *) & p->slot[stbl[i]];
2202 modify_index(tid, ip,
2203 le32_to_cpu(ldtentry->index),
2204 bn, i, &imp, &lblock);
2205 }
2206 if (imp)
2207 release_metapage(imp);
2208 }
2209
2210 DT_PUTPAGE(mp);
2211 }
2212
2213 return rc;
2214}
2215
2216
2217/*
2218 * dtDeleteUp()
2219 *
2220 * function:
2221 * free empty pages as propagating deletion up the tree
2222 *
2223 * parameter:
2224 *
2225 * return:
2226 */
2227static int dtDeleteUp(tid_t tid, struct inode *ip,
2228 struct metapage * fmp, dtpage_t * fp, struct btstack * btstack)
2229{
2230 int rc = 0;
2231 struct metapage *mp;
2232 dtpage_t *p;
2233 int index, nextindex;
2234 int xlen;
2235 struct btframe *parent;
2236 struct dt_lock *dtlck;
2237 struct tlock *tlck;
2238 struct lv *lv;
2239 struct pxd_lock *pxdlock;
2240 int i;
2241
2242 /*
2243 * keep the root leaf page which has become empty
2244 */
2245 if (BT_IS_ROOT(fmp)) {
2246 /*
2247 * reset the root
2248 *
2249 * dtInitRoot() acquires txlock on the root
2250 */
2251 dtInitRoot(tid, ip, PARENT(ip));
2252
2253 DT_PUTPAGE(fmp);
2254
2255 return 0;
2256 }
2257
2258 /*
2259 * free the non-root leaf page
2260 */
2261 /*
2262 * acquire a transaction lock on the page
2263 *
2264 * write FREEXTENT|NOREDOPAGE log record
2265 * N.B. linelock is overlaid as freed extent descriptor, and
2266 * the buffer page is freed;
2267 */
2268 tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2269 pxdlock = (struct pxd_lock *) & tlck->lock;
2270 pxdlock->flag = mlckFREEPXD;
2271 pxdlock->pxd = fp->header.self;
2272 pxdlock->index = 1;
2273
2274 /* update sibling pointers */
2275 if ((rc = dtRelink(tid, ip, fp))) {
2276 BT_PUTPAGE(fmp);
2277 return rc;
2278 }
2279
2280 xlen = lengthPXD(&fp->header.self);
2281
2282 /* Free quota allocation. */
2283 DQUOT_FREE_BLOCK(ip, xlen);
2284
2285 /* free/invalidate its buffer page */
2286 discard_metapage(fmp);
2287
2288 /*
2289 * propagate page deletion up the directory tree
2290 *
2291 * If the delete from the parent page makes it empty,
2292 * continue all the way up the tree.
2293 * stop if the root page is reached (which is never deleted) or
2294 * if the entry deletion does not empty the page.
2295 */
2296 while ((parent = BT_POP(btstack)) != NULL) {
2297 /* pin the parent page <sp> */
2298 DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2299 if (rc)
2300 return rc;
2301
2302 /*
2303 * free the extent of the child page deleted
2304 */
2305 index = parent->index;
2306
2307 /*
2308 * delete the entry for the child page from parent
2309 */
2310 nextindex = p->header.nextindex;
2311
2312 /*
2313 * the parent has the single entry being deleted:
2314 *
2315 * free the parent page which has become empty.
2316 */
2317 if (nextindex == 1) {
2318 /*
2319 * keep the root internal page which has become empty
2320 */
2321 if (p->header.flag & BT_ROOT) {
2322 /*
2323 * reset the root
2324 *
2325 * dtInitRoot() acquires txlock on the root
2326 */
2327 dtInitRoot(tid, ip, PARENT(ip));
2328
2329 DT_PUTPAGE(mp);
2330
2331 return 0;
2332 }
2333 /*
2334 * free the parent page
2335 */
2336 else {
2337 /*
2338 * acquire a transaction lock on the page
2339 *
2340 * write FREEXTENT|NOREDOPAGE log record
2341 */
2342 tlck =
2343 txMaplock(tid, ip,
2344 tlckDTREE | tlckFREE);
2345 pxdlock = (struct pxd_lock *) & tlck->lock;
2346 pxdlock->flag = mlckFREEPXD;
2347 pxdlock->pxd = p->header.self;
2348 pxdlock->index = 1;
2349
2350 /* update sibling pointers */
2351 if ((rc = dtRelink(tid, ip, p))) {
2352 DT_PUTPAGE(mp);
2353 return rc;
2354 }
2355
2356 xlen = lengthPXD(&p->header.self);
2357
2358 /* Free quota allocation */
2359 DQUOT_FREE_BLOCK(ip, xlen);
2360
2361 /* free/invalidate its buffer page */
2362 discard_metapage(mp);
2363
2364 /* propagate up */
2365 continue;
2366 }
2367 }
2368
2369 /*
2370 * the parent has other entries remaining:
2371 *
2372 * delete the router entry from the parent page.
2373 */
2374 BT_MARK_DIRTY(mp, ip);
2375 /*
2376 * acquire a transaction lock on the page
2377 *
2378 * action: router entry deletion
2379 */
2380 tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
2381 dtlck = (struct dt_lock *) & tlck->lock;
2382
2383 /* linelock header */
2384 if (dtlck->index >= dtlck->maxcnt)
2385 dtlck = (struct dt_lock *) txLinelock(dtlck);
2386 lv = & dtlck->lv[dtlck->index];
2387 lv->offset = 0;
2388 lv->length = 1;
2389 dtlck->index++;
2390
2391 /* linelock stbl of non-root leaf page */
2392 if (!(p->header.flag & BT_ROOT)) {
2393 if (dtlck->index < dtlck->maxcnt)
2394 lv++;
2395 else {
2396 dtlck = (struct dt_lock *) txLinelock(dtlck);
2397 lv = & dtlck->lv[0];
2398 }
2399 i = index >> L2DTSLOTSIZE;
2400 lv->offset = p->header.stblindex + i;
2401 lv->length =
2402 ((p->header.nextindex - 1) >> L2DTSLOTSIZE) -
2403 i + 1;
2404 dtlck->index++;
2405 }
2406
2407 /* free the router entry */
2408 dtDeleteEntry(p, index, &dtlck);
2409
2410 /* reset key of new leftmost entry of level (for consistency) */
2411 if (index == 0 &&
2412 ((p->header.flag & BT_ROOT) || p->header.prev == 0))
2413 dtTruncateEntry(p, 0, &dtlck);
2414
2415 /* unpin the parent page */
2416 DT_PUTPAGE(mp);
2417
2418 /* exit propagation up */
2419 break;
2420 }
2421
2422 return 0;
2423}
2424
2425#ifdef _NOTYET
2426/*
2427 * NAME: dtRelocate()
2428 *
2429 * FUNCTION: relocate dtpage (internal or leaf) of directory;
2430 * This function is mainly used by defragfs utility.
2431 */
2432int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd,
2433 s64 nxaddr)
2434{
2435 int rc = 0;
2436 struct metapage *mp, *pmp, *lmp, *rmp;
2437 dtpage_t *p, *pp, *rp = 0, *lp= 0;
2438 s64 bn;
2439 int index;
2440 struct btstack btstack;
2441 pxd_t *pxd;
2442 s64 oxaddr, nextbn, prevbn;
2443 int xlen, xsize;
2444 struct tlock *tlck;
2445 struct dt_lock *dtlck;
2446 struct pxd_lock *pxdlock;
2447 s8 *stbl;
2448 struct lv *lv;
2449
2450 oxaddr = addressPXD(opxd);
2451 xlen = lengthPXD(opxd);
2452
2453 jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d",
2454 (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr,
2455 xlen);
2456
2457 /*
2458 * 1. get the internal parent dtpage covering
2459 * router entry for the target page to be relocated;
2460 */
2461 rc = dtSearchNode(ip, lmxaddr, opxd, &btstack);
2462 if (rc)
2463 return rc;
2464
2465 /* retrieve search result */
2466 DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2467 jfs_info("dtRelocate: parent router entry validated.");
2468
2469 /*
2470 * 2. relocate the target dtpage
2471 */
2472 /* read in the target page from src extent */
2473 DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2474 if (rc) {
2475 /* release the pinned parent page */
2476 DT_PUTPAGE(pmp);
2477 return rc;
2478 }
2479
2480 /*
2481 * read in sibling pages if any to update sibling pointers;
2482 */
2483 rmp = NULL;
2484 if (p->header.next) {
2485 nextbn = le64_to_cpu(p->header.next);
2486 DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2487 if (rc) {
2488 DT_PUTPAGE(mp);
2489 DT_PUTPAGE(pmp);
2490 return (rc);
2491 }
2492 }
2493
2494 lmp = NULL;
2495 if (p->header.prev) {
2496 prevbn = le64_to_cpu(p->header.prev);
2497 DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2498 if (rc) {
2499 DT_PUTPAGE(mp);
2500 DT_PUTPAGE(pmp);
2501 if (rmp)
2502 DT_PUTPAGE(rmp);
2503 return (rc);
2504 }
2505 }
2506
2507 /* at this point, all dtpages to be updated are in memory */
2508
2509 /*
2510 * update sibling pointers of sibling dtpages if any;
2511 */
2512 if (lmp) {
2513 tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK);
2514 dtlck = (struct dt_lock *) & tlck->lock;
2515 /* linelock header */
2516 ASSERT(dtlck->index == 0);
2517 lv = & dtlck->lv[0];
2518 lv->offset = 0;
2519 lv->length = 1;
2520 dtlck->index++;
2521
2522 lp->header.next = cpu_to_le64(nxaddr);
2523 DT_PUTPAGE(lmp);
2524 }
2525
2526 if (rmp) {
2527 tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK);
2528 dtlck = (struct dt_lock *) & tlck->lock;
2529 /* linelock header */
2530 ASSERT(dtlck->index == 0);
2531 lv = & dtlck->lv[0];
2532 lv->offset = 0;
2533 lv->length = 1;
2534 dtlck->index++;
2535
2536 rp->header.prev = cpu_to_le64(nxaddr);
2537 DT_PUTPAGE(rmp);
2538 }
2539
2540 /*
2541 * update the target dtpage to be relocated
2542 *
2543 * write LOG_REDOPAGE of LOG_NEW type for dst page
2544 * for the whole target page (logredo() will apply
2545 * after image and update bmap for allocation of the
2546 * dst extent), and update bmap for allocation of
2547 * the dst extent;
2548 */
2549 tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW);
2550 dtlck = (struct dt_lock *) & tlck->lock;
2551 /* linelock header */
2552 ASSERT(dtlck->index == 0);
2553 lv = & dtlck->lv[0];
2554
2555 /* update the self address in the dtpage header */
2556 pxd = &p->header.self;
2557 PXDaddress(pxd, nxaddr);
2558
2559 /* the dst page is the same as the src page, i.e.,
2560 * linelock for afterimage of the whole page;
2561 */
2562 lv->offset = 0;
2563 lv->length = p->header.maxslot;
2564 dtlck->index++;
2565
2566 /* update the buffer extent descriptor of the dtpage */
2567 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2568#ifdef _STILL_TO_PORT
2569 bmSetXD(mp, nxaddr, xsize);
2570#endif /* _STILL_TO_PORT */
2571 /* unpin the relocated page */
2572 DT_PUTPAGE(mp);
2573 jfs_info("dtRelocate: target dtpage relocated.");
2574
2575 /* since the moved extent is a dtpage, a LOG_NOREDOPAGE log rec
2576 * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec
2577 * will also force a bmap update ).
2578 */
2579
2580 /*
2581 * 3. acquire maplock for the source extent to be freed;
2582 */
2583 /* for dtpage relocation, write a LOG_NOREDOPAGE record
2584 * for the source dtpage (logredo() will init NoRedoPage
2585 * filter and will also update bmap for free of the source
2586 * dtpage), and update bmap for free of the source dtpage;
2587 */
2588 tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE);
2589 pxdlock = (struct pxd_lock *) & tlck->lock;
2590 pxdlock->flag = mlckFREEPXD;
2591 PXDaddress(&pxdlock->pxd, oxaddr);
2592 PXDlength(&pxdlock->pxd, xlen);
2593 pxdlock->index = 1;
2594
2595 /*
2596 * 4. update the parent router entry for relocation;
2597 *
2598 * acquire tlck for the parent entry covering the target dtpage;
2599 * write LOG_REDOPAGE to apply after image only;
2600 */
2601 jfs_info("dtRelocate: update parent router entry.");
2602 tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY);
2603 dtlck = (struct dt_lock *) & tlck->lock;
2604 lv = & dtlck->lv[dtlck->index];
2605
2606 /* update the PXD with the new address */
2607 stbl = DT_GETSTBL(pp);
2608 pxd = (pxd_t *) & pp->slot[stbl[index]];
2609 PXDaddress(pxd, nxaddr);
2610 lv->offset = stbl[index];
2611 lv->length = 1;
2612 dtlck->index++;
2613
2614 /* unpin the parent dtpage */
2615 DT_PUTPAGE(pmp);
2616
2617 return rc;
2618}
2619
2620/*
2621 * NAME: dtSearchNode()
2622 *
2623 * FUNCTION: Search for a dtpage containing a specified address
2624 * This function is mainly used by defragfs utility.
2625 *
2626 * NOTE: Search result on stack, the found page is pinned at exit.
2627 * The result page must be an internal dtpage.
2628 * lmxaddr gives the address of the leftmost page of the
2629 * dtree level in which the required dtpage resides.
2630 */
2631static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd,
2632 struct btstack * btstack)
2633{
2634 int rc = 0;
2635 s64 bn;
2636 struct metapage *mp;
2637 dtpage_t *p;
2638 int psize = 288; /* initial in-line directory */
2639 s8 *stbl;
2640 int i;
2641 pxd_t *pxd;
2642 struct btframe *btsp;
2643
2644 BT_CLR(btstack); /* reset stack */
2645
2646 /*
2647 * descend tree to the level with specified leftmost page
2648 *
2649 * by convention, root bn = 0.
2650 */
2651 for (bn = 0;;) {
2652 /* get/pin the page to search */
2653 DT_GETPAGE(ip, bn, mp, psize, p, rc);
2654 if (rc)
2655 return rc;
2656
2657 /* does the xaddr of the leftmost page of the level
2658 * match the level search key ?
2659 */
2660 if (p->header.flag & BT_ROOT) {
2661 if (lmxaddr == 0)
2662 break;
2663 } else if (addressPXD(&p->header.self) == lmxaddr)
2664 break;
2665
2666 /*
2667 * descend down to leftmost child page
2668 */
2669 if (p->header.flag & BT_LEAF) {
2670 DT_PUTPAGE(mp);
2671 return -ESTALE;
2672 }
2673
2674 /* get the leftmost entry */
2675 stbl = DT_GETSTBL(p);
2676 pxd = (pxd_t *) & p->slot[stbl[0]];
2677
2678 /* get the child page block address */
2679 bn = addressPXD(pxd);
2680 psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
2681 /* unpin the parent page */
2682 DT_PUTPAGE(mp);
2683 }
2684
2685 /*
2686 * search each page at the current level
2687 */
2688 loop:
2689 stbl = DT_GETSTBL(p);
2690 for (i = 0; i < p->header.nextindex; i++) {
2691 pxd = (pxd_t *) & p->slot[stbl[i]];
2692
2693 /* found the specified router entry */
2694 if (addressPXD(pxd) == addressPXD(kpxd) &&
2695 lengthPXD(pxd) == lengthPXD(kpxd)) {
2696 btsp = btstack->top;
2697 btsp->bn = bn;
2698 btsp->index = i;
2699 btsp->mp = mp;
2700
2701 return 0;
2702 }
2703 }
2704
2705 /* get the right sibling page if any */
2706 if (p->header.next)
2707 bn = le64_to_cpu(p->header.next);
2708 else {
2709 DT_PUTPAGE(mp);
2710 return -ESTALE;
2711 }
2712
2713 /* unpin current page */
2714 DT_PUTPAGE(mp);
2715
2716 /* get the right sibling page */
2717 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2718 if (rc)
2719 return rc;
2720
2721 goto loop;
2722}
2723#endif /* _NOTYET */
2724
2725/*
2726 * dtRelink()
2727 *
2728 * function:
2729 * link around a freed page.
2730 *
2731 * parameter:
2732 * p: page to be freed
2733 *
2734 * return:
2735 */
2736static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p)
2737{
2738 int rc;
2739 struct metapage *mp;
2740 s64 nextbn, prevbn;
2741 struct tlock *tlck;
2742 struct dt_lock *dtlck;
2743 struct lv *lv;
2744
2745 nextbn = le64_to_cpu(p->header.next);
2746 prevbn = le64_to_cpu(p->header.prev);
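/* schematically: [prev] <-> [p] <-> [next] becomes [prev] <-> [next];
 * each sibling is patched under its own tlckRELINK transaction lock
 * below.
 */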
2747
2748 /* update prev pointer of the next page */
2749 if (nextbn != 0) {
2750 DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
2751 if (rc)
2752 return rc;
2753
2754 BT_MARK_DIRTY(mp, ip);
2755 /*
2756 * acquire a transaction lock on the next page
2757 *
2758 * action: update prev pointer;
2759 */
2760 tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2761 jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2762 tlck, ip, mp);
2763 dtlck = (struct dt_lock *) & tlck->lock;
2764
2765 /* linelock header */
2766 if (dtlck->index >= dtlck->maxcnt)
2767 dtlck = (struct dt_lock *) txLinelock(dtlck);
2768 lv = & dtlck->lv[dtlck->index];
2769 lv->offset = 0;
2770 lv->length = 1;
2771 dtlck->index++;
2772
2773 p->header.prev = cpu_to_le64(prevbn);
2774 DT_PUTPAGE(mp);
2775 }
2776
2777 /* update next pointer of the previous page */
2778 if (prevbn != 0) {
2779 DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
2780 if (rc)
2781 return rc;
2782
2783 BT_MARK_DIRTY(mp, ip);
2784 /*
2785 * acquire a transaction lock on the prev page
2786 *
2787 * action: update next pointer;
2788 */
2789 tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK);
2790 jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p",
2791 tlck, ip, mp);
2792 dtlck = (struct dt_lock *) & tlck->lock;
2793
2794 /* linelock header */
2795 if (dtlck->index >= dtlck->maxcnt)
2796 dtlck = (struct dt_lock *) txLinelock(dtlck);
2797 lv = & dtlck->lv[dtlck->index];
2798 lv->offset = 0;
2799 lv->length = 1;
2800 dtlck->index++;
2801
2802 p->header.next = cpu_to_le64(nextbn);
2803 DT_PUTPAGE(mp);
2804 }
2805
2806 return 0;
2807}
2808
2809
2810/*
2811 * dtInitRoot()
2812 *
2813 * initialize directory root (inline in inode)
2814 */
2815void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
2816{
2817 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2818 dtroot_t *p;
2819 int fsi;
2820 struct dtslot *f;
2821 struct tlock *tlck;
2822 struct dt_lock *dtlck;
2823 struct lv *lv;
2824 u16 xflag_save;
2825
2826 /*
2827 * If this was previously a non-empty directory, we need to remove
2828 * the old directory table.
2829 */
2830 if (DO_INDEX(ip)) {
2831 if (!jfs_dirtable_inline(ip)) {
2832 struct tblock *tblk = tid_to_tblock(tid);
2833 /*
2834 * We're playing games with the tid's xflag. If
2835 * we're removing a regular file, the file's xtree
2836 * is committed with COMMIT_PMAP, but we always
2837 * commit the directories xtree with COMMIT_PWMAP.
2838 */
2839 xflag_save = tblk->xflag;
2840 tblk->xflag = 0;
2841 /*
2842 * xtTruncate isn't guaranteed to fully truncate
2843 * the xtree. The caller needs to check i_size
2844 * after committing the transaction to see if
2845 * additional truncation is needed. The
2846 * COMMIT_Stale flag tells caller that we
2847 * initiated the truncation.
2848 */
2849 xtTruncate(tid, ip, 0, COMMIT_PWMAP);
2850 set_cflag(COMMIT_Stale, ip);
2851
2852 tblk->xflag = xflag_save;
2853 } else
2854 ip->i_size = 1;
2855
2856 jfs_ip->next_index = 2;
2857 } else
2858 ip->i_size = IDATASIZE;
2859
2860 /*
2861 * acquire a transaction lock on the root
2862 *
2863 * action: directory initialization;
2864 */
2865 tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag,
2866 tlckDTREE | tlckENTRY | tlckBTROOT);
2867 dtlck = (struct dt_lock *) & tlck->lock;
2868
2869 /* linelock root */
2870 ASSERT(dtlck->index == 0);
2871 lv = & dtlck->lv[0];
2872 lv->offset = 0;
2873 lv->length = DTROOTMAXSLOT;
2874 dtlck->index++;
2875
2876 p = &jfs_ip->i_dtroot;
2877
2878 p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
2879
2880 p->header.nextindex = 0;
2881
2882 /* init freelist */
2883 fsi = 1;
2884 f = &p->slot[fsi];
2885
2886 /* init data area of root */
2887 for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++)
2888 f->next = fsi;
2889 f->next = -1;
2890
2891 p->header.freelist = 1;
2892 p->header.freecnt = 8;
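/* slot 0 is the header; with DTROOTMAXSLOT == 9 that leaves the
 * 8 data slots (1..8) chained on the freelist.
 */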
2893
2894 /* init '..' entry */
2895 p->header.idotdot = cpu_to_le32(idotdot);
2896
2897 return;
2898}
2899
2900/*
2901 * add_missing_indices()
2902 *
2903 * function: Fix a dtree page in which one or more entries have an invalid index.
2904 * fsck.jfs should really fix this, but it currently does not.
2905 * Called from jfs_readdir when a bad index is detected.
2906 */
2907static void add_missing_indices(struct inode *inode, s64 bn)
2908{
2909 struct ldtentry *d;
2910 struct dt_lock *dtlck;
2911 int i;
2912 uint index;
2913 struct lv *lv;
2914 struct metapage *mp;
2915 dtpage_t *p;
2916 int rc;
2917 s8 *stbl;
2918 tid_t tid;
2919 struct tlock *tlck;
2920
2921 tid = txBegin(inode->i_sb, 0);
2922
2923 DT_GETPAGE(inode, bn, mp, PSIZE, p, rc);
2924
2925 if (rc) {
2926 printk(KERN_ERR "DT_GETPAGE failed!\n");
2927 goto end;
2928 }
2929 BT_MARK_DIRTY(mp, inode);
2930
2931 ASSERT(p->header.flag & BT_LEAF);
2932
2933 tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY);
2934 dtlck = (struct dt_lock *) &tlck->lock;
2935
2936 stbl = DT_GETSTBL(p);
2937 for (i = 0; i < p->header.nextindex; i++) {
2938 d = (struct ldtentry *) &p->slot[stbl[i]];
2939 index = le32_to_cpu(d->index);
2940 if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
2941 d->index = cpu_to_le32(add_index(tid, inode, bn, i));
2942 if (dtlck->index >= dtlck->maxcnt)
2943 dtlck = (struct dt_lock *) txLinelock(dtlck);
2944 lv = &dtlck->lv[dtlck->index];
2945 lv->offset = stbl[i];
2946 lv->length = 1;
2947 dtlck->index++;
2948 }
2949 }
2950
2951 DT_PUTPAGE(mp);
2952 (void) txCommit(tid, 1, &inode, 0);
2953end:
2954 txEnd(tid);
2955}
2956
2957/*
2958 * Buffer to hold directory entry info while traversing a dtree page
2959 * before it is fed to the filldir function
2960 */
2961struct jfs_dirent {
2962 loff_t position;
2963 int ino;
2964 u16 name_len;
2965 char name[0];
2966};
2967
2968/*
2969 * function to advance to the next variable-sized jfs_dirent in the buffer
2970 */
2971static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
2972{
2973 return (struct jfs_dirent *)
2974 ((char *)dirent +
2975 ((sizeof (struct jfs_dirent) + dirent->name_len + 1 +
2976 sizeof (loff_t) - 1) &
2977 ~(sizeof (loff_t) - 1)));
2978}
2979
2980/*
2981 * jfs_readdir()
2982 *
2983 * function: read directory entries sequentially
2984 * from the specified entry offset
2985 *
2986 * parameter:
2987 *
2988 * return: offset = (pn, index) of start entry
2989 * of next jfs_readdir()/dtRead()
2990 */
2991int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
2992{
2993 struct inode *ip = filp->f_dentry->d_inode;
2994 struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
2995 int rc = 0;
2996 loff_t dtpos; /* legacy OS/2 style position */
2997 struct dtoffset {
2998 s16 pn;
2999 s16 index;
3000 s32 unused;
3001 } *dtoffset = (struct dtoffset *) &dtpos;
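/* the legacy position overlays the 64-bit f_pos with (pn, index)
 * halves; the overlay's field order is host-endianness dependent,
 * but consistent on a given host, since f_pos is always packed and
 * unpacked through this same struct (see also dtReadNext()).
 */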
3002 s64 bn;
3003 struct metapage *mp;
3004 dtpage_t *p;
3005 int index;
3006 s8 *stbl;
3007 struct btstack btstack;
3008 int i, next;
3009 struct ldtentry *d;
3010 struct dtslot *t;
3011 int d_namleft, len, outlen;
3012 unsigned long dirent_buf;
3013 char *name_ptr;
3014 u32 dir_index;
3015 int do_index = 0;
3016 uint loop_count = 0;
3017 struct jfs_dirent *jfs_dirent;
3018 int jfs_dirents;
3019 int overflow, fix_page, page_fixed = 0;
3020 static int unique_pos = 2; /* If we can't fix broken index */
3021
3022 if (filp->f_pos == DIREND)
3023 return 0;
3024
3025 if (DO_INDEX(ip)) {
3026 /*
3027 * persistent index is stored in directory entries.
3028 * Special cases: 0 = .
3029 * 1 = ..
3030 * -1 = End of directory
3031 */
3032 do_index = 1;
3033
3034 dir_index = (u32) filp->f_pos;
3035
3036 if (dir_index > 1) {
3037 struct dir_table_slot dirtab_slot;
3038
3039 if (dtEmpty(ip) ||
3040 (dir_index >= JFS_IP(ip)->next_index)) {
3041 /* Stale position. Directory has shrunk */
3042 filp->f_pos = DIREND;
3043 return 0;
3044 }
3045 repeat:
3046 rc = read_index(ip, dir_index, &dirtab_slot);
3047 if (rc) {
3048 filp->f_pos = DIREND;
3049 return rc;
3050 }
3051 if (dirtab_slot.flag == DIR_INDEX_FREE) {
3052 if (loop_count++ > JFS_IP(ip)->next_index) {
3053 jfs_err("jfs_readdir detected "
3054 "infinite loop!");
3055 filp->f_pos = DIREND;
3056 return 0;
3057 }
3058 dir_index = le32_to_cpu(dirtab_slot.addr2);
3059 if (dir_index == -1) {
3060 filp->f_pos = DIREND;
3061 return 0;
3062 }
3063 goto repeat;
3064 }
3065 bn = addressDTS(&dirtab_slot);
3066 index = dirtab_slot.slot;
3067 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3068 if (rc) {
3069 filp->f_pos = DIREND;
3070 return 0;
3071 }
3072 if (p->header.flag & BT_INTERNAL) {
3073 jfs_err("jfs_readdir: bad index table");
3074 DT_PUTPAGE(mp);
3075 filp->f_pos = -1;
3076 return 0;
3077 }
3078 } else {
3079 if (dir_index == 0) {
3080 /*
3081 * self "."
3082 */
3083 filp->f_pos = 0;
3084 if (filldir(dirent, ".", 1, 0, ip->i_ino,
3085 DT_DIR))
3086 return 0;
3087 }
3088 /*
3089 * parent ".."
3090 */
3091 filp->f_pos = 1;
3092 if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
3093 return 0;
3094
3095 /*
3096 * Find first entry of left-most leaf
3097 */
3098 if (dtEmpty(ip)) {
3099 filp->f_pos = DIREND;
3100 return 0;
3101 }
3102
3103 if ((rc = dtReadFirst(ip, &btstack)))
3104 return rc;
3105
3106 DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3107 }
3108 } else {
3109 /*
3110 * Legacy filesystem - OS/2 & Linux JFS < 0.3.6
3111 *
3112 * pn = index = 0: First entry "."
3113 * pn = 0; index = 1: Second entry ".."
3114 * pn > 0: Real entries, pn=1 -> leftmost page
3115 * pn = index = -1: No more entries
3116 */
3117 dtpos = filp->f_pos;
3118 if (dtpos == 0) {
3119 /* build "." entry */
3120
3121 if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
3122 DT_DIR))
3123 return 0;
3124 dtoffset->index = 1;
3125 filp->f_pos = dtpos;
3126 }
3127
3128 if (dtoffset->pn == 0) {
3129 if (dtoffset->index == 1) {
3130 /* build ".." entry */
3131
3132 if (filldir(dirent, "..", 2, filp->f_pos,
3133 PARENT(ip), DT_DIR))
3134 return 0;
3135 } else {
3136 jfs_err("jfs_readdir called with "
3137 "invalid offset!");
3138 }
3139 dtoffset->pn = 1;
3140 dtoffset->index = 0;
3141 filp->f_pos = dtpos;
3142 }
3143
3144 if (dtEmpty(ip)) {
3145 filp->f_pos = DIREND;
3146 return 0;
3147 }
3148
3149 if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
3150 jfs_err("jfs_readdir: unexpected rc = %d "
3151 "from dtReadNext", rc);
3152 filp->f_pos = DIREND;
3153 return 0;
3154 }
3155 /* get start leaf page and index */
3156 DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3157
3158 /* offset beyond directory eof ? */
3159 if (bn < 0) {
3160 filp->f_pos = DIREND;
3161 return 0;
3162 }
3163 }
3164
3165 dirent_buf = __get_free_page(GFP_KERNEL);
3166 if (dirent_buf == 0) {
3167 DT_PUTPAGE(mp);
3168 jfs_warn("jfs_readdir: __get_free_page failed!");
3169 filp->f_pos = DIREND;
3170 return -ENOMEM;
3171 }
3172
3173 while (1) {
3174 jfs_dirent = (struct jfs_dirent *) dirent_buf;
3175 jfs_dirents = 0;
3176 overflow = fix_page = 0;
3177
3178 stbl = DT_GETSTBL(p);
3179
3180 for (i = index; i < p->header.nextindex; i++) {
3181 d = (struct ldtentry *) & p->slot[stbl[i]];
3182
3183 if (((long) jfs_dirent + d->namlen + 1) >
3184 (dirent_buf + PSIZE)) {
3185 /* DBCS codepages could overrun dirent_buf */
3186 index = i;
3187 overflow = 1;
3188 break;
3189 }
3190
3191 d_namleft = d->namlen;
3192 name_ptr = jfs_dirent->name;
3193 jfs_dirent->ino = le32_to_cpu(d->inumber);
3194
3195 if (do_index) {
3196 len = min(d_namleft, DTLHDRDATALEN);
3197 jfs_dirent->position = le32_to_cpu(d->index);
3198 /*
3199 * d->index should always be valid, but it
3200 * isn't. fsck.jfs doesn't create the
3201 * directory index for the lost+found
3202 * directory. Rather than let it go,
3203 * we can try to fix it.
3204 */
3205 if ((jfs_dirent->position < 2) ||
3206 (jfs_dirent->position >=
3207 JFS_IP(ip)->next_index)) {
3208 if (!page_fixed && !isReadOnly(ip)) {
3209 fix_page = 1;
3210 /*
3211 * setting overflow and setting
3212 * index to i will cause the
3213 * same page to be processed
3214 * again starting here
3215 */
3216 overflow = 1;
3217 index = i;
3218 break;
3219 }
3220 jfs_dirent->position = unique_pos++;
3221 }
3222 } else {
3223 jfs_dirent->position = dtpos;
3224 len = min(d_namleft, DTLHDRDATALEN_LEGACY);
3225 }
3226
3227 /* copy the name of head/only segment */
3228 outlen = jfs_strfromUCS_le(name_ptr, d->name, len,
3229 codepage);
3230 jfs_dirent->name_len = outlen;
3231
3232 /* copy name in the additional segment(s) */
3233 next = d->next;
3234 while (next >= 0) {
3235 t = (struct dtslot *) & p->slot[next];
3236 name_ptr += outlen;
3237 d_namleft -= len;
3238 /* Sanity Check */
3239 if (d_namleft == 0) {
3240 jfs_error(ip->i_sb,
3241 "JFS:Dtree error: ino = "
3242 "%ld, bn=%Ld, index = %d",
3243 (long)ip->i_ino,
3244 (long long)bn,
3245 i);
3246 goto skip_one;
3247 }
3248 len = min(d_namleft, DTSLOTDATALEN);
3249 outlen = jfs_strfromUCS_le(name_ptr, t->name,
3250 len, codepage);
3251 jfs_dirent->name_len += outlen;
3252
3253 next = t->next;
3254 }
3255
3256 jfs_dirents++;
3257 jfs_dirent = next_jfs_dirent(jfs_dirent);
3258skip_one:
3259 if (!do_index)
3260 dtoffset->index++;
3261 }
3262
3263 if (!overflow) {
3264 /* Point to next leaf page */
3265 if (p->header.flag & BT_ROOT)
3266 bn = 0;
3267 else {
3268 bn = le64_to_cpu(p->header.next);
3269 index = 0;
3270 /* update offset (pn:index) for new page */
3271 if (!do_index) {
3272 dtoffset->pn++;
3273 dtoffset->index = 0;
3274 }
3275 }
3276 page_fixed = 0;
3277 }
3278
3279 /* unpin previous leaf page */
3280 DT_PUTPAGE(mp);
3281
3282 jfs_dirent = (struct jfs_dirent *) dirent_buf;
3283 while (jfs_dirents--) {
3284 filp->f_pos = jfs_dirent->position;
3285 if (filldir(dirent, jfs_dirent->name,
3286 jfs_dirent->name_len, filp->f_pos,
3287 jfs_dirent->ino, DT_UNKNOWN))
3288 goto out;
3289 jfs_dirent = next_jfs_dirent(jfs_dirent);
3290 }
3291
3292 if (fix_page) {
3293 add_missing_indices(ip, bn);
3294 page_fixed = 1;
3295 }
3296
3297 if (!overflow && (bn == 0)) {
3298 filp->f_pos = DIREND;
3299 break;
3300 }
3301
3302 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3303 if (rc) {
3304 free_page(dirent_buf);
3305 return rc;
3306 }
3307 }
3308
3309 out:
3310 free_page(dirent_buf);
3311
3312 return rc;
3313}
3314
3315
3316/*
3317 * dtReadFirst()
3318 *
3319 * function: get the leftmost page of the directory
3320 */
3321static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3322{
3323 int rc = 0;
3324 s64 bn;
3325 int psize = 288; /* initial in-line directory */
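/* illustrative arithmetic: 288 bytes = DTROOTMAXSLOT (9) slots of
 * 32 bytes, the size of the dtroot embedded in the inode.
 */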
3326 struct metapage *mp;
3327 dtpage_t *p;
3328 s8 *stbl;
3329 struct btframe *btsp;
3330 pxd_t *xd;
3331
3332 BT_CLR(btstack); /* reset stack */
3333
3334 /*
3335 * descend leftmost path of the tree
3336 *
3337 * by convention, root bn = 0.
3338 */
3339 for (bn = 0;;) {
3340 DT_GETPAGE(ip, bn, mp, psize, p, rc);
3341 if (rc)
3342 return rc;
3343
3344 /*
3345 * leftmost leaf page
3346 */
3347 if (p->header.flag & BT_LEAF) {
3348 /* return leftmost entry */
3349 btsp = btstack->top;
3350 btsp->bn = bn;
3351 btsp->index = 0;
3352 btsp->mp = mp;
3353
3354 return 0;
3355 }
3356
3357 /*
3358 * descend down to leftmost child page
3359 */
3360 if (BT_STACK_FULL(btstack)) {
3361 DT_PUTPAGE(mp);
3362 jfs_error(ip->i_sb, "dtReadFirst: btstack overrun");
3363 BT_STACK_DUMP(btstack);
3364 return -EIO;
3365 }
3366 /* push (bn, index) of the parent page/entry */
3367 BT_PUSH(btstack, bn, 0);
3368
3369 /* get the leftmost entry */
3370 stbl = DT_GETSTBL(p);
3371 xd = (pxd_t *) & p->slot[stbl[0]];
3372
3373 /* get the child page block address */
3374 bn = addressPXD(xd);
3375 psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
3376
3377 /* unpin the parent page */
3378 DT_PUTPAGE(mp);
3379 }
3380}
3381
3382
3383/*
3384 * dtReadNext()
3385 *
3386 * function: get the page of the specified offset (pn:index)
3387 *
3388 * return: if (offset > eof), bn = -1;
3389 *
3390 * note: if index > nextindex of the target leaf page,
3391 * start with 1st entry of next leaf page;
3392 */
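/*
 * rough control-flow map of the labels below (after pn is biased
 * by -1, so pn == 0 means the leftmost leaf):
 *	pn == 0: the leftmost leaf from dtReadFirst() is the target;
 *	pn == 1: follow the leftmost leaf's next pointer (label a);
 *	pn >  1: walk the last internal level to find the target leaf
 *		 (label b), then read it (label c).
 */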
3393static int dtReadNext(struct inode *ip, loff_t * offset,
3394 struct btstack * btstack)
3395{
3396 int rc = 0;
3397 struct dtoffset {
3398 s16 pn;
3399 s16 index;
3400 s32 unused;
3401 } *dtoffset = (struct dtoffset *) offset;
3402 s64 bn;
3403 struct metapage *mp;
3404 dtpage_t *p;
3405 int index;
3406 int pn;
3407 s8 *stbl;
3408 struct btframe *btsp, *parent;
3409 pxd_t *xd;
3410
3411 /*
3412 * get leftmost leaf page pinned
3413 */
3414 if ((rc = dtReadFirst(ip, btstack)))
3415 return rc;
3416
3417 /* get leaf page */
3418 DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
3419
3420 /* get the start offset (pn:index) */
3421 pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */
3422 index = dtoffset->index;
3423
3424 /* start at leftmost page ? */
3425 if (pn == 0) {
3426 /* offset beyond eof ? */
3427 if (index < p->header.nextindex)
3428 goto out;
3429
3430 if (p->header.flag & BT_ROOT) {
3431 bn = -1;
3432 goto out;
3433 }
3434
3435 /* start with 1st entry of next leaf page */
3436 dtoffset->pn++;
3437 dtoffset->index = index = 0;
3438 goto a;
3439 }
3440
3441 /* start at non-leftmost page: scan parent pages for large pn */
3442 if (p->header.flag & BT_ROOT) {
3443 bn = -1;
3444 goto out;
3445 }
3446
3447 /* start after next leaf page ? */
3448 if (pn > 1)
3449 goto b;
3450
3451 /* get leaf page pn = 1 */
3452 a:
3453 bn = le64_to_cpu(p->header.next);
3454
3455 /* unpin leaf page */
3456 DT_PUTPAGE(mp);
3457
3458 /* offset beyond eof ? */
3459 if (bn == 0) {
3460 bn = -1;
3461 goto out;
3462 }
3463
3464 goto c;
3465
3466 /*
3467 * scan last internal page level to get target leaf page
3468 */
3469 b:
3470 /* unpin leftmost leaf page */
3471 DT_PUTPAGE(mp);
3472
3473 /* get leftmost parent page */
3474 btsp = btstack->top;
3475 parent = btsp - 1;
3476 bn = parent->bn;
3477 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3478 if (rc)
3479 return rc;
3480
3481 /* scan parent pages at last internal page level */
3482 while (pn >= p->header.nextindex) {
3483 pn -= p->header.nextindex;
3484
3485 /* get next parent page address */
3486 bn = le64_to_cpu(p->header.next);
3487
3488 /* unpin current parent page */
3489 DT_PUTPAGE(mp);
3490
3491 /* offset beyond eof ? */
3492 if (bn == 0) {
3493 bn = -1;
3494 goto out;
3495 }
3496
3497 /* get next parent page */
3498 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3499 if (rc)
3500 return rc;
3501
3502 /* update parent page stack frame */
3503 parent->bn = bn;
3504 }
3505
3506 /* get leaf page address */
3507 stbl = DT_GETSTBL(p);
3508 xd = (pxd_t *) & p->slot[stbl[pn]];
3509 bn = addressPXD(xd);
3510
3511 /* unpin parent page */
3512 DT_PUTPAGE(mp);
3513
3514 /*
3515 * get target leaf page
3516 */
3517 c:
3518 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3519 if (rc)
3520 return rc;
3521
3522 /*
3523 * leaf page has been completed:
3524 * start with 1st entry of next leaf page
3525 */
3526 if (index >= p->header.nextindex) {
3527 bn = le64_to_cpu(p->header.next);
3528
3529 /* unpin leaf page */
3530 DT_PUTPAGE(mp);
3531
3532 /* offset beyond eof ? */
3533 if (bn == 0) {
3534 bn = -1;
3535 goto out;
3536 }
3537
3538 /* get next leaf page */
3539 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3540 if (rc)
3541 return rc;
3542
3543 /* start with 1st entry of next leaf page */
3544 dtoffset->pn++;
3545 dtoffset->index = 0;
3546 }
3547
3548 out:
3549 /* return target leaf page pinned */
3550 btsp = btstack->top;
3551 btsp->bn = bn;
3552 btsp->index = dtoffset->index;
3553 btsp->mp = mp;
3554
3555 return 0;
3556}
3557
3558
3559/*
3560 * dtCompare()
3561 *
3562 * function: compare search key with an internal entry
3563 *
3564 * return:
3565 * < 0 if k is < record
3566 * = 0 if k is = record
3567 * > 0 if k is > record
3568 */
3569static int dtCompare(struct component_name * key, /* search key */
3570 dtpage_t * p, /* directory page */
3571 int si)
3572{ /* entry slot index */
3573 wchar_t *kname;
3574 __le16 *name;
3575 int klen, namlen, len, rc;
3576 struct idtentry *ih;
3577 struct dtslot *t;
3578
3579 /*
3580 * force the left-most key on internal pages, at any level of
3581 * the tree, to be less than any search key.
3582 * this obviates having to update the leftmost key on an internal
3583 * page when the user inserts a new key in the tree smaller than
3584 * anything that has been stored.
3585 *
3586 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3587 * at any internal page at any level of the tree,
3588 * it descends to child of the entry anyway -
3589 * ? make the entry a minimum-size dummy entry)
3590 *
3591 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3592 * return (1);
3593 */
3594
3595 kname = key->name;
3596 klen = key->namlen;
3597
3598 ih = (struct idtentry *) & p->slot[si];
3599 si = ih->next;
3600 name = ih->name;
3601 namlen = ih->namlen;
3602 len = min(namlen, DTIHDRDATALEN);
3603
3604 /* compare with head/only segment */
3605 len = min(klen, len);
3606 if ((rc = UniStrncmp_le(kname, name, len)))
3607 return rc;
3608
3609 klen -= len;
3610 namlen -= len;
3611
3612 /* compare with additional segment(s) */
3613 kname += len;
3614 while (klen > 0 && namlen > 0) {
3615 /* compare with next name segment */
3616 t = (struct dtslot *) & p->slot[si];
3617 len = min(namlen, DTSLOTDATALEN);
3618 len = min(klen, len);
3619 name = t->name;
3620 if ((rc = UniStrncmp_le(kname, name, len)))
3621 return rc;
3622
3623 klen -= len;
3624 namlen -= len;
3625 kname += len;
3626 si = t->next;
3627 }
3628
3629 return (klen - namlen);
3630}
3631
3632
3633
3634
3635/*
3636 * ciCompare()
3637 *
3638 * function: compare search key with an (leaf/internal) entry
3639 *
3640 * return:
3641 * < 0 if k is < record
3642 * = 0 if k is = record
3643 * > 0 if k is > record
3644 */
3645static int ciCompare(struct component_name * key, /* search key */
3646 dtpage_t * p, /* directory page */
3647 int si, /* entry slot index */
3648 int flag)
3649{
3650 wchar_t *kname, x;
3651 __le16 *name;
3652 int klen, namlen, len, rc;
3653 struct ldtentry *lh;
3654 struct idtentry *ih;
3655 struct dtslot *t;
3656 int i;
3657
3658 /*
3659 * force the left-most key on internal pages, at any level of
3660 * the tree, to be less than any search key.
3661 * this obviates having to update the leftmost key on an internal
3662 * page when the user inserts a new key in the tree smaller than
3663 * anything that has been stored.
3664 *
3665 * (? if/when dtSearch() narrows down to 1st entry (index = 0),
3666 * at any internal page at any level of the tree,
3667 * it descends to child of the entry anyway -
3668 * ? make the entry a minimum-size dummy entry)
3669 *
3670 * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF))
3671 * return (1);
3672 */
3673
3674 kname = key->name;
3675 klen = key->namlen;
3676
3677 /*
3678 * leaf page entry
3679 */
3680 if (p->header.flag & BT_LEAF) {
3681 lh = (struct ldtentry *) & p->slot[si];
3682 si = lh->next;
3683 name = lh->name;
3684 namlen = lh->namlen;
3685 if (flag & JFS_DIR_INDEX)
3686 len = min(namlen, DTLHDRDATALEN);
3687 else
3688 len = min(namlen, DTLHDRDATALEN_LEGACY);
3689 }
3690 /*
3691 * internal page entry
3692 */
3693 else {
3694 ih = (struct idtentry *) & p->slot[si];
3695 si = ih->next;
3696 name = ih->name;
3697 namlen = ih->namlen;
3698 len = min(namlen, DTIHDRDATALEN);
3699 }
3700
3701 /* compare with head/only segment */
3702 len = min(klen, len);
3703 for (i = 0; i < len; i++, kname++, name++) {
3704 /* only uppercase if case-insensitive support is on */
3705 if ((flag & JFS_OS2) == JFS_OS2)
3706 x = UniToupper(le16_to_cpu(*name));
3707 else
3708 x = le16_to_cpu(*name);
3709 if ((rc = *kname - x))
3710 return rc;
3711 }
3712
3713 klen -= len;
3714 namlen -= len;
3715
3716 /* compare with additional segment(s) */
3717 while (klen > 0 && namlen > 0) {
3718 /* compare with next name segment */
3719 t = (struct dtslot *) & p->slot[si];
3720 len = min(namlen, DTSLOTDATALEN);
3721 len = min(klen, len);
3722 name = t->name;
3723 for (i = 0; i < len; i++, kname++, name++) {
3724 /* only uppercase if case-insensitive support is on */
3725 if ((flag & JFS_OS2) == JFS_OS2)
3726 x = UniToupper(le16_to_cpu(*name));
3727 else
3728 x = le16_to_cpu(*name);
3729
3730 if ((rc = *kname - x))
3731 return rc;
3732 }
3733
3734 klen -= len;
3735 namlen -= len;
3736 si = t->next;
3737 }
3738
3739 return (klen - namlen);
3740}
3741
3742
3743/*
3744 * ciGetLeafPrefixKey()
3745 *
3746 * function: compute prefix of suffix compression
3747 * from two adjacent leaf entries
3748 * across page boundary
3749 *
3750 * return: non-zero on error
3751 *
3752 */
3753static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
3754 int ri, struct component_name * key, int flag)
3755{
3756 int klen, namlen;
3757 wchar_t *pl, *pr, *kname;
3758 struct component_name lkey;
3759 struct component_name rkey;
3760
3761 lkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
3762 GFP_KERNEL);
3763 if (lkey.name == NULL)
3764 return -ENOMEM;
3765
3766 rkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
3767 GFP_KERNEL);
3768 if (rkey.name == NULL) {
3769 kfree(lkey.name);
3770 return -ENOMEM;
3771 }
3772
3773 /* get left and right key */
3774 dtGetKey(lp, li, &lkey, flag);
3775 lkey.name[lkey.namlen] = 0;
3776
3777 if ((flag & JFS_OS2) == JFS_OS2)
3778 ciToUpper(&lkey);
3779
3780 dtGetKey(rp, ri, &rkey, flag);
3781 rkey.name[rkey.namlen] = 0;
3782
3783
3784 if ((flag & JFS_OS2) == JFS_OS2)
3785 ciToUpper(&rkey);
3786
3787 /* compute prefix */
3788 klen = 0;
3789 kname = key->name;
3790 namlen = min(lkey.namlen, rkey.namlen);
3791 for (pl = lkey.name, pr = rkey.name;
3792 namlen; pl++, pr++, namlen--, klen++, kname++) {
3793 *kname = *pr;
3794 if (*pl != *pr) {
3795 key->namlen = klen + 1;
3796 goto free_names;
3797 }
3798 }
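/* e.g. (illustrative): lkey "abcd" vs rkey "abkz" copies "ab", hits
 * the mismatch at 'c' != 'k', and returns key "abk" (namlen 3):
 * a shortest string greater than the left key and not greater than
 * the right key.
 */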
3799
3800 /* l->namlen <= r->namlen since l <= r */
3801 if (lkey.namlen < rkey.namlen) {
3802 *kname = *pr;
3803 key->namlen = klen + 1;
3804 } else /* l->namelen == r->namelen */
3805 key->namlen = klen;
3806
3807free_names:
3808 kfree(lkey.name);
3809 kfree(rkey.name);
3810 return 0;
3811}
3812
3813
3814
3815/*
3816 * dtGetKey()
3817 *
3818 * function: get key of the entry
3819 */
3820static void dtGetKey(dtpage_t * p, int i, /* entry index */
3821 struct component_name * key, int flag)
3822{
3823 int si;
3824 s8 *stbl;
3825 struct ldtentry *lh;
3826 struct idtentry *ih;
3827 struct dtslot *t;
3828 int namlen, len;
3829 wchar_t *kname;
3830 __le16 *name;
3831
3832 /* get entry */
3833 stbl = DT_GETSTBL(p);
3834 si = stbl[i];
3835 if (p->header.flag & BT_LEAF) {
3836 lh = (struct ldtentry *) & p->slot[si];
3837 si = lh->next;
3838 namlen = lh->namlen;
3839 name = lh->name;
3840 if (flag & JFS_DIR_INDEX)
3841 len = min(namlen, DTLHDRDATALEN);
3842 else
3843 len = min(namlen, DTLHDRDATALEN_LEGACY);
3844 } else {
3845 ih = (struct idtentry *) & p->slot[si];
3846 si = ih->next;
3847 namlen = ih->namlen;
3848 name = ih->name;
3849 len = min(namlen, DTIHDRDATALEN);
3850 }
3851
3852 key->namlen = namlen;
3853 kname = key->name;
3854
3855 /*
3856 * move head/only segment
3857 */
3858 UniStrncpy_from_le(kname, name, len);
3859
3860 /*
3861 * move additional segment(s)
3862 */
3863 while (si >= 0) {
3864 /* get next segment */
3865 t = &p->slot[si];
3866 kname += len;
3867 namlen -= len;
3868 len = min(namlen, DTSLOTDATALEN);
3869 UniStrncpy_from_le(kname, t->name, len);
3870
3871 si = t->next;
3872 }
3873}
3874
3875
3876/*
3877 * dtInsertEntry()
3878 *
3879 * function: allocate free slot(s) and
3880 * write a leaf/internal entry
3881 *
3882 * return: entry slot index
3883 */
3884static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
3885 ddata_t * data, struct dt_lock ** dtlock)
3886{
3887 struct dtslot *h, *t;
3888 struct ldtentry *lh = NULL;
3889 struct idtentry *ih = NULL;
3890 int hsi, fsi, klen, len, nextindex;
3891 wchar_t *kname;
3892 __le16 *name;
3893 s8 *stbl;
3894 pxd_t *xd;
3895 struct dt_lock *dtlck = *dtlock;
3896 struct lv *lv;
3897 int xsi, n;
3898 s64 bn = 0;
3899 struct metapage *mp = NULL;
3900
3901 klen = key->namlen;
3902 kname = key->name;
3903
3904 /* allocate a free slot */
3905 hsi = fsi = p->header.freelist;
3906 h = &p->slot[fsi];
3907 p->header.freelist = h->next;
3908 --p->header.freecnt;
3909
3910 /* open new linelock */
3911 if (dtlck->index >= dtlck->maxcnt)
3912 dtlck = (struct dt_lock *) txLinelock(dtlck);
3913
3914 lv = & dtlck->lv[dtlck->index];
3915 lv->offset = hsi;
3916
3917 /* write head/only segment */
3918 if (p->header.flag & BT_LEAF) {
3919 lh = (struct ldtentry *) h;
3920 lh->next = h->next;
3921 lh->inumber = cpu_to_le32(data->leaf.ino);
3922 lh->namlen = klen;
3923 name = lh->name;
3924 if (data->leaf.ip) {
3925 len = min(klen, DTLHDRDATALEN);
3926 if (!(p->header.flag & BT_ROOT))
3927 bn = addressPXD(&p->header.self);
3928 lh->index = cpu_to_le32(add_index(data->leaf.tid,
3929 data->leaf.ip,
3930 bn, index));
3931 } else
3932 len = min(klen, DTLHDRDATALEN_LEGACY);
3933 } else {
3934 ih = (struct idtentry *) h;
3935 ih->next = h->next;
3936 xd = (pxd_t *) ih;
3937 *xd = data->xd;
3938 ih->namlen = klen;
3939 name = ih->name;
3940 len = min(klen, DTIHDRDATALEN);
3941 }
3942
3943 UniStrncpy_to_le(name, kname, len);
3944
3945 n = 1;
3946 xsi = hsi;
3947
3948 /* write additional segment(s) */
3949 t = h;
3950 klen -= len;
3951 while (klen) {
3952 /* get free slot */
3953 fsi = p->header.freelist;
3954 t = &p->slot[fsi];
3955 p->header.freelist = t->next;
3956 --p->header.freecnt;
3957
3958 /* is next slot contiguous ? */
3959 if (fsi != xsi + 1) {
3960 /* close current linelock */
3961 lv->length = n;
3962 dtlck->index++;
3963
3964 /* open new linelock */
3965 if (dtlck->index < dtlck->maxcnt)
3966 lv++;
3967 else {
3968 dtlck = (struct dt_lock *) txLinelock(dtlck);
3969 lv = & dtlck->lv[0];
3970 }
3971
3972 lv->offset = fsi;
3973 n = 0;
3974 }
3975
3976 kname += len;
3977 len = min(klen, DTSLOTDATALEN);
3978 UniStrncpy_to_le(t->name, kname, len);
3979
3980 n++;
3981 xsi = fsi;
3982 klen -= len;
3983 }
3984
3985 /* close current linelock */
3986 lv->length = n;
3987 dtlck->index++;
3988
3989 *dtlock = dtlck;
3990
3991 /* terminate last/only segment */
3992 if (h == t) {
3993 /* single segment entry */
3994 if (p->header.flag & BT_LEAF)
3995 lh->next = -1;
3996 else
3997 ih->next = -1;
3998 } else
3999 /* multi-segment entry */
4000 t->next = -1;
4001
4002 /* if insert into middle, shift right succeeding entries in stbl */
4003 stbl = DT_GETSTBL(p);
4004 nextindex = p->header.nextindex;
4005 if (index < nextindex) {
4006 memmove(stbl + index + 1, stbl + index, nextindex - index);
4007
4008 if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
4009 s64 lblock;
4010
4011 /*
4012 * Need to update slot number for entries that moved
4013 * in the stbl
4014 */
4015 mp = NULL;
4016 for (n = index + 1; n <= nextindex; n++) {
4017 lh = (struct ldtentry *) & (p->slot[stbl[n]]);
4018 modify_index(data->leaf.tid, data->leaf.ip,
4019 le32_to_cpu(lh->index), bn, n,
4020 &mp, &lblock);
4021 }
4022 if (mp)
4023 release_metapage(mp);
4024 }
4025 }
4026
4027 stbl[index] = hsi;
4028
4029 /* advance next available entry index of stbl */
4030 ++p->header.nextindex;
4031}
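/*
 * Sketch of the freelist discipline used by dtInsertEntry() and
 * dtDeleteEntry() (helper names are illustrative): free slots form a
 * singly linked list threaded through the slots themselves, headed by
 * header.freelist and terminated by -1; allocation pops the head and
 * freeing pushes a slot back.
 */
static inline int dtSlotAllocSketch(dtpage_t * p)
{
	int fsi = p->header.freelist;

	if (fsi >= 0) {
		p->header.freelist = p->slot[fsi].next;
		--p->header.freecnt;
	}
	return fsi;		/* -1 if the page is full */
}

static inline void dtSlotFreeSketch(dtpage_t * p, int si)
{
	p->slot[si].next = p->header.freelist;
	p->slot[si].cnt = 1;
	p->header.freelist = si;
	++p->header.freecnt;
}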
4032
4033
4034/*
4035 * dtMoveEntry()
4036 *
4037 * function: move entries from split/left page to new/right page
4038 *
4039 * nextindex of dst page and freelist/freecnt of both pages
4040 * are updated.
4041 */
4042static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
4043 struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
4044 int do_index)
4045{
4046 int ssi, next; /* src slot index */
4047 int di; /* dst entry index */
4048 int dsi; /* dst slot index */
4049 s8 *sstbl, *dstbl; /* sorted entry table */
4050 int snamlen, len;
4051 struct ldtentry *slh, *dlh = NULL;
4052 struct idtentry *sih, *dih = NULL;
4053 struct dtslot *h, *s, *d;
4054 struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
4055 struct lv *slv, *dlv;
4056 int xssi, ns, nd;
4057 int sfsi;
4058
4059 sstbl = (s8 *) & sp->slot[sp->header.stblindex];
4060 dstbl = (s8 *) & dp->slot[dp->header.stblindex];
4061
4062 dsi = dp->header.freelist; /* first (whole page) free slot */
4063 sfsi = sp->header.freelist;
4064
4065 /* linelock destination entry slot */
4066 dlv = & ddtlck->lv[ddtlck->index];
4067 dlv->offset = dsi;
4068
4069 /* linelock source entry slot */
4070 slv = & sdtlck->lv[sdtlck->index];
4071 slv->offset = sstbl[si];
4072 xssi = slv->offset - 1;
4073
4074 /*
4075 * move entries
4076 */
4077 ns = nd = 0;
4078 for (di = 0; si < sp->header.nextindex; si++, di++) {
4079 ssi = sstbl[si];
4080 dstbl[di] = dsi;
4081
4082 /* is next slot contiguous ? */
4083 if (ssi != xssi + 1) {
4084 /* close current linelock */
4085 slv->length = ns;
4086 sdtlck->index++;
4087
4088 /* open new linelock */
4089 if (sdtlck->index < sdtlck->maxcnt)
4090 slv++;
4091 else {
4092 sdtlck = (struct dt_lock *) txLinelock(sdtlck);
4093 slv = & sdtlck->lv[0];
4094 }
4095
4096 slv->offset = ssi;
4097 ns = 0;
4098 }
4099
4100 /*
4101 * move head/only segment of an entry
4102 */
4103 /* get dst slot */
4104 h = d = &dp->slot[dsi];
4105
4106 /* get src slot and move */
4107 s = &sp->slot[ssi];
4108 if (sp->header.flag & BT_LEAF) {
4109 /* get source entry */
4110 slh = (struct ldtentry *) s;
4111 dlh = (struct ldtentry *) h;
4112 snamlen = slh->namlen;
4113
4114 if (do_index) {
4115 len = min(snamlen, DTLHDRDATALEN);
4116 dlh->index = slh->index; /* little-endian */
4117 } else
4118 len = min(snamlen, DTLHDRDATALEN_LEGACY);
4119
4120 memcpy(dlh, slh, 6 + len * 2);
4121
4122 next = slh->next;
4123
4124 /* update dst head/only segment next field */
4125 dsi++;
4126 dlh->next = dsi;
4127 } else {
4128 sih = (struct idtentry *) s;
4129 snamlen = sih->namlen;
4130
4131 len = min(snamlen, DTIHDRDATALEN);
4132 dih = (struct idtentry *) h;
4133 memcpy(dih, sih, 10 + len * 2);
4134 next = sih->next;
4135
4136 dsi++;
4137 dih->next = dsi;
4138 }
4139
4140 /* free src head/only segment */
4141 s->next = sfsi;
4142 s->cnt = 1;
4143 sfsi = ssi;
4144
4145 ns++;
4146 nd++;
4147 xssi = ssi;
4148
4149 /*
4150 * move additional segment(s) of the entry
4151 */
4152 snamlen -= len;
4153 while ((ssi = next) >= 0) {
4154 /* is next slot contiguous ? */
4155 if (ssi != xssi + 1) {
4156 /* close current linelock */
4157 slv->length = ns;
4158 sdtlck->index++;
4159
4160 /* open new linelock */
4161 if (sdtlck->index < sdtlck->maxcnt)
4162 slv++;
4163 else {
4164 sdtlck =
4165 (struct dt_lock *)
4166 txLinelock(sdtlck);
4167 slv = & sdtlck->lv[0];
4168 }
4169
4170 slv->offset = ssi;
4171 ns = 0;
4172 }
4173
4174 /* get next source segment */
4175 s = &sp->slot[ssi];
4176
4177 /* get next destination free slot */
4178 d++;
4179
4180 len = min(snamlen, DTSLOTDATALEN);
4181 UniStrncpy_le(d->name, s->name, len);
4182
4183 ns++;
4184 nd++;
4185 xssi = ssi;
4186
4187 dsi++;
4188 d->next = dsi;
4189
4190 /* free source segment */
4191 next = s->next;
4192 s->next = sfsi;
4193 s->cnt = 1;
4194 sfsi = ssi;
4195
4196 snamlen -= len;
4197 } /* end while */
4198
4199 /* terminate dst last/only segment */
4200 if (h == d) {
4201 /* single segment entry */
4202 if (dp->header.flag & BT_LEAF)
4203 dlh->next = -1;
4204 else
4205 dih->next = -1;
4206 } else
4207 /* multi-segment entry */
4208 d->next = -1;
4209 } /* end for */
4210
4211 /* close current linelock */
4212 slv->length = ns;
4213 sdtlck->index++;
4214 *sdtlock = sdtlck;
4215
4216 dlv->length = nd;
4217 ddtlck->index++;
4218 *ddtlock = ddtlck;
4219
4220 /* update source header */
4221 sp->header.freelist = sfsi;
4222 sp->header.freecnt += nd;
4223
4224 /* update destination header */
4225 dp->header.nextindex = di;
4226
4227 dp->header.freelist = dsi;
4228 dp->header.freecnt -= nd;
4229}
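/*
 * Note on the memcpy() sizes above: the constants 6 and 10 are
 * DTLHDRSIZE and DTIHDRSIZE, the fixed head-segment bytes that
 * precede the name, and len * 2 covers len little-endian UTF-16 name
 * characters.  A sketch (helper name is illustrative):
 */
static inline int dtHeadCopySizeSketch(int len, int leaf)
{
	return (leaf ? DTLHDRSIZE : DTIHDRSIZE) + len * 2;
}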
4230
4231
4232/*
4233 * dtDeleteEntry()
4234 *
4235 * function: free a (leaf/internal) entry
4236 *
4237 * log freelist header, stbl, and each segment slot of entry
4238 * (even though last/only segment next field is modified,
4239 * physical image logging requires all segment slots of
4240 * the entry logged to avoid applying previous updates
4241 * to the same slots)
4242 */
4243static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock)
4244{
4245 int fsi; /* free entry slot index */
4246 s8 *stbl;
4247 struct dtslot *t;
4248 int si, freecnt;
4249 struct dt_lock *dtlck = *dtlock;
4250 struct lv *lv;
4251 int xsi, n;
4252
4253 /* get free entry slot index */
4254 stbl = DT_GETSTBL(p);
4255 fsi = stbl[fi];
4256
4257 /* open new linelock */
4258 if (dtlck->index >= dtlck->maxcnt)
4259 dtlck = (struct dt_lock *) txLinelock(dtlck);
4260 lv = & dtlck->lv[dtlck->index];
4261
4262 lv->offset = fsi;
4263
4264 /* get the head/only segment */
4265 t = &p->slot[fsi];
4266 if (p->header.flag & BT_LEAF)
4267 si = ((struct ldtentry *) t)->next;
4268 else
4269 si = ((struct idtentry *) t)->next;
4270 t->next = si;
4271 t->cnt = 1;
4272
4273 n = freecnt = 1;
4274 xsi = fsi;
4275
4276 /* find the last/only segment */
4277 while (si >= 0) {
4278 /* is next slot contiguous ? */
4279 if (si != xsi + 1) {
4280 /* close current linelock */
4281 lv->length = n;
4282 dtlck->index++;
4283
4284 /* open new linelock */
4285 if (dtlck->index < dtlck->maxcnt)
4286 lv++;
4287 else {
4288 dtlck = (struct dt_lock *) txLinelock(dtlck);
4289 lv = & dtlck->lv[0];
4290 }
4291
4292 lv->offset = si;
4293 n = 0;
4294 }
4295
4296 n++;
4297 xsi = si;
4298 freecnt++;
4299
4300 t = &p->slot[si];
4301 t->cnt = 1;
4302 si = t->next;
4303 }
4304
4305 /* close current linelock */
4306 lv->length = n;
4307 dtlck->index++;
4308
4309 *dtlock = dtlck;
4310
4311 /* update freelist */
4312 t->next = p->header.freelist;
4313 p->header.freelist = fsi;
4314 p->header.freecnt += freecnt;
4315
4316 /* if delete from middle,
4317 * shift left the succeeding entries in the stbl
4318 */
4319 si = p->header.nextindex;
4320 if (fi < si - 1)
4321 memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1);
4322
4323 p->header.nextindex--;
4324}
4325
4326
4327/*
4328 * dtTruncateEntry()
4329 *
4330 * function: truncate a (leaf/internal) entry
4331 *
4332 * log freelist header, stbl, and each segment slot of entry
4333 * (even though last/only segment next field is modified,
4334 * physical image logging requires all segment slots of
4335 * the entry logged to avoid applying previous updates
4336 * to the same slots)
4337 */
4338static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock)
4339{
4340 int tsi; /* truncate entry slot index */
4341 s8 *stbl;
4342 struct dtslot *t;
4343 int si, freecnt;
4344 struct dt_lock *dtlck = *dtlock;
4345 struct lv *lv;
4346 int fsi, xsi, n;
4347
4348 /* get free entry slot index */
4349 stbl = DT_GETSTBL(p);
4350 tsi = stbl[ti];
4351
4352 /* open new linelock */
4353 if (dtlck->index >= dtlck->maxcnt)
4354 dtlck = (struct dt_lock *) txLinelock(dtlck);
4355 lv = & dtlck->lv[dtlck->index];
4356
4357 lv->offset = tsi;
4358
4359 /* get the head/only segment */
4360 t = &p->slot[tsi];
4361 ASSERT(p->header.flag & BT_INTERNAL);
4362 ((struct idtentry *) t)->namlen = 0;
4363 si = ((struct idtentry *) t)->next;
4364 ((struct idtentry *) t)->next = -1;
4365
4366 n = 1;
4367 freecnt = 0;
4368 fsi = si;
4369 xsi = tsi;
4370
4371 /* find the last/only segment */
4372 while (si >= 0) {
4373 /* is next slot contiguous ? */
4374 if (si != xsi + 1) {
4375 /* close current linelock */
4376 lv->length = n;
4377 dtlck->index++;
4378
4379 /* open new linelock */
4380 if (dtlck->index < dtlck->maxcnt)
4381 lv++;
4382 else {
4383 dtlck = (struct dt_lock *) txLinelock(dtlck);
4384 lv = & dtlck->lv[0];
4385 }
4386
4387 lv->offset = si;
4388 n = 0;
4389 }
4390
4391 n++;
4392 xsi = si;
4393 freecnt++;
4394
4395 t = &p->slot[si];
4396 t->cnt = 1;
4397 si = t->next;
4398 }
4399
4400 /* close current linelock */
4401 lv->length = n;
4402 dtlck->index++;
4403
4404 *dtlock = dtlck;
4405
4406 /* update freelist */
4407 if (freecnt == 0)
4408 return;
4409 t->next = p->header.freelist;
4410 p->header.freelist = fsi;
4411 p->header.freecnt += freecnt;
4412}
4413
4414
4415/*
4416 * dtLinelockFreelist()
4417 */
4418static void dtLinelockFreelist(dtpage_t * p, /* directory page */
4419 int m, /* max slot index */
4420 struct dt_lock ** dtlock)
4421{
4422 int fsi; /* free entry slot index */
4423 struct dtslot *t;
4424 int si;
4425 struct dt_lock *dtlck = *dtlock;
4426 struct lv *lv;
4427 int xsi, n;
4428
4429 /* get free entry slot index */
4430 fsi = p->header.freelist;
4431
4432 /* open new linelock */
4433 if (dtlck->index >= dtlck->maxcnt)
4434 dtlck = (struct dt_lock *) txLinelock(dtlck);
4435 lv = & dtlck->lv[dtlck->index];
4436
4437 lv->offset = fsi;
4438
4439 n = 1;
4440 xsi = fsi;
4441
4442 t = &p->slot[fsi];
4443 si = t->next;
4444
4445 /* find the last/only segment */
4446 while (si < m && si >= 0) {
4447 /* is next slot contiguous ? */
4448 if (si != xsi + 1) {
4449 /* close current linelock */
4450 lv->length = n;
4451 dtlck->index++;
4452
4453 /* open new linelock */
4454 if (dtlck->index < dtlck->maxcnt)
4455 lv++;
4456 else {
4457 dtlck = (struct dt_lock *) txLinelock(dtlck);
4458 lv = & dtlck->lv[0];
4459 }
4460
4461 lv->offset = si;
4462 n = 0;
4463 }
4464
4465 n++;
4466 xsi = si;
4467
4468 t = &p->slot[si];
4469 si = t->next;
4470 }
4471
4472 /* close current linelock */
4473 lv->length = n;
4474 dtlck->index++;
4475
4476 *dtlock = dtlck;
4477}
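/*
 * Sketch of the linelock pattern shared by dtInsertEntry(),
 * dtMoveEntry(), dtDeleteEntry(), dtTruncateEntry() and
 * dtLinelockFreelist() (helper name is illustrative): contiguous
 * slots are coalesced into one (offset, length) lock vector; a gap in
 * the slot numbers closes the current vector and opens a new one,
 * spilling into a fresh linelock via txLinelock() when lv[] is full.
 */
static struct dt_lock *dtLockSlotSketch(struct dt_lock * dtlck,
					struct lv ** lvp, int si,
					int *xsi, int *n)
{
	struct lv *lv = *lvp;

	if (si != *xsi + 1) {	/* not contiguous: close current vector */
		lv->length = *n;
		dtlck->index++;

		/* open a new linelock vector */
		if (dtlck->index < dtlck->maxcnt)
			lv++;
		else {		/* lv[] exhausted: chain a new linelock */
			dtlck = (struct dt_lock *) txLinelock(dtlck);
			lv = & dtlck->lv[0];
		}
		lv->offset = si;
		*n = 0;
	}
	(*n)++;
	*xsi = si;
	*lvp = lv;
	return dtlck;
}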
4478
4479
4480/*
4481 * NAME: dtModify
4482 *
4483 * FUNCTION: Modify the inode number part of a directory entry
4484 *
4485 * PARAMETERS:
4486 * tid - Transaction id
4487 * ip - Inode of parent directory
4488 * key - Name of entry to be modified
4489 * orig_ino - Original inode number expected in entry
4490 * new_ino - New inode number to put into entry
4491 * flag - JFS_RENAME
4492 *
4493 * RETURNS:
4494 * -ESTALE - If entry found does not match orig_ino passed in
4495 * -ENOENT - If no entry can be found to match key
4496 * 0 - If successfully modified entry
4497 */
4498int dtModify(tid_t tid, struct inode *ip,
4499 struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag)
4500{
4501 int rc;
4502 s64 bn;
4503 struct metapage *mp;
4504 dtpage_t *p;
4505 int index;
4506 struct btstack btstack;
4507 struct tlock *tlck;
4508 struct dt_lock *dtlck;
4509 struct lv *lv;
4510 s8 *stbl;
4511 int entry_si; /* entry slot index */
4512 struct ldtentry *entry;
4513
4514 /*
4515 * search for the entry to modify:
4516 *
4517 * dtSearch() returns (leaf page pinned, index at which to modify).
4518 */
4519 if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag)))
4520 return rc;
4521
4522 /* retrieve search result */
4523 DT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
4524
4525 BT_MARK_DIRTY(mp, ip);
4526 /*
4527 * acquire a transaction lock on the leaf page of named entry
4528 */
4529 tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
4530 dtlck = (struct dt_lock *) & tlck->lock;
4531
4532 /* get slot index of the entry */
4533 stbl = DT_GETSTBL(p);
4534 entry_si = stbl[index];
4535
4536 /* linelock entry */
4537 ASSERT(dtlck->index == 0);
4538 lv = & dtlck->lv[0];
4539 lv->offset = entry_si;
4540 lv->length = 1;
4541 dtlck->index++;
4542
4543 /* get the head/only segment */
4544 entry = (struct ldtentry *) & p->slot[entry_si];
4545
4546 /* substitute the inode number of the entry */
4547 entry->inumber = cpu_to_le32(new_ino);
4548
4549 /* unpin the leaf page */
4550 DT_PUTPAGE(mp);
4551
4552 return 0;
4553}
4554
4555#ifdef _JFS_DEBUG_DTREE
4556/*
4557 * dtDisplayTree()
4558 *
4559 * function: traverse forward
4560 */
4561int dtDisplayTree(struct inode *ip)
4562{
4563 int rc;
4564 struct metapage *mp;
4565 dtpage_t *p;
4566 s64 bn, pbn;
4567 int index, lastindex, v, h;
4568 pxd_t *xd;
4569 struct btstack btstack;
4570 struct btframe *btsp;
4571 struct btframe *parent;
4572 u8 *stbl;
4573 int psize = 256;
4574
4575 printk("display B+-tree.\n");
4576
4577 /* clear stack */
4578 btsp = btstack.stack;
4579
4580 /*
4581 * start with root
4582 *
4583 * root resides in the inode
4584 */
4585 bn = 0;
4586 v = h = 0;
4587
4588 /*
4589 * first access of each page:
4590 */
4591 newPage:
4592 DT_GETPAGE(ip, bn, mp, psize, p, rc);
4593 if (rc)
4594 return rc;
4595
4596 /* process entries forward from first index */
4597 index = 0;
4598 lastindex = p->header.nextindex - 1;
4599
4600 if (p->header.flag & BT_INTERNAL) {
4601 /*
4602 * first access of each internal page
4603 */
4604 printk("internal page ");
4605 dtDisplayPage(ip, bn, p);
4606
4607 goto getChild;
4608 } else { /* (p->header.flag & BT_LEAF) */
4609
4610 /*
4611 * first access of each leaf page
4612 */
4613 printk("leaf page ");
4614 dtDisplayPage(ip, bn, p);
4615
4616 /*
4617 * process leaf page entries
4618 *
4619 for ( ; index <= lastindex; index++)
4620 {
4621 }
4622 */
4623
4624 /* unpin the leaf page */
4625 DT_PUTPAGE(mp);
4626 }
4627
4628 /*
4629 * go back up to the parent page
4630 */
4631 getParent:
4632 /* pop/restore parent entry for the current child page */
4633 if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL)
4634 /* current page must have been root */
4635 return 0;
4636
4637 /*
4638 * parent page scan completed
4639 */
4640 if ((index = parent->index) == (lastindex = parent->lastindex)) {
4641 /* go back up to the parent page */
4642 goto getParent;
4643 }
4644
4645 /*
4646 * parent page has entries remaining
4647 */
4648 /* get back the parent page */
4649 bn = parent->bn;
4650 /* v = parent->level; */
4651 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4652 if (rc)
4653 return rc;
4654
4655 /* get next parent entry */
4656 index++;
4657
4658 /*
4659 * internal page: go down to child page of current entry
4660 */
4661 getChild:
4662 /* push/save current parent entry for the child page */
4663 btsp->bn = pbn = bn;
4664 btsp->index = index;
4665 btsp->lastindex = lastindex;
4666 /* btsp->level = v; */
4667 /* btsp->node = h; */
4668 ++btsp;
4669
4670 /* get current entry for the child page */
4671 stbl = DT_GETSTBL(p);
4672 xd = (pxd_t *) & p->slot[stbl[index]];
4673
4674 /*
4675 * first access of each internal entry:
4676 */
4677
4678 /* get child page */
4679 bn = addressPXD(xd);
4680 psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize;
4681
4682 printk("traverse down 0x%Lx[%d]->0x%Lx\n", pbn, index, bn);
4683 v++;
4684 h = index;
4685
4686 /* release parent page */
4687 DT_PUTPAGE(mp);
4688
4689 /* process the child page */
4690 goto newPage;
4691}
4692
4693
4694/*
4695 * dtDisplayPage()
4696 *
4697 * function: display page
4698 */
4699int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p)
4700{
4701 int rc;
4702 struct metapage *mp;
4703 struct ldtentry *lh;
4704 struct idtentry *ih;
4705 pxd_t *xd;
4706 int i, j;
4707 u8 *stbl;
4708 wchar_t name[JFS_NAME_MAX + 1];
4709 struct component_name key = { 0, name };
4710 int freepage = 0;
4711
4712 if (p == NULL) {
4713 freepage = 1;
4714 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4715 if (rc)
4716 return rc;
4717 }
4718
4719 /* display page control */
4720 printk("bn:0x%Lx flag:0x%08x nextindex:%d\n",
4721 bn, p->header.flag, p->header.nextindex);
4722
4723 /* display entries */
4724 stbl = DT_GETSTBL(p);
4725 for (i = 0, j = 1; i < p->header.nextindex; i++, j++) {
4726 dtGetKey(p, i, &key, JFS_SBI(ip->i_sb)->mntflag);
4727 key.name[key.namlen] = '\0';
4728 if (p->header.flag & BT_LEAF) {
4729 lh = (struct ldtentry *) & p->slot[stbl[i]];
4730 printk("\t[%d] %s:%d", i, key.name,
4731 le32_to_cpu(lh->inumber));
4732 } else {
4733 ih = (struct idtentry *) & p->slot[stbl[i]];
4734 xd = (pxd_t *) ih;
4735 bn = addressPXD(xd);
4736 printk("\t[%d] %s:0x%Lx", i, key.name, bn);
4737 }
4738
4739 if (j == 4) {
4740 printk("\n");
4741 j = 0;
4742 }
4743 }
4744
4745 printk("\n");
4746
4747 if (freepage)
4748 DT_PUTPAGE(mp);
4749
4750 return 0;
4751}
4752#endif /* _JFS_DEBUG_DTREE */
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
new file mode 100644
index 000000000000..273a80130c9d
--- /dev/null
+++ b/fs/jfs/jfs_dtree.h
@@ -0,0 +1,279 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_DTREE
19#define _H_JFS_DTREE
20
21/*
22 * jfs_dtree.h: directory B+-tree manager
23 */
24
25#include "jfs_btree.h"
26
27typedef union {
28 struct {
29 tid_t tid;
30 struct inode *ip;
31 u32 ino;
32 } leaf;
33 pxd_t xd;
34} ddata_t;
35
36
37/*
38 * entry segment/slot
39 *
40 * an entry consists of type dependent head/only segment/slot and
41 * additional segments/slots linked via next field;
42 * N.B. last/only segment of entry is terminated by next = -1;
43 */
44/*
45 * directory page slot
46 */
47struct dtslot {
48 s8 next; /* 1: */
49 s8 cnt; /* 1: */
50 __le16 name[15]; /* 30: */
51}; /* (32) */
52
53
54#define DATASLOTSIZE 16
55#define L2DATASLOTSIZE 4
56#define DTSLOTSIZE 32
57#define L2DTSLOTSIZE 5
58#define DTSLOTHDRSIZE 2
59#define DTSLOTDATASIZE 30
60#define DTSLOTDATALEN 15
61
62/*
63 * internal node entry head/only segment
64 */
65struct idtentry {
66 pxd_t xd; /* 8: child extent descriptor */
67
68 s8 next; /* 1: */
69 u8 namlen; /* 1: */
70 __le16 name[11]; /* 22: 2-byte aligned */
71}; /* (32) */
72
73#define DTIHDRSIZE 10
74#define DTIHDRDATALEN 11
75
76/* compute number of slots for entry */
77#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 )
78
79
80/*
81 * leaf node entry head/only segment
82 *
83 * For legacy filesystems, name contains 13 wchars -- no index field
84 */
85struct ldtentry {
86 __le32 inumber; /* 4: 4-byte aligned */
87 s8 next; /* 1: */
88 u8 namlen; /* 1: */
89 __le16 name[11]; /* 22: 2-byte aligned */
90 __le32 index; /* 4: index into dir_table */
91}; /* (32) */
92
93#define DTLHDRSIZE 6
94#define DTLHDRDATALEN_LEGACY 13 /* Old (OS/2) format */
95#define DTLHDRDATALEN 11
96
97/*
98 * dir_table used for directory traversal during readdir
99 */
100
101/*
102 * Keep persistent index for directory entries
103 */
104#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX)
105
106/*
107 * Maximum entry in inline directory table
108 */
109#define MAX_INLINE_DIRTABLE_ENTRY 13
110
111struct dir_table_slot {
112 u8 rsrvd; /* 1: */
113 u8 flag; /* 1: 0 if free */
114 u8 slot; /* 1: slot within leaf page of entry */
115 u8 addr1; /* 1: upper 8 bits of leaf page address */
116 __le32 addr2; /* 4: lower 32 bits of leaf page address -OR-
117 index of next entry when this entry was deleted */
118}; /* (8) */
119
120/*
121 * flag values
122 */
123#define DIR_INDEX_VALID 1
124#define DIR_INDEX_FREE 0
125
126#define DTSaddress(dir_table_slot, address64)\
127{\
128 (dir_table_slot)->addr1 = ((u64)address64) >> 32;\
129 (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
130}
131
132#define addressDTS(dts)\
133 ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) )
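/*
 * Sketch of the 8 + 32 bit split above (helper name is illustrative):
 * a leaf page address is stored as its upper 8 bits in addr1 and its
 * lower 32 bits, little-endian on disk, in addr2, so any 40-bit block
 * address survives a pack/unpack round trip.
 */
static inline int DTSroundtripSketch(struct dir_table_slot *dts, s64 address)
{
	DTSaddress(dts, address);
	return addressDTS(dts) == address;
}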
134
135/* compute number of slots for entry */
136#define NDTLEAF_LEGACY(klen) ( ((2 + (klen)) + (15 - 1)) / 15 )
137#define NDTLEAF NDTINTERNAL
138
139
140/*
141 * directory root page (in-line in on-disk inode):
142 *
143 * cf. dtpage_t below.
144 */
145typedef union {
146 struct {
147 struct dasd DASD; /* 16: DASD limit/usage info */
148
149 u8 flag; /* 1: */
150 u8 nextindex; /* 1: next free entry in stbl */
151 s8 freecnt; /* 1: free count */
152 s8 freelist; /* 1: freelist header */
153
154 __le32 idotdot; /* 4: parent inode number */
155
156 s8 stbl[8]; /* 8: sorted entry index table */
157 } header; /* (32) */
158
159 struct dtslot slot[9];
160} dtroot_t;
161
162#define PARENT(IP) \
163 (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot))
164
165#define DTROOTMAXSLOT 9
166
167#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0)
168
169
170/*
171 * directory regular page:
172 *
173 * entry slot array of 32 byte slot
174 *
175 * sorted entry slot index table (stbl):
176 * contiguous slots at slot specified by stblindex,
177 * 1-byte per entry
178 * 512 byte block: 16 entry tbl (1 slot)
179 * 1024 byte block: 32 entry tbl (1 slot)
180 * 2048 byte block: 64 entry tbl (2 slot)
181 * 4096 byte block: 128 entry tbl (4 slot)
182 *
183 * data area:
184 * 512 byte block: 16 - 2 = 14 slot
185 * 1024 byte block: 32 - 2 = 30 slot
186 * 2048 byte block: 64 - 3 = 61 slot
187 * 4096 byte block: 128 - 5 = 123 slot
188 *
189 * N.B. index is 0-based; index fields refer to slot index
190 * except nextindex which refers to entry index in stbl;
191 * end of entry slot list or freelist is marked with -1.
192 */
193typedef union {
194 struct {
195 __le64 next; /* 8: next sibling */
196 __le64 prev; /* 8: previous sibling */
197
198 u8 flag; /* 1: */
199 u8 nextindex; /* 1: next entry index in stbl */
200 s8 freecnt; /* 1: */
201 s8 freelist; /* 1: slot index of head of freelist */
202
203 u8 maxslot; /* 1: number of slots in page slot[] */
204 u8 stblindex; /* 1: slot index of start of stbl */
205 u8 rsrvd[2]; /* 2: */
206
207 pxd_t self; /* 8: self pxd */
208 } header; /* (32) */
209
210 struct dtslot slot[128];
211} dtpage_t;
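/*
 * Sketch of the stbl sizing tabulated above (helper name is
 * illustrative): the sorted entry table needs one byte per slot, so
 * it occupies ceil(maxslot / DTSLOTSIZE) slots -- 128 / 32 = 4 slots
 * on a 4096-byte page, which with the 1-slot header leaves
 * 128 - 4 - 1 = 123 data slots.
 */
static inline int dtStblSlotsSketch(int maxslot)
{
	return (maxslot + DTSLOTSIZE - 1) / DTSLOTSIZE;
}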
212
213#define DTPAGEMAXSLOT 128
214
215#define DT8THPGNODEBYTES 512
216#define DT8THPGNODETSLOTS 1
217#define DT8THPGNODESLOTS 16
218
219#define DTQTRPGNODEBYTES 1024
220#define DTQTRPGNODETSLOTS 1
221#define DTQTRPGNODESLOTS 32
222
223#define DTHALFPGNODEBYTES 2048
224#define DTHALFPGNODETSLOTS 2
225#define DTHALFPGNODESLOTS 64
226
227#define DTFULLPGNODEBYTES 4096
228#define DTFULLPGNODETSLOTS 4
229#define DTFULLPGNODESLOTS 128
230
231#define DTENTRYSTART 1
232
233/* get sorted entry table of the page */
234#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\
235 ((dtroot_t *)(p))->header.stbl : \
236 (s8 *)&(p)->slot[(p)->header.stblindex] )
237
238/*
239 * Flags for dtSearch
240 */
241#define JFS_CREATE 1
242#define JFS_LOOKUP 2
243#define JFS_REMOVE 3
244#define JFS_RENAME 4
245
246#define DIRENTSIZ(namlen) \
247 ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
248
249/*
250 * Maximum file offset for directories.
251 */
252#define DIREND INT_MAX
253
254/*
255 * external declarations
256 */
257extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot);
258
259extern int dtSearch(struct inode *ip, struct component_name * key,
260 ino_t * data, struct btstack * btstack, int flag);
261
262extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key,
263 ino_t * ino, struct btstack * btstack);
264
265extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
266 ino_t * data, int flag);
267
268extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
269 ino_t * orig_ino, ino_t new_ino, int flag);
270
271extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
272
273#ifdef _JFS_DEBUG_DTREE
274extern int dtDisplayTree(struct inode *ip);
275
276extern int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p);
277#endif /* _JFS_DEBUG_DTREE */
278
279#endif /* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
new file mode 100644
index 000000000000..1953acb79266
--- /dev/null
+++ b/fs/jfs/jfs_extent.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/fs.h>
20#include <linux/quotaops.h>
21#include "jfs_incore.h"
22#include "jfs_superblock.h"
23#include "jfs_dmap.h"
24#include "jfs_extent.h"
25#include "jfs_debug.h"
26
27/*
28 * forward references
29 */
30static int extBalloc(struct inode *, s64, s64 *, s64 *);
31#ifdef _NOTYET
32static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *);
33#endif
34static s64 extRoundDown(s64 nb);
35
36/*
37 * external references
38 */
39extern int jfs_commit_inode(struct inode *, int);
40
41
42#define DPD(a) (printk("(a): %d\n",(a)))
43#define DPC(a) (printk("(a): %c\n",(a)))
44#define DPL1(a) \
45{ \
46 if ((a) >> 32) \
47 printk("(a): %x%08x ",(a)); \
48 else \
49 printk("(a): %x ",(a) << 32); \
50}
51#define DPL(a) \
52{ \
53 if ((a) >> 32) \
54 printk("(a): %x%08x\n",(a)); \
55 else \
56 printk("(a): %x\n",(a) << 32); \
57}
58
59#define DPD1(a) (printk("(a): %d ",(a)))
60#define DPX(a) (printk("(a): %08x\n",(a)))
61#define DPX1(a) (printk("(a): %08x ",(a)))
62#define DPS(a) (printk("%s\n",(a)))
63#define DPE(a) (printk("\nENTERING: %s\n",(a)))
64#define DPE1(a) (printk("\nENTERING: %s",(a)))
65#define DPS1(a) (printk(" %s ",(a)))
66
67
68/*
69 * NAME: extAlloc()
70 *
71 * FUNCTION: allocate an extent for a specified page range within a
72 * file.
73 *
74 * PARAMETERS:
75 * ip - the inode of the file.
76 * xlen - requested extent length.
77 * pno - the starting page number within the file.
78 * xp - pointer to an xad. on entry, xad describes an
79 * extent that is used as an allocation hint if the
80 * xaddr of the xad is non-zero. on successful exit,
81 * the xad describes the newly allocated extent.
82 * abnr - boolean_t indicating whether the newly allocated extent
83 * should be marked as allocated but not recorded.
84 *
85 * RETURN VALUES:
86 * 0 - success
87 * -EIO - i/o error.
88 * -ENOSPC - insufficient disk resources.
89 */
90int
91extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
92{
93 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
94 s64 nxlen, nxaddr, xoff, hint, xaddr = 0;
95 int rc;
96 int xflag;
97
98 /* This blocks if we are low on resources */
99 txBeginAnon(ip->i_sb);
100
101 /* Avoid race with jfs_commit_inode() */
102 down(&JFS_IP(ip)->commit_sem);
103
104 /* validate extent length */
105 if (xlen > MAXXLEN)
106 xlen = MAXXLEN;
107
108 /* get the page's starting extent offset */
109 xoff = pno << sbi->l2nbperpage;
110
111 /* check if an allocation hint was provided */
112 if ((hint = addressXAD(xp))) {
113 /* get the size of the extent described by the hint */
114 nxlen = lengthXAD(xp);
115
116 /* check if the hint is for the portion of the file
117 * immediately previous to the current allocation
118 * request and if the hint extent has the same abnr
119 * value as the current request. if so, we can
120 * extend the hint extent to include the current
121 * extent if we can allocate the blocks immediately
122 * following the hint extent.
123 */
124 if (offsetXAD(xp) + nxlen == xoff &&
125 abnr == ((xp->flag & XAD_NOTRECORDED) ? TRUE : FALSE))
126 xaddr = hint + nxlen;
127
128 /* adjust the hint to the last block of the extent */
129 hint += (nxlen - 1);
130 }
131
132 /* allocate the disk blocks for the extent. initially, extBalloc()
133 * will try to allocate disk blocks for the requested size (xlen).
134 * if this fails (xlen contiguous free blocks not available), it'll
135 * try to allocate a smaller number of blocks (producing a smaller
136 * extent), with this smaller number of blocks consisting of the
137 * requested number of blocks rounded down to the next smaller
138 * power of 2 number (i.e. 16 -> 8). it'll continue to round down
139 * and retry the allocation until the number of blocks to allocate
140 * is smaller than the number of blocks per page.
141 */
142 nxlen = xlen;
143 if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) {
144 up(&JFS_IP(ip)->commit_sem);
145 return (rc);
146 }
147
148 /* Allocate blocks to quota. */
149 if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
150 dbFree(ip, nxaddr, (s64) nxlen);
151 up(&JFS_IP(ip)->commit_sem);
152 return -EDQUOT;
153 }
154
155 /* determine the value of the extent flag */
156 xflag = (abnr == TRUE) ? XAD_NOTRECORDED : 0;
157
158 /* if we can extend the hint extent to cover the current request,
159 * extend it. otherwise, insert a new extent to
160 * cover the current request.
161 */
162 if (xaddr && xaddr == nxaddr)
163 rc = xtExtend(0, ip, xoff, (int) nxlen, 0);
164 else
165 rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0);
166
167 /* if the extend or insert failed,
168 * free the newly allocated blocks and return the error.
169 */
170 if (rc) {
171 dbFree(ip, nxaddr, nxlen);
172 DQUOT_FREE_BLOCK(ip, nxlen);
173 up(&JFS_IP(ip)->commit_sem);
174 return (rc);
175 }
176
177 /* set the results of the extent allocation */
178 XADaddress(xp, nxaddr);
179 XADlength(xp, nxlen);
180 XADoffset(xp, xoff);
181 xp->flag = xflag;
182
183 mark_inode_dirty(ip);
184
185 up(&JFS_IP(ip)->commit_sem);
186 /*
187 * COMMIT_SyncList flags an anonymous tlock on page that is on
188 * sync list.
189 * We need to commit the inode to get the page written to disk.
190 */
191 if (test_and_clear_cflag(COMMIT_Synclist,ip))
192 jfs_commit_inode(ip, 0);
193
194 return (0);
195}
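/*
 * Sketch of the hint-extension test in extAlloc() above (helper name
 * is illustrative): the hint extent can be extended in place only
 * when it ends exactly at the requested offset and has the same
 * allocated-but-not-recorded state as the new request; the candidate
 * address is then the block immediately following the hint extent.
 */
static inline s64 extHintTargetSketch(xad_t * xp, s64 xoff, boolean_t abnr)
{
	s64 nxlen = lengthXAD(xp);
	boolean_t hint_abnr = (xp->flag & XAD_NOTRECORDED) ? TRUE : FALSE;

	if (offsetXAD(xp) + nxlen == xoff && abnr == hint_abnr)
		return addressXAD(xp) + nxlen;

	return 0;		/* no in-place extension possible */
}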
196
197
198#ifdef _NOTYET
199/*
200 * NAME: extRealloc()
201 *
202 * FUNCTION: extend the allocation of a file extent containing a
203 * partial back last page.
204 *
205 * PARAMETERS:
206 * ip - the inode of the file.
207 * cp - cbuf for the partially backed last page.
208 * xlen - request size of the resulting extent.
209 * xp - pointer to an xad. on successful exit, the xad
210 * describes the newly allocated extent.
211 * abnr - boolean_t indicating whether the newly allocated extent
212 * should be marked as allocated but not recorded.
213 *
214 * RETURN VALUES:
215 * 0 - success
216 * -EIO - i/o error.
217 * -ENOSPC - insufficient disk resources.
218 */
219int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
220{
221 struct super_block *sb = ip->i_sb;
222 s64 xaddr, xlen, nxaddr, delta, xoff;
223 s64 ntail, nextend, ninsert;
224 int rc, nbperpage = JFS_SBI(sb)->nbperpage;
225 int xflag;
226
227 /* This blocks if we are low on resources */
228 txBeginAnon(ip->i_sb);
229
230 down(&JFS_IP(ip)->commit_sem);
231 /* validate extent length */
232 if (nxlen > MAXXLEN)
233 nxlen = MAXXLEN;
234
235 /* get the extend (partial) page's disk block address and
236 * number of blocks.
237 */
238 xaddr = addressXAD(xp);
239 xlen = lengthXAD(xp);
240 xoff = offsetXAD(xp);
241
242 /* if the extend page is abnr and if the request is for
243 * the extent to be allocated and recorded,
244 * make the page allocated and recorded.
245 */
246 if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
247 xp->flag = 0;
248 if ((rc = xtUpdate(0, ip, xp)))
249 goto exit;
250 }
251
252 * try to allocate the requested number of blocks for the
253 * extent. dbRealloc() first tries to satisfy the request
254 * by extending the allocation in place. otherwise, it will
255 * try to allocate a new set of blocks large enough for the
256 * request. in satisfying a request, dbReAlloc() may allocate
257 * less than what was requested but will always allocate enough
258 * space as to satisfy the extend page.
259 */
260 if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
261 goto exit;
262
263 /* Allocate blocks to quota. */
264 if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
265 dbFree(ip, nxaddr, (s64) nxlen);
266 up(&JFS_IP(ip)->commit_sem);
267 return -EDQUOT;
268 }
269
270 delta = nxlen - xlen;
271
272 /* check if the extend page is not abnr but the request is abnr
273 * and the allocated disk space is for more than one page. if this
274 * is the case, there is a mismatch of abnr between the extend page
275 * and the one or more pages following the extend page. as a result,
276 * two extents will have to be manipulated. the first will be that
277 * of the extent of the extend page and will be manipulated thru
278 * an xtExtend() or an xtTailgate(), depending upon whether the
279 * disk allocation occurred as an inplace extension. the second
280 * extent will be manipulated (created) through an xtInsert() and
281 * will be for the pages following the extend page.
282 */
283 if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
284 ntail = nbperpage;
285 nextend = ntail - xlen;
286 ninsert = nxlen - nbperpage;
287
288 xflag = XAD_NOTRECORDED;
289 } else {
290 ntail = nxlen;
291 nextend = delta;
292 ninsert = 0;
293
294 xflag = xp->flag;
295 }
296
297 /* if we were able to extend the disk allocation in place,
298 * extend the extent. otherwise, move the extent to a
299 * new disk location.
300 */
301 if (xaddr == nxaddr) {
302 /* extend the extent */
303 if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
304 dbFree(ip, xaddr + xlen, delta);
305 DQUOT_FREE_BLOCK(ip, nxlen);
306 goto exit;
307 }
308 } else {
309 /*
310 * move the extent to a new location:
311 *
312 * xtTailgate() accounts for relocated tail extent;
313 */
314 if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
315 dbFree(ip, nxaddr, nxlen);
316 DQUOT_FREE_BLOCK(ip, nxlen);
317 goto exit;
318 }
319 }
320
321
322 /* check if we need to also insert a new extent */
323 if (ninsert) {
324 /* perform the insert. if it fails, free the blocks
325 * to be inserted and make it appear that we only did
326 * the xtExtend() or xtTailgate() above.
327 */
328 xaddr = nxaddr + ntail;
329 if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert,
330 &xaddr, 0)) {
331 dbFree(ip, xaddr, (s64) ninsert);
332 delta = nextend;
333 nxlen = ntail;
334 xflag = 0;
335 }
336 }
337
338 /* set the return results */
339 XADaddress(xp, nxaddr);
340 XADlength(xp, nxlen);
341 XADoffset(xp, xoff);
342 xp->flag = xflag;
343
344 mark_inode_dirty(ip);
345exit:
346 up(&JFS_IP(ip)->commit_sem);
347 return (rc);
348}
349#endif /* _NOTYET */
350
351
352/*
353 * NAME: extHint()
354 *
355 * FUNCTION: produce an extent allocation hint for a file offset.
356 *
357 * PARAMETERS:
358 * ip - the inode of the file.
359 * offset - file offset for which the hint is needed.
360 * xp - pointer to the xad that is to be filled in with
361 * the hint.
362 *
363 * RETURN VALUES:
364 * 0 - success
365 * -EIO - i/o error.
366 */
367int extHint(struct inode *ip, s64 offset, xad_t * xp)
368{
369 struct super_block *sb = ip->i_sb;
370 struct xadlist xadl;
371 struct lxdlist lxdl;
372 lxd_t lxd;
373 s64 prev;
374 int rc, nbperpage = JFS_SBI(sb)->nbperpage;
375
376 /* init the hint as "no hint provided" */
377 XADaddress(xp, 0);
378
379 /* determine the starting extent offset of the page previous
380 * to the page containing the offset.
381 */
382 prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage;
383
384 /* if the offset is in the first page of the file,
385 * no hint is provided.
386 */
387 if (prev < 0)
388 return (0);
389
390 /* prepare to lookup the previous page's extent info */
391 lxdl.maxnlxd = 1;
392 lxdl.nlxd = 1;
393 lxdl.lxd = &lxd;
394 LXDoffset(&lxd, prev);
395 LXDlength(&lxd, nbperpage);
396
397 xadl.maxnxad = 1;
398 xadl.nxad = 0;
399 xadl.xad = xp;
400
401 /* perform the lookup */
402 if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
403 return (rc);
404
405 /* check if no extent exists for the previous page.
406 * this is possible for sparse files.
407 */
408 if (xadl.nxad == 0) {
409// assert(ISSPARSE(ip));
410 return (0);
411 }
412
413 /* only preserve the abnr flag within the xad flags
414 * of the returned hint.
415 */
416 xp->flag &= XAD_NOTRECORDED;
417
418 if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) {
419 jfs_error(ip->i_sb, "extHint: corrupt xtree");
420 return -EIO;
421 }
422
423 return (0);
424}
425
426
427/*
428 * NAME: extRecord()
429 *
430 * FUNCTION: change a page within a file from not recorded to recorded.
431 *
432 * PARAMETERS:
433 * ip - inode of the file.
434 * cp - cbuf of the file page.
435 *
436 * RETURN VALUES:
437 * 0 - success
438 * -EIO - i/o error.
439 * -ENOSPC - insufficient disk resources.
440 */
441int extRecord(struct inode *ip, xad_t * xp)
442{
443 int rc;
444
445 txBeginAnon(ip->i_sb);
446
447 down(&JFS_IP(ip)->commit_sem);
448
449 /* update the extent */
450 rc = xtUpdate(0, ip, xp);
451
452 up(&JFS_IP(ip)->commit_sem);
453 return rc;
454}
455
456
457#ifdef _NOTYET
458/*
459 * NAME: extFill()
460 *
461 * FUNCTION: allocate disk space for a file page that represents
462 * a file hole.
463 *
464 * PARAMETERS:
465 * ip - the inode of the file.
466 * cp - cbuf of the file page represent the hole.
467 *
468 * RETURN VALUES:
469 * 0 - success
470 * -EIO - i/o error.
471 * -ENOSPC - insufficient disk resources.
472 */
473int extFill(struct inode *ip, xad_t * xp)
474{
475 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
476 s64 blkno = offsetXAD(xp) >> ip->i_blksize;
477
478// assert(ISSPARSE(ip));
479
480 /* initialize the extent allocation hint */
481 XADaddress(xp, 0);
482
483 /* allocate an extent to fill the hole */
484 if ((rc = extAlloc(ip, nbperpage, blkno, xp, FALSE)))
485 return (rc);
486
487 assert(lengthPXD(xp) == nbperpage);
488
489 return (0);
490}
491#endif /* _NOTYET */
492
493
494/*
495 * NAME: extBalloc()
496 *
497 * FUNCTION: allocate disk blocks to form an extent.
498 *
499 * initially, we will try to allocate disk blocks for the
500 * requested size (nblocks). if this fails (nblocks
501 * contiguous free blocks not available), we'll try to allocate
502 * a smaller number of blocks (producing a smaller extent), with
503 * this smaller number of blocks consisting of the requested
504 * number of blocks rounded down to the next smaller power of 2
505 * number (i.e. 16 -> 8). we'll continue to round down and
506 * retry the allocation until the number of blocks to allocate
507 * is smaller than the number of blocks per page.
508 *
509 * PARAMETERS:
510 * ip - the inode of the file.
511 * hint - disk block number to be used as an allocation hint.
512 * *nblocks - pointer to an s64 value. on entry, this value specifies
513 * the desired number of blocks to be allocated. on successful
514 * exit, this value is set to the number of blocks actually
515 * allocated.
516 * blkno - pointer to a block address that is filled in on successful
517 * return with the starting block number of the newly
518 * allocated block range.
519 *
520 * RETURN VALUES:
521 * 0 - success
522 * -EIO - i/o error.
523 * -ENOSPC - insufficient disk resources.
524 */
525static int
526extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
527{
528 struct jfs_inode_info *ji = JFS_IP(ip);
529 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
530 s64 nb, nblks, daddr, max;
531 int rc, nbperpage = sbi->nbperpage;
532 struct bmap *bmp = sbi->bmap;
533 int ag;
534
535 /* get the number of blocks to initially attempt to allocate.
536 * we'll first try the number of blocks requested unless this
537 * number is greater than the maximum number of contiguous free
538 * blocks in the map. in that case, we'll start off with the
539 * maximum free.
540 */
541 max = (s64) 1 << bmp->db_maxfreebud;
542 if (*nblocks >= max && *nblocks > nbperpage)
543 nb = nblks = (max > nbperpage) ? max : nbperpage;
544 else
545 nb = nblks = *nblocks;
546
547 /* try to allocate blocks */
548 while ((rc = dbAlloc(ip, hint, nb, &daddr)) != 0) {
549 /* if something other than an out of space error,
550 * stop and return this error.
551 */
552 if (rc != -ENOSPC)
553 return (rc);
554
555 /* decrease the allocation request size */
556 nb = min(nblks, extRoundDown(nb));
557
558 /* give up if we cannot cover a page */
559 if (nb < nbperpage)
560 return (rc);
561 }
562
563 *nblocks = nb;
564 *blkno = daddr;
565
566 if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) {
567 ag = BLKTOAG(daddr, sbi);
568 spin_lock_irq(&ji->ag_lock);
569 if (ji->active_ag == -1) {
570 atomic_inc(&bmp->db_active[ag]);
571 ji->active_ag = ag;
572 } else if (ji->active_ag != ag) {
573 atomic_dec(&bmp->db_active[ji->active_ag]);
574 atomic_inc(&bmp->db_active[ag]);
575 ji->active_ag = ag;
576 }
577 spin_unlock_irq(&ji->ag_lock);
578 }
579
580 return (0);
581}
582
583
584#ifdef _NOTYET
585/*
586 * NAME: extBrealloc()
587 *
588 * FUNCTION: attempt to extend an extent's allocation.
589 *
590 * initially, we will try to extend the extent's allocation
591 * in place. if this fails, we'll try to move the extent
592 * to a new set of blocks. if moving the extent, we initially
593 * will try to allocate disk blocks for the requested size
594 * (nnew). if this fails (nnew contiguous free blocks not
595 * available), we'll try to allocate a smaller number of
596 * blocks (producing a smaller extent), with this smaller
597 * number of blocks consisting of the requested number of
598 * blocks rounded down to the next smaller power of 2
599 * number (i.e. 16 -> 8). we'll continue to round down and
600 * retry the allocation until the number of blocks to allocate
601 * is smaller than the number of blocks per page.
602 *
603 * PARAMETERS:
604 * ip - the inode of the file.
605 * blkno - starting block number of the extents current allocation.
606 * nblks - number of blocks within the extents current allocation.
607 * newnblks - pointer to a s64 value. on entry, this value is the
608 * new desired extent size (number of blocks). on
609 * successful exit, this value is set to the extent's actual
610 * new size (new number of blocks).
611 * newblkno - the starting block number of the extents new allocation.
612 *
613 * RETURN VALUES:
614 * 0 - success
615 * -EIO - i/o error.
616 * -ENOSPC - insufficient disk resources.
617 */
618static int
619extBrealloc(struct inode *ip,
620 s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno)
621{
622 int rc;
623
624 /* try to extend in place */
625 if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) {
626 *newblkno = blkno;
627 return (0);
628 } else {
629 if (rc != -ENOSPC)
630 return (rc);
631 }
632
633 /* in place extension not possible.
634 * try to move the extent to a new set of blocks.
635 */
636 return (extBalloc(ip, blkno, newnblks, newblkno));
637}
638#endif /* _NOTYET */
639
640
641/*
642 * NAME: extRoundDown()
643 *
644 * FUNCTION: round down a specified number of blocks to the next
645 * smallest power of 2 number.
646 *
647 * PARAMETERS:
648 * nb - the number of blocks to round down.
649 *
650 * RETURN VALUES:
651 * next smallest power of 2 number.
652 */
653static s64 extRoundDown(s64 nb)
654{
655 int i;
656 u64 m, k;
657
658 for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) {
659 if (m & nb)
660 break;
661 }
662
663 i = 63 - i;
664 k = (u64) 1 << i;
665 k = ((k - 1) & nb) ? k : k >> 1;
666
667 return (k);
668}
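/*
 * Usage sketch for extRoundDown() (helper name is illustrative): the
 * retry loop in extBalloc() shrinks its request through successively
 * smaller powers of two until it drops below nbperpage.  Note that an
 * exact power of two rounds to the next one down (16 -> 8), so the
 * sequence always decreases:
 *
 *	extRoundDown(24) == 16	(keep the top bit)
 *	extRoundDown(16) == 8	(exact power shrinks)
 *	extRoundDown(7)  == 4
 */
static inline int extRoundDownCheckSketch(void)
{
	return extRoundDown(24) == 16 &&
	       extRoundDown(16) == 8 &&
	       extRoundDown(7) == 4;
}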
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
new file mode 100644
index 000000000000..e80fc7ced87d
--- /dev/null
+++ b/fs/jfs/jfs_extent.h
@@ -0,0 +1,31 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_EXTENT
19#define _H_JFS_EXTENT
20
21/* get block allocation hint as location of disk inode */
22#define INOHINT(ip) \
23 (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
24
25extern int extAlloc(struct inode *, s64, s64, xad_t *, boolean_t);
26extern int extFill(struct inode *, xad_t *);
27extern int extHint(struct inode *, s64, xad_t *);
28extern int extRealloc(struct inode *, s64, xad_t *, boolean_t);
29extern int extRecord(struct inode *, xad_t *);
30
31#endif /* _H_JFS_EXTENT */
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
new file mode 100644
index 000000000000..86ccac80f0ab
--- /dev/null
+++ b/fs/jfs/jfs_filsys.h
@@ -0,0 +1,280 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2003
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_FILSYS
19#define _H_JFS_FILSYS
20
21/*
22 * jfs_filsys.h
23 *
24 * file system (implementation-dependent) constants
25 *
26 * refer to <limits.h> for system wide implementation-dependent constants
27 */
28
29/*
30 * file system option (superblock flag)
31 */
32/* mount time flag to disable journaling to disk */
33#define JFS_NOINTEGRITY 0x00000010
34
35/* mount time flags for error handling */
36#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
37#define JFS_ERR_CONTINUE 0x00000004 /* continue */
38#define JFS_ERR_PANIC 0x00000008 /* panic */
39
40/* platform option (conditional compilation) */
41#define JFS_AIX 0x80000000 /* AIX support */
42/* POSIX name/directory support */
43
44#define JFS_OS2 0x40000000 /* OS/2 support */
45/* case-insensitive name/directory support */
46
47#define JFS_DFS 0x20000000 /* DCE DFS LFS support */
48
49#define JFS_LINUX 0x10000000 /* Linux support */
50/* case-sensitive name/directory support */
51
52/* directory option */
53#define JFS_UNICODE 0x00000001 /* unicode name */
54
55/* commit option */
56#define JFS_COMMIT 0x00000f00 /* commit option mask */
57#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */
58#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */
59#define JFS_TMPFS 0x00000400 /* temporary file system -
60 * do not log/commit:
61 */
62
63/* log logical volume option */
64#define JFS_INLINELOG 0x00000800 /* inline log within file system */
65#define JFS_INLINEMOVE 0x00001000 /* inline log being moved */
66
67/* Secondary aggregate inode table */
68#define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */
69
70/* sparse regular file support */
71#define JFS_SPARSE 0x00020000 /* sparse regular file */
72
73/* DASD Limits F226941 */
74#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */
75#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */
76
77/* big endian flag */
78#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */
79
80/* Directory index */
81#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */
82 /* directory entries */
83
84
85/*
86 * buffer cache configuration
87 */
88/* page size */
89#ifdef PSIZE
90#undef PSIZE
91#endif
92#define PSIZE 4096 /* page size (in byte) */
93#define L2PSIZE 12 /* log2(PSIZE) */
94#define POFFSET 4095 /* offset within page */
95
96/* buffer page size */
97#define BPSIZE PSIZE
98
99/*
100 * fs fundamental size
101 *
102 * PSIZE >= file system block size >= PBSIZE >= DISIZE
103 */
104#define PBSIZE 512 /* physical block size (in byte) */
105#define L2PBSIZE 9 /* log2(PBSIZE) */
106
107#define DISIZE 512 /* on-disk inode size (in byte) */
108#define L2DISIZE 9 /* log2(DISIZE) */
109
110#define IDATASIZE 256 /* inode inline data size */
111#define IXATTRSIZE 128 /* inode inline extended attribute size */
112
113#define XTPAGE_SIZE 4096
114#define log2_PAGESIZE 12
115
116#define IAG_SIZE 4096
117#define IAG_EXTENT_SIZE 4096
118#define INOSPERIAG 4096 /* number of disk inodes per iag */
119#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */
120#define INOSPEREXT 32 /* number of disk inode per extent */
121#define L2INOSPEREXT 5 /* l2 number of disk inode per extent */
122#define IXSIZE (DISIZE * INOSPEREXT) /* inode extent size */
123#define INOSPERPAGE 8 /* number of disk inodes per 4K page */
124#define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */
125
126#define IAGFREELIST_LWM 64
127
128#define INODE_EXTENT_SIZE IXSIZE /* inode extent size */
129#define NUM_INODE_PER_EXTENT INOSPEREXT
130#define NUM_INODE_PER_IAG INOSPERIAG
131
132#define MINBLOCKSIZE 512
133#define MAXBLOCKSIZE 4096
134#define MAXFILESIZE ((s64)1 << 52)
135
136#define JFS_LINK_MAX 0xffffffff
137
138/* Minimum number of bytes supported for a JFS partition */
139#define MINJFS (0x1000000)
140#define MINJFSTEXT "16"
141
142/*
143 * file system block size -> physical block size
144 */
145#define LBOFFSET(x) ((x) & (PBSIZE - 1))
146#define LBNUMBER(x) ((x) >> L2PBSIZE)
147#define LBLK2PBLK(sb,b) ((b) << (sb->s_blocksize_bits - L2PBSIZE))
148#define PBLK2LBLK(sb,b) ((b) >> (sb->s_blocksize_bits - L2PBSIZE))
149/* size in byte -> last page number */
150#define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) )
151/* size in byte -> last file system block number */
152#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) )
153
154/*
155 * fixed physical block address (physical block size = 512 byte)
156 *
157 * NOTE: since we can't guarantee a physical block size of 512 bytes, the use of
158 * these macros should be removed and the byte offset macros used instead.
159 */
160#define SUPER1_B 64 /* primary superblock */
161#define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map */
162#define AITBL_B (AIMAP_B + 16) /*
163 * 1st extent of aggregate inode table
164 */
165#define SUPER2_B (AITBL_B + 32) /* 2ndary superblock pbn */
166#define BMAP_B (SUPER2_B + 8) /* block allocation map */
167
168/*
169 * SIZE_OF_SUPER defines the total amount of space reserved on disk for the
170 * superblock. This is not the same as the superblock structure, since all of
171 * this space is not currently being used.
172 */
173#define SIZE_OF_SUPER PSIZE
174
175/*
176 * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table
177 */
178#define SIZE_OF_AG_TABLE PSIZE
179
180/*
181 * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of
182 * the inode allocation map (to hold iag)
183 */
184#define SIZE_OF_MAP_PAGE PSIZE
185
186/*
187 * fixed byte offset address
188 */
189#define SUPER1_OFF 0x8000 /* primary superblock */
190#define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER)
191 /*
192 * Control page of aggregate inode map
193 * followed by 1st extent of map
194 */
195#define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1))
196 /*
197 * 1st extent of aggregate inode table
198 */
199#define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE)
200 /*
201 * secondary superblock
202 */
203#define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER)
204 /*
205 * block allocation map
206 */
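/*
 * Worked layout under the 4K PSIZE above (values in bytes; these
 * follow directly from the macro definitions):
 *
 *	SUPER1_OFF = 0x8000			(after 32K reserved)
 *	AIMAP_OFF  = 0x8000 + 0x1000		= 0x9000
 *	AITBL_OFF  = 0x9000 + 2 * 0x1000	= 0xB000 (44K)
 *	SUPER2_OFF = 0xB000 + 512 * 32		= 0xF000
 *	BMAP_OFF   = 0xF000 + 0x1000		= 0x10000
 */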
207
208/*
209 * The following macro is used to indicate the number of reserved disk blocks at
210 * the front of an aggregate, in terms of physical blocks. This value is
211 * currently defined to be 32K. This turns out to be the same as the primary
212 * superblock's address, since it directly follows the reserved blocks.
213 */
214#define AGGR_RSVD_BLOCKS SUPER1_B
215
216/*
217 * The following macro is used to indicate the number of reserved bytes at the
218 * front of an aggregate. This value is currently defined to be 32K. This
219 * turns out to be the same as the primary superblock's byte offset, since it
220 * directly follows the reserved blocks.
221 */
222#define AGGR_RSVD_BYTES SUPER1_OFF
223
224/*
225 * The following macro defines the byte offset for the first inode extent in
226 * the aggregate inode table. This allows us to find the self inode to find the
227 * rest of the table. Currently this value is 44K.
228 */
229#define AGGR_INODE_TABLE_START AITBL_OFF
230
231/*
232 * fixed reserved inode number
233 */
234/* aggregate inode */
235#define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */
236#define AGGREGATE_I 1 /* aggregate inode map inode */
237#define BMAP_I 2 /* aggregate block allocation map inode */
238#define LOG_I 3 /* aggregate inline log inode */
239#define BADBLOCK_I 4 /* aggregate bad block inode */
240#define FILESYSTEM_I 16 /* 1st/only fileset inode in ait:
241 * fileset inode map inode
242 */
243
244/* per fileset inode */
245#define FILESET_RSVD_I 0 /* fileset inode (reserved) */
246#define FILESET_EXT_I 1 /* fileset inode extension */
247#define ROOT_I 2 /* fileset root inode */
248#define ACL_I 3 /* fileset ACL inode */
249
250#define FILESET_OBJECT_I 4 /* the first fileset inode available for a file
251 * or directory or link...
252 */
253#define FIRST_FILESET_INO 16 /* the first aggregate inode which describes
254 * an inode. (To fsck this is also the first
255 * inode in part 2 of the agg inode table.)
256 */
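/*
 * Usage sketch (illustrative; the real sequence lives in jfs_mount.c):
 * at mount time the fixed aggregate inode numbers above are used to
 * bootstrap the maps, roughly:
 *
 *	ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);	aggregate imap
 *	ipimap  = diReadSpecial(sb, FILESYSTEM_I, 0);	fileset imap
 *
 * after which ordinary fileset inodes, starting with ROOT_I, are read
 * through the regular diRead() path.
 */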
257
258/*
259 * directory configuration
260 */
261#define JFS_NAME_MAX 255
262#define JFS_PATH_MAX BPSIZE
263
264
265/*
266 * file system state (superblock state)
267 */
268#define FM_CLEAN 0x00000000 /* file system is unmounted and clean */
269#define FM_MOUNT 0x00000001 /* file system is mounted cleanly */
270#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean
271 * when mounted or
272 * commit failure occurred while being mounted:
273 * fsck() must be run to repair
274 */
275#define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed:
276 * fsck() must be run to repair
277 */
278#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */
279
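/*
 * Sketch of how the state is typically consulted at mount time
 * (illustrative; the authoritative check lives in the superblock
 * validation code, and the field name is from jfs_superblock.h):
 *
 *	if (le32_to_cpu(j_sb->s_state) & FM_DIRTY)
 *		... replay the log / run fsck before going writable ...
 */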
280#endif /* _H_JFS_FILSYS */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
new file mode 100644
index 000000000000..783831301625
--- /dev/null
+++ b/fs/jfs/jfs_imap.c
@@ -0,0 +1,3270 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19/*
20 * jfs_imap.c: inode allocation map manager
21 *
22 * Serialization:
23 * Each AG has a simple lock which is used to control the serialization of
24 * the AG level lists. This lock should be taken first whenever an AG
25 * level list will be modified or accessed.
26 *
27 * Each IAG is locked by obtaining the buffer for the IAG page.
28 *
29 * There is also an inode lock for the inode map inode. A read lock needs to
30 * be taken whenever an IAG is read from the map or the global level
31 * information is read. A write lock needs to be taken whenever the global
32 * level information is modified or an atomic operation needs to be used.
33 *
34 * If more than one IAG is read at one time, the read lock may not
35 * be given up until all of the IAGs are read. Otherwise, a deadlock
36 * may occur when trying to obtain the read lock while another thread
37 * holding the read lock is waiting on the IAG already being held.
38 *
39 * The control page of the inode map is read into memory by diMount().
40 * Thereafter it should only be modified in memory and then it will be
41 * written out when the filesystem is unmounted by diUnmount().
42 */
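/*
 * The resulting lock ordering, as practiced by diFree() below
 * (illustrative summary, not additional locking rules):
 *
 *	AG_LOCK(imap, agno);		1. AG level list lock
 *	IREAD_LOCK(ipimap);		2. inode map read lock
 *	diIAGRead(imap, iagno, &mp);	3. IAG page buffer(s)
 *	...
 *	IREAD_UNLOCK(ipimap);		only after all needed IAGs are read
 */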
43
44#include <linux/fs.h>
45#include <linux/buffer_head.h>
46#include <linux/pagemap.h>
47#include <linux/quotaops.h>
48
49#include "jfs_incore.h"
50#include "jfs_filsys.h"
51#include "jfs_dinode.h"
52#include "jfs_dmap.h"
53#include "jfs_imap.h"
54#include "jfs_metapage.h"
55#include "jfs_superblock.h"
56#include "jfs_debug.h"
57
58/*
59 * imap locks
60 */
61/* iag free list lock */
62#define IAGFREE_LOCK_INIT(imap) init_MUTEX(&imap->im_freelock)
63#define IAGFREE_LOCK(imap) down(&imap->im_freelock)
64#define IAGFREE_UNLOCK(imap) up(&imap->im_freelock)
65
66/* per ag iag list locks */
67#define AG_LOCK_INIT(imap,index) init_MUTEX(&(imap->im_aglock[index]))
68#define AG_LOCK(imap,agno) down(&imap->im_aglock[agno])
69#define AG_UNLOCK(imap,agno) up(&imap->im_aglock[agno])
70
71/*
72 * external references
73 */
74extern struct address_space_operations jfs_aops;
75
76/*
77 * forward references
78 */
79static int diAllocAG(struct inomap *, int, boolean_t, struct inode *);
80static int diAllocAny(struct inomap *, int, boolean_t, struct inode *);
81static int diAllocBit(struct inomap *, struct iag *, int);
82static int diAllocExt(struct inomap *, int, struct inode *);
83static int diAllocIno(struct inomap *, int, struct inode *);
84static int diFindFree(u32, int);
85static int diNewExt(struct inomap *, struct iag *, int);
86static int diNewIAG(struct inomap *, int *, int, struct metapage **);
87static void duplicateIXtree(struct super_block *, s64, int, s64 *);
88
89static int diIAGRead(struct inomap * imap, int, struct metapage **);
90static int copy_from_dinode(struct dinode *, struct inode *);
91static void copy_to_dinode(struct dinode *, struct inode *);
92
93/*
94 * debug code for double-checking inode map
95 */
96/* #define _JFS_DEBUG_IMAP 1 */
97
98#ifdef _JFS_DEBUG_IMAP
99#define DBG_DIINIT(imap) DBGdiInit(imap)
100#define DBG_DIALLOC(imap, ino) DBGdiAlloc(imap, ino)
101#define DBG_DIFREE(imap, ino) DBGdiFree(imap, ino)
102
103static void *DBGdiInit(struct inomap * imap);
104static void DBGdiAlloc(struct inomap * imap, ino_t ino);
105static void DBGdiFree(struct inomap * imap, ino_t ino);
106#else
107#define DBG_DIINIT(imap)
108#define DBG_DIALLOC(imap, ino)
109#define DBG_DIFREE(imap, ino)
110#endif /* _JFS_DEBUG_IMAP */
111
112/*
113 * NAME: diMount()
114 *
115 * FUNCTION: initialize the incore inode map control structures for
116 *		a fileset or aggregate at init time.
117 *
118 * the inode map's control structure (dinomap) is
119 * brought in from disk and placed in virtual memory.
120 *
121 * PARAMETERS:
122 * ipimap - pointer to inode map inode for the aggregate or fileset.
123 *
124 * RETURN VALUES:
125 * 0 - success
126 * -ENOMEM - insufficient free virtual memory.
127 * -EIO - i/o error.
128 */
129int diMount(struct inode *ipimap)
130{
131 struct inomap *imap;
132 struct metapage *mp;
133 int index;
134 struct dinomap_disk *dinom_le;
135
136 /*
137 * allocate/initialize the in-memory inode map control structure
138 */
139 /* allocate the in-memory inode map control structure. */
140 imap = (struct inomap *) kmalloc(sizeof(struct inomap), GFP_KERNEL);
141 if (imap == NULL) {
142 jfs_err("diMount: kmalloc returned NULL!");
143 return -ENOMEM;
144 }
145
146 /* read the on-disk inode map control structure. */
147
148 mp = read_metapage(ipimap,
149 IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
150 PSIZE, 0);
151 if (mp == NULL) {
152 kfree(imap);
153 return -EIO;
154 }
155
156 /* copy the on-disk version to the in-memory version. */
157 dinom_le = (struct dinomap_disk *) mp->data;
158 imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag);
159 imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag);
160 atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos));
161 atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree));
162 imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext);
163 imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext);
164 for (index = 0; index < MAXAG; index++) {
165 imap->im_agctl[index].inofree =
166 le32_to_cpu(dinom_le->in_agctl[index].inofree);
167 imap->im_agctl[index].extfree =
168 le32_to_cpu(dinom_le->in_agctl[index].extfree);
169 imap->im_agctl[index].numinos =
170 le32_to_cpu(dinom_le->in_agctl[index].numinos);
171 imap->im_agctl[index].numfree =
172 le32_to_cpu(dinom_le->in_agctl[index].numfree);
173 }
174
175 /* release the buffer. */
176 release_metapage(mp);
177
178 /*
179 * allocate/initialize inode allocation map locks
180 */
181 /* allocate and init iag free list lock */
182 IAGFREE_LOCK_INIT(imap);
183
184 /* allocate and init ag list locks */
185 for (index = 0; index < MAXAG; index++) {
186 AG_LOCK_INIT(imap, index);
187 }
188
189 /* bind the inode map inode and inode map control structure
190 * to each other.
191 */
192 imap->im_ipimap = ipimap;
193 JFS_IP(ipimap)->i_imap = imap;
194
195// DBG_DIINIT(imap);
196
197 return (0);
198}
199
200
201/*
202 * NAME: diUnmount()
203 *
204 * FUNCTION: write to disk the incore inode map control structures for
205 * a fileset or aggregate at unmount time.
206 *
207 * PARAMETERS:
208 * ipimap - pointer to inode map inode for the aggregate or fileset.
209 *
210 * RETURN VALUES:
211 * 0 - success
212 * -ENOMEM - insufficient free virtual memory.
213 * -EIO - i/o error.
214 */
215int diUnmount(struct inode *ipimap, int mounterror)
216{
217 struct inomap *imap = JFS_IP(ipimap)->i_imap;
218
219 /*
220 * update the on-disk inode map control structure
221 */
222
223 if (!(mounterror || isReadOnly(ipimap)))
224 diSync(ipimap);
225
226 /*
227 * Invalidate the page cache buffers
228 */
229 truncate_inode_pages(ipimap->i_mapping, 0);
230
231 /*
232 * free in-memory control structure
233 */
234 kfree(imap);
235
236 return (0);
237}
238
239
240/*
241 * diSync()
242 */
243int diSync(struct inode *ipimap)
244{
245 struct dinomap_disk *dinom_le;
246 struct inomap *imp = JFS_IP(ipimap)->i_imap;
247 struct metapage *mp;
248 int index;
249
250 /*
251	 * write imap global control page
252 */
253 /* read the on-disk inode map control structure */
254 mp = get_metapage(ipimap,
255 IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
256 PSIZE, 0);
257 if (mp == NULL) {
258 jfs_err("diSync: get_metapage failed!");
259 return -EIO;
260 }
261
262 /* copy the in-memory version to the on-disk version */
263 dinom_le = (struct dinomap_disk *) mp->data;
264 dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag);
265 dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag);
266 dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos));
267 dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree));
268 dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext);
269 dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext);
270 for (index = 0; index < MAXAG; index++) {
271 dinom_le->in_agctl[index].inofree =
272 cpu_to_le32(imp->im_agctl[index].inofree);
273 dinom_le->in_agctl[index].extfree =
274 cpu_to_le32(imp->im_agctl[index].extfree);
275 dinom_le->in_agctl[index].numinos =
276 cpu_to_le32(imp->im_agctl[index].numinos);
277 dinom_le->in_agctl[index].numfree =
278 cpu_to_le32(imp->im_agctl[index].numfree);
279 }
280
281 /* write out the control structure */
282 write_metapage(mp);
283
284 /*
285 * write out dirty pages of imap
286 */
287 filemap_fdatawrite(ipimap->i_mapping);
288 filemap_fdatawait(ipimap->i_mapping);
289
290 diWriteSpecial(ipimap, 0);
291
292 return (0);
293}
294
295
296/*
297 * NAME: diRead()
298 *
299 * FUNCTION: initialize an incore inode from disk.
300 *
301 *		on entry, the specified incore inode should itself
302 * specify the disk inode number corresponding to the
303 * incore inode (i.e. i_number should be initialized).
304 *
305 * this routine handles incore inode initialization for
306 * both "special" and "regular" inodes. special inodes
307 * are those required early in the mount process and
308 * require special handling since much of the file system
309 * is not yet initialized. these "special" inodes are
310 * identified by a NULL inode map inode pointer and are
311 * actually initialized by a call to diReadSpecial().
312 *
313 * for regular inodes, the iag describing the disk inode
314 * is read from disk to determine the inode extent address
315 * for the disk inode. with the inode extent address in
316 * hand, the page of the extent that contains the disk
317 * inode is read and the disk inode is copied to the
318 * incore inode.
319 *
320 * PARAMETERS:
321 * ip - pointer to incore inode to be initialized from disk.
322 *
323 * RETURN VALUES:
324 * 0 - success
325 * -EIO - i/o error.
326 * -ENOMEM - insufficient memory
327 *
328 */
329int diRead(struct inode *ip)
330{
331 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
332 int iagno, ino, extno, rc;
333 struct inode *ipimap;
334 struct dinode *dp;
335 struct iag *iagp;
336 struct metapage *mp;
337 s64 blkno, agstart;
338 struct inomap *imap;
339 int block_offset;
340 int inodes_left;
341 uint pageno;
342 int rel_inode;
343
344 jfs_info("diRead: ino = %ld", ip->i_ino);
345
346 ipimap = sbi->ipimap;
347 JFS_IP(ip)->ipimap = ipimap;
348
349 /* determine the iag number for this inode (number) */
350 iagno = INOTOIAG(ip->i_ino);
351
352 /* read the iag */
353 imap = JFS_IP(ipimap)->i_imap;
354 IREAD_LOCK(ipimap);
355 rc = diIAGRead(imap, iagno, &mp);
356 IREAD_UNLOCK(ipimap);
357 if (rc) {
358 jfs_err("diRead: diIAGRead returned %d", rc);
359 return (rc);
360 }
361
362 iagp = (struct iag *) mp->data;
363
364 /* determine inode extent that holds the disk inode */
365 ino = ip->i_ino & (INOSPERIAG - 1);
366 extno = ino >> L2INOSPEREXT;
367
368 if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
369 (addressPXD(&iagp->inoext[extno]) == 0)) {
370 release_metapage(mp);
371 return -ESTALE;
372 }
373
374 /* get disk block number of the page within the inode extent
375 * that holds the disk inode.
376 */
377 blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);
378
379 /* get the ag for the iag */
380 agstart = le64_to_cpu(iagp->agstart);
381
382 release_metapage(mp);
383
384 rel_inode = (ino & (INOSPERPAGE - 1));
385 pageno = blkno >> sbi->l2nbperpage;
386
387 if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
388 /*
389 * OS/2 didn't always align inode extents on page boundaries
390 */
391 inodes_left =
392 (sbi->nbperpage - block_offset) << sbi->l2niperblk;
393
394 if (rel_inode < inodes_left)
395 rel_inode += block_offset << sbi->l2niperblk;
396 else {
397 pageno += 1;
398 rel_inode -= inodes_left;
399 }
400 }
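	/*
	 * Worked example of the adjustment above (illustrative; assumes
	 * 512-byte fs blocks, so nbperpage == 8 and l2niperblk == 0, one
	 * disk inode per block): for an inode in the first page of an
	 * extent starting at block 19, blkno == 19, so pageno == 2
	 * (blocks 16..23), block_offset == 3, and inodes_left == 5.
	 * rel_inode == 2 fits: it becomes slot 2 + 3 == 5 of page 2
	 * (block 21, i.e. 19 + 2). rel_inode == 6 does not: it spills to
	 * page 3, slot 6 - 5 == 1 (block 25, i.e. 19 + 6).
	 */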
401
402 /* read the page of disk inode */
403 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
404	if (mp == NULL) {
405 jfs_err("diRead: read_metapage failed");
406 return -EIO;
407 }
408
409	/* locate the disk inode requested */
410 dp = (struct dinode *) mp->data;
411 dp += rel_inode;
412
413 if (ip->i_ino != le32_to_cpu(dp->di_number)) {
414 jfs_error(ip->i_sb, "diRead: i_ino != di_number");
415 rc = -EIO;
416 } else if (le32_to_cpu(dp->di_nlink) == 0)
417 rc = -ESTALE;
418 else
419 /* copy the disk inode to the in-memory inode */
420 rc = copy_from_dinode(dp, ip);
421
422 release_metapage(mp);
423
424 /* set the ag for the inode */
425 JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
426 JFS_IP(ip)->active_ag = -1;
427
428 return (rc);
429}
430
431
432/*
433 * NAME: diReadSpecial()
434 *
435 * FUNCTION: initialize a 'special' inode from disk.
436 *
437 *		this routine handles aggregate level inodes. The
438 * inode cache cannot differentiate between the
439 * aggregate inodes and the filesystem inodes, so we
440 * handle these here. We don't actually use the aggregate
441 * inode map, since these inodes are at a fixed location
442 * and in some cases the aggregate inode map isn't initialized
443 * yet.
444 *
445 * PARAMETERS:
446 * sb - filesystem superblock
447 * inum - aggregate inode number
448 * secondary - 1 if secondary aggregate inode table
449 *
450 * RETURN VALUES:
451 * new inode - success
452 * NULL - i/o error.
453 */
454struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
455{
456 struct jfs_sb_info *sbi = JFS_SBI(sb);
457 uint address;
458 struct dinode *dp;
459 struct inode *ip;
460 struct metapage *mp;
461
462 ip = new_inode(sb);
463 if (ip == NULL) {
464 jfs_err("diReadSpecial: new_inode returned NULL!");
465 return ip;
466 }
467
468 if (secondary) {
469 address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
470 JFS_IP(ip)->ipimap = sbi->ipaimap2;
471 } else {
472 address = AITBL_OFF >> L2PSIZE;
473 JFS_IP(ip)->ipimap = sbi->ipaimap;
474 }
475
476 ASSERT(inum < INOSPEREXT);
477
478 ip->i_ino = inum;
479
480 address += inum >> 3; /* 8 inodes per 4K page */
481
482 /* read the page of fixed disk inode (AIT) in raw mode */
483 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
484 if (mp == NULL) {
485 ip->i_nlink = 1; /* Don't want iput() deleting it */
486 iput(ip);
487 return (NULL);
488 }
489
490 /* get the pointer to the disk inode of interest */
491 dp = (struct dinode *) (mp->data);
492 dp += inum % 8; /* 8 inodes per 4K page */
493
494 /* copy on-disk inode to in-memory inode */
495 if ((copy_from_dinode(dp, ip)) != 0) {
496 /* handle bad return by returning NULL for ip */
497 ip->i_nlink = 1; /* Don't want iput() deleting it */
498 iput(ip);
499 /* release the page */
500 release_metapage(mp);
501 return (NULL);
502
503 }
504
505 ip->i_mapping->a_ops = &jfs_aops;
506 mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS);
507
508 /* Allocations to metadata inodes should not affect quotas */
509 ip->i_flags |= S_NOQUOTA;
510
511 if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
512 sbi->gengen = le32_to_cpu(dp->di_gengen);
513 sbi->inostamp = le32_to_cpu(dp->di_inostamp);
514 }
515
516 /* release the page */
517 release_metapage(mp);
518
519 return (ip);
520}
521
522/*
523 * NAME: diWriteSpecial()
524 *
525 * FUNCTION: Write the special inode to disk
526 *
527 * PARAMETERS:
528 * ip - special inode
529 * secondary - 1 if secondary aggregate inode table
530 *
531 * RETURN VALUES: none
532 */
533
534void diWriteSpecial(struct inode *ip, int secondary)
535{
536 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
537 uint address;
538 struct dinode *dp;
539 ino_t inum = ip->i_ino;
540 struct metapage *mp;
541
542 ip->i_state &= ~I_DIRTY;
543
544 if (secondary)
545 address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
546 else
547 address = AITBL_OFF >> L2PSIZE;
548
549 ASSERT(inum < INOSPEREXT);
550
551 address += inum >> 3; /* 8 inodes per 4K page */
552
553 /* read the page of fixed disk inode (AIT) in raw mode */
554 mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
555 if (mp == NULL) {
556 jfs_err("diWriteSpecial: failed to read aggregate inode "
557 "extent!");
558 return;
559 }
560
561 /* get the pointer to the disk inode of interest */
562 dp = (struct dinode *) (mp->data);
563 dp += inum % 8; /* 8 inodes per 4K page */
564
565 /* copy on-disk inode to in-memory inode */
566 copy_to_dinode(dp, ip);
567 memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);
568
569 if (inum == FILESYSTEM_I)
570 dp->di_gengen = cpu_to_le32(sbi->gengen);
571
572 /* write the page */
573 write_metapage(mp);
574}
575
576/*
577 * NAME: diFreeSpecial()
578 *
579 * FUNCTION: Free allocated space for special inode
580 */
581void diFreeSpecial(struct inode *ip)
582{
583 if (ip == NULL) {
584 jfs_err("diFreeSpecial called with NULL ip!");
585 return;
586 }
587 filemap_fdatawrite(ip->i_mapping);
588 filemap_fdatawait(ip->i_mapping);
589 truncate_inode_pages(ip->i_mapping, 0);
590 iput(ip);
591}
592
593
594
595/*
596 * NAME: diWrite()
597 *
598 * FUNCTION: write the on-disk inode portion of the in-memory inode
599 * to its corresponding on-disk inode.
600 *
601 *		on entry, the specified incore inode should itself
602 * specify the disk inode number corresponding to the
603 * incore inode (i.e. i_number should be initialized).
604 *
605 * the inode contains the inode extent address for the disk
606 * inode. with the inode extent address in hand, the
607 * page of the extent that contains the disk inode is
608 * read and the disk inode portion of the incore inode
609 * is copied to the disk inode.
610 *
611 * PARAMETERS:
612 *	tid - transaction id
613 * ip - pointer to incore inode to be written to the inode extent.
614 *
615 * RETURN VALUES:
616 * 0 - success
617 * -EIO - i/o error.
618 */
619int diWrite(tid_t tid, struct inode *ip)
620{
621 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
622 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
623 int rc = 0;
624 s32 ino;
625 struct dinode *dp;
626 s64 blkno;
627 int block_offset;
628 int inodes_left;
629 struct metapage *mp;
630 uint pageno;
631 int rel_inode;
632 int dioffset;
633 struct inode *ipimap;
634 uint type;
635 lid_t lid;
636 struct tlock *ditlck, *tlck;
637 struct linelock *dilinelock, *ilinelock;
638 struct lv *lv;
639 int n;
640
641 ipimap = jfs_ip->ipimap;
642
643 ino = ip->i_ino & (INOSPERIAG - 1);
644
645 if (!addressPXD(&(jfs_ip->ixpxd)) ||
646 (lengthPXD(&(jfs_ip->ixpxd)) !=
647 JFS_IP(ipimap)->i_imap->im_nbperiext)) {
648 jfs_error(ip->i_sb, "diWrite: ixpxd invalid");
649 return -EIO;
650 }
651
652 /*
653 * read the page of disk inode containing the specified inode:
654 */
655 /* compute the block address of the page */
656 blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
657
658 rel_inode = (ino & (INOSPERPAGE - 1));
659 pageno = blkno >> sbi->l2nbperpage;
660
661 if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
662 /*
663 * OS/2 didn't always align inode extents on page boundaries
664 */
665 inodes_left =
666 (sbi->nbperpage - block_offset) << sbi->l2niperblk;
667
668 if (rel_inode < inodes_left)
669 rel_inode += block_offset << sbi->l2niperblk;
670 else {
671 pageno += 1;
672 rel_inode -= inodes_left;
673 }
674 }
675 /* read the page of disk inode */
676 retry:
677 mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
678	if (mp == NULL)
679 return -EIO;
680
681 /* get the pointer to the disk inode */
682 dp = (struct dinode *) mp->data;
683 dp += rel_inode;
684
685 dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
686
687 /*
688 * acquire transaction lock on the on-disk inode;
689 * N.B. tlock is acquired on ipimap not ip;
690 */
691 if ((ditlck =
692 txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
693 goto retry;
694 dilinelock = (struct linelock *) & ditlck->lock;
695
696 /*
697 * copy btree root from in-memory inode to on-disk inode
698 *
699 * (tlock is taken from inline B+-tree root in in-memory
700 * inode when the B+-tree root is updated, which is pointed
701 * by jfs_ip->blid as well as being on tx tlock list)
702 *
703 * further processing of btree root is based on the copy
704 * in in-memory inode, where txLog() will log from, and,
705 * for xtree root, txUpdateMap() will update map and reset
706 * XAD_NEW bit;
707 */
708
709 if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
710 /*
711 * This is the special xtree inside the directory for storing
712 * the directory table
713 */
714 xtpage_t *p, *xp;
715 xad_t *xad;
716
717 jfs_ip->xtlid = 0;
718 tlck = lid_to_tlock(lid);
719 assert(tlck->type & tlckXTREE);
720 tlck->type |= tlckBTROOT;
721 tlck->mp = mp;
722 ilinelock = (struct linelock *) & tlck->lock;
723
724 /*
725 * copy xtree root from inode to dinode:
726 */
727 p = &jfs_ip->i_xtroot;
728 xp = (xtpage_t *) &dp->di_dirtable;
729 lv = ilinelock->lv;
730 for (n = 0; n < ilinelock->index; n++, lv++) {
731 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
732 lv->length << L2XTSLOTSIZE);
733 }
734
735 /* reset on-disk (metadata page) xtree XAD_NEW bit */
736 xad = &xp->xad[XTENTRYSTART];
737 for (n = XTENTRYSTART;
738 n < le16_to_cpu(xp->header.nextindex); n++, xad++)
739 if (xad->flag & (XAD_NEW | XAD_EXTENDED))
740 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
741 }
742
743 if ((lid = jfs_ip->blid) == 0)
744 goto inlineData;
745 jfs_ip->blid = 0;
746
747 tlck = lid_to_tlock(lid);
748 type = tlck->type;
749 tlck->type |= tlckBTROOT;
750 tlck->mp = mp;
751 ilinelock = (struct linelock *) & tlck->lock;
752
753 /*
754 * regular file: 16 byte (XAD slot) granularity
755 */
756 if (type & tlckXTREE) {
757 xtpage_t *p, *xp;
758 xad_t *xad;
759
760 /*
761 * copy xtree root from inode to dinode:
762 */
763 p = &jfs_ip->i_xtroot;
764 xp = &dp->di_xtroot;
765 lv = ilinelock->lv;
766 for (n = 0; n < ilinelock->index; n++, lv++) {
767 memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
768 lv->length << L2XTSLOTSIZE);
769 }
770
771 /* reset on-disk (metadata page) xtree XAD_NEW bit */
772 xad = &xp->xad[XTENTRYSTART];
773 for (n = XTENTRYSTART;
774 n < le16_to_cpu(xp->header.nextindex); n++, xad++)
775 if (xad->flag & (XAD_NEW | XAD_EXTENDED))
776 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
777 }
778 /*
779 * directory: 32 byte (directory entry slot) granularity
780 */
781 else if (type & tlckDTREE) {
782 dtpage_t *p, *xp;
783
784 /*
785 * copy dtree root from inode to dinode:
786 */
787 p = (dtpage_t *) &jfs_ip->i_dtroot;
788 xp = (dtpage_t *) & dp->di_dtroot;
789 lv = ilinelock->lv;
790 for (n = 0; n < ilinelock->index; n++, lv++) {
791 memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
792 lv->length << L2DTSLOTSIZE);
793 }
794 } else {
795 jfs_err("diWrite: UFO tlock");
796 }
797
798 inlineData:
799 /*
800 * copy inline symlink from in-memory inode to on-disk inode
801 */
802 if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
803 lv = & dilinelock->lv[dilinelock->index];
804 lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
805 lv->length = 2;
806 memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
807 dilinelock->index++;
808 }
809 /*
810 * copy inline data from in-memory inode to on-disk inode:
811 * 128 byte slot granularity
812 */
813 if (test_cflag(COMMIT_Inlineea, ip)) {
814 lv = & dilinelock->lv[dilinelock->index];
815 lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
816 lv->length = 1;
817 memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE);
818 dilinelock->index++;
819
820 clear_cflag(COMMIT_Inlineea, ip);
821 }
822
823 /*
824 * lock/copy inode base: 128 byte slot granularity
825 */
826// baseDinode:
827 lv = & dilinelock->lv[dilinelock->index];
828 lv->offset = dioffset >> L2INODESLOTSIZE;
829 copy_to_dinode(dp, ip);
830 if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
831 lv->length = 2;
832 memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
833 } else
834 lv->length = 1;
835 dilinelock->index++;
836
837#ifdef _JFS_FASTDASD
838 /*
839 * We aren't logging changes to the DASD used in directory inodes,
840 * but we need to write them to disk. If we don't unmount cleanly,
841 * mount will recalculate the DASD used.
842 */
843 if (S_ISDIR(ip->i_mode)
844 && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
845 memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd));
846#endif /* _JFS_FASTDASD */
847
848 /* release the buffer holding the updated on-disk inode.
849 * the buffer will be later written by commit processing.
850 */
851 write_metapage(mp);
852
853 return (rc);
854}
855
856
857/*
858 * NAME: diFree(ip)
859 *
860 * FUNCTION: free a specified inode from the inode working map
861 * for a fileset or aggregate.
862 *
863 * if the inode to be freed represents the first (only)
864 * free inode within the iag, the iag will be placed on
865 * the ag free inode list.
866 *
867 * freeing the inode will cause the inode extent to be
868 * freed if the inode is the only allocated inode within
869 * the extent. in this case all the disk resource backing
870 * up the inode extent will be freed. in addition, the iag
871 * will be placed on the ag extent free list if the extent
872 * is the first free extent in the iag. if freeing the
873 * extent also means that no free inodes will exist for
874 * the iag, the iag will also be removed from the ag free
875 * inode list.
876 *
877 * the iag describing the inode will be freed if the extent
878 * is to be freed and it is the only backed extent within
879 * the iag. in this case, the iag will be removed from the
880 * ag free extent list and ag free inode list and placed on
881 * the inode map's free iag list.
882 *
883 * a careful update approach is used to provide consistency
884 * in the face of updates to multiple buffers. under this
885 * approach, all required buffers are obtained before making
886 * any updates and are held until all updates are complete.
887 *
888 * PARAMETERS:
889 * ip - inode to be freed.
890 *
891 * RETURN VALUES:
892 * 0 - success
893 * -EIO - i/o error.
894 */
895int diFree(struct inode *ip)
896{
897 int rc;
898 ino_t inum = ip->i_ino;
899 struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
900 struct metapage *mp, *amp, *bmp, *cmp, *dmp;
901 int iagno, ino, extno, bitno, sword, agno;
902 int back, fwd;
903 u32 bitmap, mask;
904 struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
905 struct inomap *imap = JFS_IP(ipimap)->i_imap;
906 pxd_t freepxd;
907 tid_t tid;
908 struct inode *iplist[3];
909 struct tlock *tlck;
910 struct pxd_lock *pxdlock;
911
912 /*
913 * This is just to suppress compiler warnings. The same logic that
914 * references these variables is used to initialize them.
915 */
916 aiagp = biagp = ciagp = diagp = NULL;
917
918 /* get the iag number containing the inode.
919 */
920 iagno = INOTOIAG(inum);
921
922 /* make sure that the iag is contained within
923 * the map.
924 */
925 if (iagno >= imap->im_nextiag) {
926 dump_mem("imap", imap, 32);
927 jfs_error(ip->i_sb,
928 "diFree: inum = %d, iagno = %d, nextiag = %d",
929 (uint) inum, iagno, imap->im_nextiag);
930 return -EIO;
931 }
932
933 /* get the allocation group for this ino.
934 */
935 agno = JFS_IP(ip)->agno;
936
937 /* Lock the AG specific inode map information
938 */
939 AG_LOCK(imap, agno);
940
941 /* Obtain read lock in imap inode. Don't release it until we have
942	 * read all of the IAGs that we are going to.
943 */
944 IREAD_LOCK(ipimap);
945
946 /* read the iag.
947 */
948 if ((rc = diIAGRead(imap, iagno, &mp))) {
949 IREAD_UNLOCK(ipimap);
950 AG_UNLOCK(imap, agno);
951 return (rc);
952 }
953 iagp = (struct iag *) mp->data;
954
955 /* get the inode number and extent number of the inode within
956 * the iag and the inode number within the extent.
957 */
958 ino = inum & (INOSPERIAG - 1);
959 extno = ino >> L2INOSPEREXT;
960 bitno = ino & (INOSPEREXT - 1);
961 mask = HIGHORDER >> bitno;
962
963 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
964 jfs_error(ip->i_sb,
965 "diFree: wmap shows inode already free");
966 }
967
968 if (!addressPXD(&iagp->inoext[extno])) {
969 release_metapage(mp);
970 IREAD_UNLOCK(ipimap);
971 AG_UNLOCK(imap, agno);
972 jfs_error(ip->i_sb, "diFree: invalid inoext");
973 return -EIO;
974 }
975
976 /* compute the bitmap for the extent reflecting the freed inode.
977 */
978 bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
979
980 if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
981 release_metapage(mp);
982 IREAD_UNLOCK(ipimap);
983 AG_UNLOCK(imap, agno);
984 jfs_error(ip->i_sb, "diFree: numfree > numinos");
985 return -EIO;
986 }
987 /*
988 * inode extent still has some inodes or below low water mark:
989 * keep the inode extent;
990 */
991 if (bitmap ||
992 imap->im_agctl[agno].numfree < 96 ||
993 (imap->im_agctl[agno].numfree < 288 &&
994 (((imap->im_agctl[agno].numfree * 100) /
995 imap->im_agctl[agno].numinos) <= 25))) {
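		/*
		 * Illustrative numbers for the test above: with
		 * numfree == 200 and numinos == 1000 for this ag,
		 * 200 < 288 and 200 * 100 / 1000 == 20 <= 25, so the
		 * extent is kept even when the freed inode was its last
		 * allocated one.
		 */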
996 /* if the iag currently has no free inodes (i.e.,
997 * the inode being freed is the first free inode of iag),
998 * insert the iag at head of the inode free list for the ag.
999 */
1000 if (iagp->nfreeinos == 0) {
1001 /* check if there are any iags on the ag inode
1002 * free list. if so, read the first one so that
1003 * we can link the current iag onto the list at
1004 * the head.
1005 */
1006 if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
1007 /* read the iag that currently is the head
1008 * of the list.
1009 */
1010 if ((rc = diIAGRead(imap, fwd, &amp))) {
1011 IREAD_UNLOCK(ipimap);
1012 AG_UNLOCK(imap, agno);
1013 release_metapage(mp);
1014 return (rc);
1015 }
1016 aiagp = (struct iag *) amp->data;
1017
1018 /* make current head point back to the iag.
1019 */
1020 aiagp->inofreeback = cpu_to_le32(iagno);
1021
1022 write_metapage(amp);
1023 }
1024
1025 /* iag points forward to current head and iag
1026 * becomes the new head of the list.
1027 */
1028 iagp->inofreefwd =
1029 cpu_to_le32(imap->im_agctl[agno].inofree);
1030 iagp->inofreeback = cpu_to_le32(-1);
1031 imap->im_agctl[agno].inofree = iagno;
1032 }
1033 IREAD_UNLOCK(ipimap);
1034
1035 /* update the free inode summary map for the extent if
1036 * freeing the inode means the extent will now have free
1037 * inodes (i.e., the inode being freed is the first free
1038 * inode of extent),
1039 */
1040 if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
1041 sword = extno >> L2EXTSPERSUM;
1042 bitno = extno & (EXTSPERSUM - 1);
1043 iagp->inosmap[sword] &=
1044 cpu_to_le32(~(HIGHORDER >> bitno));
1045 }
1046
1047 /* update the bitmap.
1048 */
1049 iagp->wmap[extno] = cpu_to_le32(bitmap);
1050 DBG_DIFREE(imap, inum);
1051
1052 /* update the free inode counts at the iag, ag and
1053 * map level.
1054 */
1055 iagp->nfreeinos =
1056 cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
1057 imap->im_agctl[agno].numfree += 1;
1058 atomic_inc(&imap->im_numfree);
1059
1060 /* release the AG inode map lock
1061 */
1062 AG_UNLOCK(imap, agno);
1063
1064 /* write the iag */
1065 write_metapage(mp);
1066
1067 return (0);
1068 }
1069
1070
1071 /*
1072 * inode extent has become free and above low water mark:
1073 * free the inode extent;
1074 */
1075
1076 /*
1077 * prepare to update iag list(s) (careful update step 1)
1078 */
1079 amp = bmp = cmp = dmp = NULL;
1080 fwd = back = -1;
1081
1082 /* check if the iag currently has no free extents. if so,
1083 * it will be placed on the head of the ag extent free list.
1084 */
1085 if (iagp->nfreeexts == 0) {
1086 /* check if the ag extent free list has any iags.
1087 * if so, read the iag at the head of the list now.
1088 * this (head) iag will be updated later to reflect
1089 * the addition of the current iag at the head of
1090 * the list.
1091 */
1092 if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
1093 if ((rc = diIAGRead(imap, fwd, &amp)))
1094 goto error_out;
1095 aiagp = (struct iag *) amp->data;
1096 }
1097 } else {
1098 /* iag has free extents. check if the addition of a free
1099 * extent will cause all extents to be free within this
1100 * iag. if so, the iag will be removed from the ag extent
1101 * free list and placed on the inode map's free iag list.
1102 */
1103 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1104 /* in preparation for removing the iag from the
1105		 * ag extent free list, read the iags preceding
1106 * and following the iag on the ag extent free
1107 * list.
1108 */
1109 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
1110 if ((rc = diIAGRead(imap, fwd, &amp)))
1111 goto error_out;
1112 aiagp = (struct iag *) amp->data;
1113 }
1114
1115 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
1116 if ((rc = diIAGRead(imap, back, &bmp)))
1117 goto error_out;
1118 biagp = (struct iag *) bmp->data;
1119 }
1120 }
1121 }
1122
1123 /* remove the iag from the ag inode free list if freeing
1124	 * this extent causes the iag to have no free inodes.
1125 */
1126 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1127 int inofreeback = le32_to_cpu(iagp->inofreeback);
1128 int inofreefwd = le32_to_cpu(iagp->inofreefwd);
1129
1130 /* in preparation for removing the iag from the
1131		 * ag inode free list, read the iags preceding
1132		 * and following the iag on the ag inode free
1133		 * list. before reading these iags, we must make
1134		 * sure that we don't already have them in hand
1135 * from up above, since re-reading an iag (buffer)
1136 * we are currently holding would cause a deadlock.
1137 */
1138 if (inofreefwd >= 0) {
1139
1140 if (inofreefwd == fwd)
1141 ciagp = (struct iag *) amp->data;
1142 else if (inofreefwd == back)
1143 ciagp = (struct iag *) bmp->data;
1144 else {
1145 if ((rc =
1146 diIAGRead(imap, inofreefwd, &cmp)))
1147 goto error_out;
1148 ciagp = (struct iag *) cmp->data;
1149 }
1150 assert(ciagp != NULL);
1151 }
1152
1153 if (inofreeback >= 0) {
1154 if (inofreeback == fwd)
1155 diagp = (struct iag *) amp->data;
1156 else if (inofreeback == back)
1157 diagp = (struct iag *) bmp->data;
1158 else {
1159 if ((rc =
1160 diIAGRead(imap, inofreeback, &dmp)))
1161 goto error_out;
1162 diagp = (struct iag *) dmp->data;
1163 }
1164 assert(diagp != NULL);
1165 }
1166 }
1167
1168 IREAD_UNLOCK(ipimap);
1169
1170 /*
1171 * invalidate any page of the inode extent freed from buffer cache;
1172 */
1173 freepxd = iagp->inoext[extno];
1174 invalidate_pxd_metapages(ip, freepxd);
1175
1176 /*
1177 * update iag list(s) (careful update step 2)
1178 */
1179 /* add the iag to the ag extent free list if this is the
1180 * first free extent for the iag.
1181 */
1182 if (iagp->nfreeexts == 0) {
1183 if (fwd >= 0)
1184 aiagp->extfreeback = cpu_to_le32(iagno);
1185
1186 iagp->extfreefwd =
1187 cpu_to_le32(imap->im_agctl[agno].extfree);
1188 iagp->extfreeback = cpu_to_le32(-1);
1189 imap->im_agctl[agno].extfree = iagno;
1190 } else {
1191 /* remove the iag from the ag extent list if all extents
1192 * are now free and place it on the inode map iag free list.
1193 */
1194 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1195 if (fwd >= 0)
1196 aiagp->extfreeback = iagp->extfreeback;
1197
1198 if (back >= 0)
1199 biagp->extfreefwd = iagp->extfreefwd;
1200 else
1201 imap->im_agctl[agno].extfree =
1202 le32_to_cpu(iagp->extfreefwd);
1203
1204 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
1205
1206 IAGFREE_LOCK(imap);
1207 iagp->iagfree = cpu_to_le32(imap->im_freeiag);
1208 imap->im_freeiag = iagno;
1209 IAGFREE_UNLOCK(imap);
1210 }
1211 }
1212
1213 /* remove the iag from the ag inode free list if freeing
1214 * this extent causes the iag to have no free inodes.
1215 */
1216 if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
1217 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0)
1218 ciagp->inofreeback = iagp->inofreeback;
1219
1220 if ((int) le32_to_cpu(iagp->inofreeback) >= 0)
1221 diagp->inofreefwd = iagp->inofreefwd;
1222 else
1223 imap->im_agctl[agno].inofree =
1224 le32_to_cpu(iagp->inofreefwd);
1225
1226 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
1227 }
1228
1229 /* update the inode extent address and working map
1230 * to reflect the free extent.
1231 * the permanent map should have been updated already
1232 * for the inode being freed.
1233 */
1234 if (iagp->pmap[extno] != 0) {
1235 jfs_error(ip->i_sb, "diFree: the pmap does not show inode free");
1236 }
1237 iagp->wmap[extno] = 0;
1238 DBG_DIFREE(imap, inum);
1239 PXDlength(&iagp->inoext[extno], 0);
1240 PXDaddress(&iagp->inoext[extno], 0);
1241
1242 /* update the free extent and free inode summary maps
1243 * to reflect the freed extent.
1244 * the inode summary map is marked to indicate no inodes
1245 * available for the freed extent.
1246 */
1247 sword = extno >> L2EXTSPERSUM;
1248 bitno = extno & (EXTSPERSUM - 1);
1249 mask = HIGHORDER >> bitno;
1250 iagp->inosmap[sword] |= cpu_to_le32(mask);
1251 iagp->extsmap[sword] &= cpu_to_le32(~mask);
1252
1253 /* update the number of free inodes and number of free extents
1254 * for the iag.
1255 */
1256 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) -
1257 (INOSPEREXT - 1));
1258 iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
1259
1260 /* update the number of free inodes and backed inodes
1261 * at the ag and inode map level.
1262 */
1263 imap->im_agctl[agno].numfree -= (INOSPEREXT - 1);
1264 imap->im_agctl[agno].numinos -= INOSPEREXT;
1265 atomic_sub(INOSPEREXT - 1, &imap->im_numfree);
1266 atomic_sub(INOSPEREXT, &imap->im_numinos);
1267
1268 if (amp)
1269 write_metapage(amp);
1270 if (bmp)
1271 write_metapage(bmp);
1272 if (cmp)
1273 write_metapage(cmp);
1274 if (dmp)
1275 write_metapage(dmp);
1276
1277 /*
1278 * start transaction to update block allocation map
1279 * for the inode extent freed;
1280 *
1281 * N.B. AG_LOCK is released and iag will be released below, and
1282 * other thread may allocate inode from/reusing the ixad freed
1283 * BUT with new/different backing inode extent from the extent
1284 * to be freed by the transaction;
1285 */
1286 tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
1287 down(&JFS_IP(ipimap)->commit_sem);
1288
1289 /* acquire tlock of the iag page of the freed ixad
1290 * to force the page NOHOMEOK (even though no data is
1291 * logged from the iag page) until NOREDOPAGE|FREEXTENT log
1292 * for the free of the extent is committed;
1293 * write FREEXTENT|NOREDOPAGE log record
1294 * N.B. linelock is overlaid as freed extent descriptor;
1295 */
1296 tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE);
1297 pxdlock = (struct pxd_lock *) & tlck->lock;
1298 pxdlock->flag = mlckFREEPXD;
1299 pxdlock->pxd = freepxd;
1300 pxdlock->index = 1;
1301
1302 write_metapage(mp);
1303
1304 iplist[0] = ipimap;
1305
1306 /*
1307 * logredo needs the IAG number and IAG extent index in order
1308 * to ensure that the IMap is consistent. The least disruptive
1309 * way to pass these values through to the transaction manager
1310 * is in the iplist array.
1311 *
1312 * It's not pretty, but it works.
1313 */
1314 iplist[1] = (struct inode *) (size_t)iagno;
1315 iplist[2] = (struct inode *) (size_t)extno;
1316
1317 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
1318
1319 txEnd(tid);
1320 up(&JFS_IP(ipimap)->commit_sem);
1321
1322 /* unlock the AG inode map information */
1323 AG_UNLOCK(imap, agno);
1324
1325 return (0);
1326
1327 error_out:
1328 IREAD_UNLOCK(ipimap);
1329
1330 if (amp)
1331 release_metapage(amp);
1332 if (bmp)
1333 release_metapage(bmp);
1334 if (cmp)
1335 release_metapage(cmp);
1336 if (dmp)
1337 release_metapage(dmp);
1338
1339 AG_UNLOCK(imap, agno);
1340
1341 release_metapage(mp);
1342
1343 return (rc);
1344}
1345
1346/*
1347 * There are several places in the diAlloc* routines where we initialize
1348 * the inode.
1349 */
1350static inline void
1351diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
1352{
1353 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
1354 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
1355
1356 ip->i_ino = (iagno << L2INOSPERIAG) + ino;
1357	DBG_DIALLOC(JFS_IP(jfs_ip->ipimap)->i_imap, ip->i_ino);
1358 jfs_ip->ixpxd = iagp->inoext[extno];
1359 jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
1360 jfs_ip->active_ag = -1;
1361}
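/*
 * Example of the inode number arithmetic in diInitInode() (illustrative;
 * with the standard geometry of jfs_imap.h, INOSPERIAG == 4096 and
 * L2INOSPERIAG == 12): slot ino == 35 of iag iagno == 2 yields
 * i_ino == (2 << 12) + 35 == 8227, and INOTOIAG(8227) == 8227 >> 12 == 2
 * recovers the iag.
 */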
1362
1363
1364/*
1365 * NAME: diAlloc(pip,dir,ip)
1366 *
1367 * FUNCTION: allocate a disk inode from the inode working map
1368 * for a fileset or aggregate.
1369 *
1370 * PARAMETERS:
1371 * pip - pointer to incore inode for the parent inode.
1372 * dir - TRUE if the new disk inode is for a directory.
1373 * ip - pointer to a new inode
1374 *
1375 * RETURN VALUES:
1376 * 0 - success.
1377 * -ENOSPC - insufficient disk resources.
1378 * -EIO - i/o error.
1379 */
1380int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
1381{
1382 int rc, ino, iagno, addext, extno, bitno, sword;
1383 int nwords, rem, i, agno;
1384 u32 mask, inosmap, extsmap;
1385 struct inode *ipimap;
1386 struct metapage *mp;
1387 ino_t inum;
1388 struct iag *iagp;
1389 struct inomap *imap;
1390
1391 /* get the pointers to the inode map inode and the
1392 * corresponding imap control structure.
1393 */
1394 ipimap = JFS_SBI(pip->i_sb)->ipimap;
1395 imap = JFS_IP(ipimap)->i_imap;
1396 JFS_IP(ip)->ipimap = ipimap;
1397 JFS_IP(ip)->fileset = FILESYSTEM_I;
1398
1399 /* for a directory, the allocation policy is to start
1400 * at the ag level using the preferred ag.
1401 */
1402 if (dir == TRUE) {
1403 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1404 AG_LOCK(imap, agno);
1405 goto tryag;
1406 }
1407
1408 /* for files, the policy starts off by trying to allocate from
1409 * the same iag containing the parent disk inode:
1410 * try to allocate the new disk inode close to the parent disk
1411 * inode, using parent disk inode number + 1 as the allocation
1412 * hint. (we use a left-to-right policy to attempt to avoid
1413 * moving backward on the disk.) compute the hint within the
1414 * file system and the iag.
1415 */
1416
1417 /* get the ag number of this iag */
1418 agno = JFS_IP(pip)->agno;
1419
1420 if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
1421 /*
1422 * There is an open file actively growing. We want to
1423 * allocate new inodes from a different ag to avoid
1424 * fragmentation problems.
1425 */
1426 agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
1427 AG_LOCK(imap, agno);
1428 goto tryag;
1429 }
1430
1431 inum = pip->i_ino + 1;
1432 ino = inum & (INOSPERIAG - 1);
1433
1434	/* back off the hint if it is outside of the iag */
1435 if (ino == 0)
1436 inum = pip->i_ino;
1437
1438 /* lock the AG inode map information */
1439 AG_LOCK(imap, agno);
1440
1441 /* Get read lock on imap inode */
1442 IREAD_LOCK(ipimap);
1443
1444 /* get the iag number and read the iag */
1445 iagno = INOTOIAG(inum);
1446 if ((rc = diIAGRead(imap, iagno, &mp))) {
1447 IREAD_UNLOCK(ipimap);
1448 AG_UNLOCK(imap, agno);
1449 return (rc);
1450 }
1451 iagp = (struct iag *) mp->data;
1452
1453 /* determine if new inode extent is allowed to be added to the iag.
1454 * new inode extent can be added to the iag if the ag
1455 * has less than 32 free disk inodes and the iag has free extents.
1456 */
1457 addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts);
1458
1459 /*
1460 * try to allocate from the IAG
1461 */
1462 /* check if the inode may be allocated from the iag
1463 * (i.e. the inode has free inodes or new extent can be added).
1464 */
1465 if (iagp->nfreeinos || addext) {
1466 /* determine the extent number of the hint.
1467 */
1468 extno = ino >> L2INOSPEREXT;
1469
1470 /* check if the extent containing the hint has backed
1471 * inodes. if so, try to allocate within this extent.
1472 */
1473 if (addressPXD(&iagp->inoext[extno])) {
1474 bitno = ino & (INOSPEREXT - 1);
1475 if ((bitno =
1476 diFindFree(le32_to_cpu(iagp->wmap[extno]),
1477 bitno))
1478 < INOSPEREXT) {
1479 ino = (extno << L2INOSPEREXT) + bitno;
1480
1481 /* a free inode (bit) was found within this
1482 * extent, so allocate it.
1483 */
1484 rc = diAllocBit(imap, iagp, ino);
1485 IREAD_UNLOCK(ipimap);
1486 if (rc) {
1487 assert(rc == -EIO);
1488 } else {
1489 /* set the results of the allocation
1490 * and write the iag.
1491 */
1492 diInitInode(ip, iagno, ino, extno,
1493 iagp);
1494 mark_metapage_dirty(mp);
1495 }
1496 release_metapage(mp);
1497
1498 /* free the AG lock and return.
1499 */
1500 AG_UNLOCK(imap, agno);
1501 return (rc);
1502 }
1503
1504 if (!addext)
1505 extno =
1506 (extno ==
1507 EXTSPERIAG - 1) ? 0 : extno + 1;
1508 }
1509
1510 /*
1511 * no free inodes within the extent containing the hint.
1512 *
1513 * try to allocate from the backed extents following
1514 * hint or, if appropriate (i.e. addext is true), allocate
1515 * an extent of free inodes at or following the extent
1516 * containing the hint.
1517 *
1518 * the free inode and free extent summary maps are used
1519 * here, so determine the starting summary map position
1520 * and the number of words we'll have to examine. again,
1521 * the approach is to allocate following the hint, so we
1522 * might have to initially ignore prior bits of the summary
1523 * map that represent extents prior to the extent containing
1524 * the hint and later revisit these bits.
1525 */
1526 bitno = extno & (EXTSPERSUM - 1);
1527 nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1;
1528 sword = extno >> L2EXTSPERSUM;
1529
1530 /* mask any prior bits for the starting words of the
1531 * summary map.
1532 */
1533 mask = ONES << (EXTSPERSUM - bitno);
1534 inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
1535 extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
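		/*
		 * Example of the masking above (illustrative): the summary
		 * maps are HIGHORDER-first, bit 0 being the MSB. With
		 * extno == 69, sword == 2 and bitno == 5, so
		 * mask == ONES << (32 - 5) sets bits 0..4 of the local
		 * copies, making extents 64..68 look busy and letting
		 * diFindFree() begin at the extent containing the hint.
		 */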
1536
1537 /* scan the free inode and free extent summary maps for
1538 * free resources.
1539 */
1540 for (i = 0; i < nwords; i++) {
1541 /* check if this word of the free inode summary
1542 * map describes an extent with free inodes.
1543 */
1544 if (~inosmap) {
1545 /* an extent with free inodes has been
1546 * found. determine the extent number
1547 * and the inode number within the extent.
1548 */
1549 rem = diFindFree(inosmap, 0);
1550 extno = (sword << L2EXTSPERSUM) + rem;
1551 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]),
1552 0);
1553 if (rem >= INOSPEREXT) {
1554 IREAD_UNLOCK(ipimap);
1555 release_metapage(mp);
1556 AG_UNLOCK(imap, agno);
1557 jfs_error(ip->i_sb,
1558 "diAlloc: can't find free bit "
1559 "in wmap");
1560					return -EIO;
1561 }
1562
1563 /* determine the inode number within the
1564 * iag and allocate the inode from the
1565 * map.
1566 */
1567 ino = (extno << L2INOSPEREXT) + rem;
1568 rc = diAllocBit(imap, iagp, ino);
1569 IREAD_UNLOCK(ipimap);
1570 if (rc)
1571 assert(rc == -EIO);
1572 else {
1573 /* set the results of the allocation
1574 * and write the iag.
1575 */
1576 diInitInode(ip, iagno, ino, extno,
1577 iagp);
1578 mark_metapage_dirty(mp);
1579 }
1580 release_metapage(mp);
1581
1582 /* free the AG lock and return.
1583 */
1584 AG_UNLOCK(imap, agno);
1585 return (rc);
1586
1587 }
1588
1589 /* check if we may allocate an extent of free
1590 * inodes and whether this word of the free
1591 * extents summary map describes a free extent.
1592 */
1593 if (addext && ~extsmap) {
1594 /* a free extent has been found. determine
1595 * the extent number.
1596 */
1597 rem = diFindFree(extsmap, 0);
1598 extno = (sword << L2EXTSPERSUM) + rem;
1599
1600 /* allocate an extent of free inodes.
1601 */
1602 if ((rc = diNewExt(imap, iagp, extno))) {
1603 /* if there is no disk space for a
1604 * new extent, try to allocate the
1605 * disk inode from somewhere else.
1606 */
1607 if (rc == -ENOSPC)
1608 break;
1609
1610 assert(rc == -EIO);
1611 } else {
1612 /* set the results of the allocation
1613 * and write the iag.
1614 */
1615 diInitInode(ip, iagno,
1616 extno << L2INOSPEREXT,
1617 extno, iagp);
1618 mark_metapage_dirty(mp);
1619 }
1620 release_metapage(mp);
1621 /* free the imap inode & the AG lock & return.
1622 */
1623 IREAD_UNLOCK(ipimap);
1624 AG_UNLOCK(imap, agno);
1625 return (rc);
1626 }
1627
1628 /* move on to the next set of summary map words.
1629 */
1630 sword = (sword == SMAPSZ - 1) ? 0 : sword + 1;
1631 inosmap = le32_to_cpu(iagp->inosmap[sword]);
1632 extsmap = le32_to_cpu(iagp->extsmap[sword]);
1633 }
1634 }
1635 /* unlock imap inode */
1636 IREAD_UNLOCK(ipimap);
1637
1638 /* nothing doing in this iag, so release it. */
1639 release_metapage(mp);
1640
1641 tryag:
1642 /*
1643 * try to allocate anywhere within the same AG as the parent inode.
1644 */
1645 rc = diAllocAG(imap, agno, dir, ip);
1646
1647 AG_UNLOCK(imap, agno);
1648
1649 if (rc != -ENOSPC)
1650 return (rc);
1651
1652 /*
1653 * try to allocate in any AG.
1654 */
1655 return (diAllocAny(imap, agno, dir, ip));
1656}
1657
1658
1659/*
1660 * NAME: diAllocAG(imap,agno,dir,ip)
1661 *
1662 * FUNCTION: allocate a disk inode from the allocation group.
1663 *
1664 * this routine first determines if a new extent of free
1665 * inodes should be added for the allocation group, with
1666 * the current request satisfied from this extent. if this
1667 * is the case, an attempt will be made to do just that. if
1668 * this attempt fails or it has been determined that a new
1669 * extent should not be added, an attempt is made to satisfy
1670 * the request by allocating an existing (backed) free inode
1671 * from the allocation group.
1672 *
1673 * PRE CONDITION: Already have the AG lock for this AG.
1674 *
1675 * PARAMETERS:
1676 * imap - pointer to inode map control structure.
1677 * agno - allocation group to allocate from.
1678 * dir - TRUE if the new disk inode is for a directory.
1679 * ip - pointer to the new inode to be filled in on successful return
1680 * with the disk inode number allocated, its extent address
1681 * and the start of the ag.
1682 *
1683 * RETURN VALUES:
1684 * 0 - success.
1685 * -ENOSPC - insufficient disk resources.
1686 * -EIO - i/o error.
1687 */
1688static int
1689diAllocAG(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
1690{
1691 int rc, addext, numfree, numinos;
1692
1693 /* get the number of free and the number of backed disk
1694 * inodes currently within the ag.
1695 */
1696 numfree = imap->im_agctl[agno].numfree;
1697 numinos = imap->im_agctl[agno].numinos;
1698
1699 if (numfree > numinos) {
1700 jfs_error(ip->i_sb, "diAllocAG: numfree > numinos");
1701 return -EIO;
1702 }
1703
1704 /* determine if we should allocate a new extent of free inodes
1705 * within the ag: for directory inodes, add a new extent
1706 * if there are a small number of free inodes or number of free
1707 * inodes is a small percentage of the number of backed inodes.
1708 */
1709 if (dir == TRUE)
1710 addext = (numfree < 64 ||
1711 (numfree < 256
1712 && ((numfree * 100) / numinos) <= 20));
1713 else
1714 addext = (numfree == 0);
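	/*
	 * Illustrative check of the directory policy: numfree == 200 and
	 * numinos == 1500 gives 200 < 256 and 200 * 100 / 1500 == 13 <= 20,
	 * so addext is true even though 200 >= 64.
	 */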
1715
1716 /*
1717 * try to allocate a new extent of free inodes.
1718 */
1719 if (addext) {
1720		/* if free space is not available for this new extent, try
1721 * below to allocate a free and existing (already backed)
1722 * inode from the ag.
1723 */
1724 if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC)
1725 return (rc);
1726 }
1727
1728 /*
1729 * try to allocate an existing free inode from the ag.
1730 */
1731 return (diAllocIno(imap, agno, ip));
1732}
1733
1734
1735/*
1736 * NAME:	diAllocAny(imap,agno,dir,ip)
1737 *
1738 * FUNCTION: allocate a disk inode from any other allocation group.
1739 *
1740 * this routine is called when an allocation attempt within
1741 *		the primary allocation group has failed. it attempts to
1742 * allocate an inode from any allocation group other than the
1743 * specified primary group.
1744 *
1745 * PARAMETERS:
1746 * imap - pointer to inode map control structure.
1747 * agno - primary allocation group (to avoid).
1748 * dir - TRUE if the new disk inode is for a directory.
1749 * ip - pointer to a new inode to be filled in on successful return
1750 * with the disk inode number allocated, its extent address
1751 * and the start of the ag.
1752 *
1753 * RETURN VALUES:
1754 * 0 - success.
1755 * -ENOSPC - insufficient disk resources.
1756 * -EIO - i/o error.
1757 */
1758static int
1759diAllocAny(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
1760{
1761 int ag, rc;
1762 int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
1763
1764
1765 /* try to allocate from the ags following agno up to
1766 * the maximum ag number.
1767 */
1768 for (ag = agno + 1; ag <= maxag; ag++) {
1769 AG_LOCK(imap, ag);
1770
1771 rc = diAllocAG(imap, ag, dir, ip);
1772
1773 AG_UNLOCK(imap, ag);
1774
1775 if (rc != -ENOSPC)
1776 return (rc);
1777 }
1778
1779 /* try to allocate from the ags in front of agno.
1780 */
1781 for (ag = 0; ag < agno; ag++) {
1782 AG_LOCK(imap, ag);
1783
1784 rc = diAllocAG(imap, ag, dir, ip);
1785
1786 AG_UNLOCK(imap, ag);
1787
1788 if (rc != -ENOSPC)
1789 return (rc);
1790 }
1791
1792 /* no free disk inodes.
1793 */
1794 return -ENOSPC;
1795}
1796
1797
1798/*
1799 * NAME: diAllocIno(imap,agno,ip)
1800 *
1801 * FUNCTION: allocate a disk inode from the allocation group's free
1802 * inode list, returning an error if this free list is
1803 * empty (i.e. no iags on the list).
1804 *
1805 * allocation occurs from the first iag on the list using
1806 * the iag's free inode summary map to find the leftmost
1807 * free inode in the iag.
1808 *
1809 * PRE CONDITION: Already have AG lock for this AG.
1810 *
1811 * PARAMETERS:
1812 * imap - pointer to inode map control structure.
1813 * agno - allocation group.
1814 * ip - pointer to new inode to be filled in on successful return
1815 * with the disk inode number allocated, its extent address
1816 * and the start of the ag.
1817 *
1818 * RETURN VALUES:
1819 * 0 - success.
1820 * -ENOSPC - insufficient disk resources.
1821 * -EIO - i/o error.
1822 */
1823static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1824{
1825 int iagno, ino, rc, rem, extno, sword;
1826 struct metapage *mp;
1827 struct iag *iagp;
1828
1829 /* check if there are iags on the ag's free inode list.
1830 */
1831 if ((iagno = imap->im_agctl[agno].inofree) < 0)
1832 return -ENOSPC;
1833
1834 /* obtain read lock on imap inode */
1835 IREAD_LOCK(imap->im_ipimap);
1836
1837 /* read the iag at the head of the list.
1838 */
1839 if ((rc = diIAGRead(imap, iagno, &mp))) {
1840 IREAD_UNLOCK(imap->im_ipimap);
1841 return (rc);
1842 }
1843 iagp = (struct iag *) mp->data;
1844
1845 /* better be free inodes in this iag if it is on the
1846 * list.
1847 */
1848 if (!iagp->nfreeinos) {
1849 IREAD_UNLOCK(imap->im_ipimap);
1850 release_metapage(mp);
1851 jfs_error(ip->i_sb,
1852 "diAllocIno: nfreeinos = 0, but iag on freelist");
1853 return -EIO;
1854 }
1855
1856 /* scan the free inode summary map to find an extent
1857 * with free inodes.
1858 */
1859 for (sword = 0;; sword++) {
1860 if (sword >= SMAPSZ) {
1861 IREAD_UNLOCK(imap->im_ipimap);
1862 release_metapage(mp);
1863 jfs_error(ip->i_sb,
1864 "diAllocIno: free inode not found in summary map");
1865 return -EIO;
1866 }
1867
1868 if (~iagp->inosmap[sword])
1869 break;
1870 }
1871
1872	/* found an extent with free inodes. determine
1873 * the extent number.
1874 */
1875 rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0);
1876 if (rem >= EXTSPERSUM) {
1877 IREAD_UNLOCK(imap->im_ipimap);
1878 release_metapage(mp);
1879 jfs_error(ip->i_sb, "diAllocIno: no free extent found");
1880 return -EIO;
1881 }
1882 extno = (sword << L2EXTSPERSUM) + rem;
1883
1884 /* find the first free inode in the extent.
1885 */
1886 rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0);
1887 if (rem >= INOSPEREXT) {
1888 IREAD_UNLOCK(imap->im_ipimap);
1889 release_metapage(mp);
1890 jfs_error(ip->i_sb, "diAllocIno: free inode not found");
1891 return -EIO;
1892 }
1893
1894 /* compute the inode number within the iag.
1895 */
1896 ino = (extno << L2INOSPEREXT) + rem;
1897
1898 /* allocate the inode.
1899 */
1900 rc = diAllocBit(imap, iagp, ino);
1901 IREAD_UNLOCK(imap->im_ipimap);
1902 if (rc) {
1903 release_metapage(mp);
1904 return (rc);
1905 }
1906
1907 /* set the results of the allocation and write the iag.
1908 */
1909 diInitInode(ip, iagno, ino, extno, iagp);
1910 write_metapage(mp);
1911
1912 return (0);
1913}
1914
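
diAllocIno() locates a free inode with two successive bit scans: one over the free inode summary map to pick an extent, then one over that extent's working map to pick an inode. A small sketch of the index arithmetic, with hypothetical scan results and the 4K-build geometry (32 extents per summary word, 32 inodes per extent):

    #include <stdio.h>

    #define L2EXTSPERSUM 5   /* 32 extents per summary word (jfs_imap.h) */
    #define L2INOSPEREXT 5   /* 4096 inodes per iag / 128 extents = 32 */

    int main(void)
    {
        int sword = 1;       /* hypothetical: second summary word had a clear bit */
        int rem_ext = 3;     /* leftmost clear bit within that word */
        int extno = (sword << L2EXTSPERSUM) + rem_ext;   /* 35 */

        int rem_ino = 7;     /* leftmost clear bit in wmap[extno] */
        int ino = (extno << L2INOSPEREXT) + rem_ino;     /* 1127 */

        printf("extno=%d ino=%d\n", extno, ino);
        return 0;
    }
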
1915
1916/*
1917 * NAME: diAllocExt(imap,agno,ip)
1918 *
1919 * FUNCTION: add a new extent of free inodes to an iag, allocating
1920 * an inode from this extent to satisfy the current allocation
1921 * request.
1922 *
1923 * this routine first tries to find an existing iag with free
1924 *		extents through the ag free extent list. if the list is not
1925 * empty, the head of the list will be selected as the home
1926 * of the new extent of free inodes. otherwise (the list is
1927 * empty), a new iag will be allocated for the ag to contain
1928 * the extent.
1929 *
1930 * once an iag has been selected, the free extent summary map
1931 * is used to locate a free extent within the iag and diNewExt()
1932 * is called to initialize the extent, with initialization
1933 * including the allocation of the first inode of the extent
1934 * for the purpose of satisfying this request.
1935 *
1936 * PARAMETERS:
1937 * imap - pointer to inode map control structure.
1938 * agno - allocation group number.
1939 * ip - pointer to new inode to be filled in on successful return
1940 * with the disk inode number allocated, its extent address
1941 * and the start of the ag.
1942 *
1943 * RETURN VALUES:
1944 * 0 - success.
1945 * -ENOSPC - insufficient disk resources.
1946 * -EIO - i/o error.
1947 */
1948static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1949{
1950 int rem, iagno, sword, extno, rc;
1951 struct metapage *mp;
1952 struct iag *iagp;
1953
1954 /* check if the ag has any iags with free extents. if not,
1955 * allocate a new iag for the ag.
1956 */
1957 if ((iagno = imap->im_agctl[agno].extfree) < 0) {
1958 /* If successful, diNewIAG will obtain the read lock on the
1959 * imap inode.
1960 */
1961 if ((rc = diNewIAG(imap, &iagno, agno, &mp))) {
1962 return (rc);
1963 }
1964 iagp = (struct iag *) mp->data;
1965
1966		/* set the ag number if this is a brand new iag
1967 */
1968 iagp->agstart =
1969 cpu_to_le64(AGTOBLK(agno, imap->im_ipimap));
1970 } else {
1971 /* read the iag.
1972 */
1973 IREAD_LOCK(imap->im_ipimap);
1974 if ((rc = diIAGRead(imap, iagno, &mp))) {
1975 IREAD_UNLOCK(imap->im_ipimap);
1976 jfs_error(ip->i_sb, "diAllocExt: error reading iag");
1977 return rc;
1978 }
1979 iagp = (struct iag *) mp->data;
1980 }
1981
1982 /* using the free extent summary map, find a free extent.
1983 */
1984 for (sword = 0;; sword++) {
1985 if (sword >= SMAPSZ) {
1986 release_metapage(mp);
1987 IREAD_UNLOCK(imap->im_ipimap);
1988 jfs_error(ip->i_sb,
1989 "diAllocExt: free ext summary map not found");
1990 return -EIO;
1991 }
1992 if (~iagp->extsmap[sword])
1993 break;
1994 }
1995
1996 /* determine the extent number of the free extent.
1997 */
1998 rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0);
1999 if (rem >= EXTSPERSUM) {
2000 release_metapage(mp);
2001 IREAD_UNLOCK(imap->im_ipimap);
2002 jfs_error(ip->i_sb, "diAllocExt: free extent not found");
2003 return -EIO;
2004 }
2005 extno = (sword << L2EXTSPERSUM) + rem;
2006
2007 /* initialize the new extent.
2008 */
2009 rc = diNewExt(imap, iagp, extno);
2010 IREAD_UNLOCK(imap->im_ipimap);
2011 if (rc) {
2012 /* something bad happened. if a new iag was allocated,
2013 * place it back on the inode map's iag free list, and
2014 * clear the ag number information.
2015 */
2016 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2017 IAGFREE_LOCK(imap);
2018 iagp->iagfree = cpu_to_le32(imap->im_freeiag);
2019 imap->im_freeiag = iagno;
2020 IAGFREE_UNLOCK(imap);
2021 }
2022 write_metapage(mp);
2023 return (rc);
2024 }
2025
2026 /* set the results of the allocation and write the iag.
2027 */
2028 diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp);
2029
2030 write_metapage(mp);
2031
2032 return (0);
2033}
2034
2035
2036/*
2037 * NAME: diAllocBit(imap,iagp,ino)
2038 *
2039 * FUNCTION: allocate a backed inode from an iag.
2040 *
2041 * this routine performs the mechanics of allocating a
2042 * specified inode from a backed extent.
2043 *
2044 * if the inode to be allocated represents the last free
2045 * inode within the iag, the iag will be removed from the
2046 * ag free inode list.
2047 *
2048 * a careful update approach is used to provide consistency
2049 * in the face of updates to multiple buffers. under this
2050 * approach, all required buffers are obtained before making
2051 *		any updates and are held until all updates are complete.
2052 *
2053 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
2054 * this AG. Must have read lock on imap inode.
2055 *
2056 * PARAMETERS:
2057 * imap - pointer to inode map control structure.
2058 * iagp - pointer to iag.
2059 * ino - inode number to be allocated within the iag.
2060 *
2061 * RETURN VALUES:
2062 * 0 - success.
2063 * -ENOSPC - insufficient disk resources.
2064 * -EIO - i/o error.
2065 */
2066static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2067{
2068 int extno, bitno, agno, sword, rc;
2069 struct metapage *amp = NULL, *bmp = NULL;
2070 struct iag *aiagp = NULL, *biagp = NULL;
2071 u32 mask;
2072
2073 /* check if this is the last free inode within the iag.
2074 * if so, it will have to be removed from the ag free
2075	 * inode list, so get the iags preceding and following
2076 * it on the list.
2077 */
2078 if (iagp->nfreeinos == cpu_to_le32(1)) {
2079 if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) {
2080 if ((rc =
2081 diIAGRead(imap, le32_to_cpu(iagp->inofreefwd),
2082 &amp)))
2083 return (rc);
2084 aiagp = (struct iag *) amp->data;
2085 }
2086
2087 if ((int) le32_to_cpu(iagp->inofreeback) >= 0) {
2088 if ((rc =
2089 diIAGRead(imap,
2090 le32_to_cpu(iagp->inofreeback),
2091 &bmp))) {
2092 if (amp)
2093 release_metapage(amp);
2094 return (rc);
2095 }
2096 biagp = (struct iag *) bmp->data;
2097 }
2098 }
2099
2100 /* get the ag number, extent number, inode number within
2101 * the extent.
2102 */
2103 agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb));
2104 extno = ino >> L2INOSPEREXT;
2105 bitno = ino & (INOSPEREXT - 1);
2106
2107 /* compute the mask for setting the map.
2108 */
2109 mask = HIGHORDER >> bitno;
2110
2111 /* the inode should be free and backed.
2112 */
2113 if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) ||
2114 ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) ||
2115 (addressPXD(&iagp->inoext[extno]) == 0)) {
2116 if (amp)
2117 release_metapage(amp);
2118 if (bmp)
2119 release_metapage(bmp);
2120
2121 jfs_error(imap->im_ipimap->i_sb,
2122 "diAllocBit: iag inconsistent");
2123 return -EIO;
2124 }
2125
2126 /* mark the inode as allocated in the working map.
2127 */
2128 iagp->wmap[extno] |= cpu_to_le32(mask);
2129
2130 /* check if all inodes within the extent are now
2131 * allocated. if so, update the free inode summary
2132 * map to reflect this.
2133 */
2134 if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
2135 sword = extno >> L2EXTSPERSUM;
2136 bitno = extno & (EXTSPERSUM - 1);
2137 iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno);
2138 }
2139
2140 /* if this was the last free inode in the iag, remove the
2141 * iag from the ag free inode list.
2142 */
2143 if (iagp->nfreeinos == cpu_to_le32(1)) {
2144 if (amp) {
2145 aiagp->inofreeback = iagp->inofreeback;
2146 write_metapage(amp);
2147 }
2148
2149 if (bmp) {
2150 biagp->inofreefwd = iagp->inofreefwd;
2151 write_metapage(bmp);
2152 } else {
2153 imap->im_agctl[agno].inofree =
2154 le32_to_cpu(iagp->inofreefwd);
2155 }
2156 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
2157 }
2158
2159 /* update the free inode count at the iag, ag, inode
2160 * map levels.
2161 */
2162 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1);
2163 imap->im_agctl[agno].numfree -= 1;
2164 atomic_dec(&imap->im_numfree);
2165
2166 return (0);
2167}
2168
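
When diAllocBit() consumes the last free inode of an iag, it unlinks the iag from the ag free inode list, a doubly linked list threaded through the iags by number with -1 as nil and its head in im_agctl[agno].inofree. A userspace toy model of that unlink (hypothetical layout, not kernel code):

    #include <stdio.h>

    struct toy_iag { int inofreefwd, inofreeback; };

    int main(void)
    {
        struct toy_iag iags[3] = {
            { .inofreefwd =  2, .inofreeback = -1 },   /* list head */
            { .inofreefwd = -1, .inofreeback =  2 },   /* list tail */
            { .inofreefwd =  1, .inofreeback =  0 },
        };
        int inofree = 0;   /* per-AG head: list is 0 <-> 2 <-> 1 */
        int victim = 2;    /* iag whose last free inode was just allocated */

        if (iags[victim].inofreefwd >= 0)
            iags[iags[victim].inofreefwd].inofreeback = iags[victim].inofreeback;
        if (iags[victim].inofreeback >= 0)
            iags[iags[victim].inofreeback].inofreefwd = iags[victim].inofreefwd;
        else
            inofree = iags[victim].inofreefwd;         /* new list head */
        iags[victim].inofreefwd = iags[victim].inofreeback = -1;

        printf("head=%d, 0->%d, 1<-%d\n", inofree,
               iags[0].inofreefwd, iags[1].inofreeback); /* head=0, 0->1, 1<-0 */
        return 0;
    }
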
2169
2170/*
2171 * NAME: diNewExt(imap,iagp,extno)
2172 *
2173 * FUNCTION: initialize a new extent of inodes for an iag, allocating
2174 * the first inode of the extent for use for the current
2175 * allocation request.
2176 *
2177 * disk resources are allocated for the new extent of inodes
2178 * and the inodes themselves are initialized to reflect their
2179 * existence within the extent (i.e. their inode numbers and
2180 * inode extent addresses are set) and their initial state
2181 * (mode and link count are set to zero).
2182 *
2183 * if the iag is new, it is not yet on an ag extent free list
2184 * but will now be placed on this list.
2185 *
2186 * if the allocation of the new extent causes the iag to
2187 * have no free extent, the iag will be removed from the
2188 * ag extent free list.
2189 *
2190 * if the iag has no free backed inodes, it will be placed
2191 * on the ag free inode list, since the addition of the new
2192 * extent will now cause it to have free inodes.
2193 *
2194 * a careful update approach is used to provide consistency
2195 * (i.e. list consistency) in the face of updates to multiple
2196 * buffers. under this approach, all required buffers are
2197 * obtained before making any updates and are held until all
2198 * updates are complete.
2199 *
2200 * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on
2201 * this AG. Must have read lock on imap inode.
2202 *
2203 * PARAMETERS:
2204 * imap - pointer to inode map control structure.
2205 * iagp - pointer to iag.
2206 * extno - extent number.
2207 *
2208 * RETURN VALUES:
2209 * 0 - success.
2210 * -ENOSPC - insufficient disk resources.
2211 * -EIO - i/o error.
2212 */
2213static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2214{
2215 int agno, iagno, fwd, back, freei = 0, sword, rc;
2216 struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL;
2217 struct metapage *amp, *bmp, *cmp, *dmp;
2218 struct inode *ipimap;
2219 s64 blkno, hint;
2220 int i, j;
2221 u32 mask;
2222 ino_t ino;
2223 struct dinode *dp;
2224 struct jfs_sb_info *sbi;
2225
2226	/* we had better have free extents.
2227 */
2228 if (!iagp->nfreeexts) {
2229 jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents");
2230 return -EIO;
2231 }
2232
2233 /* get the inode map inode.
2234 */
2235 ipimap = imap->im_ipimap;
2236 sbi = JFS_SBI(ipimap->i_sb);
2237
2238 amp = bmp = cmp = NULL;
2239
2240 /* get the ag and iag numbers for this iag.
2241 */
2242 agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
2243 iagno = le32_to_cpu(iagp->iagnum);
2244
2245 /* check if this is the last free extent within the
2246 * iag. if so, the iag must be removed from the ag
2247	 * free extent list, so get the iags preceding and
2248 * following the iag on this list.
2249 */
2250 if (iagp->nfreeexts == cpu_to_le32(1)) {
2251 if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
2252 if ((rc = diIAGRead(imap, fwd, &amp)))
2253 return (rc);
2254 aiagp = (struct iag *) amp->data;
2255 }
2256
2257 if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
2258 if ((rc = diIAGRead(imap, back, &bmp)))
2259 goto error_out;
2260 biagp = (struct iag *) bmp->data;
2261 }
2262 } else {
2263 /* the iag has free extents. if all extents are free
2264 * (as is the case for a newly allocated iag), the iag
2265 * must be added to the ag free extent list, so get
2266 * the iag at the head of the list in preparation for
2267 * adding this iag to this list.
2268 */
2269 fwd = back = -1;
2270 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2271 if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
2272 if ((rc = diIAGRead(imap, fwd, &amp)))
2273 goto error_out;
2274 aiagp = (struct iag *) amp->data;
2275 }
2276 }
2277 }
2278
2279 /* check if the iag has no free inodes. if so, the iag
2280 * will have to be added to the ag free inode list, so get
2281 * the iag at the head of the list in preparation for
2282 * adding this iag to this list. in doing this, we must
2283 * check if we already have the iag at the head of
2284 * the list in hand.
2285 */
2286 if (iagp->nfreeinos == 0) {
2287 freei = imap->im_agctl[agno].inofree;
2288
2289 if (freei >= 0) {
2290 if (freei == fwd) {
2291 ciagp = aiagp;
2292 } else if (freei == back) {
2293 ciagp = biagp;
2294 } else {
2295 if ((rc = diIAGRead(imap, freei, &cmp)))
2296 goto error_out;
2297 ciagp = (struct iag *) cmp->data;
2298 }
2299 if (ciagp == NULL) {
2300 jfs_error(imap->im_ipimap->i_sb,
2301 "diNewExt: ciagp == NULL");
2302 rc = -EIO;
2303 goto error_out;
2304 }
2305 }
2306 }
2307
2308 /* allocate disk space for the inode extent.
2309 */
2310 if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0))
2311 hint = ((s64) agno << sbi->bmap->db_agl2size) - 1;
2312 else
2313 hint = addressPXD(&iagp->inoext[extno - 1]) +
2314 lengthPXD(&iagp->inoext[extno - 1]) - 1;
2315
2316 if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno)))
2317 goto error_out;
2318
2319 /* compute the inode number of the first inode within the
2320 * extent.
2321 */
2322 ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT);
2323
2324 /* initialize the inodes within the newly allocated extent a
2325 * page at a time.
2326 */
2327 for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) {
2328 /* get a buffer for this page of disk inodes.
2329 */
2330 dmp = get_metapage(ipimap, blkno + i, PSIZE, 1);
2331 if (dmp == NULL) {
2332 rc = -EIO;
2333 goto error_out;
2334 }
2335 dp = (struct dinode *) dmp->data;
2336
2337 /* initialize the inode number, mode, link count and
2338 * inode extent address.
2339 */
2340 for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) {
2341 dp->di_inostamp = cpu_to_le32(sbi->inostamp);
2342 dp->di_number = cpu_to_le32(ino);
2343 dp->di_fileset = cpu_to_le32(FILESYSTEM_I);
2344 dp->di_mode = 0;
2345 dp->di_nlink = 0;
2346 PXDaddress(&(dp->di_ixpxd), blkno);
2347 PXDlength(&(dp->di_ixpxd), imap->im_nbperiext);
2348 }
2349 write_metapage(dmp);
2350 }
2351
2352 /* if this is the last free extent within the iag, remove the
2353 * iag from the ag free extent list.
2354 */
2355 if (iagp->nfreeexts == cpu_to_le32(1)) {
2356 if (fwd >= 0)
2357 aiagp->extfreeback = iagp->extfreeback;
2358
2359 if (back >= 0)
2360 biagp->extfreefwd = iagp->extfreefwd;
2361 else
2362 imap->im_agctl[agno].extfree =
2363 le32_to_cpu(iagp->extfreefwd);
2364
2365 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
2366 } else {
2367 /* if the iag has all free extents (newly allocated iag),
2368 * add the iag to the ag free extent list.
2369 */
2370 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2371 if (fwd >= 0)
2372 aiagp->extfreeback = cpu_to_le32(iagno);
2373
2374 iagp->extfreefwd = cpu_to_le32(fwd);
2375 iagp->extfreeback = cpu_to_le32(-1);
2376 imap->im_agctl[agno].extfree = iagno;
2377 }
2378 }
2379
2380 /* if the iag has no free inodes, add the iag to the
2381 * ag free inode list.
2382 */
2383 if (iagp->nfreeinos == 0) {
2384 if (freei >= 0)
2385 ciagp->inofreeback = cpu_to_le32(iagno);
2386
2387 iagp->inofreefwd =
2388 cpu_to_le32(imap->im_agctl[agno].inofree);
2389 iagp->inofreeback = cpu_to_le32(-1);
2390 imap->im_agctl[agno].inofree = iagno;
2391 }
2392
2393 /* initialize the extent descriptor of the extent. */
2394 PXDlength(&iagp->inoext[extno], imap->im_nbperiext);
2395 PXDaddress(&iagp->inoext[extno], blkno);
2396
2397 /* initialize the working and persistent map of the extent.
2398 * the working map will be initialized such that
2399 * it indicates the first inode of the extent is allocated.
2400 */
2401 iagp->wmap[extno] = cpu_to_le32(HIGHORDER);
2402 iagp->pmap[extno] = 0;
2403
2404 /* update the free inode and free extent summary maps
2405 * for the extent to indicate the extent has free inodes
2406 * and no longer represents a free extent.
2407 */
2408 sword = extno >> L2EXTSPERSUM;
2409 mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));
2410 iagp->extsmap[sword] |= cpu_to_le32(mask);
2411 iagp->inosmap[sword] &= cpu_to_le32(~mask);
2412
2413 /* update the free inode and free extent counts for the
2414 * iag.
2415 */
2416 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) +
2417 (INOSPEREXT - 1));
2418 iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
2419
2420 /* update the free and backed inode counts for the ag.
2421 */
2422 imap->im_agctl[agno].numfree += (INOSPEREXT - 1);
2423 imap->im_agctl[agno].numinos += INOSPEREXT;
2424
2425 /* update the free and backed inode counts for the inode map.
2426 */
2427 atomic_add(INOSPEREXT - 1, &imap->im_numfree);
2428 atomic_add(INOSPEREXT, &imap->im_numinos);
2429
2430 /* write the iags.
2431 */
2432 if (amp)
2433 write_metapage(amp);
2434 if (bmp)
2435 write_metapage(bmp);
2436 if (cmp)
2437 write_metapage(cmp);
2438
2439 return (0);
2440
2441 error_out:
2442
2443 /* release the iags.
2444 */
2445 if (amp)
2446 release_metapage(amp);
2447 if (bmp)
2448 release_metapage(bmp);
2449 if (cmp)
2450 release_metapage(cmp);
2451
2452 return (rc);
2453}
2454
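
The summary-map update near the end of diNewExt() flips one bit in each map: the extent stops being a free extent (extsmap bit set) and starts holding free inodes (inosmap bit cleared). A standalone sketch with a hypothetical extent number:

    #include <stdio.h>
    #include <stdint.h>

    #define EXTSPERSUM   32
    #define L2EXTSPERSUM 5
    #define HIGHORDER    0x80000000u   /* high-order bit, as in the map code */

    int main(void)
    {
        uint32_t extsmap[4] = { 0 };   /* 0 bit = extent is free */
        uint32_t inosmap[4] = { ~0u, ~0u, ~0u, ~0u }; /* 1 bit = no free inodes */
        int extno = 37;                /* hypothetical extent just backed */

        int sword = extno >> L2EXTSPERSUM;                        /* 1 */
        uint32_t mask = HIGHORDER >> (extno & (EXTSPERSUM - 1));  /* bit 5 */

        extsmap[sword] |= mask;    /* no longer a free extent... */
        inosmap[sword] &= ~mask;   /* ...but it now holds free inodes */

        printf("extsmap[1]=0x%08x inosmap[1]=0x%08x\n",
               extsmap[sword], inosmap[sword]);  /* 0x04000000, 0xfbffffff */
        return 0;
    }
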
2455
2456/*
2457 * NAME:	diNewIAG(imap,iagnop,agno,mpp)
2458 *
2459 * FUNCTION: allocate a new iag for an allocation group.
2460 *
2461 * first tries to allocate the iag from the inode map
2462 * iagfree list:
2463 * if the list has free iags, the head of the list is removed
2464 * and returned to satisfy the request.
2465 * if the inode map's iag free list is empty, the inode map
2466 * is extended to hold a new iag. this new iag is initialized
2467 * and returned to satisfy the request.
2468 *
2469 * PARAMETERS:
2470 * imap - pointer to inode map control structure.
2471 * iagnop - pointer to an iag number set with the number of the
2472 * newly allocated iag upon successful return.
2473 * agno - allocation group number.
2474 *	mpp	- buffer pointer to be filled in with the new iag's buffer
2475 *
2476 * RETURN VALUES:
2477 * 0 - success.
2478 * -ENOSPC - insufficient disk resources.
2479 * -EIO - i/o error.
2480 *
2481 * serialization:
2482 * AG lock held on entry/exit;
2483 * write lock on the map is held inside;
2484 * read lock on the map is held on successful completion;
2485 *
2486 * note: new iag transaction:
2487 * . synchronously write iag;
2488 * . write log of xtree and inode of imap;
2489 * . commit;
2490 * . synchronous write of xtree (right to left, bottom to top);
2491 * . at start of logredo(): init in-memory imap with one additional iag page;
2492 * . at end of logredo(): re-read imap inode to determine
2493 * new imap size;
2494 */
2495static int
2496diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2497{
2498 int rc;
2499 int iagno, i, xlen;
2500 struct inode *ipimap;
2501 struct super_block *sb;
2502 struct jfs_sb_info *sbi;
2503 struct metapage *mp;
2504 struct iag *iagp;
2505 s64 xaddr = 0;
2506 s64 blkno;
2507 tid_t tid;
2508#ifdef _STILL_TO_PORT
2509 xad_t xad;
2510#endif /* _STILL_TO_PORT */
2511 struct inode *iplist[1];
2512
2513 /* pick up pointers to the inode map and mount inodes */
2514 ipimap = imap->im_ipimap;
2515 sb = ipimap->i_sb;
2516 sbi = JFS_SBI(sb);
2517
2518 /* acquire the free iag lock */
2519 IAGFREE_LOCK(imap);
2520
2521 /* if there are any iags on the inode map free iag list,
2522 * allocate the iag from the head of the list.
2523 */
2524 if (imap->im_freeiag >= 0) {
2525 /* pick up the iag number at the head of the list */
2526 iagno = imap->im_freeiag;
2527
2528 /* determine the logical block number of the iag */
2529 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2530 } else {
2531		/* no free iags. the inode map will have to be extended
2532 * to include a new iag.
2533 */
2534
2535 /* acquire inode map lock */
2536 IWRITE_LOCK(ipimap);
2537
2538 if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
2539 IWRITE_UNLOCK(ipimap);
2540 IAGFREE_UNLOCK(imap);
2541 jfs_error(imap->im_ipimap->i_sb,
2542 "diNewIAG: ipimap->i_size is wrong");
2543 return -EIO;
2544 }
2545
2546
2547		/* get the next available iag number */
2548 iagno = imap->im_nextiag;
2549
2550 /* make sure that we have not exceeded the maximum inode
2551 * number limit.
2552 */
2553 if (iagno > (MAXIAGS - 1)) {
2554 /* release the inode map lock */
2555 IWRITE_UNLOCK(ipimap);
2556
2557 rc = -ENOSPC;
2558 goto out;
2559 }
2560
2561 /*
2562 * synchronously append new iag page.
2563 */
2564 /* determine the logical address of iag page to append */
2565 blkno = IAGTOLBLK(iagno, sbi->l2nbperpage);
2566
2567 /* Allocate extent for new iag page */
2568 xlen = sbi->nbperpage;
2569 if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) {
2570 /* release the inode map lock */
2571 IWRITE_UNLOCK(ipimap);
2572
2573 goto out;
2574 }
2575
2576 /* assign a buffer for the page */
2577 mp = get_metapage(ipimap, xaddr, PSIZE, 1);
2578 if (!mp) {
2579 /* Free the blocks allocated for the iag since it was
2580 * not successfully added to the inode map
2581 */
2582 dbFree(ipimap, xaddr, (s64) xlen);
2583
2584 /* release the inode map lock */
2585 IWRITE_UNLOCK(ipimap);
2586
2587 rc = -EIO;
2588 goto out;
2589 }
2590 iagp = (struct iag *) mp->data;
2591
2592 /* init the iag */
2593 memset(iagp, 0, sizeof(struct iag));
2594 iagp->iagnum = cpu_to_le32(iagno);
2595 iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
2596 iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1);
2597 iagp->iagfree = cpu_to_le32(-1);
2598 iagp->nfreeinos = 0;
2599 iagp->nfreeexts = cpu_to_le32(EXTSPERIAG);
2600
2601 /* initialize the free inode summary map (free extent
2602		 * summary map initialization handled by the memset above).
2603 */
2604 for (i = 0; i < SMAPSZ; i++)
2605 iagp->inosmap[i] = cpu_to_le32(ONES);
2606
2607 /*
2608 * Invalidate the page after writing and syncing it.
2609 * After it's initialized, we access it in a different
2610 * address space
2611 */
2612 set_bit(META_discard, &mp->flag);
2613 flush_metapage(mp);
2614
2615 /*
2616		 * start a transaction to update the inode map
2617		 * addressing structure to point to the new iag page;
2618 */
2619 tid = txBegin(sb, COMMIT_FORCE);
2620 down(&JFS_IP(ipimap)->commit_sem);
2621
2622 /* update the inode map addressing structure to point to it */
2623 if ((rc =
2624 xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) {
2625 txEnd(tid);
2626 up(&JFS_IP(ipimap)->commit_sem);
2627 /* Free the blocks allocated for the iag since it was
2628 * not successfully added to the inode map
2629 */
2630 dbFree(ipimap, xaddr, (s64) xlen);
2631
2632 /* release the inode map lock */
2633 IWRITE_UNLOCK(ipimap);
2634
2635 goto out;
2636 }
2637
2638 /* update the inode map's inode to reflect the extension */
2639 ipimap->i_size += PSIZE;
2640 inode_add_bytes(ipimap, PSIZE);
2641
2642 /*
2643 * txCommit(COMMIT_FORCE) will synchronously write address
2644 * index pages and inode after commit in careful update order
2645 * of address index pages (right to left, bottom up);
2646 */
2647 iplist[0] = ipimap;
2648 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
2649
2650 txEnd(tid);
2651 up(&JFS_IP(ipimap)->commit_sem);
2652
2653 duplicateIXtree(sb, blkno, xlen, &xaddr);
2654
2655		/* update the next available iag number */
2656 imap->im_nextiag += 1;
2657
2658 /* Add the iag to the iag free list so we don't lose the iag
2659 * if a failure happens now.
2660 */
2661 imap->im_freeiag = iagno;
2662
2663 /* Until we have logredo working, we want the imap inode &
2664 * control page to be up to date.
2665 */
2666 diSync(ipimap);
2667
2668 /* release the inode map lock */
2669 IWRITE_UNLOCK(ipimap);
2670 }
2671
2672 /* obtain read lock on map */
2673 IREAD_LOCK(ipimap);
2674
2675 /* read the iag */
2676 if ((rc = diIAGRead(imap, iagno, &mp))) {
2677 IREAD_UNLOCK(ipimap);
2678 rc = -EIO;
2679 goto out;
2680 }
2681 iagp = (struct iag *) mp->data;
2682
2683 /* remove the iag from the iag free list */
2684 imap->im_freeiag = le32_to_cpu(iagp->iagfree);
2685 iagp->iagfree = cpu_to_le32(-1);
2686
2687 /* set the return iag number and buffer pointer */
2688 *iagnop = iagno;
2689 *mpp = mp;
2690
2691 out:
2692 /* release the iag free lock */
2693 IAGFREE_UNLOCK(imap);
2694
2695 return (rc);
2696}
2697
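
The size check and block addressing in diNewIAG() rely on the imap file layout: a control page at logical block 0 followed by one 4K page per iag, so i_size must equal (im_nextiag + 1) * PSIZE and iag pages live at IAGTOLBLK(iagno). A quick arithmetic sketch with hypothetical numbers:

    #include <stdio.h>

    #define L2PSIZE 12
    #define PSIZE   (1 << L2PSIZE)

    /* from jfs_imap.h: iag pages follow the control page at block 0 */
    #define IAGTOLBLK(iagno, l2nbperpg) (((iagno) + 1) << (l2nbperpg))

    int main(void)
    {
        int nextiag = 3;                 /* hypothetical: three iags exist */
        long long i_size = (long long)(nextiag + 1) * PSIZE;
        int l2nbperpage = 3;             /* 512-byte blocks: 8 per 4K page */

        /* the invariant diNewIAG() checks: i_size >> L2PSIZE == nextiag + 1 */
        printf("i_size=%lld, iag 3 starts at lblk %d\n",
               i_size, IAGTOLBLK(3, l2nbperpage));   /* 16384, 32 */
        return 0;
    }
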
2698/*
2699 * NAME: diIAGRead()
2700 *
2701 * FUNCTION: get the buffer for the specified iag within a fileset
2702 * or aggregate inode map.
2703 *
2704 * PARAMETERS:
2705 * imap - pointer to inode map control structure.
2706 * iagno - iag number.
2707 *	mpp	- pointer to the buffer pointer to be filled in on
2708 *		  successful exit.
2709 *
2710 * SERIALIZATION:
2711 * must have read lock on imap inode
2712 * (When called by diExtendFS, the filesystem is quiesced, therefore
2713 * the read lock is unnecessary.)
2714 *
2715 * RETURN VALUES:
2716 * 0 - success.
2717 * -EIO - i/o error.
2718 */
2719static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
2720{
2721 struct inode *ipimap = imap->im_ipimap;
2722 s64 blkno;
2723
2724 /* compute the logical block number of the iag. */
2725 blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage);
2726
2727 /* read the iag. */
2728 *mpp = read_metapage(ipimap, blkno, PSIZE, 0);
2729 if (*mpp == NULL) {
2730 return -EIO;
2731 }
2732
2733 return (0);
2734}
2735
2736/*
2737 * NAME: diFindFree()
2738 *
2739 * FUNCTION: find the first free bit in a word starting at
2740 * the specified bit position.
2741 *
2742 * PARAMETERS:
2743 * word - word to be examined.
2744 * start - starting bit position.
2745 *
2746 * RETURN VALUES:
2747 * bit position of first free bit in the word or 32 if
2748 * no free bits were found.
2749 */
2750static int diFindFree(u32 word, int start)
2751{
2752 int bitno;
2753 assert(start < 32);
2754 /* scan the word for the first free bit. */
2755 for (word <<= start, bitno = start; bitno < 32;
2756 bitno++, word <<= 1) {
2757 if ((word & HIGHORDER) == 0)
2758 break;
2759 }
2760 return (bitno);
2761}
2762
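
A usage sketch for the scan above: find_free() below repeats the same loop on sample words to show the contract (leftmost zero bit at or after start, or 32 when none exists). Hypothetical standalone code, not the kernel function itself:

    #include <stdio.h>

    #define HIGHORDER 0x80000000u

    static int find_free(unsigned int word, int start)
    {
        int bitno;
        for (word <<= start, bitno = start; bitno < 32; bitno++, word <<= 1)
            if ((word & HIGHORDER) == 0)
                break;
        return bitno;
    }

    int main(void)
    {
        printf("%d\n", find_free(0xf0000000u, 0)); /* 4: first 0 after four 1s */
        printf("%d\n", find_free(0xf0000000u, 8)); /* 8: scan starts at bit 8 */
        printf("%d\n", find_free(~0u, 0));         /* 32: no free bit */
        return 0;
    }
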
2763/*
2764 * NAME: diUpdatePMap()
2765 *
2766 * FUNCTION: Update the persistent map in an IAG for the allocation or
2767 * freeing of the specified inode.
2768 *
2769 * PRE CONDITIONS: Working map has already been updated for allocate.
2770 *
2771 * PARAMETERS:
2772 * ipimap - Incore inode map inode
2773 * inum - Number of inode to mark in permanent map
2774 * is_free - If TRUE indicates inode should be marked freed, otherwise
2775 * indicates inode should be marked allocated.
2776 *
2777 * RETURN VALUES:
2778 * 0 for success
2779 */
2780int
2781diUpdatePMap(struct inode *ipimap,
2782 unsigned long inum, boolean_t is_free, struct tblock * tblk)
2783{
2784 int rc;
2785 struct iag *iagp;
2786 struct metapage *mp;
2787 int iagno, ino, extno, bitno;
2788 struct inomap *imap;
2789 u32 mask;
2790 struct jfs_log *log;
2791 int lsn, difft, diffp;
2792
2793 imap = JFS_IP(ipimap)->i_imap;
2794 /* get the iag number containing the inode */
2795 iagno = INOTOIAG(inum);
2796 /* make sure that the iag is contained within the map */
2797 if (iagno >= imap->im_nextiag) {
2798 jfs_error(ipimap->i_sb,
2799 "diUpdatePMap: the iag is outside the map");
2800 return -EIO;
2801 }
2802 /* read the iag */
2803 IREAD_LOCK(ipimap);
2804 rc = diIAGRead(imap, iagno, &mp);
2805 IREAD_UNLOCK(ipimap);
2806 if (rc)
2807 return (rc);
2808 iagp = (struct iag *) mp->data;
2809 /* get the inode number and extent number of the inode within
2810 * the iag and the inode number within the extent.
2811 */
2812 ino = inum & (INOSPERIAG - 1);
2813 extno = ino >> L2INOSPEREXT;
2814 bitno = ino & (INOSPEREXT - 1);
2815 mask = HIGHORDER >> bitno;
2816 /*
2817 * mark the inode free in persistent map:
2818 */
2819 if (is_free == TRUE) {
2820		/* The inode should have been allocated both in the working
2821		 * map and in the persistent map;
2822		 * the inode will be freed from the working map at the
2823		 * release of the last reference;
2824 */
2825 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2826 jfs_error(ipimap->i_sb,
2827 "diUpdatePMap: inode %ld not marked as "
2828 "allocated in wmap!", inum);
2829 }
2830 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
2831 jfs_error(ipimap->i_sb,
2832 "diUpdatePMap: inode %ld not marked as "
2833 "allocated in pmap!", inum);
2834 }
2835 /* update the bitmap for the extent of the freed inode */
2836 iagp->pmap[extno] &= cpu_to_le32(~mask);
2837 }
2838 /*
2839 * mark the inode allocated in persistent map:
2840 */
2841 else {
2842		/* The inode should already be allocated in the working map
2843		 * and should be free in the persistent map;
2844 */
2845 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2846 release_metapage(mp);
2847 jfs_error(ipimap->i_sb,
2848 "diUpdatePMap: the inode is not allocated in "
2849 "the working map");
2850 return -EIO;
2851 }
2852 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
2853 release_metapage(mp);
2854 jfs_error(ipimap->i_sb,
2855 "diUpdatePMap: the inode is not free in the "
2856 "persistent map");
2857 return -EIO;
2858 }
2859 /* update the bitmap for the extent of the allocated inode */
2860 iagp->pmap[extno] |= cpu_to_le32(mask);
2861 }
2862 /*
2863 * update iag lsn
2864 */
2865 lsn = tblk->lsn;
2866 log = JFS_SBI(tblk->sb)->log;
2867 if (mp->lsn != 0) {
2868 /* inherit older/smaller lsn */
2869 logdiff(difft, lsn, log);
2870 logdiff(diffp, mp->lsn, log);
2871 if (difft < diffp) {
2872 mp->lsn = lsn;
2873 /* move mp after tblock in logsync list */
2874 LOGSYNC_LOCK(log);
2875 list_move(&mp->synclist, &tblk->synclist);
2876 LOGSYNC_UNLOCK(log);
2877 }
2878 /* inherit younger/larger clsn */
2879 LOGSYNC_LOCK(log);
2880 assert(mp->clsn);
2881 logdiff(difft, tblk->clsn, log);
2882 logdiff(diffp, mp->clsn, log);
2883 if (difft > diffp)
2884 mp->clsn = tblk->clsn;
2885 LOGSYNC_UNLOCK(log);
2886 } else {
2887 mp->log = log;
2888 mp->lsn = lsn;
2889 /* insert mp after tblock in logsync list */
2890 LOGSYNC_LOCK(log);
2891 log->count++;
2892 list_add(&mp->synclist, &tblk->synclist);
2893 mp->clsn = tblk->clsn;
2894 LOGSYNC_UNLOCK(log);
2895 }
2896 write_metapage(mp);
2897 return (0);
2898}
2899
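
diUpdatePMap() splits a disk inode number into an iag index, an extent index and a bit position within the extent. A worked sketch with a hypothetical inum, using the 4K-build geometry (4096 inodes per iag, 128 extents of 32 inodes):

    #include <stdio.h>

    #define L2INOSPERIAG 12   /* 4096 inodes per iag */
    #define INOSPERIAG   (1 << L2INOSPERIAG)
    #define L2INOSPEREXT 5    /* 4096 / 128 extents = 32 inodes per extent */
    #define INOSPEREXT   (1 << L2INOSPEREXT)

    int main(void)
    {
        unsigned long inum = 9000;                  /* hypothetical disk inode */
        int iagno = (int)(inum >> L2INOSPERIAG);    /* 2 */
        int ino   = (int)(inum & (INOSPERIAG - 1)); /* 808 within the iag */
        int extno = ino >> L2INOSPEREXT;            /* 25 */
        int bitno = ino & (INOSPEREXT - 1);         /* 8 */

        printf("iagno=%d ino=%d extno=%d bitno=%d\n", iagno, ino, extno, bitno);
        return 0;
    }
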
2900/*
2901 * diExtendFS()
2902 *
2903 * function: update imap for extendfs();
2904 *
2905 * note: AG size has been increased s.t. each group of k old
2906 * contiguous AGs is coalesced into a new AG;
2907 */
2908int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2909{
2910 int rc, rcx = 0;
2911 struct inomap *imap = JFS_IP(ipimap)->i_imap;
2912 struct iag *iagp = NULL, *hiagp = NULL;
2913 struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap;
2914 struct metapage *bp, *hbp;
2915 int i, n, head;
2916 int numinos, xnuminos = 0, xnumfree = 0;
2917 s64 agstart;
2918
2919 jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d",
2920 imap->im_nextiag, atomic_read(&imap->im_numinos),
2921 atomic_read(&imap->im_numfree));
2922
2923 /*
2924 * reconstruct imap
2925 *
2926 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
2927 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
2928 * note: new AG size = old AG size * (2**x).
2929 */
2930
2931 /* init per AG control information im_agctl[] */
2932 for (i = 0; i < MAXAG; i++) {
2933 imap->im_agctl[i].inofree = -1;
2934 imap->im_agctl[i].extfree = -1;
2935 imap->im_agctl[i].numinos = 0; /* number of backed inodes */
2936 imap->im_agctl[i].numfree = 0; /* number of free backed inodes */
2937 }
2938
2939 /*
2940 * process each iag page of the map.
2941 *
2942 * rebuild AG Free Inode List, AG Free Inode Extent List;
2943 */
2944 for (i = 0; i < imap->im_nextiag; i++) {
2945 if ((rc = diIAGRead(imap, i, &bp))) {
2946 rcx = rc;
2947 continue;
2948 }
2949 iagp = (struct iag *) bp->data;
2950 if (le32_to_cpu(iagp->iagnum) != i) {
2951 release_metapage(bp);
2952 jfs_error(ipimap->i_sb,
2953 "diExtendFs: unexpected value of iagnum");
2954 return -EIO;
2955 }
2956
2957 /* leave free iag in the free iag list */
2958 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
2959 release_metapage(bp);
2960 continue;
2961 }
2962
2963 /* agstart that computes to the same ag is treated as same; */
2964 agstart = le64_to_cpu(iagp->agstart);
2965 /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */
2966 n = agstart >> mp->db_agl2size;
2967
2968 /* compute backed inodes */
2969 numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts))
2970 << L2INOSPEREXT;
2971 if (numinos > 0) {
2972 /* merge AG backed inodes */
2973 imap->im_agctl[n].numinos += numinos;
2974 xnuminos += numinos;
2975 }
2976
2977 /* if any backed free inodes, insert at AG free inode list */
2978 if ((int) le32_to_cpu(iagp->nfreeinos) > 0) {
2979 if ((head = imap->im_agctl[n].inofree) == -1) {
2980 iagp->inofreefwd = cpu_to_le32(-1);
2981 iagp->inofreeback = cpu_to_le32(-1);
2982 } else {
2983 if ((rc = diIAGRead(imap, head, &hbp))) {
2984 rcx = rc;
2985 goto nextiag;
2986 }
2987 hiagp = (struct iag *) hbp->data;
2988 hiagp->inofreeback = iagp->iagnum;
2989 iagp->inofreefwd = cpu_to_le32(head);
2990 iagp->inofreeback = cpu_to_le32(-1);
2991 write_metapage(hbp);
2992 }
2993
2994 imap->im_agctl[n].inofree =
2995 le32_to_cpu(iagp->iagnum);
2996
2997 /* merge AG backed free inodes */
2998 imap->im_agctl[n].numfree +=
2999 le32_to_cpu(iagp->nfreeinos);
3000 xnumfree += le32_to_cpu(iagp->nfreeinos);
3001 }
3002
3003 /* if any free extents, insert at AG free extent list */
3004 if (le32_to_cpu(iagp->nfreeexts) > 0) {
3005 if ((head = imap->im_agctl[n].extfree) == -1) {
3006 iagp->extfreefwd = cpu_to_le32(-1);
3007 iagp->extfreeback = cpu_to_le32(-1);
3008 } else {
3009 if ((rc = diIAGRead(imap, head, &hbp))) {
3010 rcx = rc;
3011 goto nextiag;
3012 }
3013 hiagp = (struct iag *) hbp->data;
3014 hiagp->extfreeback = iagp->iagnum;
3015 iagp->extfreefwd = cpu_to_le32(head);
3016 iagp->extfreeback = cpu_to_le32(-1);
3017 write_metapage(hbp);
3018 }
3019
3020 imap->im_agctl[n].extfree =
3021 le32_to_cpu(iagp->iagnum);
3022 }
3023
3024 nextiag:
3025 write_metapage(bp);
3026 }
3027
3028 if (xnuminos != atomic_read(&imap->im_numinos) ||
3029 xnumfree != atomic_read(&imap->im_numfree)) {
3030 jfs_error(ipimap->i_sb,
3031 "diExtendFs: numinos or numfree incorrect");
3032 return -EIO;
3033 }
3034
3035 return rcx;
3036}
3037
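
The key step in the rebuild above is re-deriving each iag's AG from its recorded agstart under the enlarged AG size: n = agstart >> db_agl2size. A sketch with hypothetical resize numbers:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical resize: new AG size 2^15 blocks (4x the old 2^13) */
        int db_agl2size = 15;
        long long agstart = 40960;     /* iag's recorded starting block */
        int n = (int)(agstart >> db_agl2size);

        printf("iag now belongs to AG %d\n", n);   /* old AG 5 -> new AG 1 */
        return 0;
    }
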
3038
3039/*
3040 * duplicateIXtree()
3041 *
3042 * serialization: IWRITE_LOCK held on entry/exit
3043 *
3044 * note: shadow page with regular inode (rel.2);
3045 */
3046static void duplicateIXtree(struct super_block *sb, s64 blkno,
3047 int xlen, s64 *xaddr)
3048{
3049 struct jfs_superblock *j_sb;
3050 struct buffer_head *bh;
3051 struct inode *ip;
3052 tid_t tid;
3053
3054 /* if AIT2 ipmap2 is bad, do not try to update it */
3055 if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */
3056 return;
3057 ip = diReadSpecial(sb, FILESYSTEM_I, 1);
3058 if (ip == NULL) {
3059 JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
3060 if (readSuper(sb, &bh))
3061 return;
3062 j_sb = (struct jfs_superblock *)bh->b_data;
3063 j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);
3064
3065 mark_buffer_dirty(bh);
3066 sync_dirty_buffer(bh);
3067 brelse(bh);
3068 return;
3069 }
3070
3071 /* start transaction */
3072 tid = txBegin(sb, COMMIT_FORCE);
3073 /* update the inode map addressing structure to point to it */
3074 if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) {
3075 JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT;
3076 txAbort(tid, 1);
3077 goto cleanup;
3078
3079 }
3080 /* update the inode map's inode to reflect the extension */
3081 ip->i_size += PSIZE;
3082 inode_add_bytes(ip, PSIZE);
3083 txCommit(tid, 1, &ip, COMMIT_FORCE);
3084 cleanup:
3085 txEnd(tid);
3086 diFreeSpecial(ip);
3087}
3088
3089/*
3090 * NAME: copy_from_dinode()
3091 *
3092 * FUNCTION: Copies inode info from disk inode to in-memory inode
3093 *
3094 * RETURN VALUES:
3095 * 0 - success
3096 * -ENOMEM - insufficient memory
3097 */
3098static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3099{
3100 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3101
3102 jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
3103 jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
3104
3105 ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff;
3106 ip->i_nlink = le32_to_cpu(dip->di_nlink);
3107 ip->i_uid = le32_to_cpu(dip->di_uid);
3108 ip->i_gid = le32_to_cpu(dip->di_gid);
3109 ip->i_size = le64_to_cpu(dip->di_size);
3110 ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec);
3111 ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
3112 ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
3113 ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
3114 ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
3115 ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
3116 ip->i_blksize = ip->i_sb->s_blocksize;
3117 ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
3118 ip->i_generation = le32_to_cpu(dip->di_gen);
3119
3120 jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */
3121 jfs_ip->acl = dip->di_acl; /* as are dxd's */
3122 jfs_ip->ea = dip->di_ea;
3123 jfs_ip->next_index = le32_to_cpu(dip->di_next_index);
3124 jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec);
3125 jfs_ip->acltype = le32_to_cpu(dip->di_acltype);
3126
3127 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) {
3128 jfs_ip->dev = le32_to_cpu(dip->di_rdev);
3129 ip->i_rdev = new_decode_dev(jfs_ip->dev);
3130 }
3131
3132 if (S_ISDIR(ip->i_mode)) {
3133 memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384);
3134 } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) {
3135 memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288);
3136 } else
3137 memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128);
3138
3139 /* Zero the in-memory-only stuff */
3140 jfs_ip->cflag = 0;
3141 jfs_ip->btindex = 0;
3142 jfs_ip->btorder = 0;
3143 jfs_ip->bxflag = 0;
3144 jfs_ip->blid = 0;
3145 jfs_ip->atlhead = 0;
3146 jfs_ip->atltail = 0;
3147 jfs_ip->xtlid = 0;
3148 return (0);
3149}
3150
3151/*
3152 * NAME: copy_to_dinode()
3153 *
3154 * FUNCTION: Copies inode info from in-memory inode to disk inode
3155 */
3156static void copy_to_dinode(struct dinode * dip, struct inode *ip)
3157{
3158 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
3159
3160 dip->di_fileset = cpu_to_le32(jfs_ip->fileset);
3161 dip->di_inostamp = cpu_to_le32(JFS_SBI(ip->i_sb)->inostamp);
3162 dip->di_number = cpu_to_le32(ip->i_ino);
3163 dip->di_gen = cpu_to_le32(ip->i_generation);
3164 dip->di_size = cpu_to_le64(ip->i_size);
3165 dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks));
3166 dip->di_nlink = cpu_to_le32(ip->i_nlink);
3167 dip->di_uid = cpu_to_le32(ip->i_uid);
3168 dip->di_gid = cpu_to_le32(ip->i_gid);
3169 /*
3170 * mode2 is only needed for storing the higher order bits.
3171 * Trust i_mode for the lower order ones
3172 */
3173 dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | ip->i_mode);
3174 dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
3175 dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
3176 dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec);
3177 dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec);
3178 dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
3179 dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
3180 dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
3181 dip->di_acl = jfs_ip->acl; /* as are dxd's */
3182 dip->di_ea = jfs_ip->ea;
3183 dip->di_next_index = cpu_to_le32(jfs_ip->next_index);
3184 dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime);
3185 dip->di_otime.tv_nsec = 0;
3186 dip->di_acltype = cpu_to_le32(jfs_ip->acltype);
3187 if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode))
3188 dip->di_rdev = cpu_to_le32(jfs_ip->dev);
3189}
3190
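
The di_mode handling in the two copy routines packs the jfs-specific mode2 flags into the high half of the on-disk field and the POSIX mode into the low half. A minimal round-trip sketch (the flag value is made up for illustration):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t mode2  = 0x00040000u | 0644;  /* made-up jfs flag + mode */
        uint32_t i_mode = 0644;

        /* pack, as in copy_to_dinode() */
        uint32_t di_mode = (mode2 & 0xffff0000u) | i_mode;

        /* unpack, as in copy_from_dinode() */
        printf("mode2=0x%08x i_mode=%04o\n", di_mode, di_mode & 0xffffu);
        return 0;
    }
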
3191#ifdef _JFS_DEBUG_IMAP
3192/*
3193 * DBGdiInit()
3194 */
3195static void *DBGdiInit(struct inomap * imap)
3196{
3197 u32 *dimap;
3198 int size;
3199 size = 64 * 1024;
3200 if ((dimap = (u32 *) xmalloc(size, L2PSIZE, kernel_heap)) == NULL)
3201 assert(0);
3202 bzero((void *) dimap, size);
3203 imap->im_DBGdimap = dimap;
3204}
3205
3206/*
3207 * DBGdiAlloc()
3208 */
3209static void DBGdiAlloc(struct inomap * imap, ino_t ino)
3210{
3211 u32 *dimap = imap->im_DBGdimap;
3212 int w, b;
3213 u32 m;
3214 w = ino >> 5;
3215 b = ino & 31;
3216 m = 0x80000000 >> b;
3217 assert(w < 64 * 256);
3218 if (dimap[w] & m) {
3219 printk("DEBUG diAlloc: duplicate alloc ino:0x%x\n", ino);
3220 }
3221 dimap[w] |= m;
3222}
3223
3224/*
3225 * DBGdiFree()
3226 */
3227static void DBGdiFree(struct inomap * imap, ino_t ino)
3228{
3229 u32 *dimap = imap->im_DBGdimap;
3230 int w, b;
3231 u32 m;
3232 w = ino >> 5;
3233 b = ino & 31;
3234 m = 0x80000000 >> b;
3235 assert(w < 64 * 256);
3236 if ((dimap[w] & m) == 0) {
3237 printk("DEBUG diFree: duplicate free ino:0x%x\n", ino);
3238 }
3239 dimap[w] &= ~m;
3240}
3241
3242static void dump_cp(struct inomap * ipimap, char *function, int line)
3243{
3244 printk("\n* ********* *\nControl Page %s %d\n", function, line);
3245 printk("FreeIAG %d\tNextIAG %d\n", ipimap->im_freeiag,
3246 ipimap->im_nextiag);
3247 printk("NumInos %d\tNumFree %d\n",
3248 atomic_read(&ipimap->im_numinos),
3249 atomic_read(&ipimap->im_numfree));
3250 printk("AG InoFree %d\tAG ExtFree %d\n",
3251 ipimap->im_agctl[0].inofree, ipimap->im_agctl[0].extfree);
3252 printk("AG NumInos %d\tAG NumFree %d\n",
3253 ipimap->im_agctl[0].numinos, ipimap->im_agctl[0].numfree);
3254}
3255
3256static void dump_iag(struct iag * iag, char *function, int line)
3257{
3258 printk("\n* ********* *\nIAG %s %d\n", function, line);
3259 printk("IagNum %d\tIAG Free %d\n", le32_to_cpu(iag->iagnum),
3260 le32_to_cpu(iag->iagfree));
3261 printk("InoFreeFwd %d\tInoFreeBack %d\n",
3262 le32_to_cpu(iag->inofreefwd),
3263 le32_to_cpu(iag->inofreeback));
3264 printk("ExtFreeFwd %d\tExtFreeBack %d\n",
3265 le32_to_cpu(iag->extfreefwd),
3266 le32_to_cpu(iag->extfreeback));
3267 printk("NFreeInos %d\tNFreeExts %d\n", le32_to_cpu(iag->nfreeinos),
3268 le32_to_cpu(iag->nfreeexts));
3269}
3270#endif /* _JFS_DEBUG_IMAP */
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
new file mode 100644
index 000000000000..6b59adec036a
--- /dev/null
+++ b/fs/jfs/jfs_imap.h
@@ -0,0 +1,175 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_IMAP
19#define _H_JFS_IMAP
20
21#include "jfs_txnmgr.h"
22
23/*
24 * jfs_imap.h: disk inode manager
25 */
26
27#define EXTSPERIAG	128	/* number of disk inode extents per iag */
28#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */
29#define SMAPSZ 4 /* number of words per summary map */
30#define EXTSPERSUM 32 /* number of extents per summary map entry */
31#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */
32#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */
33#define MAXIAGS ((1<<20)-1) /* maximum number of iags */
34#define MAXAG 128 /* maximum number of allocation groups */
35
36#define AMAPSIZE 512 /* bytes in the IAG allocation maps */
37#define SMAPSIZE 16 /* bytes in the IAG summary maps */
38
39/* convert inode number to iag number */
40#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG)
41
42/* convert iag number to logical block number of the iag page */
43#define IAGTOLBLK(iagno,l2nbperpg) (((iagno) + 1) << (l2nbperpg))
44
45/* get the starting block number of the 4K page of an inode extent
46 * that contains ino.
47 */
48#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \
49 ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg)))
50
51/*
52 * inode allocation map:
53 *
54 * inode allocation map consists of
55 * . the inode map control page and
56 * . inode allocation group pages (per 4096 inodes)
57 * which are addressed by standard JFS xtree.
58 */
59/*
60 * inode allocation group page (per 4096 inodes of an AG)
61 */
62struct iag {
63 __le64 agstart; /* 8: starting block of ag */
64 __le32 iagnum; /* 4: inode allocation group number */
65 __le32 inofreefwd; /* 4: ag inode free list forward */
66 __le32 inofreeback; /* 4: ag inode free list back */
67 __le32 extfreefwd; /* 4: ag inode extent free list forward */
68 __le32 extfreeback; /* 4: ag inode extent free list back */
69 __le32 iagfree; /* 4: iag free list */
70
71 /* summary map: 1 bit per inode extent */
72 __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes;
73 * note: this indicates free and backed
74 * inodes, if the extent is not backed the
75 * value will be 1. if the extent is
76 * backed but all inodes are being used the
77 * value will be 1. if the extent is
78 * backed but at least one of the inodes is
79 * free the value will be 0.
80 */
81 __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */
82 __le32 nfreeinos; /* 4: number of free inodes */
83 __le32 nfreeexts; /* 4: number of free extents */
84 /* (72) */
85 u8 pad[1976]; /* 1976: pad to 2048 bytes */
86 /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */
87 __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */
88 __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */
89 pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */
90}; /* (4096) */
91
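
A quick consistency check of the layout from the inline size comments: the 72-byte header plus 1976 bytes of padding, the two 512-byte allocation maps and the 1024-byte extent array fill exactly one 4K metapage. A standalone arithmetic sketch:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        /* agstart(8) + six 4-byte list/count fields(24) + inosmap(16)
         * + extsmap(16) + nfreeinos(4) + nfreeexts(4) = 72-byte header */
        int total = 8 + 6 * 4 + 16 + 16 + 4 + 4
                  + 1976                          /* pad */
                  + 512 + 512                     /* wmap + pmap */
                  + 1024;                         /* inoext */
        assert(total == 4096);                    /* exactly one metapage */
        printf("struct iag occupies %d bytes\n", total);
        return 0;
    }
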
92/*
93 * per AG control information (in inode map control page)
94 */
95struct iagctl_disk {
96 __le32 inofree; /* 4: free inode list anchor */
97 __le32 extfree; /* 4: free extent list anchor */
98 __le32 numinos; /* 4: number of backed inodes */
99 __le32 numfree; /* 4: number of free inodes */
100}; /* (16) */
101
102struct iagctl {
103 int inofree; /* free inode list anchor */
104 int extfree; /* free extent list anchor */
105 int numinos; /* number of backed inodes */
106 int numfree; /* number of free inodes */
107};
108
109/*
110 * per fileset/aggregate inode map control page
111 */
112struct dinomap_disk {
113 __le32 in_freeiag; /* 4: free iag list anchor */
114 __le32 in_nextiag; /* 4: next free iag number */
115 __le32 in_numinos; /* 4: num of backed inodes */
116 __le32 in_numfree; /* 4: num of free backed inodes */
117 __le32 in_nbperiext; /* 4: num of blocks per inode extent */
118 __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */
119 __le32 in_diskblock; /* 4: for standalone test driver */
120 __le32 in_maxag; /* 4: for standalone test driver */
121 u8 pad[2016]; /* 2016: pad to 2048 */
122 struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */
123}; /* (4096) */
124
125struct dinomap {
126 int in_freeiag; /* free iag list anchor */
127 int in_nextiag; /* next free iag number */
128 int in_numinos; /* num of backed inodes */
129 int in_numfree; /* num of free backed inodes */
130 int in_nbperiext; /* num of blocks per inode extent */
131 int in_l2nbperiext; /* l2 of in_nbperiext */
132 int in_diskblock; /* for standalone test driver */
133 int in_maxag; /* for standalone test driver */
134 struct iagctl in_agctl[MAXAG]; /* AG control information */
135};
136
137/*
138 * In-core inode map control page
139 */
140struct inomap {
141 struct dinomap im_imap; /* 4096: inode allocation control */
142 struct inode *im_ipimap; /* 4: ptr to inode for imap */
143 struct semaphore im_freelock; /* 4: iag free list lock */
144 struct semaphore im_aglock[MAXAG]; /* 512: per AG locks */
145 u32 *im_DBGdimap;
146 atomic_t im_numinos; /* num of backed inodes */
147 atomic_t im_numfree; /* num of free backed inodes */
148};
149
150#define im_freeiag im_imap.in_freeiag
151#define im_nextiag im_imap.in_nextiag
152#define im_agctl im_imap.in_agctl
153#define im_nbperiext im_imap.in_nbperiext
154#define im_l2nbperiext im_imap.in_l2nbperiext
155
156/* for standalone testdriver
157 */
158#define im_diskblock im_imap.in_diskblock
159#define im_maxag im_imap.in_maxag
160
161extern int diFree(struct inode *);
162extern int diAlloc(struct inode *, boolean_t, struct inode *);
163extern int diSync(struct inode *);
164/* external references */
165extern int diUpdatePMap(struct inode *ipimap, unsigned long inum,
166 boolean_t is_free, struct tblock * tblk);
167extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap);
168extern int diMount(struct inode *);
169extern int diUnmount(struct inode *, int);
170extern int diRead(struct inode *);
171extern struct inode *diReadSpecial(struct super_block *, ino_t, int);
172extern void diWriteSpecial(struct inode *, int);
173extern void diFreeSpecial(struct inode *);
174extern int diWrite(tid_t tid, struct inode *);
175#endif /* _H_JFS_IMAP */
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
new file mode 100644
index 000000000000..ebd77c1bed66
--- /dev/null
+++ b/fs/jfs/jfs_incore.h
@@ -0,0 +1,197 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef _H_JFS_INCORE
20#define _H_JFS_INCORE
21
22#include <linux/rwsem.h>
23#include <linux/slab.h>
24#include <linux/bitops.h>
25#include "jfs_types.h"
26#include "jfs_xtree.h"
27#include "jfs_dtree.h"
28
29/*
30 * JFS magic number
31 */
32#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */
33
34/*
35 * JFS-private inode information
36 */
37struct jfs_inode_info {
38 int fileset; /* fileset number (always 16)*/
39 uint mode2; /* jfs-specific mode */
40 pxd_t ixpxd; /* inode extent descriptor */
41 dxd_t acl; /* dxd describing acl */
42 dxd_t ea; /* dxd describing ea */
43 time_t otime; /* time created */
44 uint next_index; /* next available directory entry index */
45 int acltype; /* Type of ACL */
46 short btorder; /* access order */
47 short btindex; /* btpage entry index*/
48 struct inode *ipimap; /* inode map */
49 long cflag; /* commit flags */
50 u16 bxflag; /* xflag of pseudo buffer? */
51 unchar agno; /* ag number */
52 signed char active_ag; /* ag currently allocating from */
53 lid_t blid; /* lid of pseudo buffer? */
54 lid_t atlhead; /* anonymous tlock list head */
55 lid_t atltail; /* anonymous tlock list tail */
56 spinlock_t ag_lock; /* protects active_ag */
57 struct list_head anon_inode_list; /* inodes having anonymous txns */
58 /*
59 * rdwrlock serializes xtree between reads & writes and synchronizes
60	 * changes to special inodes. Its use would be redundant on
61 * directories since the i_sem taken in the VFS is sufficient.
62 */
63 struct rw_semaphore rdwrlock;
64 /*
65 * commit_sem serializes transaction processing on an inode.
66 * It must be taken after beginning a transaction (txBegin), since
67 * dirty inodes may be committed while a new transaction on the
68	 * inode is blocked in txBegin or txBeginAnon
69 */
70 struct semaphore commit_sem;
71 /* xattr_sem allows us to access the xattrs without taking i_sem */
72 struct rw_semaphore xattr_sem;
73 lid_t xtlid; /* lid of xtree lock on directory */
74#ifdef CONFIG_JFS_POSIX_ACL
75 struct posix_acl *i_acl;
76 struct posix_acl *i_default_acl;
77#endif
78 union {
79 struct {
80 xtpage_t _xtroot; /* 288: xtree root */
81 struct inomap *_imap; /* 4: inode map header */
82 } file;
83 struct {
84 struct dir_table_slot _table[12]; /* 96: dir index */
85 dtroot_t _dtroot; /* 288: dtree root */
86 } dir;
87 struct {
88 unchar _unused[16]; /* 16: */
89 dxd_t _dxd; /* 16: */
90 unchar _inline[128]; /* 128: inline symlink */
91 /* _inline_ea may overlay the last part of
92 * file._xtroot if maxentry = XTROOTINITSLOT
93 */
94 unchar _inline_ea[128]; /* 128: inline extended attr */
95 } link;
96 } u;
97 u32 dev; /* will die when we get wide dev_t */
98 struct inode vfs_inode;
99};
100#define i_xtroot u.file._xtroot
101#define i_imap u.file._imap
102#define i_dirtable u.dir._table
103#define i_dtroot u.dir._dtroot
104#define i_inline u.link._inline
105#define i_inline_ea u.link._inline_ea
106
107#define JFS_ACL_NOT_CACHED ((void *)-1)
108
109#define IREAD_LOCK(ip) down_read(&JFS_IP(ip)->rdwrlock)
110#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock)
111#define IWRITE_LOCK(ip) down_write(&JFS_IP(ip)->rdwrlock)
112#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock)
113
114/*
115 * cflag
116 */
117enum cflags {
118 COMMIT_Nolink, /* inode committed with zero link count */
119 COMMIT_Inlineea, /* commit inode inline EA */
120 COMMIT_Freewmap, /* free WMAP at iClose() */
121 COMMIT_Dirty, /* Inode is really dirty */
122 COMMIT_Dirtable, /* commit changes to di_dirtable */
123 COMMIT_Stale, /* data extent is no longer valid */
124 COMMIT_Synclist, /* metadata pages on group commit synclist */
125};
126
127#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag))
128#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag))
129#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag))
130#define test_and_clear_cflag(flag, ip) \
131 test_and_clear_bit(flag, &(JFS_IP(ip)->cflag))
132/*
133 * JFS-private superblock information.
134 */
135struct jfs_sb_info {
136 struct super_block *sb; /* Point back to vfs super block */
137 unsigned long mntflag; /* aggregate attributes */
138 struct inode *ipbmap; /* block map inode */
139 struct inode *ipaimap; /* aggregate inode map inode */
140 struct inode *ipaimap2; /* secondary aimap inode */
141 struct inode *ipimap; /* aggregate inode map inode */
142 struct jfs_log *log; /* log */
143 struct list_head log_list; /* volumes associated with a journal */
144 short bsize; /* logical block size */
145 short l2bsize; /* log2 logical block size */
146 short nbperpage; /* blocks per page */
147 short l2nbperpage; /* log2 blocks per page */
148 short l2niperblk; /* log2 inodes per page */
149 dev_t logdev; /* external log device */
150 uint aggregate; /* volume identifier in log record */
151 pxd_t logpxd; /* pxd describing log */
152 pxd_t fsckpxd; /* pxd describing fsck wkspc */
153 pxd_t ait2; /* pxd describing AIT copy */
154 char uuid[16]; /* 128-bit uuid for volume */
155 char loguuid[16]; /* 128-bit uuid for log */
156 /*
157 * commit_state is used for synchronization of the jfs_commit
158 * threads. It is protected by LAZY_LOCK().
159 */
160 int commit_state; /* commit state */
161 /* Formerly in ipimap */
162 uint gengen; /* inode generation generator*/
163 uint inostamp; /* shows inode belongs to fileset*/
164
165 /* Formerly in ipbmap */
166 struct bmap *bmap; /* incore bmap descriptor */
167 struct nls_table *nls_tab; /* current codepage */
168 uint state; /* mount/recovery state */
169 unsigned long flag; /* mount time flags */
170 uint p_state; /* state prior to going no integrity */
171};
172
173/* jfs_sb_info commit_state */
174#define IN_LAZYCOMMIT 1
175
176static inline struct jfs_inode_info *JFS_IP(struct inode *inode)
177{
178 return list_entry(inode, struct jfs_inode_info, vfs_inode);
179}
180
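
JFS_IP() above is the usual container_of pattern, spelled via list_entry(): it maps a pointer to the embedded vfs_inode back to the enclosing jfs_inode_info. A userspace toy model of the same pointer arithmetic (hypothetical struct names):

    #include <stddef.h>
    #include <stdio.h>

    /* toy stand-ins for struct inode / struct jfs_inode_info */
    struct toy_inode { unsigned long i_ino; };
    struct toy_jfs_inode {
        int fileset;
        struct toy_inode vfs_inode;
    };

    static struct toy_jfs_inode *TOY_IP(struct toy_inode *inode)
    {
        return (struct toy_jfs_inode *)
            ((char *)inode - offsetof(struct toy_jfs_inode, vfs_inode));
    }

    int main(void)
    {
        struct toy_jfs_inode ji = { .fileset = 16, .vfs_inode = { .i_ino = 2 } };
        printf("fileset=%d\n", TOY_IP(&ji.vfs_inode)->fileset);  /* 16 */
        return 0;
    }
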
181static inline int jfs_dirtable_inline(struct inode *inode)
182{
183 return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1));
184}
185
186static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb)
187{
188 return sb->s_fs_info;
189}
190
191static inline int isReadOnly(struct inode *inode)
192{
193 if (JFS_SBI(inode->i_sb)->log)
194 return 0;
195 return 1;
196}
197#endif /* _H_JFS_INCORE */
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
new file mode 100644
index 000000000000..84f2459b2191
--- /dev/null
+++ b/fs/jfs/jfs_inode.c
@@ -0,0 +1,104 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/fs.h>
20#include <linux/quotaops.h>
21#include "jfs_incore.h"
22#include "jfs_filsys.h"
23#include "jfs_imap.h"
24#include "jfs_dinode.h"
25#include "jfs_debug.h"
26
27/*
28 * NAME: ialloc()
29 *
30 * FUNCTION: Allocate a new inode
31 *
32 */
33struct inode *ialloc(struct inode *parent, umode_t mode)
34{
35 struct super_block *sb = parent->i_sb;
36 struct inode *inode;
37 struct jfs_inode_info *jfs_inode;
38 int rc;
39
40 inode = new_inode(sb);
41 if (!inode) {
42 jfs_warn("ialloc: new_inode returned NULL!");
43 return inode;
44 }
45
46 jfs_inode = JFS_IP(inode);
47
48 rc = diAlloc(parent, S_ISDIR(mode), inode);
49 if (rc) {
50 jfs_warn("ialloc: diAlloc returned %d!", rc);
51 make_bad_inode(inode);
52 iput(inode);
53 return NULL;
54 }
55
56 inode->i_uid = current->fsuid;
57 if (parent->i_mode & S_ISGID) {
58 inode->i_gid = parent->i_gid;
59 if (S_ISDIR(mode))
60 mode |= S_ISGID;
61 } else
62 inode->i_gid = current->fsgid;
63
64 /*
65 * Allocate inode to quota.
66 */
67 if (DQUOT_ALLOC_INODE(inode)) {
68 DQUOT_DROP(inode);
69 inode->i_flags |= S_NOQUOTA;
70 inode->i_nlink = 0;
71 iput(inode);
72 return NULL;
73 }
74
75 inode->i_mode = mode;
76 if (S_ISDIR(mode))
77 jfs_inode->mode2 = IDIRECTORY | mode;
78 else
79 jfs_inode->mode2 = INLINEEA | ISPARSE | mode;
80 inode->i_blksize = sb->s_blocksize;
81 inode->i_blocks = 0;
82 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
83 jfs_inode->otime = inode->i_ctime.tv_sec;
84 inode->i_generation = JFS_SBI(sb)->gengen++;
85
86 jfs_inode->cflag = 0;
87
88 /* Zero remaining fields */
89 memset(&jfs_inode->acl, 0, sizeof(dxd_t));
90 memset(&jfs_inode->ea, 0, sizeof(dxd_t));
91 jfs_inode->next_index = 0;
92 jfs_inode->acltype = 0;
93 jfs_inode->btorder = 0;
94 jfs_inode->btindex = 0;
95 jfs_inode->bxflag = 0;
96 jfs_inode->blid = 0;
97 jfs_inode->atlhead = 0;
98 jfs_inode->atltail = 0;
99 jfs_inode->xtlid = 0;
100
101 jfs_info("ialloc returns inode = 0x%p\n", inode);
102
103 return inode;
104}
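A hedged sketch of how a caller is expected to use ialloc(); a real create path would also start a transaction and insert a directory entry (txBegin/txCommit live in jfs_txnmgr.c):

	/* hypothetical caller, error handling trimmed */
	struct inode *ip = ialloc(dir, mode);
	if (ip == NULL)
		return -ENOSPC;	/* in-core, disk-inode, or quota allocation failed */
	/* ... txBegin(dir->i_sb, ...), directory insertion, txCommit() ... */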
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
new file mode 100644
index 000000000000..3df91fbfe781
--- /dev/null
+++ b/fs/jfs/jfs_inode.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_INODE
19#define _H_JFS_INODE
20
21extern struct inode *ialloc(struct inode *, umode_t);
22
23#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
new file mode 100644
index 000000000000..10ad1d086685
--- /dev/null
+++ b/fs/jfs/jfs_lock.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2001
3 * Portions Copyright (c) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef _H_JFS_LOCK
20#define _H_JFS_LOCK
21
22#include <linux/spinlock.h>
23#include <linux/sched.h>
24
25/*
26 * jfs_lock.h
27 */
28
29/*
30 * Conditional sleep where condition is protected by spinlock
31 *
32 * lock_cmd and unlock_cmd take and release the spinlock
33 */
34#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd) \
35do { \
36 DECLARE_WAITQUEUE(__wait, current); \
37 \
38 add_wait_queue(&wq, &__wait); \
39 for (;;) { \
40 set_current_state(TASK_UNINTERRUPTIBLE);\
41 if (cond) \
42 break; \
43 unlock_cmd; \
44 schedule(); \
45 lock_cmd; \
46 } \
47 current->state = TASK_RUNNING; \
48 remove_wait_queue(&wq, &__wait); \
49} while (0)
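A usage sketch (hypothetical names): the caller holds the spinlock that protects cond, and a waker sets cond and calls wake_up(&wq). Note that the macro re-acquires the lock before re-testing, so the lock is held again when the loop exits:

	spin_lock(&lock);
	__SLEEP_COND(wq, cond, spin_lock(&lock), spin_unlock(&lock));
	/* cond is true and lock is held here */
	spin_unlock(&lock);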
50
51#endif /* _H_JFS_LOCK */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
new file mode 100644
index 000000000000..b6a6869ebb4f
--- /dev/null
+++ b/fs/jfs/jfs_logmgr.c
@@ -0,0 +1,2524 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20/*
21 * jfs_logmgr.c: log manager
22 *
23 * for related information, see transaction manager (jfs_txnmgr.c), and
24 * recovery manager (jfs_logredo.c).
25 *
26 * note: for detail, RTFS.
27 *
28 * log buffer manager:
29 * special purpose buffer manager supporting log i/o requirements.
30 * per log serial pageout of logpage
31 * queuing i/o requests and redrive i/o at iodone
32 * maintain current logpage buffer
33 * no caching since append only
34 * appropriate jfs buffer cache buffers as needed
35 *
36 * group commit:
37 * transactions which wrote COMMIT records in the same in-memory
38 * log page during the pageout of previous/current log page(s) are
39 * committed together by the pageout of the page.
40 *
41 * TBD lazy commit:
42 * transactions are committed asynchronously when the log page
43 * containing its COMMIT is paged out when it becomes full;
44 *
45 * serialization:
46 * . a per-log lock serializes log writes.
47 * . a per-log lock serializes group commit.
48 * . a per-log lock serializes log open/close;
49 *
50 * TBD log integrity:
51 * careful-write (ping-pong) of last logpage to recover from crash
52 * in overwrite.
53 * detection of split (out-of-order) write of physical sectors
54 * of last logpage via timestamp at end of each sector
55 * (with its mirror data array at the trailer).
56 *
57 * alternatives:
58 * lsn - 64-bit monotonically increasing integer vs
59 * 32-bit lspn and page eor.
60 */
61
62#include <linux/fs.h>
63#include <linux/blkdev.h>
64#include <linux/interrupt.h>
65#include <linux/smp_lock.h>
66#include <linux/completion.h>
67#include <linux/buffer_head.h> /* for sync_blockdev() */
68#include <linux/bio.h>
69#include <linux/suspend.h>
70#include <linux/delay.h>
71#include "jfs_incore.h"
72#include "jfs_filsys.h"
73#include "jfs_metapage.h"
74#include "jfs_txnmgr.h"
75#include "jfs_debug.h"
76
77
78/*
79 * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread)
80 */
81static struct lbuf *log_redrive_list;
82static DEFINE_SPINLOCK(log_redrive_lock);
83DECLARE_WAIT_QUEUE_HEAD(jfs_IO_thread_wait);
84
85
86/*
87 * log read/write serialization (per log)
88 */
89#define LOG_LOCK_INIT(log) init_MUTEX(&(log)->loglock)
90#define LOG_LOCK(log) down(&((log)->loglock))
91#define LOG_UNLOCK(log) up(&((log)->loglock))
92
93
94/*
95 * log group commit serialization (per log)
96 */
97
98#define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock)
99#define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock)
100#define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock)
101#define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait)
102
103/*
104 * log sync serialization (per log)
105 */
106#define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE)
107#define LOGSYNC_BARRIER(logsize) ((logsize)/4)
108/*
109#define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE)
110#define LOGSYNC_BARRIER(logsize) ((logsize)/2)
111*/
112
113
114/*
115 * log buffer cache synchronization
116 */
117static DEFINE_SPINLOCK(jfsLCacheLock);
118
119#define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags)
120#define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags)
121
122/*
123 * See __SLEEP_COND in jfs_lock.h
124 */
125#define LCACHE_SLEEP_COND(wq, cond, flags) \
126do { \
127 if (cond) \
128 break; \
129 __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
130} while (0)
131
132#define LCACHE_WAKEUP(event) wake_up(event)
133
134
135/*
136 * lbuf buffer cache (lCache) control
137 */
138/* log buffer manager pageout control (cumulative, inclusive) */
139#define lbmREAD 0x0001
140#define lbmWRITE 0x0002 /* enqueue at tail of write queue;
141 * init pageout if at head of queue;
142 */
143#define lbmRELEASE 0x0004 /* remove from write queue
144 * at completion of pageout;
145 * do not free/recycle it yet:
146 * caller will free it;
147 */
148#define lbmSYNC 0x0008 /* do not return to freelist
149 * when removed from write queue;
150 */
151#define lbmFREE 0x0010 /* return to freelist
152 * at completion of pageout;
153 * the buffer may be recycled;
154 */
155#define lbmDONE 0x0020
156#define lbmERROR 0x0040
157#define lbmGC 0x0080 /* lbmIODone to perform post-GC processing
158 * of log page
159 */
160#define lbmDIRECT 0x0100
161
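Typical flag combinations, as used later in this file:

	lbmWRITE | lbmRELEASE | lbmFREE	/* full page: write, dequeue, recycle buffer */
	lbmWRITE | lbmGC		/* group-commit pageout, post-processed by lmPostGC() */
	lbmWRITE | lbmSYNC		/* caller waits for completion via lbmIOWait() */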
162/*
163 * Global list of active external journals
164 */
165static LIST_HEAD(jfs_external_logs);
166static struct jfs_log *dummy_log = NULL;
167static DECLARE_MUTEX(jfs_log_sem);
168
169/*
170 * external references
171 */
172extern void txLazyUnlock(struct tblock * tblk);
173extern int jfs_stop_threads;
174extern struct completion jfsIOwait;
175extern int jfs_tlocks_low;
176
177/*
178 * forward references
179 */
180static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk,
181 struct lrd * lrd, struct tlock * tlck);
182
183static int lmNextPage(struct jfs_log * log);
184static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
185 int activate);
186
187static int open_inline_log(struct super_block *sb);
188static int open_dummy_log(struct super_block *sb);
189static int lbmLogInit(struct jfs_log * log);
190static void lbmLogShutdown(struct jfs_log * log);
191static struct lbuf *lbmAllocate(struct jfs_log * log, int);
192static void lbmFree(struct lbuf * bp);
193static void lbmfree(struct lbuf * bp);
194static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp);
195static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block);
196static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag);
197static int lbmIOWait(struct lbuf * bp, int flag);
198static bio_end_io_t lbmIODone;
199static void lbmStartIO(struct lbuf * bp);
200static void lmGCwrite(struct jfs_log * log, int cant_block);
201static int lmLogSync(struct jfs_log * log, int nosyncwait);
202
203
204
205/*
206 * statistics
207 */
208#ifdef CONFIG_JFS_STATISTICS
209static struct lmStat {
210 uint commit; /* # of commit */
211 uint pagedone; /* # of page written */
212 uint submitted; /* # of pages submitted */
213 uint full_page; /* # of full pages submitted */
214 uint partial_page; /* # of partial pages submitted */
215} lmStat;
216#endif
217
218
219/*
220 * NAME: lmLog()
221 *
222 * FUNCTION: write a log record;
223 *
224 * PARAMETER:
225 *
226 * RETURN: lsn - offset to the next log record to write (end-of-log);
227 * -1 - error;
228 *
229 * note: todo: log error handler
230 */
231int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
232 struct tlock * tlck)
233{
234 int lsn;
235 int diffp, difft;
236 struct metapage *mp = NULL;
237
238 jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p",
239 log, tblk, lrd, tlck);
240
241 LOG_LOCK(log);
242
243 /* log by (out-of-transaction) JFS ? */
244 if (tblk == NULL)
245 goto writeRecord;
246
247 /* log from page ? */
248 if (tlck == NULL ||
249 tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
250 goto writeRecord;
251
252 /*
253 * initialize/update page/transaction recovery lsn
254 */
255 lsn = log->lsn;
256
257 LOGSYNC_LOCK(log);
258
259 /*
260 * initialize page lsn if first log write of the page
261 */
262 if (mp->lsn == 0) {
263 mp->log = log;
264 mp->lsn = lsn;
265 log->count++;
266
267 /* insert page at tail of logsynclist */
268 list_add_tail(&mp->synclist, &log->synclist);
269 }
270
271 /*
272 * initialize/update lsn of tblock of the page
273 *
274 * transaction inherits oldest lsn of pages associated
275 * with allocation/deallocation of resources (their
276 * log records are used to reconstruct allocation map
277 * at recovery time: inode for inode allocation map,
278 * B+-tree index of extent descriptors for block
279 * allocation map);
280 * allocation map pages inherit transaction lsn at
281 * commit time to allow forwarding log syncpt past log
282 * records associated with allocation/deallocation of
283 * resources only after persistent map of these map pages
284 * have been updated and propagated to home.
285 */
286 /*
287 * initialize transaction lsn:
288 */
289 if (tblk->lsn == 0) {
290 /* inherit lsn of its first page logged */
291 tblk->lsn = mp->lsn;
292 log->count++;
293
294 /* insert tblock after the page on logsynclist */
295 list_add(&tblk->synclist, &mp->synclist);
296 }
297 /*
298 * update transaction lsn:
299 */
300 else {
301 /* inherit oldest/smallest lsn of page */
302 logdiff(diffp, mp->lsn, log);
303 logdiff(difft, tblk->lsn, log);
304 if (diffp < difft) {
305 /* update tblock lsn with page lsn */
306 tblk->lsn = mp->lsn;
307
308 /* move tblock after page on logsynclist */
309 list_move(&tblk->synclist, &mp->synclist);
310 }
311 }
312
313 LOGSYNC_UNLOCK(log);
314
315 /*
316 * write the log record
317 */
318 writeRecord:
319 lsn = lmWriteRecord(log, tblk, lrd, tlck);
320
321 /*
322 * forward log syncpt if log reached next syncpt trigger
323 */
324 logdiff(diffp, lsn, log);
325 if (diffp >= log->nextsync)
326 lsn = lmLogSync(log, 0);
327
328 /* update end-of-log lsn */
329 log->lsn = lsn;
330
331 LOG_UNLOCK(log);
332
333 /* return end-of-log address */
334 return lsn;
335}
336
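The logdiff() macro used above comes from jfs_logmgr.h; a sketch of its semantics, assuming the circular-log layout described in this file (distance of an lsn from the last sync point, wrapping at the log size):

	#define logdiff(diff, lsn, log)\
	{\
		diff = (lsn) - (log)->syncpt;\
		if (diff < 0)\
			diff += (log)->logsize;\
	}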
337
338/*
339 * NAME: lmWriteRecord()
340 *
341 * FUNCTION: move the log record to current log page
342 *
343 * PARAMETER: cd - commit descriptor
344 *
345 * RETURN: end-of-log address
346 *
347 * serialization: LOG_LOCK() held on entry/exit
348 */
349static int
350lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
351 struct tlock * tlck)
352{
353 int lsn = 0; /* end-of-log address */
354 struct lbuf *bp; /* dst log page buffer */
355 struct logpage *lp; /* dst log page */
356 caddr_t dst; /* destination address in log page */
357 int dstoffset; /* end-of-log offset in log page */
358 int freespace; /* free space in log page */
359 caddr_t p; /* src meta-data page */
360 caddr_t src;
361 int srclen;
362 int nbytes; /* number of bytes to move */
363 int i;
364 int len;
365 struct linelock *linelock;
366 struct lv *lv;
367 struct lvd *lvd;
368 int l2linesize;
369
370 len = 0;
371
372 /* retrieve destination log page to write */
373 bp = (struct lbuf *) log->bp;
374 lp = (struct logpage *) bp->l_ldata;
375 dstoffset = log->eor;
376
377 /* any log data to write ? */
378 if (tlck == NULL)
379 goto moveLrd;
380
381 /*
382 * move log record data
383 */
384 /* retrieve source meta-data page to log */
385 if (tlck->flag & tlckPAGELOCK) {
386 p = (caddr_t) (tlck->mp->data);
387 linelock = (struct linelock *) & tlck->lock;
388 }
389 /* retrieve source in-memory inode to log */
390 else if (tlck->flag & tlckINODELOCK) {
391 if (tlck->type & tlckDTREE)
392 p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
393 else
394 p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
395 linelock = (struct linelock *) & tlck->lock;
396 }
397#ifdef _JFS_WIP
398 else if (tlck->flag & tlckINLINELOCK) {
399
400 inlinelock = (struct inlinelock *) & tlck;
401 p = (caddr_t) & inlinelock->pxd;
402 linelock = (struct linelock *) & tlck;
403 }
404#endif /* _JFS_WIP */
405 else {
406 jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
407 return 0; /* Probably should trap */
408 }
409 l2linesize = linelock->l2linesize;
410
411 moveData:
412 ASSERT(linelock->index <= linelock->maxcnt);
413
414 lv = linelock->lv;
415 for (i = 0; i < linelock->index; i++, lv++) {
416 if (lv->length == 0)
417 continue;
418
419 /* is page full ? */
420 if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
421 /* page became full: move on to next page */
422 lmNextPage(log);
423
424 bp = log->bp;
425 lp = (struct logpage *) bp->l_ldata;
426 dstoffset = LOGPHDRSIZE;
427 }
428
429 /*
430 * move log vector data
431 */
432 src = (u8 *) p + (lv->offset << l2linesize);
433 srclen = lv->length << l2linesize;
434 len += srclen;
435 while (srclen > 0) {
436 freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
437 nbytes = min(freespace, srclen);
438 dst = (caddr_t) lp + dstoffset;
439 memcpy(dst, src, nbytes);
440 dstoffset += nbytes;
441
442 /* is page not full ? */
443 if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
444 break;
445
446 /* page became full: move on to next page */
447 lmNextPage(log);
448
449 bp = (struct lbuf *) log->bp;
450 lp = (struct logpage *) bp->l_ldata;
451 dstoffset = LOGPHDRSIZE;
452
453 srclen -= nbytes;
454 src += nbytes;
455 }
456
457 /*
458 * move log vector descriptor
459 */
460 len += 4;
461 lvd = (struct lvd *) ((caddr_t) lp + dstoffset);
462 lvd->offset = cpu_to_le16(lv->offset);
463 lvd->length = cpu_to_le16(lv->length);
464 dstoffset += 4;
465 jfs_info("lmWriteRecord: lv offset:%d length:%d",
466 lv->offset, lv->length);
467 }
468
469 if ((i = linelock->next)) {
470 linelock = (struct linelock *) lid_to_tlock(i);
471 goto moveData;
472 }
473
474 /*
475 * move log record descriptor
476 */
477 moveLrd:
478 lrd->length = cpu_to_le16(len);
479
480 src = (caddr_t) lrd;
481 srclen = LOGRDSIZE;
482
483 while (srclen > 0) {
484 freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
485 nbytes = min(freespace, srclen);
486 dst = (caddr_t) lp + dstoffset;
487 memcpy(dst, src, nbytes);
488
489 dstoffset += nbytes;
490 srclen -= nbytes;
491
492 /* is there more to move than the free space of the page ? */
493 if (srclen)
494 goto pageFull;
495
496 /*
497 * end of log record descriptor
498 */
499
500 /* update last log record eor */
501 log->eor = dstoffset;
502 bp->l_eor = dstoffset;
503 lsn = (log->page << L2LOGPSIZE) + dstoffset;
504
505 if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
506 tblk->clsn = lsn;
507 jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn,
508 bp->l_eor);
509
510 INCREMENT(lmStat.commit); /* # of commit */
511
512 /*
513 * enqueue tblock for group commit:
514 *
515 * enqueue tblock of non-trivial/synchronous COMMIT
516 * at tail of group commit queue
517 * (trivial/asynchronous COMMITs are ignored by
518 * group commit.)
519 */
520 LOGGC_LOCK(log);
521
522 /* init tblock gc state */
523 tblk->flag = tblkGC_QUEUE;
524 tblk->bp = log->bp;
525 tblk->pn = log->page;
526 tblk->eor = log->eor;
527
528 /* enqueue transaction to commit queue */
529 list_add_tail(&tblk->cqueue, &log->cqueue);
530
531 LOGGC_UNLOCK(log);
532 }
533
534 jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x",
535 le16_to_cpu(lrd->type), log->bp, log->page, dstoffset);
536
537 /* page not full ? */
538 if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
539 return lsn;
540
541 pageFull:
542 /* page became full: move on to next page */
543 lmNextPage(log);
544
545 bp = (struct lbuf *) log->bp;
546 lp = (struct logpage *) bp->l_ldata;
547 dstoffset = LOGPHDRSIZE;
548 src += nbytes;
549 }
550
551 return lsn;
552}
553
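The 4-byte descriptor appended after each vector's data (the len += 4 above) records where the logged bytes came from; a sketch of the layout this code assumes (the real struct lvd is declared elsewhere in the JFS headers):

	struct lvd {
		__le16 offset;	/* line offset within the source page/object */
		__le16 length;	/* number of lines, in linelock line-size units */
	};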
554
555/*
556 * NAME: lmNextPage()
557 *
558 * FUNCTION: write current page and allocate next page.
559 *
560 * PARAMETER: log
561 *
562 * RETURN: 0
563 *
564 * serialization: LOG_LOCK() held on entry/exit
565 */
566static int lmNextPage(struct jfs_log * log)
567{
568 struct logpage *lp;
569 int lspn; /* log sequence page number */
570 int pn; /* current page number */
571 struct lbuf *bp;
572 struct lbuf *nextbp;
573 struct tblock *tblk;
574
575 /* get current log page number and log sequence page number */
576 pn = log->page;
577 bp = log->bp;
578 lp = (struct logpage *) bp->l_ldata;
579 lspn = le32_to_cpu(lp->h.page);
580
581 LOGGC_LOCK(log);
582
583 /*
584 * write or queue the full page at the tail of write queue
585 */
586 /* get the tail tblk on commit queue */
587 if (list_empty(&log->cqueue))
588 tblk = NULL;
589 else
590 tblk = list_entry(log->cqueue.prev, struct tblock, cqueue);
591
592 /* every tblk that has a COMMIT record on the current page,
593 * and has not been committed, must be on the commit queue,
594 * since a tblk is queued on the commit queue at the time
595 * its COMMIT record is written to the page, before the
596 * page becomes full (even though the tblk thread
597 * that wrote the COMMIT record may currently be
598 * suspended);
599 */
600
601 /* is page bound with outstanding tail tblk ? */
602 if (tblk && tblk->pn == pn) {
603 /* mark tblk for end-of-page */
604 tblk->flag |= tblkGC_EOP;
605
606 if (log->cflag & logGC_PAGEOUT) {
607 /* if page is not already on write queue,
608 * just enqueue (no lbmWRITE to prevent redrive)
609 * buffer to wqueue to ensure correct serial order
610 * of the pages since log pages will be added
611 * continuously
612 */
613 if (bp->l_wqnext == NULL)
614 lbmWrite(log, bp, 0, 0);
615 } else {
616 /*
617 * No current GC leader, initiate group commit
618 */
619 log->cflag |= logGC_PAGEOUT;
620 lmGCwrite(log, 0);
621 }
622 }
623 /* page is not bound with outstanding tblk:
624 * init write or mark it to be redriven (lbmWRITE)
625 */
626 else {
627 /* finalize the page */
628 bp->l_ceor = bp->l_eor;
629 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
630 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
631 }
632 LOGGC_UNLOCK(log);
633
634 /*
635 * allocate/initialize next page
636 */
637 /* if log wraps, the first data page of log is 2
638 * (0 never used, 1 is superblock).
639 */
640 log->page = (pn == log->size - 1) ? 2 : pn + 1;
641 log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */
642
643 /* allocate/initialize next log page buffer */
644 nextbp = lbmAllocate(log, log->page);
645 nextbp->l_eor = log->eor;
646 log->bp = nextbp;
647
648 /* initialize next log page */
649 lp = (struct logpage *) nextbp->l_ldata;
650 lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
651 lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
652
653 return 0;
654}
655
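A worked example of the wrap arithmetic above: with log->size = 1024 pages, data pages run 2..1023; after page 1023 the log wraps back to page 2 (page 0 is never used, page 1 holds the log superblock), while the log sequence page number (lspn) keeps increasing monotonically across wraps.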
656
657/*
658 * NAME: lmGroupCommit()
659 *
660 * FUNCTION: group commit
661 * initiate pageout of the pages with COMMIT in the order of
662 * page number - redrive pageout of the page at the head of
663 * pageout queue until full page has been written.
664 *
665 * RETURN:
666 *
667 * NOTE:
668 * LOGGC_LOCK serializes log group commit queue, and
669 * transaction blocks on the commit queue.
670 * N.B. LOG_LOCK is NOT held during lmGroupCommit().
671 */
672int lmGroupCommit(struct jfs_log * log, struct tblock * tblk)
673{
674 int rc = 0;
675
676 LOGGC_LOCK(log);
677
678 /* group committed already ? */
679 if (tblk->flag & tblkGC_COMMITTED) {
680 if (tblk->flag & tblkGC_ERROR)
681 rc = -EIO;
682
683 LOGGC_UNLOCK(log);
684 return rc;
685 }
686 jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc);
687
688 if (tblk->xflag & COMMIT_LAZY)
689 tblk->flag |= tblkGC_LAZY;
690
691 if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) &&
692 (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag)
693 || jfs_tlocks_low)) {
694 /*
695 * No pageout in progress
696 *
697 * start group commit as its group leader.
698 */
699 log->cflag |= logGC_PAGEOUT;
700
701 lmGCwrite(log, 0);
702 }
703
704 if (tblk->xflag & COMMIT_LAZY) {
705 /*
706 * Lazy transactions can leave now
707 */
708 LOGGC_UNLOCK(log);
709 return 0;
710 }
711
712 /* lmGCwrite gives up LOGGC_LOCK, check again */
713
714 if (tblk->flag & tblkGC_COMMITTED) {
715 if (tblk->flag & tblkGC_ERROR)
716 rc = -EIO;
717
718 LOGGC_UNLOCK(log);
719 return rc;
720 }
721
722 /* upcount transaction waiting for completion
723 */
724 log->gcrtc++;
725 tblk->flag |= tblkGC_READY;
726
727 __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
728 LOGGC_LOCK(log), LOGGC_UNLOCK(log));
729
730 /* removed from commit queue */
731 if (tblk->flag & tblkGC_ERROR)
732 rc = -EIO;
733
734 LOGGC_UNLOCK(log);
735 return rc;
736}
737
738/*
739 * NAME: lmGCwrite()
740 *
741 * FUNCTION: group commit write
742 * initiate write of log page, building a group of all transactions
743 * with commit records on that page.
744 *
745 * RETURN: None
746 *
747 * NOTE:
748 * LOGGC_LOCK must be held by caller.
749 * N.B. LOG_LOCK is NOT held during lmGroupCommit().
750 */
751static void lmGCwrite(struct jfs_log * log, int cant_write)
752{
753 struct lbuf *bp;
754 struct logpage *lp;
755 int gcpn; /* group commit page number */
756 struct tblock *tblk;
757 struct tblock *xtblk = NULL;
758
759 /*
760 * build the commit group of a log page
761 *
762 * scan commit queue and make a commit group of all
763 * transactions with COMMIT records on the same log page.
764 */
765 /* get the head tblk on the commit queue */
766 gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn;
767
768 list_for_each_entry(tblk, &log->cqueue, cqueue) {
769 if (tblk->pn != gcpn)
770 break;
771
772 xtblk = tblk;
773
774 /* state transition: (QUEUE, READY) -> COMMIT */
775 tblk->flag |= tblkGC_COMMIT;
776 }
777 tblk = xtblk; /* last tblk of the page */
778
779 /*
780 * pageout to commit transactions on the log page.
781 */
782 bp = (struct lbuf *) tblk->bp;
783 lp = (struct logpage *) bp->l_ldata;
784 /* is page already full ? */
785 if (tblk->flag & tblkGC_EOP) {
786 /* mark page to free at end of group commit of the page */
787 tblk->flag &= ~tblkGC_EOP;
788 tblk->flag |= tblkGC_FREE;
789 bp->l_ceor = bp->l_eor;
790 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
791 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC,
792 cant_write);
793 INCREMENT(lmStat.full_page);
794 }
795 /* page is not yet full */
796 else {
797 bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */
798 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
799 lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write);
800 INCREMENT(lmStat.partial_page);
801 }
802}
803
804/*
805 * NAME: lmPostGC()
806 *
807 * FUNCTION: group commit post-processing
808 * Processes transactions after their commit records have been written
809 * to disk, redriving log I/O if necessary.
810 *
811 * RETURN: None
812 *
813 * NOTE:
814 * This routine is called at interrupt time by lbmIODone
815 */
816static void lmPostGC(struct lbuf * bp)
817{
818 unsigned long flags;
819 struct jfs_log *log = bp->l_log;
820 struct logpage *lp;
821 struct tblock *tblk, *temp;
822
823 //LOGGC_LOCK(log);
824 spin_lock_irqsave(&log->gclock, flags);
825 /*
826 * current pageout of group commit completed.
827 *
828 * remove/wakeup transactions from commit queue who were
829 * group committed with the current log page
830 */
831 list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) {
832 if (!(tblk->flag & tblkGC_COMMIT))
833 break;
834 /* if transaction was marked GC_COMMIT then
835 * it has been shipped in the current pageout
836 * and made it to disk - it is committed.
837 */
838
839 if (bp->l_flag & lbmERROR)
840 tblk->flag |= tblkGC_ERROR;
841
842 /* remove it from the commit queue */
843 list_del(&tblk->cqueue);
844 tblk->flag &= ~tblkGC_QUEUE;
845
846 if (tblk == log->flush_tblk) {
847 /* we can stop flushing the log now */
848 clear_bit(log_FLUSH, &log->flag);
849 log->flush_tblk = NULL;
850 }
851
852 jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk,
853 tblk->flag);
854
855 if (!(tblk->xflag & COMMIT_FORCE))
856 /*
857 * Hand tblk over to lazy commit thread
858 */
859 txLazyUnlock(tblk);
860 else {
861 /* state transition: COMMIT -> COMMITTED */
862 tblk->flag |= tblkGC_COMMITTED;
863
864 if (tblk->flag & tblkGC_READY)
865 log->gcrtc--;
866
867 LOGGC_WAKEUP(tblk);
868 }
869
870 /* was page full before pageout ?
871 * (and this is the last tblk bound with the page)
872 */
873 if (tblk->flag & tblkGC_FREE)
874 lbmFree(bp);
875 /* did page become full after pageout ?
876 * (and this is the last tblk bound with the page)
877 */
878 else if (tblk->flag & tblkGC_EOP) {
879 /* finalize the page */
880 lp = (struct logpage *) bp->l_ldata;
881 bp->l_ceor = bp->l_eor;
882 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
883 jfs_info("lmPostGC: calling lbmWrite");
884 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
885 1);
886 }
887
888 }
889
890 /* are there any transactions that have entered lmGroupCommit()
891 * (whose COMMITs are after that of the last log page written)?
892 * They are waiting for a new group commit (above at (SLEEP 1)),
893 * or lazy transactions are on a full (queued) log page;
894 * select the latest ready transaction as new group leader and
895 * wake her up to lead her group.
896 */
897 if ((!list_empty(&log->cqueue)) &&
898 ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) ||
899 test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low))
900 /*
901 * Call lmGCwrite with new group leader
902 */
903 lmGCwrite(log, 1);
904
905 /* no transactions are ready yet (transactions are only just
906 * queued (GC_QUEUE) and not entered for group commit yet).
907 * the first transaction entering group commit
908 * will elect herself as new group leader.
909 */
910 else
911 log->cflag &= ~logGC_PAGEOUT;
912
913 //LOGGC_UNLOCK(log);
914 spin_unlock_irqrestore(&log->gclock, flags);
915 return;
916}
917
918/*
919 * NAME: lmLogSync()
920 *
921 * FUNCTION: write log SYNCPT record for specified log
922 * if new sync address is available
923 * (normally the case if sync() is executed by the background
924 * process).
925 * if not, recompute the sync address from the head of
926 * the log sync list before writing the record.
927 * calculate new value of log->nextsync, which determines
928 * when this code is called again.
929 *
930 * this is called only from lmLog().
931 *
932 * PARAMETER: log - log structure
933 *
934 * RETURN: lsn - end-of-log address
935 *
936 * serialization: LOG_LOCK() held on entry/exit
937 */
938static int lmLogSync(struct jfs_log * log, int nosyncwait)
939{
940 int logsize;
941 int written; /* written since last syncpt */
942 int free; /* free space left available */
943 int delta; /* additional delta to write normally */
944 int more; /* additional write granted */
945 struct lrd lrd;
946 int lsn;
947 struct logsyncblk *lp;
948
949 /*
950 * forward syncpt
951 */
952 /* if last sync is same as last syncpt,
953 * invoke sync point forward processing to update sync.
954 */
955
956 if (log->sync == log->syncpt) {
957 LOGSYNC_LOCK(log);
958 /* ToDo: push dirty metapages out to disk */
959// bmLogSync(log);
960
961 if (list_empty(&log->synclist))
962 log->sync = log->lsn;
963 else {
964 lp = list_entry(log->synclist.next,
965 struct logsyncblk, synclist);
966 log->sync = lp->lsn;
967 }
968 LOGSYNC_UNLOCK(log);
969
970 }
971
972 /* if sync is different from last syncpt,
973 * write a SYNCPT record with syncpt = sync.
974 * reset syncpt = sync
975 */
976 if (log->sync != log->syncpt) {
977 struct jfs_sb_info *sbi;
978
979 /*
980 * We need to make sure all of the "written" metapages
981 * actually make it to disk
982 */
983 list_for_each_entry(sbi, &log->sb_list, log_list) {
984 if (sbi->flag & JFS_NOINTEGRITY)
985 continue;
986 filemap_fdatawrite(sbi->ipbmap->i_mapping);
987 filemap_fdatawrite(sbi->ipimap->i_mapping);
988 filemap_fdatawrite(sbi->sb->s_bdev->bd_inode->i_mapping);
989 }
990 list_for_each_entry(sbi, &log->sb_list, log_list) {
991 if (sbi->flag & JFS_NOINTEGRITY)
992 continue;
993 filemap_fdatawait(sbi->ipbmap->i_mapping);
994 filemap_fdatawait(sbi->ipimap->i_mapping);
995 filemap_fdatawait(sbi->sb->s_bdev->bd_inode->i_mapping);
996 }
997
998 lrd.logtid = 0;
999 lrd.backchain = 0;
1000 lrd.type = cpu_to_le16(LOG_SYNCPT);
1001 lrd.length = 0;
1002 lrd.log.syncpt.sync = cpu_to_le32(log->sync);
1003 lsn = lmWriteRecord(log, NULL, &lrd, NULL);
1004
1005 log->syncpt = log->sync;
1006 } else
1007 lsn = log->lsn;
1008
1009 /*
1010 * setup next syncpt trigger (SWAG)
1011 */
1012 logsize = log->logsize;
1013
1014 logdiff(written, lsn, log);
1015 free = logsize - written;
1016 delta = LOGSYNC_DELTA(logsize);
1017 more = min(free / 2, delta);
1018 if (more < 2 * LOGPSIZE) {
1019 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
1020 /*
1021 * log wrapping
1022 *
1023 * option 1 - panic ? No.!
1024 * option 2 - shutdown file systems
1025 * associated with log ?
1026 * option 3 - extend log ?
1027 */
1028 /*
1029 * option 4 - second chance
1030 *
1031 * mark log wrapped, and continue.
1032 * when all active transactions are completed,
1033 * mark log valid for recovery.
1034 * if crashed during invalid state, log state
1035 * implies invalid log, forcing fsck().
1036 */
1037 /* mark log state log wrap in log superblock */
1038 /* log->state = LOGWRAP; */
1039
1040 /* reset sync point computation */
1041 log->syncpt = log->sync = lsn;
1042 log->nextsync = delta;
1043 } else
1044 /* next syncpt trigger = written + more */
1045 log->nextsync = written + more;
1046
1047 /* return if lmLogSync() from outside of transaction, e.g., sync() */
1048 if (nosyncwait)
1049 return lsn;
1050
1051 /* if number of bytes written from last sync point is more
1052 * than 1/4 of the log size, stop new transactions from
1053 * starting until all current transactions are completed
1054 * by setting syncbarrier flag.
1055 */
1056 if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) {
1057 set_bit(log_SYNCBARRIER, &log->flag);
1058 jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn,
1059 log->syncpt);
1060 /*
1061 * We may have to initiate group commit
1062 */
1063 jfs_flush_journal(log, 0);
1064 }
1065
1066 return lsn;
1067}
1068
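A worked example of the trigger arithmetic above, assuming LOGPSIZE is 4096: for an 8 MiB log, delta = min(8 MiB / 8, 128 * 4 KiB) = 512 KiB; if 2 MiB have been written since the last sync point, free = 6 MiB and more = min(3 MiB, 512 KiB) = 512 KiB, so the next syncpt triggers after another 512 KiB of log. The sync barrier engages once written exceeds LOGSYNC_BARRIER(logsize) = 2 MiB.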
1069
1070/*
1071 * NAME: lmLogOpen()
1072 *
1073 * FUNCTION: open the log on first open;
1074 * insert filesystem in the active list of the log.
1075 *
1076 * PARAMETER: sb - super block of the file system
1077 * being mounted against the log
1078 *
1079 * RETURN:
1080 *
1081 * serialization:
1082 */
1083int lmLogOpen(struct super_block *sb)
1084{
1085 int rc;
1086 struct block_device *bdev;
1087 struct jfs_log *log;
1088 struct jfs_sb_info *sbi = JFS_SBI(sb);
1089
1090 if (sbi->flag & JFS_NOINTEGRITY)
1091 return open_dummy_log(sb);
1092
1093 if (sbi->mntflag & JFS_INLINELOG)
1094 return open_inline_log(sb);
1095
1096 down(&jfs_log_sem);
1097 list_for_each_entry(log, &jfs_external_logs, journal_list) {
1098 if (log->bdev->bd_dev == sbi->logdev) {
1099 if (memcmp(log->uuid, sbi->loguuid,
1100 sizeof(log->uuid))) {
1101 jfs_warn("wrong uuid on JFS journal\n");
1102 up(&jfs_log_sem);
1103 return -EINVAL;
1104 }
1105 /*
1106 * add file system to log active file system list
1107 */
1108 if ((rc = lmLogFileSystem(log, sbi, 1))) {
1109 up(&jfs_log_sem);
1110 return rc;
1111 }
1112 goto journal_found;
1113 }
1114 }
1115
1116 if (!(log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL))) {
1117 up(&jfs_log_sem);
1118 return -ENOMEM;
1119 }
1120 memset(log, 0, sizeof(struct jfs_log));
1121 INIT_LIST_HEAD(&log->sb_list);
1122 init_waitqueue_head(&log->syncwait);
1123
1124 /*
1125 * external log as separate logical volume
1126 *
1127 * file systems to log may have n-to-1 relationship;
1128 */
1129
1130 bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
1131 if (IS_ERR(bdev)) {
1132 rc = PTR_ERR(bdev);
1133 goto free;
1134 }
1135
1136 if ((rc = bd_claim(bdev, log))) {
1137 goto close;
1138 }
1139
1140 log->bdev = bdev;
1141 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
1142
1143 /*
1144 * initialize log:
1145 */
1146 if ((rc = lmLogInit(log)))
1147 goto unclaim;
1148
1149 list_add(&log->journal_list, &jfs_external_logs);
1150
1151 /*
1152 * add file system to log active file system list
1153 */
1154 if ((rc = lmLogFileSystem(log, sbi, 1)))
1155 goto shutdown;
1156
1157journal_found:
1158 LOG_LOCK(log);
1159 list_add(&sbi->log_list, &log->sb_list);
1160 sbi->log = log;
1161 LOG_UNLOCK(log);
1162
1163 up(&jfs_log_sem);
1164 return 0;
1165
1166 /*
1167 * unwind on error
1168 */
1169 shutdown: /* unwind lbmLogInit() */
1170 list_del(&log->journal_list);
1171 lbmLogShutdown(log);
1172
1173 unclaim:
1174 bd_release(bdev);
1175
1176 close: /* close external log device */
1177 blkdev_put(bdev);
1178
1179 free: /* free log descriptor */
1180 up(&jfs_log_sem);
1181 kfree(log);
1182
1183 jfs_warn("lmLogOpen: exit(%d)", rc);
1184 return rc;
1185}
1186
1187static int open_inline_log(struct super_block *sb)
1188{
1189 struct jfs_log *log;
1190 int rc;
1191
1192 if (!(log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL)))
1193 return -ENOMEM;
1194 memset(log, 0, sizeof(struct jfs_log));
1195 INIT_LIST_HEAD(&log->sb_list);
1196 init_waitqueue_head(&log->syncwait);
1197
1198 set_bit(log_INLINELOG, &log->flag);
1199 log->bdev = sb->s_bdev;
1200 log->base = addressPXD(&JFS_SBI(sb)->logpxd);
1201 log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
1202 (L2LOGPSIZE - sb->s_blocksize_bits);
1203 log->l2bsize = sb->s_blocksize_bits;
1204 ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);
1205
1206 /*
1207 * initialize log.
1208 */
1209 if ((rc = lmLogInit(log))) {
1210 kfree(log);
1211 jfs_warn("lmLogOpen: exit(%d)", rc);
1212 return rc;
1213 }
1214
1215 list_add(&JFS_SBI(sb)->log_list, &log->sb_list);
1216 JFS_SBI(sb)->log = log;
1217
1218 return rc;
1219}
1220
1221static int open_dummy_log(struct super_block *sb)
1222{
1223 int rc;
1224
1225 down(&jfs_log_sem);
1226 if (!dummy_log) {
1227 dummy_log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL);
1228 if (!dummy_log) {
1229 up(&jfs_log_sem);
1230 return -ENOMEM;
1231 }
1232 memset(dummy_log, 0, sizeof(struct jfs_log));
1233 INIT_LIST_HEAD(&dummy_log->sb_list);
1234 init_waitqueue_head(&dummy_log->syncwait);
1235 dummy_log->no_integrity = 1;
1236 /* Make up some stuff */
1237 dummy_log->base = 0;
1238 dummy_log->size = 1024;
1239 rc = lmLogInit(dummy_log);
1240 if (rc) {
1241 kfree(dummy_log);
1242 dummy_log = NULL;
1243 up(&jfs_log_sem);
1244 return rc;
1245 }
1246 }
1247
1248 LOG_LOCK(dummy_log);
1249 list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list);
1250 JFS_SBI(sb)->log = dummy_log;
1251 LOG_UNLOCK(dummy_log);
1252 up(&jfs_log_sem);
1253
1254 return 0;
1255}
1256
1257/*
1258 * NAME: lmLogInit()
1259 *
1260 * FUNCTION: log initialization at first log open.
1261 *
1262 * logredo() (or logformat()) should have been run previously.
1263 * initialize the log from log superblock.
1264 * set the log state in the superblock to LOGMOUNT and
1265 * write SYNCPT log record.
1266 *
1267 * PARAMETER: log - log structure
1268 *
1269 * RETURN: 0 - if ok
1270 * -EINVAL - bad log magic number or superblock dirty
1271 * error returned from logwait()
1272 *
1273 * serialization: single first open thread
1274 */
1275int lmLogInit(struct jfs_log * log)
1276{
1277 int rc = 0;
1278 struct lrd lrd;
1279 struct logsuper *logsuper;
1280 struct lbuf *bpsuper;
1281 struct lbuf *bp;
1282 struct logpage *lp;
1283 int lsn = 0;
1284
1285 jfs_info("lmLogInit: log:0x%p", log);
1286
1287 /* initialize the group commit serialization lock */
1288 LOGGC_LOCK_INIT(log);
1289
1290 /* allocate/initialize the log write serialization lock */
1291 LOG_LOCK_INIT(log);
1292
1293 LOGSYNC_LOCK_INIT(log);
1294
1295 INIT_LIST_HEAD(&log->synclist);
1296
1297 INIT_LIST_HEAD(&log->cqueue);
1298 log->flush_tblk = NULL;
1299
1300 log->count = 0;
1301
1302 /*
1303 * initialize log i/o
1304 */
1305 if ((rc = lbmLogInit(log)))
1306 return rc;
1307
1308 if (!test_bit(log_INLINELOG, &log->flag))
1309 log->l2bsize = L2LOGPSIZE;
1310
1311 /* check for disabled journaling to disk */
1312 if (log->no_integrity) {
1313 /*
1314 * Journal pages will still be filled. When the time comes
1315 * to actually do the I/O, the write is not done, and the
1316 * endio routine is called directly.
1317 */
1318 bp = lbmAllocate(log , 0);
1319 log->bp = bp;
1320 bp->l_pn = bp->l_eor = 0;
1321 } else {
1322 /*
1323 * validate log superblock
1324 */
1325 if ((rc = lbmRead(log, 1, &bpsuper)))
1326 goto errout10;
1327
1328 logsuper = (struct logsuper *) bpsuper->l_ldata;
1329
1330 if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
1331 jfs_warn("*** Log Format Error ! ***");
1332 rc = -EINVAL;
1333 goto errout20;
1334 }
1335
1336 /* logredo() should have been run successfully. */
1337 if (logsuper->state != cpu_to_le32(LOGREDONE)) {
1338 jfs_warn("*** Log Is Dirty ! ***");
1339 rc = -EINVAL;
1340 goto errout20;
1341 }
1342
1343 /* initialize log from log superblock */
1344 if (test_bit(log_INLINELOG,&log->flag)) {
1345 if (log->size != le32_to_cpu(logsuper->size)) {
1346 rc = -EINVAL;
1347 goto errout20;
1348 }
1349 jfs_info("lmLogInit: inline log:0x%p base:0x%Lx "
1350 "size:0x%x", log,
1351 (unsigned long long) log->base, log->size);
1352 } else {
1353 if (memcmp(logsuper->uuid, log->uuid, 16)) {
1354 jfs_warn("wrong uuid on JFS log device");
1355 rc = -EINVAL; goto errout20;
1356 }
1357 log->size = le32_to_cpu(logsuper->size);
1358 log->l2bsize = le32_to_cpu(logsuper->l2bsize);
1359 jfs_info("lmLogInit: external log:0x%p base:0x%Lx "
1360 "size:0x%x", log,
1361 (unsigned long long) log->base, log->size);
1362 }
1363
1364 log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
1365 log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);
1366
1367 /*
1368 * initialize for log append write mode
1369 */
1370 /* establish current/end-of-log page/buffer */
1371 if ((rc = lbmRead(log, log->page, &bp)))
1372 goto errout20;
1373
1374 lp = (struct logpage *) bp->l_ldata;
1375
1376 jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d",
1377 le32_to_cpu(logsuper->end), log->page, log->eor,
1378 le16_to_cpu(lp->h.eor));
1379
1380 log->bp = bp;
1381 bp->l_pn = log->page;
1382 bp->l_eor = log->eor;
1383
1384 /* if current page is full, move on to next page */
1385 if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
1386 lmNextPage(log);
1387
1388 /*
1389 * initialize log syncpoint
1390 */
1391 /*
1392 * write the first SYNCPT record with syncpoint = 0
1393 * (i.e., log redo up to HERE !);
1394 * remove current page from lbm write queue at end of pageout
1395 * (to write log superblock update), but do not release to
1396 * freelist;
1397 */
1398 lrd.logtid = 0;
1399 lrd.backchain = 0;
1400 lrd.type = cpu_to_le16(LOG_SYNCPT);
1401 lrd.length = 0;
1402 lrd.log.syncpt.sync = 0;
1403 lsn = lmWriteRecord(log, NULL, &lrd, NULL);
1404 bp = log->bp;
1405 bp->l_ceor = bp->l_eor;
1406 lp = (struct logpage *) bp->l_ldata;
1407 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
1408 lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
1409 if ((rc = lbmIOWait(bp, 0)))
1410 goto errout30;
1411
1412 /*
1413 * update/write superblock
1414 */
1415 logsuper->state = cpu_to_le32(LOGMOUNT);
1416 log->serial = le32_to_cpu(logsuper->serial) + 1;
1417 logsuper->serial = cpu_to_le32(log->serial);
1418 lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1419 if ((rc = lbmIOWait(bpsuper, lbmFREE)))
1420 goto errout30;
1421 }
1422
1423 /* initialize logsync parameters */
1424 log->logsize = (log->size - 2) << L2LOGPSIZE;
1425 log->lsn = lsn;
1426 log->syncpt = lsn;
1427 log->sync = log->syncpt;
1428 log->nextsync = LOGSYNC_DELTA(log->logsize);
1429
1430 jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x",
1431 log->lsn, log->syncpt, log->sync);
1432
1433 /*
1434 * initialize for lazy/group commit
1435 */
1436 log->clsn = lsn;
1437
1438 return 0;
1439
1440 /*
1441 * unwind on error
1442 */
1443 errout30: /* release log page */
1444 log->wqueue = NULL;
1445 bp->l_wqnext = NULL;
1446 lbmFree(bp);
1447
1448 errout20: /* release log superblock */
1449 lbmFree(bpsuper);
1450
1451 errout10: /* unwind lbmLogInit() */
1452 lbmLogShutdown(log);
1453
1454 jfs_warn("lmLogInit: exit(%d)", rc);
1455 return rc;
1456}
1457
1458
1459/*
1460 * NAME: lmLogClose()
1461 *
1462 * FUNCTION: remove file system from the active list of the log
1463 * and close it on last close.
1464 *
1465 * PARAMETER: sb - superblock
1466 *
1467 * RETURN: errors from subroutines
1468 *
1469 * serialization:
1470 */
1471int lmLogClose(struct super_block *sb)
1472{
1473 struct jfs_sb_info *sbi = JFS_SBI(sb);
1474 struct jfs_log *log = sbi->log;
1475 struct block_device *bdev;
1476 int rc = 0;
1477
1478 jfs_info("lmLogClose: log:0x%p", log);
1479
1480 down(&jfs_log_sem);
1481 LOG_LOCK(log);
1482 list_del(&sbi->log_list);
1483 LOG_UNLOCK(log);
1484 sbi->log = NULL;
1485
1486 /*
1487 * We need to make sure all of the "written" metapages
1488 * actually make it to disk
1489 */
1490 sync_blockdev(sb->s_bdev);
1491
1492 if (test_bit(log_INLINELOG, &log->flag)) {
1493 /*
1494 * in-line log in host file system
1495 */
1496 rc = lmLogShutdown(log);
1497 kfree(log);
1498 goto out;
1499 }
1500
1501 if (!log->no_integrity)
1502 lmLogFileSystem(log, sbi, 0);
1503
1504 if (!list_empty(&log->sb_list))
1505 goto out;
1506
1507 /*
1508 * TODO: ensure that the dummy_log is in a state to allow
1509 * lbmLogShutdown to deallocate all the buffers and call
1510 * kfree against dummy_log. For now, leave dummy_log & its
1511 * buffers in memory, and reuse them if another no-integrity mount
1512 * is requested.
1513 */
1514 if (log->no_integrity)
1515 goto out;
1516
1517 /*
1518 * external log as separate logical volume
1519 */
1520 list_del(&log->journal_list);
1521 bdev = log->bdev;
1522 rc = lmLogShutdown(log);
1523
1524 bd_release(bdev);
1525 blkdev_put(bdev);
1526
1527 kfree(log);
1528
1529 out:
1530 up(&jfs_log_sem);
1531 jfs_info("lmLogClose: exit(%d)", rc);
1532 return rc;
1533}
1534
1535
1536/*
1537 * NAME: jfs_flush_journal()
1538 *
1539 * FUNCTION: initiate write of any outstanding transactions to the journal
1540 * and optionally wait until they are all written to disk
1541 *
1542 * wait == 0 flush until latest txn is committed, don't wait
1543 * wait == 1 flush until latest txn is committed, wait
1544 * wait > 1 flush until all txn's are complete, wait
1545 */
1546void jfs_flush_journal(struct jfs_log *log, int wait)
1547{
1548 int i;
1549 struct tblock *target = NULL;
1550
1551 /* jfs_write_inode may call us during read-only mount */
1552 if (!log)
1553 return;
1554
1555 jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait);
1556
1557 LOGGC_LOCK(log);
1558
1559 if (!list_empty(&log->cqueue)) {
1560 /*
1561 * This ensures that we will keep writing to the journal as long
1562 * as there are unwritten commit records
1563 */
1564 target = list_entry(log->cqueue.prev, struct tblock, cqueue);
1565
1566 if (test_bit(log_FLUSH, &log->flag)) {
1567 /*
1568 * We're already flushing.
1569 * if flush_tblk is NULL, we are flushing everything,
1570 * so leave it that way. Otherwise, update it to the
1571 * latest transaction
1572 */
1573 if (log->flush_tblk)
1574 log->flush_tblk = target;
1575 } else {
1576 /* Only flush until latest transaction is committed */
1577 log->flush_tblk = target;
1578 set_bit(log_FLUSH, &log->flag);
1579
1580 /*
1581 * Initiate I/O on outstanding transactions
1582 */
1583 if (!(log->cflag & logGC_PAGEOUT)) {
1584 log->cflag |= logGC_PAGEOUT;
1585 lmGCwrite(log, 0);
1586 }
1587 }
1588 }
1589 if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) {
1590 /* Flush until all activity complete */
1591 set_bit(log_FLUSH, &log->flag);
1592 log->flush_tblk = NULL;
1593 }
1594
1595 if (wait && target && !(target->flag & tblkGC_COMMITTED)) {
1596 DECLARE_WAITQUEUE(__wait, current);
1597
1598 add_wait_queue(&target->gcwait, &__wait);
1599 set_current_state(TASK_UNINTERRUPTIBLE);
1600 LOGGC_UNLOCK(log);
1601 schedule();
1602 current->state = TASK_RUNNING;
1603 LOGGC_LOCK(log);
1604 remove_wait_queue(&target->gcwait, &__wait);
1605 }
1606 LOGGC_UNLOCK(log);
1607
1608 if (wait < 2)
1609 return;
1610
1611 /*
1612 * If there was recent activity, we may need to wait
1613 * for the lazycommit thread to catch up
1614 */
1615 if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
1616 for (i = 0; i < 800; i++) { /* Too much? */
1617 msleep(250);
1618 if (list_empty(&log->cqueue) &&
1619 list_empty(&log->synclist))
1620 break;
1621 }
1622 }
1623 assert(list_empty(&log->cqueue));
1624 assert(list_empty(&log->synclist));
1625 clear_bit(log_FLUSH, &log->flag);
1626}
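Examples of the wait argument, grounded in this file's own callers: lmLogSync() uses jfs_flush_journal(log, 0) to kick group commit without waiting, while lmLogShutdown() below uses jfs_flush_journal(log, 2) to fully quiesce the log before writing the final SYNCPT record.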
1627
1628/*
1629 * NAME: lmLogShutdown()
1630 *
1631 * FUNCTION: log shutdown at last LogClose().
1632 *
1633 * write log syncpt record.
1634 * update super block to set redone flag to 0.
1635 *
1636 * PARAMETER: log - log structure
1637 *
1638 * RETURN: 0 - success
1639 *
1640 * serialization: single last close thread
1641 */
1642int lmLogShutdown(struct jfs_log * log)
1643{
1644 int rc;
1645 struct lrd lrd;
1646 int lsn;
1647 struct logsuper *logsuper;
1648 struct lbuf *bpsuper;
1649 struct lbuf *bp;
1650 struct logpage *lp;
1651
1652 jfs_info("lmLogShutdown: log:0x%p", log);
1653
1654 jfs_flush_journal(log, 2);
1655
1656 /*
1657 * write the last SYNCPT record with syncpoint = 0
1658 * (i.e., log redo up to HERE !)
1659 */
1660 lrd.logtid = 0;
1661 lrd.backchain = 0;
1662 lrd.type = cpu_to_le16(LOG_SYNCPT);
1663 lrd.length = 0;
1664 lrd.log.syncpt.sync = 0;
1665
1666 lsn = lmWriteRecord(log, NULL, &lrd, NULL);
1667 bp = log->bp;
1668 lp = (struct logpage *) bp->l_ldata;
1669 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
1670 lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
1671 lbmIOWait(log->bp, lbmFREE);
1672
1673 /*
1674 * synchronous update log superblock
1675 * mark log state as shutdown cleanly
1676 * (i.e., Log does not need to be replayed).
1677 */
1678 if ((rc = lbmRead(log, 1, &bpsuper)))
1679 goto out;
1680
1681 logsuper = (struct logsuper *) bpsuper->l_ldata;
1682 logsuper->state = cpu_to_le32(LOGREDONE);
1683 logsuper->end = cpu_to_le32(lsn);
1684 lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1685 rc = lbmIOWait(bpsuper, lbmFREE);
1686
1687 jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
1688 lsn, log->page, log->eor);
1689
1690 out:
1691 /*
1692 * shutdown per log i/o
1693 */
1694 lbmLogShutdown(log);
1695
1696 if (rc) {
1697 jfs_warn("lmLogShutdown: exit(%d)", rc);
1698 }
1699 return rc;
1700}
1701
1702
1703/*
1704 * NAME: lmLogFileSystem()
1705 *
1706 * FUNCTION: insert (<activate> = true)/remove (<activate> = false)
1707 * file system into/from log active file system list.
1708 *
1709 * PARAMETER: log - pointer to log structure.
1710 * sbi - superblock info of the file system
1711 * (carries the uuid used in the active list).
1712 * activate - insert/remove device from active list.
1713 *
1714 * RETURN: 0 - success
1715 * errors returned by lbmRead()/lbmIOWait().
1716 */
1717static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
1718 int activate)
1719{
1720 int rc = 0;
1721 int i;
1722 struct logsuper *logsuper;
1723 struct lbuf *bpsuper;
1724 char *uuid = sbi->uuid;
1725
1726 /*
1727 * insert/remove file system device to log active file system list.
1728 */
1729 if ((rc = lbmRead(log, 1, &bpsuper)))
1730 return rc;
1731
1732 logsuper = (struct logsuper *) bpsuper->l_ldata;
1733 if (activate) {
1734 for (i = 0; i < MAX_ACTIVE; i++)
1735 if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) {
1736 memcpy(logsuper->active[i].uuid, uuid, 16);
1737 sbi->aggregate = i;
1738 break;
1739 }
1740 if (i == MAX_ACTIVE) {
1741 jfs_warn("Too many file systems sharing journal!");
1742 lbmFree(bpsuper);
1743 return -EMFILE; /* Is there a better rc? */
1744 }
1745 } else {
1746 for (i = 0; i < MAX_ACTIVE; i++)
1747 if (!memcmp(logsuper->active[i].uuid, uuid, 16)) {
1748 memcpy(logsuper->active[i].uuid, NULL_UUID, 16);
1749 break;
1750 }
1751 if (i == MAX_ACTIVE) {
1752 jfs_warn("Somebody stomped on the journal!");
1753 lbmFree(bpsuper);
1754 return -EIO;
1755 }
1756
1757 }
1758
1759 /*
1760 * synchronous write log superblock:
1761 *
1762 * write sidestream bypassing write queue:
1763 * at file system mount, log super block is updated for
1764 * activation of the file system before any log record
1765 * (MOUNT record) of the file system, and at file system
1766 * unmount, all meta data for the file system has been
1767 * flushed before log super block is updated for deactivation
1768 * of the file system.
1769 */
1770 lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1771 rc = lbmIOWait(bpsuper, lbmFREE);
1772
1773 return rc;
1774}
1775
1776/*
1777 * log buffer manager (lbm)
1778 * ------------------------
1779 *
1780 * special purpose buffer manager supporting log i/o requirements.
1781 *
1782 * per log write queue:
1783 * log pageout occurs in serial order through a fifo write queue,
1784 * restricted to a single i/o in progress at any one time.
1785 * a circular singly-linked list
1786 * (log->wqueue points to the tail, and buffers are linked via
1787 * the bp->l_wqnext field) maintains the log pages in pageout
1788 * or waiting for pageout, in serial pageout order.
1789 */
1790
1791/*
1792 * lbmLogInit()
1793 *
1794 * initialize per log I/O setup at lmLogInit()
1795 */
1796static int lbmLogInit(struct jfs_log * log)
1797{ /* log inode */
1798 int i;
1799 struct lbuf *lbuf;
1800
1801 jfs_info("lbmLogInit: log:0x%p", log);
1802
1803 /* initialize current buffer cursor */
1804 log->bp = NULL;
1805
1806 /* initialize log device write queue */
1807 log->wqueue = NULL;
1808
1809 /*
1810 * Each log has its own buffer pages allocated to it. These are
1811 * not managed by the page cache. This ensures that a transaction
1812 * writing to the log does not block trying to allocate a page from
1813 * the page cache (for the log). This would be bad, since page
1814 * allocation waits on the kswapd thread that may be committing inodes
1815 * which would cause log activity. Was that clear? I'm trying to
1816 * avoid deadlock here.
1817 */
1818 init_waitqueue_head(&log->free_wait);
1819
1820 log->lbuf_free = NULL;
1821
1822 for (i = 0; i < LOGPAGES; i++) {
1823 lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
1824 if (lbuf == NULL)
1825 goto error;
1826 lbuf->l_ldata = (char *) get_zeroed_page(GFP_KERNEL);
1827 if (lbuf->l_ldata == NULL) {
1828 kfree(lbuf);
1829 goto error;
1830 }
1831 lbuf->l_log = log;
1832 init_waitqueue_head(&lbuf->l_ioevent);
1833
1834 lbuf->l_freelist = log->lbuf_free;
1835 log->lbuf_free = lbuf;
1836 }
1837
1838 return (0);
1839
1840 error:
1841 lbmLogShutdown(log);
1842 return -ENOMEM;
1843}
1844
1845
1846/*
1847 * lbmLogShutdown()
1848 *
1849 * finalize per log I/O setup at lmLogShutdown()
1850 */
1851static void lbmLogShutdown(struct jfs_log * log)
1852{
1853 struct lbuf *lbuf;
1854
1855 jfs_info("lbmLogShutdown: log:0x%p", log);
1856
1857 lbuf = log->lbuf_free;
1858 while (lbuf) {
1859 struct lbuf *next = lbuf->l_freelist;
1860 free_page((unsigned long) lbuf->l_ldata);
1861 kfree(lbuf);
1862 lbuf = next;
1863 }
1864
1865 log->bp = NULL;
1866}
1867
1868
1869/*
1870 * lbmAllocate()
1871 *
1872 * allocate an empty log buffer
1873 */
1874static struct lbuf *lbmAllocate(struct jfs_log * log, int pn)
1875{
1876 struct lbuf *bp;
1877 unsigned long flags;
1878
1879 /*
1880 * recycle from log buffer freelist if any
1881 */
1882 LCACHE_LOCK(flags);
1883 LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags);
1884 log->lbuf_free = bp->l_freelist;
1885 LCACHE_UNLOCK(flags);
1886
1887 bp->l_flag = 0;
1888
1889 bp->l_wqnext = NULL;
1890 bp->l_freelist = NULL;
1891
1892 bp->l_pn = pn;
1893 bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize));
1894 bp->l_ceor = 0;
1895
1896 return bp;
1897}
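
The l_blkno computation above, together with the bi_sector shift in lbmRead()/lbmStartIO() below, is pure shift arithmetic: log page number to filesystem block, then filesystem block to 512-byte sector. A hedged, standalone sketch with example values (the constants here are illustrative, not taken from a real mount):

	#include <stdio.h>

	#define L2LOGPSIZE 12	/* log2 of the 4K log page size */

	int main(void)
	{
		long long base = 8192;	/* example: log extent start, in fs blocks */
		int l2bsize = 12;	/* example: log2 of a 4K fs block size */
		int pn = 3;		/* log page number */

		/* lbmAllocate(): fs blocks per log page is 1 << (L2LOGPSIZE - l2bsize) */
		long long blkno = base + ((long long)pn << (L2LOGPSIZE - l2bsize));

		/* lbmRead()/lbmStartIO(): fs block -> 512-byte sector */
		long long sector = blkno << (l2bsize - 9);

		printf("blkno=%lld sector=%lld\n", blkno, sector);	/* 8195 65560 */
		return 0;
	}
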
1898
1899
1900/*
1901 * lbmFree()
1902 *
1903 * release a log buffer to freelist
1904 */
1905static void lbmFree(struct lbuf * bp)
1906{
1907 unsigned long flags;
1908
1909 LCACHE_LOCK(flags);
1910
1911 lbmfree(bp);
1912
1913 LCACHE_UNLOCK(flags);
1914}
1915
1916static void lbmfree(struct lbuf * bp)
1917{
1918 struct jfs_log *log = bp->l_log;
1919
1920 assert(bp->l_wqnext == NULL);
1921
1922 /*
1923 * return the buffer to head of freelist
1924 */
1925 bp->l_freelist = log->lbuf_free;
1926 log->lbuf_free = bp;
1927
1928 wake_up(&log->free_wait);
1929 return;
1930}
1931
1932
1933/*
1934 * NAME: lbmRedrive
1935 *
1936 * FUNCTION: add a log buffer to the log redrive list
1937 *
1938 * PARAMETER:
1939 * bp - log buffer
1940 *
1941 * NOTES:
1942 * Takes log_redrive_lock.
1943 */
1944static inline void lbmRedrive(struct lbuf *bp)
1945{
1946 unsigned long flags;
1947
1948 spin_lock_irqsave(&log_redrive_lock, flags);
1949 bp->l_redrive_next = log_redrive_list;
1950 log_redrive_list = bp;
1951 spin_unlock_irqrestore(&log_redrive_lock, flags);
1952
1953 wake_up(&jfs_IO_thread_wait);
1954}
1955
1956
1957/*
1958 * lbmRead()
1959 */
1960static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
1961{
1962 struct bio *bio;
1963 struct lbuf *bp;
1964
1965 /*
1966 * allocate a log buffer
1967 */
1968 *bpp = bp = lbmAllocate(log, pn);
1969 jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn);
1970
1971 bp->l_flag |= lbmREAD;
1972
1973 bio = bio_alloc(GFP_NOFS, 1);
1974
1975 bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
1976 bio->bi_bdev = log->bdev;
1977 bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata);
1978 bio->bi_io_vec[0].bv_len = LOGPSIZE;
1979 bio->bi_io_vec[0].bv_offset = 0;
1980
1981 bio->bi_vcnt = 1;
1982 bio->bi_idx = 0;
1983 bio->bi_size = LOGPSIZE;
1984
1985 bio->bi_end_io = lbmIODone;
1986 bio->bi_private = bp;
1987 submit_bio(READ_SYNC, bio);
1988
1989 wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
1990
1991 return 0;
1992}
1993
1994
1995/*
1996 * lbmWrite()
1997 *
1998 * a buffer at the head of the pageout queue stays there after
1999 * completion of a partial-page pageout and is redriven by explicit
2000 * initiation of pageout by the caller until full-page pageout is
2001 * completed and the buffer is released.
2002 *
2003 * device driver i/o done redrives pageout of new buffer at
2004 * head of pageout queue when current buffer at head of pageout
2005 * queue is released at the completion of its full-page pageout.
2006 *
2007 * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
2008 * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
2009 */
2010static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
2011 int cant_block)
2012{
2013 struct lbuf *tail;
2014 unsigned long flags;
2015
2016 jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);
2017
2018 /* map the logical block address to physical block address */
2019 bp->l_blkno =
2020 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
2021
2022 LCACHE_LOCK(flags); /* disable+lock */
2023
2024 /*
2025 * initialize buffer for device driver
2026 */
2027 bp->l_flag = flag;
2028
2029 /*
2030 * insert bp at tail of write queue associated with log
2031 *
2032 * (request is either for bp already/currently at head of queue
2033 * or new bp to be inserted at tail)
2034 */
2035 tail = log->wqueue;
2036
2037 /* is buffer not already on write queue ? */
2038 if (bp->l_wqnext == NULL) {
2039 /* insert at tail of wqueue */
2040 if (tail == NULL) {
2041 log->wqueue = bp;
2042 bp->l_wqnext = bp;
2043 } else {
2044 log->wqueue = bp;
2045 bp->l_wqnext = tail->l_wqnext;
2046 tail->l_wqnext = bp;
2047 }
2048
2049 tail = bp;
2050 }
2051
2052 /* is buffer at head of wqueue and for write ? */
2053 if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
2054 LCACHE_UNLOCK(flags); /* unlock+enable */
2055 return;
2056 }
2057
2058 LCACHE_UNLOCK(flags); /* unlock+enable */
2059
2060 if (cant_block)
2061 lbmRedrive(bp);
2062 else if (flag & lbmSYNC)
2063 lbmStartIO(bp);
2064 else {
2065 LOGGC_UNLOCK(log);
2066 lbmStartIO(bp);
2067 LOGGC_LOCK(log);
2068 }
2069}
2070
2071
2072/*
2073 * lbmDirectWrite()
2074 *
2075 * initiate pageout bypassing write queue for sidestream
2076 * (e.g., log superblock) write;
2077 */
2078static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
2079{
2080 jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x",
2081 bp, flag, bp->l_pn);
2082
2083 /*
2084 * initialize buffer for device driver
2085 */
2086 bp->l_flag = flag | lbmDIRECT;
2087
2088 /* map the logical block address to physical block address */
2089 bp->l_blkno =
2090 log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
2091
2092 /*
2093 * initiate pageout of the page
2094 */
2095 lbmStartIO(bp);
2096}
2097
2098
2099/*
2100 * NAME: lbmStartIO()
2101 *
2102 * FUNCTION: Interface to DD strategy routine
2103 *
2104 * RETURN: none
2105 *
2106 * serialization: LCACHE_LOCK() is NOT held during log i/o;
2107 */
2108static void lbmStartIO(struct lbuf * bp)
2109{
2110 struct bio *bio;
2111 struct jfs_log *log = bp->l_log;
2112
2113 	jfs_info("lbmStartIO");
2114
2115 bio = bio_alloc(GFP_NOFS, 1);
2116 bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
2117 bio->bi_bdev = log->bdev;
2118 bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata);
2119 bio->bi_io_vec[0].bv_len = LOGPSIZE;
2120 bio->bi_io_vec[0].bv_offset = 0;
2121
2122 bio->bi_vcnt = 1;
2123 bio->bi_idx = 0;
2124 bio->bi_size = LOGPSIZE;
2125
2126 bio->bi_end_io = lbmIODone;
2127 bio->bi_private = bp;
2128
2129 /* check if journaling to disk has been disabled */
2130 if (!log->no_integrity) {
2131 submit_bio(WRITE_SYNC, bio);
2132 INCREMENT(lmStat.submitted);
2133 }
2134 else {
2135 bio->bi_size = 0;
2136 		lbmIODone(bio, 0, 0);	/* the 2nd and 3rd arguments
2137 					 * appear to be unused, so pass 0
2138 					 */
2139 }
2140}
2141
2142
2143/*
2144 * lbmIOWait()
2145 */
2146static int lbmIOWait(struct lbuf * bp, int flag)
2147{
2148 unsigned long flags;
2149 int rc = 0;
2150
2151 jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
2152
2153 LCACHE_LOCK(flags); /* disable+lock */
2154
2155 LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags);
2156
2157 rc = (bp->l_flag & lbmERROR) ? -EIO : 0;
2158
2159 if (flag & lbmFREE)
2160 lbmfree(bp);
2161
2162 LCACHE_UNLOCK(flags); /* unlock+enable */
2163
2164 jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
2165 return rc;
2166}
2167
2168/*
2169 * lbmIODone()
2170 *
2171 * executed at INTIODONE level
2172 */
2173static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
2174{
2175 struct lbuf *bp = bio->bi_private;
2176 struct lbuf *nextbp, *tail;
2177 struct jfs_log *log;
2178 unsigned long flags;
2179
2180 if (bio->bi_size)
2181 return 1;
2182
2183 /*
2184 * get back jfs buffer bound to the i/o buffer
2185 */
2186 jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);
2187
2188 LCACHE_LOCK(flags); /* disable+lock */
2189
2190 bp->l_flag |= lbmDONE;
2191
2192 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2193 bp->l_flag |= lbmERROR;
2194
2195 jfs_err("lbmIODone: I/O error in JFS log");
2196 }
2197
2198 bio_put(bio);
2199
2200 /*
2201 * pagein completion
2202 */
2203 if (bp->l_flag & lbmREAD) {
2204 bp->l_flag &= ~lbmREAD;
2205
2206 LCACHE_UNLOCK(flags); /* unlock+enable */
2207
2208 /* wakeup I/O initiator */
2209 LCACHE_WAKEUP(&bp->l_ioevent);
2210
2211 return 0;
2212 }
2213
2214 /*
2215 * pageout completion
2216 *
2217 * the bp at the head of write queue has completed pageout.
2218 *
2219 * if single-commit/full-page pageout, remove the current buffer
2220 * from head of pageout queue, and redrive pageout with
2221 * the new buffer at head of pageout queue;
2222 * otherwise, the partial-page pageout buffer stays at
2223 * the head of pageout queue to be redriven for pageout
2224 * by lmGroupCommit() until full-page pageout is completed.
2225 */
2226 bp->l_flag &= ~lbmWRITE;
2227 INCREMENT(lmStat.pagedone);
2228
2229 /* update committed lsn */
2230 log = bp->l_log;
2231 log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;
2232
2233 if (bp->l_flag & lbmDIRECT) {
2234 LCACHE_WAKEUP(&bp->l_ioevent);
2235 LCACHE_UNLOCK(flags);
2236 return 0;
2237 }
2238
2239 tail = log->wqueue;
2240
2241 /* single element queue */
2242 if (bp == tail) {
2243 /* remove head buffer of full-page pageout
2244 * from log device write queue
2245 */
2246 if (bp->l_flag & lbmRELEASE) {
2247 log->wqueue = NULL;
2248 bp->l_wqnext = NULL;
2249 }
2250 }
2251 /* multi element queue */
2252 else {
2253 /* remove head buffer of full-page pageout
2254 * from log device write queue
2255 */
2256 if (bp->l_flag & lbmRELEASE) {
2257 nextbp = tail->l_wqnext = bp->l_wqnext;
2258 bp->l_wqnext = NULL;
2259
2260 /*
2261 * redrive pageout of next page at head of write queue:
2262 * redrive next page without any bound tblk
2263 * (i.e., page w/o any COMMIT records), or
2264 * first page of new group commit which has been
2265 * queued after current page (subsequent pageout
2266 * is performed synchronously, except page without
2267 * any COMMITs) by lmGroupCommit() as indicated
2268 * by lbmWRITE flag;
2269 */
2270 if (nextbp->l_flag & lbmWRITE) {
2271 /*
2272 * We can't do the I/O at interrupt time.
2273 * The jfsIO thread can do it
2274 */
2275 lbmRedrive(nextbp);
2276 }
2277 }
2278 }
2279
2280 /*
2281 * synchronous pageout:
2282 *
2283 * buffer has not necessarily been removed from write queue
2284 * (e.g., synchronous write of partial-page with COMMIT):
2285 * leave buffer for i/o initiator to dispose
2286 */
2287 if (bp->l_flag & lbmSYNC) {
2288 LCACHE_UNLOCK(flags); /* unlock+enable */
2289
2290 /* wakeup I/O initiator */
2291 LCACHE_WAKEUP(&bp->l_ioevent);
2292 }
2293
2294 /*
2295 * Group Commit pageout:
2296 */
2297 else if (bp->l_flag & lbmGC) {
2298 LCACHE_UNLOCK(flags);
2299 lmPostGC(bp);
2300 }
2301
2302 /*
2303 * asynchronous pageout:
2304 *
2305 * buffer must have been removed from write queue:
2306 * insert buffer at head of freelist where it can be recycled
2307 */
2308 else {
2309 assert(bp->l_flag & lbmRELEASE);
2310 assert(bp->l_flag & lbmFREE);
2311 lbmfree(bp);
2312
2313 LCACHE_UNLOCK(flags); /* unlock+enable */
2314 }
2315
2316 return 0;
2317}
2318
2319int jfsIOWait(void *arg)
2320{
2321 struct lbuf *bp;
2322
2323 daemonize("jfsIO");
2324
2325 complete(&jfsIOwait);
2326
2327 do {
2328 DECLARE_WAITQUEUE(wq, current);
2329
2330 spin_lock_irq(&log_redrive_lock);
2331 		while ((bp = log_redrive_list) != NULL) {
2332 log_redrive_list = bp->l_redrive_next;
2333 bp->l_redrive_next = NULL;
2334 spin_unlock_irq(&log_redrive_lock);
2335 lbmStartIO(bp);
2336 spin_lock_irq(&log_redrive_lock);
2337 }
2338 if (current->flags & PF_FREEZE) {
2339 spin_unlock_irq(&log_redrive_lock);
2340 refrigerator(PF_FREEZE);
2341 } else {
2342 add_wait_queue(&jfs_IO_thread_wait, &wq);
2343 set_current_state(TASK_INTERRUPTIBLE);
2344 spin_unlock_irq(&log_redrive_lock);
2345 schedule();
2346 current->state = TASK_RUNNING;
2347 remove_wait_queue(&jfs_IO_thread_wait, &wq);
2348 }
2349 } while (!jfs_stop_threads);
2350
2351 jfs_info("jfsIOWait being killed!");
2352 complete_and_exit(&jfsIOwait, 0);
2353}
2354
2355/*
2356 * NAME: lmLogFormat()/jfs_logform()
2357 *
2358 * FUNCTION: format file system log
2359 *
2360 * PARAMETERS:
2361 * log - volume log
2362 * logAddress - start address of log space in FS block
2363 * logSize - length of log space in FS block;
2364 *
2365 * RETURN: 0 - success
2366 * -EIO - i/o error
2367 *
2368 * XXX: We're synchronously writing one page at a time. This needs to
2369 * be improved by writing multiple pages at once.
2370 */
2371int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
2372{
2373 int rc = -EIO;
2374 struct jfs_sb_info *sbi;
2375 struct logsuper *logsuper;
2376 struct logpage *lp;
2377 int lspn; /* log sequence page number */
2378 struct lrd *lrd_ptr;
2379 int npages = 0;
2380 struct lbuf *bp;
2381
2382 jfs_info("lmLogFormat: logAddress:%Ld logSize:%d",
2383 (long long)logAddress, logSize);
2384
2385 sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list);
2386
2387 /* allocate a log buffer */
2388 bp = lbmAllocate(log, 1);
2389
2390 npages = logSize >> sbi->l2nbperpage;
2391
2392 /*
2393 * log space:
2394 *
2395 * page 0 - reserved;
2396 * page 1 - log superblock;
2397 * page 2 - log data page: A SYNC log record is written
2398 * into this page at logform time;
2399 * pages 3-N - log data page: set to empty log data pages;
2400 */
2401 /*
2402 * init log superblock: log page 1
2403 */
2404 logsuper = (struct logsuper *) bp->l_ldata;
2405
2406 logsuper->magic = cpu_to_le32(LOGMAGIC);
2407 logsuper->version = cpu_to_le32(LOGVERSION);
2408 logsuper->state = cpu_to_le32(LOGREDONE);
2409 logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */
2410 logsuper->size = cpu_to_le32(npages);
2411 logsuper->bsize = cpu_to_le32(sbi->bsize);
2412 logsuper->l2bsize = cpu_to_le32(sbi->l2bsize);
2413 logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);
2414
2415 bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
2416 bp->l_blkno = logAddress + sbi->nbperpage;
2417 lbmStartIO(bp);
2418 if ((rc = lbmIOWait(bp, 0)))
2419 goto exit;
2420
2421 /*
2422 * init pages 2 to npages-1 as log data pages:
2423 *
2424 * log page sequence number (lpsn) initialization:
2425 *
2426 * pn: 0 1 2 3 n-1
2427 * +-----+-----+=====+=====+===.....===+=====+
2428 * lspn: N-1 0 1 N-2
2429 * <--- N page circular file ---->
2430 *
2431 * the N (= npages-2) data pages of the log are maintained as
2432 * a circular file for the log records;
2433 * lpsn grows by 1 monotonically as each log page is written
2434 * to the circular file of the log;
2435 * and setLogpage() will not reset the page number even if
2436 * the eor is equal to LOGPHDRSIZE. In order for the binary search
2437 * to still work in the find-log-end process, we have to simulate
2438 * the log wrap situation at log format time.
2439 * The 1st log page written will have the highest lpsn. The
2440 * succeeding log pages will then carry ascending lspn values
2441 * starting from 0, ..., (N-2).
2442 */
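
As a hedged, standalone illustration of the page-number layout this comment and the code below produce (npages here is an invented example value, not a real log size), the first data page gets the highest lspn to simulate the wrap, and the rest count up from 0:

	#include <stdio.h>

	int main(void)
	{
		int npages = 8;		/* example log size in pages */
		int lspn;

		/* pages 0 and 1 are reserved/superblock; pages 2..npages-1 hold data */
		printf("page 2: lspn %d (simulated wrap point)\n", npages - 3);
		for (lspn = 0; lspn < npages - 3; lspn++)
			printf("page %d: lspn %d\n", lspn + 3, lspn);
		return 0;
	}

For npages = 8 this prints lspn 5 for page 2, then lspn 0 through 4 for pages 3 through 7, matching the diagram above with N = npages - 2 = 6 data pages.
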
2443 lp = (struct logpage *) bp->l_ldata;
2444 /*
2445 * initialize 1st log page to be written: lpsn = N - 1,
2446 * a SYNCPT log record is written to this page
2447 */
2448 lp->h.page = lp->t.page = cpu_to_le32(npages - 3);
2449 lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);
2450
2451 lrd_ptr = (struct lrd *) &lp->data;
2452 lrd_ptr->logtid = 0;
2453 lrd_ptr->backchain = 0;
2454 lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
2455 lrd_ptr->length = 0;
2456 lrd_ptr->log.syncpt.sync = 0;
2457
2458 bp->l_blkno += sbi->nbperpage;
2459 bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
2460 lbmStartIO(bp);
2461 if ((rc = lbmIOWait(bp, 0)))
2462 goto exit;
2463
2464 /*
2465 * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
2466 */
2467 for (lspn = 0; lspn < npages - 3; lspn++) {
2468 lp->h.page = lp->t.page = cpu_to_le32(lspn);
2469 lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
2470
2471 bp->l_blkno += sbi->nbperpage;
2472 bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
2473 lbmStartIO(bp);
2474 if ((rc = lbmIOWait(bp, 0)))
2475 goto exit;
2476 }
2477
2478 rc = 0;
2479exit:
2480 /*
2481 * finalize log
2482 */
2483 /* release the buffer */
2484 lbmFree(bp);
2485
2486 return rc;
2487}
2488
2489#ifdef CONFIG_JFS_STATISTICS
2490int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
2491 int *eof, void *data)
2492{
2493 int len = 0;
2494 off_t begin;
2495
2496 len += sprintf(buffer,
2497 "JFS Logmgr stats\n"
2498 "================\n"
2499 "commits = %d\n"
2500 "writes submitted = %d\n"
2501 "writes completed = %d\n"
2502 "full pages submitted = %d\n"
2503 "partial pages submitted = %d\n",
2504 lmStat.commit,
2505 lmStat.submitted,
2506 lmStat.pagedone,
2507 lmStat.full_page,
2508 lmStat.partial_page);
2509
2510 begin = offset;
2511 *start = buffer + begin;
2512 len -= begin;
2513
2514 if (len > length)
2515 len = length;
2516 else
2517 *eof = 1;
2518
2519 if (len < 0)
2520 len = 0;
2521
2522 return len;
2523}
2524#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
new file mode 100644
index 000000000000..141ad74010c9
--- /dev/null
+++ b/fs/jfs/jfs_logmgr.h
@@ -0,0 +1,510 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef _H_JFS_LOGMGR
20#define _H_JFS_LOGMGR
21
22#include "jfs_filsys.h"
23#include "jfs_lock.h"
24
25/*
26 * log manager configuration parameters
27 */
28
29/* log page size */
30#define LOGPSIZE 4096
31#define L2LOGPSIZE 12
32
33#define LOGPAGES 16 /* Log pages per mounted file system */
34
35/*
36 * log logical volume
37 *
38 * a log is used to make the commit operation on journalled
39 * files within the same logical volume group atomic.
40 * a log is implemented with a logical volume.
41 * there is one log per logical volume group.
42 *
43 * block 0 of the log logical volume is not used (ipl etc).
44 * block 1 contains a log "superblock" and is used by logFormat(),
45 * lmLogInit(), lmLogShutdown(), and logRedo() to record status
46 * of the log but is not otherwise used during normal processing.
47 * blocks 2 - (N-1) are used to contain log records.
48 *
49 * when a volume group is varied-on-line, logRedo() must have
50 * been executed before the file systems (logical volumes) in
51 * the volume group can be mounted.
52 */
53/*
54 * log superblock (block 1 of logical volume)
55 */
56#define LOGSUPER_B 1
57#define LOGSTART_B 2
58
59#define LOGMAGIC 0x87654321
60#define LOGVERSION 1
61
62#define MAX_ACTIVE 128 /* Max active file systems sharing log */
63
64struct logsuper {
65 __le32 magic; /* 4: log lv identifier */
66 __le32 version; /* 4: version number */
67 __le32 serial; /* 4: log open/mount counter */
68 __le32 size; /* 4: size in number of LOGPSIZE blocks */
69 __le32 bsize; /* 4: logical block size in byte */
70 __le32 l2bsize; /* 4: log2 of bsize */
71
72 __le32 flag; /* 4: option */
73 __le32 state; /* 4: state - see below */
74
75 __le32 end; /* 4: addr of last log record set by logredo */
76 char uuid[16]; /* 16: 128-bit journal uuid */
77 char label[16]; /* 16: journal label */
78 struct {
79 char uuid[16];
80 } active[MAX_ACTIVE]; /* 2048: active file systems list */
81};
82
83#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"
84
85/* log flag: commit option (see jfs_filsys.h) */
86
87/* log state */
88#define LOGMOUNT 0 /* log mounted by lmLogInit() */
89#define LOGREDONE 1 /* log shutdown by lmLogShutdown().
90 * log redo completed by logredo().
91 */
92#define LOGWRAP 2 /* log wrapped */
93#define LOGREADERR 3 /* log read error detected in logredo() */
94
95
96/*
97 * log logical page
98 *
99 * (this comment should be rewritten !)
100 * the header and trailer structures (h,t) will normally have
101 * the same page and eor value.
102 * An exception to this occurs when a complete page write is not
103 * accomplished on a power failure. Since the hardware may "split write"
104 * sectors in the page, any out of order sequence may occur during powerfail
105 * and needs to be recognized during log replay. The xor value is
106 * an "exclusive or" of all log words in the page up to eor. This
107 * 32 bit eor is stored with the top 16 bits in the header and the
108 * bottom 16 bits in the trailer. logredo can easily recognize pages
109 * that were not completed by reconstructing this eor and checking
110 * the log page.
111 *
112 * Previous versions of the operating system did not allow split
113 * writes and detected partially written records in logredo by
114 * ordering the updates to the header, trailer, and the move of data
115 * into the logdata area. The order: (1) data is moved (2) header
116 * is updated (3) trailer is updated. In logredo, when the header
117 * differed from the trailer, the header and trailer were reconciled
118 * as follows: if h.page != t.page they were set to the smaller of
119 * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only)
120 * h.eor != t.eor they were set to the smaller of their two values.
121 */
122struct logpage {
123 struct { /* header */
124 __le32 page; /* 4: log sequence page number */
125 __le16 rsrvd; /* 2: */
126 		__le16 eor;	/* 2: end-of-log offset of last record write */
127 } h;
128
129 __le32 data[LOGPSIZE / 4 - 4]; /* log record area */
130
131 struct { /* trailer */
132 __le32 page; /* 4: normally the same as h.page */
133 __le16 rsrvd; /* 2: */
134 __le16 eor; /* 2: normally the same as h.eor */
135 } t;
136};
137
138#define LOGPHDRSIZE 8 /* log page header size */
139#define LOGPTLRSIZE 8 /* log page trailer size */
140
141
142/*
143 * log record
144 *
145 * (this comment should be rewritten !)
146 * jfs uses only "after" log records (only a single writer is allowed
147 * in a page, pages are written to temporary paging space
148 * if they must be written to disk before commit, and i/o is
149 * scheduled for modified pages to their home location after
150 * the log records containing the after values and the commit
151 * record are written to the log on disk; undo discards the copy
152 * in main-memory.)
153 *
154 * a log record consists of a data area of variable length followed by
155 * a descriptor of fixed size LOGRDSIZE bytes.
156 * the data area is rounded up to an integral number of 4-byte words
157 * and must be no longer than LOGPSIZE.
158 * the descriptor size is a multiple of 4 bytes and is aligned on a
159 * 4-byte boundary.
160 * records are packed one after the other in the data area of log pages.
161 * (sometimes a DUMMY record is inserted so that at least one record ends
162 * on every page or the longest record is placed on at most two pages).
163 * the field eor in page header/trailer points to the byte following
164 * the last record on a page.
165 */
166
167/* log record types */
168#define LOG_COMMIT 0x8000
169#define LOG_SYNCPT 0x4000
170#define LOG_MOUNT 0x2000
171#define LOG_REDOPAGE 0x0800
172#define LOG_NOREDOPAGE 0x0080
173#define LOG_NOREDOINOEXT 0x0040
174#define LOG_UPDATEMAP 0x0008
175#define LOG_NOREDOFILE 0x0001
176
177/* REDOPAGE/NOREDOPAGE log record data type */
178#define LOG_INODE 0x0001
179#define LOG_XTREE 0x0002
180#define LOG_DTREE 0x0004
181#define LOG_BTROOT 0x0010
182#define LOG_EA 0x0020
183#define LOG_ACL 0x0040
184#define LOG_DATA 0x0080
185#define LOG_NEW 0x0100
186#define LOG_EXTEND 0x0200
187#define LOG_RELOCATE 0x0400
188#define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */
189
190/* UPDATEMAP log record descriptor type */
191#define LOG_ALLOCXADLIST 0x0080
192#define LOG_ALLOCPXDLIST 0x0040
193#define LOG_ALLOCXAD 0x0020
194#define LOG_ALLOCPXD 0x0010
195#define LOG_FREEXADLIST 0x0008
196#define LOG_FREEPXDLIST 0x0004
197#define LOG_FREEXAD 0x0002
198#define LOG_FREEPXD 0x0001
199
200
201struct lrd {
202 /*
203 * type independent area
204 */
205 __le32 logtid; /* 4: log transaction identifier */
206 __le32 backchain; /* 4: ptr to prev record of same transaction */
207 __le16 type; /* 2: record type */
208 __le16 length; /* 2: length of data in record (in byte) */
209 __le32 aggregate; /* 4: file system lv/aggregate */
210 /* (16) */
211
212 /*
213 * type dependent area (20)
214 */
215 union {
216
217 /*
218 * COMMIT: commit
219 *
220 * transaction commit: no type-dependent information;
221 */
222
223 /*
224 * REDOPAGE: after-image
225 *
226 * apply after-image;
227 *
228 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
229 */
230 struct {
231 __le32 fileset; /* 4: fileset number */
232 __le32 inode; /* 4: inode number */
233 __le16 type; /* 2: REDOPAGE record type */
234 __le16 l2linesize; /* 2: log2 of line size */
235 pxd_t pxd; /* 8: on-disk page pxd */
236 } redopage; /* (20) */
237
238 /*
239 * NOREDOPAGE: the page is freed
240 *
241 * do not apply after-image records which precede this record
242 * in the log with the same page block number to this page.
243 *
244 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
245 */
246 struct {
247 __le32 fileset; /* 4: fileset number */
248 __le32 inode; /* 4: inode number */
249 __le16 type; /* 2: NOREDOPAGE record type */
250 __le16 rsrvd; /* 2: reserved */
251 pxd_t pxd; /* 8: on-disk page pxd */
252 } noredopage; /* (20) */
253
254 /*
255 * UPDATEMAP: update block allocation map
256 *
257 * either in-line PXD,
258 * or out-of-line XADLIST;
259 *
260 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format;
261 */
262 struct {
263 __le32 fileset; /* 4: fileset number */
264 __le32 inode; /* 4: inode number */
265 __le16 type; /* 2: UPDATEMAP record type */
266 __le16 nxd; /* 2: number of extents */
267 pxd_t pxd; /* 8: pxd */
268 } updatemap; /* (20) */
269
270 /*
271 * NOREDOINOEXT: the inode extent is freed
272 *
273 * do not apply after-image records which precede this
274 * record in the log with any of the 4 page block
275 * numbers in this inode extent.
276 *
277 * NOTE: The fileset and pxd fields MUST remain in
278 * the same fields in the REDOPAGE record format.
279 *
280 */
281 struct {
282 __le32 fileset; /* 4: fileset number */
283 __le32 iagnum; /* 4: IAG number */
284 __le32 inoext_idx; /* 4: inode extent index */
285 pxd_t pxd; /* 8: on-disk page pxd */
286 } noredoinoext; /* (20) */
287
288 /*
289 * SYNCPT: log sync point
290 *
291 * replay log up to syncpt address specified;
292 */
293 struct {
294 __le32 sync; /* 4: syncpt address (0 = here) */
295 } syncpt;
296
297 /*
298 * MOUNT: file system mount
299 *
300 * file system mount: no type-dependent information;
301 */
302
303 /*
304 * ? FREEXTENT: free specified extent(s)
305 *
306 * free specified extent(s) from block allocation map
307 * N.B.: nextents should be length of data/sizeof(xad_t)
308 */
309 struct {
310 __le32 type; /* 4: FREEXTENT record type */
311 __le32 nextent; /* 4: number of extents */
312
313 /* data: PXD or XAD list */
314 } freextent;
315
316 /*
317 * ? NOREDOFILE: this file is freed
318 *
319 * do not apply records which precede this record in the log
320 * with the same inode number.
321 *
322 * NOREDOFILE must be the first to be written at commit
323 * (last to be read in logredo()) - it prevents
324 * replay of preceding updates of all preceding generations
325 * of the inumber esp. the on-disk inode itself,
326 * but does NOT prevent
327 * replay of the
328 */
329 struct {
330 __le32 fileset; /* 4: fileset number */
331 __le32 inode; /* 4: inode number */
332 } noredofile;
333
334 /*
335 * ? NEWPAGE:
336 *
337 * metadata type dependent
338 */
339 struct {
340 __le32 fileset; /* 4: fileset number */
341 __le32 inode; /* 4: inode number */
342 __le32 type; /* 4: NEWPAGE record type */
343 pxd_t pxd; /* 8: on-disk page pxd */
344 } newpage;
345
346 /*
347 * ? DUMMY: filler
348 *
349 * no type-dependent information
350 */
351 } log;
352}; /* (36) */
353
354#define LOGRDSIZE (sizeof(struct lrd))
355
356/*
357 * line vector descriptor
358 */
359struct lvd {
360 __le16 offset;
361 __le16 length;
362};
363
364
365/*
366 * log logical volume
367 */
368struct jfs_log {
369
370 struct list_head sb_list;/* This is used to sync metadata
371 * before writing syncpt.
372 */
373 struct list_head journal_list; /* Global list */
374 struct block_device *bdev; /* 4: log lv pointer */
375 int serial; /* 4: log mount serial number */
376
377 	s64 base;		/* @8: log extent address (inline log) */
378 int size; /* 4: log size in log page (in page) */
379 int l2bsize; /* 4: log2 of bsize */
380
381 long flag; /* 4: flag */
382
383 struct lbuf *lbuf_free; /* 4: free lbufs */
384 wait_queue_head_t free_wait; /* 4: */
385
386 /* log write */
387 int logtid; /* 4: log tid */
388 int page; /* 4: page number of eol page */
389 int eor; /* 4: eor of last record in eol page */
390 struct lbuf *bp; /* 4: current log page buffer */
391
392 struct semaphore loglock; /* 4: log write serialization lock */
393
394 /* syncpt */
395 int nextsync; /* 4: bytes to write before next syncpt */
396 int active; /* 4: */
397 wait_queue_head_t syncwait; /* 4: */
398
399 /* commit */
400 uint cflag; /* 4: */
401 struct list_head cqueue; /* FIFO commit queue */
402 struct tblock *flush_tblk; /* tblk we're waiting on for flush */
403 int gcrtc; /* 4: GC_READY transaction count */
404 struct tblock *gclrt; /* 4: latest GC_READY transaction */
405 spinlock_t gclock; /* 4: group commit lock */
406 int logsize; /* 4: log data area size in byte */
407 int lsn; /* 4: end-of-log */
408 int clsn; /* 4: clsn */
409 int syncpt; /* 4: addr of last syncpt record */
410 int sync; /* 4: addr from last logsync() */
411 struct list_head synclist; /* 8: logsynclist anchor */
412 spinlock_t synclock; /* 4: synclist lock */
413 struct lbuf *wqueue; /* 4: log pageout queue */
414 int count; /* 4: count */
415 char uuid[16]; /* 16: 128-bit uuid of log device */
416
417 	int no_integrity;	/* 4: flag to disable journaling to disk */
418};
419
420/*
421 * Log flag
422 */
423#define log_INLINELOG 1
424#define log_SYNCBARRIER 2
425#define log_QUIESCE 3
426#define log_FLUSH 4
427
428/*
429 * group commit flag
430 */
431/* jfs_log */
432#define logGC_PAGEOUT 0x00000001
433
434/* tblock/lbuf */
435#define tblkGC_QUEUE 0x0001
436#define tblkGC_READY 0x0002
437#define tblkGC_COMMIT 0x0004
438#define tblkGC_COMMITTED 0x0008
439#define tblkGC_EOP 0x0010
440#define tblkGC_FREE 0x0020
441#define tblkGC_LEADER 0x0040
442#define tblkGC_ERROR 0x0080
443#define tblkGC_LAZY 0x0100 // D230860
444#define tblkGC_UNLOCKED 0x0200 // D230860
445
446/*
447 * log cache buffer header
448 */
449struct lbuf {
450 struct jfs_log *l_log; /* 4: log associated with buffer */
451
452 /*
453 * data buffer base area
454 */
455 uint l_flag; /* 4: pageout control flags */
456
457 struct lbuf *l_wqnext; /* 4: write queue link */
458 	struct lbuf *l_freelist; /* 4: freelist link */
459
460 int l_pn; /* 4: log page number */
461 int l_eor; /* 4: log record eor */
462 int l_ceor; /* 4: committed log record eor */
463
464 s64 l_blkno; /* 8: log page block number */
465 caddr_t l_ldata; /* 4: data page */
466
467 wait_queue_head_t l_ioevent; /* 4: i/o done event */
468 struct page *l_page; /* The page itself */
469};
470
471/* Reuse l_freelist for redrive list */
472#define l_redrive_next l_freelist
473
474/*
475 * logsynclist block
476 *
477 * common logsyncblk prefix for jbuf_t and tblock
478 */
479struct logsyncblk {
480 u16 xflag; /* flags */
481 	u16 flag;		/* only meaningful in tblock */
482 lid_t lid; /* lock id */
483 s32 lsn; /* log sequence number */
484 struct list_head synclist; /* log sync list link */
485};
486
487/*
488 * logsynclist serialization (per log)
489 */
490
491#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock)
492#define LOGSYNC_LOCK(log) spin_lock(&(log)->synclock)
493#define LOGSYNC_UNLOCK(log) spin_unlock(&(log)->synclock)
494
495/* compute the difference in bytes of lsn from sync point */
496#define logdiff(diff, lsn, log)\
497{\
498 diff = (lsn) - (log)->syncpt;\
499 if (diff < 0)\
500 diff += (log)->logsize;\
501}
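
The logdiff() macro computes the byte distance from the sync point in a circular log: a negative raw difference means the lsn has wrapped past the end of the log area, so one log size is added back. A hedged, standalone adaptation (taking syncpt and logsize directly rather than the jfs_log struct, with invented example values):

	#include <stdio.h>

	/* standalone copy of the logdiff() macro above */
	#define logdiff(diff, lsn, syncpt, logsize)	\
	do {						\
		(diff) = (lsn) - (syncpt);		\
		if ((diff) < 0)				\
			(diff) += (logsize);		\
	} while (0)

	int main(void)
	{
		int logsize = 1 << 20;	/* example: 1MB of log data */
		int diff;

		logdiff(diff, 5000, 4000, logsize);	/* no wrap: 1000 */
		printf("%d\n", diff);
		logdiff(diff, 1000, 4000, logsize);	/* wrapped: 1045576 */
		printf("%d\n", diff);
		return 0;
	}
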
502
503extern int lmLogOpen(struct super_block *sb);
504extern int lmLogClose(struct super_block *sb);
505extern int lmLogShutdown(struct jfs_log * log);
506extern int lmLogInit(struct jfs_log * log);
507extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize);
508extern void jfs_flush_journal(struct jfs_log * log, int wait);
509
510#endif /* _H_JFS_LOGMGR */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
new file mode 100644
index 000000000000..4c0a3ac75c08
--- /dev/null
+++ b/fs/jfs/jfs_metapage.c
@@ -0,0 +1,580 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2003
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/fs.h>
21#include <linux/init.h>
22#include <linux/buffer_head.h>
23#include <linux/mempool.h>
24#include <linux/delay.h>
25#include "jfs_incore.h"
26#include "jfs_superblock.h"
27#include "jfs_filsys.h"
28#include "jfs_metapage.h"
29#include "jfs_txnmgr.h"
30#include "jfs_debug.h"
31
32static DEFINE_SPINLOCK(meta_lock);
33
34#ifdef CONFIG_JFS_STATISTICS
35static struct {
36 uint pagealloc; /* # of page allocations */
37 uint pagefree; /* # of page frees */
38 uint lockwait; /* # of sleeping lock_metapage() calls */
39} mpStat;
40#endif
41
42
43#define HASH_BITS 10 /* This makes hash_table 1 4K page */
44#define HASH_SIZE (1 << HASH_BITS)
45static struct metapage **hash_table = NULL;
46static unsigned long hash_order;
47
48
49static inline int metapage_locked(struct metapage *mp)
50{
51 return test_bit(META_locked, &mp->flag);
52}
53
54static inline int trylock_metapage(struct metapage *mp)
55{
56 return test_and_set_bit(META_locked, &mp->flag);
57}
58
59static inline void unlock_metapage(struct metapage *mp)
60{
61 clear_bit(META_locked, &mp->flag);
62 wake_up(&mp->wait);
63}
64
65static void __lock_metapage(struct metapage *mp)
66{
67 DECLARE_WAITQUEUE(wait, current);
68
69 INCREMENT(mpStat.lockwait);
70
71 add_wait_queue_exclusive(&mp->wait, &wait);
72 do {
73 set_current_state(TASK_UNINTERRUPTIBLE);
74 if (metapage_locked(mp)) {
75 spin_unlock(&meta_lock);
76 schedule();
77 spin_lock(&meta_lock);
78 }
79 } while (trylock_metapage(mp));
80 __set_current_state(TASK_RUNNING);
81 remove_wait_queue(&mp->wait, &wait);
82}
83
84/* needs meta_lock */
85static inline void lock_metapage(struct metapage *mp)
86{
87 if (trylock_metapage(mp))
88 __lock_metapage(mp);
89}
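
lock_metapage()/__lock_metapage() together form a sleeping bit-lock: the fast path is a test-and-set under meta_lock, and only contended callers queue themselves and reschedule with the spinlock dropped. A hedged userspace analogue of the same shape (pthreads, invented names, not the kernel code; pthread_cond_wait does the drop-and-retake that the kernel loop spells out by hand):

	#include <pthread.h>
	#include <stdbool.h>

	struct mp_analogue {
		bool locked;		/* plays the role of META_locked */
		pthread_cond_t wait;	/* plays the role of mp->wait;
					 * init with PTHREAD_COND_INITIALIZER */
	};

	static pthread_mutex_t meta_lock = PTHREAD_MUTEX_INITIALIZER;

	/* slow path: called with meta_lock held, sleeps until the bit is free */
	static void lock_slow(struct mp_analogue *mp)
	{
		while (mp->locked)
			pthread_cond_wait(&mp->wait, &meta_lock);
		mp->locked = true;
	}

	/* fast path: mirrors lock_metapage(); caller holds meta_lock */
	static void lock_analogue(struct mp_analogue *mp)
	{
		if (mp->locked)		/* trylock failed */
			lock_slow(mp);
		else
			mp->locked = true;
	}

	static void unlock_analogue(struct mp_analogue *mp)
	{
		pthread_mutex_lock(&meta_lock);
		mp->locked = false;
		pthread_cond_signal(&mp->wait);	/* plays the role of wake_up() */
		pthread_mutex_unlock(&meta_lock);
	}
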
90
91#define METAPOOL_MIN_PAGES 32
92static kmem_cache_t *metapage_cache;
93static mempool_t *metapage_mempool;
94
95static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
96{
97 struct metapage *mp = (struct metapage *)foo;
98
99 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
100 SLAB_CTOR_CONSTRUCTOR) {
101 mp->lid = 0;
102 mp->lsn = 0;
103 mp->flag = 0;
104 mp->data = NULL;
105 mp->clsn = 0;
106 mp->log = NULL;
107 set_bit(META_free, &mp->flag);
108 init_waitqueue_head(&mp->wait);
109 }
110}
111
112static inline struct metapage *alloc_metapage(int gfp_mask)
113{
114 return mempool_alloc(metapage_mempool, gfp_mask);
115}
116
117static inline void free_metapage(struct metapage *mp)
118{
119 mp->flag = 0;
120 set_bit(META_free, &mp->flag);
121
122 mempool_free(mp, metapage_mempool);
123}
124
125int __init metapage_init(void)
126{
127 /*
128 * Allocate the metapage structures
129 */
130 metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage),
131 0, 0, init_once, NULL);
132 if (metapage_cache == NULL)
133 return -ENOMEM;
134
135 metapage_mempool = mempool_create(METAPOOL_MIN_PAGES, mempool_alloc_slab,
136 mempool_free_slab, metapage_cache);
137
138 if (metapage_mempool == NULL) {
139 kmem_cache_destroy(metapage_cache);
140 return -ENOMEM;
141 }
142 /*
143 * Now the hash list
144 */
145 for (hash_order = 0;
146 ((PAGE_SIZE << hash_order) / sizeof(void *)) < HASH_SIZE;
147 hash_order++);
148 hash_table =
149 (struct metapage **) __get_free_pages(GFP_KERNEL, hash_order);
150 assert(hash_table);
151 memset(hash_table, 0, PAGE_SIZE << hash_order);
152
153 return 0;
154}
155
156void metapage_exit(void)
157{
158 mempool_destroy(metapage_mempool);
159 kmem_cache_destroy(metapage_cache);
160}
161
162/*
163 * Basically same hash as in pagemap.h, but using our hash table
164 */
165static struct metapage **meta_hash(struct address_space *mapping,
166 unsigned long index)
167{
168#define i (((unsigned long)mapping)/ \
169 (sizeof(struct inode) & ~(sizeof(struct inode) -1 )))
170#define s(x) ((x) + ((x) >> HASH_BITS))
171 return hash_table + (s(i + index) & (HASH_SIZE - 1));
172#undef i
173#undef s
174}
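
The macro soup in meta_hash() hides a simple computation: normalize the mapping pointer by the lowest set bit of sizeof(struct inode) (its effective alignment granularity), add the block index, fold the bits above HASH_BITS back down, and mask to the table size. A hedged, standalone rendering with invented example values:

	#include <stdio.h>

	#define HASH_BITS 10
	#define HASH_SIZE (1 << HASH_BITS)

	int main(void)
	{
		unsigned long mapping = 0x12345680UL;	/* example pointer value */
		unsigned long index = 42;		/* example block number */
		unsigned long inode_size = 360;		/* example sizeof(struct inode) */

		/* lowest set bit of the size: the pointer's alignment granularity */
		unsigned long gran = inode_size & ~(inode_size - 1);
		unsigned long i = mapping / gran;

		/* fold bits above HASH_BITS back in, then mask to the table */
		unsigned long x = i + index;
		unsigned long bucket = (x + (x >> HASH_BITS)) & (HASH_SIZE - 1);

		printf("bucket = %lu\n", bucket);
		return 0;
	}
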
175
176static struct metapage *search_hash(struct metapage ** hash_ptr,
177 struct address_space *mapping,
178 unsigned long index)
179{
180 struct metapage *ptr;
181
182 for (ptr = *hash_ptr; ptr; ptr = ptr->hash_next) {
183 if ((ptr->mapping == mapping) && (ptr->index == index))
184 return ptr;
185 }
186
187 return NULL;
188}
189
190static void add_to_hash(struct metapage * mp, struct metapage ** hash_ptr)
191{
192 if (*hash_ptr)
193 (*hash_ptr)->hash_prev = mp;
194
195 mp->hash_prev = NULL;
196 mp->hash_next = *hash_ptr;
197 *hash_ptr = mp;
198}
199
200static void remove_from_hash(struct metapage * mp, struct metapage ** hash_ptr)
201{
202 if (mp->hash_prev)
203 mp->hash_prev->hash_next = mp->hash_next;
204 else {
205 assert(*hash_ptr == mp);
206 *hash_ptr = mp->hash_next;
207 }
208
209 if (mp->hash_next)
210 mp->hash_next->hash_prev = mp->hash_prev;
211}
212
213struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
214 unsigned int size, int absolute,
215 unsigned long new)
216{
217 struct metapage **hash_ptr;
218 int l2BlocksPerPage;
219 int l2bsize;
220 struct address_space *mapping;
221 struct metapage *mp;
222 unsigned long page_index;
223 unsigned long page_offset;
224
225 jfs_info("__get_metapage: inode = 0x%p, lblock = 0x%lx", inode, lblock);
226
227 if (absolute)
228 mapping = inode->i_sb->s_bdev->bd_inode->i_mapping;
229 else {
230 /*
231 * If an nfs client tries to read an inode that is larger
232 * than any existing inodes, we may try to read past the
233 * end of the inode map
234 */
235 if ((lblock << inode->i_blkbits) >= inode->i_size)
236 return NULL;
237 mapping = inode->i_mapping;
238 }
239
240 hash_ptr = meta_hash(mapping, lblock);
241again:
242 spin_lock(&meta_lock);
243 mp = search_hash(hash_ptr, mapping, lblock);
244 if (mp) {
245 page_found:
246 if (test_bit(META_stale, &mp->flag)) {
247 spin_unlock(&meta_lock);
248 msleep(1);
249 goto again;
250 }
251 mp->count++;
252 lock_metapage(mp);
253 spin_unlock(&meta_lock);
254 if (test_bit(META_discard, &mp->flag)) {
255 if (!new) {
256 jfs_error(inode->i_sb,
257 "__get_metapage: using a "
258 "discarded metapage");
259 release_metapage(mp);
260 return NULL;
261 }
262 clear_bit(META_discard, &mp->flag);
263 }
264 jfs_info("__get_metapage: found 0x%p, in hash", mp);
265 if (mp->logical_size != size) {
266 jfs_error(inode->i_sb,
267 "__get_metapage: mp->logical_size != size");
268 release_metapage(mp);
269 return NULL;
270 }
271 } else {
272 l2bsize = inode->i_blkbits;
273 l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
274 page_index = lblock >> l2BlocksPerPage;
275 page_offset = (lblock - (page_index << l2BlocksPerPage)) <<
276 l2bsize;
277 if ((page_offset + size) > PAGE_CACHE_SIZE) {
278 spin_unlock(&meta_lock);
279 jfs_err("MetaData crosses page boundary!!");
280 return NULL;
281 }
282
283 /*
284 * Locks held on aggregate inode pages are usually
285 * not held long, and they are taken in critical code
286 * paths (committing dirty inodes, txCommit thread)
287 *
288 * Attempt to get metapage without blocking, tapping into
289 * reserves if necessary.
290 */
291 mp = NULL;
292 if (JFS_IP(inode)->fileset == AGGREGATE_I) {
293 mp = alloc_metapage(GFP_ATOMIC);
294 if (!mp) {
295 /*
296 * mempool is supposed to protect us from
297 * failing here. We will try a blocking
298 * call, but a deadlock is possible here
299 */
300 printk(KERN_WARNING
301 "__get_metapage: atomic call to mempool_alloc failed.\n");
302 printk(KERN_WARNING
303 "Will attempt blocking call\n");
304 }
305 }
306 if (!mp) {
307 struct metapage *mp2;
308
309 spin_unlock(&meta_lock);
310 mp = alloc_metapage(GFP_NOFS);
311 spin_lock(&meta_lock);
312
313 /* we dropped the meta_lock, we need to search the
314 * hash again.
315 */
316 mp2 = search_hash(hash_ptr, mapping, lblock);
317 if (mp2) {
318 free_metapage(mp);
319 mp = mp2;
320 goto page_found;
321 }
322 }
323 mp->flag = 0;
324 lock_metapage(mp);
325 if (absolute)
326 set_bit(META_absolute, &mp->flag);
327 mp->xflag = COMMIT_PAGE;
328 mp->count = 1;
329 atomic_set(&mp->nohomeok,0);
330 mp->mapping = mapping;
331 mp->index = lblock;
332 mp->page = NULL;
333 mp->logical_size = size;
334 add_to_hash(mp, hash_ptr);
335 spin_unlock(&meta_lock);
336
337 if (new) {
338 jfs_info("__get_metapage: Calling grab_cache_page");
339 mp->page = grab_cache_page(mapping, page_index);
340 if (!mp->page) {
341 jfs_err("grab_cache_page failed!");
342 goto freeit;
343 } else {
344 INCREMENT(mpStat.pagealloc);
345 unlock_page(mp->page);
346 }
347 } else {
348 jfs_info("__get_metapage: Calling read_cache_page");
349 mp->page = read_cache_page(mapping, lblock,
350 (filler_t *)mapping->a_ops->readpage, NULL);
351 if (IS_ERR(mp->page)) {
352 jfs_err("read_cache_page failed!");
353 goto freeit;
354 } else
355 INCREMENT(mpStat.pagealloc);
356 }
357 mp->data = kmap(mp->page) + page_offset;
358 }
359
360 if (new)
361 memset(mp->data, 0, PSIZE);
362
363 jfs_info("__get_metapage: returning = 0x%p", mp);
364 return mp;
365
366freeit:
367 spin_lock(&meta_lock);
368 remove_from_hash(mp, hash_ptr);
369 free_metapage(mp);
370 spin_unlock(&meta_lock);
371 return NULL;
372}
373
374void hold_metapage(struct metapage * mp, int force)
375{
376 spin_lock(&meta_lock);
377
378 mp->count++;
379
380 if (force) {
381 ASSERT (!(test_bit(META_forced, &mp->flag)));
382 if (trylock_metapage(mp))
383 set_bit(META_forced, &mp->flag);
384 } else
385 lock_metapage(mp);
386
387 spin_unlock(&meta_lock);
388}
389
390static void __write_metapage(struct metapage * mp)
391{
392 int l2bsize = mp->mapping->host->i_blkbits;
393 int l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
394 unsigned long page_index;
395 unsigned long page_offset;
396 int rc;
397
398 jfs_info("__write_metapage: mp = 0x%p", mp);
399
400 page_index = mp->page->index;
401 page_offset =
402 (mp->index - (page_index << l2BlocksPerPage)) << l2bsize;
403
404 lock_page(mp->page);
405 rc = mp->mapping->a_ops->prepare_write(NULL, mp->page, page_offset,
406 page_offset +
407 mp->logical_size);
408 if (rc) {
409 		jfs_err("prepare_write returned %d!", rc);
410 ClearPageUptodate(mp->page);
411 unlock_page(mp->page);
412 clear_bit(META_dirty, &mp->flag);
413 return;
414 }
415 rc = mp->mapping->a_ops->commit_write(NULL, mp->page, page_offset,
416 page_offset +
417 mp->logical_size);
418 if (rc) {
419 jfs_err("commit_write returned %d", rc);
420 }
421
422 unlock_page(mp->page);
423 clear_bit(META_dirty, &mp->flag);
424
425 jfs_info("__write_metapage done");
426}
427
428static inline void sync_metapage(struct metapage *mp)
429{
430 struct page *page = mp->page;
431
432 page_cache_get(page);
433 lock_page(page);
434
435 /* we're done with this page - no need to check for errors */
436 if (page_has_buffers(page))
437 write_one_page(page, 1);
438 else
439 unlock_page(page);
440 page_cache_release(page);
441}
442
443void release_metapage(struct metapage * mp)
444{
445 struct jfs_log *log;
446
447 jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag);
448
449 spin_lock(&meta_lock);
450 if (test_bit(META_forced, &mp->flag)) {
451 clear_bit(META_forced, &mp->flag);
452 mp->count--;
453 spin_unlock(&meta_lock);
454 return;
455 }
456
457 assert(mp->count);
458 if (--mp->count || atomic_read(&mp->nohomeok)) {
459 unlock_metapage(mp);
460 spin_unlock(&meta_lock);
461 return;
462 }
463
464 if (mp->page) {
465 set_bit(META_stale, &mp->flag);
466 spin_unlock(&meta_lock);
467 kunmap(mp->page);
468 mp->data = NULL;
469 if (test_bit(META_dirty, &mp->flag))
470 __write_metapage(mp);
471 if (test_bit(META_sync, &mp->flag)) {
472 sync_metapage(mp);
473 clear_bit(META_sync, &mp->flag);
474 }
475
476 if (test_bit(META_discard, &mp->flag)) {
477 lock_page(mp->page);
478 block_invalidatepage(mp->page, 0);
479 unlock_page(mp->page);
480 }
481
482 page_cache_release(mp->page);
483 mp->page = NULL;
484 INCREMENT(mpStat.pagefree);
485 spin_lock(&meta_lock);
486 }
487
488 if (mp->lsn) {
489 /*
490 * Remove metapage from logsynclist.
491 */
492 log = mp->log;
493 LOGSYNC_LOCK(log);
494 mp->log = NULL;
495 mp->lsn = 0;
496 mp->clsn = 0;
497 log->count--;
498 list_del(&mp->synclist);
499 LOGSYNC_UNLOCK(log);
500 }
501 remove_from_hash(mp, meta_hash(mp->mapping, mp->index));
502 spin_unlock(&meta_lock);
503
504 free_metapage(mp);
505}
506
507void __invalidate_metapages(struct inode *ip, s64 addr, int len)
508{
509 struct metapage **hash_ptr;
510 unsigned long lblock;
511 int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits;
512 /* All callers are interested in block device's mapping */
513 struct address_space *mapping = ip->i_sb->s_bdev->bd_inode->i_mapping;
514 struct metapage *mp;
515 struct page *page;
516
517 /*
518 * First, mark metapages to discard. They will eventually be
519 * released, but should not be written.
520 */
521 for (lblock = addr; lblock < addr + len;
522 lblock += 1 << l2BlocksPerPage) {
523 hash_ptr = meta_hash(mapping, lblock);
524again:
525 spin_lock(&meta_lock);
526 mp = search_hash(hash_ptr, mapping, lblock);
527 if (mp) {
528 if (test_bit(META_stale, &mp->flag)) {
529 spin_unlock(&meta_lock);
530 msleep(1);
531 goto again;
532 }
533
534 clear_bit(META_dirty, &mp->flag);
535 set_bit(META_discard, &mp->flag);
536 spin_unlock(&meta_lock);
537 } else {
538 spin_unlock(&meta_lock);
539 page = find_lock_page(mapping, lblock>>l2BlocksPerPage);
540 if (page) {
541 block_invalidatepage(page, 0);
542 unlock_page(page);
543 page_cache_release(page);
544 }
545 }
546 }
547}
548
549#ifdef CONFIG_JFS_STATISTICS
550int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
551 int *eof, void *data)
552{
553 int len = 0;
554 off_t begin;
555
556 len += sprintf(buffer,
557 "JFS Metapage statistics\n"
558 "=======================\n"
559 "page allocations = %d\n"
560 "page frees = %d\n"
561 "lock waits = %d\n",
562 mpStat.pagealloc,
563 mpStat.pagefree,
564 mpStat.lockwait);
565
566 begin = offset;
567 *start = buffer + begin;
568 len -= begin;
569
570 if (len > length)
571 len = length;
572 else
573 *eof = 1;
574
575 if (len < 0)
576 len = 0;
577
578 return len;
579}
580#endif
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
new file mode 100644
index 000000000000..0e58aba58c37
--- /dev/null
+++ b/fs/jfs/jfs_metapage.h
@@ -0,0 +1,115 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 * Portions Copyright (c) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef _H_JFS_METAPAGE
20#define _H_JFS_METAPAGE
21
22#include <linux/pagemap.h>
23
24struct metapage {
25 /* Common logsyncblk prefix (see jfs_logmgr.h) */
26 u16 xflag;
27 u16 unused;
28 lid_t lid;
29 int lsn;
30 struct list_head synclist;
31 /* End of logsyncblk prefix */
32
33 unsigned long flag; /* See Below */
34 unsigned long count; /* Reference count */
35 void *data; /* Data pointer */
36
37 /* list management stuff */
38 struct metapage *hash_prev;
39 struct metapage *hash_next; /* Also used for free list */
40
41 /*
42 * mapping & index become redundant, but we need these here to
43 * add the metapage to the hash before we have the real page
44 */
45 struct address_space *mapping;
46 unsigned long index;
47 wait_queue_head_t wait;
48
49 /* implementation */
50 struct page *page;
51 unsigned long logical_size;
52
53 /* Journal management */
54 int clsn;
55 atomic_t nohomeok;
56 struct jfs_log *log;
57};
58
59/* metapage flag */
60#define META_locked 0
61#define META_absolute 1
62#define META_free 2
63#define META_dirty 3
64#define META_sync 4
65#define META_discard 5
66#define META_forced 6
67#define META_stale 7
68
69#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag)
70
71/* function prototypes */
72extern struct metapage *__get_metapage(struct inode *inode,
73 unsigned long lblock, unsigned int size,
74 int absolute, unsigned long new);
75
76#define read_metapage(inode, lblock, size, absolute)\
77 __get_metapage(inode, lblock, size, absolute, FALSE)
78
79#define get_metapage(inode, lblock, size, absolute)\
80 __get_metapage(inode, lblock, size, absolute, TRUE)
81
82extern void release_metapage(struct metapage *);
83extern void hold_metapage(struct metapage *, int);
84
85static inline void write_metapage(struct metapage *mp)
86{
87 set_bit(META_dirty, &mp->flag);
88 release_metapage(mp);
89}
90
91static inline void flush_metapage(struct metapage *mp)
92{
93 set_bit(META_sync, &mp->flag);
94 write_metapage(mp);
95}
96
97static inline void discard_metapage(struct metapage *mp)
98{
99 clear_bit(META_dirty, &mp->flag);
100 set_bit(META_discard, &mp->flag);
101 release_metapage(mp);
102}
103
104/*
105 * These routines invalidate all pages for an extent.
106 */
107extern void __invalidate_metapages(struct inode *, s64, int);
108#define invalidate_pxd_metapages(ip, pxd) \
109 __invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd)))
110#define invalidate_dxd_metapages(ip, dxd) \
111 __invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd)))
112#define invalidate_xad_metapages(ip, xad) \
113 __invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad)))
114
115#endif /* _H_JFS_METAPAGE */
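
To show how the pieces above fit together, here is a hedged kernel-context sketch (hypothetical caller, compiles only inside this source tree, not a function from the patch) of the typical read-modify-write lifecycle through these interfaces: read_metapage() pins, maps, and locks the block; write_metapage() marks it dirty and then releases, which triggers writeback from release_metapage().

	#include <linux/fs.h>
	#include "jfs_incore.h"
	#include "jfs_filsys.h"
	#include "jfs_metapage.h"

	static int touch_block(struct inode *ip, unsigned long lblock)
	{
		/* pins the page, maps it, and takes the metapage lock */
		struct metapage *mp = read_metapage(ip, lblock, PSIZE, 0);

		if (!mp)
			return -EIO;

		((char *)mp->data)[0] = 0;	/* modify the mapped data */

		/*
		 * sets META_dirty, then drops the lock and reference;
		 * the page is written back from release_metapage()
		 */
		write_metapage(mp);
		return 0;
	}
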
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
new file mode 100644
index 000000000000..c535ffd638e8
--- /dev/null
+++ b/fs/jfs/jfs_mount.c
@@ -0,0 +1,512 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19/*
20 * Module: jfs_mount.c
21 *
22 * note: file system in transition to aggregate/fileset:
23 *
24 * file system mount is interpreted as the mount of aggregate,
25 * if not already mounted, and mount of the single/only fileset in
26 * the aggregate;
27 *
28 * a file system/aggregate is represented by an internal inode
29 * (aka mount inode) initialized with aggregate superblock;
30 * each vfs represents a fileset, and points to its "fileset inode
31 * allocation map inode" (aka fileset inode):
32 * (an aggregate itself is structured recursively as a fileset:
33 * an internal vfs is constructed and points to its "fileset inode
34 * allocation map inode" (aka aggregate inode) where each inode
35 * represents a fileset inode) so that inode number is mapped to
36 * on-disk inode in uniform way at both aggregate and fileset level;
37 *
38 * each vnode/inode of a fileset is linked to its vfs (to facilitate
39 * per fileset inode operations, e.g., unmount of a fileset, etc.);
40 * each inode points to the mount inode (to facilitate access to
41 * per aggregate information, e.g., block size, etc.) as well as
42 * its file set inode.
43 *
44 * aggregate
45 * ipmnt
46 * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap;
47 * fileset vfs -> vp(1) <-> ... <-> vp(n) <->vproot;
48 */
49
50#include <linux/fs.h>
51#include <linux/buffer_head.h>
52
53#include "jfs_incore.h"
54#include "jfs_filsys.h"
55#include "jfs_superblock.h"
56#include "jfs_dmap.h"
57#include "jfs_imap.h"
58#include "jfs_metapage.h"
59#include "jfs_debug.h"
60
61
62/*
63 * forward references
64 */
65static int chkSuper(struct super_block *);
66static int logMOUNT(struct super_block *sb);
67
68/*
69 * NAME: jfs_mount(sb)
70 *
71 * FUNCTION: vfs_mount()
72 *
73 * PARAMETER: sb - super block
74 *
75 * RETURN: -EBUSY - device already mounted or open for write
76 * -EBUSY - cvrdvp already mounted;
77 * -EBUSY - mount table full
78 * -ENOTDIR- cvrdvp not directory on a device mount
79 * -ENXIO - device open failure
80 */
81int jfs_mount(struct super_block *sb)
82{
83 int rc = 0; /* Return code */
84 struct jfs_sb_info *sbi = JFS_SBI(sb);
85 struct inode *ipaimap = NULL;
86 struct inode *ipaimap2 = NULL;
87 struct inode *ipimap = NULL;
88 struct inode *ipbmap = NULL;
89
90 /*
91 * read/validate superblock
92 * (initialize mount inode from the superblock)
93 */
94 if ((rc = chkSuper(sb))) {
95 goto errout20;
96 }
97
98 ipaimap = diReadSpecial(sb, AGGREGATE_I, 0);
99 if (ipaimap == NULL) {
100 		jfs_err("jfs_mount: Failed to read AGGREGATE_I");
101 rc = -EIO;
102 goto errout20;
103 }
104 sbi->ipaimap = ipaimap;
105
106 jfs_info("jfs_mount: ipaimap:0x%p", ipaimap);
107
108 /*
109 * initialize aggregate inode allocation map
110 */
111 if ((rc = diMount(ipaimap))) {
112 jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc);
113 goto errout21;
114 }
115
116 /*
117 * open aggregate block allocation map
118 */
119 ipbmap = diReadSpecial(sb, BMAP_I, 0);
120 if (ipbmap == NULL) {
121 rc = -EIO;
122 goto errout22;
123 }
124
125 jfs_info("jfs_mount: ipbmap:0x%p", ipbmap);
126
127 sbi->ipbmap = ipbmap;
128
129 /*
130 * initialize aggregate block allocation map
131 */
132 if ((rc = dbMount(ipbmap))) {
133 jfs_err("jfs_mount: dbMount failed w/rc = %d", rc);
134 goto errout22;
135 }
136
137 /*
138 * open the secondary aggregate inode allocation map
139 *
140 * This is a duplicate of the aggregate inode allocation map.
141 *
142 * hand craft a vfs in the same fashion as we did to read ipaimap.
143 * By adding INOSPEREXT (32) to the inode number, we are telling
144 * diReadSpecial that we are reading from the secondary aggregate
145 * inode table. This also creates a unique entry in the inode hash
146 * table.
147 */
148 if ((sbi->mntflag & JFS_BAD_SAIT) == 0) {
149 ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1);
150 		if (ipaimap2 == NULL) {
151 			jfs_err("jfs_mount: Failed to read AGGREGATE_I");
152 rc = -EIO;
153 goto errout35;
154 }
155 sbi->ipaimap2 = ipaimap2;
156
157 jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2);
158
159 /*
160 * initialize secondary aggregate inode allocation map
161 */
162 if ((rc = diMount(ipaimap2))) {
163 jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d",
164 rc);
165 goto errout35;
166 }
167 } else
168 /* Secondary aggregate inode table is not valid */
169 sbi->ipaimap2 = NULL;
170
171 /*
172 * mount (the only/single) fileset
173 */
174 /*
175 * open fileset inode allocation map (aka fileset inode)
176 */
177 ipimap = diReadSpecial(sb, FILESYSTEM_I, 0);
178 if (ipimap == NULL) {
179 jfs_err("jfs_mount: Failed to read FILESYSTEM_I");
180 /* open fileset secondary inode allocation map */
181 rc = -EIO;
182 goto errout40;
183 }
184 jfs_info("jfs_mount: ipimap:0x%p", ipimap);
185
186 /* map further access of per fileset inodes by the fileset inode */
187 sbi->ipimap = ipimap;
188
189 /* initialize fileset inode allocation map */
190 if ((rc = diMount(ipimap))) {
191 jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
192 goto errout41;
193 }
194
195 goto out;
196
197 /*
198 * unwind on error
199 */
200 errout41: /* close fileset inode allocation map inode */
201 diFreeSpecial(ipimap);
202
203 errout40: /* fileset closed */
204
205 /* close secondary aggregate inode allocation map */
206 if (ipaimap2) {
207 diUnmount(ipaimap2, 1);
208 diFreeSpecial(ipaimap2);
209 }
210
211 errout35:
212
213 /* close aggregate block allocation map */
214 dbUnmount(ipbmap, 1);
215 diFreeSpecial(ipbmap);
216
217 errout22: /* close aggregate inode allocation map */
218
219 diUnmount(ipaimap, 1);
220
221 errout21: /* close aggregate inodes */
222 diFreeSpecial(ipaimap);
223 errout20: /* aggregate closed */
224
225 out:
226
227 if (rc)
228 jfs_err("Mount JFS Failure: %d", rc);
229
230 return rc;
231}
232
233/*
234 * NAME: jfs_mount_rw(sb, remount)
235 *
236 * FUNCTION: Completes read-write mount, or remounts read-only volume
237 * as read-write
238 */
239int jfs_mount_rw(struct super_block *sb, int remount)
240{
241 struct jfs_sb_info *sbi = JFS_SBI(sb);
242 int rc;
243
244 /*
245 * If we are re-mounting a previously read-only volume, we want to
246 * re-read the inode and block maps, since fsck.jfs may have updated
247 * them.
248 */
249 if (remount) {
250 if (chkSuper(sb) || (sbi->state != FM_CLEAN))
251 return -EINVAL;
252
253 truncate_inode_pages(sbi->ipimap->i_mapping, 0);
254 truncate_inode_pages(sbi->ipbmap->i_mapping, 0);
255 diUnmount(sbi->ipimap, 1);
256 if ((rc = diMount(sbi->ipimap))) {
257 jfs_err("jfs_mount_rw: diMount failed!");
258 return rc;
259 }
260
261 dbUnmount(sbi->ipbmap, 1);
262 if ((rc = dbMount(sbi->ipbmap))) {
263 jfs_err("jfs_mount_rw: dbMount failed!");
264 return rc;
265 }
266 }
267
268 /*
269 * open/initialize log
270 */
271 if ((rc = lmLogOpen(sb)))
272 return rc;
273
274 /*
275 * update file system superblock;
276 */
277 if ((rc = updateSuper(sb, FM_MOUNT))) {
278 jfs_err("jfs_mount: updateSuper failed w/rc = %d", rc);
279 lmLogClose(sb);
280 return rc;
281 }
282
283 /*
284 * write MOUNT log record of the file system
285 */
286 logMOUNT(sb);
287
288 /*
289 * Set page cache allocation policy
290 */
291 mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS);
292
293 return rc;
294}
295
296/*
297 * chkSuper()
298 *
299 * validate the superblock of the file system to be mounted and
300 * get the file system parameters.
301 *
302 * returns
303 * 0 with fragsize set if check successful
304 * error code if not successful
305 */
306static int chkSuper(struct super_block *sb)
307{
308 int rc = 0;
309 struct jfs_sb_info *sbi = JFS_SBI(sb);
310 struct jfs_superblock *j_sb;
311 struct buffer_head *bh;
312 int AIM_bytesize, AIT_bytesize;
313 int expected_AIM_bytesize, expected_AIT_bytesize;
314 s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr;
315 s64 byte_addr_diff0, byte_addr_diff1;
316 s32 bsize;
317
318 if ((rc = readSuper(sb, &bh)))
319 return rc;
320 j_sb = (struct jfs_superblock *)bh->b_data;
321
322 /*
323 * validate superblock
324 */
325 /* validate fs signature */
326 if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) ||
327 le32_to_cpu(j_sb->s_version) > JFS_VERSION) {
328 rc = -EINVAL;
329 goto out;
330 }
331
332 bsize = le32_to_cpu(j_sb->s_bsize);
333#ifdef _JFS_4K
334 if (bsize != PSIZE) {
335 jfs_err("Currently only 4K block size supported!");
336 rc = -EINVAL;
337 goto out;
338 }
339#endif /* _JFS_4K */
340
341 jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx",
342 le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state),
343 (unsigned long long) le64_to_cpu(j_sb->s_size));
344
345 /* validate the descriptors for Secondary AIM and AIT */
346 if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) !=
347 cpu_to_le32(JFS_BAD_SAIT)) {
348 expected_AIM_bytesize = 2 * PSIZE;
349 AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize;
350 expected_AIT_bytesize = 4 * PSIZE;
351 AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize;
352 AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize;
353 AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize;
354 byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr;
355 fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize;
356 byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr;
357 if ((AIM_bytesize != expected_AIM_bytesize) ||
358 (AIT_bytesize != expected_AIT_bytesize) ||
359 (byte_addr_diff0 != AIM_bytesize) ||
360 (byte_addr_diff1 <= AIT_bytesize))
361 j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT);
362 }
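	/*
	 * In other words (with 4K pages): the secondary AIM must be
	 * exactly 2 pages, the secondary AIT exactly 4 pages, the AIT
	 * must begin immediately after the AIM, and the fsck work space
	 * must start beyond the end of the AIT.  Any mismatch marks the
	 * secondary tables bad (JFS_BAD_SAIT) rather than failing the
	 * mount.
	 */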
363
364 if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) !=
365 cpu_to_le32(JFS_GROUPCOMMIT))
366 j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT);
367
368 /* validate fs state */
369 if (j_sb->s_state != cpu_to_le32(FM_CLEAN) &&
370 !(sb->s_flags & MS_RDONLY)) {
371 jfs_err("jfs_mount: Mount Failure: File System Dirty.");
372 rc = -EINVAL;
373 goto out;
374 }
375
376 sbi->state = le32_to_cpu(j_sb->s_state);
377 sbi->mntflag = le32_to_cpu(j_sb->s_flag);
378
379 /*
380 * JFS always does I/O by 4K pages. Don't tell the buffer cache
381 * that we use anything else (leave s_blocksize alone).
382 */
383 sbi->bsize = bsize;
384 sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize);
385
386 /*
387 * For now, ignore s_pbsize and l2bfactor. All I/O goes through the
388 * buffer cache.
389 */
390 sbi->nbperpage = PSIZE >> sbi->l2bsize;
391 sbi->l2nbperpage = L2PSIZE - sbi->l2bsize;
392 sbi->l2niperblk = sbi->l2bsize - L2DISIZE;
393 if (sbi->mntflag & JFS_INLINELOG)
394 sbi->logpxd = j_sb->s_logpxd;
395 else {
396 sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev));
397 memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid));
398 memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid));
399 }
400 sbi->fsckpxd = j_sb->s_fsckpxd;
401 sbi->ait2 = j_sb->s_ait2;
402
403 out:
404 brelse(bh);
405 return rc;
406}
407
408
409/*
410 * updateSuper()
411 *
412 * update synchronously superblock if it is mounted read-write.
413 */
414int updateSuper(struct super_block *sb, uint state)
415{
416 struct jfs_superblock *j_sb;
417 struct jfs_sb_info *sbi = JFS_SBI(sb);
418 struct buffer_head *bh;
419 int rc;
420
421 if (sbi->flag & JFS_NOINTEGRITY) {
422 if (state == FM_DIRTY) {
423 sbi->p_state = state;
424 return 0;
425 } else if (state == FM_MOUNT) {
426 sbi->p_state = sbi->state;
427 state = FM_DIRTY;
428 } else if (state == FM_CLEAN) {
429 state = sbi->p_state;
430 } else
431 jfs_err("updateSuper: bad state");
432 } else if (sbi->state == FM_DIRTY)
433 return 0;
434
435 if ((rc = readSuper(sb, &bh)))
436 return rc;
437
438 j_sb = (struct jfs_superblock *)bh->b_data;
439
440 j_sb->s_state = cpu_to_le32(state);
441 sbi->state = state;
442
443 if (state == FM_MOUNT) {
444 /* record log's dev_t and mount serial number */
445 j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev));
446 j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
447 } else if (state == FM_CLEAN) {
448 /*
449 * If this volume is shared with OS/2, OS/2 will need to
450 * recalculate DASD usage, since we don't deal with it.
451 */
452 if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED))
453 j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME);
454 }
455
456 mark_buffer_dirty(bh);
457 sync_dirty_buffer(bh);
458 brelse(bh);
459
460 return 0;
461}
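/*
 * Illustration of the JFS_NOINTEGRITY path above: the on-disk state is
 * pinned at FM_DIRTY for the life of the mount.  FM_MOUNT saves the
 * real state in sbi->p_state before writing FM_DIRTY, and the final
 * FM_CLEAN writes the saved p_state back, so a nointegrity mount that
 * ends cleanly leaves the superblock state exactly as it found it.
 */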
462
463
464/*
465 * readSuper()
466 *
467 * read superblock by raw sector address
468 */
469int readSuper(struct super_block *sb, struct buffer_head **bpp)
470{
471 /* read in primary superblock */
472 *bpp = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits);
473 if (*bpp)
474 return 0;
475
476 /* read in secondary/replicated superblock */
477 *bpp = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits);
478 if (*bpp)
479 return 0;
480
481 return -EIO;
482}
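/*
 * For example (illustrative; assumes the conventional 32K SUPER1_OFF
 * and a 4K s_blocksize): SUPER1_OFF >> s_blocksize_bits is
 * 0x8000 >> 12 = block 8, so the primary superblock is the ninth 4K
 * block of the volume; SUPER2_OFF names the replicated copy that is
 * tried only if reading the primary fails.
 */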
483
484
485/*
486 * logMOUNT()
487 *
488 * function: write a MOUNT log record for file system.
489 *
490 * MOUNT record keeps logredo() from processing log records
491 * for this file system past this point in log.
492 * it is harmless if mount fails.
493 *
494 * note: MOUNT record is at aggregate level, not at fileset level,
495 * since log records of previous mounts of a fileset
496 * (e.g., AFTER record of extent allocation) have to be processed
497 * to update block allocation map at aggregate level.
498 */
499static int logMOUNT(struct super_block *sb)
500{
501 struct jfs_log *log = JFS_SBI(sb)->log;
502 struct lrd lrd;
503
504 lrd.logtid = 0;
505 lrd.backchain = 0;
506 lrd.type = cpu_to_le16(LOG_MOUNT);
507 lrd.length = 0;
508 lrd.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev));
509 lmLog(log, NULL, &lrd, NULL);
510
511 return 0;
512}
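/*
 * Note that the MOUNT record carries no payload (lrd.length == 0);
 * its position in the log is the only information logredo() needs to
 * know where to stop replaying records for this aggregate.
 */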
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
new file mode 100644
index 000000000000..ab0566f70cfa
--- /dev/null
+++ b/fs/jfs/jfs_superblock.h
@@ -0,0 +1,113 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2003
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_SUPERBLOCK
19#define _H_JFS_SUPERBLOCK
20
21/*
22 * make the magic number something a human could read
23 */
24#define JFS_MAGIC "JFS1" /* Magic word */
25
26#define JFS_VERSION 2 /* Version number: Version 2 */
27
28#define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */
29
30/*
31 * aggregate superblock
32 *
33 * The name superblock is too close to super_block, so the name has been
34 * changed to jfs_superblock. The utilities are still using the old name.
35 */
36struct jfs_superblock {
37 char s_magic[4]; /* 4: magic number */
38 __le32 s_version; /* 4: version number */
39
40 __le64 s_size; /* 8: aggregate size in hardware/LVM blocks;
41 * VFS: number of blocks
42 */
43 __le32 s_bsize; /* 4: aggregate block size in bytes;
44 * VFS: fragment size
45 */
46 __le16 s_l2bsize; /* 2: log2 of s_bsize */
47 __le16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */
48 __le32 s_pbsize; /* 4: hardware/LVM block size in bytes */
49 __le16 s_l2pbsize; /* 2: log2 of s_pbsize */
50 __le16 pad; /* 2: padding necessary for alignment */
51
52 __le32 s_agsize; /* 4: allocation group size in aggr. blocks */
53
54 __le32 s_flag; /* 4: aggregate attributes:
55 * see jfs_filsys.h
56 */
57 __le32 s_state; /* 4: mount/unmount/recovery state:
58 * see jfs_filsys.h
59 */
60 __le32 s_compress; /* 4: > 0 if data compression */
61
62 pxd_t s_ait2; /* 8: first extent of secondary
63 * aggregate inode table
64 */
65
66 pxd_t s_aim2; /* 8: first extent of secondary
67 * aggregate inode map
68 */
69 __le32 s_logdev; /* 4: device address of log */
70 __le32 s_logserial; /* 4: log serial number at aggregate mount */
71 pxd_t s_logpxd; /* 8: inline log extent */
72
73 pxd_t s_fsckpxd; /* 8: inline fsck work space extent */
74
75 struct timestruc_t s_time; /* 8: time last updated */
76
77 __le32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for
78 * the fsck service log.
79 * N.B. These blocks are divided among the
80 * versions kept. This is not a per
81 * version size.
82 * N.B. These blocks are included in the
83 * length field of s_fsckpxd.
84 */
85 s8 s_fscklog; /* 1: which fsck service log is most recent
86 * 0 => no service log data yet
87 * 1 => the first one
88 * 2 => the 2nd one
89 */
90 char s_fpack[11]; /* 11: file system volume name
91 * N.B. This must be 11 bytes to
92 * conform with the OS/2 BootSector
93 * requirements
94 * Only used when s_version is 1
95 */
96
97 /* extendfs() parameter under s_state & FM_EXTENDFS */
98 __le64 s_xsize; /* 8: extendfs s_size */
99 pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */
100 pxd_t s_xlogpxd; /* 8: extendfs logpxd */
101 /* - 128 byte boundary - */
102
103 char s_uuid[16]; /* 16: 128-bit uuid for volume */
104 char s_label[16]; /* 16: volume label */
105 char s_loguuid[16]; /* 16: 128-bit uuid for log device */
106
107};
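/*
 * All multi-byte fields above are little-endian on disk; in-core code
 * must go through the le*_to_cpu()/cpu_to_le*() helpers, e.g.
 *
 *	s32 bsize = le32_to_cpu(j_sb->s_bsize);
 *	j_sb->s_state = cpu_to_le32(FM_CLEAN);
 *
 * as chkSuper() and updateSuper() in jfs_mount.c do.
 */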
108
109extern int readSuper(struct super_block *, struct buffer_head **);
110extern int updateSuper(struct super_block *, uint);
111extern void jfs_error(struct super_block *, const char *, ...);
112
113#endif /*_H_JFS_SUPERBLOCK */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
new file mode 100644
index 000000000000..f40301d93f74
--- /dev/null
+++ b/fs/jfs/jfs_txnmgr.c
@@ -0,0 +1,3131 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2005
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20/*
21 * jfs_txnmgr.c: transaction manager
22 *
23 * notes:
24 * transaction starts with txBegin() and ends with txCommit()
25 * or txAbort().
26 *
27 * tlock is acquired at the time of update;
28 * (obviate scan at commit time for xtree and dtree)
29 * tlock and mp points to each other;
30 * (no hashlist for mp -> tlock).
31 *
32 * special cases:
33 * tlock on in-memory inode:
34 * in-place tlock in the in-memory inode itself;
35 * converted to page lock by iWrite() at commit time.
36 *
37 * tlock during write()/mmap() under anonymous transaction (tid = 0):
38 * transferred (?) to transaction at commit time.
39 *
40 * use the page itself to update allocation maps
41 * (obviate intermediate replication of allocation/deallocation data)
42 * hold on to mp+lock thru update of maps
43 */
44
45
46#include <linux/fs.h>
47#include <linux/vmalloc.h>
48#include <linux/smp_lock.h>
49#include <linux/completion.h>
50#include <linux/suspend.h>
51#include <linux/module.h>
52#include <linux/moduleparam.h>
53#include "jfs_incore.h"
54#include "jfs_filsys.h"
55#include "jfs_metapage.h"
56#include "jfs_dinode.h"
57#include "jfs_imap.h"
58#include "jfs_dmap.h"
59#include "jfs_superblock.h"
60#include "jfs_debug.h"
61
62/*
63 * transaction management structures
64 */
65static struct {
66 int freetid; /* index of a free tid structure */
67 int freelock; /* index first free lock word */
68 wait_queue_head_t freewait; /* eventlist of free tblock */
69 wait_queue_head_t freelockwait; /* eventlist of free tlock */
70 wait_queue_head_t lowlockwait; /* eventlist of ample tlocks */
71 int tlocksInUse; /* Number of tlocks in use */
72 spinlock_t LazyLock; /* synchronize sync_queue & unlock_queue */
73/* struct tblock *sync_queue; * Transactions waiting for data sync */
74 struct list_head unlock_queue; /* Txns waiting to be released */
75 struct list_head anon_list; /* inodes having anonymous txns */
76 struct list_head anon_list2; /* inodes having anonymous txns
77 that couldn't be sync'ed */
78} TxAnchor;
79
80int jfs_tlocks_low; /* Indicates low number of available tlocks */
81
82#ifdef CONFIG_JFS_STATISTICS
83static struct {
84 uint txBegin;
85 uint txBegin_barrier;
86 uint txBegin_lockslow;
87 uint txBegin_freetid;
88 uint txBeginAnon;
89 uint txBeginAnon_barrier;
90 uint txBeginAnon_lockslow;
91 uint txLockAlloc;
92 uint txLockAlloc_freelock;
93} TxStat;
94#endif
95
96static int nTxBlock = -1; /* number of transaction blocks */
97module_param(nTxBlock, int, 0);
98MODULE_PARM_DESC(nTxBlock,
99 "Number of transaction blocks (max:65536)");
100
101static int nTxLock = -1; /* number of transaction locks */
102module_param(nTxLock, int, 0);
103MODULE_PARM_DESC(nTxLock,
104 "Number of transaction locks (max:65536)");
105
106struct tblock *TxBlock; /* transaction block table */
107static int TxLockLWM; /* Low water mark for number of txLocks used */
108static int TxLockHWM; /* High water mark for number of txLocks used */
109static int TxLockVHWM; /* Very High water mark */
110struct tlock *TxLock; /* transaction lock table */
111
112
113/*
114 * transaction management lock
115 */
116static DEFINE_SPINLOCK(jfsTxnLock);
117
118#define TXN_LOCK() spin_lock(&jfsTxnLock)
119#define TXN_UNLOCK() spin_unlock(&jfsTxnLock)
120
121#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock);
122#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags)
123#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)
124
125DECLARE_WAIT_QUEUE_HEAD(jfs_sync_thread_wait);
126DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
127static int jfs_commit_thread_waking;
128
129/*
130 * Retry logic exists outside these macros to protect against spurious wakeups.
131 */
132static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
133{
134 DECLARE_WAITQUEUE(wait, current);
135
136 add_wait_queue(event, &wait);
137 set_current_state(TASK_UNINTERRUPTIBLE);
138 TXN_UNLOCK();
139 schedule();
140 current->state = TASK_RUNNING;
141 remove_wait_queue(event, &wait);
142}
143
144#define TXN_SLEEP(event)\
145{\
146 TXN_SLEEP_DROP_LOCK(event);\
147 TXN_LOCK();\
148}
149
150#define TXN_WAKEUP(event) wake_up_all(event)
151
152
153/*
154 * statistics
155 */
156static struct {
157 tid_t maxtid; /* 4: biggest tid ever used */
158 lid_t maxlid; /* 4: biggest lid ever used */
159 int ntid; /* 4: # of transactions performed */
160 int nlid; /* 4: # of tlocks acquired */
161 int waitlock; /* 4: # of tlock wait */
162} stattx;
163
164
165/*
166 * external references
167 */
168extern int lmGroupCommit(struct jfs_log *, struct tblock *);
169extern int jfs_commit_inode(struct inode *, int);
170extern int jfs_stop_threads;
171
172extern struct completion jfsIOwait;
173
174/*
175 * forward references
176 */
177static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
178 struct tlock * tlck, struct commit * cd);
179static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
180 struct tlock * tlck);
181static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
182 struct tlock * tlck);
183static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
184 struct tlock * tlck);
185static void txAllocPMap(struct inode *ip, struct maplock * maplock,
186 struct tblock * tblk);
187static void txForce(struct tblock * tblk);
188static int txLog(struct jfs_log * log, struct tblock * tblk,
189 struct commit * cd);
190static void txUpdateMap(struct tblock * tblk);
191static void txRelease(struct tblock * tblk);
192static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
193 struct tlock * tlck);
194static void LogSyncRelease(struct metapage * mp);
195
196/*
197 * transaction block/lock management
198 * ---------------------------------
199 */
200
201/*
202 * Get a transaction lock from the free list. If the number in use is
203 * greater than the high water mark, wake up the sync daemon. This should
204 * free some anonymous transaction locks. (TXN_LOCK must be held.)
205 */
206static lid_t txLockAlloc(void)
207{
208 lid_t lid;
209
210 INCREMENT(TxStat.txLockAlloc);
211 if (!TxAnchor.freelock) {
212 INCREMENT(TxStat.txLockAlloc_freelock);
213 }
214
215 while (!(lid = TxAnchor.freelock))
216 TXN_SLEEP(&TxAnchor.freelockwait);
217 TxAnchor.freelock = TxLock[lid].next;
218 HIGHWATERMARK(stattx.maxlid, lid);
219 if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) {
220 jfs_info("txLockAlloc tlocks low");
221 jfs_tlocks_low = 1;
222 wake_up(&jfs_sync_thread_wait);
223 }
224
225 return lid;
226}
227
228static void txLockFree(lid_t lid)
229{
230 TxLock[lid].next = TxAnchor.freelock;
231 TxAnchor.freelock = lid;
232 TxAnchor.tlocksInUse--;
233 if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) {
234 jfs_info("txLockFree jfs_tlocks_low no more");
235 jfs_tlocks_low = 0;
236 TXN_WAKEUP(&TxAnchor.lowlockwait);
237 }
238 TXN_WAKEUP(&TxAnchor.freelockwait);
239}
240
241/*
242 * NAME: txInit()
243 *
244 * FUNCTION: initialize transaction management structures
245 *
246 * RETURN:
247 *
248 * serialization: single thread at jfs_init()
249 */
250int txInit(void)
251{
252 int k, size;
253 struct sysinfo si;
254
255 /* Set defaults for nTxLock and nTxBlock if unset */
256
257 if (nTxLock == -1) {
258 if (nTxBlock == -1) {
259 /* Base default on memory size */
260 si_meminfo(&si);
261 if (si.totalram > (256 * 1024)) /* 1 GB */
262 nTxLock = 64 * 1024;
263 else
264 nTxLock = si.totalram >> 2;
265 } else if (nTxBlock > (8 * 1024))
266 nTxLock = 64 * 1024;
267 else
268 nTxLock = nTxBlock << 3;
269 }
270 if (nTxBlock == -1)
271 nTxBlock = nTxLock >> 3;
272
273 /* Verify tunable parameters */
274 if (nTxBlock < 16)
275 nTxBlock = 16; /* No one should set it this low */
276 if (nTxBlock > 65536)
277 nTxBlock = 65536;
278 if (nTxLock < 256)
279 nTxLock = 256; /* No one should set it this low */
280 if (nTxLock > 65536)
281 nTxLock = 65536;
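	/*
	 * Worked example of the defaults (illustrative, assuming 4K
	 * pages): a 512 MB machine has si.totalram = 128K pages, which
	 * is <= 256K, so nTxLock defaults to 128K >> 2 = 32768 and
	 * nTxBlock to 32768 >> 3 = 4096; above 1 GB both saturate at
	 * 64K and 8K respectively, values the clamps above leave
	 * unchanged.
	 */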
282
283 printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n",
284 nTxBlock, nTxLock);
285 /*
286 * initialize transaction block (tblock) table
287 *
288 * transaction id (tid) = tblock index
289 * tid = 0 is reserved.
290 */
291 TxLockLWM = (nTxLock * 4) / 10;
292 TxLockHWM = (nTxLock * 7) / 10;
293 TxLockVHWM = (nTxLock * 8) / 10;
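	/*
	 * e.g. with nTxLock = 65536 these come out to LWM = 26214,
	 * HWM = 45875 and VHWM = 52428: txLockAlloc() wakes the sync
	 * thread above ~70% usage, and txBegin()/txBeginAnon() stall
	 * non-forced transactions above ~80%.
	 */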
294
295 size = sizeof(struct tblock) * nTxBlock;
296 TxBlock = (struct tblock *) vmalloc(size);
297 if (TxBlock == NULL)
298 return -ENOMEM;
299
300 for (k = 1; k < nTxBlock - 1; k++) {
301 TxBlock[k].next = k + 1;
302 init_waitqueue_head(&TxBlock[k].gcwait);
303 init_waitqueue_head(&TxBlock[k].waitor);
304 }
305 TxBlock[k].next = 0;
306 init_waitqueue_head(&TxBlock[k].gcwait);
307 init_waitqueue_head(&TxBlock[k].waitor);
308
309 TxAnchor.freetid = 1;
310 init_waitqueue_head(&TxAnchor.freewait);
311
312 stattx.maxtid = 1; /* statistics */
313
314 /*
315 * initialize transaction lock (tlock) table
316 *
317 * transaction lock id = tlock index
318 * tlock id = 0 is reserved.
319 */
320 size = sizeof(struct tlock) * nTxLock;
321 TxLock = (struct tlock *) vmalloc(size);
322 if (TxLock == NULL) {
323 vfree(TxBlock);
324 return -ENOMEM;
325 }
326
327 /* initialize tlock table */
328 for (k = 1; k < nTxLock - 1; k++)
329 TxLock[k].next = k + 1;
330 TxLock[k].next = 0;
331 init_waitqueue_head(&TxAnchor.freelockwait);
332 init_waitqueue_head(&TxAnchor.lowlockwait);
333
334 TxAnchor.freelock = 1;
335 TxAnchor.tlocksInUse = 0;
336 INIT_LIST_HEAD(&TxAnchor.anon_list);
337 INIT_LIST_HEAD(&TxAnchor.anon_list2);
338
339 LAZY_LOCK_INIT();
340 INIT_LIST_HEAD(&TxAnchor.unlock_queue);
341
342 stattx.maxlid = 1; /* statistics */
343
344 return 0;
345}
346
347/*
348 * NAME: txExit()
349 *
350 * FUNCTION: clean up when module is unloaded
351 */
352void txExit(void)
353{
354 vfree(TxLock);
355 TxLock = NULL;
356 vfree(TxBlock);
357 TxBlock = NULL;
358}
359
360
361/*
362 * NAME: txBegin()
363 *
364 * FUNCTION: start a transaction.
365 *
366 * PARAMETER: sb - superblock
367 * flag - force for nested tx;
368 *
369 * RETURN: tid - transaction id
370 *
371 * note: the force flag allows starting a tx for a nested tx
372 * to prevent deadlock on the logsync barrier;
373 */
374tid_t txBegin(struct super_block *sb, int flag)
375{
376 tid_t t;
377 struct tblock *tblk;
378 struct jfs_log *log;
379
380 jfs_info("txBegin: flag = 0x%x", flag);
381 log = JFS_SBI(sb)->log;
382
383 TXN_LOCK();
384
385 INCREMENT(TxStat.txBegin);
386
387 retry:
388 if (!(flag & COMMIT_FORCE)) {
389 /*
390 * synchronize with logsync barrier
391 */
392 if (test_bit(log_SYNCBARRIER, &log->flag) ||
393 test_bit(log_QUIESCE, &log->flag)) {
394 INCREMENT(TxStat.txBegin_barrier);
395 TXN_SLEEP(&log->syncwait);
396 goto retry;
397 }
398 }
399 if (flag == 0) {
400 /*
401 * Don't begin transaction if we're getting starved for tlocks
402 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
403 * free tlocks)
404 */
405 if (TxAnchor.tlocksInUse > TxLockVHWM) {
406 INCREMENT(TxStat.txBegin_lockslow);
407 TXN_SLEEP(&TxAnchor.lowlockwait);
408 goto retry;
409 }
410 }
411
412 /*
413 * allocate transaction id/block
414 */
415 if ((t = TxAnchor.freetid) == 0) {
416 jfs_info("txBegin: waiting for free tid");
417 INCREMENT(TxStat.txBegin_freetid);
418 TXN_SLEEP(&TxAnchor.freewait);
419 goto retry;
420 }
421
422 tblk = tid_to_tblock(t);
423
424 if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) {
425 /* Don't let a non-forced transaction take the last tblk */
426 jfs_info("txBegin: waiting for free tid");
427 INCREMENT(TxStat.txBegin_freetid);
428 TXN_SLEEP(&TxAnchor.freewait);
429 goto retry;
430 }
431
432 TxAnchor.freetid = tblk->next;
433
434 /*
435 * initialize transaction
436 */
437
438 /*
439 * We can't zero the whole thing or we screw up another thread being
440 * awakened after sleeping on tblk->waitor
441 *
442 * memset(tblk, 0, sizeof(struct tblock));
443 */
444 tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;
445
446 tblk->sb = sb;
447 ++log->logtid;
448 tblk->logtid = log->logtid;
449
450 ++log->active;
451
452 HIGHWATERMARK(stattx.maxtid, t); /* statistics */
453 INCREMENT(stattx.ntid); /* statistics */
454
455 TXN_UNLOCK();
456
457 jfs_info("txBegin: returning tid = %d", t);
458
459 return t;
460}
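/*
 * Typical caller pattern (a sketch for orientation, not code from this
 * file; see the directory operations in namei.c for real examples):
 *
 *	tid_t tid = txBegin(ip->i_sb, 0);
 *	struct tlock *tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
 *	... modify the page under the tlock ...
 *	rc = txCommit(tid, 1, &ip, 0);
 *	txEnd(tid);
 */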
461
462
463/*
464 * NAME: txBeginAnon()
465 *
466 * FUNCTION: start an anonymous transaction.
467 * Blocks if logsync or available tlocks are low to prevent
468 * anonymous tlocks from depleting supply.
469 *
470 * PARAMETER: sb - superblock
471 *
472 * RETURN: none
473 */
474void txBeginAnon(struct super_block *sb)
475{
476 struct jfs_log *log;
477
478 log = JFS_SBI(sb)->log;
479
480 TXN_LOCK();
481 INCREMENT(TxStat.txBeginAnon);
482
483 retry:
484 /*
485 * synchronize with logsync barrier
486 */
487 if (test_bit(log_SYNCBARRIER, &log->flag) ||
488 test_bit(log_QUIESCE, &log->flag)) {
489 INCREMENT(TxStat.txBeginAnon_barrier);
490 TXN_SLEEP(&log->syncwait);
491 goto retry;
492 }
493
494 /*
495 * Don't begin transaction if we're getting starved for tlocks
496 */
497 if (TxAnchor.tlocksInUse > TxLockVHWM) {
498 INCREMENT(TxStat.txBeginAnon_lockslow);
499 TXN_SLEEP(&TxAnchor.lowlockwait);
500 goto retry;
501 }
502 TXN_UNLOCK();
503}
504
505
506/*
507 * txEnd()
508 *
509 * function: free specified transaction block.
510 *
511 * logsync barrier processing:
512 *
513 * serialization:
514 */
515void txEnd(tid_t tid)
516{
517 struct tblock *tblk = tid_to_tblock(tid);
518 struct jfs_log *log;
519
520 jfs_info("txEnd: tid = %d", tid);
521 TXN_LOCK();
522
523 /*
524 * wakeup transactions waiting on the page locked
525 * by the current transaction
526 */
527 TXN_WAKEUP(&tblk->waitor);
528
529 log = JFS_SBI(tblk->sb)->log;
530
531 /*
532 * Lazy commit thread can't free this guy until we mark it UNLOCKED,
533 * otherwise, we would be left with a transaction that may have been
534 * reused.
535 *
536 * Lazy commit thread will turn off tblkGC_LAZY before calling this
537 * routine.
538 */
539 if (tblk->flag & tblkGC_LAZY) {
540 jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk);
541 TXN_UNLOCK();
542
543 spin_lock_irq(&log->gclock); // LOGGC_LOCK
544 tblk->flag |= tblkGC_UNLOCKED;
545 spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK
546 return;
547 }
548
549 jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk);
550
551 assert(tblk->next == 0);
552
553 /*
554 * insert tblock back on freelist
555 */
556 tblk->next = TxAnchor.freetid;
557 TxAnchor.freetid = tid;
558
559 /*
560 * mark the tblock not active
561 */
562 if (--log->active == 0) {
563 clear_bit(log_FLUSH, &log->flag);
564
565 /*
566 * synchronize with logsync barrier
567 */
568 if (test_bit(log_SYNCBARRIER, &log->flag)) {
569 /* forward log syncpt */
570 /* lmSync(log); */
571
572 jfs_info("log barrier off: 0x%x", log->lsn);
573
574 /* enable new transactions start */
575 clear_bit(log_SYNCBARRIER, &log->flag);
576
577 /* wakeup all waitors for logsync barrier */
578 TXN_WAKEUP(&log->syncwait);
579 }
580 }
581
582 /*
583 * wakeup all waitors for a free tblock
584 */
585 TXN_WAKEUP(&TxAnchor.freewait);
586
587 TXN_UNLOCK();
588}
589
590
591/*
592 * txLock()
593 *
594 * function: acquire a transaction lock on the specified <mp>
595 *
596 * parameter:
597 *
598 * return: transaction lock id
599 *
600 * serialization:
601 */
602struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
603 int type)
604{
605 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
606 int dir_xtree = 0;
607 lid_t lid;
608 tid_t xtid;
609 struct tlock *tlck;
610 struct xtlock *xtlck;
611 struct linelock *linelock;
612 xtpage_t *p;
613 struct tblock *tblk;
614
615 TXN_LOCK();
616
617 if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
618 !(mp->xflag & COMMIT_PAGE)) {
619 /*
620 * Directory inode is special. It can have both an xtree tlock
621 * and a dtree tlock associated with it.
622 */
623 dir_xtree = 1;
624 lid = jfs_ip->xtlid;
625 } else
626 lid = mp->lid;
627
628 /* is page not locked by a transaction ? */
629 if (lid == 0)
630 goto allocateLock;
631
632 jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid);
633
634 /* is page locked by the requester transaction ? */
635 tlck = lid_to_tlock(lid);
636 if ((xtid = tlck->tid) == tid)
637 goto grantLock;
638
639 /*
640 * is page locked by anonymous transaction/lock ?
641 *
642 * (page update without transaction (i.e., file write) is
643 * locked under anonymous transaction tid = 0:
644 * anonymous tlocks maintained on anonymous tlock list of
645 * the inode of the page and available to all anonymous
646 * transactions until txCommit() time at which point
647 * they are transferred to the transaction tlock list of
648 * the committing transaction of the inode)
649 */
650 if (xtid == 0) {
651 tlck->tid = tid;
652 tblk = tid_to_tblock(tid);
653 /*
654 * The order of the tlocks in the transaction is important
655 * (during truncate, child xtree pages must be freed before
656 * parent's tlocks change the working map).
657 * Take tlock off anonymous list and add to tail of
658 * transaction list
659 *
660 * Note: We really need to get rid of the tid & lid and
661 * use list_head's. This code is getting UGLY!
662 */
663 if (jfs_ip->atlhead == lid) {
664 if (jfs_ip->atltail == lid) {
665 /* only anonymous txn.
666 * Remove from anon_list
667 */
668 list_del_init(&jfs_ip->anon_inode_list);
669 }
670 jfs_ip->atlhead = tlck->next;
671 } else {
672 lid_t last;
673 for (last = jfs_ip->atlhead;
674 lid_to_tlock(last)->next != lid;
675 last = lid_to_tlock(last)->next) {
676 assert(last);
677 }
678 lid_to_tlock(last)->next = tlck->next;
679 if (jfs_ip->atltail == lid)
680 jfs_ip->atltail = last;
681 }
682
683 /* insert the tlock at tail of transaction tlock list */
684
685 if (tblk->next)
686 lid_to_tlock(tblk->last)->next = lid;
687 else
688 tblk->next = lid;
689 tlck->next = 0;
690 tblk->last = lid;
691
692 goto grantLock;
693 }
694
695 goto waitLock;
696
697 /*
698 * allocate a tlock
699 */
700 allocateLock:
701 lid = txLockAlloc();
702 tlck = lid_to_tlock(lid);
703
704 /*
705 * initialize tlock
706 */
707 tlck->tid = tid;
708
709 /* mark tlock for meta-data page */
710 if (mp->xflag & COMMIT_PAGE) {
711
712 tlck->flag = tlckPAGELOCK;
713
714 /* mark the page dirty and nohomeok */
715 mark_metapage_dirty(mp);
716 atomic_inc(&mp->nohomeok);
717
718 jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
719 mp, atomic_read(&mp->nohomeok), tid, tlck);
720
721 /* if anonymous transaction, and buffer is on the group
722 * commit synclist, mark inode to show this. This will
723 * prevent the buffer from being marked nohomeok for too
724 * long a time.
725 */
726 if ((tid == 0) && mp->lsn)
727 set_cflag(COMMIT_Synclist, ip);
728 }
729 /* mark tlock for in-memory inode */
730 else
731 tlck->flag = tlckINODELOCK;
732
733 tlck->type = 0;
734
735 /* bind the tlock and the page */
736 tlck->ip = ip;
737 tlck->mp = mp;
738 if (dir_xtree)
739 jfs_ip->xtlid = lid;
740 else
741 mp->lid = lid;
742
743 /*
744 * enqueue transaction lock to transaction/inode
745 */
746 /* insert the tlock at tail of transaction tlock list */
747 if (tid) {
748 tblk = tid_to_tblock(tid);
749 if (tblk->next)
750 lid_to_tlock(tblk->last)->next = lid;
751 else
752 tblk->next = lid;
753 tlck->next = 0;
754 tblk->last = lid;
755 }
756 /* anonymous transaction:
757 * insert the tlock at head of inode anonymous tlock list
758 */
759 else {
760 tlck->next = jfs_ip->atlhead;
761 jfs_ip->atlhead = lid;
762 if (tlck->next == 0) {
763 /* This inode's first anonymous transaction */
764 jfs_ip->atltail = lid;
765 list_add_tail(&jfs_ip->anon_inode_list,
766 &TxAnchor.anon_list);
767 }
768 }
769
770 /* initialize type dependent area for linelock */
771 linelock = (struct linelock *) & tlck->lock;
772 linelock->next = 0;
773 linelock->flag = tlckLINELOCK;
774 linelock->maxcnt = TLOCKSHORT;
775 linelock->index = 0;
776
777 switch (type & tlckTYPE) {
778 case tlckDTREE:
779 linelock->l2linesize = L2DTSLOTSIZE;
780 break;
781
782 case tlckXTREE:
783 linelock->l2linesize = L2XTSLOTSIZE;
784
785 xtlck = (struct xtlock *) linelock;
786 xtlck->header.offset = 0;
787 xtlck->header.length = 2;
788
789 if (type & tlckNEW) {
790 xtlck->lwm.offset = XTENTRYSTART;
791 } else {
792 if (mp->xflag & COMMIT_PAGE)
793 p = (xtpage_t *) mp->data;
794 else
795 p = &jfs_ip->i_xtroot;
796 xtlck->lwm.offset =
797 le16_to_cpu(p->header.nextindex);
798 }
799 xtlck->lwm.length = 0; /* ! */
800 xtlck->twm.offset = 0;
801 xtlck->hwm.offset = 0;
802
803 xtlck->index = 2;
804 break;
805
806 case tlckINODE:
807 linelock->l2linesize = L2INODESLOTSIZE;
808 break;
809
810 case tlckDATA:
811 linelock->l2linesize = L2DATASLOTSIZE;
812 break;
813
814 default:
815 jfs_err("UFO tlock:0x%p", tlck);
816 }
817
818 /*
819 * update tlock vector
820 */
821 grantLock:
822 tlck->type |= type;
823
824 TXN_UNLOCK();
825
826 return tlck;
827
828 /*
829 * page is being locked by another transaction:
830 */
831 waitLock:
832 /* Only locks on ipimap or ipaimap should reach here */
833 /* assert(jfs_ip->fileset == AGGREGATE_I); */
834 if (jfs_ip->fileset != AGGREGATE_I) {
835 jfs_err("txLock: trying to lock locked page!");
836 dump_mem("ip", ip, sizeof(struct inode));
837 dump_mem("mp", mp, sizeof(struct metapage));
838 dump_mem("Locker's tblk", tid_to_tblock(tid),
839 sizeof(struct tblock));
840 dump_mem("Tlock", tlck, sizeof(struct tlock));
841 BUG();
842 }
843 INCREMENT(stattx.waitlock); /* statistics */
844 release_metapage(mp);
845
846 jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
847 tid, xtid, lid);
848 TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);
849 jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid);
850
851 return NULL;
852}
853
854
855/*
856 * NAME: txRelease()
857 *
858 * FUNCTION: Release buffers associated with transaction locks, but don't
859 * mark homeok yet. The allows other transactions to modify
860 * buffers, but won't let them go to disk until commit record
861 * actually gets written.
862 *
863 * PARAMETER:
864 * tblk -
865 *
866 * RETURN: Errors from subroutines.
867 */
868static void txRelease(struct tblock * tblk)
869{
870 struct metapage *mp;
871 lid_t lid;
872 struct tlock *tlck;
873
874 TXN_LOCK();
875
876 for (lid = tblk->next; lid; lid = tlck->next) {
877 tlck = lid_to_tlock(lid);
878 if ((mp = tlck->mp) != NULL &&
879 (tlck->type & tlckBTROOT) == 0) {
880 assert(mp->xflag & COMMIT_PAGE);
881 mp->lid = 0;
882 }
883 }
884
885 /*
886 * wakeup transactions waiting on a page locked
887 * by the current transaction
888 */
889 TXN_WAKEUP(&tblk->waitor);
890
891 TXN_UNLOCK();
892}
893
894
895/*
896 * NAME: txUnlock()
897 *
898 * FUNCTION: Initiates pageout of pages modified by tid in journalled
899 * objects and frees their lockwords.
900 */
901static void txUnlock(struct tblock * tblk)
902{
903 struct tlock *tlck;
904 struct linelock *linelock;
905 lid_t lid, next, llid, k;
906 struct metapage *mp;
907 struct jfs_log *log;
908 int difft, diffp;
909
910 jfs_info("txUnlock: tblk = 0x%p", tblk);
911 log = JFS_SBI(tblk->sb)->log;
912
913 /*
914 * mark page under tlock homeok (its log has been written):
915 */
916 for (lid = tblk->next; lid; lid = next) {
917 tlck = lid_to_tlock(lid);
918 next = tlck->next;
919
920 jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck);
921
922 /* unbind page from tlock */
923 if ((mp = tlck->mp) != NULL &&
924 (tlck->type & tlckBTROOT) == 0) {
925 assert(mp->xflag & COMMIT_PAGE);
926
927 /* hold buffer
928 *
929 * It's possible that someone else has the metapage.
930 * The only things we're changing are nohomeok, which
931 * is handled atomically, and clsn which is protected
932 * by the LOGSYNC_LOCK.
933 */
934 hold_metapage(mp, 1);
935
936 assert(atomic_read(&mp->nohomeok) > 0);
937 atomic_dec(&mp->nohomeok);
938
939 /* inherit younger/larger clsn */
940 LOGSYNC_LOCK(log);
941 if (mp->clsn) {
942 logdiff(difft, tblk->clsn, log);
943 logdiff(diffp, mp->clsn, log);
944 if (difft > diffp)
945 mp->clsn = tblk->clsn;
946 } else
947 mp->clsn = tblk->clsn;
948 LOGSYNC_UNLOCK(log);
949
950 assert(!(tlck->flag & tlckFREEPAGE));
951
952 if (tlck->flag & tlckWRITEPAGE) {
953 write_metapage(mp);
954 } else {
955 /* release page which has been forced */
956 release_metapage(mp);
957 }
958 }
959
960 /* insert tlock, and linelock(s) of the tlock if any,
961 * at head of freelist
962 */
963 TXN_LOCK();
964
965 llid = ((struct linelock *) & tlck->lock)->next;
966 while (llid) {
967 linelock = (struct linelock *) lid_to_tlock(llid);
968 k = linelock->next;
969 txLockFree(llid);
970 llid = k;
971 }
972 txLockFree(lid);
973
974 TXN_UNLOCK();
975 }
976 tblk->next = tblk->last = 0;
977
978 /*
979 * remove tblock from logsynclist
980 * (allocation map pages inherited the lsn of the tblk and
981 * have been inserted in the logsync list at txUpdateMap())
982 */
983 if (tblk->lsn) {
984 LOGSYNC_LOCK(log);
985 log->count--;
986 list_del(&tblk->synclist);
987 LOGSYNC_UNLOCK(log);
988 }
989}
990
991
992/*
993 * txMaplock()
994 *
995 * function: allocate a transaction lock for freed page/entry;
996 * for freed page, maplock is used as xtlock/dtlock type;
997 */
998struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
999{
1000 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
1001 lid_t lid;
1002 struct tblock *tblk;
1003 struct tlock *tlck;
1004 struct maplock *maplock;
1005
1006 TXN_LOCK();
1007
1008 /*
1009 * allocate a tlock
1010 */
1011 lid = txLockAlloc();
1012 tlck = lid_to_tlock(lid);
1013
1014 /*
1015 * initialize tlock
1016 */
1017 tlck->tid = tid;
1018
1019 /* bind the tlock and the object */
1020 tlck->flag = tlckINODELOCK;
1021 tlck->ip = ip;
1022 tlck->mp = NULL;
1023
1024 tlck->type = type;
1025
1026 /*
1027 * enqueue transaction lock to transaction/inode
1028 */
1029 /* insert the tlock at tail of transaction tlock list */
1030 if (tid) {
1031 tblk = tid_to_tblock(tid);
1032 if (tblk->next)
1033 lid_to_tlock(tblk->last)->next = lid;
1034 else
1035 tblk->next = lid;
1036 tlck->next = 0;
1037 tblk->last = lid;
1038 }
1039 /* anonymous transaction:
1040 * insert the tlock at head of inode anonymous tlock list
1041 */
1042 else {
1043 tlck->next = jfs_ip->atlhead;
1044 jfs_ip->atlhead = lid;
1045 if (tlck->next == 0) {
1046 /* This inode's first anonymous transaction */
1047 jfs_ip->atltail = lid;
1048 list_add_tail(&jfs_ip->anon_inode_list,
1049 &TxAnchor.anon_list);
1050 }
1051 }
1052
1053 TXN_UNLOCK();
1054
1055 /* initialize type dependent area for maplock */
1056 maplock = (struct maplock *) & tlck->lock;
1057 maplock->next = 0;
1058 maplock->maxcnt = 0;
1059 maplock->index = 0;
1060
1061 return tlck;
1062}
1063
1064
1065/*
1066 * txLinelock()
1067 *
1068 * function: allocate a transaction lock for log vector list
1069 */
1070struct linelock *txLinelock(struct linelock * tlock)
1071{
1072 lid_t lid;
1073 struct tlock *tlck;
1074 struct linelock *linelock;
1075
1076 TXN_LOCK();
1077
1078 /* allocate a TxLock structure */
1079 lid = txLockAlloc();
1080 tlck = lid_to_tlock(lid);
1081
1082 TXN_UNLOCK();
1083
1084 /* initialize linelock */
1085 linelock = (struct linelock *) tlck;
1086 linelock->next = 0;
1087 linelock->flag = tlckLINELOCK;
1088 linelock->maxcnt = TLOCKLONG;
1089 linelock->index = 0;
1090
1091 /* append linelock after tlock */
1092 linelock->next = tlock->next;
1093 tlock->next = lid;
1094
1095 return linelock;
1096}
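/*
 * txLinelock() is the overflow path: when a tlock's in-line linelock
 * (TLOCKSHORT slots) fills up, a spare tlock is recycled as an extra
 * linelock (TLOCKLONG slots) and chained behind the original via its
 * next field, which is how the dtree/xtree code extends a log vector
 * list past the in-line capacity.
 */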
1097
1098
1099
1100/*
1101 * transaction commit management
1102 * -----------------------------
1103 */
1104
1105/*
1106 * NAME: txCommit()
1107 *
1108 * FUNCTION: commit the changes to the objects specified in
1109 * clist. For journalled segments only the
1110 * changes of the caller are committed, i.e., by tid.
1111 * for non-journalled segments the data are flushed to
1112 * disk and then the change to the disk inode and indirect
1113 * blocks committed (so blocks newly allocated to the
1114 * segment will be made a part of the segment atomically).
1115 *
1116 * all of the segments specified in clist must be in
1117 * one file system. no more than 6 segments are needed
1118 * to handle all unix svcs.
1119 *
1120 * if the i_nlink field (i.e. disk inode link count)
1121 * is zero, and the type of inode is a regular file or
1122 * directory, or symbolic link, the inode is truncated
1123 * to zero length. the truncation is committed but the
1124 * VM resources are unaffected until it is closed (see
1125 * iput and iclose).
1126 *
1127 * PARAMETER:
1128 *
1129 * RETURN:
1130 *
1131 * serialization:
1132 * on entry the inode lock on each segment is assumed
1133 * to be held.
1134 *
1135 * i/o error:
1136 */
1137int txCommit(tid_t tid, /* transaction identifier */
1138 int nip, /* number of inodes to commit */
1139 struct inode **iplist, /* list of inode to commit */
1140 int flag)
1141{
1142 int rc = 0;
1143 struct commit cd;
1144 struct jfs_log *log;
1145 struct tblock *tblk;
1146 struct lrd *lrd;
1147 int lsn;
1148 struct inode *ip;
1149 struct jfs_inode_info *jfs_ip;
1150 int k, n;
1151 ino_t top;
1152 struct super_block *sb;
1153
1154 jfs_info("txCommit, tid = %d, flag = %d", tid, flag);
1155 /* is read-only file system ? */
1156 if (isReadOnly(iplist[0])) {
1157 rc = -EROFS;
1158 goto TheEnd;
1159 }
1160
1161 sb = cd.sb = iplist[0]->i_sb;
1162 cd.tid = tid;
1163
1164 if (tid == 0)
1165 tid = txBegin(sb, 0);
1166 tblk = tid_to_tblock(tid);
1167
1168 /*
1169 * initialize commit structure
1170 */
1171 log = JFS_SBI(sb)->log;
1172 cd.log = log;
1173
1174 /* initialize log record descriptor in commit */
1175 lrd = &cd.lrd;
1176 lrd->logtid = cpu_to_le32(tblk->logtid);
1177 lrd->backchain = 0;
1178
1179 tblk->xflag |= flag;
1180
1181 if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
1182 tblk->xflag |= COMMIT_LAZY;
1183 /*
1184 * prepare non-journaled objects for commit
1185 *
1186 * flush data pages of non-journaled file
1187 * to prevent the file from getting uninitialized disk blocks
1188 * in case of crash.
1189 * (new blocks - )
1190 */
1191 cd.iplist = iplist;
1192 cd.nip = nip;
1193
1194 /*
1195 * acquire transaction lock on (on-disk) inodes
1196 *
1197 * update on-disk inode from in-memory inode
1198 * acquiring transaction locks for AFTER records
1199 * on the on-disk inode of file object
1200 *
1201 * sort the inodes array by inode number in descending order
1202 * to prevent deadlock when acquiring transaction lock
1203 * of on-disk inodes on multiple on-disk inode pages by
1204 * multiple concurrent transactions
1205 */
1206 for (k = 0; k < cd.nip; k++) {
1207 top = (cd.iplist[k])->i_ino;
1208 for (n = k + 1; n < cd.nip; n++) {
1209 ip = cd.iplist[n];
1210 if (ip->i_ino > top) {
1211 top = ip->i_ino;
1212 cd.iplist[n] = cd.iplist[k];
1213 cd.iplist[k] = ip;
1214 }
1215 }
1216
1217 ip = cd.iplist[k];
1218 jfs_ip = JFS_IP(ip);
1219
1220 /*
1221 * BUGBUG - This code has temporarily been removed. The
1222 * intent is to ensure that any file data is written before
1223 * the metadata is committed to the journal. This prevents
1224 * uninitialized data from appearing in a file after the
1225 * journal has been replayed. (The uninitialized data
1226 * could be sensitive data removed by another user.)
1227 *
1228 * The problem now is that we are holding the IWRITELOCK
1229 * on the inode, and calling filemap_fdatawrite on an
1230 * unmapped page will cause a deadlock in jfs_get_block.
1231 *
1232 * The long term solution is to pare down the use of
1233 * IWRITELOCK. We are currently holding it too long.
1234 * We could also be smarter about which data pages need
1235 * to be written before the transaction is committed and
1236 * when we don't need to worry about it at all.
1237 *
1238 * if ((!S_ISDIR(ip->i_mode))
1239 * && (tblk->flag & COMMIT_DELETE) == 0) {
1240 * filemap_fdatawrite(ip->i_mapping);
1241 * filemap_fdatawait(ip->i_mapping);
1242 * }
1243 */
1244
1245 /*
1246 * Mark inode as not dirty. It will still be on the dirty
1247 * inode list, but we'll know not to commit it again unless
1248 * it gets marked dirty again
1249 */
1250 clear_cflag(COMMIT_Dirty, ip);
1251
1252 /* inherit anonymous tlock(s) of inode */
1253 if (jfs_ip->atlhead) {
1254 lid_to_tlock(jfs_ip->atltail)->next = tblk->next;
1255 tblk->next = jfs_ip->atlhead;
1256 if (!tblk->last)
1257 tblk->last = jfs_ip->atltail;
1258 jfs_ip->atlhead = jfs_ip->atltail = 0;
1259 TXN_LOCK();
1260 list_del_init(&jfs_ip->anon_inode_list);
1261 TXN_UNLOCK();
1262 }
1263
1264 /*
1265 * acquire transaction lock on on-disk inode page
1266 * (become first tlock of the tblk's tlock list)
1267 */
1268 if (((rc = diWrite(tid, ip))))
1269 goto out;
1270 }
1271
1272 /*
1273 * write log records from transaction locks
1274 *
1275 * txUpdateMap() resets XAD_NEW in XAD.
1276 */
1277 if ((rc = txLog(log, tblk, &cd)))
1278 goto TheEnd;
1279
1280 /*
1281 * Ensure that inode isn't reused before
1282 * lazy commit thread finishes processing
1283 */
1284 if (tblk->xflag & COMMIT_DELETE) {
1285 atomic_inc(&tblk->u.ip->i_count);
1286 /*
1287 * Avoid a rare deadlock
1288 *
1289 * If the inode is locked, we may be blocked in
1290 * jfs_commit_inode. If so, we don't want the
1291 * lazy_commit thread doing the last iput() on the inode
1292 * since that may block on the locked inode. Instead,
1293 * commit the transaction synchronously, so the last iput
1294 * will be done by the calling thread (or later)
1295 */
1296 if (tblk->u.ip->i_state & I_LOCK)
1297 tblk->xflag &= ~COMMIT_LAZY;
1298 }
1299
1300 ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
1301 ((tblk->u.ip->i_nlink == 0) &&
1302 !test_cflag(COMMIT_Nolink, tblk->u.ip)));
1303
1304 /*
1305 * write COMMIT log record
1306 */
1307 lrd->type = cpu_to_le16(LOG_COMMIT);
1308 lrd->length = 0;
1309 lsn = lmLog(log, tblk, lrd, NULL);
1310
1311 lmGroupCommit(log, tblk);
1312
1313 /*
1314 * - transaction is now committed -
1315 */
1316
1317 /*
1318 * force pages in careful update
1319 * (imap addressing structure update)
1320 */
1321 if (flag & COMMIT_FORCE)
1322 txForce(tblk);
1323
1324 /*
1325 * update allocation map.
1326 *
1327 * update inode allocation map and inode:
1328 * free pager lock on memory object of inode if any.
1329 * update block allocation map.
1330 *
1331 * txUpdateMap() resets XAD_NEW in XAD.
1332 */
1333 if (tblk->xflag & COMMIT_FORCE)
1334 txUpdateMap(tblk);
1335
1336 /*
1337 * free transaction locks and pageout/free pages
1338 */
1339 txRelease(tblk);
1340
1341 if ((tblk->flag & tblkGC_LAZY) == 0)
1342 txUnlock(tblk);
1343
1344
1345 /*
1346 * reset in-memory object state
1347 */
1348 for (k = 0; k < cd.nip; k++) {
1349 ip = cd.iplist[k];
1350 jfs_ip = JFS_IP(ip);
1351
1352 /*
1353 * reset in-memory inode state
1354 */
1355 jfs_ip->bxflag = 0;
1356 jfs_ip->blid = 0;
1357 }
1358
1359 out:
1360 if (rc != 0)
1361 txAbort(tid, 1);
1362
1363 TheEnd:
1364 jfs_info("txCommit: tid = %d, returning %d", tid, rc);
1365 return rc;
1366}
1367
1368
1369/*
1370 * NAME: txLog()
1371 *
1372 * FUNCTION: Writes AFTER log records for all lines modified
1373 * by tid for segments specified by inodes in comdata.
1374 * Code assumes only WRITELOCKS are recorded in lockwords.
1375 *
1376 * PARAMETERS:
1377 *
1378 * RETURN :
1379 */
1380static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
1381{
1382 int rc = 0;
1383 struct inode *ip;
1384 lid_t lid;
1385 struct tlock *tlck;
1386 struct lrd *lrd = &cd->lrd;
1387
1388 /*
1389 * write log record(s) for each tlock of transaction,
1390 */
1391 for (lid = tblk->next; lid; lid = tlck->next) {
1392 tlck = lid_to_tlock(lid);
1393
1394 tlck->flag |= tlckLOG;
1395
1396 /* initialize lrd common */
1397 ip = tlck->ip;
1398 lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate);
1399 lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset);
1400 lrd->log.redopage.inode = cpu_to_le32(ip->i_ino);
1401
1402 /* write log record of page from the tlock */
1403 switch (tlck->type & tlckTYPE) {
1404 case tlckXTREE:
1405 xtLog(log, tblk, lrd, tlck);
1406 break;
1407
1408 case tlckDTREE:
1409 dtLog(log, tblk, lrd, tlck);
1410 break;
1411
1412 case tlckINODE:
1413 diLog(log, tblk, lrd, tlck, cd);
1414 break;
1415
1416 case tlckMAP:
1417 mapLog(log, tblk, lrd, tlck);
1418 break;
1419
1420 case tlckDATA:
1421 dataLog(log, tblk, lrd, tlck);
1422 break;
1423
1424 default:
1425 jfs_err("UFO tlock:0x%p", tlck);
1426 }
1427 }
1428
1429 return rc;
1430}
1431
1432
1433/*
1434 * diLog()
1435 *
1436 * function: log inode tlock and format maplock to update bmap;
1437 */
1438static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1439 struct tlock * tlck, struct commit * cd)
1440{
1441 int rc = 0;
1442 struct metapage *mp;
1443 pxd_t *pxd;
1444 struct pxd_lock *pxdlock;
1445
1446 mp = tlck->mp;
1447
1448 /* initialize as REDOPAGE record format */
1449 lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
1450 lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);
1451
1452 pxd = &lrd->log.redopage.pxd;
1453
1454 /*
1455 * inode after image
1456 */
1457 if (tlck->type & tlckENTRY) {
1458 /* log after-image for logredo(): */
1459 lrd->type = cpu_to_le16(LOG_REDOPAGE);
1460// *pxd = mp->cm_pxd;
1461 PXDaddress(pxd, mp->index);
1462 PXDlength(pxd,
1463 mp->logical_size >> tblk->sb->s_blocksize_bits);
1464 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1465
1466 /* mark page as homeward bound */
1467 tlck->flag |= tlckWRITEPAGE;
1468 } else if (tlck->type & tlckFREE) {
1469 /*
1470 * free inode extent
1471 *
1472 * (pages of the freed inode extent have been invalidated and
1473 * a maplock for free of the extent has been formatted at
1474 * txLock() time);
1475 *
1476 * the tlock had been acquired on the inode allocation map page
1477 * (iag) that specifies the freed extent, even though the map
1478 * page is not itself logged, to prevent pageout of the map
1479 * page before the log;
1480 */
1481
1482 /* log LOG_NOREDOINOEXT of the freed inode extent for
1483 * logredo() to start NoRedoPage filters, and to update
1484 * imap and bmap for free of the extent;
1485 */
1486 lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
1487 /*
1488 * For the LOG_NOREDOINOEXT record, we need
1489 * to pass the IAG number and inode extent
1490 * index (within that IAG) from which
1491 * the extent is being released. These have been
1492 * passed to us in the iplist[1] and iplist[2].
1493 */
1494 lrd->log.noredoinoext.iagnum =
1495 cpu_to_le32((u32) (size_t) cd->iplist[1]);
1496 lrd->log.noredoinoext.inoext_idx =
1497 cpu_to_le32((u32) (size_t) cd->iplist[2]);
1498
1499 pxdlock = (struct pxd_lock *) & tlck->lock;
1500 *pxd = pxdlock->pxd;
1501 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1502
1503 /* update bmap */
1504 tlck->flag |= tlckUPDATEMAP;
1505
1506 /* mark page as homeward bound */
1507 tlck->flag |= tlckWRITEPAGE;
1508 } else
1509 jfs_err("diLog: UFO type tlck:0x%p", tlck);
1510#ifdef _JFS_WIP
1511 /*
1512 * alloc/free external EA extent
1513 *
1514 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
1515 * of the extent has been formatted at txLock() time;
1516 */
1517 else {
1518 assert(tlck->type & tlckEA);
1519
1520 /* log LOG_UPDATEMAP for logredo() to update bmap for
1521 * alloc of new (and free of old) external EA extent;
1522 */
1523 lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1524 pxdlock = (struct pxd_lock *) & tlck->lock;
1525 nlock = pxdlock->index;
1526 for (i = 0; i < nlock; i++, pxdlock++) {
1527 if (pxdlock->flag & mlckALLOCPXD)
1528 lrd->log.updatemap.type =
1529 cpu_to_le16(LOG_ALLOCPXD);
1530 else
1531 lrd->log.updatemap.type =
1532 cpu_to_le16(LOG_FREEPXD);
1533 lrd->log.updatemap.nxd = cpu_to_le16(1);
1534 lrd->log.updatemap.pxd = pxdlock->pxd;
1535 lrd->backchain =
1536 cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1537 }
1538
1539 /* update bmap */
1540 tlck->flag |= tlckUPDATEMAP;
1541 }
1542#endif /* _JFS_WIP */
1543
1544 return rc;
1545}
1546
1547
1548/*
1549 * dataLog()
1550 *
1551 * function: log data tlock
1552 */
1553static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1554 struct tlock * tlck)
1555{
1556 struct metapage *mp;
1557 pxd_t *pxd;
1558
1559 mp = tlck->mp;
1560
1561 /* initialize as REDOPAGE record format */
1562 lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
1563 lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);
1564
1565 pxd = &lrd->log.redopage.pxd;
1566
1567 /* log after-image for logredo(): */
1568 lrd->type = cpu_to_le16(LOG_REDOPAGE);
1569
1570 if (jfs_dirtable_inline(tlck->ip)) {
1571 /*
1572 * The table has been truncated; we must have deleted
1573 * the last entry, so don't bother logging this
1574 */
1575 mp->lid = 0;
1576 hold_metapage(mp, 0);
1577 atomic_dec(&mp->nohomeok);
1578 discard_metapage(mp);
1579 tlck->mp = NULL;
1580 return 0;
1581 }
1582
1583 PXDaddress(pxd, mp->index);
1584 PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);
1585
1586 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1587
1588 /* mark page as homeward bound */
1589 tlck->flag |= tlckWRITEPAGE;
1590
1591 return 0;
1592}
1593
1594
1595/*
1596 * dtLog()
1597 *
1598 * function: log dtree tlock and format maplock to update bmap;
1599 */
1600static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1601 struct tlock * tlck)
1602{
1603 struct metapage *mp;
1604 struct pxd_lock *pxdlock;
1605 pxd_t *pxd;
1606
1607 mp = tlck->mp;
1608
1609 /* initialize as REDOPAGE/NOREDOPAGE record format */
1610 lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
1611 lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);
1612
1613 pxd = &lrd->log.redopage.pxd;
1614
1615 if (tlck->type & tlckBTROOT)
1616 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1617
1618 /*
1619 * page extension via relocation: entry insertion;
1620 * page extension in-place: entry insertion;
1621 * new right page from page split, reinitialized in-line
1622 * root from root page split: entry insertion;
1623 */
1624 if (tlck->type & (tlckNEW | tlckEXTEND)) {
1625 /* log after-image of the new page for logredo():
1626 * mark log (LOG_NEW) for logredo() to initialize
1627 * freelist and update bmap for alloc of the new page;
1628 */
1629 lrd->type = cpu_to_le16(LOG_REDOPAGE);
1630 if (tlck->type & tlckEXTEND)
1631 lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
1632 else
1633 lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
1634// *pxd = mp->cm_pxd;
1635 PXDaddress(pxd, mp->index);
1636 PXDlength(pxd,
1637 mp->logical_size >> tblk->sb->s_blocksize_bits);
1638 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1639
1640 /* format a maplock for txUpdateMap() to update bPMAP for
1641 * alloc of the new page;
1642 */
1643 if (tlck->type & tlckBTROOT)
1644 return;
1645 tlck->flag |= tlckUPDATEMAP;
1646 pxdlock = (struct pxd_lock *) & tlck->lock;
1647 pxdlock->flag = mlckALLOCPXD;
1648 pxdlock->pxd = *pxd;
1649
1650 pxdlock->index = 1;
1651
1652 /* mark page as homeward bound */
1653 tlck->flag |= tlckWRITEPAGE;
1654 return;
1655 }
1656
1657 /*
1658 * entry insertion/deletion,
1659 * sibling page link update (old right page before split);
1660 */
1661 if (tlck->type & (tlckENTRY | tlckRELINK)) {
1662 /* log after-image for logredo(): */
1663 lrd->type = cpu_to_le16(LOG_REDOPAGE);
1664 PXDaddress(pxd, mp->index);
1665 PXDlength(pxd,
1666 mp->logical_size >> tblk->sb->s_blocksize_bits);
1667 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1668
1669 /* mark page as homeward bound */
1670 tlck->flag |= tlckWRITEPAGE;
1671 return;
1672 }
1673
1674 /*
1675 * page deletion: page has been invalidated
1676 * page relocation: source extent
1677 *
1678 * a maplock for free of the page has been formatted
1679	 * at txLock() time;
1680 */
1681 if (tlck->type & (tlckFREE | tlckRELOCATE)) {
1682 /* log LOG_NOREDOPAGE of the deleted page for logredo()
1683 * to start NoRedoPage filter and to update bmap for free
1684	 * of the deleted page
1685 */
1686 lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1687 pxdlock = (struct pxd_lock *) & tlck->lock;
1688 *pxd = pxdlock->pxd;
1689 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1690
1691 /* a maplock for txUpdateMap() for free of the page
1692 * has been formatted at txLock() time;
1693 */
1694 tlck->flag |= tlckUPDATEMAP;
1695 }
1696 return;
1697}
1698
1699
1700/*
1701 * xtLog()
1702 *
1703 * function: log xtree tlock and format maplock to update bmap;
1704 */
1705static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
1706 struct tlock * tlck)
1707{
1708 struct inode *ip;
1709 struct metapage *mp;
1710 xtpage_t *p;
1711 struct xtlock *xtlck;
1712 struct maplock *maplock;
1713 struct xdlistlock *xadlock;
1714 struct pxd_lock *pxdlock;
1715 pxd_t *pxd;
1716 int next, lwm, hwm;
1717
1718 ip = tlck->ip;
1719 mp = tlck->mp;
1720
1721 /* initialize as REDOPAGE/NOREDOPAGE record format */
1722 lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
1723 lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);
1724
1725 pxd = &lrd->log.redopage.pxd;
1726
1727 if (tlck->type & tlckBTROOT) {
1728 lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
1729 p = &JFS_IP(ip)->i_xtroot;
1730 if (S_ISDIR(ip->i_mode))
1731 lrd->log.redopage.type |=
1732 cpu_to_le16(LOG_DIR_XTREE);
1733 } else
1734 p = (xtpage_t *) mp->data;
1735 next = le16_to_cpu(p->header.nextindex);
1736
1737 xtlck = (struct xtlock *) & tlck->lock;
1738
1739 maplock = (struct maplock *) & tlck->lock;
1740 xadlock = (struct xdlistlock *) maplock;
1741
1742 /*
1743 * entry insertion/extension;
1744 * sibling page link update (old right page before split);
1745 */
1746 if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
1747 /* log after-image for logredo():
1748 * logredo() will update bmap for alloc of new/extended
1749 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1750 * after-image of XADlist;
1751 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1752 * applying the after-image to the meta-data page.
1753 */
1754 lrd->type = cpu_to_le16(LOG_REDOPAGE);
1755// *pxd = mp->cm_pxd;
1756 PXDaddress(pxd, mp->index);
1757 PXDlength(pxd,
1758 mp->logical_size >> tblk->sb->s_blocksize_bits);
1759 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1760
1761 /* format a maplock for txUpdateMap() to update bPMAP
1762 * for alloc of new/extended extents of XAD[lwm:next)
1763 * from the page itself;
1764 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
1765 */
1766 lwm = xtlck->lwm.offset;
1767 if (lwm == 0)
1768 lwm = XTPAGEMAXSLOT;
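		/*
		 * (lwm == 0 appears to mean "no low-water mark recorded";
		 * substituting XTPAGEMAXSLOT makes the lwm/next comparisons
		 * below treat the page as having no new entries.)
		 */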
1769
1770 if (lwm == next)
1771 goto out;
1772 if (lwm > next) {
1773			jfs_err("xtLog: lwm > next");
1774 goto out;
1775 }
1776 tlck->flag |= tlckUPDATEMAP;
1777 xadlock->flag = mlckALLOCXADLIST;
1778 xadlock->count = next - lwm;
1779 if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) {
1780 int i;
1781 /*
1782 * Lazy commit may allow xtree to be modified before
1783 * txUpdateMap runs. Copy xad into linelock to
1784 * preserve correct data.
1785 */
1786 xadlock->xdlist = &xtlck->pxdlock;
1787 memcpy(xadlock->xdlist, &p->xad[lwm],
1788 sizeof(xad_t) * xadlock->count);
1789
1790 for (i = 0; i < xadlock->count; i++)
1791 p->xad[lwm + i].flag &=
1792 ~(XAD_NEW | XAD_EXTENDED);
1793 } else {
1794 /*
1795			 * xdlist will point into the inode's xtree; ensure
1796			 * that the transaction is not committed lazily.
1797 */
1798 xadlock->xdlist = &p->xad[lwm];
1799 tblk->xflag &= ~COMMIT_LAZY;
1800 }
1801 jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d "
1802 "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count);
1803
1804 maplock->index = 1;
1805
1806 out:
1807 /* mark page as homeward bound */
1808 tlck->flag |= tlckWRITEPAGE;
1809
1810 return;
1811 }
1812
1813 /*
1814 * page deletion: file deletion/truncation (ref. xtTruncate())
1815 *
1816 * (page will be invalidated after log is written and bmap
1817 * is updated from the page);
1818 */
1819 if (tlck->type & tlckFREE) {
1820 /* LOG_NOREDOPAGE log for NoRedoPage filter:
1821 * if page free from file delete, NoRedoFile filter from
1822 * inode image of zero link count will subsume NoRedoPage
1823 * filters for each page;
1824		 * if page free from file truncation, write NoRedoPage
1825 * filter;
1826 *
1827		 * update of block allocation map for the page itself:
1828 * if page free from deletion and truncation, LOG_UPDATEMAP
1829 * log for the page itself is generated from processing
1830 * its parent page xad entries;
1831 */
1832 /* if page free from file truncation, log LOG_NOREDOPAGE
1833 * of the deleted page for logredo() to start NoRedoPage
1834 * filter for the page;
1835 */
1836 if (tblk->xflag & COMMIT_TRUNCATE) {
1837 /* write NOREDOPAGE for the page */
1838 lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
1839 PXDaddress(pxd, mp->index);
1840 PXDlength(pxd,
1841 mp->logical_size >> tblk->sb->
1842 s_blocksize_bits);
1843 lrd->backchain =
1844 cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1845
1846 if (tlck->type & tlckBTROOT) {
1847 /* Empty xtree must be logged */
1848 lrd->type = cpu_to_le16(LOG_REDOPAGE);
1849 lrd->backchain =
1850 cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1851 }
1852 }
1853
1854 /* init LOG_UPDATEMAP of the freed extents
1855 * XAD[XTENTRYSTART:hwm) from the deleted page itself
1856 * for logredo() to update bmap;
1857 */
1858 lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1859 lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
1860 xtlck = (struct xtlock *) & tlck->lock;
1861 hwm = xtlck->hwm.offset;
1862 lrd->log.updatemap.nxd =
1863 cpu_to_le16(hwm - XTENTRYSTART + 1);
1864 /* reformat linelock for lmLog() */
1865 xtlck->header.offset = XTENTRYSTART;
1866 xtlck->header.length = hwm - XTENTRYSTART + 1;
1867 xtlck->index = 1;
1868 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1869
1870 /* format a maplock for txUpdateMap() to update bmap
1871 * to free extents of XAD[XTENTRYSTART:hwm) from the
1872 * deleted page itself;
1873 */
1874 tlck->flag |= tlckUPDATEMAP;
1875 xadlock->flag = mlckFREEXADLIST;
1876 xadlock->count = hwm - XTENTRYSTART + 1;
1877 if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) {
1878 /*
1879 * Lazy commit may allow xtree to be modified before
1880 * txUpdateMap runs. Copy xad into linelock to
1881 * preserve correct data.
1882 */
1883 xadlock->xdlist = &xtlck->pxdlock;
1884 memcpy(xadlock->xdlist, &p->xad[XTENTRYSTART],
1885 sizeof(xad_t) * xadlock->count);
1886 } else {
1887 /*
1888			 * xdlist will point into the inode's xtree; ensure
1889			 * that the transaction is not committed lazily.
1890 */
1891 xadlock->xdlist = &p->xad[XTENTRYSTART];
1892 tblk->xflag &= ~COMMIT_LAZY;
1893 }
1894 jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2",
1895 tlck->ip, mp, xadlock->count);
1896
1897 maplock->index = 1;
1898
1899 /* mark page as invalid */
1900 if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
1901 && !(tlck->type & tlckBTROOT))
1902 tlck->flag |= tlckFREEPAGE;
1903 /*
1904 else (tblk->xflag & COMMIT_PMAP)
1905 ? release the page;
1906 */
1907 return;
1908 }
1909
1910 /*
1911 * page/entry truncation: file truncation (ref. xtTruncate())
1912 *
1913	 *      |----------+------+------+---------------|
1914	 *      |          |      |
1915	 *      |          |      hwm - hwm before truncation
1916	 *      |          next - truncation point
1917	 *      lwm - lwm before truncation
1918	 *      header ?
1919 */
1920 if (tlck->type & tlckTRUNCATE) {
1921 pxd_t tpxd; /* truncated extent of xad */
1922 int twm;
1923
1924 /*
1925 * For truncation the entire linelock may be used, so it would
1926 * be difficult to store xad list in linelock itself.
1927 * Therefore, we'll just force transaction to be committed
1928 * synchronously, so that xtree pages won't be changed before
1929 * txUpdateMap runs.
1930 */
1931 tblk->xflag &= ~COMMIT_LAZY;
1932 lwm = xtlck->lwm.offset;
1933 if (lwm == 0)
1934 lwm = XTPAGEMAXSLOT;
1935 hwm = xtlck->hwm.offset;
1936 twm = xtlck->twm.offset;
1937
1938 /*
1939 * write log records
1940 */
1941 /* log after-image for logredo():
1942 *
1943 * logredo() will update bmap for alloc of new/extended
1944 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
1945 * after-image of XADlist;
1946 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
1947 * applying the after-image to the meta-data page.
1948 */
1949 lrd->type = cpu_to_le16(LOG_REDOPAGE);
1950 PXDaddress(pxd, mp->index);
1951 PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);
1952 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1953
1954 /*
1955 * truncate entry XAD[twm == next - 1]:
1956 */
1957 if (twm == next - 1) {
1958 /* init LOG_UPDATEMAP for logredo() to update bmap for
1959 * free of truncated delta extent of the truncated
1960 * entry XAD[next - 1]:
1961 * (xtlck->pxdlock = truncated delta extent);
1962 */
1963 pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
1964 /* assert(pxdlock->type & tlckTRUNCATE); */
1965 lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1966 lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
1967 lrd->log.updatemap.nxd = cpu_to_le16(1);
1968 lrd->log.updatemap.pxd = pxdlock->pxd;
1969 tpxd = pxdlock->pxd; /* save to format maplock */
1970 lrd->backchain =
1971 cpu_to_le32(lmLog(log, tblk, lrd, NULL));
1972 }
1973
1974 /*
1975 * free entries XAD[next:hwm]:
1976 */
1977 if (hwm >= next) {
1978 /* init LOG_UPDATEMAP of the freed extents
1979 * XAD[next:hwm] from the deleted page itself
1980 * for logredo() to update bmap;
1981 */
1982 lrd->type = cpu_to_le16(LOG_UPDATEMAP);
1983 lrd->log.updatemap.type =
1984 cpu_to_le16(LOG_FREEXADLIST);
1985 xtlck = (struct xtlock *) & tlck->lock;
1986 hwm = xtlck->hwm.offset;
1987 lrd->log.updatemap.nxd =
1988 cpu_to_le16(hwm - next + 1);
1989 /* reformat linelock for lmLog() */
1990 xtlck->header.offset = next;
1991 xtlck->header.length = hwm - next + 1;
1992 xtlck->index = 1;
1993 lrd->backchain =
1994 cpu_to_le32(lmLog(log, tblk, lrd, tlck));
1995 }
1996
1997 /*
1998 * format maplock(s) for txUpdateMap() to update bmap
1999 */
2000 maplock->index = 0;
2001
2002 /*
2003 * allocate entries XAD[lwm:next):
2004 */
2005 if (lwm < next) {
2006 /* format a maplock for txUpdateMap() to update bPMAP
2007 * for alloc of new/extended extents of XAD[lwm:next)
2008 * from the page itself;
2009 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
2010 */
2011 tlck->flag |= tlckUPDATEMAP;
2012 xadlock->flag = mlckALLOCXADLIST;
2013 xadlock->count = next - lwm;
2014 xadlock->xdlist = &p->xad[lwm];
2015
2016 jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d "
2017 "lwm:%d next:%d",
2018 tlck->ip, mp, xadlock->count, lwm, next);
2019 maplock->index++;
2020 xadlock++;
2021 }
2022
2023 /*
2024 * truncate entry XAD[twm == next - 1]:
2025 */
2026 if (twm == next - 1) {
2027 struct pxd_lock *pxdlock;
2028
2029 /* format a maplock for txUpdateMap() to update bmap
2030 * to free truncated delta extent of the truncated
2031 * entry XAD[next - 1];
2032 * (xtlck->pxdlock = truncated delta extent);
2033 */
2034 tlck->flag |= tlckUPDATEMAP;
2035 pxdlock = (struct pxd_lock *) xadlock;
2036 pxdlock->flag = mlckFREEPXD;
2037 pxdlock->count = 1;
2038 pxdlock->pxd = tpxd;
2039
2040 jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d "
2041 "hwm:%d", ip, mp, pxdlock->count, hwm);
2042 maplock->index++;
2043 xadlock++;
2044 }
2045
2046 /*
2047 * free entries XAD[next:hwm]:
2048 */
2049 if (hwm >= next) {
2050 /* format a maplock for txUpdateMap() to update bmap
2051			 * to free extents of XAD[next:hwm] from the deleted
2052 * page itself;
2053 */
2054 tlck->flag |= tlckUPDATEMAP;
2055 xadlock->flag = mlckFREEXADLIST;
2056 xadlock->count = hwm - next + 1;
2057 xadlock->xdlist = &p->xad[next];
2058
2059 jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d "
2060 "next:%d hwm:%d",
2061 tlck->ip, mp, xadlock->count, next, hwm);
2062 maplock->index++;
2063 }
2064
2065 /* mark page as homeward bound */
2066 tlck->flag |= tlckWRITEPAGE;
2067 }
2068 return;
2069}
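/*
 * (Summary: for each xtree page, xtLog() writes one REDOPAGE
 * after-image, adds UPDATEMAP records for any extents freed by page
 * deletion or truncation, and leaves maplocks behind for
 * txUpdateMap() to apply to the block allocation maps.)
 */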
2070
2071
2072/*
2073 * mapLog()
2074 *
2075 * function: log from maplock of freed data extents;
2076 */
2077void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2078 struct tlock * tlck)
2079{
2080 struct pxd_lock *pxdlock;
2081 int i, nlock;
2082 pxd_t *pxd;
2083
2084 /*
2085 * page relocation: free the source page extent
2086 *
2087 * a maplock for txUpdateMap() for free of the page
2088 * has been formatted at txLock() time saving the src
2089 * relocated page address;
2090 */
2091 if (tlck->type & tlckRELOCATE) {
2092 /* log LOG_NOREDOPAGE of the old relocated page
2093 * for logredo() to start NoRedoPage filter;
2094 */
2095 lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
2096 pxdlock = (struct pxd_lock *) & tlck->lock;
2097 pxd = &lrd->log.redopage.pxd;
2098 *pxd = pxdlock->pxd;
2099 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2100
2101 /* (N.B. currently, logredo() does NOT update bmap
2102 * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE);
2103 * if page free from relocation, LOG_UPDATEMAP log is
2104 * specifically generated now for logredo()
2105 * to update bmap for free of src relocated page;
2106 * (new flag LOG_RELOCATE may be introduced which will
2107 * inform logredo() to start NORedoPage filter and also
2108 * update block allocation map at the same time, thus
2109 * avoiding an extra log write);
2110 */
2111 lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2112 lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
2113 lrd->log.updatemap.nxd = cpu_to_le16(1);
2114 lrd->log.updatemap.pxd = pxdlock->pxd;
2115 lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2116
2117 /* a maplock for txUpdateMap() for free of the page
2118 * has been formatted at txLock() time;
2119 */
2120 tlck->flag |= tlckUPDATEMAP;
2121 return;
2122 }
2123	/*
2124	 *
2125	 * Otherwise it's not a relocate request
2126	 *
2127	 */
2128 else {
2129 /* log LOG_UPDATEMAP for logredo() to update bmap for
2130 * free of truncated/relocated delta extent of the data;
2131 * e.g.: external EA extent, relocated/truncated extent
2132 * from xtTailgate();
2133 */
2134 lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2135 pxdlock = (struct pxd_lock *) & tlck->lock;
2136 nlock = pxdlock->index;
2137 for (i = 0; i < nlock; i++, pxdlock++) {
2138 if (pxdlock->flag & mlckALLOCPXD)
2139 lrd->log.updatemap.type =
2140 cpu_to_le16(LOG_ALLOCPXD);
2141 else
2142 lrd->log.updatemap.type =
2143 cpu_to_le16(LOG_FREEPXD);
2144 lrd->log.updatemap.nxd = cpu_to_le16(1);
2145 lrd->log.updatemap.pxd = pxdlock->pxd;
2146 lrd->backchain =
2147 cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2148 jfs_info("mapLog: xaddr:0x%lx xlen:0x%x",
2149 (ulong) addressPXD(&pxdlock->pxd),
2150 lengthPXD(&pxdlock->pxd));
2151 }
2152
2153 /* update bmap */
2154 tlck->flag |= tlckUPDATEMAP;
2155 }
2156}
2157
2158
2159/*
2160 * txEA()
2161 *
2162 * function: acquire maplock for EA/ACL extents or
2163 * set COMMIT_INLINE flag;
2164 */
2165void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2166{
2167 struct tlock *tlck = NULL;
2168 struct pxd_lock *maplock = NULL, *pxdlock = NULL;
2169
2170 /*
2171 * format maplock for alloc of new EA extent
2172 */
2173 if (newea) {
2174 /* Since the newea could be a completely zeroed entry we need to
2175 * check for the two flags which indicate we should actually
2176 * commit new EA data
2177 */
2178 if (newea->flag & DXD_EXTENT) {
2179 tlck = txMaplock(tid, ip, tlckMAP);
2180 maplock = (struct pxd_lock *) & tlck->lock;
2181 pxdlock = (struct pxd_lock *) maplock;
2182 pxdlock->flag = mlckALLOCPXD;
2183 PXDaddress(&pxdlock->pxd, addressDXD(newea));
2184 PXDlength(&pxdlock->pxd, lengthDXD(newea));
2185 pxdlock++;
2186 maplock->index = 1;
2187 } else if (newea->flag & DXD_INLINE) {
2188 tlck = NULL;
2189
2190 set_cflag(COMMIT_Inlineea, ip);
2191 }
2192 }
2193
2194 /*
2195 * format maplock for free of old EA extent
2196 */
2197 if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) {
2198 if (tlck == NULL) {
2199 tlck = txMaplock(tid, ip, tlckMAP);
2200 maplock = (struct pxd_lock *) & tlck->lock;
2201 pxdlock = (struct pxd_lock *) maplock;
2202 maplock->index = 0;
2203 }
2204 pxdlock->flag = mlckFREEPXD;
2205 PXDaddress(&pxdlock->pxd, addressDXD(oldea));
2206 PXDlength(&pxdlock->pxd, lengthDXD(oldea));
2207 maplock->index++;
2208 }
2209}
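/*
 * (Note: a zeroed dxd -- neither DXD_EXTENT nor DXD_INLINE set --
 * commits nothing, which is why both flags are tested above; the old
 * EA extent is freed here unless COMMIT_Nolink already covers it.)
 */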
2210
2211
2212/*
2213 * txForce()
2214 *
2215 * function: synchronously write pages locked by transaction
2216 * after txLog() but before txUpdateMap();
2217 */
2218void txForce(struct tblock * tblk)
2219{
2220 struct tlock *tlck;
2221 lid_t lid, next;
2222 struct metapage *mp;
2223
2224 /*
2225 * reverse the order of transaction tlocks in
2226 * careful update order of address index pages
2227 * (right to left, bottom up)
2228 */
2229 tlck = lid_to_tlock(tblk->next);
2230 lid = tlck->next;
2231 tlck->next = 0;
2232 while (lid) {
2233 tlck = lid_to_tlock(lid);
2234 next = tlck->next;
2235 tlck->next = tblk->next;
2236 tblk->next = lid;
2237 lid = next;
2238 }
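	/*
	 * (e.g., a tlock list a -> b -> c is relinked as c -> b -> a,
	 * so pages are forced in the reverse of locking order.)
	 */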
2239
2240 /*
2241 * synchronously write the page, and
2242 * hold the page for txUpdateMap();
2243 */
2244 for (lid = tblk->next; lid; lid = next) {
2245 tlck = lid_to_tlock(lid);
2246 next = tlck->next;
2247
2248 if ((mp = tlck->mp) != NULL &&
2249 (tlck->type & tlckBTROOT) == 0) {
2250 assert(mp->xflag & COMMIT_PAGE);
2251
2252 if (tlck->flag & tlckWRITEPAGE) {
2253 tlck->flag &= ~tlckWRITEPAGE;
2254
2255 /* do not release page to freelist */
2256
2257 /*
2258 * The "right" thing to do here is to
2259 * synchronously write the metadata.
2260 * With the current implementation this
2261 * is hard since write_metapage requires
2262 * us to kunmap & remap the page. If we
2263 * have tlocks pointing into the metadata
2264 * pages, we don't want to do this. I think
2265 * we can get by with synchronously writing
2266 * the pages when they are released.
2267 */
2268 assert(atomic_read(&mp->nohomeok));
2269 set_bit(META_dirty, &mp->flag);
2270 set_bit(META_sync, &mp->flag);
2271 }
2272 }
2273 }
2274}
2275
2276
2277/*
2278 * txUpdateMap()
2279 *
2280 * function: update persistent allocation map (and working map
2281 * if appropriate);
2282 *
2283 * parameter:
2284 */
2285static void txUpdateMap(struct tblock * tblk)
2286{
2287 struct inode *ip;
2288 struct inode *ipimap;
2289 lid_t lid;
2290 struct tlock *tlck;
2291 struct maplock *maplock;
2292 struct pxd_lock pxdlock;
2293 int maptype;
2294 int k, nlock;
2295 struct metapage *mp = NULL;
2296
2297 ipimap = JFS_SBI(tblk->sb)->ipimap;
2298
2299 maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
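	/*
	 * (COMMIT_PMAP transactions leave blocks allocated in the working
	 * map -- e.g. a deleted file still held open, whose blocks are
	 * freed from the wmap only at last reference release; all others
	 * update both persistent and working maps.)
	 */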
2300
2301
2302 /*
2303 * update block allocation map
2304 *
2305 * update allocation state in pmap (and wmap) and
2306 * update lsn of the pmap page;
2307 */
2308 /*
2309 * scan each tlock/page of transaction for block allocation/free:
2310 *
2311 * for each tlock/page of transaction, update map.
2312	 * ? are there tlocks for pmap and pwmap at the same time ?
2313 */
2314 for (lid = tblk->next; lid; lid = tlck->next) {
2315 tlck = lid_to_tlock(lid);
2316
2317 if ((tlck->flag & tlckUPDATEMAP) == 0)
2318 continue;
2319
2320 if (tlck->flag & tlckFREEPAGE) {
2321 /*
2322 * Another thread may attempt to reuse freed space
2323 * immediately, so we want to get rid of the metapage
2324 * before anyone else has a chance to get it.
2325 * Lock metapage, update maps, then invalidate
2326 * the metapage.
2327 */
2328 mp = tlck->mp;
2329 ASSERT(mp->xflag & COMMIT_PAGE);
2330 hold_metapage(mp, 0);
2331 }
2332
2333 /*
2334 * extent list:
2335 * . in-line PXD list:
2336 * . out-of-line XAD list:
2337 */
2338 maplock = (struct maplock *) & tlck->lock;
2339 nlock = maplock->index;
2340
2341 for (k = 0; k < nlock; k++, maplock++) {
2342 /*
2343 * allocate blocks in persistent map:
2344 *
2345 * blocks have been allocated from wmap at alloc time;
2346 */
2347 if (maplock->flag & mlckALLOC) {
2348 txAllocPMap(ipimap, maplock, tblk);
2349 }
2350 /*
2351 * free blocks in persistent and working map:
2352 * blocks will be freed in pmap and then in wmap;
2353 *
2354 * ? tblock specifies the PMAP/PWMAP based upon
2355 * transaction
2356 *
2357 * free blocks in persistent map:
2358 * blocks will be freed from wmap at last reference
2359 * release of the object for regular files;
2360 *
2361			 * Always free blocks from both persistent & working
2362 * maps for directories
2363 */
2364 else { /* (maplock->flag & mlckFREE) */
2365
2366 if (S_ISDIR(tlck->ip->i_mode))
2367 txFreeMap(ipimap, maplock,
2368 tblk, COMMIT_PWMAP);
2369 else
2370 txFreeMap(ipimap, maplock,
2371 tblk, maptype);
2372 }
2373 }
2374 if (tlck->flag & tlckFREEPAGE) {
2375 if (!(tblk->flag & tblkGC_LAZY)) {
2376 /* This is equivalent to txRelease */
2377 ASSERT(mp->lid == lid);
2378 tlck->mp->lid = 0;
2379 }
2380 assert(atomic_read(&mp->nohomeok) == 1);
2381 atomic_dec(&mp->nohomeok);
2382 discard_metapage(mp);
2383 tlck->mp = NULL;
2384 }
2385 }
2386 /*
2387 * update inode allocation map
2388 *
2389 * update allocation state in pmap and
2390 * update lsn of the pmap page;
2391 * update in-memory inode flag/state
2392 *
2393 * unlock mapper/write lock
2394 */
2395 if (tblk->xflag & COMMIT_CREATE) {
2396 diUpdatePMap(ipimap, tblk->ino, FALSE, tblk);
2397 ipimap->i_state |= I_DIRTY;
2398 /* update persistent block allocation map
2399 * for the allocation of inode extent;
2400 */
2401 pxdlock.flag = mlckALLOCPXD;
2402 pxdlock.pxd = tblk->u.ixpxd;
2403 pxdlock.index = 1;
2404 txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
2405 } else if (tblk->xflag & COMMIT_DELETE) {
2406 ip = tblk->u.ip;
2407 diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk);
2408 ipimap->i_state |= I_DIRTY;
2409 iput(ip);
2410 }
2411}
2412
2413
2414/*
2415 * txAllocPMap()
2416 *
2417 * function: allocate from persistent map;
2418 *
2419 * parameter:
2420 * ipbmap -
2421 *	maplock -
2422 * xad list:
2423 * pxd:
2424 *
2425 * maptype -
2426 * allocate from persistent map;
2427 * free from persistent map;
2428 *		(e.g., tmp file - free from working map at release
2429 * of last reference);
2430 * free from persistent and working map;
2431 *
2432 * lsn - log sequence number;
2433 */
2434static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2435 struct tblock * tblk)
2436{
2437 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2438 struct xdlistlock *xadlistlock;
2439 xad_t *xad;
2440 s64 xaddr;
2441 int xlen;
2442 struct pxd_lock *pxdlock;
2443 struct xdlistlock *pxdlistlock;
2444 pxd_t *pxd;
2445 int n;
2446
2447 /*
2448 * allocate from persistent map;
2449 */
2450 if (maplock->flag & mlckALLOCXADLIST) {
2451 xadlistlock = (struct xdlistlock *) maplock;
2452 xad = xadlistlock->xdlist;
2453 for (n = 0; n < xadlistlock->count; n++, xad++) {
2454 if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
2455 xaddr = addressXAD(xad);
2456 xlen = lengthXAD(xad);
2457 dbUpdatePMap(ipbmap, FALSE, xaddr,
2458 (s64) xlen, tblk);
2459 xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
2460 jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2461 (ulong) xaddr, xlen);
2462 }
2463 }
2464 } else if (maplock->flag & mlckALLOCPXD) {
2465 pxdlock = (struct pxd_lock *) maplock;
2466 xaddr = addressPXD(&pxdlock->pxd);
2467 xlen = lengthPXD(&pxdlock->pxd);
2468 dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, tblk);
2469 jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen);
2470 } else { /* (maplock->flag & mlckALLOCPXDLIST) */
2471
2472 pxdlistlock = (struct xdlistlock *) maplock;
2473 pxd = pxdlistlock->xdlist;
2474 for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2475 xaddr = addressPXD(pxd);
2476 xlen = lengthPXD(pxd);
2477 dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen,
2478 tblk);
2479 jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2480 (ulong) xaddr, xlen);
2481 }
2482 }
2483}
2484
2485
2486/*
2487 * txFreeMap()
2488 *
2489 * function: free from persistent and/or working map;
2490 *
2491 * todo: optimization
2492 */
2493void txFreeMap(struct inode *ip,
2494 struct maplock * maplock, struct tblock * tblk, int maptype)
2495{
2496 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2497 struct xdlistlock *xadlistlock;
2498 xad_t *xad;
2499 s64 xaddr;
2500 int xlen;
2501 struct pxd_lock *pxdlock;
2502 struct xdlistlock *pxdlistlock;
2503 pxd_t *pxd;
2504 int n;
2505
2506 jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x",
2507 tblk, maplock, maptype);
2508
2509 /*
2510 * free from persistent map;
2511 */
2512 if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
2513 if (maplock->flag & mlckFREEXADLIST) {
2514 xadlistlock = (struct xdlistlock *) maplock;
2515 xad = xadlistlock->xdlist;
2516 for (n = 0; n < xadlistlock->count; n++, xad++) {
2517 if (!(xad->flag & XAD_NEW)) {
2518 xaddr = addressXAD(xad);
2519 xlen = lengthXAD(xad);
2520 dbUpdatePMap(ipbmap, TRUE, xaddr,
2521 (s64) xlen, tblk);
2522 jfs_info("freePMap: xaddr:0x%lx "
2523 "xlen:%d",
2524 (ulong) xaddr, xlen);
2525 }
2526 }
2527 } else if (maplock->flag & mlckFREEPXD) {
2528 pxdlock = (struct pxd_lock *) maplock;
2529 xaddr = addressPXD(&pxdlock->pxd);
2530 xlen = lengthPXD(&pxdlock->pxd);
2531 dbUpdatePMap(ipbmap, TRUE, xaddr, (s64) xlen,
2532 tblk);
2533 jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2534 (ulong) xaddr, xlen);
2535		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
2536
2537 pxdlistlock = (struct xdlistlock *) maplock;
2538 pxd = pxdlistlock->xdlist;
2539 for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2540 xaddr = addressPXD(pxd);
2541 xlen = lengthPXD(pxd);
2542 dbUpdatePMap(ipbmap, TRUE, xaddr,
2543 (s64) xlen, tblk);
2544 jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2545 (ulong) xaddr, xlen);
2546 }
2547 }
2548 }
2549
2550 /*
2551 * free from working map;
2552 */
2553 if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
2554 if (maplock->flag & mlckFREEXADLIST) {
2555 xadlistlock = (struct xdlistlock *) maplock;
2556 xad = xadlistlock->xdlist;
2557 for (n = 0; n < xadlistlock->count; n++, xad++) {
2558 xaddr = addressXAD(xad);
2559 xlen = lengthXAD(xad);
2560 dbFree(ip, xaddr, (s64) xlen);
2561 xad->flag = 0;
2562 jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2563 (ulong) xaddr, xlen);
2564 }
2565 } else if (maplock->flag & mlckFREEPXD) {
2566 pxdlock = (struct pxd_lock *) maplock;
2567 xaddr = addressPXD(&pxdlock->pxd);
2568 xlen = lengthPXD(&pxdlock->pxd);
2569 dbFree(ip, xaddr, (s64) xlen);
2570 jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2571 (ulong) xaddr, xlen);
2572 } else { /* (maplock->flag & mlckFREEPXDLIST) */
2573
2574 pxdlistlock = (struct xdlistlock *) maplock;
2575 pxd = pxdlistlock->xdlist;
2576 for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2577 xaddr = addressPXD(pxd);
2578 xlen = lengthPXD(pxd);
2579 dbFree(ip, xaddr, (s64) xlen);
2580 jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2581 (ulong) xaddr, xlen);
2582 }
2583 }
2584 }
2585}
2586
2587
2588/*
2589 * txFreelock()
2590 *
2591 * function: remove tlock from inode anonymous locklist
2592 */
2593void txFreelock(struct inode *ip)
2594{
2595 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2596 struct tlock *xtlck, *tlck;
2597 lid_t xlid = 0, lid;
2598
2599 if (!jfs_ip->atlhead)
2600 return;
2601
2602 TXN_LOCK();
2603 xtlck = (struct tlock *) &jfs_ip->atlhead;
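	/*
	 * (The cast works because 'next' is the first field of struct
	 * tlock, so &jfs_ip->atlhead can stand in as a dummy list head
	 * and the unlink loop needs no first-element special case.)
	 */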
2604
2605 while ((lid = xtlck->next) != 0) {
2606 tlck = lid_to_tlock(lid);
2607 if (tlck->flag & tlckFREELOCK) {
2608 xtlck->next = tlck->next;
2609 txLockFree(lid);
2610 } else {
2611 xtlck = tlck;
2612 xlid = lid;
2613 }
2614 }
2615
2616 if (jfs_ip->atlhead)
2617 jfs_ip->atltail = xlid;
2618 else {
2619 jfs_ip->atltail = 0;
2620 /*
2621 * If inode was on anon_list, remove it
2622 */
2623 list_del_init(&jfs_ip->anon_inode_list);
2624 }
2625 TXN_UNLOCK();
2626}
2627
2628
2629/*
2630 * txAbort()
2631 *
2632 * function: abort tx before commit;
2633 *
2634 * frees line-locks and segment locks for all
2635 *	segments in the comdata structure.
2636 *	Optionally sets state of file-system to FM_DIRTY in super-block.
2637 *	The log age of in-memory page frames touched by the
2638 *	transaction is reset to 0 (to avoid logwrap).
2639 */
2640void txAbort(tid_t tid, int dirty)
2641{
2642 lid_t lid, next;
2643 struct metapage *mp;
2644 struct tblock *tblk = tid_to_tblock(tid);
2645 struct tlock *tlck;
2646
2647 /*
2648 * free tlocks of the transaction
2649 */
2650 for (lid = tblk->next; lid; lid = next) {
2651 tlck = lid_to_tlock(lid);
2652 next = tlck->next;
2653 mp = tlck->mp;
2654 JFS_IP(tlck->ip)->xtlid = 0;
2655
2656 if (mp) {
2657 mp->lid = 0;
2658
2659 /*
2660			 * reset lsn of page to avoid logwrap:
2661 *
2662 * (page may have been previously committed by another
2663 * transaction(s) but has not been paged, i.e.,
2664 * it may be on logsync list even though it has not
2665 * been logged for the current tx.)
2666 */
2667 if (mp->xflag & COMMIT_PAGE && mp->lsn)
2668 LogSyncRelease(mp);
2669 }
2670 /* insert tlock at head of freelist */
2671 TXN_LOCK();
2672 txLockFree(lid);
2673 TXN_UNLOCK();
2674 }
2675
2676 /* caller will free the transaction block */
2677
2678 tblk->next = tblk->last = 0;
2679
2680 /*
2681 * mark filesystem dirty
2682 */
2683 if (dirty)
2684 jfs_error(tblk->sb, "txAbort");
2685
2686 return;
2687}
2688
2689/*
2690 * txLazyCommit(void)
2691 *
2692 * All transactions except those changing ipimap (COMMIT_FORCE) are
2693 * processed by this routine.  This ensures that the inode and block
2694 * allocation maps are updated in order. For synchronous transactions,
2695 * let the user thread finish processing after txUpdateMap() is called.
2696 */
2697static void txLazyCommit(struct tblock * tblk)
2698{
2699 struct jfs_log *log;
2700
2701 while (((tblk->flag & tblkGC_READY) == 0) &&
2702 ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
2703 /* We must have gotten ahead of the user thread
2704 */
2705 jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk);
2706 yield();
2707 }
2708
2709 jfs_info("txLazyCommit: processing tblk 0x%p", tblk);
2710
2711 txUpdateMap(tblk);
2712
2713 log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
2714
2715 spin_lock_irq(&log->gclock); // LOGGC_LOCK
2716
2717 tblk->flag |= tblkGC_COMMITTED;
2718
2719 if (tblk->flag & tblkGC_READY)
2720 log->gcrtc--;
2721
2722 wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP
2723
2724 /*
2725 * Can't release log->gclock until we've tested tblk->flag
2726 */
2727 if (tblk->flag & tblkGC_LAZY) {
2728 spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK
2729 txUnlock(tblk);
2730 tblk->flag &= ~tblkGC_LAZY;
2731 txEnd(tblk - TxBlock); /* Convert back to tid */
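		/*
		 * (tids are indices into the TxBlock array -- see
		 * tid_to_tblock() in jfs_txnmgr.h -- so pointer
		 * arithmetic recovers the tid.)
		 */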
2732 } else
2733 spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK
2734
2735 jfs_info("txLazyCommit: done: tblk = 0x%p", tblk);
2736}
2737
2738/*
2739 * jfs_lazycommit(void)
2740 *
2741 * To be run as a kernel daemon. If lbmIODone is called in an interrupt
2742 * context, or where blocking is not wanted, this routine will process
2743 * committed transactions from the unlock queue.
2744 */
2745int jfs_lazycommit(void *arg)
2746{
2747 int WorkDone;
2748 struct tblock *tblk;
2749 unsigned long flags;
2750 struct jfs_sb_info *sbi;
2751
2752 daemonize("jfsCommit");
2753
2754 complete(&jfsIOwait);
2755
2756 do {
2757 LAZY_LOCK(flags);
2758 jfs_commit_thread_waking = 0; /* OK to wake another thread */
2759 while (!list_empty(&TxAnchor.unlock_queue)) {
2760 WorkDone = 0;
2761 list_for_each_entry(tblk, &TxAnchor.unlock_queue,
2762 cqueue) {
2763
2764 sbi = JFS_SBI(tblk->sb);
2765 /*
2766 * For each volume, the transactions must be
2767 * handled in order. If another commit thread
2768 * is handling a tblk for this superblock,
2769 * skip it
2770 */
2771 if (sbi->commit_state & IN_LAZYCOMMIT)
2772 continue;
2773
2774 sbi->commit_state |= IN_LAZYCOMMIT;
2775 WorkDone = 1;
2776
2777 /*
2778 * Remove transaction from queue
2779 */
2780 list_del(&tblk->cqueue);
2781
2782 LAZY_UNLOCK(flags);
2783 txLazyCommit(tblk);
2784 LAZY_LOCK(flags);
2785
2786 sbi->commit_state &= ~IN_LAZYCOMMIT;
2787 /*
2788 * Don't continue in the for loop. (We can't
2789 * anyway, it's unsafe!) We want to go back to
2790 * the beginning of the list.
2791 */
2792 break;
2793 }
2794
2795 /* If there was nothing to do, don't continue */
2796 if (!WorkDone)
2797 break;
2798 }
2799 /* In case a wakeup came while all threads were active */
2800 jfs_commit_thread_waking = 0;
2801
2802 if (current->flags & PF_FREEZE) {
2803 LAZY_UNLOCK(flags);
2804 refrigerator(PF_FREEZE);
2805 } else {
2806 DECLARE_WAITQUEUE(wq, current);
2807
2808 add_wait_queue(&jfs_commit_thread_wait, &wq);
2809 set_current_state(TASK_INTERRUPTIBLE);
2810 LAZY_UNLOCK(flags);
2811 schedule();
2812 current->state = TASK_RUNNING;
2813 remove_wait_queue(&jfs_commit_thread_wait, &wq);
2814 }
2815 } while (!jfs_stop_threads);
2816
2817 if (!list_empty(&TxAnchor.unlock_queue))
2818 jfs_err("jfs_lazycommit being killed w/pending transactions!");
2819 else
2820		jfs_info("jfs_lazycommit being killed");
2821 complete_and_exit(&jfsIOwait, 0);
2822}
2823
2824void txLazyUnlock(struct tblock * tblk)
2825{
2826 unsigned long flags;
2827
2828 LAZY_LOCK(flags);
2829
2830 list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue);
2831 /*
2832 * Don't wake up a commit thread if there is already one servicing
2833 * this superblock, or if the last one we woke up hasn't started yet.
2834 */
2835 if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) &&
2836 !jfs_commit_thread_waking) {
2837 jfs_commit_thread_waking = 1;
2838 wake_up(&jfs_commit_thread_wait);
2839 }
2840 LAZY_UNLOCK(flags);
2841}
2842
2843static void LogSyncRelease(struct metapage * mp)
2844{
2845 struct jfs_log *log = mp->log;
2846
2847 assert(atomic_read(&mp->nohomeok));
2848 assert(log);
2849 atomic_dec(&mp->nohomeok);
2850
2851 if (atomic_read(&mp->nohomeok))
2852 return;
2853
2854 hold_metapage(mp, 0);
2855
2856 LOGSYNC_LOCK(log);
2857 mp->log = NULL;
2858 mp->lsn = 0;
2859 mp->clsn = 0;
2860 log->count--;
2861 list_del_init(&mp->synclist);
2862 LOGSYNC_UNLOCK(log);
2863
2864 release_metapage(mp);
2865}
2866
2867/*
2868 * txQuiesce
2869 *
2870 * Block all new transactions and push anonymous transactions to
2871 * completion
2872 *
2873 * This does almost the same thing as jfs_sync below. We don't
2874 * worry about deadlocking when jfs_tlocks_low is set, since we would
2875 * expect jfs_sync to get us out of that jam.
2876 */
2877void txQuiesce(struct super_block *sb)
2878{
2879 struct inode *ip;
2880 struct jfs_inode_info *jfs_ip;
2881 struct jfs_log *log = JFS_SBI(sb)->log;
2882 tid_t tid;
2883
2884 set_bit(log_QUIESCE, &log->flag);
2885
2886 TXN_LOCK();
2887restart:
2888 while (!list_empty(&TxAnchor.anon_list)) {
2889 jfs_ip = list_entry(TxAnchor.anon_list.next,
2890 struct jfs_inode_info,
2891 anon_inode_list);
2892 ip = &jfs_ip->vfs_inode;
2893
2894 /*
2895 * inode will be removed from anonymous list
2896 * when it is committed
2897 */
2898 TXN_UNLOCK();
2899 tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
2900 down(&jfs_ip->commit_sem);
2901 txCommit(tid, 1, &ip, 0);
2902 txEnd(tid);
2903 up(&jfs_ip->commit_sem);
2904 /*
2905 * Just to be safe. I don't know how
2906 * long we can run without blocking
2907 */
2908 cond_resched();
2909 TXN_LOCK();
2910 }
2911
2912 /*
2913 * If jfs_sync is running in parallel, there could be some inodes
2914 * on anon_list2. Let's check.
2915 */
2916 if (!list_empty(&TxAnchor.anon_list2)) {
2917 list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2918 INIT_LIST_HEAD(&TxAnchor.anon_list2);
2919 goto restart;
2920 }
2921 TXN_UNLOCK();
2922
2923 /*
2924 * We may need to kick off the group commit
2925 */
2926 jfs_flush_journal(log, 0);
2927}
2928
2929/*
2930 * txResume()
2931 *
2932 * Allows transactions to start again following txQuiesce
2933 */
2934void txResume(struct super_block *sb)
2935{
2936 struct jfs_log *log = JFS_SBI(sb)->log;
2937
2938 clear_bit(log_QUIESCE, &log->flag);
2939 TXN_WAKEUP(&log->syncwait);
2940}
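/*
 * (txQuiesce()/txResume() bracket operations that need a stable
 * on-disk image; volume resize in resize.c is the expected caller.)
 */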
2941
2942/*
2943 * jfs_sync(void)
2944 *
2945 * To be run as a kernel daemon. This is awakened when tlocks run low.
2946 * We write any inodes that have anonymous tlocks so they will become
2947 * available.
2948 */
2949int jfs_sync(void *arg)
2950{
2951 struct inode *ip;
2952 struct jfs_inode_info *jfs_ip;
2953 int rc;
2954 tid_t tid;
2955
2956 daemonize("jfsSync");
2957
2958 complete(&jfsIOwait);
2959
2960 do {
2961 /*
2962 * write each inode on the anonymous inode list
2963 */
2964 TXN_LOCK();
2965 while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) {
2966 jfs_ip = list_entry(TxAnchor.anon_list.next,
2967 struct jfs_inode_info,
2968 anon_inode_list);
2969 ip = &jfs_ip->vfs_inode;
2970
2971 if (! igrab(ip)) {
2972 /*
2973 * Inode is being freed
2974 */
2975 list_del_init(&jfs_ip->anon_inode_list);
2976 } else if (! down_trylock(&jfs_ip->commit_sem)) {
2977 /*
2978 * inode will be removed from anonymous list
2979 * when it is committed
2980 */
2981 TXN_UNLOCK();
2982 tid = txBegin(ip->i_sb, COMMIT_INODE);
2983 rc = txCommit(tid, 1, &ip, 0);
2984 txEnd(tid);
2985 up(&jfs_ip->commit_sem);
2986
2987 iput(ip);
2988 /*
2989 * Just to be safe. I don't know how
2990 * long we can run without blocking
2991 */
2992 cond_resched();
2993 TXN_LOCK();
2994 } else {
2995 /* We can't get the commit semaphore. It may
2996 * be held by a thread waiting for tlock's
2997 * so let's not block here. Save it to
2998 * put back on the anon_list.
2999 */
3000
3001 /* Take off anon_list */
3002 list_del(&jfs_ip->anon_inode_list);
3003
3004 /* Put on anon_list2 */
3005 list_add(&jfs_ip->anon_inode_list,
3006 &TxAnchor.anon_list2);
3007
3008 TXN_UNLOCK();
3009 iput(ip);
3010 TXN_LOCK();
3011 }
3012 }
3013 /* Add anon_list2 back to anon_list */
3014 list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
3015
3016 if (current->flags & PF_FREEZE) {
3017 TXN_UNLOCK();
3018 refrigerator(PF_FREEZE);
3019 } else {
3020 DECLARE_WAITQUEUE(wq, current);
3021
3022 add_wait_queue(&jfs_sync_thread_wait, &wq);
3023 set_current_state(TASK_INTERRUPTIBLE);
3024 TXN_UNLOCK();
3025 schedule();
3026 current->state = TASK_RUNNING;
3027 remove_wait_queue(&jfs_sync_thread_wait, &wq);
3028 }
3029 } while (!jfs_stop_threads);
3030
3031 jfs_info("jfs_sync being killed");
3032 complete_and_exit(&jfsIOwait, 0);
3033}
3034
3035#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
3036int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
3037 int *eof, void *data)
3038{
3039 int len = 0;
3040 off_t begin;
3041 char *freewait;
3042 char *freelockwait;
3043 char *lowlockwait;
3044
3045 freewait =
3046 waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
3047 freelockwait =
3048 waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
3049 lowlockwait =
3050 waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
3051
3052 len += sprintf(buffer,
3053 "JFS TxAnchor\n"
3054 "============\n"
3055 "freetid = %d\n"
3056 "freewait = %s\n"
3057 "freelock = %d\n"
3058 "freelockwait = %s\n"
3059 "lowlockwait = %s\n"
3060 "tlocksInUse = %d\n"
3061 "jfs_tlocks_low = %d\n"
3062 "unlock_queue is %sempty\n",
3063 TxAnchor.freetid,
3064 freewait,
3065 TxAnchor.freelock,
3066 freelockwait,
3067 lowlockwait,
3068 TxAnchor.tlocksInUse,
3069 jfs_tlocks_low,
3070 list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
3071
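	/*
	 * (Boilerplate for the old single-buffer read_proc interface:
	 * expose only the bytes past 'offset' and flag *eof once the
	 * whole report fits in one read.)
	 */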
3072 begin = offset;
3073 *start = buffer + begin;
3074 len -= begin;
3075
3076 if (len > length)
3077 len = length;
3078 else
3079 *eof = 1;
3080
3081 if (len < 0)
3082 len = 0;
3083
3084 return len;
3085}
3086#endif
3087
3088#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
3089int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
3090 int *eof, void *data)
3091{
3092 int len = 0;
3093 off_t begin;
3094
3095 len += sprintf(buffer,
3096 "JFS TxStats\n"
3097 "===========\n"
3098 "calls to txBegin = %d\n"
3099 "txBegin blocked by sync barrier = %d\n"
3100 "txBegin blocked by tlocks low = %d\n"
3101 "txBegin blocked by no free tid = %d\n"
3102 "calls to txBeginAnon = %d\n"
3103 "txBeginAnon blocked by sync barrier = %d\n"
3104 "txBeginAnon blocked by tlocks low = %d\n"
3105 "calls to txLockAlloc = %d\n"
3106 "tLockAlloc blocked by no free lock = %d\n",
3107 TxStat.txBegin,
3108 TxStat.txBegin_barrier,
3109 TxStat.txBegin_lockslow,
3110 TxStat.txBegin_freetid,
3111 TxStat.txBeginAnon,
3112 TxStat.txBeginAnon_barrier,
3113 TxStat.txBeginAnon_lockslow,
3114 TxStat.txLockAlloc,
3115 TxStat.txLockAlloc_freelock);
3116
3117 begin = offset;
3118 *start = buffer + begin;
3119 len -= begin;
3120
3121 if (len > length)
3122 len = length;
3123 else
3124 *eof = 1;
3125
3126 if (len < 0)
3127 len = 0;
3128
3129 return len;
3130}
3131#endif
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
new file mode 100644
index 000000000000..b71b82c2df04
--- /dev/null
+++ b/fs/jfs/jfs_txnmgr.h
@@ -0,0 +1,318 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_TXNMGR
19#define _H_JFS_TXNMGR
20
21#include "jfs_logmgr.h"
22
23/*
24 * Hide implementation of TxBlock and TxLock
25 */
26#define tid_to_tblock(tid) (&TxBlock[tid])
27
28#define lid_to_tlock(lid) (&TxLock[lid])
29
30/*
31 * transaction block
32 */
33struct tblock {
34 /*
35 * tblock and jbuf_t common area: struct logsyncblk
36 *
37 * the following 5 fields are the same as struct logsyncblk
38 * which is common to tblock and jbuf to form logsynclist
39 */
40 u16 xflag; /* tx commit type */
41 u16 flag; /* tx commit state */
42 lid_t dummy; /* Must keep structures common */
43 s32 lsn; /* recovery lsn */
44 struct list_head synclist; /* logsynclist link */
45
46 /* lock management */
47 struct super_block *sb; /* super block */
48 lid_t next; /* index of first tlock of tid */
49 lid_t last; /* index of last tlock of tid */
50 wait_queue_head_t waitor; /* tids waiting on this tid */
51
52 /* log management */
53 u32 logtid; /* log transaction id */
54
55 /* commit management */
56 struct list_head cqueue; /* commit queue list */
57 s32 clsn; /* commit lsn */
58 struct lbuf *bp;
59 s32 pn; /* commit record log page number */
60 s32 eor; /* commit record eor */
61 wait_queue_head_t gcwait; /* group commit event list:
62 * ready transactions wait on this
63 * event for group commit completion.
64 */
65 union {
66 struct inode *ip; /* inode being deleted */
67 pxd_t ixpxd; /* pxd of inode extent for created inode */
68 } u;
69 u32 ino; /* inode number being created */
70};
71
72extern struct tblock *TxBlock; /* transaction block table */
73
74/* commit flags: tblk->xflag */
75#define COMMIT_SYNC 0x0001 /* synchronous commit */
76#define COMMIT_FORCE 0x0002 /* force pageout at end of commit */
77#define COMMIT_FLUSH 0x0004 /* init flush at end of commit */
78#define COMMIT_MAP 0x00f0
79#define COMMIT_PMAP 0x0010 /* update pmap */
80#define COMMIT_WMAP 0x0020 /* update wmap */
81#define COMMIT_PWMAP 0x0040 /* update pwmap */
82#define COMMIT_FREE 0x0f00
83#define COMMIT_DELETE 0x0100 /* inode delete */
84#define COMMIT_TRUNCATE 0x0200 /* file truncation */
85#define COMMIT_CREATE 0x0400 /* inode create */
86#define COMMIT_LAZY 0x0800 /* lazy commit */
87#define COMMIT_PAGE 0x1000 /* Identifies element as metapage */
88#define COMMIT_INODE 0x2000 /* Identifies element as inode */
89
90/* group commit flags tblk->flag: see jfs_logmgr.h */
91
92/*
93 * transaction lock
94 */
95struct tlock {
96 lid_t next; /* 2: index next lockword on tid locklist
97 * next lockword on freelist
98 */
99 tid_t tid; /* 2: transaction id holding lock */
100
101 u16 flag; /* 2: lock control */
102 u16 type; /* 2: log type */
103
104 struct metapage *mp; /* 4/8: object page buffer locked */
105 struct inode *ip; /* 4/8: object */
106 /* (16) */
107
108 s16 lock[24]; /* 48: overlay area */
109}; /* (64) */
110
111extern struct tlock *TxLock; /* transaction lock table */
112
113/*
114 * tlock flag
115 */
116/* txLock state */
117#define tlckPAGELOCK 0x8000
118#define tlckINODELOCK 0x4000
119#define tlckLINELOCK 0x2000
120#define tlckINLINELOCK 0x1000
121/* lmLog state */
122#define tlckLOG 0x0800
123/* updateMap state */
124#define tlckUPDATEMAP 0x0080
125/* freeLock state */
126#define tlckFREELOCK 0x0008
127#define tlckWRITEPAGE 0x0004
128#define tlckFREEPAGE 0x0002
129
130/*
131 * tlock type
132 */
133#define tlckTYPE 0xfe00
134#define tlckINODE 0x8000
135#define tlckXTREE 0x4000
136#define tlckDTREE 0x2000
137#define tlckMAP 0x1000
138#define tlckEA 0x0800
139#define tlckACL 0x0400
140#define tlckDATA 0x0200
141#define tlckBTROOT 0x0100
142
143#define tlckOPERATION 0x00ff
144#define tlckGROW 0x0001 /* file grow */
145#define tlckREMOVE 0x0002 /* file delete */
146#define tlckTRUNCATE 0x0004 /* file truncate */
147#define tlckRELOCATE 0x0008 /* file/directory relocate */
148#define tlckENTRY 0x0001 /* directory insert/delete */
149#define tlckEXTEND 0x0002 /* directory extend in-line */
150#define tlckSPLIT		0x0010	/* split page */
151#define tlckNEW 0x0020 /* new page from split */
152#define tlckFREE 0x0040 /* free page */
153#define tlckRELINK 0x0080 /* update sibling pointer */
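/*
 * (Operation bits are interpreted relative to the tlock type above:
 * e.g. 0x0001 is tlckGROW for xtree tlocks but tlckENTRY for dtree
 * tlocks.)
 */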
154
155/*
156 * linelock for lmLog()
157 *
158 * note: linelock and its variations are overlaid
159 * at tlock.lock: watch for alignment;
160 */
161struct lv {
162 u8 offset; /* 1: */
163 u8 length; /* 1: */
164}; /* (2) */
165
166#define TLOCKSHORT 20
167#define TLOCKLONG 28
168
169struct linelock {
170 lid_t next; /* 2: next linelock */
171
172 s8 maxcnt; /* 1: */
173 s8 index; /* 1: */
174
175 u16 flag; /* 2: */
176 u8 type; /* 1: */
177 u8 l2linesize; /* 1: log2 of linesize */
178 /* (8) */
179
180 struct lv lv[20]; /* 40: */
181}; /* (48) */
182
183#define dt_lock linelock
184
185struct xtlock {
186 lid_t next; /* 2: */
187
188 s8 maxcnt; /* 1: */
189 s8 index; /* 1: */
190
191 u16 flag; /* 2: */
192 u8 type; /* 1: */
193 u8 l2linesize; /* 1: log2 of linesize */
194 /* (8) */
195
196 struct lv header; /* 2: */
197 struct lv lwm; /* 2: low water mark */
198 struct lv hwm; /* 2: high water mark */
199 struct lv twm; /* 2: */
200 /* (16) */
201
202 s32 pxdlock[8]; /* 32: */
203}; /* (48) */
204
205
206/*
207 * maplock for txUpdateMap()
208 *
209 * note: maplock and its variations are overlaid
210 * at tlock.lock/linelock: watch for alignment;
211 * N.B. next field may be set by linelock, and should not
212 * be modified by maplock;
213 * N.B. index of the first pxdlock specifies index of next
214 *	free maplock (i.e., number of maplocks) in the tlock;
215 */
216struct maplock {
217 lid_t next; /* 2: */
218
219	u8	maxcnt;		/* 1: */
220	u8	index;		/* 1: next free maplock index */
221
222 u16 flag; /* 2: */
223 u8 type; /* 1: */
224 u8 count; /* 1: number of pxd/xad */
225 /* (8) */
226
227 pxd_t pxd; /* 8: */
228}; /* (16): */
229
230/* maplock flag */
231#define mlckALLOC 0x00f0
232#define mlckALLOCXADLIST 0x0080
233#define mlckALLOCPXDLIST 0x0040
234#define mlckALLOCXAD 0x0020
235#define mlckALLOCPXD 0x0010
236#define mlckFREE 0x000f
237#define mlckFREEXADLIST 0x0008
238#define mlckFREEPXDLIST 0x0004
239#define mlckFREEXAD 0x0002
240#define mlckFREEPXD 0x0001
241
242#define pxd_lock maplock
243
244struct xdlistlock {
245 lid_t next; /* 2: */
246
247	u8	maxcnt;		/* 1: */
248	u8	index;		/* 1: */
249
250 u16 flag; /* 2: */
251 u8 type; /* 1: */
252 u8 count; /* 1: number of pxd/xad */
253 /* (8) */
254
255 /*
256 * We need xdlist to be 64 bits (8 bytes), regardless of
257 * whether void * is 32 or 64 bits
258 */
259 union {
260 void *_xdlist; /* pxd/xad list */
261 s64 pad; /* 8: Force 64-bit xdlist size */
262 } union64;
263}; /* (16): */
264
265#define xdlist union64._xdlist
266
267/*
268 * commit
269 *
270 * parameter to the commit manager routines
271 */
272struct commit {
273 tid_t tid; /* tid = index of tblock */
274 int flag; /* flags */
275 struct jfs_log *log; /* log */
276 struct super_block *sb; /* superblock */
277
278 int nip; /* number of entries in iplist */
279 struct inode **iplist; /* list of pointers to inodes */
280
281 /* log record descriptor on 64-bit boundary */
282 struct lrd lrd; /* : log record descriptor */
283};
284
285/*
286 * external declarations
287 */
288extern struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage *mp,
289 int flag);
290
291extern struct tlock *txMaplock(tid_t tid, struct inode *ip, int flag);
292
293extern int txCommit(tid_t tid, int nip, struct inode **iplist, int flag);
294
295extern tid_t txBegin(struct super_block *sb, int flag);
296
297extern void txBeginAnon(struct super_block *sb);
298
299extern void txEnd(tid_t tid);
300
301extern void txAbort(tid_t tid, int dirty);
302
303extern struct linelock *txLinelock(struct linelock * tlock);
304
305extern void txFreeMap(struct inode *ip, struct maplock * maplock,
306 struct tblock * tblk, int maptype);
307
308extern void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea);
309
310extern void txFreelock(struct inode *ip);
311
312extern int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
313 struct tlock * tlck);
314
315extern void txQuiesce(struct super_block *sb);
316
317extern void txResume(struct super_block *sb);
318#endif /* _H_JFS_TXNMGR */
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
new file mode 100644
index 000000000000..5bfad39a2078
--- /dev/null
+++ b/fs/jfs/jfs_types.h
@@ -0,0 +1,192 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_TYPES
19#define _H_JFS_TYPES
20
21/*
22 * jfs_types.h:
23 *
24 * basic type/utility definitions
25 *
26 * note: this header file must be the 1st include file
27 * of JFS include list in all JFS .c file.
28 */
29
30#include <linux/types.h>
31#include <linux/nls.h>
32
33#include "endian24.h"
34
35/*
36 * transaction and lock id's
37 *
38 * Don't change these without carefully considering the impact on the
39 * size and alignment of all of the linelock variants
40 */
41typedef u16 tid_t;
42typedef u16 lid_t;
43
44/*
45 * Almost identical to Linux's timespec, but not quite
46 */
47struct timestruc_t {
48 __le32 tv_sec;
49 __le32 tv_nsec;
50};
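/*
 * (The __le32 fields reflect that these timestamps live on disk in
 * little-endian form, unlike the in-memory kernel timespec.)
 */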
51
52/*
53 * handy
54 */
55
56#define LEFTMOSTONE 0x80000000
57#define HIGHORDER 0x80000000u /* high order bit on */
58#define ONES		0xffffffffu	/* all bits on */
59
60typedef int boolean_t;
61#define TRUE 1
62#define FALSE 0
63
64/*
65 * logical xd (lxd)
66 */
67typedef struct {
68 unsigned len:24;
69 unsigned off1:8;
70 u32 off2;
71} lxd_t;
72
73/* lxd_t field construction */
74#define LXDlength(lxd, length32) ( (lxd)->len = length32 )
75#define LXDoffset(lxd, offset64)\
76{\
77 (lxd)->off1 = ((s64)offset64) >> 32;\
78 (lxd)->off2 = (offset64) & 0xffffffff;\
79}
80
81/* lxd_t field extraction */
82#define lengthLXD(lxd) ( (lxd)->len )
83#define offsetLXD(lxd)\
84 ( ((s64)((lxd)->off1)) << 32 | (lxd)->off2 )
85
86/* lxd list */
87struct lxdlist {
88 s16 maxnlxd;
89 s16 nlxd;
90 lxd_t *lxd;
91};
92
93/*
94 * physical xd (pxd)
95 */
96typedef struct {
97 unsigned len:24;
98 unsigned addr1:8;
99 __le32 addr2;
100} pxd_t;
101
102/* xd_t field construction */
103
104#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32))
105#define PXDaddress(pxd, address64)\
106{\
107 (pxd)->addr1 = ((s64)address64) >> 32;\
108 (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
109}
110
111/* xd_t field extraction */
112#define lengthPXD(pxd) __le24_to_cpu((pxd)->len)
113#define addressPXD(pxd)\
114 ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2))
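/*
 * (Example: PXDaddress(pxd, 0x123456789aLL) stores addr1 = 0x12 and
 * addr2 = cpu_to_le32(0x3456789a); addressPXD() reassembles the
 * 40-bit block address.)
 */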
115
116#define MAXTREEHEIGHT 8
117/* pxd list */
118struct pxdlist {
119 s16 maxnpxd;
120 s16 npxd;
121 pxd_t pxd[MAXTREEHEIGHT];
122};
123
124
125/*
126 * data extent descriptor (dxd)
127 */
128typedef struct {
129 unsigned flag:8; /* 1: flags */
130 unsigned rsrvd:24;
131 __le32 size; /* 4: size in byte */
132 unsigned len:24; /* 3: length in unit of fsblksize */
133 unsigned addr1:8; /* 1: address in unit of fsblksize */
134 __le32 addr2; /* 4: address in unit of fsblksize */
135} dxd_t; /* - 16 - */
136
137/* dxd_t flags */
138#define DXD_INDEX 0x80 /* B+-tree index */
139#define DXD_INLINE 0x40 /* in-line data extent */
140#define DXD_EXTENT 0x20 /* out-of-line single extent */
141#define DXD_FILE 0x10 /* out-of-line file (inode) */
142#define DXD_CORRUPT 0x08 /* Inconsistency detected */
143
144/* dxd_t field construction
145 * Conveniently, the PXD macros work for DXD
146 */
147#define DXDlength PXDlength
148#define DXDaddress PXDaddress
149#define lengthDXD lengthPXD
150#define addressDXD addressPXD
151#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
152#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
153
154/*
155 * directory entry argument
156 */
157struct component_name {
158 int namlen;
159 wchar_t *name;
160};
161
162
163/*
164 * DASD limit information - stored in directory inode
165 */
166struct dasd {
167 u8 thresh; /* Alert Threshold (in percent) */
168 u8 delta; /* Alert Threshold delta (in percent) */
169 u8 rsrvd1;
170 u8 limit_hi; /* DASD limit (in logical blocks) */
171 __le32 limit_lo; /* DASD limit (in logical blocks) */
172 u8 rsrvd2[3];
173 u8 used_hi; /* DASD usage (in logical blocks) */
174 __le32 used_lo; /* DASD usage (in logical blocks) */
175};
176
177#define DASDLIMIT(dasdp) \
178 (((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo))
179#define setDASDLIMIT(dasdp, limit)\
180{\
181 (dasdp)->limit_hi = ((u64)limit) >> 32;\
182 (dasdp)->limit_lo = __cpu_to_le32(limit);\
183}
184#define DASDUSED(dasdp) \
185 (((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo))
186#define setDASDUSED(dasdp, used)\
187{\
188 (dasdp)->used_hi = ((u64)used) >> 32;\
189 (dasdp)->used_lo = __cpu_to_le32(used);\
190}
191
192#endif /* !_H_JFS_TYPES */
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
new file mode 100644
index 000000000000..f31a9e3f3fec
--- /dev/null
+++ b/fs/jfs/jfs_umount.c
@@ -0,0 +1,178 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19/*
20 * jfs_umount.c
21 *
22 * note: file system in transition to aggregate/fileset:
23 * (ref. jfs_mount.c)
24 *
25 * file system unmount is interpreted as unmount of the single/only
26 * fileset in the aggregate and, if unmount of the last fileset,
27 * as unmount of the aggregate;
28 */
29
30#include <linux/fs.h>
31#include "jfs_incore.h"
32#include "jfs_filsys.h"
33#include "jfs_superblock.h"
34#include "jfs_dmap.h"
35#include "jfs_imap.h"
36#include "jfs_metapage.h"
37#include "jfs_debug.h"
38
39/*
40 * NAME: jfs_umount(sb)
41 *
42 * FUNCTION: unmount the file system mounted on the given
43 * superblock: close the allocation maps and the log,
44 * and mark the on-disk superblock clean
45 *
46 * PARAMETERS: sb - superblock of the file system to unmount
47 *
48 * RETURN : EBUSY - device has open files
49 */
50int jfs_umount(struct super_block *sb)
51{
52 struct address_space *bdev_mapping = sb->s_bdev->bd_inode->i_mapping;
53 struct jfs_sb_info *sbi = JFS_SBI(sb);
54 struct inode *ipbmap = sbi->ipbmap;
55 struct inode *ipimap = sbi->ipimap;
56 struct inode *ipaimap = sbi->ipaimap;
57 struct inode *ipaimap2 = sbi->ipaimap2;
58 struct jfs_log *log;
59 int rc = 0;
60
61 jfs_info("UnMount JFS: sb:0x%p", sb);
62
63 /*
64 * update superblock and close log
65 *
66 * if mounted read-write and log based recovery was enabled
67 */
68 if ((log = sbi->log))
69 /*
70 * Wait for outstanding transactions to be written to log:
71 */
72 jfs_flush_journal(log, 2);
73
74 /*
75 * close fileset inode allocation map (aka fileset inode)
76 */
77 diUnmount(ipimap, 0);
78
79 diFreeSpecial(ipimap);
80 sbi->ipimap = NULL;
81
82 /*
83 * close secondary aggregate inode allocation map
84 */
85 ipaimap2 = sbi->ipaimap2;
86 if (ipaimap2) {
87 diUnmount(ipaimap2, 0);
88 diFreeSpecial(ipaimap2);
89 sbi->ipaimap2 = NULL;
90 }
91
92 /*
93 * close aggregate inode allocation map
94 */
95 ipaimap = sbi->ipaimap;
96 diUnmount(ipaimap, 0);
97 diFreeSpecial(ipaimap);
98 sbi->ipaimap = NULL;
99
100 /*
101 * close aggregate block allocation map
102 */
103 dbUnmount(ipbmap, 0);
104
105 diFreeSpecial(ipbmap);
106 sbi->ipbmap = NULL;
107
108 /*
109 * Make sure all metadata makes it to disk before we mark
110 * the superblock as clean
111 */
112 filemap_fdatawrite(bdev_mapping);
113 filemap_fdatawait(bdev_mapping);
114
115 /*
116 * ensure all file system file pages are propagated to their
117 * home blocks on disk (and their in-memory buffer pages are
118 * invalidated) BEFORE updating file system superblock state
119 * (to signify file system is unmounted cleanly, and thus in
120 * consistent state) and log superblock active file system
121 * list (to signify skip logredo()).
122 */
123 if (log) { /* log = NULL if read-only mount */
124 updateSuper(sb, FM_CLEAN);
125
126 /* Restore default gfp_mask for bdev */
127 mapping_set_gfp_mask(bdev_mapping, GFP_USER);
128
129 /*
130 * close log:
131 *
132 * remove file system from log active file system list.
133 */
134 rc = lmLogClose(sb);
135 }
136 jfs_info("UnMount JFS Complete: rc = %d", rc);
137 return rc;
138}
139
140
141int jfs_umount_rw(struct super_block *sb)
142{
143 struct address_space *bdev_mapping = sb->s_bdev->bd_inode->i_mapping;
144 struct jfs_sb_info *sbi = JFS_SBI(sb);
145 struct jfs_log *log = sbi->log;
146
147 if (!log)
148 return 0;
149
150 /*
151 * close log:
152 *
153 * remove file system from log active file system list.
154 */
155 jfs_flush_journal(log, 2);
156
157 /*
158 * Make sure all metadata makes it to disk
159 */
160 dbSync(sbi->ipbmap);
161 diSync(sbi->ipimap);
162
163 /*
164 * Note that we have to do this even if sync_blockdev() will
165 * do exactly the same a few instructions later: We can't
166 * mark the superblock clean before everything is flushed to
167 * disk.
168 */
169 filemap_fdatawrite(bdev_mapping);
170 filemap_fdatawait(bdev_mapping);
171
172 updateSuper(sb, FM_CLEAN);
173
174 /* Restore default gfp_mask for bdev */
175 mapping_set_gfp_mask(bdev_mapping, GFP_USER);
176
177 return lmLogClose(sb);
178}
diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c
new file mode 100644
index 000000000000..b32208aad550
--- /dev/null
+++ b/fs/jfs/jfs_unicode.c
@@ -0,0 +1,137 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/fs.h>
20#include <linux/slab.h>
21#include "jfs_incore.h"
22#include "jfs_filsys.h"
23#include "jfs_unicode.h"
24#include "jfs_debug.h"
25
26/*
27 * NAME: jfs_strfromUCS()
28 *
29 * FUNCTION: Convert little-endian unicode string to character string
30 *
31 */
32int jfs_strfromUCS_le(char *to, const __le16 * from,
33 int len, struct nls_table *codepage)
34{
35 int i;
36 int outlen = 0;
37 static int warn_again = 5; /* Only warn up to 5 times total */
38 int warn = !!warn_again; /* once per string */
39
40 if (codepage) {
41 for (i = 0; (i < len) && from[i]; i++) {
42 int charlen;
43 charlen =
44 codepage->uni2char(le16_to_cpu(from[i]),
45 &to[outlen],
46 NLS_MAX_CHARSET_SIZE);
47 if (charlen > 0)
48 outlen += charlen;
49 else
50 to[outlen++] = '?';
51 }
52 } else {
53 for (i = 0; (i < len) && from[i]; i++) {
54 if (le16_to_cpu(from[i]) & 0xff00) {
55 if (warn) {
56 warn--;
57 warn_again--;
58 printk(KERN_ERR
59 "non-latin1 character 0x%x found in JFS file name\n",
60 le16_to_cpu(from[i]));
61 printk(KERN_ERR
62 "mount with iocharset=utf8 to access\n");
63 }
64 to[i] = '?';
65 }
66 else
67 to[i] = (char) (le16_to_cpu(from[i]));
68 }
69 outlen = i;
70 }
71 to[outlen] = 0;
72 return outlen;
73}
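
The no-codepage branch above simply narrows each 16-bit value to latin1 and degrades anything wider to '?'. A user-space analogue of just that fallback (host-endian input, no printk rate limiting; a sketch, not the kernel path):

#include <stdint.h>
#include <stdio.h>

static int ucs_to_latin1(char *to, const uint16_t *from, int len)
{
	int i;

	for (i = 0; i < len && from[i]; i++)
		to[i] = (from[i] & 0xff00) ? '?' : (char)from[i];
	to[i] = 0;
	return i;
}

int main(void)
{
	/* 'f', 'o', e-acute (fits latin1), Arabic sheen (does not) */
	uint16_t name[] = { 'f', 'o', 0x00e9, 0x0634, 0 };
	char out[8];

	ucs_to_latin1(out, name, 5);
	printf("%s\n", out);    /* the sheen comes out as '?' */
	return 0;
}
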
74
75/*
76 * NAME: jfs_strtoUCS()
77 *
78 * FUNCTION: Convert character string to unicode string
79 *
80 */
81static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len,
82 struct nls_table *codepage)
83{
84 int charlen;
85 int i;
86
87 if (codepage) {
88 for (i = 0; len && *from; i++, from += charlen, len -= charlen)
89 {
90 charlen = codepage->char2uni(from, len, &to[i]);
91 if (charlen < 1) {
92 jfs_err("jfs_strtoUCS: char2uni returned %d.",
93 charlen);
94 jfs_err("charset = %s, char = 0x%x",
95 codepage->charset, *from);
96 return charlen;
97 }
98 }
99 } else {
100 for (i = 0; (i < len) && from[i]; i++)
101 to[i] = (wchar_t) from[i];
102 }
103
104 to[i] = 0;
105 return i;
106}
107
108/*
109 * NAME: get_UCSname()
110 *
111 * FUNCTION: Allocate and translate to unicode string
112 *
113 */
114int get_UCSname(struct component_name * uniName, struct dentry *dentry)
115{
116 struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab;
117 int length = dentry->d_name.len;
118
119 if (length > JFS_NAME_MAX)
120 return -ENAMETOOLONG;
121
122 uniName->name =
123 kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS);
124
125 if (uniName->name == NULL)
126 return -ENOMEM;
127
128 uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name,
129 length, nls_tab);
130
131 if (uniName->namlen < 0) {
132 kfree(uniName->name);
133 return uniName->namlen;
134 }
135
136 return 0;
137}
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
new file mode 100644
index 000000000000..69e25ebe87ac
--- /dev/null
+++ b/fs/jfs/jfs_unicode.h
@@ -0,0 +1,155 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 * Portions Copyright (c) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19#ifndef _H_JFS_UNICODE
20#define _H_JFS_UNICODE
21
22#include <asm/byteorder.h>
23#include "jfs_types.h"
24
25typedef struct {
26 wchar_t start;
27 wchar_t end;
28 signed char *table;
29} UNICASERANGE;
30
31extern signed char UniUpperTable[512];
32extern UNICASERANGE UniUpperRange[];
33extern int get_UCSname(struct component_name *, struct dentry *);
34extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *);
35
36#define free_UCSname(COMP) kfree((COMP)->name)
37
38/*
39 * UniStrcpy: Copy a string
40 */
41static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2)
42{
43 wchar_t *anchor = ucs1; /* save the start of result string */
44
45 while ((*ucs1++ = *ucs2++));
46 return anchor;
47}
48
49
50
51/*
52 * UniStrncpy: Copy length limited string with pad
53 */
54static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2,
55 size_t n)
56{
57 __le16 *anchor = ucs1;
58
59 while (n-- && *ucs2) /* Copy the strings */
60 *ucs1++ = *ucs2++;
61
62 n++;
63 while (n--) /* Pad with nulls */
64 *ucs1++ = 0;
65 return anchor;
66}
67
68/*
69 * UniStrncmp_le: Compare length limited string - native to little-endian
70 */
71static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2,
72 size_t n)
73{
74 if (!n)
75 return 0; /* Null strings are equal */
76 while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) {
77 ucs1++;
78 ucs2++;
79 }
80 return (int) *ucs1 - (int) __le16_to_cpu(*ucs2);
81}
82
83/*
84 * UniStrncpy_to_le: Copy length limited string with pad to little-endian
85 */
86static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2,
87 size_t n)
88{
89 __le16 *anchor = ucs1;
90
91 while (n-- && *ucs2) /* Copy the strings */
92 *ucs1++ = cpu_to_le16(*ucs2++);
93
94 n++;
95 while (n--) /* Pad with nulls */
96 *ucs1++ = 0;
97 return anchor;
98}
99
100/*
101 * UniStrncpy_from_le: Copy length limited string with pad from little-endian
102 */
103static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2,
104 size_t n)
105{
106 wchar_t *anchor = ucs1;
107
108 while (n-- && *ucs2) /* Copy the strings */
109 *ucs1++ = __le16_to_cpu(*ucs2++);
110
111 n++;
112 while (n--) /* Pad with nulls */
113 *ucs1++ = 0;
114 return anchor;
115}
116
117/*
118 * UniToupper: Convert a unicode character to upper case
119 */
120static inline wchar_t UniToupper(wchar_t uc)
121{
122 UNICASERANGE *rp;
123
124 if (uc < sizeof(UniUpperTable)) { /* Latin characters */
125 return uc + UniUpperTable[uc]; /* Use base tables */
126 } else {
127 rp = UniUpperRange; /* Use range tables */
128 while (rp->start) {
129 if (uc < rp->start) /* Before start of range */
130 return uc; /* Uppercase = input */
131 if (uc <= rp->end) /* In range */
132 return uc + rp->table[uc - rp->start];
133 rp++; /* Try next range */
134 }
135 }
136 return uc; /* Past last range */
137}
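
The Latin fast path indexes UniUpperTable directly: the table holds a signed delta to add to the character, with 0 meaning already upper case or caseless. A user-space sketch of the same delta scheme over just the ASCII letters (a hypothetical table, not the kernel's):

#include <stdio.h>

static signed char upper_delta[128];     /* 0 = no change */

int main(void)
{
	/* 'a'..'z' carry the same -32 delta as UniUpperTable above */
	for (int c = 'a'; c <= 'z'; c++)
		upper_delta[c] = -32;

	char in = 'j';
	printf("%c -> %c\n", in, in + upper_delta[(unsigned char)in]);
	/* prints: j -> J */
	return 0;
}
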
138
139
140/*
141 * UniStrupr: Upper case a unicode string
142 */
143static inline wchar_t *UniStrupr(wchar_t * upin)
144{
145 wchar_t *up;
146
147 up = upin;
148 while (*up) { /* For all characters */
149 *up = UniToupper(*up);
150 up++;
151 }
152 return upin; /* Return input pointer */
153}
154
155#endif /* !_H_JFS_UNICODE */
diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c
new file mode 100644
index 000000000000..4ab185d26308
--- /dev/null
+++ b/fs/jfs/jfs_uniupr.c
@@ -0,0 +1,134 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/fs.h>
20#include "jfs_unicode.h"
21
22/*
23 * Latin upper case
24 */
25signed char UniUpperTable[512] = {
26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
29 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */
30 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */
31 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */
32 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */
33 -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */
34 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */
35 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */
36 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */
37 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */
38 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */
39 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */
40 -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */
41 -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */
42 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */
43 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */
44 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */
45 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */
46 -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */
47 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */
48 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */
49 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */
50 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */
51 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */
52 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */
53 -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */
54 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */
55 -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */
56 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */
57 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */
58};
59
60/* Upper case range - Greek */
61static signed char UniCaseRangeU03a0[47] = {
62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */
63 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */
64 -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63,
65};
66
67/* Upper case range - Cyrillic */
68static signed char UniCaseRangeU0430[48] = {
69 -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */
70 -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */
71 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */
72};
73
74/* Upper case range - Extended cyrillic */
75static signed char UniCaseRangeU0490[61] = {
76 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */
77 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */
78 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */
79 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1,
80};
81
82/* Upper case range - Extended latin and greek */
83static signed char UniCaseRangeU1e00[509] = {
84 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */
85 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */
86 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */
87 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */
88 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */
89 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */
90 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */
91 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */
92 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */
93 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */
94 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */
95 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */
96 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */
97 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */
98 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */
99 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */
100 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */
101 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */
102 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */
103 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */
104 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */
105 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */
106 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */
107 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */
108 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */
109 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */
110 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */
111 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */
112 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */
113 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */
114 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */
115 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116};
117
118/* Upper case range - Wide latin */
119static signed char UniCaseRangeUff40[27] = {
120 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */
121 -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,
122};
123
124/*
125 * Upper Case Range
126 */
127UNICASERANGE UniUpperRange[] = {
128 { 0x03a0, 0x03ce, UniCaseRangeU03a0 },
129 { 0x0430, 0x045f, UniCaseRangeU0430 },
130 { 0x0490, 0x04cc, UniCaseRangeU0490 },
131 { 0x1e00, 0x1ffc, UniCaseRangeU1e00 },
132 { 0xff40, 0xff5a, UniCaseRangeUff40 },
133 { 0 }
134};
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
new file mode 100644
index 000000000000..a1052f3f0bee
--- /dev/null
+++ b/fs/jfs/jfs_xattr.h
@@ -0,0 +1,64 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef H_JFS_XATTR
20#define H_JFS_XATTR
21
22/*
23 * jfs_ea_list describe the on-disk format of the extended attributes.
24 * I know the null-terminator is redundant since namelen is stored, but
25 * I am maintaining compatibility with OS/2 where possible.
26 */
27struct jfs_ea {
28 u8 flag; /* Unused? */
29 u8 namelen; /* Length of name */
30 __le16 valuelen; /* Length of value */
31 char name[0]; /* Attribute name (includes null-terminator) */
32}; /* Value immediately follows name */
33
34struct jfs_ea_list {
35 __le32 size; /* overall size */
36 struct jfs_ea ea[0]; /* Variable length list */
37};
38
39/* Macros for defining maximum number of bytes supported for EAs */
40#define MAXEASIZE 65535
41#define MAXEALISTSIZE MAXEASIZE
42
43/*
44 * some macros for dealing with variable length EA lists.
45 */
46#define EA_SIZE(ea) \
47 (sizeof (struct jfs_ea) + (ea)->namelen + 1 + \
48 le16_to_cpu((ea)->valuelen))
49#define NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea))))
50#define FIRST_EA(ealist) ((ealist)->ea)
51#define EALIST_SIZE(ealist) le32_to_cpu((ealist)->size)
52#define END_EALIST(ealist) \
53 ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
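
A walk over a variable-length EA list is a matter of stepping FIRST_EA to NEXT_EA until END_EALIST. A self-contained user-space sketch of that traversal, with the byteorder helpers stubbed for a little-endian host (an assumption) and one hand-built entry:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define le16_to_cpu(x) (x)      /* little-endian host assumed */
#define le32_to_cpu(x) (x)

struct jfs_ea {
	uint8_t  flag;
	uint8_t  namelen;
	uint16_t valuelen;
	char     name[];        /* name, NUL terminator, then value */
};

struct jfs_ea_list {
	uint32_t size;
	struct jfs_ea ea[];
};

#define EA_SIZE(ea)    (sizeof(struct jfs_ea) + (ea)->namelen + 1 + \
                        le16_to_cpu((ea)->valuelen))
#define NEXT_EA(ea)    ((struct jfs_ea *)(((char *)(ea)) + EA_SIZE(ea)))
#define FIRST_EA(el)   ((el)->ea)
#define END_EALIST(el) ((struct jfs_ea *)(((char *)(el)) + le32_to_cpu((el)->size)))

int main(void)
{
	uint32_t buf[16];       /* aligned backing store for the list */
	struct jfs_ea_list *el = (struct jfs_ea_list *)buf;
	struct jfs_ea *ea = FIRST_EA(el);

	ea->flag = 0;
	ea->namelen = 4;
	ea->valuelen = 3;
	memcpy(ea->name, "user\0abc", 8);   /* name, NUL, 3 value bytes */
	el->size = sizeof(struct jfs_ea_list) + EA_SIZE(ea);

	for (ea = FIRST_EA(el); ea < END_EALIST(el); ea = NEXT_EA(ea))
		printf("%.*s (%u value bytes)\n", ea->namelen, ea->name,
		       le16_to_cpu(ea->valuelen));
	return 0;               /* prints: user (3 value bytes) */
}
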
54
55extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t,
56 int);
57extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
58 int);
59extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
60extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
61extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *);
63
64#endif /* H_JFS_XATTR */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
new file mode 100644
index 000000000000..11c58c54b818
--- /dev/null
+++ b/fs/jfs/jfs_xtree.c
@@ -0,0 +1,4485 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18/*
19 * jfs_xtree.c: extent allocation descriptor B+-tree manager
20 */
21
22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include "jfs_incore.h"
25#include "jfs_filsys.h"
26#include "jfs_metapage.h"
27#include "jfs_dmap.h"
28#include "jfs_dinode.h"
29#include "jfs_superblock.h"
30#include "jfs_debug.h"
31
32/*
33 * xtree local flag
34 */
35#define XT_INSERT 0x00000001
36
37/*
38 * xtree key/entry comparison: extent offset
39 *
40 * return:
41 * -1: k < start of extent
42 * 0: start_of_extent <= k <= end_of_extent
43 * 1: k > end_of_extent
44 */
45#define XT_CMP(CMP, K, X, OFFSET64)\
46{\
47 OFFSET64 = offsetXAD(X);\
48 (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\
49 ((K) < OFFSET64) ? -1 : 0;\
50}
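
XT_CMP is the three-way key/extent comparison that drives the binary search further down. The same test written as a plain function, with off/len standing in for offsetXAD/lengthXAD (a readability sketch, not the kernel macro):

#include <stdint.h>
#include <stdio.h>

static int xt_cmp(int64_t k, int64_t off, int32_t len)
{
	if (k < off)
		return -1;      /* k precedes the extent */
	if (k >= off + len)
		return 1;       /* k is past its end */
	return 0;               /* off <= k < off + len: inside */
}

int main(void)
{
	printf("%d %d %d\n",
	       xt_cmp(5, 10, 4),     /* -1 */
	       xt_cmp(12, 10, 4),    /*  0 */
	       xt_cmp(14, 10, 4));   /*  1 */
	return 0;
}
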
51
52/* write a xad entry */
53#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\
54{\
55 (XAD)->flag = (FLAG);\
56 XADoffset((XAD), (OFF));\
57 XADlength((XAD), (LEN));\
58 XADaddress((XAD), (ADDR));\
59}
60
61#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
62
63/* get page buffer for specified block address */
64/* ToDo: Replace this ugly macro with a function */
65#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
66{\
67 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\
68 if (!(RC))\
69 {\
70 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\
71 (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\
72 (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\
73 {\
74 jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\
75 BT_PUTPAGE(MP);\
76 MP = NULL;\
77 RC = -EIO;\
78 }\
79 }\
80}
81
82/* for consistency */
83#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
84
85#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
86 BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot)
87/* xtree entry parameter descriptor */
88struct xtsplit {
89 struct metapage *mp;
90 s16 index;
91 u8 flag;
92 s64 off;
93 s64 addr;
94 int len;
95 struct pxdlist *pxdlist;
96};
97
98
99/*
100 * statistics
101 */
102#ifdef CONFIG_JFS_STATISTICS
103static struct {
104 uint search;
105 uint fastSearch;
106 uint split;
107} xtStat;
108#endif
109
110
111/*
112 * forward references
113 */
114static int xtSearch(struct inode *ip,
115 s64 xoff, int *cmpp, struct btstack * btstack, int flag);
116
117static int xtSplitUp(tid_t tid,
118 struct inode *ip,
119 struct xtsplit * split, struct btstack * btstack);
120
121static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split,
122 struct metapage ** rmpp, s64 * rbnp);
123
124static int xtSplitRoot(tid_t tid, struct inode *ip,
125 struct xtsplit * split, struct metapage ** rmpp);
126
127#ifdef _STILL_TO_PORT
128static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
129 xtpage_t * fp, struct btstack * btstack);
130
131static int xtSearchNode(struct inode *ip,
132 xad_t * xad,
133 int *cmpp, struct btstack * btstack, int flag);
134
135static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp);
136#endif /* _STILL_TO_PORT */
137
138/* External references */
139
140/*
141 * debug control
142 */
143/* #define _JFS_DEBUG_XTREE 1 */
144
145
146/*
147 * xtLookup()
148 *
149 * function: map a single page into a physical extent;
150 */
151int xtLookup(struct inode *ip, s64 lstart,
152 s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check)
153{
154 int rc = 0;
155 struct btstack btstack;
156 int cmp;
157 s64 bn;
158 struct metapage *mp;
159 xtpage_t *p;
160 int index;
161 xad_t *xad;
162 s64 size, xoff, xend;
163 int xlen;
164 s64 xaddr;
165
166 *plen = 0;
167
168 if (!no_check) {
169 /* is lookup offset beyond eof ? */
170 size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
171 JFS_SBI(ip->i_sb)->l2bsize;
172 if (lstart >= size) {
173 jfs_err("xtLookup: lstart (0x%lx) >= size (0x%lx)",
174 (ulong) lstart, (ulong) size);
175 return 0;
176 }
177 }
178
179 /*
180 * search for the xad entry covering the logical extent
181 */
182//search:
183 if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) {
184 jfs_err("xtLookup: xtSearch returned %d", rc);
185 return rc;
186 }
187
188 /*
189 * compute the physical extent covering logical extent
190 *
191 * N.B. search may have failed (e.g., hole in sparse file),
192 * and returned the index of the next entry.
193 */
194 /* retrieve search result */
195 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
196
197 /* is xad found covering start of logical extent ?
198 * lstart is a page start address,
199 * i.e., lstart cannot start in a hole;
200 */
201 if (cmp)
202 goto out;
203
204 /*
205 * lxd covered by xad
206 */
207 xad = &p->xad[index];
208 xoff = offsetXAD(xad);
209 xlen = lengthXAD(xad);
210 xend = xoff + xlen;
211 xaddr = addressXAD(xad);
212
213 /* initialize new pxd */
214 *pflag = xad->flag;
215 *paddr = xaddr + (lstart - xoff);
216 /* a page must be fully covered by an xad */
217 *plen = min(xend - lstart, llen);
218
219 out:
220 XT_PUTPAGE(mp);
221
222 return rc;
223}
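
Once the covering xad is found, the mapping itself is two lines of arithmetic: the physical address is offset by how far lstart sits into the extent, and the returned length is clipped to whichever ends first, the extent or the request. As a standalone sketch (names are hypothetical stand-ins for the xad accessors):

#include <stdint.h>
#include <stdio.h>

/* Map (lstart, llen) through an extent at logical xoff, physical
 * xaddr, length xlen -- the arithmetic from the tail of xtLookup. */
static void map_extent(int64_t lstart, int64_t llen,
                       int64_t xoff, int32_t xlen, int64_t xaddr,
                       int64_t *paddr, int32_t *plen)
{
	int64_t xend = xoff + xlen;

	*paddr = xaddr + (lstart - xoff);
	*plen = (xend - lstart < llen) ? (int32_t)(xend - lstart)
	                               : (int32_t)llen;
}

int main(void)
{
	int64_t paddr;
	int32_t plen;

	/* extent: logical 100..115 -> physical 5000..5015; ask for 104+8 */
	map_extent(104, 8, 100, 16, 5000, &paddr, &plen);
	printf("paddr=%lld plen=%d\n", (long long)paddr, plen);
	/* prints: paddr=5004 plen=8 */
	return 0;
}
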
224
225
226/*
227 * xtLookupList()
228 *
229 * function: map a single logical extent into a list of physical extents;
230 *
231 * parameter:
232 * struct inode *ip,
233 * struct lxdlist *lxdlist, lxd list (in)
234 * struct xadlist *xadlist, xad list (in/out)
235 * int flag)
236 *
237 * coverage of lxd by xad under assumption of
238 * . lxd's are ordered and disjoint.
239 * . xad's are ordered and disjoint.
240 *
241 * return:
242 * 0: success
243 *
244 * note: a page being written (even a single byte) is backed fully,
245 * except the last page which is only backed with blocks
246 * required to cover the last byte;
247 * the extent backing a page is fully contained within an xad;
248 */
249int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
250 struct xadlist * xadlist, int flag)
251{
252 int rc = 0;
253 struct btstack btstack;
254 int cmp;
255 s64 bn;
256 struct metapage *mp;
257 xtpage_t *p;
258 int index;
259 lxd_t *lxd;
260 xad_t *xad, *pxd;
261 s64 size, lstart, lend, xstart, xend, pstart;
262 s64 llen, xlen, plen;
263 s64 xaddr, paddr;
264 int nlxd, npxd, maxnpxd;
265
266 npxd = xadlist->nxad = 0;
267 maxnpxd = xadlist->maxnxad;
268 pxd = xadlist->xad;
269
270 nlxd = lxdlist->nlxd;
271 lxd = lxdlist->lxd;
272
273 lstart = offsetLXD(lxd);
274 llen = lengthLXD(lxd);
275 lend = lstart + llen;
276
277 size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
278 JFS_SBI(ip->i_sb)->l2bsize;
279
280 /*
281 * search for the xad entry covering the logical extent
282 */
283 search:
284 if (lstart >= size)
285 return 0;
286
287 if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0)))
288 return rc;
289
290 /*
291 * compute the physical extent covering logical extent
292 *
293 * N.B. search may have failed (e.g., hole in sparse file),
294 * and returned the index of the next entry.
295 */
296//map:
297 /* retrieve search result */
298 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
299
300 /* is xad on the next sibling page ? */
301 if (index == le16_to_cpu(p->header.nextindex)) {
302 if (p->header.flag & BT_ROOT)
303 goto mapend;
304
305 if ((bn = le64_to_cpu(p->header.next)) == 0)
306 goto mapend;
307
308 XT_PUTPAGE(mp);
309
310 /* get next sibling page */
311 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
312 if (rc)
313 return rc;
314
315 index = XTENTRYSTART;
316 }
317
318 xad = &p->xad[index];
319
320 /*
321 * is lxd covered by xad ?
322 */
323 compare:
324 xstart = offsetXAD(xad);
325 xlen = lengthXAD(xad);
326 xend = xstart + xlen;
327 xaddr = addressXAD(xad);
328
329 compare1:
330 if (xstart < lstart)
331 goto compare2;
332
333 /* (lstart <= xstart) */
334
335 /* lxd is NOT covered by xad */
336 if (lend <= xstart) {
337 /*
338 * get next lxd
339 */
340 if (--nlxd == 0)
341 goto mapend;
342 lxd++;
343
344 lstart = offsetLXD(lxd);
345 llen = lengthLXD(lxd);
346 lend = lstart + llen;
347 if (lstart >= size)
348 goto mapend;
349
350 /* compare with the current xad */
351 goto compare1;
352 }
353 /* lxd is covered by xad */
354 else { /* (xstart < lend) */
355
356 /* initialize new pxd */
357 pstart = xstart;
358 plen = min(lend - xstart, xlen);
359 paddr = xaddr;
360
361 goto cover;
362 }
363
364 /* (xstart < lstart) */
365 compare2:
366 /* lxd is covered by xad */
367 if (lstart < xend) {
368 /* initialize new pxd */
369 pstart = lstart;
370 plen = min(xend - lstart, llen);
371 paddr = xaddr + (lstart - xstart);
372
373 goto cover;
374 }
375 /* lxd is NOT covered by xad */
376 else { /* (xend <= lstart) */
377
378 /*
379 * get next xad
380 *
381 * linear search next xad covering lxd on
382 * the current xad page, and then tree search
383 */
384 if (index == le16_to_cpu(p->header.nextindex) - 1) {
385 if (p->header.flag & BT_ROOT)
386 goto mapend;
387
388 XT_PUTPAGE(mp);
389 goto search;
390 } else {
391 index++;
392 xad++;
393
394 /* compare with new xad */
395 goto compare;
396 }
397 }
398
399 /*
400 * lxd is covered by xad and a new pxd has been initialized
401 * (lstart <= xstart < lend) or (xstart < lstart < xend)
402 */
403 cover:
404 /* finalize pxd corresponding to current xad */
405 XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr);
406
407 if (++npxd >= maxnpxd)
408 goto mapend;
409 pxd++;
410
411 /*
412 * lxd is fully covered by xad
413 */
414 if (lend <= xend) {
415 /*
416 * get next lxd
417 */
418 if (--nlxd == 0)
419 goto mapend;
420 lxd++;
421
422 lstart = offsetLXD(lxd);
423 llen = lengthLXD(lxd);
424 lend = lstart + llen;
425 if (lstart >= size)
426 goto mapend;
427
428 /*
429 * test for old xad covering new lxd
430 * (old xstart < new lstart)
431 */
432 goto compare2;
433 }
434 /*
435 * lxd is partially covered by xad
436 */
437 else { /* (xend < lend) */
438
439 /*
440 * get next xad
441 *
442 * linear search next xad covering lxd on
443 * the current xad page, and then next xad page search
444 */
445 if (index == le16_to_cpu(p->header.nextindex) - 1) {
446 if (p->header.flag & BT_ROOT)
447 goto mapend;
448
449 if ((bn = le64_to_cpu(p->header.next)) == 0)
450 goto mapend;
451
452 XT_PUTPAGE(mp);
453
454 /* get next sibling page */
455 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
456 if (rc)
457 return rc;
458
459 index = XTENTRYSTART;
460 xad = &p->xad[index];
461 } else {
462 index++;
463 xad++;
464 }
465
466 /*
467 * test for new xad covering old lxd
468 * (old lstart < new xstart)
469 */
470 goto compare;
471 }
472
473 mapend:
474 xadlist->nxad = npxd;
475
476//out:
477 XT_PUTPAGE(mp);
478
479 return rc;
480}
481
482
483/*
484 * xtSearch()
485 *
486 * function: search for the xad entry covering specified offset.
487 *
488 * parameters:
489 * ip - file object;
490 * xoff - extent offset;
491 * cmpp - comparison result:
492 * btstack - traverse stack;
493 * flag - search process flag (XT_INSERT);
494 *
495 * returns:
496 * btstack contains (bn, index) of search path traversed to the entry.
497 * *cmpp is set to result of comparison with the entry returned.
498 * the page containing the entry is pinned at exit.
499 */
500static int xtSearch(struct inode *ip, s64 xoff, /* offset of extent */
501 int *cmpp, struct btstack * btstack, int flag)
502{
503 struct jfs_inode_info *jfs_ip = JFS_IP(ip);
504 int rc = 0;
505 int cmp = 1; /* init for empty page */
506 s64 bn; /* block number */
507 struct metapage *mp; /* page buffer */
508 xtpage_t *p; /* page */
509 xad_t *xad;
510 int base, index, lim, btindex;
511 struct btframe *btsp;
512 int nsplit = 0; /* number of pages to split */
513 s64 t64;
514
515 INCREMENT(xtStat.search);
516
517 BT_CLR(btstack);
518
519 btstack->nsplit = 0;
520
521 /*
522 * search down tree from root:
523 *
524 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
525 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
526 *
527 * if entry with search key K is not found
528 * internal page search find the entry with largest key Ki
529 * less than K which point to the child page to search;
530 * leaf page search find the entry with smallest key Kj
531 * greater than K so that the returned index is the position of
532 * the entry to be shifted right for insertion of new entry.
533 * for empty tree, search key is greater than any key of the tree.
534 *
535 * by convention, root bn = 0.
536 */
537 for (bn = 0;;) {
538 /* get/pin the page to search */
539 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
540 if (rc)
541 return rc;
542
543 /* try sequential access heuristics with the previous
544 * access entry in target leaf page:
545 * once search narrowed down into the target leaf,
546 * key must either match an entry in the leaf or
547 * key entry does not exist in the tree;
548 */
549//fastSearch:
550 if ((jfs_ip->btorder & BT_SEQUENTIAL) &&
551 (p->header.flag & BT_LEAF) &&
552 (index = jfs_ip->btindex) <
553 le16_to_cpu(p->header.nextindex)) {
554 xad = &p->xad[index];
555 t64 = offsetXAD(xad);
556 if (xoff < t64 + lengthXAD(xad)) {
557 if (xoff >= t64) {
558 *cmpp = 0;
559 goto out;
560 }
561
562 /* stop sequential access heuristics */
563 goto binarySearch;
564 } else { /* (t64 + lengthXAD(xad)) <= xoff */
565
566 /* try next sequential entry */
567 index++;
568 if (index <
569 le16_to_cpu(p->header.nextindex)) {
570 xad++;
571 t64 = offsetXAD(xad);
572 if (xoff < t64 + lengthXAD(xad)) {
573 if (xoff >= t64) {
574 *cmpp = 0;
575 goto out;
576 }
577
578 /* miss: key falls between
579 * previous and this entry
580 */
581 *cmpp = 1;
582 goto out;
583 }
584
585 /* (xoff >= t64 + lengthXAD(xad));
586 * matching entry may be further out:
587 * stop heuristic search
588 */
589 /* stop sequential access heuristics */
590 goto binarySearch;
591 }
592
593 /* (index == p->header.nextindex);
594 * miss: key entry does not exist in
595 * the target leaf/tree
596 */
597 *cmpp = 1;
598 goto out;
599 }
600
601 /*
602 * if hit, return index of the entry found, and
603 * if miss, where new entry with search key is
604 * to be inserted;
605 */
606 out:
607 /* compute number of pages to split */
608 if (flag & XT_INSERT) {
609 if (p->header.nextindex == /* little-endian */
610 p->header.maxentry)
611 nsplit++;
612 else
613 nsplit = 0;
614 btstack->nsplit = nsplit;
615 }
616
617 /* save search result */
618 btsp = btstack->top;
619 btsp->bn = bn;
620 btsp->index = index;
621 btsp->mp = mp;
622
623 /* update sequential access heuristics */
624 jfs_ip->btindex = index;
625
626 INCREMENT(xtStat.fastSearch);
627 return 0;
628 }
629
630 /* well, ... full search now */
631 binarySearch:
632 lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
633
634 /*
635 * binary search with search key K on the current page
636 */
637 for (base = XTENTRYSTART; lim; lim >>= 1) {
638 index = base + (lim >> 1);
639
640 XT_CMP(cmp, xoff, &p->xad[index], t64);
641 if (cmp == 0) {
642 /*
643 * search hit
644 */
645 /* search hit - leaf page:
646 * return the entry found
647 */
648 if (p->header.flag & BT_LEAF) {
649 *cmpp = cmp;
650
651 /* compute number of pages to split */
652 if (flag & XT_INSERT) {
653 if (p->header.nextindex ==
654 p->header.maxentry)
655 nsplit++;
656 else
657 nsplit = 0;
658 btstack->nsplit = nsplit;
659 }
660
661 /* save search result */
662 btsp = btstack->top;
663 btsp->bn = bn;
664 btsp->index = index;
665 btsp->mp = mp;
666
667 /* init sequential access heuristics */
668 btindex = jfs_ip->btindex;
669 if (index == btindex ||
670 index == btindex + 1)
671 jfs_ip->btorder = BT_SEQUENTIAL;
672 else
673 jfs_ip->btorder = BT_RANDOM;
674 jfs_ip->btindex = index;
675
676 return 0;
677 }
678
679 /* search hit - internal page:
680 * descend/search its child page
681 */
682 goto next;
683 }
684
685 if (cmp > 0) {
686 base = index + 1;
687 --lim;
688 }
689 }
690
691 /*
692 * search miss
693 *
694 * base is the smallest index with key (Kj) greater than
695 * search key (K) and may be zero or maxentry index.
696 */
697 /*
698 * search miss - leaf page:
699 *
700 * return location of entry (base) where new entry with
701 * search key K is to be inserted.
702 */
703 if (p->header.flag & BT_LEAF) {
704 *cmpp = cmp;
705
706 /* compute number of pages to split */
707 if (flag & XT_INSERT) {
708 if (p->header.nextindex ==
709 p->header.maxentry)
710 nsplit++;
711 else
712 nsplit = 0;
713 btstack->nsplit = nsplit;
714 }
715
716 /* save search result */
717 btsp = btstack->top;
718 btsp->bn = bn;
719 btsp->index = base;
720 btsp->mp = mp;
721
722 /* init sequential access heuristics */
723 btindex = jfs_ip->btindex;
724 if (base == btindex || base == btindex + 1)
725 jfs_ip->btorder = BT_SEQUENTIAL;
726 else
727 jfs_ip->btorder = BT_RANDOM;
728 jfs_ip->btindex = base;
729
730 return 0;
731 }
732
733 /*
734 * search miss - non-leaf page:
735 *
736 * if base is non-zero, decrement base by one to get the parent
737 * entry of the child page to search.
738 */
739 index = base ? base - 1 : base;
740
741 /*
742 * go down to child page
743 */
744 next:
745 /* update number of pages to split */
746 if (p->header.nextindex == p->header.maxentry)
747 nsplit++;
748 else
749 nsplit = 0;
750
751 /* push (bn, index) of the parent page/entry */
752 BT_PUSH(btstack, bn, index);
753
754 /* get the child page block number */
755 bn = addressXAD(&p->xad[index]);
756
757 /* unpin the parent page */
758 XT_PUTPAGE(mp);
759 }
760}
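
The fast path above amounts to: remember the last leaf index that matched and probe it (and its successor) before paying for the binary search. A user-space sketch of that caching pattern over an array of disjoint, sorted ranges (all names here are illustrative, not kernel symbols):

#include <stdint.h>
#include <stdio.h>

struct range { int64_t off; int32_t len; };

static int last_hit;    /* analogue of jfs_ip->btindex */

static int lookup(const struct range *r, int n, int64_t k)
{
	/* sequential-access fast path: cached index and its successor */
	for (int i = last_hit; i < n && i <= last_hit + 1; i++)
		if (k >= r[i].off && k < r[i].off + r[i].len)
			return last_hit = i;

	/* full binary search */
	int lo = 0, hi = n - 1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		if (k < r[mid].off)
			hi = mid - 1;
		else if (k >= r[mid].off + r[mid].len)
			lo = mid + 1;
		else
			return last_hit = mid;
	}
	return -1;      /* hole: k not covered by any range */
}

int main(void)
{
	struct range r[] = { {0, 8}, {8, 8}, {32, 16} };

	printf("%d %d %d\n", lookup(r, 3, 4), lookup(r, 3, 9),
	       lookup(r, 3, 40));
	/* prints: 0 1 2 -- the second and third hits come off the
	 * fast path, no binary search needed */
	return 0;
}
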
761
762/*
763 * xtInsert()
764 *
766 * function: insert entry for a new extent into the xtree;
766 *
767 * parameter:
768 * tid - transaction id;
769 * ip - file object;
770 * xflag - extent flag (XAD_NOTRECORDED):
771 * xoff - extent offset;
772 * xlen - extent length;
773 * xaddrp - extent address pointer (in/out):
774 * if (*xaddrp)
775 * caller allocated data extent at *xaddrp;
776 * else
777 * allocate data extent and return its xaddr;
778 * flag -
779 *
780 * return: 0 for success or errno;
781 */
782int xtInsert(tid_t tid, /* transaction id */
783 struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp,
784 int flag)
785{
786 int rc = 0;
787 s64 xaddr, hint;
788 struct metapage *mp; /* meta-page buffer */
789 xtpage_t *p; /* base B+-tree index page */
790 s64 bn;
791 int index, nextindex;
792 struct btstack btstack; /* traverse stack */
793 struct xtsplit split; /* split information */
794 xad_t *xad;
795 int cmp;
796 struct tlock *tlck;
797 struct xtlock *xtlck;
798
799 jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
800
801 /*
802 * search for the entry location at which to insert:
803 *
804 * xtSearch() (trying its fast sequential-access path first) returns
805 * (leaf page pinned, index at which to insert).
806 * n.b. xtSearch() may return index of maxentry of
807 * the full page.
808 */
809 if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
810 return rc;
811
812 /* retrieve search result */
813 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
814
815 /* This test must follow XT_GETSEARCH since mp must be valid if
816 * we branch to out: */
817 if (cmp == 0) {
818 rc = -EEXIST;
819 goto out;
820 }
821
822 /*
823 * allocate data extent requested
824 *
825 * allocation hint: last xad
826 */
827 if ((xaddr = *xaddrp) == 0) {
828 if (index > XTENTRYSTART) {
829 xad = &p->xad[index - 1];
830 hint = addressXAD(xad) + lengthXAD(xad) - 1;
831 } else
832 hint = 0;
833 if ((rc = DQUOT_ALLOC_BLOCK(ip, xlen)))
834 goto out;
835 if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
836 DQUOT_FREE_BLOCK(ip, xlen);
837 goto out;
838 }
839 }
840
841 /*
842 * insert entry for new extent
843 */
844 xflag |= XAD_NEW;
845
846 /*
847 * if the leaf page is full, split the page and
848 * propagate up the router entry for the new page from split
849 *
850 * The xtSplitUp() will insert the entry and unpin the leaf page.
851 */
852 nextindex = le16_to_cpu(p->header.nextindex);
853 if (nextindex == le16_to_cpu(p->header.maxentry)) {
854 split.mp = mp;
855 split.index = index;
856 split.flag = xflag;
857 split.off = xoff;
858 split.len = xlen;
859 split.addr = xaddr;
860 split.pxdlist = NULL;
861 if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
862 /* undo data extent allocation */
863 if (*xaddrp == 0) {
864 dbFree(ip, xaddr, (s64) xlen);
865 DQUOT_FREE_BLOCK(ip, xlen);
866 }
867 return rc;
868 }
869
870 *xaddrp = xaddr;
871 return 0;
872 }
873
874 /*
875 * insert the new entry into the leaf page
876 */
877 /*
878 * acquire a transaction lock on the leaf page;
879 *
880 * action: xad insertion/extension;
881 */
882 BT_MARK_DIRTY(mp, ip);
883
884 /* if insert into middle, shift right remaining entries. */
885 if (index < nextindex)
886 memmove(&p->xad[index + 1], &p->xad[index],
887 (nextindex - index) * sizeof(xad_t));
888
889 /* insert the new entry: mark the entry NEW */
890 xad = &p->xad[index];
891 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
892
893 /* advance next available entry index */
894 p->header.nextindex =
895 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
896
897 /* Don't log it if there are no links to the file */
898 if (!test_cflag(COMMIT_Nolink, ip)) {
899 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
900 xtlck = (struct xtlock *) & tlck->lock;
901 xtlck->lwm.offset =
902 (xtlck->lwm.offset) ? min(index,
903 (int)xtlck->lwm.offset) : index;
904 xtlck->lwm.length =
905 le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
906 }
907
908 *xaddrp = xaddr;
909
910 out:
911 /* unpin the leaf page */
912 XT_PUTPAGE(mp);
913
914 return rc;
915}
916
917
918/*
919 * xtSplitUp()
920 *
921 * function:
922 * split full pages as propagating insertion up the tree
923 *
924 * parameter:
925 * tid - transaction id;
926 * ip - file object;
927 * split - entry parameter descriptor;
928 * btstack - traverse stack from xtSearch()
929 *
930 * return: 0 for success or errno;
931 */
932static int
933xtSplitUp(tid_t tid,
934 struct inode *ip, struct xtsplit * split, struct btstack * btstack)
935{
936 int rc = 0;
937 struct metapage *smp;
938 xtpage_t *sp; /* split page */
939 struct metapage *rmp;
940 s64 rbn; /* new right page block number */
941 struct metapage *rcmp;
942 xtpage_t *rcp; /* right child page */
943 s64 rcbn; /* right child page block number */
944 int skip; /* index of entry of insertion */
945 int nextindex; /* next available entry index of p */
946 struct btframe *parent; /* parent page entry on traverse stack */
947 xad_t *xad;
948 s64 xaddr;
949 int xlen;
950 int nsplit; /* number of pages split */
951 struct pxdlist pxdlist;
952 pxd_t *pxd;
953 struct tlock *tlck;
954 struct xtlock *xtlck;
955
956 smp = split->mp;
957 sp = XT_PAGE(ip, smp);
958
959 /* is inode xtree root extension/inline EA area free ? */
960 if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) &&
961 (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) &&
962 (JFS_IP(ip)->mode2 & INLINEEA)) {
963 sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT);
964 JFS_IP(ip)->mode2 &= ~INLINEEA;
965
966 BT_MARK_DIRTY(smp, ip);
967 /*
968 * acquire a transaction lock on the leaf page;
969 *
970 * action: xad insertion/extension;
971 */
972
973 /* if insert into middle, shift right remaining entries. */
974 skip = split->index;
975 nextindex = le16_to_cpu(sp->header.nextindex);
976 if (skip < nextindex)
977 memmove(&sp->xad[skip + 1], &sp->xad[skip],
978 (nextindex - skip) * sizeof(xad_t));
979
980 /* insert the new entry: mark the entry NEW */
981 xad = &sp->xad[skip];
982 XT_PUTENTRY(xad, split->flag, split->off, split->len,
983 split->addr);
984
985 /* advance next available entry index */
986 sp->header.nextindex =
987 cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1);
988
989 /* Don't log it if there are no links to the file */
990 if (!test_cflag(COMMIT_Nolink, ip)) {
991 tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
992 xtlck = (struct xtlock *) & tlck->lock;
993 xtlck->lwm.offset = (xtlck->lwm.offset) ?
994 min(skip, (int)xtlck->lwm.offset) : skip;
995 xtlck->lwm.length =
996 le16_to_cpu(sp->header.nextindex) -
997 xtlck->lwm.offset;
998 }
999
1000 return 0;
1001 }
1002
1003 /*
1004 * allocate new index blocks to cover index page split(s)
1005 *
1006 * allocation hint: ?
1007 */
1008 if (split->pxdlist == NULL) {
1009 nsplit = btstack->nsplit;
1010 split->pxdlist = &pxdlist;
1011 pxdlist.maxnpxd = pxdlist.npxd = 0;
1012 pxd = &pxdlist.pxd[0];
1013 xlen = JFS_SBI(ip->i_sb)->nbperpage;
1014 for (; nsplit > 0; nsplit--, pxd++) {
1015 if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr))
1016 == 0) {
1017 PXDaddress(pxd, xaddr);
1018 PXDlength(pxd, xlen);
1019
1020 pxdlist.maxnpxd++;
1021
1022 continue;
1023 }
1024
1025 /* undo allocation */
1026
1027 XT_PUTPAGE(smp);
1028 return rc;
1029 }
1030 }
1031
1032 /*
1033 * Split leaf page <sp> into <sp> and a new right page <rp>.
1034 *
1035 * The split routines insert the new entry into the leaf page,
1036 * and acquire txLock as appropriate.
1037 * return <rp> pinned and its block number <rpbn>.
1038 */
1039 rc = (sp->header.flag & BT_ROOT) ?
1040 xtSplitRoot(tid, ip, split, &rmp) :
1041 xtSplitPage(tid, ip, split, &rmp, &rbn);
1042
1043 XT_PUTPAGE(smp);
1044
1045 if (rc)
1046 return -EIO;
1047 /*
1048 * propagate up the router entry for the leaf page just split
1049 *
1050 * insert a router entry for the new page into the parent page,
1051 * propagate the insert/split up the tree by walking back the stack
1052 * of (bn of parent page, index of child page entry in parent page)
1053 * that were traversed during the search for the page that split.
1054 *
1055 * the propagation of insert/split up the tree stops if the root
1056 * splits or the page inserted into doesn't have to split to hold
1057 * the new entry.
1058 *
1059 * the parent entry for the split page remains the same, and
1060 * a new entry is inserted at its right with the first key and
1061 * block number of the new right page.
1062 *
1063 * There are a maximum of 3 pages pinned at any time:
1064 * right child, left parent and right parent (when the parent splits)
1065 * to keep the child page pinned while working on the parent.
1066 * make sure that all pins are released at exit.
1067 */
1068 while ((parent = BT_POP(btstack)) != NULL) {
1069 /* parent page specified by stack frame <parent> */
1070
1071 /* keep current child pages <rcp> pinned */
1072 rcmp = rmp;
1073 rcbn = rbn;
1074 rcp = XT_PAGE(ip, rcmp);
1075
1076 /*
1077 * insert router entry in parent for new right child page <rp>
1078 */
1079 /* get/pin the parent page <sp> */
1080 XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
1081 if (rc) {
1082 XT_PUTPAGE(rcmp);
1083 return rc;
1084 }
1085
1086 /*
1087 * The new key entry goes ONE AFTER the index of parent entry,
1088 * because the split was to the right.
1089 */
1090 skip = parent->index + 1;
1091
1092 /*
1093 * split or shift right remaining entries of the parent page
1094 */
1095 nextindex = le16_to_cpu(sp->header.nextindex);
1096 /*
1097 * parent page is full - split the parent page
1098 */
1099 if (nextindex == le16_to_cpu(sp->header.maxentry)) {
1100 /* init for parent page split */
1101 split->mp = smp;
1102 split->index = skip; /* index at insert */
1103 split->flag = XAD_NEW;
1104 split->off = offsetXAD(&rcp->xad[XTENTRYSTART]);
1105 split->len = JFS_SBI(ip->i_sb)->nbperpage;
1106 split->addr = rcbn;
1107
1108 /* unpin previous right child page */
1109 XT_PUTPAGE(rcmp);
1110
1111 /* The split routines insert the new entry,
1112 * and acquire txLock as appropriate.
1113 * return <rp> pinned and its block number <rpbn>.
1114 */
1115 rc = (sp->header.flag & BT_ROOT) ?
1116 xtSplitRoot(tid, ip, split, &rmp) :
1117 xtSplitPage(tid, ip, split, &rmp, &rbn);
1118 if (rc) {
1119 XT_PUTPAGE(smp);
1120 return rc;
1121 }
1122
1123 XT_PUTPAGE(smp);
1124 /* keep new child page <rp> pinned */
1125 }
1126 /*
1127 * parent page is not full - insert in parent page
1128 */
1129 else {
1130 /*
1131 * insert router entry in parent for the right child
1132 * page from the first entry of the right child page:
1133 */
1134 /*
1135 * acquire a transaction lock on the parent page;
1136 *
1137 * action: router xad insertion;
1138 */
1139 BT_MARK_DIRTY(smp, ip);
1140
1141 /*
1142 * if insert into middle, shift right remaining entries
1143 */
1144 if (skip < nextindex)
1145 memmove(&sp->xad[skip + 1], &sp->xad[skip],
1146 (nextindex -
1147 skip) << L2XTSLOTSIZE);
1148
1149 /* insert the router entry */
1150 xad = &sp->xad[skip];
1151 XT_PUTENTRY(xad, XAD_NEW,
1152 offsetXAD(&rcp->xad[XTENTRYSTART]),
1153 JFS_SBI(ip->i_sb)->nbperpage, rcbn);
1154
1155 /* advance next available entry index. */
1156 sp->header.nextindex =
1157 cpu_to_le16(le16_to_cpu(sp->header.nextindex) +
1158 1);
1159
1160 /* Don't log it if there are no links to the file */
1161 if (!test_cflag(COMMIT_Nolink, ip)) {
1162 tlck = txLock(tid, ip, smp,
1163 tlckXTREE | tlckGROW);
1164 xtlck = (struct xtlock *) & tlck->lock;
1165 xtlck->lwm.offset = (xtlck->lwm.offset) ?
1166 min(skip, (int)xtlck->lwm.offset) : skip;
1167 xtlck->lwm.length =
1168 le16_to_cpu(sp->header.nextindex) -
1169 xtlck->lwm.offset;
1170 }
1171
1172 /* unpin parent page */
1173 XT_PUTPAGE(smp);
1174
1175 /* exit propagate up */
1176 break;
1177 }
1178 }
1179
1180 /* unpin current right page */
1181 XT_PUTPAGE(rmp);
1182
1183 return 0;
1184}
1185
1186
1187/*
1188 * xtSplitPage()
1189 *
1190 * function:
1191 * split a full non-root page into
1192 * original/split/left page and new right page
1193 * i.e., the original/split page remains as left page.
1194 *
1195 * parameter:
1196 * int tid,
1197 * struct inode *ip,
1198 * struct xtsplit *split,
1199 * struct metapage **rmpp,
1200 * u64 *rbnp,
1201 *
1202 * return:
1203 * 0 for success or errno; *rmpp returns the new right page pinned and *rbnp its block number.
1204 */
1205static int
1206xtSplitPage(tid_t tid, struct inode *ip,
1207 struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp)
1208{
1209 int rc = 0;
1210 struct metapage *smp;
1211 xtpage_t *sp;
1212 struct metapage *rmp;
1213 xtpage_t *rp; /* new right page allocated */
1214 s64 rbn; /* new right page block number */
1215 struct metapage *mp;
1216 xtpage_t *p;
1217 s64 nextbn;
1218 int skip, maxentry, middle, righthalf, n;
1219 xad_t *xad;
1220 struct pxdlist *pxdlist;
1221 pxd_t *pxd;
1222 struct tlock *tlck;
1223 struct xtlock *sxtlck = NULL, *rxtlck = NULL;
1224 int quota_allocation = 0;
1225
1226 smp = split->mp;
1227 sp = XT_PAGE(ip, smp);
1228
1229 INCREMENT(xtStat.split);
1230
1231 pxdlist = split->pxdlist;
1232 pxd = &pxdlist->pxd[pxdlist->npxd];
1233 pxdlist->npxd++;
1234 rbn = addressPXD(pxd);
1235
1236 /* Allocate blocks to quota. */
1237 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
1238 rc = -EDQUOT;
1239 goto clean_up;
1240 }
1241
1242 quota_allocation += lengthPXD(pxd);
1243
1244 /*
1245 * allocate the new right page for the split
1246 */
1247 rmp = get_metapage(ip, rbn, PSIZE, 1);
1248 if (rmp == NULL) {
1249 rc = -EIO;
1250 goto clean_up;
1251 }
1252
1253 jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp);
1254
1255 BT_MARK_DIRTY(rmp, ip);
1256 /*
1257 * action: new page;
1258 */
1259
1260 rp = (xtpage_t *) rmp->data;
1261 rp->header.self = *pxd;
1262 rp->header.flag = sp->header.flag & BT_TYPE;
1263 rp->header.maxentry = sp->header.maxentry; /* little-endian */
1264 rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
1265
1266 BT_MARK_DIRTY(smp, ip);
1267 /* Don't log it if there are no links to the file */
1268 if (!test_cflag(COMMIT_Nolink, ip)) {
1269 /*
1270 * acquire a transaction lock on the new right page;
1271 */
1272 tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
1273 rxtlck = (struct xtlock *) & tlck->lock;
1274 rxtlck->lwm.offset = XTENTRYSTART;
1275 /*
1276 * acquire a transaction lock on the split page
1277 */
1278 tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW);
1279 sxtlck = (struct xtlock *) & tlck->lock;
1280 }
1281
1282 /*
1283 * initialize/update sibling pointers of <sp> and <rp>
1284 */
1285 nextbn = le64_to_cpu(sp->header.next);
1286 rp->header.next = cpu_to_le64(nextbn);
1287 rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self));
1288 sp->header.next = cpu_to_le64(rbn);
1289
1290 skip = split->index;
1291
1292 /*
1293 * sequential append at tail (after last entry of last page)
1294 *
1295 * if splitting the last page on a level because of appending
1296 * an entry to it (skip is maxentry), it's likely that the access is
1297 * sequential. adding an empty page on the side of the level is less
1298 * work and can push the fill factor much higher than normal.
1299 * if we're wrong it's no big deal - we will do the split the right
1300 * way next time.
1301 * (it may look like it's equally easy to do a similar hack for
1302 * reverse sorted data, that is, split the tree left, but it's not.
1303 * Be my guest.)
1304 */
1305 if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) {
1306 /*
1307 * acquire a transaction lock on the new/right page;
1308 *
1309 * action: xad insertion;
1310 */
1311 /* insert entry at the first entry of the new right page */
1312 xad = &rp->xad[XTENTRYSTART];
1313 XT_PUTENTRY(xad, split->flag, split->off, split->len,
1314 split->addr);
1315
1316 rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
1317
1318 if (!test_cflag(COMMIT_Nolink, ip)) {
1319 /* rxtlck->lwm.offset = XTENTRYSTART; */
1320 rxtlck->lwm.length = 1;
1321 }
1322
1323 *rmpp = rmp;
1324 *rbnp = rbn;
1325
1326 jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
1327 return 0;
1328 }
1329
1330 /*
1331 * non-sequential insert (at possibly middle page)
1332 */
1333
1334 /*
1335 * update previous pointer of old next/right page of <sp>
1336 */
1337 if (nextbn != 0) {
1338 XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
1339 if (rc) {
1340 XT_PUTPAGE(rmp);
1341 goto clean_up;
1342 }
1343
1344 BT_MARK_DIRTY(mp, ip);
1345 /*
1346 * acquire a transaction lock on the next page;
1347 *
1348 * action:sibling pointer update;
1349 */
1350 if (!test_cflag(COMMIT_Nolink, ip))
1351 tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
1352
1353 p->header.prev = cpu_to_le64(rbn);
1354
1355 /* sibling page may have been updated previously, or
1356 * it may be updated later;
1357 */
1358
1359 XT_PUTPAGE(mp);
1360 }
1361
1362 /*
1363 * split the data between the split and new/right pages
1364 */
1365 maxentry = le16_to_cpu(sp->header.maxentry);
1366 middle = maxentry >> 1;
1367 righthalf = maxentry - middle;
1368
1369 /*
1370 * skip index in old split/left page - insert into left page:
1371 */
1372 if (skip <= middle) {
1373 /* move right half of split page to the new right page */
1374 memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
1375 righthalf << L2XTSLOTSIZE);
1376
1377 /* shift right tail of left half to make room for new entry */
1378 if (skip < middle)
1379 memmove(&sp->xad[skip + 1], &sp->xad[skip],
1380 (middle - skip) << L2XTSLOTSIZE);
1381
1382 /* insert new entry */
1383 xad = &sp->xad[skip];
1384 XT_PUTENTRY(xad, split->flag, split->off, split->len,
1385 split->addr);
1386
1387 /* update page header */
1388 sp->header.nextindex = cpu_to_le16(middle + 1);
1389 if (!test_cflag(COMMIT_Nolink, ip)) {
1390 sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
1391 min(skip, (int)sxtlck->lwm.offset) : skip;
1392 }
1393
1394 rp->header.nextindex =
1395 cpu_to_le16(XTENTRYSTART + righthalf);
1396 }
1397 /*
1398 * skip index in new right page - insert into right page:
1399 */
1400 else {
1401 /* move left head of right half to right page */
1402 n = skip - middle;
1403 memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle],
1404 n << L2XTSLOTSIZE);
1405
1406 /* insert new entry */
1407 n += XTENTRYSTART;
1408 xad = &rp->xad[n];
1409 XT_PUTENTRY(xad, split->flag, split->off, split->len,
1410 split->addr);
1411
1412 /* move right tail of right half to right page */
1413 if (skip < maxentry)
1414 memmove(&rp->xad[n + 1], &sp->xad[skip],
1415 (maxentry - skip) << L2XTSLOTSIZE);
1416
1417 /* update page header */
1418 sp->header.nextindex = cpu_to_le16(middle);
1419 if (!test_cflag(COMMIT_Nolink, ip)) {
1420 sxtlck->lwm.offset = (sxtlck->lwm.offset) ?
1421 min(middle, (int)sxtlck->lwm.offset) : middle;
1422 }
1423
1424 rp->header.nextindex = cpu_to_le16(XTENTRYSTART +
1425 righthalf + 1);
1426 }
1427
1428 if (!test_cflag(COMMIT_Nolink, ip)) {
1429 sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) -
1430 sxtlck->lwm.offset;
1431
1432 /* rxtlck->lwm.offset = XTENTRYSTART; */
1433 rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
1434 XTENTRYSTART;
1435 }
1436
1437 *rmpp = rmp;
1438 *rbnp = rbn;
1439
1440 jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp);
1441 return rc;
1442
1443 clean_up:
1444
1445 /* Rollback quota allocation. */
1446 if (quota_allocation)
1447 DQUOT_FREE_BLOCK(ip, quota_allocation);
1448
1449 return (rc);
1450}
1451
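/*
 * Illustrative sketch (not part of the original source): how
 * xtSplitPage() above picks the half that receives the new entry.
 * The full page is cut at middle = maxentry >> 1; slots [middle,
 * maxentry) move to the new right page, and the new entry lands in
 * whichever half contains the insertion index <skip>.
 */
static inline int xtSplitGoesRight(int skip, int maxentry)
{
	/* mirrors the "if (skip <= middle)" test in xtSplitPage() */
	return skip > (maxentry >> 1);
}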
1452
1453/*
1454 * xtSplitRoot()
1455 *
1456 * function:
1457 * split the full root page into
1458 * original/root/split page and new right page
1459 * i.e., root remains fixed in tree anchor (inode) and
1460 * the root is copied to a single new right child page
1461 * since root page << non-root page, and
1462 * the split root page contains a single entry for the
1463 * new right child page.
1464 *
1465 * parameter:
1466 * int tid,
1467 * struct inode *ip,
1468 * struct xtsplit *split,
1469 * struct metapage **rmpp)
1470 *
1471 * return:
1472	 *	0 - success; -errno - failure (*rmpp is set to the new right page on success)
1473 */
1474static int
1475xtSplitRoot(tid_t tid,
1476 struct inode *ip, struct xtsplit * split, struct metapage ** rmpp)
1477{
1478 xtpage_t *sp;
1479 struct metapage *rmp;
1480 xtpage_t *rp;
1481 s64 rbn;
1482 int skip, nextindex;
1483 xad_t *xad;
1484 pxd_t *pxd;
1485 struct pxdlist *pxdlist;
1486 struct tlock *tlck;
1487 struct xtlock *xtlck;
1488
1489 sp = &JFS_IP(ip)->i_xtroot;
1490
1491 INCREMENT(xtStat.split);
1492
1493 /*
1494 * allocate a single (right) child page
1495 */
1496 pxdlist = split->pxdlist;
1497 pxd = &pxdlist->pxd[pxdlist->npxd];
1498 pxdlist->npxd++;
1499 rbn = addressPXD(pxd);
1500 rmp = get_metapage(ip, rbn, PSIZE, 1);
1501 if (rmp == NULL)
1502 return -EIO;
1503
1504 /* Allocate blocks to quota. */
1505 if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
1506 release_metapage(rmp);
1507 return -EDQUOT;
1508 }
1509
1510 jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp);
1511
1512 /*
1513 * acquire a transaction lock on the new right page;
1514 *
1515 * action: new page;
1516 */
1517 BT_MARK_DIRTY(rmp, ip);
1518
1519 rp = (xtpage_t *) rmp->data;
1520 rp->header.flag =
1521 (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL;
1522 rp->header.self = *pxd;
1523 rp->header.nextindex = cpu_to_le16(XTENTRYSTART);
1524 rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE);
1525
1526 /* initialize sibling pointers */
1527 rp->header.next = 0;
1528 rp->header.prev = 0;
1529
1530 /*
1531 * copy the in-line root page into new right page extent
1532 */
1533 nextindex = le16_to_cpu(sp->header.maxentry);
1534 memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART],
1535 (nextindex - XTENTRYSTART) << L2XTSLOTSIZE);
1536
1537 /*
1538 * insert the new entry into the new right/child page
1539 * (skip index in the new right page will not change)
1540 */
1541 skip = split->index;
1542 /* if insert into middle, shift right remaining entries */
1543 if (skip != nextindex)
1544 memmove(&rp->xad[skip + 1], &rp->xad[skip],
1545 (nextindex - skip) * sizeof(xad_t));
1546
1547 xad = &rp->xad[skip];
1548 XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr);
1549
1550 /* update page header */
1551 rp->header.nextindex = cpu_to_le16(nextindex + 1);
1552
1553 if (!test_cflag(COMMIT_Nolink, ip)) {
1554 tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW);
1555 xtlck = (struct xtlock *) & tlck->lock;
1556 xtlck->lwm.offset = XTENTRYSTART;
1557 xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) -
1558 XTENTRYSTART;
1559 }
1560
1561 /*
1562 * reset the root
1563 *
1564 * init root with the single entry for the new right page
1565	 * set the 1st entry offset to 0, which forces the left-most key
1566 * at any level of the tree to be less than any search key.
1567 */
1568 /*
1569 * acquire a transaction lock on the root page (in-memory inode);
1570 *
1571 * action: root split;
1572 */
1573 BT_MARK_DIRTY(split->mp, ip);
1574
1575 xad = &sp->xad[XTENTRYSTART];
1576 XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn);
1577
1578 /* update page header of root */
1579 sp->header.flag &= ~BT_LEAF;
1580 sp->header.flag |= BT_INTERNAL;
1581
1582 sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1);
1583
1584 if (!test_cflag(COMMIT_Nolink, ip)) {
1585 tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW);
1586 xtlck = (struct xtlock *) & tlck->lock;
1587 xtlck->lwm.offset = XTENTRYSTART;
1588 xtlck->lwm.length = 1;
1589 }
1590
1591 *rmpp = rmp;
1592
1593 jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp);
1594 return 0;
1595}
1596
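/*
 * Sketch (illustrative helper, not in the original source): right
 * after xtSplitRoot() the in-inode root is an internal page holding
 * exactly one router entry whose key is 0, so any search key
 * compares greater-or-equal and descends into the single new child.
 */
static inline int xtRootJustSplit(xtpage_t * sp)
{
	return (sp->header.flag & BT_INTERNAL) &&
	    (sp->header.nextindex == cpu_to_le16(XTENTRYSTART + 1));
}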
1597
1598/*
1599 * xtExtend()
1600 *
1601 * function: extend in-place;
1602 *
1603 * note: existing extent may or may not have been committed.
1604 * caller is responsible for pager buffer cache update, and
1605 * working block allocation map update;
1606 * update pmap: alloc whole extended extent;
1607 */
1608int xtExtend(tid_t tid, /* transaction id */
1609 struct inode *ip, s64 xoff, /* delta extent offset */
1610 s32 xlen, /* delta extent length */
1611 int flag)
1612{
1613 int rc = 0;
1614 int cmp;
1615 struct metapage *mp; /* meta-page buffer */
1616 xtpage_t *p; /* base B+-tree index page */
1617 s64 bn;
1618 int index, nextindex, len;
1619 struct btstack btstack; /* traverse stack */
1620 struct xtsplit split; /* split information */
1621 xad_t *xad;
1622 s64 xaddr;
1623 struct tlock *tlck;
1624 struct xtlock *xtlck = NULL;
1625
1626 jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen);
1627
1628	/* there must exist an extent to be extended */
1629 if ((rc = xtSearch(ip, xoff - 1, &cmp, &btstack, XT_INSERT)))
1630 return rc;
1631
1632 /* retrieve search result */
1633 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
1634
1635 if (cmp != 0) {
1636 XT_PUTPAGE(mp);
1637 jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
1638 return -EIO;
1639 }
1640
1641 /* extension must be contiguous */
1642 xad = &p->xad[index];
1643 if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
1644 XT_PUTPAGE(mp);
1645 jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
1646 return -EIO;
1647 }
1648
1649 /*
1650 * acquire a transaction lock on the leaf page;
1651 *
1652 * action: xad insertion/extension;
1653 */
1654 BT_MARK_DIRTY(mp, ip);
1655 if (!test_cflag(COMMIT_Nolink, ip)) {
1656 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1657 xtlck = (struct xtlock *) & tlck->lock;
1658 }
1659
1660 /* extend will overflow extent ? */
1661 xlen = lengthXAD(xad) + xlen;
1662 if ((len = xlen - MAXXLEN) <= 0)
1663 goto extendOld;
1664
1665 /*
1666 * extent overflow: insert entry for new extent
1667 */
1668//insertNew:
1669 xoff = offsetXAD(xad) + MAXXLEN;
1670 xaddr = addressXAD(xad) + MAXXLEN;
1671 nextindex = le16_to_cpu(p->header.nextindex);
1672
1673 /*
1674 * if the leaf page is full, insert the new entry and
1675 * propagate up the router entry for the new page from split
1676 *
1677 * The xtSplitUp() will insert the entry and unpin the leaf page.
1678 */
1679 if (nextindex == le16_to_cpu(p->header.maxentry)) {
1680		/* xtSplitUp() unpins leaf pages */
1681 split.mp = mp;
1682 split.index = index + 1;
1683 split.flag = XAD_NEW;
1684 split.off = xoff; /* split offset */
1685 split.len = len;
1686 split.addr = xaddr;
1687 split.pxdlist = NULL;
1688 if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1689 return rc;
1690
1691 /* get back old page */
1692 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1693 if (rc)
1694 return rc;
1695 /*
1696 * if leaf root has been split, original root has been
1697 * copied to new child page, i.e., original entry now
1698 * resides on the new child page;
1699 */
1700 if (p->header.flag & BT_INTERNAL) {
1701 ASSERT(p->header.nextindex ==
1702 cpu_to_le16(XTENTRYSTART + 1));
1703 xad = &p->xad[XTENTRYSTART];
1704 bn = addressXAD(xad);
1705 XT_PUTPAGE(mp);
1706
1707 /* get new child page */
1708 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1709 if (rc)
1710 return rc;
1711
1712 BT_MARK_DIRTY(mp, ip);
1713 if (!test_cflag(COMMIT_Nolink, ip)) {
1714 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
1715 xtlck = (struct xtlock *) & tlck->lock;
1716 }
1717 }
1718 }
1719 /*
1720 * insert the new entry into the leaf page
1721 */
1722 else {
1723 /* insert the new entry: mark the entry NEW */
1724 xad = &p->xad[index + 1];
1725 XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
1726
1727 /* advance next available entry index */
1728 p->header.nextindex =
1729 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1730 }
1731
1732 /* get back old entry */
1733 xad = &p->xad[index];
1734 xlen = MAXXLEN;
1735
1736 /*
1737 * extend old extent
1738 */
1739 extendOld:
1740 XADlength(xad, xlen);
1741 if (!(xad->flag & XAD_NEW))
1742 xad->flag |= XAD_EXTENDED;
1743
1744 if (!test_cflag(COMMIT_Nolink, ip)) {
1745 xtlck->lwm.offset =
1746 (xtlck->lwm.offset) ? min(index,
1747 (int)xtlck->lwm.offset) : index;
1748 xtlck->lwm.length =
1749 le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
1750 }
1751
1752 /* unpin the leaf page */
1753 XT_PUTPAGE(mp);
1754
1755 return rc;
1756}
1757
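/*
 * Usage sketch (hypothetical caller, not from the original source):
 * grow the extent that currently ends at block <xoff> by <nblocks>
 * contiguous blocks.  The caller owns the transaction and remains
 * responsible for the buffer cache and working-map updates noted
 * above.
 */
static int example_xtExtend(tid_t tid, struct inode *ip, s64 xoff,
			    s32 nblocks)
{
	/* xoff names the first new block: old offset + old length */
	return xtExtend(tid, ip, xoff, nblocks, 0);
}
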
1758#ifdef _NOTYET
1759/*
1760 * xtTailgate()
1761 *
1762 * function: split existing 'tail' extent
1763 * (split offset >= start offset of tail extent), and
1764 * relocate and extend the split tail half;
1765 *
1766 * note: existing extent may or may not have been committed.
1767 * caller is responsible for pager buffer cache update, and
1768 * working block allocation map update;
1769 * update pmap: free old split tail extent, alloc new extent;
1770 */
1771int xtTailgate(tid_t tid, /* transaction id */
1772 struct inode *ip, s64 xoff, /* split/new extent offset */
1773 s32 xlen, /* new extent length */
1774 s64 xaddr, /* new extent address */
1775 int flag)
1776{
1777 int rc = 0;
1778 int cmp;
1779 struct metapage *mp; /* meta-page buffer */
1780 xtpage_t *p; /* base B+-tree index page */
1781 s64 bn;
1782 int index, nextindex, llen, rlen;
1783 struct btstack btstack; /* traverse stack */
1784 struct xtsplit split; /* split information */
1785 xad_t *xad;
1786 struct tlock *tlck;
1787	struct xtlock *xtlck = NULL;
1788 struct tlock *mtlck;
1789 struct maplock *pxdlock;
1790
1791/*
1792printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
1793 (ulong)xoff, xlen, (ulong)xaddr);
1794*/
1795
1796	/* there must exist an extent to be tailgated */
1797 if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
1798 return rc;
1799
1800 /* retrieve search result */
1801 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
1802
1803 if (cmp != 0) {
1804 XT_PUTPAGE(mp);
1805 jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
1806 return -EIO;
1807 }
1808
1809 /* entry found must be last entry */
1810 nextindex = le16_to_cpu(p->header.nextindex);
1811 if (index != nextindex - 1) {
1812 XT_PUTPAGE(mp);
1813 jfs_error(ip->i_sb,
1814 "xtTailgate: the entry found is not the last entry");
1815 return -EIO;
1816 }
1817
1818 BT_MARK_DIRTY(mp, ip);
1819 /*
1820 * acquire tlock of the leaf page containing original entry
1821 */
1822 if (!test_cflag(COMMIT_Nolink, ip)) {
1823 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1824 xtlck = (struct xtlock *) & tlck->lock;
1825 }
1826
1827 /* completely replace extent ? */
1828 xad = &p->xad[index];
1829/*
1830printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1831 (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad));
1832*/
1833 if ((llen = xoff - offsetXAD(xad)) == 0)
1834 goto updateOld;
1835
1836 /*
1837 * partially replace extent: insert entry for new extent
1838 */
1839//insertNew:
1840 /*
1841 * if the leaf page is full, insert the new entry and
1842 * propagate up the router entry for the new page from split
1843 *
1844 * The xtSplitUp() will insert the entry and unpin the leaf page.
1845 */
1846 if (nextindex == le16_to_cpu(p->header.maxentry)) {
1847		/* xtSplitUp() unpins leaf pages */
1848 split.mp = mp;
1849 split.index = index + 1;
1850 split.flag = XAD_NEW;
1851 split.off = xoff; /* split offset */
1852 split.len = xlen;
1853 split.addr = xaddr;
1854 split.pxdlist = NULL;
1855 if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
1856 return rc;
1857
1858 /* get back old page */
1859 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1860 if (rc)
1861 return rc;
1862 /*
1863 * if leaf root has been split, original root has been
1864 * copied to new child page, i.e., original entry now
1865 * resides on the new child page;
1866 */
1867 if (p->header.flag & BT_INTERNAL) {
1868 ASSERT(p->header.nextindex ==
1869 cpu_to_le16(XTENTRYSTART + 1));
1870 xad = &p->xad[XTENTRYSTART];
1871 bn = addressXAD(xad);
1872 XT_PUTPAGE(mp);
1873
1874 /* get new child page */
1875 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
1876 if (rc)
1877 return rc;
1878
1879 BT_MARK_DIRTY(mp, ip);
1880 if (!test_cflag(COMMIT_Nolink, ip)) {
1881 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
1882 xtlck = (struct xtlock *) & tlck->lock;
1883 }
1884 }
1885 }
1886 /*
1887 * insert the new entry into the leaf page
1888 */
1889 else {
1890 /* insert the new entry: mark the entry NEW */
1891 xad = &p->xad[index + 1];
1892 XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
1893
1894 /* advance next available entry index */
1895 p->header.nextindex =
1896 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1897 }
1898
1899 /* get back old XAD */
1900 xad = &p->xad[index];
1901
1902 /*
1903 * truncate/relocate old extent at split offset
1904 */
1905 updateOld:
1906 /* update dmap for old/committed/truncated extent */
1907 rlen = lengthXAD(xad) - llen;
1908 if (!(xad->flag & XAD_NEW)) {
1909 /* free from PWMAP at commit */
1910 if (!test_cflag(COMMIT_Nolink, ip)) {
1911 mtlck = txMaplock(tid, ip, tlckMAP);
1912 pxdlock = (struct maplock *) & mtlck->lock;
1913 pxdlock->flag = mlckFREEPXD;
1914 PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen);
1915 PXDlength(&pxdlock->pxd, rlen);
1916 pxdlock->index = 1;
1917 }
1918 } else
1919 /* free from WMAP */
1920 dbFree(ip, addressXAD(xad) + llen, (s64) rlen);
1921
1922 if (llen)
1923 /* truncate */
1924 XADlength(xad, llen);
1925 else
1926 /* replace */
1927 XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
1928
1929 if (!test_cflag(COMMIT_Nolink, ip)) {
1930 xtlck->lwm.offset = (xtlck->lwm.offset) ?
1931 min(index, (int)xtlck->lwm.offset) : index;
1932 xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
1933 xtlck->lwm.offset;
1934 }
1935
1936 /* unpin the leaf page */
1937 XT_PUTPAGE(mp);
1938
1939 return rc;
1940}
1941#endif /* _NOTYET */
1942
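/*
 * Sketch (illustrative, mirrors the _NOTYET code above): xtTailgate()
 * either truncates the old tail extent at the split offset or, when
 * the split offset equals the extent start, replaces the whole
 * extent ("llen == 0" means complete replacement).
 */
static inline int xtTailgateReplacesWhole(xad_t * xad, s64 xoff)
{
	return (xoff - offsetXAD(xad)) == 0;
}
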
1943/*
1944 * xtUpdate()
1945 *
1946 * function: update XAD;
1947 *
1948 * update extent for allocated_but_not_recorded or
1949 * compressed extent;
1950 *
1951 * parameter:
1952 * nxad - new XAD;
1953 * logical extent of the specified XAD must be completely
1954 * contained by an existing XAD;
1955 */
1956int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
1957{ /* new XAD */
1958 int rc = 0;
1959 int cmp;
1960 struct metapage *mp; /* meta-page buffer */
1961 xtpage_t *p; /* base B+-tree index page */
1962 s64 bn;
1963 int index0, index, newindex, nextindex;
1964 struct btstack btstack; /* traverse stack */
1965 struct xtsplit split; /* split information */
1966 xad_t *xad, *lxad, *rxad;
1967 int xflag;
1968 s64 nxoff, xoff;
1969 int nxlen, xlen, lxlen, rxlen;
1970 s64 nxaddr, xaddr;
1971 struct tlock *tlck;
1972 struct xtlock *xtlck = NULL;
1973 int newpage = 0;
1974
1975	/* there must exist an extent to be updated */
1976 nxoff = offsetXAD(nxad);
1977 nxlen = lengthXAD(nxad);
1978 nxaddr = addressXAD(nxad);
1979
1980 if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT)))
1981 return rc;
1982
1983 /* retrieve search result */
1984 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
1985
1986 if (cmp != 0) {
1987 XT_PUTPAGE(mp);
1988 jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
1989 return -EIO;
1990 }
1991
1992 BT_MARK_DIRTY(mp, ip);
1993 /*
1994 * acquire tlock of the leaf page containing original entry
1995 */
1996 if (!test_cflag(COMMIT_Nolink, ip)) {
1997 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
1998 xtlck = (struct xtlock *) & tlck->lock;
1999 }
2000
2001 xad = &p->xad[index0];
2002 xflag = xad->flag;
2003 xoff = offsetXAD(xad);
2004 xlen = lengthXAD(xad);
2005 xaddr = addressXAD(xad);
2006
2007 /* nXAD must be completely contained within XAD */
2008 if ((xoff > nxoff) ||
2009 (nxoff + nxlen > xoff + xlen)) {
2010 XT_PUTPAGE(mp);
2011 jfs_error(ip->i_sb,
2012			  "xtUpdate: nXAD is not completely contained within XAD");
2013 return -EIO;
2014 }
2015
2016 index = index0;
2017 newindex = index + 1;
2018 nextindex = le16_to_cpu(p->header.nextindex);
2019
2020#ifdef _JFS_WIP_NOCOALESCE
2021 if (xoff < nxoff)
2022 goto updateRight;
2023
2024 /*
2025 * replace XAD with nXAD
2026 */
2027 replace: /* (nxoff == xoff) */
2028 if (nxlen == xlen) {
2029 /* replace XAD with nXAD:recorded */
2030 *xad = *nxad;
2031 xad->flag = xflag & ~XAD_NOTRECORDED;
2032
2033 goto out;
2034 } else /* (nxlen < xlen) */
2035 goto updateLeft;
2036#endif /* _JFS_WIP_NOCOALESCE */
2037
2038/* #ifdef _JFS_WIP_COALESCE */
2039 if (xoff < nxoff)
2040 goto coalesceRight;
2041
2042 /*
2043 * coalesce with left XAD
2044 */
2045//coalesceLeft: /* (xoff == nxoff) */
2046 /* is XAD first entry of page ? */
2047 if (index == XTENTRYSTART)
2048 goto replace;
2049
2050 /* is nXAD logically and physically contiguous with lXAD ? */
2051 lxad = &p->xad[index - 1];
2052 lxlen = lengthXAD(lxad);
2053 if (!(lxad->flag & XAD_NOTRECORDED) &&
2054 (nxoff == offsetXAD(lxad) + lxlen) &&
2055 (nxaddr == addressXAD(lxad) + lxlen) &&
2056 (lxlen + nxlen < MAXXLEN)) {
2057 /* extend right lXAD */
2058 index0 = index - 1;
2059 XADlength(lxad, lxlen + nxlen);
2060
2061		/* If we just merged two extents together, we need to make sure the
2062 * right extent gets logged. If the left one is marked XAD_NEW,
2063 * then we know it will be logged. Otherwise, mark as
2064 * XAD_EXTENDED
2065 */
2066 if (!(lxad->flag & XAD_NEW))
2067 lxad->flag |= XAD_EXTENDED;
2068
2069 if (xlen > nxlen) {
2070 /* truncate XAD */
2071 XADoffset(xad, xoff + nxlen);
2072 XADlength(xad, xlen - nxlen);
2073 XADaddress(xad, xaddr + nxlen);
2074 goto out;
2075 } else { /* (xlen == nxlen) */
2076
2077 /* remove XAD */
2078 if (index < nextindex - 1)
2079 memmove(&p->xad[index], &p->xad[index + 1],
2080 (nextindex - index -
2081 1) << L2XTSLOTSIZE);
2082
2083 p->header.nextindex =
2084 cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2085 1);
2086
2087 index = index0;
2088 newindex = index + 1;
2089 nextindex = le16_to_cpu(p->header.nextindex);
2090 xoff = nxoff = offsetXAD(lxad);
2091 xlen = nxlen = lxlen + nxlen;
2092 xaddr = nxaddr = addressXAD(lxad);
2093 goto coalesceRight;
2094 }
2095 }
2096
2097 /*
2098 * replace XAD with nXAD
2099 */
2100 replace: /* (nxoff == xoff) */
2101 if (nxlen == xlen) {
2102 /* replace XAD with nXAD:recorded */
2103 *xad = *nxad;
2104 xad->flag = xflag & ~XAD_NOTRECORDED;
2105
2106 goto coalesceRight;
2107 } else /* (nxlen < xlen) */
2108 goto updateLeft;
2109
2110 /*
2111 * coalesce with right XAD
2112 */
2113 coalesceRight: /* (xoff <= nxoff) */
2114 /* is XAD last entry of page ? */
2115 if (newindex == nextindex) {
2116 if (xoff == nxoff)
2117 goto out;
2118 goto updateRight;
2119 }
2120
2121 /* is nXAD logically and physically contiguous with rXAD ? */
2122 rxad = &p->xad[index + 1];
2123 rxlen = lengthXAD(rxad);
2124 if (!(rxad->flag & XAD_NOTRECORDED) &&
2125 (nxoff + nxlen == offsetXAD(rxad)) &&
2126 (nxaddr + nxlen == addressXAD(rxad)) &&
2127 (rxlen + nxlen < MAXXLEN)) {
2128 /* extend left rXAD */
2129 XADoffset(rxad, nxoff);
2130 XADlength(rxad, rxlen + nxlen);
2131 XADaddress(rxad, nxaddr);
2132
2133		/* If we just merged two extents together, we need to make sure
2134 * the left extent gets logged. If the right one is marked
2135 * XAD_NEW, then we know it will be logged. Otherwise, mark as
2136 * XAD_EXTENDED
2137 */
2138 if (!(rxad->flag & XAD_NEW))
2139 rxad->flag |= XAD_EXTENDED;
2140
2141 if (xlen > nxlen)
2142 /* truncate XAD */
2143 XADlength(xad, xlen - nxlen);
2144 else { /* (xlen == nxlen) */
2145
2146 /* remove XAD */
2147 memmove(&p->xad[index], &p->xad[index + 1],
2148 (nextindex - index - 1) << L2XTSLOTSIZE);
2149
2150 p->header.nextindex =
2151 cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2152 1);
2153 }
2154
2155 goto out;
2156 } else if (xoff == nxoff)
2157 goto out;
2158
2159 if (xoff >= nxoff) {
2160 XT_PUTPAGE(mp);
2161 jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
2162 return -EIO;
2163 }
2164/* #endif _JFS_WIP_COALESCE */
2165
2166 /*
2167 * split XAD into (lXAD, nXAD):
2168 *
2169 * |---nXAD--->
2170 * --|----------XAD----------|--
2171 * |-lXAD-|
2172 */
2173 updateRight: /* (xoff < nxoff) */
2174 /* truncate old XAD as lXAD:not_recorded */
2175 xad = &p->xad[index];
2176 XADlength(xad, nxoff - xoff);
2177
2178 /* insert nXAD:recorded */
2179 if (nextindex == le16_to_cpu(p->header.maxentry)) {
2180
2181			/* xtSplitUp() unpins leaf pages */
2182 split.mp = mp;
2183 split.index = newindex;
2184 split.flag = xflag & ~XAD_NOTRECORDED;
2185 split.off = nxoff;
2186 split.len = nxlen;
2187 split.addr = nxaddr;
2188 split.pxdlist = NULL;
2189 if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
2190 return rc;
2191
2192 /* get back old page */
2193 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2194 if (rc)
2195 return rc;
2196 /*
2197 * if leaf root has been split, original root has been
2198 * copied to new child page, i.e., original entry now
2199 * resides on the new child page;
2200 */
2201 if (p->header.flag & BT_INTERNAL) {
2202 ASSERT(p->header.nextindex ==
2203 cpu_to_le16(XTENTRYSTART + 1));
2204 xad = &p->xad[XTENTRYSTART];
2205 bn = addressXAD(xad);
2206 XT_PUTPAGE(mp);
2207
2208 /* get new child page */
2209 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2210 if (rc)
2211 return rc;
2212
2213 BT_MARK_DIRTY(mp, ip);
2214 if (!test_cflag(COMMIT_Nolink, ip)) {
2215 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
2216 xtlck = (struct xtlock *) & tlck->lock;
2217 }
2218 } else {
2219 /* is nXAD on new page ? */
2220 if (newindex >
2221 (le16_to_cpu(p->header.maxentry) >> 1)) {
2222 newindex =
2223 newindex -
2224 le16_to_cpu(p->header.nextindex) +
2225 XTENTRYSTART;
2226 newpage = 1;
2227 }
2228 }
2229 } else {
2230 /* if insert into middle, shift right remaining entries */
2231 if (newindex < nextindex)
2232 memmove(&p->xad[newindex + 1], &p->xad[newindex],
2233 (nextindex - newindex) << L2XTSLOTSIZE);
2234
2235 /* insert the entry */
2236 xad = &p->xad[newindex];
2237 *xad = *nxad;
2238 xad->flag = xflag & ~XAD_NOTRECORDED;
2239
2240 /* advance next available entry index. */
2241 p->header.nextindex =
2242 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2243 }
2244
2245 /*
2246 * does nXAD force 3-way split ?
2247 *
2248 * |---nXAD--->|
2249 * --|----------XAD-------------|--
2250 * |-lXAD-| |-rXAD -|
2251 */
2252 if (nxoff + nxlen == xoff + xlen)
2253 goto out;
2254
2255 /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */
2256 if (newpage) {
2257 /* close out old page */
2258 if (!test_cflag(COMMIT_Nolink, ip)) {
2259 xtlck->lwm.offset = (xtlck->lwm.offset) ?
2260 min(index0, (int)xtlck->lwm.offset) : index0;
2261 xtlck->lwm.length =
2262 le16_to_cpu(p->header.nextindex) -
2263 xtlck->lwm.offset;
2264 }
2265
2266 bn = le64_to_cpu(p->header.next);
2267 XT_PUTPAGE(mp);
2268
2269 /* get new right page */
2270 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2271 if (rc)
2272 return rc;
2273
2274 BT_MARK_DIRTY(mp, ip);
2275 if (!test_cflag(COMMIT_Nolink, ip)) {
2276 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2277 xtlck = (struct xtlock *) & tlck->lock;
2278 }
2279
2280 index0 = index = newindex;
2281 } else
2282 index++;
2283
2284 newindex = index + 1;
2285 nextindex = le16_to_cpu(p->header.nextindex);
2286 xlen = xlen - (nxoff - xoff);
2287 xoff = nxoff;
2288 xaddr = nxaddr;
2289
2290 /* recompute split pages */
2291 if (nextindex == le16_to_cpu(p->header.maxentry)) {
2292 XT_PUTPAGE(mp);
2293
2294 if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT)))
2295 return rc;
2296
2297 /* retrieve search result */
2298 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0);
2299
2300 if (cmp != 0) {
2301 XT_PUTPAGE(mp);
2302 jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
2303 return -EIO;
2304 }
2305
2306 if (index0 != index) {
2307 XT_PUTPAGE(mp);
2308 jfs_error(ip->i_sb,
2309 "xtUpdate: unexpected value of index");
2310 return -EIO;
2311 }
2312 }
2313
2314 /*
2315 * split XAD into (nXAD, rXAD)
2316 *
2317 * ---nXAD---|
2318 * --|----------XAD----------|--
2319 * |-rXAD-|
2320 */
2321 updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */
2322 /* update old XAD with nXAD:recorded */
2323 xad = &p->xad[index];
2324 *xad = *nxad;
2325 xad->flag = xflag & ~XAD_NOTRECORDED;
2326
2327 /* insert rXAD:not_recorded */
2328 xoff = xoff + nxlen;
2329 xlen = xlen - nxlen;
2330 xaddr = xaddr + nxlen;
2331 if (nextindex == le16_to_cpu(p->header.maxentry)) {
2332/*
2333printf("xtUpdate.updateLeft.split p:0x%p\n", p);
2334*/
2335		/* xtSplitUp() unpins leaf pages */
2336 split.mp = mp;
2337 split.index = newindex;
2338 split.flag = xflag;
2339 split.off = xoff;
2340 split.len = xlen;
2341 split.addr = xaddr;
2342 split.pxdlist = NULL;
2343 if ((rc = xtSplitUp(tid, ip, &split, &btstack)))
2344 return rc;
2345
2346 /* get back old page */
2347 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2348 if (rc)
2349 return rc;
2350
2351 /*
2352 * if leaf root has been split, original root has been
2353 * copied to new child page, i.e., original entry now
2354 * resides on the new child page;
2355 */
2356 if (p->header.flag & BT_INTERNAL) {
2357 ASSERT(p->header.nextindex ==
2358 cpu_to_le16(XTENTRYSTART + 1));
2359 xad = &p->xad[XTENTRYSTART];
2360 bn = addressXAD(xad);
2361 XT_PUTPAGE(mp);
2362
2363 /* get new child page */
2364 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
2365 if (rc)
2366 return rc;
2367
2368 BT_MARK_DIRTY(mp, ip);
2369 if (!test_cflag(COMMIT_Nolink, ip)) {
2370 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
2371 xtlck = (struct xtlock *) & tlck->lock;
2372 }
2373 }
2374 } else {
2375 /* if insert into middle, shift right remaining entries */
2376 if (newindex < nextindex)
2377 memmove(&p->xad[newindex + 1], &p->xad[newindex],
2378 (nextindex - newindex) << L2XTSLOTSIZE);
2379
2380 /* insert the entry */
2381 xad = &p->xad[newindex];
2382 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2383
2384 /* advance next available entry index. */
2385 p->header.nextindex =
2386 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2387 }
2388
2389 out:
2390 if (!test_cflag(COMMIT_Nolink, ip)) {
2391 xtlck->lwm.offset = (xtlck->lwm.offset) ?
2392 min(index0, (int)xtlck->lwm.offset) : index0;
2393 xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
2394 xtlck->lwm.offset;
2395 }
2396
2397 /* unpin the leaf page */
2398 XT_PUTPAGE(mp);
2399
2400 return rc;
2401}
2402
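/*
 * Sketch of the containment rule xtUpdate() enforces (illustrative
 * helper, not part of the original source): the new XAD must lie
 * entirely within the existing one.
 */
static inline int xadContains(xad_t * xad, xad_t * nxad)
{
	s64 xoff = offsetXAD(xad);
	s64 nxoff = offsetXAD(nxad);

	return (nxoff >= xoff) &&
	    (nxoff + lengthXAD(nxad) <= xoff + lengthXAD(xad));
}
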
2403
2404/*
2405 * xtAppend()
2406 *
2407 * function:	grow in append mode from the contiguous region specified;
2408 *
2409 * parameter:
2410 * tid - transaction id;
2411 * ip - file object;
2412 *	xflag	- extent flag;
2413 * xoff - extent offset;
2414 * maxblocks - max extent length;
2415 * xlen - extent length (in/out);
2416 *	xaddrp	- extent address pointer (in/out);
2417 * flag -
2418 *
2419 * return:
2420 */
2421int xtAppend(tid_t tid, /* transaction id */
2422 struct inode *ip, int xflag, s64 xoff, s32 maxblocks,
2423 s32 * xlenp, /* (in/out) */
2424 s64 * xaddrp, /* (in/out) */
2425 int flag)
2426{
2427 int rc = 0;
2428 struct metapage *mp; /* meta-page buffer */
2429 xtpage_t *p; /* base B+-tree index page */
2430 s64 bn, xaddr;
2431 int index, nextindex;
2432 struct btstack btstack; /* traverse stack */
2433 struct xtsplit split; /* split information */
2434 xad_t *xad;
2435 int cmp;
2436 struct tlock *tlck;
2437 struct xtlock *xtlck;
2438 int nsplit, nblocks, xlen;
2439 struct pxdlist pxdlist;
2440 pxd_t *pxd;
2441
2442 xaddr = *xaddrp;
2443 xlen = *xlenp;
2444 jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx",
2445 (ulong) xoff, maxblocks, xlen, (ulong) xaddr);
2446
2447 /*
2448 * search for the entry location at which to insert:
2449 *
2450	 * xtFastSearch() and xtSearch() both return (leaf page
2451 * pinned, index at which to insert).
2452 * n.b. xtSearch() may return index of maxentry of
2453 * the full page.
2454 */
2455 if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT)))
2456 return rc;
2457
2458 /* retrieve search result */
2459 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2460
2461 if (cmp == 0) {
2462 rc = -EEXIST;
2463 goto out;
2464 }
2465//insert:
2466 /*
2467 * insert entry for new extent
2468 */
2469 xflag |= XAD_NEW;
2470
2471 /*
2472 * if the leaf page is full, split the page and
2473 * propagate up the router entry for the new page from split
2474 *
2475 * The xtSplitUp() will insert the entry and unpin the leaf page.
2476 */
2477 nextindex = le16_to_cpu(p->header.nextindex);
2478 if (nextindex < le16_to_cpu(p->header.maxentry))
2479 goto insertLeaf;
2480
2481 /*
2482 * allocate new index blocks to cover index page split(s)
2483 */
2484 nsplit = btstack.nsplit;
2485 split.pxdlist = &pxdlist;
2486 pxdlist.maxnpxd = pxdlist.npxd = 0;
2487 pxd = &pxdlist.pxd[0];
2488 nblocks = JFS_SBI(ip->i_sb)->nbperpage;
2489 for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {
2490 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
2491 PXDaddress(pxd, xaddr);
2492 PXDlength(pxd, nblocks);
2493
2494 pxdlist.maxnpxd++;
2495
2496 continue;
2497 }
2498
2499 /* undo allocation */
2500
2501 goto out;
2502 }
2503
2504 xlen = min(xlen, maxblocks);
2505
2506 /*
2507 * allocate data extent requested
2508 */
2509 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
2510 goto out;
2511
2512 split.mp = mp;
2513 split.index = index;
2514 split.flag = xflag;
2515 split.off = xoff;
2516 split.len = xlen;
2517 split.addr = xaddr;
2518 if ((rc = xtSplitUp(tid, ip, &split, &btstack))) {
2519 /* undo data extent allocation */
2520 dbFree(ip, *xaddrp, (s64) * xlenp);
2521
2522 return rc;
2523 }
2524
2525 *xaddrp = xaddr;
2526 *xlenp = xlen;
2527 return 0;
2528
2529 /*
2530 * insert the new entry into the leaf page
2531 */
2532 insertLeaf:
2533 /*
2534 * allocate data extent requested
2535 */
2536 if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen)))
2537 goto out;
2538
2539 BT_MARK_DIRTY(mp, ip);
2540 /*
2541 * acquire a transaction lock on the leaf page;
2542 *
2543 * action: xad insertion/extension;
2544 */
2545 tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
2546 xtlck = (struct xtlock *) & tlck->lock;
2547
2548 /* insert the new entry: mark the entry NEW */
2549 xad = &p->xad[index];
2550 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2551
2552 /* advance next available entry index */
2553 p->header.nextindex =
2554 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2555
2556 xtlck->lwm.offset =
2557 (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
2558 xtlck->lwm.length = le16_to_cpu(p->header.nextindex) -
2559 xtlck->lwm.offset;
2560
2561 *xaddrp = xaddr;
2562 *xlenp = xlen;
2563
2564 out:
2565 /* unpin the leaf page */
2566 XT_PUTPAGE(mp);
2567
2568 return rc;
2569}
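
/*
 * Usage sketch (hypothetical caller, not from the original source):
 * append up to *xlenp blocks at file offset xoff, drawing from the
 * contiguous free region starting at *xaddrp.  On return *xlenp and
 * *xaddrp describe the extent actually allocated, which may be
 * shorter if index-split pages consumed part of the region.
 */
static int example_xtAppend(tid_t tid, struct inode *ip, s64 xoff,
			    s32 maxblocks, s32 * xlenp, s64 * xaddrp)
{
	return xtAppend(tid, ip, 0, xoff, maxblocks, xlenp, xaddrp, 0);
}
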
2570#ifdef _STILL_TO_PORT
2571
2572/* - TBD for defragmentation/reorganization -
2573 *
2574 * xtDelete()
2575 *
2576 * function:
2577 * delete the entry with the specified key.
2578 *
2579 * N.B.: whole extent of the entry is assumed to be deleted.
2580 *
2581 * parameter:
2582 *
2583 * return:
2584 * ENOENT: if the entry is not found.
2585 *
2586 * exception:
2587 */
2588int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2589{
2590 int rc = 0;
2591 struct btstack btstack;
2592 int cmp;
2593 s64 bn;
2594 struct metapage *mp;
2595 xtpage_t *p;
2596 int index, nextindex;
2597 struct tlock *tlck;
2598 struct xtlock *xtlck;
2599
2600 /*
2601 * find the matching entry; xtSearch() pins the page
2602 */
2603 if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0)))
2604 return rc;
2605
2606 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
2607 if (cmp) {
2608 /* unpin the leaf page */
2609 XT_PUTPAGE(mp);
2610 return -ENOENT;
2611 }
2612
2613 /*
2614 * delete the entry from the leaf page
2615 */
2616 nextindex = le16_to_cpu(p->header.nextindex);
2617 p->header.nextindex =
2618 cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
2619
2620 /*
2621	 * if the leaf page becomes empty, free the page
2622 */
2623 if (p->header.nextindex == cpu_to_le16(XTENTRYSTART))
2624 return (xtDeleteUp(tid, ip, mp, p, &btstack));
2625
2626 BT_MARK_DIRTY(mp, ip);
2627 /*
2628 * acquire a transaction lock on the leaf page;
2629 *
2630 * action:xad deletion;
2631 */
2632 tlck = txLock(tid, ip, mp, tlckXTREE);
2633 xtlck = (struct xtlock *) & tlck->lock;
2634 xtlck->lwm.offset =
2635	    (xtlck->lwm.offset) ? min(index, (int)xtlck->lwm.offset) : index;
2636
2637 /* if delete from middle, shift left/compact the remaining entries */
2638 if (index < nextindex - 1)
2639 memmove(&p->xad[index], &p->xad[index + 1],
2640 (nextindex - index - 1) * sizeof(xad_t));
2641
2642 XT_PUTPAGE(mp);
2643
2644 return 0;
2645}
2646
2647
2648/* - TBD for defragmentation/reorganization -
2649 *
2650 * xtDeleteUp()
2651 *
2652 * function:
2653 * free empty pages as propagating deletion up the tree
2654 *
2655 * parameter:
2656 *
2657 * return:
2658 */
2659static int
2660xtDeleteUp(tid_t tid, struct inode *ip,
2661 struct metapage * fmp, xtpage_t * fp, struct btstack * btstack)
2662{
2663 int rc = 0;
2664 struct metapage *mp;
2665 xtpage_t *p;
2666 int index, nextindex;
2667 s64 xaddr;
2668 int xlen;
2669 struct btframe *parent;
2670 struct tlock *tlck;
2671 struct xtlock *xtlck;
2672
2673 /*
2674 * keep root leaf page which has become empty
2675 */
2676 if (fp->header.flag & BT_ROOT) {
2677 /* keep the root page */
2678 fp->header.flag &= ~BT_INTERNAL;
2679 fp->header.flag |= BT_LEAF;
2680 fp->header.nextindex = cpu_to_le16(XTENTRYSTART);
2681
2682 /* XT_PUTPAGE(fmp); */
2683
2684 return 0;
2685 }
2686
2687 /*
2688 * free non-root leaf page
2689 */
2690 if ((rc = xtRelink(tid, ip, fp))) {
2691 XT_PUTPAGE(fmp);
2692 return rc;
2693 }
2694
2695 xaddr = addressPXD(&fp->header.self);
2696 xlen = lengthPXD(&fp->header.self);
2697 /* free the page extent */
2698 dbFree(ip, xaddr, (s64) xlen);
2699
2700 /* free the buffer page */
2701 discard_metapage(fmp);
2702
2703 /*
2704 * propagate page deletion up the index tree
2705 *
2706 * If the delete from the parent page makes it empty,
2707 * continue all the way up the tree.
2708 * stop if the root page is reached (which is never deleted) or
2709 * if the entry deletion does not empty the page.
2710 */
2711 while ((parent = BT_POP(btstack)) != NULL) {
2712 /* get/pin the parent page <sp> */
2713 XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc);
2714 if (rc)
2715 return rc;
2716
2717 index = parent->index;
2718
2719 /* delete the entry for the freed child page from parent.
2720 */
2721 nextindex = le16_to_cpu(p->header.nextindex);
2722
2723 /*
2724 * the parent has the single entry being deleted:
2725 * free the parent page which has become empty.
2726 */
2727 if (nextindex == 1) {
2728 if (p->header.flag & BT_ROOT) {
2729 /* keep the root page */
2730 p->header.flag &= ~BT_INTERNAL;
2731 p->header.flag |= BT_LEAF;
2732 p->header.nextindex =
2733 cpu_to_le16(XTENTRYSTART);
2734
2735 /* XT_PUTPAGE(mp); */
2736
2737 break;
2738 } else {
2739 /* free the parent page */
2740 if ((rc = xtRelink(tid, ip, p)))
2741 return rc;
2742
2743 xaddr = addressPXD(&p->header.self);
2744 /* free the page extent */
2745 dbFree(ip, xaddr,
2746 (s64) JFS_SBI(ip->i_sb)->nbperpage);
2747
2748 /* unpin/free the buffer page */
2749 discard_metapage(mp);
2750
2751 /* propagate up */
2752 continue;
2753 }
2754 }
2755 /*
2756 * the parent has other entries remaining:
2757 * delete the router entry from the parent page.
2758 */
2759 else {
2760 BT_MARK_DIRTY(mp, ip);
2761 /*
2762 * acquire a transaction lock on the leaf page;
2763 *
2764 * action:xad deletion;
2765 */
2766 tlck = txLock(tid, ip, mp, tlckXTREE);
2767 xtlck = (struct xtlock *) & tlck->lock;
2768			xtlck->lwm.offset = (xtlck->lwm.offset) ?
2769			    min(index, (int)xtlck->lwm.offset) : index;
2772
2773 /* if delete from middle,
2774 * shift left/compact the remaining entries in the page
2775 */
2776 if (index < nextindex - 1)
2777 memmove(&p->xad[index], &p->xad[index + 1],
2778 (nextindex - index -
2779 1) << L2XTSLOTSIZE);
2780
2781 p->header.nextindex =
2782 cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2783 1);
2784 jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
2785 (ulong) parent->bn, index);
2786 }
2787
2788 /* unpin the parent page */
2789 XT_PUTPAGE(mp);
2790
2791 /* exit propagation up */
2792 break;
2793 }
2794
2795 return 0;
2796}
2797
2798
2799/*
2800 * NAME: xtRelocate()
2801 *
2802 * FUNCTION: relocate xtpage or data extent of regular file;
2803 *	This function is mainly used by the defragfs utility.
2804 *
2805 * NOTE: This routine does not have the logic to handle
2806 *	uncommitted allocated extents. The caller should call
2807 *	txCommit() to commit all the allocations before calling
2808 *	this routine.
2809 */
2810int
2811xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */
2812 s64 nxaddr, /* new xaddr */
2813 int xtype)
2814{ /* extent type: XTPAGE or DATAEXT */
2815 int rc = 0;
2816 struct tblock *tblk;
2817 struct tlock *tlck;
2818 struct xtlock *xtlck;
2819 struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */
2820 xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */
2821 xad_t *xad;
2822 pxd_t *pxd;
2823 s64 xoff, xsize;
2824 int xlen;
2825 s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn;
2826 cbuf_t *cp;
2827 s64 offset, nbytes, nbrd, pno;
2828 int nb, npages, nblks;
2829 s64 bn;
2830 int cmp;
2831 int index;
2832 struct pxd_lock *pxdlock;
2833 struct btstack btstack; /* traverse stack */
2834
2835 xtype = xtype & EXTENT_TYPE;
2836
2837 xoff = offsetXAD(oxad);
2838 oxaddr = addressXAD(oxad);
2839 xlen = lengthXAD(oxad);
2840
2841 /* validate extent offset */
2842 offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2843 if (offset >= ip->i_size)
2844 return -ESTALE; /* stale extent */
2845
2846 jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx",
2847 xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr);
2848
2849 /*
2850 * 1. get and validate the parent xtpage/xad entry
2851 * covering the source extent to be relocated;
2852 */
2853 if (xtype == DATAEXT) {
2854 /* search in leaf entry */
2855 rc = xtSearch(ip, xoff, &cmp, &btstack, 0);
2856 if (rc)
2857 return rc;
2858
2859 /* retrieve search result */
2860 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2861
2862 if (cmp) {
2863 XT_PUTPAGE(pmp);
2864 return -ESTALE;
2865 }
2866
2867 /* validate for exact match with a single entry */
2868 xad = &pp->xad[index];
2869 if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) {
2870 XT_PUTPAGE(pmp);
2871 return -ESTALE;
2872 }
2873 } else { /* (xtype == XTPAGE) */
2874
2875 /* search in internal entry */
2876 rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0);
2877 if (rc)
2878 return rc;
2879
2880 /* retrieve search result */
2881 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2882
2883 if (cmp) {
2884 XT_PUTPAGE(pmp);
2885 return -ESTALE;
2886 }
2887
2888 /* xtSearchNode() validated for exact match with a single entry
2889 */
2890 xad = &pp->xad[index];
2891 }
2892 jfs_info("xtRelocate: parent xad entry validated.");
2893
2894 /*
2895 * 2. relocate the extent
2896 */
2897 if (xtype == DATAEXT) {
2898 /* if the extent is allocated-but-not-recorded
2899		 * there is no real data to be moved in this extent.
2900 */
2901 if (xad->flag & XAD_NOTRECORDED)
2902 goto out;
2903 else
2904 /* release xtpage for cmRead()/xtLookup() */
2905 XT_PUTPAGE(pmp);
2906
2907 /*
2908 * cmRelocate()
2909 *
2910 * copy target data pages to be relocated;
2911 *
2912 * data extent must start at page boundary and
2913	 * be a multiple of page size (except the last data extent);
2914 * read in each page of the source data extent into cbuf,
2915 * update the cbuf extent descriptor of the page to be
2916 * homeward bound to new dst data extent
2917 * copy the data from the old extent to new extent.
2918 * copy is essential for compressed files to avoid problems
2919 * that can arise if there was a change in compression
2920 * algorithms.
2921 * it is a good strategy because it may disrupt cache
2922 * policy to keep the pages in memory afterwards.
2923 */
2924 offset = xoff << JFS_SBI(ip->i_sb)->l2bsize;
2925 assert((offset & CM_OFFSET) == 0);
2926 nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize;
2927 pno = offset >> CM_L2BSIZE;
2928 npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE;
2929/*
2930 npages = ((offset + nbytes - 1) >> CM_L2BSIZE) -
2931 (offset >> CM_L2BSIZE) + 1;
2932*/
2933 sxaddr = oxaddr;
2934 dxaddr = nxaddr;
2935
2936 /* process the request one cache buffer at a time */
2937 for (nbrd = 0; nbrd < nbytes; nbrd += nb,
2938 offset += nb, pno++, npages--) {
2939 /* compute page size */
2940 nb = min(nbytes - nbrd, CM_BSIZE);
2941
2942 /* get the cache buffer of the page */
2943			if ((rc = cmRead(ip, offset, npages, &cp)))
2944 break;
2945
2946 assert(addressPXD(&cp->cm_pxd) == sxaddr);
2947 assert(!cp->cm_modified);
2948
2949 /* bind buffer with the new extent address */
2950			nblks = nb >> JFS_SBI(ip->i_sb)->l2bsize;
2951 cmSetXD(ip, cp, pno, dxaddr, nblks);
2952
2953 /* release the cbuf, mark it as modified */
2954 cmPut(cp, TRUE);
2955
2956 dxaddr += nblks;
2957 sxaddr += nblks;
2958 }
2959
2960 /* get back parent page */
2961 if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0)))
2962 return rc;
2963
2964 XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index);
2965 jfs_info("xtRelocate: target data extent relocated.");
2966 } else { /* (xtype == XTPAGE) */
2967
2968 /*
2969 * read in the target xtpage from the source extent;
2970 */
2971 XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc);
2972 if (rc) {
2973 XT_PUTPAGE(pmp);
2974 return rc;
2975 }
2976
2977 /*
2978 * read in sibling pages if any to update sibling pointers;
2979 */
2980 rmp = NULL;
2981 if (p->header.next) {
2982 nextbn = le64_to_cpu(p->header.next);
2983 XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc);
2984 if (rc) {
2985 XT_PUTPAGE(pmp);
2986 XT_PUTPAGE(mp);
2987 return (rc);
2988 }
2989 }
2990
2991 lmp = NULL;
2992 if (p->header.prev) {
2993 prevbn = le64_to_cpu(p->header.prev);
2994 XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc);
2995 if (rc) {
2996 XT_PUTPAGE(pmp);
2997 XT_PUTPAGE(mp);
2998 if (rmp)
2999 XT_PUTPAGE(rmp);
3000 return (rc);
3001 }
3002 }
3003
3004 /* at this point, all xtpages to be updated are in memory */
3005
3006 /*
3007 * update sibling pointers of sibling xtpages if any;
3008 */
3009 if (lmp) {
3010 BT_MARK_DIRTY(lmp, ip);
3011 tlck =
3012 txLock(tid, ip, lmp, tlckXTREE | tlckRELINK);
3013 lp->header.next = cpu_to_le64(nxaddr);
3014 XT_PUTPAGE(lmp);
3015 }
3016
3017 if (rmp) {
3018 BT_MARK_DIRTY(rmp, ip);
3019 tlck =
3020 txLock(tid, ip, rmp, tlckXTREE | tlckRELINK);
3021 rp->header.prev = cpu_to_le64(nxaddr);
3022 XT_PUTPAGE(rmp);
3023 }
3024
3025 /*
3026 * update the target xtpage to be relocated
3027 *
3028 * update the self address of the target page
3029 * and write to destination extent;
3030 * redo image covers the whole xtpage since it is new page
3031 * to the destination extent;
3032 * update of bmap for the free of source extent
3033 * of the target xtpage itself:
3034 * update of bmap for the allocation of destination extent
3035 * of the target xtpage itself:
3036 * update of bmap for the extents covered by xad entries in
3037 * the target xtpage is not necessary since they are not
3038 * updated;
3039 * if not committed before this relocation,
3040 * target page may contain XAD_NEW entries which must
3041 * be scanned for bmap update (logredo() always
3042 * scan xtpage REDOPAGE image for bmap update);
3043 * if committed before this relocation (tlckRELOCATE),
3044 * scan may be skipped by commit() and logredo();
3045 */
3046 BT_MARK_DIRTY(mp, ip);
3047 /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */
3048 tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW);
3049 xtlck = (struct xtlock *) & tlck->lock;
3050
3051 /* update the self address in the xtpage header */
3052 pxd = &p->header.self;
3053 PXDaddress(pxd, nxaddr);
3054
3055 /* linelock for the after image of the whole page */
3056 xtlck->lwm.length =
3057 le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset;
3058
3059 /* update the buffer extent descriptor of target xtpage */
3060 xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize;
3061 bmSetXD(mp, nxaddr, xsize);
3062
3063 /* unpin the target page to new homeward bound */
3064 XT_PUTPAGE(mp);
3065 jfs_info("xtRelocate: target xtpage relocated.");
3066 }
3067
3068 /*
3069 * 3. acquire maplock for the source extent to be freed;
3070 *
3071 * acquire a maplock saving the src relocated extent address;
3072 * to free of the extent at commit time;
3073 */
3074 out:
3075 /* if DATAEXT relocation, write a LOG_UPDATEMAP record for
3076 * free PXD of the source data extent (logredo() will update
3077 * bmap for free of source data extent), and update bmap for
3078 * free of the source data extent;
3079 */
3080 if (xtype == DATAEXT)
3081 tlck = txMaplock(tid, ip, tlckMAP);
3082 /* if XTPAGE relocation, write a LOG_NOREDOPAGE record
3083 * for the source xtpage (logredo() will init NoRedoPage
3084 * filter and will also update bmap for free of the source
3085 * xtpage), and update bmap for free of the source xtpage;
3086	 * N.B. We use tlckMAP instead of tlckXTREE because there
3087 * is no buffer associated with this lock since the buffer
3088 * has been redirected to the target location.
3089 */
3090 else /* (xtype == XTPAGE) */
3091 tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE);
3092
3093 pxdlock = (struct pxd_lock *) & tlck->lock;
3094 pxdlock->flag = mlckFREEPXD;
3095 PXDaddress(&pxdlock->pxd, oxaddr);
3096 PXDlength(&pxdlock->pxd, xlen);
3097 pxdlock->index = 1;
3098
3099 /*
3100 * 4. update the parent xad entry for relocation;
3101 *
3102 * acquire tlck for the parent entry with XAD_NEW as entry
3103 * update which will write LOG_REDOPAGE and update bmap for
3104 * allocation of XAD_NEW destination extent;
3105 */
3106 jfs_info("xtRelocate: update parent xad entry.");
3107 BT_MARK_DIRTY(pmp, ip);
3108 tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW);
3109 xtlck = (struct xtlock *) & tlck->lock;
3110
3111 /* update the XAD with the new destination extent; */
3112 xad = &pp->xad[index];
3113 xad->flag |= XAD_NEW;
3114 XADaddress(xad, nxaddr);
3115
3116	xtlck->lwm.offset = min(index, (int)xtlck->lwm.offset);
3117 xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) -
3118 xtlck->lwm.offset;
3119
3120 /* unpin the parent xtpage */
3121 XT_PUTPAGE(pmp);
3122
3123 return rc;
3124}
3125
3126
3127/*
3128 * xtSearchNode()
3129 *
3130 * function: search for the internal xad entry covering specified extent.
3131 * This function is mainly used by the defragfs utility.
3132 *
3133 * parameters:
3134 * ip - file object;
3135 * xad - extent to find;
3136 * cmpp - comparison result:
3137 * btstack - traverse stack;
3138 * flag - search process flag;
3139 *
3140 * returns:
3141 * btstack contains (bn, index) of search path traversed to the entry.
3142 * *cmpp is set to result of comparison with the entry returned.
3143 * the page containing the entry is pinned at exit.
3144 */
3145static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */
3146 int *cmpp, struct btstack * btstack, int flag)
3147{
3148 int rc = 0;
3149 s64 xoff, xaddr;
3150 int xlen;
3151 int cmp = 1; /* init for empty page */
3152 s64 bn; /* block number */
3153 struct metapage *mp; /* meta-page buffer */
3154 xtpage_t *p; /* page */
3155 int base, index, lim;
3156 struct btframe *btsp;
3157 s64 t64;
3158
3159 BT_CLR(btstack);
3160
3161 xoff = offsetXAD(xad);
3162 xlen = lengthXAD(xad);
3163 xaddr = addressXAD(xad);
3164
3165 /*
3166 * search down tree from root:
3167 *
3168 * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
3169 * internal page, child page Pi contains entry with k, Ki <= K < Kj.
3170 *
3171 * if entry with search key K is not found
3172 * internal page search find the entry with largest key Ki
3173 * less than K which point to the child page to search;
3174 * leaf page search find the entry with smallest key Kj
3175 * greater than K so that the returned index is the position of
3176 * the entry to be shifted right for insertion of new entry.
3177 * for empty tree, search key is greater than any key of the tree.
3178 *
3179 * by convention, root bn = 0.
3180 */
3181 for (bn = 0;;) {
3182 /* get/pin the page to search */
3183 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3184 if (rc)
3185 return rc;
3186 if (p->header.flag & BT_LEAF) {
3187 XT_PUTPAGE(mp);
3188 return -ESTALE;
3189 }
3190
3191 lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
3192
3193 /*
3194 * binary search with search key K on the current page
3195 */
3196 for (base = XTENTRYSTART; lim; lim >>= 1) {
3197 index = base + (lim >> 1);
3198
3199 XT_CMP(cmp, xoff, &p->xad[index], t64);
3200 if (cmp == 0) {
3201 /*
3202 * search hit
3203 *
3204 * verify for exact match;
3205 */
3206 if (xaddr == addressXAD(&p->xad[index]) &&
3207 xoff == offsetXAD(&p->xad[index])) {
3208 *cmpp = cmp;
3209
3210 /* save search result */
3211 btsp = btstack->top;
3212 btsp->bn = bn;
3213 btsp->index = index;
3214 btsp->mp = mp;
3215
3216 return 0;
3217 }
3218
3219 /* descend/search its child page */
3220 goto next;
3221 }
3222
3223 if (cmp > 0) {
3224 base = index + 1;
3225 --lim;
3226 }
3227 }
3228
3229 /*
3230 * search miss - non-leaf page:
3231 *
3232 * base is the smallest index with key (Kj) greater than
3233 * search key (K) and may be zero or maxentry index.
3234 * if base is non-zero, decrement base by one to get the parent
3235 * entry of the child page to search.
3236 */
3237 index = base ? base - 1 : base;
3238
3239 /*
3240 * go down to child page
3241 */
3242 next:
3243 /* get the child page block number */
3244 bn = addressXAD(&p->xad[index]);
3245
3246 /* unpin the parent page */
3247 XT_PUTPAGE(mp);
3248 }
3249}
3250
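/*
 * Sketch (illustrative, not in the original source): on a binary
 * search miss in an internal page, <base> is the smallest index
 * whose key exceeds the search key, so the child to descend into is
 * the entry just before it (or the first entry when base is at the
 * start of the page).
 */
static inline int xtRouterIndex(int base)
{
	/* mirrors "index = base ? base - 1 : base" in xtSearchNode() */
	return base ? base - 1 : base;
}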
3251
3252/*
3253 * xtRelink()
3254 *
3255 * function:
3256 * link around a freed page.
3257 *
3258 * Parameter:
3259 * int tid,
3260 * struct inode *ip,
3261 * xtpage_t *p)
3262 *
3263 * returns:
3264 */
3265static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p)
3266{
3267 int rc = 0;
3268 struct metapage *mp;
3269 s64 nextbn, prevbn;
3270 struct tlock *tlck;
3271
3272 nextbn = le64_to_cpu(p->header.next);
3273 prevbn = le64_to_cpu(p->header.prev);
3274
3275 /* update prev pointer of the next page */
3276 if (nextbn != 0) {
3277 XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
3278 if (rc)
3279 return rc;
3280
3281 /*
3282 * acquire a transaction lock on the page;
3283 *
3284 * action: update prev pointer;
3285 */
3286 BT_MARK_DIRTY(mp, ip);
3287 tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3288
3289 /* the page may already have been tlock'd */
3290
3291 p->header.prev = cpu_to_le64(prevbn);
3292
3293 XT_PUTPAGE(mp);
3294 }
3295
3296 /* update next pointer of the previous page */
3297 if (prevbn != 0) {
3298 XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc);
3299 if (rc)
3300 return rc;
3301
3302 /*
3303 * acquire a transaction lock on the page;
3304 *
3305 * action: update next pointer;
3306 */
3307 BT_MARK_DIRTY(mp, ip);
3308 tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK);
3309
3310 /* the page may already have been tlock'd */
3311
3312		p->header.next = cpu_to_le64(nextbn);
3313
3314 XT_PUTPAGE(mp);
3315 }
3316
3317 return 0;
3318}
3319#endif /* _STILL_TO_PORT */
3320
3321
3322/*
3323 * xtInitRoot()
3324 *
3325 * initialize file root (inline in inode)
3326 */
3327void xtInitRoot(tid_t tid, struct inode *ip)
3328{
3329 xtpage_t *p;
3330
3331 /*
3332 * acquire a transaction lock on the root
3333 *
3334 * action:
3335 */
3336 txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag,
3337 tlckXTREE | tlckNEW);
3338 p = &JFS_IP(ip)->i_xtroot;
3339
3340 p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF;
3341 p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3342
3343 if (S_ISDIR(ip->i_mode))
3344 p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR);
3345 else {
3346 p->header.maxentry = cpu_to_le16(XTROOTINITSLOT);
3347 ip->i_size = 0;
3348 }
3349
3350
3351 return;
3352}
3353
3354
3355/*
3356 * We can run into a deadlock truncating a file with a large number of
3357 * xtree pages (large fragmented file). A robust fix would entail a
3358 * reservation system where we would reserve a number of metadata pages
3359 * and tlocks which we would be guaranteed without a deadlock. Without
3360 * this, a partial fix is to limit number of metadata pages we will lock
3361 * in a single transaction. Currently we will truncate the file so that
3362 * no more than 50 leaf pages will be locked. The caller of xtTruncate
3363 * will be responsible for ensuring that the current transaction gets
3364 * committed, and that subsequent transactions are created to truncate
3365 * the file further if needed.
3366 */
3367#define MAX_TRUNCATE_LEAVES 50
3368
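/*
 * Caller-side sketch (illustrative and simplified; txBegin/txCommit/
 * txEnd usage is assumed, see jfs_txnmgr.c): because at most
 * MAX_TRUNCATE_LEAVES leaf pages are locked per transaction,
 * xtTruncate() may stop short of the requested size; the caller
 * commits and retries until the target size is reached, roughly:
 *
 *	do {
 *		tid = txBegin(ip->i_sb, 0);
 *		newsize = xtTruncate(tid, ip, target,
 *				     COMMIT_TRUNCATE | COMMIT_PWMAP);
 *		txCommit(tid, 1, &ip, 0);
 *		txEnd(tid);
 *	} while (newsize > target);
 */
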
3369/*
3370 * xtTruncate()
3371 *
3372 * function:
3373 * traverse for truncation logging backward bottom up;
3374 * terminate at the last extent entry at the current subtree
3375 * root page covering new down size.
3376 * truncation may occur within the last extent entry.
3377 *
3378 * parameter:
3379 * int tid,
3380 * struct inode *ip,
3381 * s64 newsize,
3382 * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE}
3383 *
3384 * return:
3385 *
3386 * note:
3387 * PWMAP:
3388 * 1. truncate (non-COMMIT_NOLINK file)
3389 * by jfs_truncate() or jfs_open(O_TRUNC):
3390 * xtree is updated;
3391 * 2. truncate index table of directory when last entry removed
3392 * map update via tlock at commit time;
3393 * PMAP:
3394 * Call xtTruncate_pmap instead
3395 * WMAP:
3396 * 1. remove (free zero link count) on last reference release
3397 * (pmap has been freed at commit zero link count);
3398 * 2. truncate (COMMIT_NOLINK file, i.e., tmp file):
3399 * xtree is updated;
3400 * map update directly at truncation time;
3401 *
3402 * if (DELETE)
3403 * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient);
3404 * else if (TRUNCATE)
3405 * must write LOG_NOREDOPAGE for deleted index page;
3406 *
3407 * pages may already have been tlocked by anonymous transactions
3408 * during file growth (i.e., write) before truncation;
3409 *
3410 *	except for the last truncated entry, deleted entries remain as is
3411 *	in the page (nextindex is updated) for other uses
3412 *	(e.g., logging/updating the allocation map): this avoids copying
3413 *	the page info but delays the freeing of pages;
3414 *
3415 */
3416s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
3417{
3418 int rc = 0;
3419 s64 teof;
3420 struct metapage *mp;
3421 xtpage_t *p;
3422 s64 bn;
3423 int index, nextindex;
3424 xad_t *xad;
3425 s64 xoff, xaddr;
3426 int xlen, len, freexlen;
3427 struct btstack btstack;
3428 struct btframe *parent;
3429 struct tblock *tblk = NULL;
3430 struct tlock *tlck = NULL;
3431 struct xtlock *xtlck = NULL;
3432 struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */
3433 struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */
3434 s64 nfreed;
3435 int freed, log;
3436 int locked_leaves = 0;
3437
3438 /* save object truncation type */
3439 if (tid) {
3440 tblk = tid_to_tblock(tid);
3441 tblk->xflag |= flag;
3442 }
3443
3444 nfreed = 0;
3445
3446 flag &= COMMIT_MAP;
3447 assert(flag != COMMIT_PMAP);
3448
3449 if (flag == COMMIT_PWMAP)
3450 log = 1;
3451 else {
3452 log = 0;
3453 xadlock.flag = mlckFREEXADLIST;
3454 xadlock.index = 1;
3455 }
3456
3457 /*
3458 * if the newsize is not an integral number of pages,
3459 * the file between newsize and next page boundary will
3460 * be cleared.
3461 * if truncating into a file hole, it will cause
3462 * a full block to be allocated for the logical block.
3463 */
3464
3465 /*
3466 * release page blocks of truncated region <teof, eof>
3467 *
3468 * free the data blocks from the leaf index blocks.
3469 * delete the parent index entries corresponding to
3470 * the freed child data/index blocks.
3471 * free the index blocks themselves which aren't needed
3472 * in new sized file.
3473 *
3474 * index blocks are updated only if the blocks are to be
3475 * retained in the new sized file.
3476 * if type is PMAP, the data and index pages are NOT
3477 * freed, and the data and index blocks are NOT freed
3478 * from working map.
3479 * (this will allow continued access of data/index of
3480 * temporary file (zerolink count file truncated to zero-length)).
3481 */
3482 teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >>
3483 JFS_SBI(ip->i_sb)->l2bsize;
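	/*
	 * e.g. (illustrative numbers): assuming a 4 KiB block size
	 * (bsize = 4096, l2bsize = 12), newsize = 10000 gives
	 * teof = (10000 + 4095) >> 12 = 3, so logical blocks 0-2 are
	 * retained and everything from block 3 onward is truncated.
	 */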
3484
3485 /* clear stack */
3486 BT_CLR(&btstack);
3487
3488 /*
3489 * start with root
3490 *
3491 * root resides in the inode
3492 */
3493 bn = 0;
3494
3495 /*
3496 * first access of each page:
3497 */
3498 getPage:
3499 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3500 if (rc)
3501 return rc;
3502
3503 /* process entries backward from last index */
3504 index = le16_to_cpu(p->header.nextindex) - 1;
3505
3506 if (p->header.flag & BT_INTERNAL)
3507 goto getChild;
3508
3509 /*
3510 * leaf page
3511 */
3512
3513 /* Since this is the rightmost leaf, and we may have already freed
3514 * a page that was formerly to the right, let's make sure that the
3515 * next pointer is zero.
3516 */
3517 if (p->header.next) {
3518 if (log)
3519 /*
3520 * Make sure this change to the header is logged.
3521 * If we really truncate this leaf, the flag
3522 * will be changed to tlckTRUNCATE
3523 */
3524 tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW);
3525 BT_MARK_DIRTY(mp, ip);
3526 p->header.next = 0;
3527 }
3528
3529 freed = 0;
3530
3531 /* does region covered by leaf page precede Teof ? */
3532 xad = &p->xad[index];
3533 xoff = offsetXAD(xad);
3534 xlen = lengthXAD(xad);
3535 if (teof >= xoff + xlen) {
3536 XT_PUTPAGE(mp);
3537 goto getParent;
3538 }
3539
3540 /* (re)acquire tlock of the leaf page */
3541 if (log) {
3542 if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
3543 /*
3544 * We need to limit the size of the transaction
3545 * to avoid exhausting pagecache & tlocks
3546 */
3547 XT_PUTPAGE(mp);
3548 newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
3549 goto getParent;
3550 }
3551 tlck = txLock(tid, ip, mp, tlckXTREE);
3552 tlck->type = tlckXTREE | tlckTRUNCATE;
3553 xtlck = (struct xtlock *) & tlck->lock;
3554 xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1;
3555 }
3556 BT_MARK_DIRTY(mp, ip);
3557
3558 /*
3559 * scan backward leaf page entries
3560 */
3561 for (; index >= XTENTRYSTART; index--) {
3562 xad = &p->xad[index];
3563 xoff = offsetXAD(xad);
3564 xlen = lengthXAD(xad);
3565 xaddr = addressXAD(xad);
3566
3567 /*
3568 * The "data" for a directory is indexed by the block
3569 * device's address space. This metadata must be invalidated
3570 * here
3571 */
3572 if (S_ISDIR(ip->i_mode) && (teof == 0))
3573 invalidate_xad_metapages(ip, *xad);
3574 /*
3575 * entry beyond eof: continue scan of current page
3576 * xad
3577 * ---|---=======------->
3578 * eof
3579 */
3580 if (teof < xoff) {
3581 nfreed += xlen;
3582 continue;
3583 }
3584
3585 /*
3586 * (xoff <= teof): last entry to be deleted from page;
3587 * If other entries remain in page: keep and update the page.
3588 */
3589
3590 /*
3591 * eof == entry_start: delete the entry
3592 * xad
3593 * -------|=======------->
3594 * eof
3595 *
3596 */
3597 if (teof == xoff) {
3598 nfreed += xlen;
3599
3600 if (index == XTENTRYSTART)
3601 break;
3602
3603 nextindex = index;
3604 }
3605 /*
3606 * eof within the entry: truncate the entry.
3607 * xad
3608 * -------===|===------->
3609 * eof
3610 */
3611 else if (teof < xoff + xlen) {
3612 /* update truncated entry */
3613 len = teof - xoff;
3614 freexlen = xlen - len;
3615 XADlength(xad, len);
3616
3617 /* save pxd of truncated extent in tlck */
3618 xaddr += len;
3619 if (log) { /* COMMIT_PWMAP */
3620 xtlck->lwm.offset = (xtlck->lwm.offset) ?
3621 min(index, (int)xtlck->lwm.offset) : index;
3622 xtlck->lwm.length = index + 1 -
3623 xtlck->lwm.offset;
3624 xtlck->twm.offset = index;
3625 pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
3626 pxdlock->flag = mlckFREEPXD;
3627 PXDaddress(&pxdlock->pxd, xaddr);
3628 PXDlength(&pxdlock->pxd, freexlen);
3629 }
3630 /* free truncated extent */
3631 else { /* COMMIT_WMAP */
3632
3633 pxdlock = (struct pxd_lock *) & xadlock;
3634 pxdlock->flag = mlckFREEPXD;
3635 PXDaddress(&pxdlock->pxd, xaddr);
3636 PXDlength(&pxdlock->pxd, freexlen);
3637 txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
3638
3639 /* reset map lock */
3640 xadlock.flag = mlckFREEXADLIST;
3641 }
3642
3643 /* current entry is new last entry; */
3644 nextindex = index + 1;
3645
3646 nfreed += freexlen;
3647 }
3648 /*
3649 * eof beyond the entry:
3650 * xad
3651 * -------=======---|--->
3652 * eof
3653 */
3654 else { /* (xoff + xlen < teof) */
3655
3656 nextindex = index + 1;
3657 }
3658
3659 if (nextindex < le16_to_cpu(p->header.nextindex)) {
3660 if (!log) { /* COMMIT_WMAP */
3661 xadlock.xdlist = &p->xad[nextindex];
3662 xadlock.count =
3663 le16_to_cpu(p->header.nextindex) -
3664 nextindex;
3665 txFreeMap(ip, (struct maplock *) & xadlock,
3666 NULL, COMMIT_WMAP);
3667 }
3668 p->header.nextindex = cpu_to_le16(nextindex);
3669 }
3670
3671 XT_PUTPAGE(mp);
3672
3673 /* assert(freed == 0); */
3674 goto getParent;
3675 } /* end scan of leaf page entries */
3676
3677 freed = 1;
3678
3679 /*
3680 * leaf page becomes empty: free the page if type != PMAP
3681 */
3682 if (log) { /* COMMIT_PWMAP */
3683 /* txCommit() with tlckFREE:
3684 * free data extents covered by leaf [XTENTRYSTART:hwm);
3685 * invalidate leaf if COMMIT_PWMAP;
3686 * if (TRUNCATE), will write LOG_NOREDOPAGE;
3687 */
3688 tlck->type = tlckXTREE | tlckFREE;
3689 } else { /* COMMIT_WMAP */
3690
3691 /* free data extents covered by leaf */
3692 xadlock.xdlist = &p->xad[XTENTRYSTART];
3693 xadlock.count =
3694 le16_to_cpu(p->header.nextindex) - XTENTRYSTART;
3695 txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP);
3696 }
3697
3698 if (p->header.flag & BT_ROOT) {
3699 p->header.flag &= ~BT_INTERNAL;
3700 p->header.flag |= BT_LEAF;
3701 p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3702
3703 XT_PUTPAGE(mp); /* debug */
3704 goto out;
3705 } else {
3706 if (log) { /* COMMIT_PWMAP */
3707 /* page will be invalidated at tx completion
3708 */
3709 XT_PUTPAGE(mp);
3710 } else { /* COMMIT_WMAP */
3711
3712 if (mp->lid)
3713 lid_to_tlock(mp->lid)->flag |= tlckFREELOCK;
3714
3715 /* invalidate empty leaf page */
3716 discard_metapage(mp);
3717 }
3718 }
3719
3720 /*
3721 * the leaf page has become empty: delete the parent entry
3722 * for the leaf page if the parent page is to be kept
3723 * in the new sized file.
3724 */
3725
3726 /*
3727 * go back up to the parent page
3728 */
3729 getParent:
3730 /* pop/restore parent entry for the current child page */
3731 if ((parent = BT_POP(&btstack)) == NULL)
3732 /* current page must have been root */
3733 goto out;
3734
3735 /* get back the parent page */
3736 bn = parent->bn;
3737 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3738 if (rc)
3739 return rc;
3740
3741 index = parent->index;
3742
3743 /*
3744 * child page was not empty:
3745 */
3746 if (freed == 0) {
3747 /* has any entry deleted from parent ? */
3748 if (index < le16_to_cpu(p->header.nextindex) - 1) {
3749 /* (re)acquire tlock on the parent page */
3750 if (log) { /* COMMIT_PWMAP */
3751 /* txCommit() with tlckTRUNCATE:
3752 * free child extents covered by parent [index + 1, nextindex);
3753 */
3754 tlck = txLock(tid, ip, mp, tlckXTREE);
3755 xtlck = (struct xtlock *) & tlck->lock;
3756 if (!(tlck->type & tlckTRUNCATE)) {
3757 xtlck->hwm.offset =
3758 le16_to_cpu(p->header.
3759 nextindex) - 1;
3760 tlck->type =
3761 tlckXTREE | tlckTRUNCATE;
3762 }
3763 } else { /* COMMIT_WMAP */
3764
3765 /* free child extents covered by parent */
3766 xadlock.xdlist = &p->xad[index + 1];
3767 xadlock.count =
3768 le16_to_cpu(p->header.nextindex) -
3769 index - 1;
3770 txFreeMap(ip, (struct maplock *) & xadlock,
3771 NULL, COMMIT_WMAP);
3772 }
3773 BT_MARK_DIRTY(mp, ip);
3774
3775 p->header.nextindex = cpu_to_le16(index + 1);
3776 }
3777 XT_PUTPAGE(mp);
3778 goto getParent;
3779 }
3780
3781 /*
3782 * child page was empty:
3783 */
3784 nfreed += lengthXAD(&p->xad[index]);
3785
3786 /*
3787 * During working map update, child page's tlock must be handled
3788 * before parent's. This is because the parent's tlock will cause
3789 * the child's disk space to be marked available in the wmap, so
3790 * it's important that the child page be released by that time.
3791 *
3792 * ToDo: tlocks should be on doubly-linked list, so we can
3793 * quickly remove it and add it to the end.
3794 */
3795
3796 /*
3797 * Move parent page's tlock to the end of the tid's tlock list
3798 */
3799 if (log && mp->lid && (tblk->last != mp->lid) &&
3800 lid_to_tlock(mp->lid)->tid) {
3801 lid_t lid = mp->lid;
3802 struct tlock *prev;
3803
3804 tlck = lid_to_tlock(lid);
3805
3806 if (tblk->next == lid)
3807 tblk->next = tlck->next;
3808 else {
3809 for (prev = lid_to_tlock(tblk->next);
3810 prev->next != lid;
3811 prev = lid_to_tlock(prev->next)) {
3812 assert(prev->next);
3813 }
3814 prev->next = tlck->next;
3815 }
3816 lid_to_tlock(tblk->last)->next = lid;
3817 tlck->next = 0;
3818 tblk->last = lid;
3819 }
3820
3821 /*
3822 * parent page becomes empty: free the page
3823 */
3824 if (index == XTENTRYSTART) {
3825 if (log) { /* COMMIT_PWMAP */
3826 /* txCommit() with tlckFREE:
3827 * free child extents covered by parent;
3828 * invalidate parent if COMMIT_PWMAP;
3829 */
3830 tlck = txLock(tid, ip, mp, tlckXTREE);
3831 xtlck = (struct xtlock *) & tlck->lock;
3832 xtlck->hwm.offset =
3833 le16_to_cpu(p->header.nextindex) - 1;
3834 tlck->type = tlckXTREE | tlckFREE;
3835 } else { /* COMMIT_WMAP */
3836
3837 /* free child extents covered by parent */
3838 xadlock.xdlist = &p->xad[XTENTRYSTART];
3839 xadlock.count =
3840 le16_to_cpu(p->header.nextindex) -
3841 XTENTRYSTART;
3842 txFreeMap(ip, (struct maplock *) & xadlock, NULL,
3843 COMMIT_WMAP);
3844 }
3845 BT_MARK_DIRTY(mp, ip);
3846
3847 if (p->header.flag & BT_ROOT) {
3848 p->header.flag &= ~BT_INTERNAL;
3849 p->header.flag |= BT_LEAF;
3850 p->header.nextindex = cpu_to_le16(XTENTRYSTART);
3851 if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) {
3852 /*
3853 * Shrink root down to allow inline
3854 * EA (otherwise fsck complains)
3855 */
3856 p->header.maxentry =
3857 cpu_to_le16(XTROOTINITSLOT);
3858 JFS_IP(ip)->mode2 |= INLINEEA;
3859 }
3860
3861 XT_PUTPAGE(mp); /* debug */
3862 goto out;
3863 } else {
3864 if (log) { /* COMMIT_PWMAP */
3865 /* page will be invalidated at tx completion
3866 */
3867 XT_PUTPAGE(mp);
3868 } else { /* COMMIT_WMAP */
3869
3870 if (mp->lid)
3871 lid_to_tlock(mp->lid)->flag |=
3872 tlckFREELOCK;
3873
3874 /* invalidate parent page */
3875 discard_metapage(mp);
3876 }
3877
3878 /* parent has become empty and freed:
3879 * go back up to its parent page
3880 */
3881 /* freed = 1; */
3882 goto getParent;
3883 }
3884 }
3885 /*
3886 * parent page still has entries for front region;
3887 */
3888 else {
3889 /* try truncate region covered by preceding entry
3890 * (process backward)
3891 */
3892 index--;
3893
3894 /* go back down to the child page corresponding
3895 * to the entry
3896 */
3897 goto getChild;
3898 }
3899
3900 /*
3901 * internal page: go down to child page of current entry
3902 */
3903 getChild:
3904 /* save current parent entry for the child page */
3905 BT_PUSH(&btstack, bn, index);
3906
3907 /* get child page */
3908 xad = &p->xad[index];
3909 bn = addressXAD(xad);
3910
3911 /*
3912 * first access of each internal entry:
3913 */
3914 /* release parent page */
3915 XT_PUTPAGE(mp);
3916
3917 /* process the child page */
3918 goto getPage;
3919
3920 out:
3921 /*
3922 * update file resource stat
3923 */
3924 /* set size
3925 */
3926 if (S_ISDIR(ip->i_mode) && !newsize)
3927 ip->i_size = 1; /* fsck hates zero-length directories */
3928 else
3929 ip->i_size = newsize;
3930
3931 /* update quota allocation to reflect freed blocks */
3932 DQUOT_FREE_BLOCK(ip, nfreed);
3933
3934 /*
3935 * free tlock of invalidated pages
3936 */
3937 if (flag == COMMIT_WMAP)
3938 txFreelock(ip);
3939
3940 return newsize;
3941}
3942
3943
3944/*
3945 * xtTruncate_pmap()
3946 *
3947 * function:
3948 * Perform truncation to zero length for a deleted file, leaving
3949 * the xtree and working map untouched. This allows the file to
3950 * be accessed via open file handles while the delete of the file
3951 * is committed to disk.
3952 *
3953 * parameter:
3954 * tid_t tid,
3955 * struct inode *ip,
3956 * s64 committed_size)
3957 *
3958 * return: new committed size
3959 *
3960 * note:
3961 *
3962 * To avoid deadlock by holding too many transaction locks, the
3963 * truncation may be broken up into multiple transactions.
3964 * The committed_size keeps track of how much of the file has been
3965 * freed from the pmaps.
3966 */
3967s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
3968{
3969 s64 bn;
3970 struct btstack btstack;
3971 int cmp;
3972 int index;
3973 int locked_leaves = 0;
3974 struct metapage *mp;
3975 xtpage_t *p;
3976 struct btframe *parent;
3977 int rc;
3978 struct tblock *tblk;
3979 struct tlock *tlck = NULL;
3980 xad_t *xad;
3981 int xlen;
3982 s64 xoff;
3983 struct xtlock *xtlck = NULL;
3984
3985 /* save object truncation type */
3986 tblk = tid_to_tblock(tid);
3987 tblk->xflag |= COMMIT_PMAP;
3988
3989 /* clear stack */
3990 BT_CLR(&btstack);
3991
3992 if (committed_size) {
3993 xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1;
3994 rc = xtSearch(ip, xoff, &cmp, &btstack, 0);
3995 if (rc)
3996 return rc;
3997
3998 XT_GETSEARCH(ip, btstack.top, bn, mp, p, index);
3999
4000 if (cmp != 0) {
4001 XT_PUTPAGE(mp);
4002 jfs_error(ip->i_sb,
4003 "xtTruncate_pmap: did not find extent");
4004 return -EIO;
4005 }
4006 } else {
4007 /*
4008 * start with root
4009 *
4010 * root resides in the inode
4011 */
4012 bn = 0;
4013
4014 /*
4015 * first access of each page:
4016 */
4017 getPage:
4018 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4019 if (rc)
4020 return rc;
4021
4022 /* process entries backward from last index */
4023 index = le16_to_cpu(p->header.nextindex) - 1;
4024
4025 if (p->header.flag & BT_INTERNAL)
4026 goto getChild;
4027 }
4028
4029 /*
4030 * leaf page
4031 */
4032
4033 if (++locked_leaves > MAX_TRUNCATE_LEAVES) {
4034 /*
4035 * We need to limit the size of the transaction
4036 * to avoid exhausting pagecache & tlocks
4037 */
4038 xad = &p->xad[index];
4039 xoff = offsetXAD(xad);
4040 xlen = lengthXAD(xad);
4041 XT_PUTPAGE(mp);
4042 return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize;
4043 }
4044 tlck = txLock(tid, ip, mp, tlckXTREE);
4045 tlck->type = tlckXTREE | tlckFREE;
4046 xtlck = (struct xtlock *) & tlck->lock;
4047 xtlck->hwm.offset = index;
4048
4049
4050 XT_PUTPAGE(mp);
4051
4052 /*
4053 * go back up to the parent page
4054 */
4055 getParent:
4056 /* pop/restore parent entry for the current child page */
4057 if ((parent = BT_POP(&btstack)) == NULL)
4058 /* current page must have been root */
4059 goto out;
4060
4061 /* get back the parent page */
4062 bn = parent->bn;
4063 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4064 if (rc)
4065 return rc;
4066
4067 index = parent->index;
4068
4069 /*
4070 * parent page becomes empty: free the page
4071 */
4072 if (index == XTENTRYSTART) {
4073 /* txCommit() with tlckFREE:
4074 * free child extents covered by parent;
4075 * invalidate parent if COMMIT_PWMAP;
4076 */
4077 tlck = txLock(tid, ip, mp, tlckXTREE);
4078 xtlck = (struct xtlock *) & tlck->lock;
4079 xtlck->hwm.offset =
4080 le16_to_cpu(p->header.nextindex) - 1;
4081 tlck->type = tlckXTREE | tlckFREE;
4082
4083 XT_PUTPAGE(mp);
4084
4085 if (p->header.flag & BT_ROOT) {
4086
4087 goto out;
4088 } else {
4089 goto getParent;
4090 }
4091 }
4092 /*
4093 * parent page still has entries for front region;
4094 */
4095 else
4096 index--;
4097 /*
4098 * internal page: go down to child page of current entry
4099 */
4100 getChild:
4101 /* save current parent entry for the child page */
4102 BT_PUSH(&btstack, bn, index);
4103
4104 /* get child page */
4105 xad = &p->xad[index];
4106 bn = addressXAD(xad);
4107
4108 /*
4109 * first access of each internal entry:
4110 */
4111 /* release parent page */
4112 XT_PUTPAGE(mp);
4113
4114 /* process the child page */
4115 goto getPage;
4116
4117 out:
4118
4119 return 0;
4120}
4121
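/*
 * Note: xtTruncate_pmap() returns 0 when the truncation is complete, a
 * positive committed_size when it stopped early and must be resumed, or
 * a negative errno on failure; callers drive it to completion with a
 * loop like the one in jfs_unlink() (namei.c):
 *
 *	while (new_size && (rc == 0)) {
 *		tid = txBegin(dip->i_sb, 0);
 *		new_size = xtTruncate_pmap(tid, ip, new_size);
 *		...commit or abort, then txEnd(tid)...
 *	}
 */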
4122
4123#ifdef _JFS_DEBUG_XTREE
4124/*
4125 * xtDisplayTree()
4126 *
4127 * function: traverse forward
4128 */
4129int xtDisplayTree(struct inode *ip)
4130{
4131 int rc = 0;
4132 struct metapage *mp;
4133 xtpage_t *p;
4134 s64 bn, pbn;
4135 int index, lastindex, v, h;
4136 xad_t *xad;
4137 struct btstack btstack;
4138 struct btframe *btsp;
4139 struct btframe *parent;
4140
4141 printk("display B+-tree.\n");
4142
4143 /* clear stack */
4144 btsp = btstack.stack;
4145
4146 /*
4147 * start with root
4148 *
4149 * root resides in the inode
4150 */
4151 bn = 0;
4152 v = h = 0;
4153
4154 /*
4155 * first access of each page:
4156 */
4157 getPage:
4158 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4159 if (rc)
4160 return rc;
4161
4162 /* process entries forward from first index */
4163 index = XTENTRYSTART;
4164 lastindex = le16_to_cpu(p->header.nextindex) - 1;
4165
4166 if (p->header.flag & BT_INTERNAL) {
4167 /*
4168 * first access of each internal page
4169 */
4170 goto getChild;
4171 } else { /* (p->header.flag & BT_LEAF) */
4172
4173 /*
4174 * first access of each leaf page
4175 */
4176 printk("leaf page ");
4177 xtDisplayPage(ip, bn, p);
4178
4179 /* unpin the leaf page */
4180 XT_PUTPAGE(mp);
4181 }
4182
4183 /*
4184 * go back up to the parent page
4185 */
4186 getParent:
4187 /* pop/restore parent entry for the current child page */
4188 if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL)
4189 /* current page must have been root */
4190 return 0;
4191
4192 /*
4193 * parent page scan completed
4194 */
4195 if ((index = parent->index) == (lastindex = parent->lastindex)) {
4196 /* go back up to the parent page */
4197 goto getParent;
4198 }
4199
4200 /*
4201 * parent page has entries remaining
4202 */
4203 /* get back the parent page */
4204 bn = parent->bn;
4205 /* v = parent->level; */
4206 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4207 if (rc)
4208 return rc;
4209
4210 /* get next parent entry */
4211 index++;
4212
4213 /*
4214 * internal page: go down to child page of current entry
4215 */
4216 getChild:
4217 /* push/save current parent entry for the child page */
4218 btsp->bn = pbn = bn;
4219 btsp->index = index;
4220 btsp->lastindex = lastindex;
4221 /* btsp->level = v; */
4222 /* btsp->node = h; */
4223 ++btsp;
4224
4225 /* get child page */
4226 xad = &p->xad[index];
4227 bn = addressXAD(xad);
4228
4229 /*
4230 * first access of each internal entry:
4231 */
4232 /* release parent page */
4233 XT_PUTPAGE(mp);
4234
4235 printk("traverse down 0x%lx[%d]->0x%lx\n", (ulong) pbn, index,
4236 (ulong) bn);
4237 v++;
4238 h = index;
4239
4240 /* process the child page */
4241 goto getPage;
4242}
4243
4244
4245/*
4246 * xtDisplayPage()
4247 *
4248 * function: display page
4249 */
4250int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p)
4251{
4252 int rc = 0;
4253 xad_t *xad;
4254 s64 xaddr, xoff;
4255 int xlen, i, j;
4256
4257 /* display page control */
4258 printk("bn:0x%lx flag:0x%x nextindex:%d\n",
4259 (ulong) bn, p->header.flag,
4260 le16_to_cpu(p->header.nextindex));
4261
4262 /* display entries */
4263 xad = &p->xad[XTENTRYSTART];
4264 for (i = XTENTRYSTART, j = 1; i < le16_to_cpu(p->header.nextindex);
4265 i++, xad++, j++) {
4266 xoff = offsetXAD(xad);
4267 xaddr = addressXAD(xad);
4268 xlen = lengthXAD(xad);
4269 printk("\t[%d] 0x%lx:0x%lx(0x%x)", i, (ulong) xoff,
4270 (ulong) xaddr, xlen);
4271
4272 if (j == 4) {
4273 printk("\n");
4274 j = 0;
4275 }
4276 }
4277
4278 printk("\n");

	return rc;
4279}
4280#endif /* _JFS_DEBUG_XTREE */
4281
4282
4283#ifdef _JFS_WIP
4284/*
4285 * xtGather()
4286 *
4287 * function:
4288 * traverse for allocation acquiring tlock at commit time
4289 * (vs at the time of update) logging backward top down
4290 *
4291 * note:
4292 * problem - establishing that all new allocation have been
4293 * processed both for append and random write in sparse file
4294 * at the current entry at the current subtree root page
4295 *
4296 */
4297int xtGather(btree_t *t)
4298{
4299 int rc = 0;
4300 xtpage_t *p;
4301 u64 bn;
4302 int index;
4303 btentry_t *e;
4304 struct btstack btstack;
4305 struct btsf *parent;
4306
4307 /* clear stack */
4308 BT_CLR(&btstack);
4309
4310 /*
4311 * start with root
4312 *
4313 * root resides in the inode
4314 */
4315 bn = 0;
4316 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4317 if (rc)
4318 return rc;
4319
4320 /* new root is NOT pointed by a new entry
4321 if (p->header.flag & NEW)
4322 allocate new page lock;
4323 write a NEWPAGE log;
4324 */
4325
4326 dopage:
4327 /*
4328 * first access of each page:
4329 */
4330 /* process entries backward from last index */
4331 index = le16_to_cpu(p->header.nextindex) - 1;
4332
4333 if (p->header.flag & BT_LEAF) {
4334 /*
4335 * first access of each leaf page
4336 */
4337 /* process leaf page entries backward */
4338 for (; index >= XTENTRYSTART; index--) {
4339 e = &p->xad[index];
4340 /*
4341 * if newpage, log NEWPAGE.
4342 *
4343 if (e->flag & XAD_NEW) {
4344 nfound =+ entry->length;
4345 update current page lock for the entry;
4346 newpage(entry);
4347 *
4348 * if moved, log move.
4349 *
4350 } else if (e->flag & XAD_MOVED) {
4351 reset flag;
4352 update current page lock for the entry;
4353 }
4354 */
4355 }
4356
4357 /* unpin the leaf page */
4358 XT_PUTPAGE(mp);
4359
4360 /*
4361 * go back up to the parent page
4362 */
4363 getParent:
4364 /* restore parent entry for the current child page */
4365 if ((parent = BT_POP(&btstack)) == NULL)
4366 /* current page must have been root */
4367 return 0;
4368
4369 if ((index = parent->index) == XTENTRYSTART) {
4370 /*
4371 * parent page scan completed
4372 */
4373 /* go back up to the parent page */
4374 goto getParent;
4375 } else {
4376 /*
4377 * parent page has entries remaining
4378 */
4379 /* get back the parent page */
4380 bn = parent->bn;
4381 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4382 if (rc)
4383 return -EIO;
4384
4385 /* first subroot page which
4386 * covers all new allocated blocks
4387 * itself not new/modified.
4388 * (if modified from split of descendent,
4389 * go down path of split page)
4390
4391 if (nfound == nnew &&
4392 !(p->header.flag & (NEW | MOD)))
4393 exit scan;
4394 */
4395
4396 /* process parent page entries backward */
4397 index--;
4398 }
4399 } else {
4400 /*
4401 * first access of each internal page
4402 */
4403 }
4404
4405 /*
4406 * internal page: go down to child page of current entry
4407 */
4408
4409 /* save current parent entry for the child page */
4410 BT_PUSH(&btstack, bn, index);
4411
4412 /* get current entry for the child page */
4413 e = &p->xad[index];
4414
4415 /*
4416 * first access of each internal entry:
4417 */
4418 /*
4419 * if new entry, log btree_tnewentry.
4420 *
4421 if (e->flag & XAD_NEW)
4422 update parent page lock for the entry;
4423 */
4424
4425 /* release parent page */
4426 XT_PUTPAGE(mp);
4427
4428 /* get child page */
4429 bn = e->bn;
4430 XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
4431 if (rc)
4432 return rc;
4433
4434 /*
4435 * first access of each non-root page:
4436 */
4437 /*
4438 * if new, log btree_newpage.
4439 *
4440 if (p->header.flag & NEW)
4441 allocate new page lock;
4442 write a NEWPAGE log (next, prev);
4443 */
4444
4445 /* process the child page */
4446 goto dopage;
4447
4448 out:
4449 return 0;
4450}
4451#endif /* _JFS_WIP */
4452
4453
4454#ifdef CONFIG_JFS_STATISTICS
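/*
 * jfs_xtstat_read()
 *
 * function: report xtree statistics via the legacy procfs read_proc
 * interface: fill 'buffer', point *start at the first byte the caller
 * should consume (buffer + offset), and set *eof once the remainder
 * of the report fits within the requested length.
 */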
4455int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
4456 int *eof, void *data)
4457{
4458 int len = 0;
4459 off_t begin;
4460
4461 len += sprintf(buffer,
4462 "JFS Xtree statistics\n"
4463 "====================\n"
4464 "searches = %d\n"
4465 "fast searches = %d\n"
4466 "splits = %d\n",
4467 xtStat.search,
4468 xtStat.fastSearch,
4469 xtStat.split);
4470
4471 begin = offset;
4472 *start = buffer + begin;
4473 len -= begin;
4474
4475 if (len > length)
4476 len = length;
4477 else
4478 *eof = 1;
4479
4480 if (len < 0)
4481 len = 0;
4482
4483 return len;
4484}
4485#endif
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
new file mode 100644
index 000000000000..a69784254fe7
--- /dev/null
+++ b/fs/jfs/jfs_xtree.h
@@ -0,0 +1,140 @@
1/*
2 * Copyright (c) International Business Machines Corp., 2000-2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_JFS_XTREE
19#define _H_JFS_XTREE
20
21/*
22 * jfs_xtree.h: extent allocation descriptor B+-tree manager
23 */
24
25#include "jfs_btree.h"
26
27
28/*
29 * extent allocation descriptor (xad)
30 */
31typedef struct xad {
32 unsigned flag:8; /* 1: flag */
33 unsigned rsvrd:16; /* 2: reserved */
34 unsigned off1:8; /* 1: offset in unit of fsblksize */
35 __le32 off2; /* 4: offset in unit of fsblksize */
36 unsigned len:24; /* 3: length in unit of fsblksize */
37 unsigned addr1:8; /* 1: address in unit of fsblksize */
38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */
40
41#define MAXXLEN ((1 << 24) - 1)
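/*
 * i.e. a single extent can span up to 2^24 - 1 = 16,777,215 blocks;
 * with 4 KiB blocks that is just under 2^36 bytes (64 GiB) of
 * contiguous space.
 */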
42
43#define XTSLOTSIZE 16
44#define L2XTSLOTSIZE 4
45
46/* xad_t field construction */
47#define XADoffset(xad, offset64)\
48{\
49 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51}
52#define XADaddress(xad, address64)\
53{\
54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58
59/* xad_t field extraction */
60#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2))
64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
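/*
 * Worked example (illustrative values): a 40-bit offset such as
 * 0x1234567890 is stored as off1 = 0x12 (high 8 bits) and
 * off2 = cpu_to_le32(0x34567890) (low 32 bits); offsetXAD()
 * reassembles it as ((s64)0x12 << 32) | 0x34567890.
 */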
65
66/* xad list */
67struct xadlist {
68 s16 maxnxad;
69 s16 nxad;
70 xad_t *xad;
71};
72
73/* xad_t flags */
74#define XAD_NEW 0x01 /* new */
75#define XAD_EXTENDED 0x02 /* extended */
76#define XAD_COMPRESSED 0x04 /* compressed with recorded length */
77#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */
78#define XAD_COW 0x10 /* copy-on-write */
79
80
81/* possible values for maxentry */
82#define XTROOTINITSLOT_DIR 6
83#define XTROOTINITSLOT 10
84#define XTROOTMAXSLOT 18
85#define XTPAGEMAXSLOT 256
86#define XTENTRYSTART 2
87
88/*
89 * xtree page:
90 */
91typedef union {
92 struct xtheader {
93 __le64 next; /* 8: */
94 __le64 prev; /* 8: */
95
96 u8 flag; /* 1: */
97 u8 rsrvd1; /* 1: */
98 __le16 nextindex; /* 2: next index = number of entries */
99 __le16 maxentry; /* 2: max number of entries */
100 __le16 rsrvd2; /* 2: */
101
102 pxd_t self; /* 8: self */
103 } header; /* (32) */
104
105 xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */
106} xtpage_t;
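/*
 * Size check: each slot is XTSLOTSIZE = 16 bytes and the 32-byte
 * header occupies the first two slots, which is why entries begin at
 * XTENTRYSTART = 2. An external 4 KiB page holds XTPAGEMAXSLOT = 256
 * slots (256 * 16 = 4096), while the in-inode root is limited to
 * XTROOTMAXSLOT = 18 slots.
 */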
107
108/*
109 * external declaration
110 */
111extern int xtLookup(struct inode *ip, s64 lstart, s64 llen,
112 int *pflag, s64 * paddr, int *plen, int flag);
113extern int xtLookupList(struct inode *ip, struct lxdlist * lxdlist,
114 struct xadlist * xadlist, int flag);
115extern void xtInitRoot(tid_t tid, struct inode *ip);
116extern int xtInsert(tid_t tid, struct inode *ip,
117 int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag);
118extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
119 int flag);
120#ifdef _NOTYET
121extern int xtTailgate(tid_t tid, struct inode *ip,
122 s64 xoff, int xlen, s64 xaddr, int flag);
123#endif
124extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
125extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
126 int flag);
127extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type);
128extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size);
129extern int xtRelocate(tid_t tid, struct inode *ip,
130 xad_t * oxad, s64 nxaddr, int xtype);
131extern int xtAppend(tid_t tid,
132 struct inode *ip, int xflag, s64 xoff, int maxblocks,
133 int *xlenp, s64 * xaddrp, int flag);
134
135#ifdef _JFS_DEBUG_XTREE
136extern int xtDisplayTree(struct inode *ip);
137extern int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p);
138#endif /* _JFS_DEBUG_XTREE */
139
140#endif /* !_H_JFS_XTREE */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
new file mode 100644
index 000000000000..8413a368f449
--- /dev/null
+++ b/fs/jfs/namei.c
@@ -0,0 +1,1540 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/fs.h>
21#include <linux/ctype.h>
22#include <linux/quotaops.h>
23#include "jfs_incore.h"
24#include "jfs_superblock.h"
25#include "jfs_inode.h"
26#include "jfs_dinode.h"
27#include "jfs_dmap.h"
28#include "jfs_unicode.h"
29#include "jfs_metapage.h"
30#include "jfs_xattr.h"
31#include "jfs_acl.h"
32#include "jfs_debug.h"
33
34extern struct inode_operations jfs_file_inode_operations;
35extern struct inode_operations jfs_symlink_inode_operations;
36extern struct file_operations jfs_file_operations;
37extern struct address_space_operations jfs_aops;
38
39extern int jfs_fsync(struct file *, struct dentry *, int);
40extern void jfs_truncate_nolock(struct inode *, loff_t);
41extern int jfs_init_acl(struct inode *, struct inode *);
42
43/*
44 * forward references
45 */
46struct inode_operations jfs_dir_inode_operations;
47struct file_operations jfs_dir_operations;
48struct dentry_operations jfs_ci_dentry_operations;
49
50static s64 commitZeroLink(tid_t, struct inode *);
51
52/*
53 * NAME: jfs_create(dip, dentry, mode)
54 *
55 * FUNCTION: create a regular file in the parent directory <dip>
56 * with name = <from dentry> and mode = <mode>
57 *
58 * PARAMETER: dip - parent directory vnode
59 * dentry - dentry of new file
60 * mode - create mode (rwxrwxrwx).
61 * nd - nameidata struct
62 *
63 * RETURN: Errors from subroutines
64 *
65 */
66static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
67 struct nameidata *nd)
68{
69 int rc = 0;
70 tid_t tid; /* transaction id */
71 struct inode *ip = NULL; /* child inode */
72 ino_t ino;
73 struct component_name dname; /* child name */
74 struct btstack btstack;
75 struct inode *iplist[2];
76 struct tblock *tblk;
77
78 jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name);
79
80 /*
81 * search parent directory for entry/freespace
82 * (dtSearch() returns parent directory page pinned)
83 */
84 if ((rc = get_UCSname(&dname, dentry)))
85 goto out1;
86
87 /*
88 * Either iAlloc() or txBegin() may block. Deadlock can occur if we
89 * block there while holding dtree page, so we allocate the inode &
90 * begin the transaction before we search the directory.
91 */
92 ip = ialloc(dip, mode);
93 if (ip == NULL) {
94 rc = -ENOSPC;
95 goto out2;
96 }
97
98 tid = txBegin(dip->i_sb, 0);
99
100 down(&JFS_IP(dip)->commit_sem);
101 down(&JFS_IP(ip)->commit_sem);
102
103 if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
104 jfs_err("jfs_create: dtSearch returned %d", rc);
105 goto out3;
106 }
107
108 tblk = tid_to_tblock(tid);
109 tblk->xflag |= COMMIT_CREATE;
110 tblk->ino = ip->i_ino;
111 tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
112
113 iplist[0] = dip;
114 iplist[1] = ip;
115
116 /*
117 * initialize the child XAD tree root in-line in inode
118 */
119 xtInitRoot(tid, ip);
120
121 /*
122 * create entry in parent directory for the new file
123 * (dtInsert() releases parent directory page)
124 */
125 ino = ip->i_ino;
126 if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) {
127 if (rc == -EIO) {
128 jfs_err("jfs_create: dtInsert returned -EIO");
129 txAbort(tid, 1); /* Marks Filesystem dirty */
130 } else
131 txAbort(tid, 0); /* Filesystem full */
132 goto out3;
133 }
134
135 ip->i_op = &jfs_file_inode_operations;
136 ip->i_fop = &jfs_file_operations;
137 ip->i_mapping->a_ops = &jfs_aops;
138
139 insert_inode_hash(ip);
140 mark_inode_dirty(ip);
141
142 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
143
144 mark_inode_dirty(dip);
145
146 rc = txCommit(tid, 2, &iplist[0], 0);
147
148 out3:
149 txEnd(tid);
150 up(&JFS_IP(dip)->commit_sem);
151 up(&JFS_IP(ip)->commit_sem);
152 if (rc) {
153 ip->i_nlink = 0;
154 iput(ip);
155 } else
156 d_instantiate(dentry, ip);
157
158 out2:
159 free_UCSname(&dname);
160
161#ifdef CONFIG_JFS_POSIX_ACL
162 if (rc == 0)
163 jfs_init_acl(ip, dip);
164#endif
165
166 out1:
167
168 jfs_info("jfs_create: rc:%d", rc);
169 return rc;
170}
171
172
173/*
174 * NAME: jfs_mkdir(dip, dentry, mode)
175 *
176 * FUNCTION: create a child directory in the parent directory <dip>
177 * with name = <from dentry> and mode = <mode>
178 *
179 * PARAMETER: dip - parent directory vnode
180 * dentry - dentry of child directory
181 * mode - create mode (rwxrwxrwx).
182 *
183 * RETURN: Errors from subroutines
184 *
185 * note:
186 * EACCESS: user needs search+write permission on the parent directory
187 */
188static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
189{
190 int rc = 0;
191 tid_t tid; /* transaction id */
192 struct inode *ip = NULL; /* child directory inode */
193 ino_t ino;
194 struct component_name dname; /* child directory name */
195 struct btstack btstack;
196 struct inode *iplist[2];
197 struct tblock *tblk;
198
199 jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name);
200
201 /* link count overflow on parent directory ? */
202 if (dip->i_nlink == JFS_LINK_MAX) {
203 rc = -EMLINK;
204 goto out1;
205 }
206
207 /*
208 * search parent directory for entry/freespace
209 * (dtSearch() returns parent directory page pinned)
210 */
211 if ((rc = get_UCSname(&dname, dentry)))
212 goto out1;
213
214 /*
215 * Either iAlloc() or txBegin() may block. Deadlock can occur if we
216 * block there while holding dtree page, so we allocate the inode &
217 * begin the transaction before we search the directory.
218 */
219 ip = ialloc(dip, S_IFDIR | mode);
220 if (ip == NULL) {
221 rc = -ENOSPC;
222 goto out2;
223 }
224
225 tid = txBegin(dip->i_sb, 0);
226
227 down(&JFS_IP(dip)->commit_sem);
228 down(&JFS_IP(ip)->commit_sem);
229
230 if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) {
231 jfs_err("jfs_mkdir: dtSearch returned %d", rc);
232 goto out3;
233 }
234
235 tblk = tid_to_tblock(tid);
236 tblk->xflag |= COMMIT_CREATE;
237 tblk->ino = ip->i_ino;
238 tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
239
240 iplist[0] = dip;
241 iplist[1] = ip;
242
243 /*
244 * initialize the child directory in-line in inode
245 */
246 dtInitRoot(tid, ip, dip->i_ino);
247
248 /*
249 * create entry in parent directory for child directory
250 * (dtInsert() releases parent directory page)
251 */
252 ino = ip->i_ino;
253 if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) {
254 if (rc == -EIO) {
255 jfs_err("jfs_mkdir: dtInsert returned -EIO");
256 txAbort(tid, 1); /* Marks Filesystem dirty */
257 } else
258 txAbort(tid, 0); /* Filesystem full */
259 goto out3;
260 }
261
262 ip->i_nlink = 2; /* for '.' */
263 ip->i_op = &jfs_dir_inode_operations;
264 ip->i_fop = &jfs_dir_operations;
265
266 insert_inode_hash(ip);
267 mark_inode_dirty(ip);
268
269 /* update parent directory inode */
270 dip->i_nlink++; /* for '..' from child directory */
271 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
272 mark_inode_dirty(dip);
273
274 rc = txCommit(tid, 2, &iplist[0], 0);
275
276 out3:
277 txEnd(tid);
278 up(&JFS_IP(dip)->commit_sem);
279 up(&JFS_IP(ip)->commit_sem);
280 if (rc) {
281 ip->i_nlink = 0;
282 iput(ip);
283 } else
284 d_instantiate(dentry, ip);
285
286 out2:
287 free_UCSname(&dname);
288
289#ifdef CONFIG_JFS_POSIX_ACL
290 if (rc == 0)
291 jfs_init_acl(ip, dip);
292#endif
293
294 out1:
295
296 jfs_info("jfs_mkdir: rc:%d", rc);
297 return rc;
298}
299
300/*
301 * NAME: jfs_rmdir(dip, dentry)
302 *
303 * FUNCTION: remove a link to child directory
304 *
305 * PARAMETER: dip - parent inode
306 * dentry - child directory dentry
307 *
308 * RETURN: -EINVAL - if name is . or ..
309 * -EINVAL - if . or .. exist but are invalid.
310 * errors from subroutines
311 *
312 * note:
313 * if other threads have the directory open when the last link
314 * is removed, the "." and ".." entries, if present, are removed before
315 * rmdir() returns and no new entries may be created in the directory,
316 * but the directory is not removed until the last reference to
317 * the directory is released (cf. unlink() of a regular file).
318 */
319static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
320{
321 int rc;
322 tid_t tid; /* transaction id */
323 struct inode *ip = dentry->d_inode;
324 ino_t ino;
325 struct component_name dname;
326 struct inode *iplist[2];
327 struct tblock *tblk;
328
329 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
330
331 /* Init inode for quota operations. */
332 DQUOT_INIT(ip);
333
334 /* directory must be empty to be removed */
335 if (!dtEmpty(ip)) {
336 rc = -ENOTEMPTY;
337 goto out;
338 }
339
340 if ((rc = get_UCSname(&dname, dentry))) {
341 goto out;
342 }
343
344 tid = txBegin(dip->i_sb, 0);
345
346 down(&JFS_IP(dip)->commit_sem);
347 down(&JFS_IP(ip)->commit_sem);
348
349 iplist[0] = dip;
350 iplist[1] = ip;
351
352 tblk = tid_to_tblock(tid);
353 tblk->xflag |= COMMIT_DELETE;
354 tblk->u.ip = ip;
355
356 /*
357 * delete the entry of target directory from parent directory
358 */
359 ino = ip->i_ino;
360 if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) {
361 jfs_err("jfs_rmdir: dtDelete returned %d", rc);
362 if (rc == -EIO)
363 txAbort(tid, 1);
364 txEnd(tid);
365 up(&JFS_IP(dip)->commit_sem);
366 up(&JFS_IP(ip)->commit_sem);
367
368 goto out2;
369 }
370
371 /* update parent directory's link count corresponding
372 * to ".." entry of the target directory deleted
373 */
374 dip->i_nlink--;
375 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
376 mark_inode_dirty(dip);
377
378 /*
379 * OS/2 could have created EA and/or ACL
380 */
381 /* free EA from both persistent and working map */
382 if (JFS_IP(ip)->ea.flag & DXD_EXTENT) {
383 /* free EA pages */
384 txEA(tid, ip, &JFS_IP(ip)->ea, NULL);
385 }
386 JFS_IP(ip)->ea.flag = 0;
387
388 /* free ACL from both persistent and working map */
389 if (JFS_IP(ip)->acl.flag & DXD_EXTENT) {
390 /* free ACL pages */
391 txEA(tid, ip, &JFS_IP(ip)->acl, NULL);
392 }
393 JFS_IP(ip)->acl.flag = 0;
394
395 /* mark the target directory as deleted */
396 ip->i_nlink = 0;
397 mark_inode_dirty(ip);
398
399 rc = txCommit(tid, 2, &iplist[0], 0);
400
401 txEnd(tid);
402
403 up(&JFS_IP(dip)->commit_sem);
404 up(&JFS_IP(ip)->commit_sem);
405
406 /*
407 * Truncating the directory index table is not guaranteed. It
408 * may need to be done iteratively
409 */
410 if (test_cflag(COMMIT_Stale, dip)) {
411 if (dip->i_size > 1)
412 jfs_truncate_nolock(dip, 0);
413
414 clear_cflag(COMMIT_Stale, dip);
415 }
416
417 out2:
418 free_UCSname(&dname);
419
420 out:
421 jfs_info("jfs_rmdir: rc:%d", rc);
422 return rc;
423}
424
425/*
426 * NAME: jfs_unlink(dip, dentry)
427 *
428 * FUNCTION: remove a link to object <vp> named by <name>
429 * from parent directory <dvp>
430 *
431 * PARAMETER: dip - inode of parent directory
432 * dentry - dentry of object to be removed
433 *
434 * RETURN: errors from subroutines
435 *
436 * note:
437 * temporary file: if one or more processes have the file open
438 * when the last link is removed, the link will be removed before
439 * unlink() returns, but the removal of the file contents will be
440 * postponed until all references to the file are closed.
441 *
442 * JFS does NOT support unlink() on directories.
443 *
444 */
445static int jfs_unlink(struct inode *dip, struct dentry *dentry)
446{
447 int rc;
448 tid_t tid; /* transaction id */
449 struct inode *ip = dentry->d_inode;
450 ino_t ino;
451 struct component_name dname; /* object name */
452 struct inode *iplist[2];
453 struct tblock *tblk;
454 s64 new_size = 0;
455 int commit_flag;
456
457 jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
458
459 /* Init inode for quota operations. */
460 DQUOT_INIT(ip);
461
462 if ((rc = get_UCSname(&dname, dentry)))
463 goto out;
464
465 IWRITE_LOCK(ip);
466
467 tid = txBegin(dip->i_sb, 0);
468
469 down(&JFS_IP(dip)->commit_sem);
470 down(&JFS_IP(ip)->commit_sem);
471
472 iplist[0] = dip;
473 iplist[1] = ip;
474
475 /*
476 * delete the entry of target file from parent directory
477 */
478 ino = ip->i_ino;
479 if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) {
480 jfs_err("jfs_unlink: dtDelete returned %d", rc);
481 if (rc == -EIO)
482 txAbort(tid, 1); /* Marks FS Dirty */
483 txEnd(tid);
484 up(&JFS_IP(dip)->commit_sem);
485 up(&JFS_IP(ip)->commit_sem);
486 IWRITE_UNLOCK(ip);
487 goto out1;
488 }
489
490 ASSERT(ip->i_nlink);
491
492 ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME;
493 mark_inode_dirty(dip);
494
495 /* update target's inode */
496 ip->i_nlink--;
497 mark_inode_dirty(ip);
498
499 /*
500 * commit zero link count object
501 */
502 if (ip->i_nlink == 0) {
503 assert(!test_cflag(COMMIT_Nolink, ip));
504 /* free block resources */
505 if ((new_size = commitZeroLink(tid, ip)) < 0) {
506 txAbort(tid, 1); /* Marks FS Dirty */
507 txEnd(tid);
508 up(&JFS_IP(dip)->commit_sem);
509 up(&JFS_IP(ip)->commit_sem);
510 IWRITE_UNLOCK(ip);
511 rc = new_size;
512 goto out1;
513 }
514 tblk = tid_to_tblock(tid);
515 tblk->xflag |= COMMIT_DELETE;
516 tblk->u.ip = ip;
517 }
518
519 /*
520 * Incomplete truncate of file data can
521 * result in timing problems unless we synchronously commit the
522 * transaction.
523 */
524 if (new_size)
525 commit_flag = COMMIT_SYNC;
526 else
527 commit_flag = 0;
528
529 /*
530 * If xtTruncate was incomplete, commit synchronously to avoid
531 * timing complications
532 */
533 rc = txCommit(tid, 2, &iplist[0], commit_flag);
534
535 txEnd(tid);
536
537 up(&JFS_IP(dip)->commit_sem);
538 up(&JFS_IP(ip)->commit_sem);
539
540
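	/*
	 * Complete the pmap truncation in bounded chunks: each pass
	 * commits whatever xtTruncate_pmap() managed to free, and a
	 * return of 0 (or a negative error) ends the loop.
	 */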
541 while (new_size && (rc == 0)) {
542 tid = txBegin(dip->i_sb, 0);
543 down(&JFS_IP(ip)->commit_sem);
544 new_size = xtTruncate_pmap(tid, ip, new_size);
545 if (new_size < 0) {
546 txAbort(tid, 1); /* Marks FS Dirty */
547 rc = new_size;
548 } else
549 rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC);
550 txEnd(tid);
551 up(&JFS_IP(ip)->commit_sem);
552 }
553
554 if (ip->i_nlink == 0)
555 set_cflag(COMMIT_Nolink, ip);
556
557 IWRITE_UNLOCK(ip);
558
559 /*
560 * Truncating the directory index table is not guaranteed. It
561 * may need to be done iteratively
562 */
563 if (test_cflag(COMMIT_Stale, dip)) {
564 if (dip->i_size > 1)
565 jfs_truncate_nolock(dip, 0);
566
567 clear_cflag(COMMIT_Stale, dip);
568 }
569
570 out1:
571 free_UCSname(&dname);
572 out:
573 jfs_info("jfs_unlink: rc:%d", rc);
574 return rc;
575}
576
577/*
578 * NAME: commitZeroLink()
579 *
580 * FUNCTION: for a non-directory, called by jfs_unlink() and jfs_rename();
581 * truncate a regular file or symbolic link to zero
582 * length. return 0 if the type is not one of these.
584 *
585 * if the file is currently associated with a VM segment
586 * only permanent disk and inode map resources are freed,
587 * and neither the inode nor indirect blocks are modified
588 * so that the resources can be later freed in the work
589 * map by ctrunc1.
590 * if there is no VM segment on entry, the resources are
591 * freed in both work and permanent map.
592 * (? for temporary file - memory object is cached even
593 * after no reference:
594 * reference count > 0 - )
595 *
596 * PARAMETERS: tid - transaction id.
597 * ip - inode to truncate.
598 *
599 * RETURN: Errors from subroutines
600 */
601static s64 commitZeroLink(tid_t tid, struct inode *ip)
602{
603 int filetype;
604 struct tblock *tblk;
605
606 jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip);
607
608 filetype = ip->i_mode & S_IFMT;
609 switch (filetype) {
610 case S_IFREG:
611 break;
612 case S_IFLNK:
613 /* fast symbolic link */
614 if (ip->i_size < IDATASIZE) {
615 ip->i_size = 0;
616 return 0;
617 }
618 break;
619 default:
620 assert(filetype != S_IFDIR);
621 return 0;
622 }
623
624 set_cflag(COMMIT_Freewmap, ip);
625
626 /* mark transaction of block map update type */
627 tblk = tid_to_tblock(tid);
628 tblk->xflag |= COMMIT_PMAP;
629
630 /*
631 * free EA
632 */
633 if (JFS_IP(ip)->ea.flag & DXD_EXTENT)
634 /* acquire maplock on EA to be freed from block map */
635 txEA(tid, ip, &JFS_IP(ip)->ea, NULL);
636
637 /*
638 * free ACL
639 */
640 if (JFS_IP(ip)->acl.flag & DXD_EXTENT)
641 /* acquire maplock on EA to be freed from block map */
642 txEA(tid, ip, &JFS_IP(ip)->acl, NULL);
643
644 /*
645 * free xtree/data (truncate to zero length):
646 * free xtree/data pages from cache if COMMIT_PWMAP,
647 * free xtree/data blocks from persistent block map, and
648 * free xtree/data blocks from working block map if COMMIT_PWMAP;
649 */
650 if (ip->i_size)
651 return xtTruncate_pmap(tid, ip, 0);
652
653 return 0;
654}
655
656
657/*
658 * NAME: freeZeroLink()
659 *
660 * FUNCTION: for non-directory, called by iClose(),
661 * free resources of a file from cache and WORKING map
662 * for a file previously committed with zero link count
663 * while associated with a pager object,
664 *
665 * PARAMETER: ip - pointer to inode of file.
666 *
667 * RETURN: 0 -ok
668 */
669int freeZeroLink(struct inode *ip)
670{
671 int rc = 0;
672 int type;
673
674 jfs_info("freeZeroLink: ip = 0x%p", ip);
675
676 /* return if not reg or symbolic link or if size is
677 * already ok.
678 */
679 type = ip->i_mode & S_IFMT;
680
681 switch (type) {
682 case S_IFREG:
683 break;
684 case S_IFLNK:
685 /* if it's contained in the inode, nothing to do */
686 if (ip->i_size < IDATASIZE)
687 return 0;
688 break;
689 default:
690 return 0;
691 }
692
693 /*
694 * free EA
695 */
696 if (JFS_IP(ip)->ea.flag & DXD_EXTENT) {
697 s64 xaddr = addressDXD(&JFS_IP(ip)->ea);
698 int xlen = lengthDXD(&JFS_IP(ip)->ea);
699 struct maplock maplock; /* maplock for COMMIT_WMAP */
700 struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */
701
702 /* free EA pages from cache */
703 invalidate_dxd_metapages(ip, JFS_IP(ip)->ea);
704
705 /* free EA extent from working block map */
706 maplock.index = 1;
707 pxdlock = (struct pxd_lock *) & maplock;
708 pxdlock->flag = mlckFREEPXD;
709 PXDaddress(&pxdlock->pxd, xaddr);
710 PXDlength(&pxdlock->pxd, xlen);
711 txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
712 }
713
714 /*
715 * free ACL
716 */
717 if (JFS_IP(ip)->acl.flag & DXD_EXTENT) {
718 s64 xaddr = addressDXD(&JFS_IP(ip)->acl);
719 int xlen = lengthDXD(&JFS_IP(ip)->acl);
720 struct maplock maplock; /* maplock for COMMIT_WMAP */
721 struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */
722
723 invalidate_dxd_metapages(ip, JFS_IP(ip)->acl);
724
725 /* free ACL extent from working block map */
726 maplock.index = 1;
727 pxdlock = (struct pxd_lock *) & maplock;
728 pxdlock->flag = mlckFREEPXD;
729 PXDaddress(&pxdlock->pxd, xaddr);
730 PXDlength(&pxdlock->pxd, xlen);
731 txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP);
732 }
733
734 /*
735 * free xtree/data (truncate to zero length):
736 * free xtree/data pages from cache, and
737 * free xtree/data blocks from working block map;
738 */
739 if (ip->i_size)
740 rc = xtTruncate(0, ip, 0, COMMIT_WMAP);
741
742 return rc;
743}
744
745/*
746 * NAME: jfs_link(vp, dvp, name, crp)
747 *
748 * FUNCTION: create a link to <vp> by the name = <name>
749 * in the parent directory <dvp>
750 *
751 * PARAMETER: vp - target object
752 * dvp - parent directory of new link
753 * name - name of new link to target object
754 * crp - credential
755 *
756 * RETURN: Errors from subroutines
757 *
758 * note:
759 * JFS does NOT support link() on directories (to prevent circular
760 * path in the directory hierarchy);
761 * EPERM: the target object is a directory, and either the caller
762 * does not have appropriate privileges or the implementation prohibits
763 * using link() on directories [XPG4.2].
764 *
765 * JFS does NOT support links between file systems:
766 * EXDEV: target object and new link are on different file systems and
767 * implementation does not support links between file systems [XPG4.2].
768 */
769static int jfs_link(struct dentry *old_dentry,
770 struct inode *dir, struct dentry *dentry)
771{
772 int rc;
773 tid_t tid;
774 struct inode *ip = old_dentry->d_inode;
775 ino_t ino;
776 struct component_name dname;
777 struct btstack btstack;
778 struct inode *iplist[2];
779
780 jfs_info("jfs_link: %s %s", old_dentry->d_name.name,
781 dentry->d_name.name);
782
783 if (ip->i_nlink == JFS_LINK_MAX)
784 return -EMLINK;
785
786 if (ip->i_nlink == 0)
787 return -ENOENT;
788
789 tid = txBegin(ip->i_sb, 0);
790
791 down(&JFS_IP(dir)->commit_sem);
792 down(&JFS_IP(ip)->commit_sem);
793
794 /*
795 * scan parent directory for entry/freespace
796 */
797 if ((rc = get_UCSname(&dname, dentry)))
798 goto out;
799
800 if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
801 goto free_dname;
802
803 /*
804 * create entry for new link in parent directory
805 */
806 ino = ip->i_ino;
807 if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack)))
808 goto free_dname;
809
810 /* update object inode */
811 ip->i_nlink++; /* for new link */
812 ip->i_ctime = CURRENT_TIME;
813 mark_inode_dirty(dir);
814 atomic_inc(&ip->i_count);
815
816 iplist[0] = ip;
817 iplist[1] = dir;
818 rc = txCommit(tid, 2, &iplist[0], 0);
819
820 if (rc) {
821 ip->i_nlink--;
822 iput(ip);
823 } else
824 d_instantiate(dentry, ip);
825
826 free_dname:
827 free_UCSname(&dname);
828
829 out:
830 txEnd(tid);
831
832 up(&JFS_IP(dir)->commit_sem);
833 up(&JFS_IP(ip)->commit_sem);
834
835 jfs_info("jfs_link: rc:%d", rc);
836 return rc;
837}
838
839/*
840 * NAME: jfs_symlink(dip, dentry, name)
841 *
842 * FUNCTION: creates a symbolic link to <symlink> by name <name>
843 * in directory <dip>
844 *
845 * PARAMETER: dip - parent directory vnode
846 * dentry - dentry of symbolic link
847 * name - the path name of the existing object
848 * that will be the source of the link
849 *
850 * RETURN: errors from subroutines
851 *
852 * note:
853 * ENAMETOOLONG: pathname resolution of a symbolic link produced
854 * an intermediate result whose length exceeds PATH_MAX [XPG4.2]
855 */
856
857static int jfs_symlink(struct inode *dip, struct dentry *dentry,
858 const char *name)
859{
860 int rc;
861 tid_t tid;
862 ino_t ino = 0;
863 struct component_name dname;
864 int ssize; /* source pathname size */
865 struct btstack btstack;
866 struct inode *ip = dentry->d_inode;
867 unchar *i_fastsymlink;
868 s64 xlen = 0;
869 int bmask = 0, xsize;
870 s64 extent = 0, xaddr;
871 struct metapage *mp;
872 struct super_block *sb;
873 struct tblock *tblk;
874
875 struct inode *iplist[2];
876
877 jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name);
878
879 ssize = strlen(name) + 1;
880
881 /*
882 * search parent directory for entry/freespace
883 * (dtSearch() returns parent directory page pinned)
884 */
885
886 if ((rc = get_UCSname(&dname, dentry)))
887 goto out1;
888
889 /*
890 * allocate on-disk/in-memory inode for symbolic link:
891 * (iAlloc() returns new, locked inode)
892 */
893 ip = ialloc(dip, S_IFLNK | 0777);
894 if (ip == NULL) {
895 rc = -ENOSPC;
896 goto out2;
897 }
898
899 tid = txBegin(dip->i_sb, 0);
900
901 down(&JFS_IP(dip)->commit_sem);
902 down(&JFS_IP(ip)->commit_sem);
903
904 tblk = tid_to_tblock(tid);
905 tblk->xflag |= COMMIT_CREATE;
906 tblk->ino = ip->i_ino;
907 tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
908
909 /* fix symlink access permission
910 * (dir_create() ANDs in the u.u_cmask,
911 * but symlinks really need to be 777 access)
912 */
913 ip->i_mode |= 0777;
914
915 /*
916 * write symbolic link target path name
917 */
918 xtInitRoot(tid, ip);
919
920 /*
921 * write source path name inline in on-disk inode (fast symbolic link)
922 */
923
924 if (ssize <= IDATASIZE) {
925 ip->i_op = &jfs_symlink_inode_operations;
926
927 i_fastsymlink = JFS_IP(ip)->i_inline;
928 memcpy(i_fastsymlink, name, ssize);
929 ip->i_size = ssize - 1;
930
931 /*
932 * if symlink is > 128 bytes, we don't have the space to
933 * store inline extended attributes
934 */
935 if (ssize > sizeof (JFS_IP(ip)->i_inline))
936 JFS_IP(ip)->mode2 &= ~INLINEEA;
937
938 jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ",
939 ssize, name);
940 }
941 /*
942 * write source path name in a single extent
943 */
944 else {
945 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
946
947 ip->i_op = &page_symlink_inode_operations;
948 ip->i_mapping->a_ops = &jfs_aops;
949
950 /*
951 * even though the data of symlink object (source
952 * path name) is treated as non-journaled user data,
953 * it is read/written thru buffer cache for performance.
954 */
955 sb = ip->i_sb;
956 bmask = JFS_SBI(sb)->bsize - 1;
957 xsize = (ssize + bmask) & ~bmask;
958 xaddr = 0;
959 xlen = xsize >> JFS_SBI(sb)->l2bsize;
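		/*
		 * e.g., with a 4 KiB block size (bmask = 4095), a source
		 * path of ssize = 5000 bytes rounds up to xsize = 8192,
		 * giving xlen = 2 blocks for the extent.
		 */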
960 if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) {
961 txAbort(tid, 0);
962 rc = -ENOSPC;
963 goto out3;
964 }
965 extent = xaddr;
966 ip->i_size = ssize - 1;
967 while (ssize) {
968 /* This is kind of silly since PATH_MAX == 4K */
969 int copy_size = min(ssize, PSIZE);
970
971 mp = get_metapage(ip, xaddr, PSIZE, 1);
972
973 if (mp == NULL) {
974 xtTruncate(tid, ip, 0, COMMIT_PWMAP);
975 rc = -EIO;
976 txAbort(tid, 0);
977 goto out3;
978 }
979 memcpy(mp->data, name, copy_size);
980 flush_metapage(mp);
981 ssize -= copy_size;
982 name += copy_size;
983 xaddr += JFS_SBI(sb)->nbperpage;
984 }
985 }
986
987 /*
988 * create entry for symbolic link in parent directory
989 */
990 rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE);
991 if (rc == 0) {
992 ino = ip->i_ino;
993 rc = dtInsert(tid, dip, &dname, &ino, &btstack);
994 }
995 if (rc) {
996 if (xlen)
997 xtTruncate(tid, ip, 0, COMMIT_PWMAP);
998 txAbort(tid, 0);
999 /* discard new inode */
1000 goto out3;
1001 }
1002
1003 insert_inode_hash(ip);
1004 mark_inode_dirty(ip);
1005
1006 /*
1007 * commit update of parent directory and link object
1008 */
1009
1010 iplist[0] = dip;
1011 iplist[1] = ip;
1012 rc = txCommit(tid, 2, &iplist[0], 0);
1013
1014 out3:
1015 txEnd(tid);
1016 up(&JFS_IP(dip)->commit_sem);
1017 up(&JFS_IP(ip)->commit_sem);
1018 if (rc) {
1019 ip->i_nlink = 0;
1020 iput(ip);
1021 } else
1022 d_instantiate(dentry, ip);
1023
1024 out2:
1025 free_UCSname(&dname);
1026
1027#ifdef CONFIG_JFS_POSIX_ACL
1028 if (rc == 0)
1029 jfs_init_acl(ip, dip);
1030#endif
1031
1032 out1:
1033 jfs_info("jfs_symlink: rc:%d", rc);
1034 return rc;
1035}
1036
1037
1038/*
1039 * NAME: jfs_rename
1040 *
1041 * FUNCTION: rename a file or directory
1042 */
1043static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1044 struct inode *new_dir, struct dentry *new_dentry)
1045{
1046 struct btstack btstack;
1047 ino_t ino;
1048 struct component_name new_dname;
1049 struct inode *new_ip;
1050 struct component_name old_dname;
1051 struct inode *old_ip;
1052 int rc;
1053 tid_t tid;
1054 struct tlock *tlck;
1055 struct dt_lock *dtlck;
1056 struct lv *lv;
1057 int ipcount;
1058 struct inode *iplist[4];
1059 struct tblock *tblk;
1060 s64 new_size = 0;
1061 int commit_flag;
1062
1063
1064 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1065 new_dentry->d_name.name);
1066
1067 old_ip = old_dentry->d_inode;
1068 new_ip = new_dentry->d_inode;
1069
1070 if ((rc = get_UCSname(&old_dname, old_dentry)))
1071 goto out1;
1072
1073 if ((rc = get_UCSname(&new_dname, new_dentry)))
1074 goto out2;
1075
1076 /*
1077 * Make sure source inode number is what we think it is
1078 */
1079 rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP);
1080 if (rc || (ino != old_ip->i_ino)) {
1081 rc = -ENOENT;
1082 goto out3;
1083 }
1084
1085 /*
1086 * Make sure dest inode number (if any) is what we think it is
1087 */
1088 rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP);
1089 if (rc == 0) {
1090		if ((new_ip == NULL) || (ino != new_ip->i_ino)) {
1091 rc = -ESTALE;
1092 goto out3;
1093 }
1094 } else if (rc != -ENOENT)
1095 goto out3;
1096 else if (new_ip) {
1097 /* no entry exists, but one was expected */
1098 rc = -ESTALE;
1099 goto out3;
1100 }
1101
1102 if (S_ISDIR(old_ip->i_mode)) {
1103 if (new_ip) {
1104 if (!dtEmpty(new_ip)) {
1105 rc = -ENOTEMPTY;
1106 goto out3;
1107 }
1108 } else if ((new_dir != old_dir) &&
1109 (new_dir->i_nlink == JFS_LINK_MAX)) {
1110 rc = -EMLINK;
1111 goto out3;
1112 }
1113 } else if (new_ip) {
1114 IWRITE_LOCK(new_ip);
1115 /* Init inode for quota operations. */
1116 DQUOT_INIT(new_ip);
1117 }
1118
1119 /*
1120 * The real work starts here
1121 */
1122 tid = txBegin(new_dir->i_sb, 0);
1123
1124 down(&JFS_IP(new_dir)->commit_sem);
1125 down(&JFS_IP(old_ip)->commit_sem);
1126 if (old_dir != new_dir)
1127 down(&JFS_IP(old_dir)->commit_sem);
1128
1129 if (new_ip) {
1130 down(&JFS_IP(new_ip)->commit_sem);
1131 /*
1132 * Change existing directory entry to new inode number
1133 */
1134 ino = new_ip->i_ino;
1135 rc = dtModify(tid, new_dir, &new_dname, &ino,
1136 old_ip->i_ino, JFS_RENAME);
1137 if (rc)
1138 goto out4;
1139 new_ip->i_nlink--;
1140 if (S_ISDIR(new_ip->i_mode)) {
1141 new_ip->i_nlink--;
1142 if (new_ip->i_nlink) {
1143 up(&JFS_IP(new_dir)->commit_sem);
1144 up(&JFS_IP(old_ip)->commit_sem);
1145 if (old_dir != new_dir)
1146 up(&JFS_IP(old_dir)->commit_sem);
1147 if (!S_ISDIR(old_ip->i_mode) && new_ip)
1148 IWRITE_UNLOCK(new_ip);
1149 jfs_error(new_ip->i_sb,
1150 "jfs_rename: new_ip->i_nlink != 0");
1151 return -EIO;
1152 }
1153 tblk = tid_to_tblock(tid);
1154 tblk->xflag |= COMMIT_DELETE;
1155 tblk->u.ip = new_ip;
1156 } else if (new_ip->i_nlink == 0) {
1157 assert(!test_cflag(COMMIT_Nolink, new_ip));
1158 /* free block resources */
1159 if ((new_size = commitZeroLink(tid, new_ip)) < 0) {
1160 txAbort(tid, 1); /* Marks FS Dirty */
1161 rc = new_size;
1162 goto out4;
1163 }
1164 tblk = tid_to_tblock(tid);
1165 tblk->xflag |= COMMIT_DELETE;
1166 tblk->u.ip = new_ip;
1167 } else {
1168 new_ip->i_ctime = CURRENT_TIME;
1169 mark_inode_dirty(new_ip);
1170 }
1171 } else {
1172 /*
1173 * Add new directory entry
1174 */
1175 rc = dtSearch(new_dir, &new_dname, &ino, &btstack,
1176 JFS_CREATE);
1177 if (rc) {
1178 jfs_err("jfs_rename didn't expect dtSearch to fail "
1179 "w/rc = %d", rc);
1180 goto out4;
1181 }
1182
1183 ino = old_ip->i_ino;
1184 rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack);
1185 if (rc) {
1186 if (rc == -EIO)
1187 jfs_err("jfs_rename: dtInsert returned -EIO");
1188 goto out4;
1189 }
1190 if (S_ISDIR(old_ip->i_mode))
1191 new_dir->i_nlink++;
1192 }
1193 /*
1194 * Remove old directory entry
1195 */
1196
1197 ino = old_ip->i_ino;
1198 rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE);
1199 if (rc) {
1200 jfs_err("jfs_rename did not expect dtDelete to return rc = %d",
1201 rc);
1202 txAbort(tid, 1); /* Marks Filesystem dirty */
1203 goto out4;
1204 }
1205 if (S_ISDIR(old_ip->i_mode)) {
1206 old_dir->i_nlink--;
1207 if (old_dir != new_dir) {
1208 /*
1209 * Change inode number of parent for moved directory
1210 */
1211
1212 JFS_IP(old_ip)->i_dtroot.header.idotdot =
1213 cpu_to_le32(new_dir->i_ino);
1214
1215 /* Linelock header of dtree */
1216 tlck = txLock(tid, old_ip,
1217 (struct metapage *) &JFS_IP(old_ip)->bxflag,
1218 tlckDTREE | tlckBTROOT | tlckRELINK);
1219 dtlck = (struct dt_lock *) & tlck->lock;
1220 ASSERT(dtlck->index == 0);
1221 lv = & dtlck->lv[0];
1222 lv->offset = 0;
1223 lv->length = 1;
1224 dtlck->index++;
1225 }
1226 }
1227
1228 /*
1229 * Update ctime on changed/moved inodes & mark dirty
1230 */
1231 old_ip->i_ctime = CURRENT_TIME;
1232 mark_inode_dirty(old_ip);
1233
1234 new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb);
1235 mark_inode_dirty(new_dir);
1236
1237 /* Build list of inodes modified by this transaction */
1238 ipcount = 0;
1239 iplist[ipcount++] = old_ip;
1240 if (new_ip)
1241 iplist[ipcount++] = new_ip;
1242 iplist[ipcount++] = old_dir;
1243
1244 if (old_dir != new_dir) {
1245 iplist[ipcount++] = new_dir;
1246 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1247 mark_inode_dirty(old_dir);
1248 }
1249
1250 /*
1251 * Incomplete truncate of file data can
1252 * result in timing problems unless we synchronously commit the
1253 * transaction.
1254 */
1255 if (new_size)
1256 commit_flag = COMMIT_SYNC;
1257 else
1258 commit_flag = 0;
1259
1260 rc = txCommit(tid, ipcount, iplist, commit_flag);
1261
1262 out4:
1263 txEnd(tid);
1264
1265 up(&JFS_IP(new_dir)->commit_sem);
1266 up(&JFS_IP(old_ip)->commit_sem);
1267 if (old_dir != new_dir)
1268 up(&JFS_IP(old_dir)->commit_sem);
1269 if (new_ip)
1270 up(&JFS_IP(new_ip)->commit_sem);
1271
1272 while (new_size && (rc == 0)) {
1273 tid = txBegin(new_ip->i_sb, 0);
1274 down(&JFS_IP(new_ip)->commit_sem);
1275 new_size = xtTruncate_pmap(tid, new_ip, new_size);
1276 if (new_size < 0) {
1277 txAbort(tid, 1);
1278 rc = new_size;
1279 } else
1280 rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC);
1281 txEnd(tid);
1282 up(&JFS_IP(new_ip)->commit_sem);
1283 }
1284 if (new_ip && (new_ip->i_nlink == 0))
1285 set_cflag(COMMIT_Nolink, new_ip);
1286 out3:
1287 free_UCSname(&new_dname);
1288 out2:
1289 free_UCSname(&old_dname);
1290 out1:
1291 if (new_ip && !S_ISDIR(new_ip->i_mode))
1292 IWRITE_UNLOCK(new_ip);
1293 /*
1294 * Truncating the directory index table is not guaranteed. It
1295 * may need to be done iteratively
1296 */
1297 if (test_cflag(COMMIT_Stale, old_dir)) {
1298 if (old_dir->i_size > 1)
1299 jfs_truncate_nolock(old_dir, 0);
1300
1301 clear_cflag(COMMIT_Stale, old_dir);
1302 }
1303
1304 jfs_info("jfs_rename: returning %d", rc);
1305 return rc;
1306}
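
The while loop near the end of jfs_rename retires the deferred truncate of the replaced file in bounded steps, one committed transaction per pass, so a crash part-way leaves a consistent, partially truncated file rather than a torn one. A toy model of that pattern (truncate_step and the sizes are hypothetical stand-ins, not JFS calls):

#include <stdio.h>

static long long truncate_step(long long size)
{
        long long chunk = 1 << 20;                /* free up to 1 MiB/pass */
        return size > chunk ? size - chunk : 0;   /* what remains */
}

int main(void)
{
        long long new_size = 5LL << 20;           /* 5 MiB left to free */
        int passes = 0;

        while (new_size) {                        /* one transaction per pass */
                new_size = truncate_step(new_size);
                passes++;
        }
        printf("%d passes\n", passes);            /* prints: 5 passes */
        return 0;
}
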
1307
1308
1309/*
1310 * NAME: jfs_mknod
1311 *
1312 * FUNCTION: Create a special file (device)
1313 */
1314static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1315 int mode, dev_t rdev)
1316{
1317 struct jfs_inode_info *jfs_ip;
1318 struct btstack btstack;
1319 struct component_name dname;
1320 ino_t ino;
1321 struct inode *ip;
1322 struct inode *iplist[2];
1323 int rc;
1324 tid_t tid;
1325 struct tblock *tblk;
1326
1327 if (!new_valid_dev(rdev))
1328 return -EINVAL;
1329
1330 jfs_info("jfs_mknod: %s", dentry->d_name.name);
1331
1332 if ((rc = get_UCSname(&dname, dentry)))
1333 goto out;
1334
1335 ip = ialloc(dir, mode);
1336 if (ip == NULL) {
1337 rc = -ENOSPC;
1338 goto out1;
1339 }
1340 jfs_ip = JFS_IP(ip);
1341
1342 tid = txBegin(dir->i_sb, 0);
1343
1344 down(&JFS_IP(dir)->commit_sem);
1345 down(&JFS_IP(ip)->commit_sem);
1346
1347 if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE)))
1348 goto out3;
1349
1350 tblk = tid_to_tblock(tid);
1351 tblk->xflag |= COMMIT_CREATE;
1352 tblk->ino = ip->i_ino;
1353 tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
1354
1355 ino = ip->i_ino;
1356 if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack)))
1357 goto out3;
1358
1359 ip->i_op = &jfs_file_inode_operations;
1360 jfs_ip->dev = new_encode_dev(rdev);
1361 init_special_inode(ip, ip->i_mode, rdev);
1362
1363 insert_inode_hash(ip);
1364 mark_inode_dirty(ip);
1365
1366 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1367
1368 mark_inode_dirty(dir);
1369
1370 iplist[0] = dir;
1371 iplist[1] = ip;
1372 rc = txCommit(tid, 2, iplist, 0);
1373
1374 out3:
1375 txEnd(tid);
1376 up(&JFS_IP(ip)->commit_sem);
1377 up(&JFS_IP(dir)->commit_sem);
1378 if (rc) {
1379 ip->i_nlink = 0;
1380 iput(ip);
1381 } else
1382 d_instantiate(dentry, ip);
1383
1384 out1:
1385 free_UCSname(&dname);
1386
1387#ifdef CONFIG_JFS_POSIX_ACL
1388 if (rc == 0)
1389 jfs_init_acl(ip, dir);
1390#endif
1391
1392 out:
1393 jfs_info("jfs_mknod: returning %d", rc);
1394 return rc;
1395}
1396
1397static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd)
1398{
1399 struct btstack btstack;
1400 ino_t inum;
1401 struct inode *ip;
1402 struct component_name key;
1403 const char *name = dentry->d_name.name;
1404 int len = dentry->d_name.len;
1405 int rc;
1406
1407 jfs_info("jfs_lookup: name = %s", name);
1408
1409
1410 if ((name[0] == '.') && (len == 1))
1411 inum = dip->i_ino;
1412 else if (strcmp(name, "..") == 0)
1413 inum = PARENT(dip);
1414 else {
1415 if ((rc = get_UCSname(&key, dentry)))
1416 return ERR_PTR(rc);
1417 rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP);
1418 free_UCSname(&key);
1419 if (rc == -ENOENT) {
1420 d_add(dentry, NULL);
1421 return ERR_PTR(0);
1422 } else if (rc) {
1423 jfs_err("jfs_lookup: dtSearch returned %d", rc);
1424 return ERR_PTR(rc);
1425 }
1426 }
1427
1428 ip = iget(dip->i_sb, inum);
1429 if (ip == NULL || is_bad_inode(ip)) {
1430 jfs_err("jfs_lookup: iget failed on inum %d", (uint) inum);
1431 if (ip)
1432 iput(ip);
1433 return ERR_PTR(-EACCES);
1434 }
1435
1436 if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
1437 dentry->d_op = &jfs_ci_dentry_operations;
1438
1439 dentry = d_splice_alias(ip, dentry);
1440
1441 if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
1442 dentry->d_op = &jfs_ci_dentry_operations;
1443
1444 return dentry;
1445}
1446
1447struct dentry *jfs_get_parent(struct dentry *dentry)
1448{
1449 struct super_block *sb = dentry->d_inode->i_sb;
1450 struct dentry *parent = ERR_PTR(-ENOENT);
1451 struct inode *inode;
1452 unsigned long parent_ino;
1453
1454 parent_ino =
1455 le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot);
1456 inode = iget(sb, parent_ino);
1457 if (inode) {
1458 if (is_bad_inode(inode)) {
1459 iput(inode);
1460 parent = ERR_PTR(-EACCES);
1461 } else {
1462 parent = d_alloc_anon(inode);
1463 if (!parent) {
1464 parent = ERR_PTR(-ENOMEM);
1465 iput(inode);
1466 }
1467 }
1468 }
1469
1470 return parent;
1471}
1472
1473struct inode_operations jfs_dir_inode_operations = {
1474 .create = jfs_create,
1475 .lookup = jfs_lookup,
1476 .link = jfs_link,
1477 .unlink = jfs_unlink,
1478 .symlink = jfs_symlink,
1479 .mkdir = jfs_mkdir,
1480 .rmdir = jfs_rmdir,
1481 .mknod = jfs_mknod,
1482 .rename = jfs_rename,
1483 .setxattr = jfs_setxattr,
1484 .getxattr = jfs_getxattr,
1485 .listxattr = jfs_listxattr,
1486 .removexattr = jfs_removexattr,
1487#ifdef CONFIG_JFS_POSIX_ACL
1488 .setattr = jfs_setattr,
1489 .permission = jfs_permission,
1490#endif
1491};
1492
1493struct file_operations jfs_dir_operations = {
1494 .read = generic_read_dir,
1495 .readdir = jfs_readdir,
1496 .fsync = jfs_fsync,
1497};
1498
1499static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
1500{
1501 unsigned long hash;
1502 int i;
1503
1504 hash = init_name_hash();
1505 for (i=0; i < this->len; i++)
1506 hash = partial_name_hash(tolower(this->name[i]), hash);
1507 this->hash = end_name_hash(hash);
1508
1509 return 0;
1510}
1511
1512static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b)
1513{
1514 int i, result = 1;
1515
1516 if (a->len != b->len)
1517 goto out;
1518 for (i=0; i < a->len; i++) {
1519 if (tolower(a->name[i]) != tolower(b->name[i]))
1520 goto out;
1521 }
1522 result = 0;
1523
1524 /*
1525 * We want creates to preserve case. A negative dentry, a, that
1526 * has a different case than b may cause a new entry to be created
1527 * with the wrong case. Since we can't tell if a comes from a negative
1528 * dentry, we blindly replace it with b. This should be harmless if
1529 * a is not a negative dentry.
1530 */
1531 memcpy((unsigned char *)a->name, b->name, a->len);
1532out:
1533 return result;
1534}
1535
1536struct dentry_operations jfs_ci_dentry_operations =
1537{
1538 .d_hash = jfs_ci_hash,
1539 .d_compare = jfs_ci_compare,
1540};
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
new file mode 100644
index 000000000000..2eb6869b6e72
--- /dev/null
+++ b/fs/jfs/resize.c
@@ -0,0 +1,537 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/
18
19#include <linux/fs.h>
20#include <linux/buffer_head.h>
21#include <linux/quotaops.h>
22#include "jfs_incore.h"
23#include "jfs_filsys.h"
24#include "jfs_metapage.h"
25#include "jfs_dinode.h"
26#include "jfs_imap.h"
27#include "jfs_dmap.h"
28#include "jfs_superblock.h"
29#include "jfs_txnmgr.h"
30#include "jfs_debug.h"
31
32#define BITSPERPAGE (PSIZE << 3)
33#define L2MEGABYTE 20
34#define MEGABYTE (1 << L2MEGABYTE)
35#define MEGABYTE32 (MEGABYTE << 5)
36
37/* convert block number to bmap file page number */
38#define BLKTODMAPN(b)\
39 (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1)
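
BLKTODMAPN encodes the bmap file geometry: each dmap page maps 2^13 aggregate blocks, each first-level control page spans 2^23 blocks, each second-level page 2^33, with "+ 3 + 1" covering the fixed control pages at the front of the file (the page roles are an interpretation; the shifts are read off the macro itself). A worked evaluation for a 2^28-block aggregate (1 TiB at 4K blocks):

#include <stdio.h>

int main(void)
{
        long long b = 1LL << 28;  /* 2^28 blocks = 1 TiB of 4K blocks */
        long long n = (b >> 13) + (b >> 23) + (b >> 33) + 3 + 1;

        printf("%lld\n", n);      /* 32768 + 32 + 0 + 4 = 32804 pages */
        return 0;
}
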
40
41/*
42 * jfs_extendfs()
43 *
44 * function: extend file system;
45 *
46 * |-------------------------------|----------|----------|
47 * file system space fsck inline log
48 * workspace space
49 *
50 * input:
51 * new LVSize: in LV blocks (required)
52 * new LogSize: in LV blocks (optional)
53 * new FSSize: in LV blocks (optional)
54 *
55 * new configuration:
56 * 1. set new LogSize as specified or default from new LVSize;
57 * 2. compute new FSCKSize from new LVSize;
58 * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where
59 * assert(new FSSize >= old FSSize),
60 *  i.e., the file system must not be shrunk;
61 */
62int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
63{
64 int rc = 0;
65 struct jfs_sb_info *sbi = JFS_SBI(sb);
66 struct inode *ipbmap = sbi->ipbmap;
67 struct inode *ipbmap2;
68 struct inode *ipimap = sbi->ipimap;
69 struct jfs_log *log = sbi->log;
70 struct bmap *bmp = sbi->bmap;
71 s64 newLogAddress, newFSCKAddress;
72 int newFSCKSize;
73 s64 newMapSize = 0, mapSize;
74 s64 XAddress, XSize, nblocks, xoff, xaddr, t64;
75 s64 oldLVSize;
76 s64 newFSSize;
77 s64 VolumeSize;
78 int newNpages = 0, nPages, newPage, xlen, t32;
79 int tid;
80 int log_formatted = 0;
81 struct inode *iplist[1];
82 struct jfs_superblock *j_sb, *j_sb2;
83 uint old_agsize;
84 struct buffer_head *bh, *bh2;
85
86 /* If the volume hasn't grown, get out now */
87
88 if (sbi->mntflag & JFS_INLINELOG)
89 oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd);
90 else
91 oldLVSize = addressPXD(&sbi->fsckpxd) +
92 lengthPXD(&sbi->fsckpxd);
93
94 if (oldLVSize >= newLVSize) {
95 printk(KERN_WARNING
96 "jfs_extendfs: volume hasn't grown, returning\n");
97 goto out;
98 }
99
100 VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
101
102 if (VolumeSize) {
103 if (newLVSize > VolumeSize) {
104 printk(KERN_WARNING "jfs_extendfs: invalid size\n");
105 rc = -EINVAL;
106 goto out;
107 }
108 } else {
109 /* check the device */
110 bh = sb_bread(sb, newLVSize - 1);
111 if (!bh) {
112 printk(KERN_WARNING "jfs_extendfs: invalid size\n");
113 rc = -EINVAL;
114 goto out;
115 }
116 bforget(bh);
117 }
118
119 /* Can't extend write-protected drive */
120
121 if (isReadOnly(ipbmap)) {
122 printk(KERN_WARNING "jfs_extendfs: read-only file system\n");
123 rc = -EROFS;
124 goto out;
125 }
126
127 /*
128 * reconfigure LV spaces
129 * ---------------------
130 *
131 * validate new size, or, if not specified, determine new size
132 */
133
134 /*
135 * reconfigure inline log space:
136 */
137 if ((sbi->mntflag & JFS_INLINELOG)) {
138 if (newLogSize == 0) {
139 /*
140 * no size specified: default to 1/256 of aggregate
141 * size; rounded up to a megabyte boundary;
142 */
143 newLogSize = newLVSize >> 8;
144 t32 = (1 << (20 - sbi->l2bsize)) - 1;
145 newLogSize = (newLogSize + t32) & ~t32;
146 newLogSize =
147 min(newLogSize, MEGABYTE32 >> sbi->l2bsize);
148 } else {
149 /*
150 * convert the newLogSize to fs blocks.
151 *
152 * Since this is given in megabytes, it will always be
153			 * a whole number of pages.
154 */
155 newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize;
156 }
157
158 } else
159 newLogSize = 0;
160
161 newLogAddress = newLVSize - newLogSize;
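
A worked example of the default log sizing above, assuming a 100 GiB volume of 4K blocks (l2bsize = 12): 1/256 of the aggregate is 400 MiB, which the MEGABYTE32 cap then clamps to 32 MiB. This is a user-space sketch, not kernel code:

#include <stdio.h>

int main(void)
{
        long long newLVSize = 100LL << (30 - 12); /* 100 GiB in 4K blocks */
        int l2bsize = 12;
        long long sz = newLVSize >> 8;            /* 1/256 of aggregate */
        long long t32 = (1 << (20 - l2bsize)) - 1;

        sz = (sz + t32) & ~t32;                   /* round up to 1 MiB */
        if (sz > (32LL << (20 - l2bsize)))        /* MEGABYTE32 cap */
                sz = 32LL << (20 - l2bsize);
        printf("%lld blocks (%lld MiB)\n", sz, sz >> (20 - l2bsize));
        /* prints: 8192 blocks (32 MiB) */
        return 0;
}
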
162
163 /*
164 * reconfigure fsck work space:
165 *
166 * configure it to the end of the logical volume regardless of
167 * whether file system extends to the end of the aggregate;
168 * Need enough 4k pages to cover:
169 * - 1 bit per block in aggregate rounded up to BPERDMAP boundary
170 * - 1 extra page to handle control page and intermediate level pages
171 * - 50 extra pages for the chkdsk service log
172 */
173 t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP)
174 << L2BPERDMAP;
175 t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50;
176 newFSCKSize = t32 << sbi->l2nbperpage;
177 newFSCKAddress = newLogAddress - newFSCKSize;
178
179 /*
180 * compute new file system space;
181 */
182 newFSSize = newLVSize - newLogSize - newFSCKSize;
183
184	/* the file system cannot be shrunk */
185 if (newFSSize < bmp->db_mapsize) {
186 rc = -EINVAL;
187 goto out;
188 }
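
Continuing the same 100 GiB / 4K example for the fsck workspace: one bit per block, rounded up to a dmap boundary (BPERDMAP = 8192 blocks, so shift by 13), then one control page plus 50 chkdsk log pages at BITSPERPAGE = 32768 bits per page. With PSIZE equal to the block size, l2nbperpage is 0 and pages equal blocks; the constants are read off the code above and the concrete numbers are only illustrative:

#include <stdio.h>

int main(void)
{
        long long blocks = (100LL << 18) - 8192;   /* LV minus inline log */
        long long t64 = ((blocks + 8192 - 1) >> 13) << 13;
        long long pages = (t64 + 32767) / 32768 + 1 + 50;

        printf("%lld pages\n", pages);  /* 800 + 51 = 851 (~3.3 MiB) */
        return 0;
}
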
189
190 /*
191 * If we're expanding enough that the inline log does not overlap
192 * the old one, we can format the new log before we quiesce the
193 * filesystem.
194 */
195 if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) {
196 if ((rc = lmLogFormat(log, newLogAddress, newLogSize)))
197 goto out;
198 log_formatted = 1;
199 }
200 /*
201 * quiesce file system
202 *
203 * (prepare to move the inline log and to prevent map update)
204 *
205 * block any new transactions and wait for completion of
206 * all wip transactions and flush modified pages s.t.
207 * on-disk file system is in consistent state and
208 * log is not required for recovery.
209 */
210 txQuiesce(sb);
211
212 if (sbi->mntflag & JFS_INLINELOG) {
213 /*
214 * deactivate old inline log
215 */
216 lmLogShutdown(log);
217
218 /*
219 * mark on-disk super block for fs in transition;
220 *
221 * update on-disk superblock for the new space configuration
222 * of inline log space and fsck work space descriptors:
223 * N.B. FS descriptor is NOT updated;
224 *
225 * crash recovery:
226 * logredo(): if FM_EXTENDFS, return to fsck() for cleanup;
227 * fsck(): if FM_EXTENDFS, reformat inline log and fsck
228 * workspace from superblock inline log descriptor and fsck
229 * workspace descriptor;
230 */
231
232 /* read in superblock */
233 if ((rc = readSuper(sb, &bh)))
234 goto error_out;
235 j_sb = (struct jfs_superblock *)bh->b_data;
236
237 /* mark extendfs() in progress */
238 j_sb->s_state |= cpu_to_le32(FM_EXTENDFS);
239 j_sb->s_xsize = cpu_to_le64(newFSSize);
240 PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress);
241 PXDlength(&j_sb->s_xfsckpxd, newFSCKSize);
242 PXDaddress(&j_sb->s_xlogpxd, newLogAddress);
243 PXDlength(&j_sb->s_xlogpxd, newLogSize);
244
245 /* synchronously update superblock */
246 mark_buffer_dirty(bh);
247 sync_dirty_buffer(bh);
248 brelse(bh);
249
250 /*
251 * format new inline log synchronously;
252 *
253 * crash recovery: if log move in progress,
254 * reformat log and exit success;
255 */
256 if (!log_formatted)
257 if ((rc = lmLogFormat(log, newLogAddress, newLogSize)))
258 goto error_out;
259
260 /*
261 * activate new log
262 */
263 log->base = newLogAddress;
264 log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits);
265 if ((rc = lmLogInit(log)))
266 goto error_out;
267 }
268
269 /*
270 * extend block allocation map
271 * ---------------------------
272 *
273 * extendfs() for new extension, retry after crash recovery;
274 *
275 * note: both logredo() and fsck() rebuild map from
276 * the bitmap and configuration parameter from superblock
277 * (disregarding all other control information in the map);
278 *
279 * superblock:
280 * s_size: aggregate size in physical blocks;
281 */
282 /*
283 * compute the new block allocation map configuration
284 *
285 * map dinode:
286 * di_size: map file size in byte;
287 * di_nblocks: number of blocks allocated for map file;
288 * di_mapsize: number of blocks in aggregate (covered by map);
289 * map control page:
290 * db_mapsize: number of blocks in aggregate (covered by map);
291 */
292 newMapSize = newFSSize;
293 /* number of data pages of new bmap file:
294 * roundup new size to full dmap page boundary and
295 * add 1 extra dmap page for next extendfs()
296 */
297 t64 = (newMapSize - 1) + BPERDMAP;
298 newNpages = BLKTODMAPN(t64) + 1;
299
300 /*
301 * extend map from current map (WITHOUT growing mapfile)
302 *
303 * map new extension with unmapped part of the last partial
304 * dmap page, if applicable, and extra page(s) allocated
305 * at end of bmap by mkfs() or previous extendfs();
306 */
307 extendBmap:
308 /* compute number of blocks requested to extend */
309 mapSize = bmp->db_mapsize;
310 XAddress = mapSize; /* eXtension Address */
311 XSize = newMapSize - mapSize; /* eXtension Size */
312 old_agsize = bmp->db_agsize; /* We need to know if this changes */
313
314 /* compute number of blocks that can be extended by current mapfile */
315 t64 = dbMapFileSizeToMapSize(ipbmap);
316 if (mapSize > t64) {
317 printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n",
318 (long long) mapSize, (long long) t64);
319 rc = -EIO;
320 goto error_out;
321 }
322 nblocks = min(t64 - mapSize, XSize);
323
324 /*
325 * update map pages for new extension:
326 *
327 * update/init dmap and bubble up the control hierarchy
328 * incrementally fold up dmaps into upper levels;
329 * update bmap control page;
330 */
331 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
332 goto error_out;
333 /*
334	 * the map has now been extended to cover the additional nblocks:
335 * dn_mapsize = oldMapsize + nblocks;
336 */
337 /* ipbmap->i_mapsize += nblocks; */
338 XSize -= nblocks;
339
340 /*
341 * grow map file to cover remaining extension
342 * and/or one extra dmap page for next extendfs();
343 *
344 * allocate new map pages and its backing blocks, and
345 * update map file xtree
346 */
347 /* compute number of data pages of current bmap file */
348 nPages = ipbmap->i_size >> L2PSIZE;
349
350 /* need to grow map file ? */
351 if (nPages == newNpages)
352 goto finalizeBmap;
353
354 /*
355 * grow bmap file for the new map pages required:
356 *
357 * allocate growth at the start of newly extended region;
358 * bmap file only grows sequentially, i.e., both data pages
359 * and possibly xtree index pages may grow in append mode,
360 * s.t. logredo() can reconstruct pre-extension state
361 * by washing away bmap file of pages outside s_size boundary;
362 */
363 /*
364 * journal map file growth as if a regular file growth:
365 * (note: bmap is created with di_mode = IFJOURNAL|IFREG);
366 *
367 * journaling of bmap file growth is not required since
368	 * logredo() does not (and cannot) use log records of bmap file growth,
369 * but it provides careful write semantics, pmap update, etc.;
370 */
371 /* synchronous write of data pages: bmap data pages are
372 * cached in meta-data cache, and not written out
373 * by txCommit();
374 */
375 filemap_fdatawait(ipbmap->i_mapping);
376 filemap_fdatawrite(ipbmap->i_mapping);
377 filemap_fdatawait(ipbmap->i_mapping);
378 diWriteSpecial(ipbmap, 0);
379
380 newPage = nPages; /* first new page number */
381 xoff = newPage << sbi->l2nbperpage;
382 xlen = (newNpages - nPages) << sbi->l2nbperpage;
383 xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1);
384 xaddr = XAddress;
385
386 tid = txBegin(sb, COMMIT_FORCE);
387
388 if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) {
389 txEnd(tid);
390 goto error_out;
391 }
392 /* update bmap file size */
393 ipbmap->i_size += xlen << sbi->l2bsize;
394 inode_add_bytes(ipbmap, xlen << sbi->l2bsize);
395
396 iplist[0] = ipbmap;
397 rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE);
398
399 txEnd(tid);
400
401 if (rc)
402 goto error_out;
403
404 /*
405 * map file has been grown now to cover extension to further out;
406 * di_size = new map file size;
407 *
408	 * if the extension is huge, the previous extension based on the old
409	 * map file size may not have been sufficient to cover the whole extension
410	 * (it could have been used up for new map pages),
411	 * but the newly grown map file now covers a much larger new free space
412	 * available for further extension of the map;
413 */
414 /* any more blocks to extend ? */
415 if (XSize)
416 goto extendBmap;
417
418 finalizeBmap:
419 /* finalize bmap */
420 dbFinalizeBmap(ipbmap);
421
422 /*
423 * update inode allocation map
424 * ---------------------------
425 *
426 * move iag lists from old to new iag;
427 * agstart field is not updated for logredo() to reconstruct
428 * iag lists if system crash occurs.
429 * (computation of ag number from agstart based on agsize
430 * will correctly identify the new ag);
431 */
432	/* if the new AG size is the same as the old AG size, we are done */
433 if (bmp->db_agsize != old_agsize) {
434 if ((rc = diExtendFS(ipimap, ipbmap)))
435 goto error_out;
436
437 /* finalize imap */
438 if ((rc = diSync(ipimap)))
439 goto error_out;
440 }
441
442 /*
443 * finalize
444 * --------
445 *
446 * extension is committed when on-disk super block is
447 * updated with new descriptors: logredo will recover
448 * crash before it to pre-extension state;
449 */
450
451 /* sync log to skip log replay of bmap file growth transaction; */
452 /* lmLogSync(log, 1); */
453
454 /*
455 * synchronous write bmap global control page;
456 * for crash before completion of write
457 * logredo() will recover to pre-extendfs state;
458 * for crash after completion of write,
459 * logredo() will recover post-extendfs state;
460 */
461 if ((rc = dbSync(ipbmap)))
462 goto error_out;
463
464 /*
465 * copy primary bmap inode to secondary bmap inode
466 */
467
468 ipbmap2 = diReadSpecial(sb, BMAP_I, 1);
469 if (ipbmap2 == NULL) {
470 printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n");
471		rc = -EIO; goto error_out; /* don't fall through with rc == 0 */
472 }
473 memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288);
474 ipbmap2->i_size = ipbmap->i_size;
475 ipbmap2->i_blocks = ipbmap->i_blocks;
476
477 diWriteSpecial(ipbmap2, 1);
478 diFreeSpecial(ipbmap2);
479
480 /*
481 * update superblock
482 */
483 if ((rc = readSuper(sb, &bh)))
484 goto error_out;
485 j_sb = (struct jfs_superblock *)bh->b_data;
486
487 /* mark extendfs() completion */
488 j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS);
489 j_sb->s_size = cpu_to_le64(bmp->db_mapsize <<
490 le16_to_cpu(j_sb->s_l2bfactor));
491 j_sb->s_agsize = cpu_to_le32(bmp->db_agsize);
492
493 /* update inline log space descriptor */
494 if (sbi->mntflag & JFS_INLINELOG) {
495 PXDaddress(&(j_sb->s_logpxd), newLogAddress);
496 PXDlength(&(j_sb->s_logpxd), newLogSize);
497 }
498
499 /* record log's mount serial number */
500 j_sb->s_logserial = cpu_to_le32(log->serial);
501
502 /* update fsck work space descriptor */
503 PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress);
504 PXDlength(&(j_sb->s_fsckpxd), newFSCKSize);
505 j_sb->s_fscklog = 1;
506 /* sb->s_fsckloglen remains the same */
507
508 /* Update secondary superblock */
509 bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits);
510 if (bh2) {
511 j_sb2 = (struct jfs_superblock *)bh2->b_data;
512 memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock));
513
514		mark_buffer_dirty(bh2);
515 sync_dirty_buffer(bh2);
516 brelse(bh2);
517 }
518
519 /* write primary superblock */
520 mark_buffer_dirty(bh);
521 sync_dirty_buffer(bh);
522 brelse(bh);
523
524 goto resume;
525
526 error_out:
527 jfs_error(sb, "jfs_extendfs");
528
529 resume:
530 /*
531 * resume file system transactions
532 */
533 txResume(sb);
534
535 out:
536 return rc;
537}
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
new file mode 100644
index 000000000000..5856866e24fc
--- /dev/null
+++ b/fs/jfs/super.c
@@ -0,0 +1,700 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Portions Copyright (C) Christoph Hellwig, 2001-2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/fs.h>
21#include <linux/config.h>
22#include <linux/module.h>
23#include <linux/parser.h>
24#include <linux/completion.h>
25#include <linux/vfs.h>
26#include <linux/moduleparam.h>
27#include <asm/uaccess.h>
28
29#include "jfs_incore.h"
30#include "jfs_filsys.h"
31#include "jfs_metapage.h"
32#include "jfs_superblock.h"
33#include "jfs_dmap.h"
34#include "jfs_imap.h"
35#include "jfs_acl.h"
36#include "jfs_debug.h"
37
38MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
39MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
40MODULE_LICENSE("GPL");
41
42static kmem_cache_t * jfs_inode_cachep;
43
44static struct super_operations jfs_super_operations;
45static struct export_operations jfs_export_operations;
46static struct file_system_type jfs_fs_type;
47
48#define MAX_COMMIT_THREADS 64
49static int commit_threads = 0;
50module_param(commit_threads, int, 0);
51MODULE_PARM_DESC(commit_threads, "Number of commit threads");
52
53int jfs_stop_threads;
54static pid_t jfsIOthread;
55static pid_t jfsCommitThread[MAX_COMMIT_THREADS];
56static pid_t jfsSyncThread;
57DECLARE_COMPLETION(jfsIOwait);
58
59#ifdef CONFIG_JFS_DEBUG
60int jfsloglevel = JFS_LOGLEVEL_WARN;
61module_param(jfsloglevel, int, 0644);
62MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)");
63#endif
64
65/*
66 * External declarations
67 */
68extern int jfs_mount(struct super_block *);
69extern int jfs_mount_rw(struct super_block *, int);
70extern int jfs_umount(struct super_block *);
71extern int jfs_umount_rw(struct super_block *);
72
73extern int jfsIOWait(void *);
74extern int jfs_lazycommit(void *);
75extern int jfs_sync(void *);
76
77extern void jfs_read_inode(struct inode *inode);
78extern void jfs_dirty_inode(struct inode *inode);
79extern void jfs_delete_inode(struct inode *inode);
80extern int jfs_write_inode(struct inode *inode, int wait);
81
82extern struct dentry *jfs_get_parent(struct dentry *dentry);
83extern int jfs_extendfs(struct super_block *, s64, int);
84
85extern struct dentry_operations jfs_ci_dentry_operations;
86
87#ifdef PROC_FS_JFS /* see jfs_debug.h */
88extern void jfs_proc_init(void);
89extern void jfs_proc_clean(void);
90#endif
91
92extern wait_queue_head_t jfs_IO_thread_wait;
93extern wait_queue_head_t jfs_commit_thread_wait;
94extern wait_queue_head_t jfs_sync_thread_wait;
95
96static void jfs_handle_error(struct super_block *sb)
97{
98 struct jfs_sb_info *sbi = JFS_SBI(sb);
99
100 if (sb->s_flags & MS_RDONLY)
101 return;
102
103 updateSuper(sb, FM_DIRTY);
104
105 if (sbi->flag & JFS_ERR_PANIC)
106 panic("JFS (device %s): panic forced after error\n",
107 sb->s_id);
108 else if (sbi->flag & JFS_ERR_REMOUNT_RO) {
109 jfs_err("ERROR: (device %s): remounting filesystem "
110 "as read-only\n",
111 sb->s_id);
112 sb->s_flags |= MS_RDONLY;
113 }
114
115 /* nothing is done for continue beyond marking the superblock dirty */
116}
117
118void jfs_error(struct super_block *sb, const char * function, ...)
119{
120 static char error_buf[256];
121 va_list args;
122
123 va_start(args, function);
124 vsprintf(error_buf, function, args);
125 va_end(args);
126
127 printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf);
128
129 jfs_handle_error(sb);
130}
131
132static struct inode *jfs_alloc_inode(struct super_block *sb)
133{
134 struct jfs_inode_info *jfs_inode;
135
136 jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
137 if (!jfs_inode)
138 return NULL;
139 return &jfs_inode->vfs_inode;
140}
141
142static void jfs_destroy_inode(struct inode *inode)
143{
144 struct jfs_inode_info *ji = JFS_IP(inode);
145
146 spin_lock_irq(&ji->ag_lock);
147 if (ji->active_ag != -1) {
148 struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap;
149 atomic_dec(&bmap->db_active[ji->active_ag]);
150 ji->active_ag = -1;
151 }
152 spin_unlock_irq(&ji->ag_lock);
153
154#ifdef CONFIG_JFS_POSIX_ACL
155 if (ji->i_acl != JFS_ACL_NOT_CACHED) {
156 posix_acl_release(ji->i_acl);
157 ji->i_acl = JFS_ACL_NOT_CACHED;
158 }
159 if (ji->i_default_acl != JFS_ACL_NOT_CACHED) {
160 posix_acl_release(ji->i_default_acl);
161 ji->i_default_acl = JFS_ACL_NOT_CACHED;
162 }
163#endif
164
165 kmem_cache_free(jfs_inode_cachep, ji);
166}
167
168static int jfs_statfs(struct super_block *sb, struct kstatfs *buf)
169{
170 struct jfs_sb_info *sbi = JFS_SBI(sb);
171 s64 maxinodes;
172 struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
173
174 jfs_info("In jfs_statfs");
175 buf->f_type = JFS_SUPER_MAGIC;
176 buf->f_bsize = sbi->bsize;
177 buf->f_blocks = sbi->bmap->db_mapsize;
178 buf->f_bfree = sbi->bmap->db_nfree;
179 buf->f_bavail = sbi->bmap->db_nfree;
180 /*
181 * If we really return the number of allocated & free inodes, some
182 * applications will fail because they won't see enough free inodes.
183	 * We'll try to calculate some guess as to how many inodes we can
184 * really allocate
185 *
186 * buf->f_files = atomic_read(&imap->im_numinos);
187 * buf->f_ffree = atomic_read(&imap->im_numfree);
188 */
189 maxinodes = min((s64) atomic_read(&imap->im_numinos) +
190 ((sbi->bmap->db_nfree >> imap->im_l2nbperiext)
191 << L2INOSPEREXT), (s64) 0xffffffffLL);
192 buf->f_files = maxinodes;
193 buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) -
194 atomic_read(&imap->im_numfree));
195
196 buf->f_namelen = JFS_NAME_MAX;
197 return 0;
198}
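
The free-inode estimate above can be made concrete. Assuming 32 inodes per inode extent (L2INOSPEREXT = 5) and, for a 4K-block aggregate, 4 blocks per extent (im_l2nbperiext = 2) — both assumptions about the geometry, not values shown in this file — a user-space sketch:

#include <stdio.h>

int main(void)
{
        long long numinos = 100000, numfree = 5000, nfree = 1000000;
        long long maxinodes = numinos + ((nfree >> 2) << 5);

        if (maxinodes > 0xffffffffLL)             /* clamp to 32 bits */
                maxinodes = 0xffffffffLL;
        printf("f_files=%lld f_ffree=%lld\n",
               maxinodes, maxinodes - (numinos - numfree));
        /* prints: f_files=8100000 f_ffree=8005000 */
        return 0;
}
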
199
200static void jfs_put_super(struct super_block *sb)
201{
202 struct jfs_sb_info *sbi = JFS_SBI(sb);
203 int rc;
204
205 jfs_info("In jfs_put_super");
206 rc = jfs_umount(sb);
207 if (rc)
208 jfs_err("jfs_umount failed with return code %d", rc);
209 if (sbi->nls_tab)
210 unload_nls(sbi->nls_tab);
211 sbi->nls_tab = NULL;
212
213 kfree(sbi);
214}
215
216enum {
217 Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
218 Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err,
219};
220
221static match_table_t tokens = {
222 {Opt_integrity, "integrity"},
223 {Opt_nointegrity, "nointegrity"},
224 {Opt_iocharset, "iocharset=%s"},
225 {Opt_resize, "resize=%u"},
226 {Opt_resize_nosize, "resize"},
227 {Opt_errors, "errors=%s"},
228 {Opt_ignore, "noquota"},
229 {Opt_ignore, "quota"},
230 {Opt_ignore, "usrquota"},
231 {Opt_ignore, "grpquota"},
232 {Opt_err, NULL}
233};
234
235static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
236 int *flag)
237{
238 void *nls_map = (void *)-1; /* -1: no change; NULL: none */
239 char *p;
240 struct jfs_sb_info *sbi = JFS_SBI(sb);
241
242 *newLVSize = 0;
243
244 if (!options)
245 return 1;
246
247 while ((p = strsep(&options, ",")) != NULL) {
248 substring_t args[MAX_OPT_ARGS];
249 int token;
250 if (!*p)
251 continue;
252
253 token = match_token(p, tokens, args);
254 switch (token) {
255 case Opt_integrity:
256 *flag &= ~JFS_NOINTEGRITY;
257 break;
258 case Opt_nointegrity:
259 *flag |= JFS_NOINTEGRITY;
260 break;
261 case Opt_ignore:
262 /* Silently ignore the quota options */
263 /* Don't do anything ;-) */
264 break;
265 case Opt_iocharset:
266 if (nls_map && nls_map != (void *) -1)
267 unload_nls(nls_map);
268 if (!strcmp(args[0].from, "none"))
269 nls_map = NULL;
270 else {
271 nls_map = load_nls(args[0].from);
272 if (!nls_map) {
273 printk(KERN_ERR
274 "JFS: charset not found\n");
275 goto cleanup;
276 }
277 }
278 break;
279 case Opt_resize:
280 {
281 char *resize = args[0].from;
282 *newLVSize = simple_strtoull(resize, &resize, 0);
283 break;
284 }
285 case Opt_resize_nosize:
286 {
287 *newLVSize = sb->s_bdev->bd_inode->i_size >>
288 sb->s_blocksize_bits;
289 if (*newLVSize == 0)
290 printk(KERN_ERR
291 "JFS: Cannot determine volume size\n");
292 break;
293 }
294 case Opt_errors:
295 {
296 char *errors = args[0].from;
297 if (!errors || !*errors)
298 goto cleanup;
299 if (!strcmp(errors, "continue")) {
300 *flag &= ~JFS_ERR_REMOUNT_RO;
301 *flag &= ~JFS_ERR_PANIC;
302 *flag |= JFS_ERR_CONTINUE;
303 } else if (!strcmp(errors, "remount-ro")) {
304 *flag &= ~JFS_ERR_CONTINUE;
305 *flag &= ~JFS_ERR_PANIC;
306 *flag |= JFS_ERR_REMOUNT_RO;
307 } else if (!strcmp(errors, "panic")) {
308 *flag &= ~JFS_ERR_CONTINUE;
309 *flag &= ~JFS_ERR_REMOUNT_RO;
310 *flag |= JFS_ERR_PANIC;
311 } else {
312 printk(KERN_ERR
313 "JFS: %s is an invalid error handler\n",
314 errors);
315 goto cleanup;
316 }
317 break;
318 }
319 default:
320			printk("jfs: Unrecognized mount option \"%s\""
321			       " or missing value\n", p);
322 goto cleanup;
323 }
324 }
325
326 if (nls_map != (void *) -1) {
327 /* Discard old (if remount) */
328 if (sbi->nls_tab)
329 unload_nls(sbi->nls_tab);
330 sbi->nls_tab = nls_map;
331 }
332 return 1;
333
334cleanup:
335 if (nls_map && nls_map != (void *) -1)
336 unload_nls(nls_map);
337 return 0;
338}
339
340static int jfs_remount(struct super_block *sb, int *flags, char *data)
341{
342 s64 newLVSize = 0;
343 int rc = 0;
344 int flag = JFS_SBI(sb)->flag;
345
346 if (!parse_options(data, sb, &newLVSize, &flag)) {
347 return -EINVAL;
348 }
349 if (newLVSize) {
350 if (sb->s_flags & MS_RDONLY) {
351 printk(KERN_ERR
352 "JFS: resize requires volume to be mounted read-write\n");
353 return -EROFS;
354 }
355 rc = jfs_extendfs(sb, newLVSize, 0);
356 if (rc)
357 return rc;
358 }
359
360 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
361 JFS_SBI(sb)->flag = flag;
362 return jfs_mount_rw(sb, 1);
363 }
364 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
365 rc = jfs_umount_rw(sb);
366 JFS_SBI(sb)->flag = flag;
367 return rc;
368 }
369 if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
370 if (!(sb->s_flags & MS_RDONLY)) {
371 rc = jfs_umount_rw(sb);
372 if (rc)
373 return rc;
374 JFS_SBI(sb)->flag = flag;
375 return jfs_mount_rw(sb, 1);
376 }
377 JFS_SBI(sb)->flag = flag;
378
379 return 0;
380}
381
382static int jfs_fill_super(struct super_block *sb, void *data, int silent)
383{
384 struct jfs_sb_info *sbi;
385 struct inode *inode;
386 int rc;
387 s64 newLVSize = 0;
388 int flag;
389
390	jfs_info("In jfs_fill_super: s_flags=0x%lx", sb->s_flags);
391
392 if (!new_valid_dev(sb->s_bdev->bd_dev))
393 return -EOVERFLOW;
394
395 sbi = kmalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
396 if (!sbi)
397		return -ENOMEM;
398 memset(sbi, 0, sizeof (struct jfs_sb_info));
399 sb->s_fs_info = sbi;
400 sbi->sb = sb;
401
402 /* initialize the mount flag and determine the default error handler */
403 flag = JFS_ERR_REMOUNT_RO;
404
405 if (!parse_options((char *) data, sb, &newLVSize, &flag)) {
406 kfree(sbi);
407 return -EINVAL;
408 }
409 sbi->flag = flag;
410
411#ifdef CONFIG_JFS_POSIX_ACL
412 sb->s_flags |= MS_POSIXACL;
413#endif
414
415 if (newLVSize) {
416 printk(KERN_ERR "resize option for remount only\n");
417		kfree(sbi); return -EINVAL; /* don't leak sbi on early failure */
418 }
419
420 /*
421 * Initialize blocksize to 4K.
422 */
423 sb_set_blocksize(sb, PSIZE);
424
425 /*
426 * Set method vectors.
427 */
428 sb->s_op = &jfs_super_operations;
429 sb->s_export_op = &jfs_export_operations;
430
431 rc = jfs_mount(sb);
432 if (rc) {
433 if (!silent) {
434 jfs_err("jfs_mount failed w/return code = %d", rc);
435 }
436 goto out_kfree;
437 }
438 if (sb->s_flags & MS_RDONLY)
439 sbi->log = NULL;
440 else {
441 rc = jfs_mount_rw(sb, 0);
442 if (rc) {
443 if (!silent) {
444 jfs_err("jfs_mount_rw failed, return code = %d",
445 rc);
446 }
447 goto out_no_rw;
448 }
449 }
450
451 sb->s_magic = JFS_SUPER_MAGIC;
452
453 inode = iget(sb, ROOT_I);
454 if (!inode || is_bad_inode(inode))
455 goto out_no_root;
456 sb->s_root = d_alloc_root(inode);
457 if (!sb->s_root)
458 goto out_no_root;
459
460 if (sbi->mntflag & JFS_OS2)
461 sb->s_root->d_op = &jfs_ci_dentry_operations;
462
463 /* logical blocks are represented by 40 bits in pxd_t, etc. */
464 sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
465#if BITS_PER_LONG == 32
466 /*
467 * Page cache is indexed by long.
468 * I would use MAX_LFS_FILESIZE, but it's only half as big
469 */
470 sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes);
471#endif
472 sb->s_time_gran = 1;
473 return 0;
474
475out_no_root:
476	jfs_err("jfs_fill_super: get root inode failed");
477 if (inode)
478 iput(inode);
479
480out_no_rw:
481 rc = jfs_umount(sb);
482 if (rc) {
483 jfs_err("jfs_umount failed with return code %d", rc);
484 }
485out_kfree:
486 if (sbi->nls_tab)
487 unload_nls(sbi->nls_tab);
488 kfree(sbi);
489 return -EINVAL;
490}
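
The s_maxbytes arithmetic in jfs_fill_super works out as follows for a 4K block size: 40 bits of block address give 2^52 bytes (4 PiB), and on 32-bit hosts the page-cache index caps this further (the sketch assumes a 4K PAGE_CACHE_SIZE):

#include <stdio.h>

int main(void)
{
        unsigned long long bsize = 4096;
        unsigned long long maxbytes = bsize << 40;       /* 2^52 = 4 PiB */
        unsigned long long cap32 = (4096ULL << 32) - 1;  /* 2^44 - 1 */

        if (cap32 < maxbytes)
                maxbytes = cap32;        /* 32-bit hosts: just under 16 TiB */
        printf("0x%llx\n", maxbytes);
        return 0;
}
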
491
492static void jfs_write_super_lockfs(struct super_block *sb)
493{
494 struct jfs_sb_info *sbi = JFS_SBI(sb);
495 struct jfs_log *log = sbi->log;
496
497 if (!(sb->s_flags & MS_RDONLY)) {
498 txQuiesce(sb);
499 lmLogShutdown(log);
500 updateSuper(sb, FM_CLEAN);
501 }
502}
503
504static void jfs_unlockfs(struct super_block *sb)
505{
506 struct jfs_sb_info *sbi = JFS_SBI(sb);
507 struct jfs_log *log = sbi->log;
508 int rc = 0;
509
510 if (!(sb->s_flags & MS_RDONLY)) {
511 updateSuper(sb, FM_MOUNT);
512 if ((rc = lmLogInit(log)))
513 jfs_err("jfs_unlock failed with return code %d", rc);
514 else
515 txResume(sb);
516 }
517}
518
519static struct super_block *jfs_get_sb(struct file_system_type *fs_type,
520 int flags, const char *dev_name, void *data)
521{
522 return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
523}
524
525static int jfs_sync_fs(struct super_block *sb, int wait)
526{
527 struct jfs_log *log = JFS_SBI(sb)->log;
528
529 /* log == NULL indicates read-only mount */
530 if (log)
531 jfs_flush_journal(log, wait);
532
533 return 0;
534}
535
536static struct super_operations jfs_super_operations = {
537 .alloc_inode = jfs_alloc_inode,
538 .destroy_inode = jfs_destroy_inode,
539 .read_inode = jfs_read_inode,
540 .dirty_inode = jfs_dirty_inode,
541 .write_inode = jfs_write_inode,
542 .delete_inode = jfs_delete_inode,
543 .put_super = jfs_put_super,
544 .sync_fs = jfs_sync_fs,
545 .write_super_lockfs = jfs_write_super_lockfs,
546 .unlockfs = jfs_unlockfs,
547 .statfs = jfs_statfs,
548 .remount_fs = jfs_remount,
549};
550
551static struct export_operations jfs_export_operations = {
552 .get_parent = jfs_get_parent,
553};
554
555static struct file_system_type jfs_fs_type = {
556 .owner = THIS_MODULE,
557 .name = "jfs",
558 .get_sb = jfs_get_sb,
559 .kill_sb = kill_block_super,
560 .fs_flags = FS_REQUIRES_DEV,
561};
562
563extern int metapage_init(void);
564extern int txInit(void);
565extern void txExit(void);
566extern void metapage_exit(void);
567
568static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
569{
570 struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
571
572 if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
573 SLAB_CTOR_CONSTRUCTOR) {
574 memset(jfs_ip, 0, sizeof(struct jfs_inode_info));
575 INIT_LIST_HEAD(&jfs_ip->anon_inode_list);
576 init_rwsem(&jfs_ip->rdwrlock);
577 init_MUTEX(&jfs_ip->commit_sem);
578 init_rwsem(&jfs_ip->xattr_sem);
579 spin_lock_init(&jfs_ip->ag_lock);
580 jfs_ip->active_ag = -1;
581#ifdef CONFIG_JFS_POSIX_ACL
582 jfs_ip->i_acl = JFS_ACL_NOT_CACHED;
583 jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED;
584#endif
585 inode_init_once(&jfs_ip->vfs_inode);
586 }
587}
588
589static int __init init_jfs_fs(void)
590{
591 int i;
592 int rc;
593
594 jfs_inode_cachep =
595 kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
596 SLAB_RECLAIM_ACCOUNT, init_once, NULL);
597 if (jfs_inode_cachep == NULL)
598 return -ENOMEM;
599
600 /*
601 * Metapage initialization
602 */
603 rc = metapage_init();
604 if (rc) {
605 jfs_err("metapage_init failed w/rc = %d", rc);
606 goto free_slab;
607 }
608
609 /*
610 * Transaction Manager initialization
611 */
612 rc = txInit();
613 if (rc) {
614 jfs_err("txInit failed w/rc = %d", rc);
615 goto free_metapage;
616 }
617
618 /*
619 * I/O completion thread (endio)
620 */
621 jfsIOthread = kernel_thread(jfsIOWait, NULL, CLONE_KERNEL);
622 if (jfsIOthread < 0) {
623 jfs_err("init_jfs_fs: fork failed w/rc = %d", jfsIOthread);
624		rc = jfsIOthread; goto end_txmngr; /* don't return 0 on failure */
625 }
626 wait_for_completion(&jfsIOwait); /* Wait until thread starts */
627
628 if (commit_threads < 1)
629 commit_threads = num_online_cpus();
630 if (commit_threads > MAX_COMMIT_THREADS)
631 commit_threads = MAX_COMMIT_THREADS;
632
633 for (i = 0; i < commit_threads; i++) {
634 jfsCommitThread[i] = kernel_thread(jfs_lazycommit, NULL,
635 CLONE_KERNEL);
636 if (jfsCommitThread[i] < 0) {
637 jfs_err("init_jfs_fs: fork failed w/rc = %d",
638 jfsCommitThread[i]);
639 commit_threads = i;
640			rc = jfsCommitThread[i]; goto kill_committask;
641 }
642 /* Wait until thread starts */
643 wait_for_completion(&jfsIOwait);
644 }
645
646 jfsSyncThread = kernel_thread(jfs_sync, NULL, CLONE_KERNEL);
647 if (jfsSyncThread < 0) {
648 jfs_err("init_jfs_fs: fork failed w/rc = %d", jfsSyncThread);
649		rc = jfsSyncThread; goto kill_committask;
650 }
651 wait_for_completion(&jfsIOwait); /* Wait until thread starts */
652
653#ifdef PROC_FS_JFS
654 jfs_proc_init();
655#endif
656
657 return register_filesystem(&jfs_fs_type);
658
659kill_committask:
660 jfs_stop_threads = 1;
661 wake_up_all(&jfs_commit_thread_wait);
662 for (i = 0; i < commit_threads; i++)
663 wait_for_completion(&jfsIOwait);
664
665 wake_up(&jfs_IO_thread_wait);
666 wait_for_completion(&jfsIOwait); /* Wait for thread exit */
667end_txmngr:
668 txExit();
669free_metapage:
670 metapage_exit();
671free_slab:
672 kmem_cache_destroy(jfs_inode_cachep);
673 return rc;
674}
675
676static void __exit exit_jfs_fs(void)
677{
678 int i;
679
680 jfs_info("exit_jfs_fs called");
681
682 jfs_stop_threads = 1;
683 txExit();
684 metapage_exit();
685 wake_up(&jfs_IO_thread_wait);
686 wait_for_completion(&jfsIOwait); /* Wait until IO thread exits */
687 wake_up_all(&jfs_commit_thread_wait);
688 for (i = 0; i < commit_threads; i++)
689 wait_for_completion(&jfsIOwait);
690 wake_up(&jfs_sync_thread_wait);
691 wait_for_completion(&jfsIOwait); /* Wait until Sync thread exits */
692#ifdef PROC_FS_JFS
693 jfs_proc_clean();
694#endif
695 unregister_filesystem(&jfs_fs_type);
696 kmem_cache_destroy(jfs_inode_cachep);
697}
698
699module_init(init_jfs_fs)
700module_exit(exit_jfs_fs)
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
new file mode 100644
index 000000000000..ef4c07ee92b2
--- /dev/null
+++ b/fs/jfs/symlink.c
@@ -0,0 +1,39 @@
1/*
2 * Copyright (c) Christoph Hellwig, 2001-2002
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#include <linux/fs.h>
20#include <linux/namei.h>
21#include "jfs_incore.h"
22#include "jfs_xattr.h"
23
24static int jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
25{
26 char *s = JFS_IP(dentry->d_inode)->i_inline;
27 nd_set_link(nd, s);
28 return 0;
29}
30
31struct inode_operations jfs_symlink_inode_operations = {
32 .readlink = generic_readlink,
33 .follow_link = jfs_follow_link,
34 .setxattr = jfs_setxattr,
35 .getxattr = jfs_getxattr,
36 .listxattr = jfs_listxattr,
37 .removexattr = jfs_removexattr,
38};
39
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
new file mode 100644
index 000000000000..7a9ffd5d03dc
--- /dev/null
+++ b/fs/jfs/xattr.c
@@ -0,0 +1,1127 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2000-2004
3 * Copyright (C) Christoph Hellwig, 2002
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/fs.h>
21#include <linux/xattr.h>
22#include <linux/quotaops.h>
23#include "jfs_incore.h"
24#include "jfs_superblock.h"
25#include "jfs_dmap.h"
26#include "jfs_debug.h"
27#include "jfs_dinode.h"
28#include "jfs_extent.h"
29#include "jfs_metapage.h"
30#include "jfs_xattr.h"
31#include "jfs_acl.h"
32
33/*
34 * jfs_xattr.c: extended attribute service
35 *
36 * Overall design --
37 *
38 * Format:
39 *
40 * Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit
41 * value) and a variable (0 or more) number of extended attribute
42 * entries.  Each extended attribute entry (jfs_ea) is a <name,value> pair
43 * where <name> is constructed from a null-terminated ascii string
44 * (1 ... 255 bytes in the name) and <value> is arbitrary 8 bit data
45 * (1 ... 65535 bytes). The in-memory format is
46 *
47 * 0 1 2 4 4 + namelen + 1
48 * +-------+--------+--------+----------------+-------------------+
49 * | Flags | Name | Value | Name String \0 | Data . . . . |
50 * | | Length | Length | | |
51 * +-------+--------+--------+----------------+-------------------+
52 *
53 * A jfs_ea_list then is structured as
54 *
55 * 0 4 4 + EA_SIZE(ea1)
56 * +------------+-------------------+--------------------+-----
57 * | Overall EA | First FEA Element | Second FEA Element | .....
58 * | List Size | | |
59 * +------------+-------------------+--------------------+-----
60 *
61 * On-disk:
62 *
63 * FEALISTs are stored on disk using blocks allocated by dbAlloc() and
64 * written directly. An EA list may be in-lined in the inode if there is
65 * sufficient room available.
66 */
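
A sketch of the entry layout described above, using plain stdint types in place of the kernel's. The field names are assumed to follow jfs_xattr.h and are restated here purely for illustration:

#include <stdint.h>

struct jfs_ea_sketch {
        uint8_t  flag;       /* offset 0 */
        uint8_t  namelen;    /* offset 1: 1..255 */
        uint16_t valuelen;   /* offset 2: stored little-endian on disk */
        char     name[];     /* offset 4: name, '\0', then value bytes */
};

/* matches the 4 + namelen + 1 (+ value) sizing in the diagram above */
#define EA_ENTRY_BYTES(ea) (4 + (ea)->namelen + 1 + (ea)->valuelen)
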
67
68struct ea_buffer {
69 int flag; /* Indicates what storage xattr points to */
70 int max_size; /* largest xattr that fits in current buffer */
71 dxd_t new_ea; /* dxd to replace ea when modifying xattr */
72 struct metapage *mp; /* metapage containing ea list */
73 struct jfs_ea_list *xattr; /* buffer containing ea list */
74};
75
76/*
77 * ea_buffer.flag values
78 */
79#define EA_INLINE 0x0001
80#define EA_EXTENT 0x0002
81#define EA_NEW 0x0004
82#define EA_MALLOC 0x0008
83
84/* Namespaces */
85#define XATTR_SYSTEM_PREFIX "system."
86#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1)
87
88#define XATTR_USER_PREFIX "user."
89#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1)
90
91#define XATTR_OS2_PREFIX "os2."
92#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1)
93
94/* XATTR_SECURITY_PREFIX is defined in include/linux/xattr.h */
95#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1)
96
97#define XATTR_TRUSTED_PREFIX "trusted."
98#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1)
99
100/*
101 * These three routines are used to recognize on-disk extended attributes
102 * that are in a recognized namespace. If the attribute is not recognized,
103 * "os2." is prepended to the name
104 */
105static inline int is_os2_xattr(struct jfs_ea *ea)
106{
107 /*
108 * Check for "system."
109 */
110 if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
111 !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
112 return FALSE;
113 /*
114 * Check for "user."
115 */
116 if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
117 !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
118 return FALSE;
119 /*
120 * Check for "security."
121 */
122 if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
123 !strncmp(ea->name, XATTR_SECURITY_PREFIX,
124 XATTR_SECURITY_PREFIX_LEN))
125 return FALSE;
126 /*
127 * Check for "trusted."
128 */
129 if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
130 !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
131 return FALSE;
132 /*
133 * Add any other valid namespace prefixes here
134 */
135
136 /*
137 * We assume it's OS/2's flat namespace
138 */
139 return TRUE;
140}
141
142static inline int name_size(struct jfs_ea *ea)
143{
144 if (is_os2_xattr(ea))
145 return ea->namelen + XATTR_OS2_PREFIX_LEN;
146 else
147 return ea->namelen;
148}
149
150static inline int copy_name(char *buffer, struct jfs_ea *ea)
151{
152 int len = ea->namelen;
153
154 if (is_os2_xattr(ea)) {
155 memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN);
156 buffer += XATTR_OS2_PREFIX_LEN;
157 len += XATTR_OS2_PREFIX_LEN;
158 }
159 memcpy(buffer, ea->name, ea->namelen);
160 buffer[ea->namelen] = 0;
161
162 return len;
163}
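
The effect of the three routines above on names read from disk: recognized prefixes pass through, everything else is presented under "os2.". A toy stand-in for name_size()/copy_name() (the real code works on length-prefixed, non-terminated names; this sketch uses C strings for brevity):

#include <stdio.h>
#include <string.h>

static const char *present(const char *disk_name, char *buf, size_t n)
{
        static const char *known[] = {
                "system.", "user.", "security.", "trusted."
        };
        size_t i;

        for (i = 0; i < sizeof(known) / sizeof(known[0]); i++)
                if (!strncmp(disk_name, known[i], strlen(known[i])))
                        return disk_name;          /* recognized prefix */
        snprintf(buf, n, "os2.%s", disk_name);     /* everything else */
        return buf;
}

int main(void)
{
        char buf[64];

        printf("%s\n", present("user.comment", buf, sizeof(buf)));
        printf("%s\n", present("LONGNAME", buf, sizeof(buf)));
        /* prints: user.comment, then os2.LONGNAME */
        return 0;
}
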
164
165/* Forward references */
166static void ea_release(struct inode *inode, struct ea_buffer *ea_buf);
167
168/*
169 * NAME: ea_write_inline
170 *
171 * FUNCTION: Attempt to write an EA inline if area is available
172 *
173 * PRE CONDITIONS:
174 * Already verified that the specified EA is small enough to fit inline
175 *
176 * PARAMETERS:
177 * ip - Inode pointer
178 * ealist - EA list pointer
179 * size - size of ealist in bytes
180 * ea - dxd_t structure to be filled in with necessary EA information
181 * if we successfully copy the EA inline
182 *
183 * NOTES:
184 * Checks if the inode's inline area is available. If so, copies EA inline
185 * and sets <ea> fields appropriately. Otherwise, returns failure, EA will
186 * have to be put into an extent.
187 *
188 * RETURNS: 0 for successful copy to inline area; -EPERM if area not available
189 */
190static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist,
191 int size, dxd_t * ea)
192{
193 struct jfs_inode_info *ji = JFS_IP(ip);
194
195 /*
196 * Make sure we have an EA -- the NULL EA list is valid, but you
197 * can't copy it!
198 */
199 if (ealist && size > sizeof (struct jfs_ea_list)) {
200 assert(size <= sizeof (ji->i_inline_ea));
201
202 /*
203 * See if the space is available or if it is already being
204 * used for an inline EA.
205 */
206 if (!(ji->mode2 & INLINEEA) && !(ji->ea.flag & DXD_INLINE))
207 return -EPERM;
208
209 DXDsize(ea, size);
210 DXDlength(ea, 0);
211 DXDaddress(ea, 0);
212 memcpy(ji->i_inline_ea, ealist, size);
213 ea->flag = DXD_INLINE;
214 ji->mode2 &= ~INLINEEA;
215 } else {
216 ea->flag = 0;
217 DXDsize(ea, 0);
218 DXDlength(ea, 0);
219 DXDaddress(ea, 0);
220
221 /* Free up INLINE area */
222 if (ji->ea.flag & DXD_INLINE)
223 ji->mode2 |= INLINEEA;
224 }
225
226 return 0;
227}
228
229/*
230 * NAME: ea_write
231 *
232 * FUNCTION: Write an EA for an inode
233 *
234 * PRE CONDITIONS: EA has been verified
235 *
236 * PARAMETERS:
237 * ip - Inode pointer
238 * ealist - EA list pointer
239 * size - size of ealist in bytes
240 * ea - dxd_t structure to be filled in appropriately with where the
241 * EA was copied
242 *
243 * NOTES: Will write EA inline if able to, otherwise allocates blocks for an
244 * extent and synchronously writes it to those blocks.
245 *
246 * RETURNS: 0 for success; Anything else indicates failure
247 */
248static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
249 dxd_t * ea)
250{
251 struct super_block *sb = ip->i_sb;
252 struct jfs_inode_info *ji = JFS_IP(ip);
253 struct jfs_sb_info *sbi = JFS_SBI(sb);
254 int nblocks;
255 s64 blkno;
256 int rc = 0, i;
257 char *cp;
258 s32 nbytes, nb;
259 s32 bytes_to_write;
260 struct metapage *mp;
261
262 /*
263 * Quick check to see if this is an in-linable EA. Short EAs
264 * and empty EAs are all in-linable, provided the space exists.
265 */
266 if (!ealist || size <= sizeof (ji->i_inline_ea)) {
267 if (!ea_write_inline(ip, ealist, size, ea))
268 return 0;
269 }
270
271 /* figure out how many blocks we need */
272 nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
273
274 /* Allocate new blocks to quota. */
275 if (DQUOT_ALLOC_BLOCK(ip, nblocks)) {
276 return -EDQUOT;
277 }
278
279 rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
280 if (rc) {
281 /* Rollback quota allocation. */
282 DQUOT_FREE_BLOCK(ip, nblocks);
283 return rc;
284 }
285
286 /*
287 * Now we have nblocks worth of storage to hold the FEALIST. Loop
288 * over the FEALIST, copying data into the buffer one page at
289 * a time.
290 */
291 cp = (char *) ealist;
292 nbytes = size;
293 for (i = 0; i < nblocks; i += sbi->nbperpage) {
294 /*
295 * Determine how many bytes for this request, and round up to
296 * the nearest aggregate block size
297 */
298 nb = min(PSIZE, nbytes);
299 bytes_to_write =
300 ((nb + sb->s_blocksize - 1) >> sb->s_blocksize_bits)
301 << sb->s_blocksize_bits;
302
303 if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) {
304 rc = -EIO;
305 goto failed;
306 }
307
308 memcpy(mp->data, cp, nb);
309
310 /*
311 * We really need a way to propagate errors for
312 * forced writes like this one. --hch
313 *
314 * (__write_metapage => release_metapage => flush_metapage)
315 */
316#ifdef _JFS_FIXME
317 if ((rc = flush_metapage(mp))) {
318 /*
319 * the write failed -- this means that the buffer
320 * is still assigned and the blocks are not being
321 * used. this seems like the best error recovery
322 * we can get ...
323 */
324 goto failed;
325 }
326#else
327 flush_metapage(mp);
328#endif
329
330 cp += PSIZE;
331 nbytes -= nb;
332 }
333
334 ea->flag = DXD_EXTENT;
335 DXDsize(ea, le32_to_cpu(ealist->size));
336 DXDlength(ea, nblocks);
337 DXDaddress(ea, blkno);
338
339 /* Free up INLINE area */
340 if (ji->ea.flag & DXD_INLINE)
341 ji->mode2 |= INLINEEA;
342
343 return 0;
344
345 failed:
346 /* Rollback quota allocation. */
347 DQUOT_FREE_BLOCK(ip, nblocks);
348
349 dbFree(ip, blkno, nblocks);
350 return rc;
351}
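/*
 * Worked example with illustrative numbers, assuming a 4 KiB block size
 * so that PSIZE == s_blocksize and nbperpage == 1: a 6000-byte ealist
 * yields nblocks = 2.  The first pass through the loop fills a 4096-byte
 * metapage; the second copies the remaining 1904 bytes into a second
 * metapage whose I/O size is rounded back up to the 4096-byte block.
 */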
352
353/*
354 * NAME: ea_read_inline
355 *
356 * FUNCTION: Read an inlined EA into user's buffer
357 *
358 * PARAMETERS:
359 * ip - Inode pointer
360 * ealist - Pointer to buffer to fill in with EA
361 *
362 * RETURNS: 0
363 */
364static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist)
365{
366 struct jfs_inode_info *ji = JFS_IP(ip);
367 int ea_size = sizeDXD(&ji->ea);
368
369 if (ea_size == 0) {
370 ealist->size = 0;
371 return 0;
372 }
373
374 /* Sanity Check */
375 if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea)))
376 return -EIO;
377 if (le32_to_cpu(((struct jfs_ea_list *) &ji->i_inline_ea)->size)
378 != ea_size)
379 return -EIO;
380
381 memcpy(ealist, ji->i_inline_ea, ea_size);
382 return 0;
383}
384
385/*
386 * NAME: ea_read
387 *
388 * FUNCTION: copy EA data into user's buffer
389 *
390 * PARAMETERS:
391 * ip - Inode pointer
392 * ealist - Pointer to buffer to fill in with EA
393 *
394 * NOTES: If the EA is inline, calls ea_read_inline() to copy it.
395 *
396 * RETURNS: 0 for success; other indicates failure
397 */
398static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
399{
400 struct super_block *sb = ip->i_sb;
401 struct jfs_inode_info *ji = JFS_IP(ip);
402 struct jfs_sb_info *sbi = JFS_SBI(sb);
403 int nblocks;
404 s64 blkno;
405 char *cp = (char *) ealist;
406 int i;
407 int nbytes, nb;
408 s32 bytes_to_read;
409 struct metapage *mp;
410
411 /* quick check for in-line EA */
412 if (ji->ea.flag & DXD_INLINE)
413 return ea_read_inline(ip, ealist);
414
415 nbytes = sizeDXD(&ji->ea);
416 if (!nbytes) {
417 jfs_error(sb, "ea_read: nbytes is 0");
418 return -EIO;
419 }
420
421 /*
422 * Figure out how many blocks were allocated when this EA list was
423 * originally written to disk.
424 */
425 nblocks = lengthDXD(&ji->ea) << sbi->l2nbperpage;
426 blkno = addressDXD(&ji->ea) << sbi->l2nbperpage;
427
428 /*
429 * We have found the disk blocks which were originally used to store
430 * the FEALIST. Now loop over each contiguous block, copying the
431 * data into the buffer.
432 */
433 for (i = 0; i < nblocks; i += sbi->nbperpage) {
434 /*
435 * Determine how many bytes for this request, and round up to
436 * the nearest aggregate block size
437 */
438 nb = min(PSIZE, nbytes);
439 bytes_to_read =
440 ((nb + sb->s_blocksize - 1) >> sb->s_blocksize_bits)
441 << sb->s_blocksize_bits;
442
443 if (!(mp = read_metapage(ip, blkno + i, bytes_to_read, 1)))
444 return -EIO;
445
446 memcpy(cp, mp->data, nb);
447 release_metapage(mp);
448
449 cp += PSIZE;
450 nbytes -= nb;
451 }
452
453 return 0;
454}
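/*
 * Note on the l2nbperpage shifts above: when the block size equals
 * PSIZE (4 KiB, the configuration this code is built for), nbperpage
 * is 1 and l2nbperpage is 0, so the conversions are no-ops and the
 * read loop mirrors the write loop in ea_write().
 */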
455
456/*
457 * NAME: ea_get
458 *
459 * FUNCTION: Returns a buffer containing the existing extended attributes.
460 * The size of the buffer will be the larger of the existing
461 * attributes' size and min_size.
462 *
463 * The buffer, which may be inlined in the inode or in the
464 * page cache, must be released by calling ea_release or ea_put.
465 *
466 * PARAMETERS:
467 * inode - Inode pointer
468 * ea_buf - Structure to be populated with ealist and its metadata
469 * min_size - minimum size of buffer to be returned
470 *
471 * RETURNS: size of the existing EA list (0 if none) on success; negative
472 */
473static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
474{
475 struct jfs_inode_info *ji = JFS_IP(inode);
476 struct super_block *sb = inode->i_sb;
477 int size;
478 int ea_size = sizeDXD(&ji->ea);
479 int blocks_needed, current_blocks;
480 s64 blkno;
481 int rc;
482 int quota_allocation = 0;
483
484 /* When fsck.jfs clears a bad ea, it doesn't clear the size */
485 if (ji->ea.flag == 0)
486 ea_size = 0;
487
488 if (ea_size == 0) {
489 if (min_size == 0) {
490 ea_buf->flag = 0;
491 ea_buf->max_size = 0;
492 ea_buf->xattr = NULL;
493 return 0;
494 }
495 if ((min_size <= sizeof (ji->i_inline_ea)) &&
496 (ji->mode2 & INLINEEA)) {
497 ea_buf->flag = EA_INLINE | EA_NEW;
498 ea_buf->max_size = sizeof (ji->i_inline_ea);
499 ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea;
500 DXDlength(&ea_buf->new_ea, 0);
501 DXDaddress(&ea_buf->new_ea, 0);
502 ea_buf->new_ea.flag = DXD_INLINE;
503 DXDsize(&ea_buf->new_ea, min_size);
504 return 0;
505 }
506 current_blocks = 0;
507 } else if (ji->ea.flag & DXD_INLINE) {
508 if (min_size <= sizeof (ji->i_inline_ea)) {
509 ea_buf->flag = EA_INLINE;
510 ea_buf->max_size = sizeof (ji->i_inline_ea);
511 ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea;
512 goto size_check;
513 }
514 current_blocks = 0;
515 } else {
516 if (!(ji->ea.flag & DXD_EXTENT)) {
517 jfs_error(sb, "ea_get: invalid ea.flag");
518 return -EIO;
519 }
520 current_blocks = (ea_size + sb->s_blocksize - 1) >>
521 sb->s_blocksize_bits;
522 }
523 size = max(min_size, ea_size);
524
525 if (size > PSIZE) {
526 /*
527 * To keep the rest of the code simple, allocate a
528 * contiguous buffer to work with.
529 */
530 ea_buf->xattr = kmalloc(size, GFP_KERNEL);
531 if (ea_buf->xattr == NULL)
532 return -ENOMEM;
533
534 ea_buf->flag = EA_MALLOC;
535 ea_buf->max_size = (size + sb->s_blocksize - 1) &
536 ~(sb->s_blocksize - 1);
537
538 if (ea_size == 0)
539 return 0;
540
541 if ((rc = ea_read(inode, ea_buf->xattr))) {
542 kfree(ea_buf->xattr);
543 ea_buf->xattr = NULL;
544 return rc;
545 }
546 goto size_check;
547 }
548 blocks_needed = (min_size + sb->s_blocksize - 1) >>
549 sb->s_blocksize_bits;
550
551 if (blocks_needed > current_blocks) {
552 /* Allocate new blocks to quota. */
553 if (DQUOT_ALLOC_BLOCK(inode, blocks_needed))
554 return -EDQUOT;
555
556 quota_allocation = blocks_needed;
557
558 rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed,
559 &blkno);
560 if (rc)
561 goto clean_up;
562
563 DXDlength(&ea_buf->new_ea, blocks_needed);
564 DXDaddress(&ea_buf->new_ea, blkno);
565 ea_buf->new_ea.flag = DXD_EXTENT;
566 DXDsize(&ea_buf->new_ea, min_size);
567
568 ea_buf->flag = EA_EXTENT | EA_NEW;
569
570 ea_buf->mp = get_metapage(inode, blkno,
571 blocks_needed << sb->s_blocksize_bits,
572 1);
573 if (ea_buf->mp == NULL) {
574 dbFree(inode, blkno, (s64) blocks_needed);
575 rc = -EIO;
576 goto clean_up;
577 }
578 ea_buf->xattr = ea_buf->mp->data;
579 ea_buf->max_size = (min_size + sb->s_blocksize - 1) &
580 ~(sb->s_blocksize - 1);
581 if (ea_size == 0)
582 return 0;
583 if ((rc = ea_read(inode, ea_buf->xattr))) {
584 discard_metapage(ea_buf->mp);
585 dbFree(inode, blkno, (s64) blocks_needed);
586 goto clean_up;
587 }
588 goto size_check;
589 }
590 ea_buf->flag = EA_EXTENT;
591 ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea),
592 lengthDXD(&ji->ea) << sb->s_blocksize_bits,
593 1);
594 if (ea_buf->mp == NULL) {
595 rc = -EIO;
596 goto clean_up;
597 }
598 ea_buf->xattr = ea_buf->mp->data;
599 ea_buf->max_size = (ea_size + sb->s_blocksize - 1) &
600 ~(sb->s_blocksize - 1);
601
602 size_check:
603 if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
604 printk(KERN_ERR "ea_get: invalid extended attribute\n");
605 dump_mem("xattr", ea_buf->xattr, ea_size);
606 ea_release(inode, ea_buf);
607 rc = -EIO;
608 goto clean_up;
609 }
610
611 return ea_size;
612
613 clean_up:
614 /* Rollback quota allocation */
615 if (quota_allocation)
616 DQUOT_FREE_BLOCK(inode, quota_allocation);
617
618 return rc;
619}
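/*
 * Summary of the states ea_get() can leave ea_buf in, gathered from the
 * paths above: EA_INLINE - xattr points at the inode's inline area;
 * EA_MALLOC - xattr is a kmalloc'd copy, used once the list outgrows
 * PSIZE; EA_EXTENT - xattr points into a metapage; EA_NEW - the buffer
 * was created rather than read, so for extents ea_release() must give
 * the freshly allocated blocks back.
 */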
620
621static void ea_release(struct inode *inode, struct ea_buffer *ea_buf)
622{
623 if (ea_buf->flag & EA_MALLOC)
624 kfree(ea_buf->xattr);
625 else if (ea_buf->flag & EA_EXTENT) {
626 assert(ea_buf->mp);
627 release_metapage(ea_buf->mp);
628
629 if (ea_buf->flag & EA_NEW)
630 dbFree(inode, addressDXD(&ea_buf->new_ea),
631 lengthDXD(&ea_buf->new_ea));
632 }
633}
634
635static int ea_put(struct inode *inode, struct ea_buffer *ea_buf, int new_size)
636{
637 struct jfs_inode_info *ji = JFS_IP(inode);
638 unsigned long old_blocks, new_blocks;
639 int rc = 0;
640 tid_t tid;
641
642 if (new_size == 0) {
643 ea_release(inode, ea_buf);
644 ea_buf = NULL;
645 } else if (ea_buf->flag & EA_INLINE) {
646 assert(new_size <= sizeof (ji->i_inline_ea));
647 ji->mode2 &= ~INLINEEA;
648 ea_buf->new_ea.flag = DXD_INLINE;
649 DXDsize(&ea_buf->new_ea, new_size);
650 DXDaddress(&ea_buf->new_ea, 0);
651 DXDlength(&ea_buf->new_ea, 0);
652 } else if (ea_buf->flag & EA_MALLOC) {
653 rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea);
654 kfree(ea_buf->xattr);
655 } else if (ea_buf->flag & EA_NEW) {
656 /* We have already allocated a new dxd */
657 flush_metapage(ea_buf->mp);
658 } else {
659 /* ->xattr must point to original ea's metapage */
660 rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea);
661 discard_metapage(ea_buf->mp);
662 }
663 if (rc)
664 return rc;
665
666 tid = txBegin(inode->i_sb, 0);
667 down(&ji->commit_sem);
668
669 old_blocks = new_blocks = 0;
670
671 if (ji->ea.flag & DXD_EXTENT) {
672 invalidate_dxd_metapages(inode, ji->ea);
673 old_blocks = lengthDXD(&ji->ea);
674 }
675
676 if (ea_buf) {
677 txEA(tid, inode, &ji->ea, &ea_buf->new_ea);
678 if (ea_buf->new_ea.flag & DXD_EXTENT) {
679 new_blocks = lengthDXD(&ea_buf->new_ea);
680 if (ji->ea.flag & DXD_INLINE)
681 ji->mode2 |= INLINEEA;
682 }
683 ji->ea = ea_buf->new_ea;
684 } else {
685 txEA(tid, inode, &ji->ea, NULL);
686 if (ji->ea.flag & DXD_INLINE)
687 ji->mode2 |= INLINEEA;
688 ji->ea.flag = 0;
689 ji->ea.size = 0;
690 }
691
692 /* If old blocks exist, they must be removed from quota allocation. */
693 if (old_blocks)
694 DQUOT_FREE_BLOCK(inode, old_blocks);
695
696 inode->i_ctime = CURRENT_TIME;
697 rc = txCommit(tid, 1, &inode, 0);
698 txEnd(tid);
699 up(&ji->commit_sem);
700
701 return rc;
702}
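/*
 * Commit ordering, as one reading of the code above: the new EA data is
 * written out before the transaction begins; the transaction itself,
 * under commit_sem, only records the switch of the inode's dxd from the
 * old EA to the new one via txEA().  Quota for the old extent is
 * released once that switch has been made.
 */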
703
704/*
705 * can_set_system_xattr
706 *
707 * This code is specific to the system.* namespace. It contains policy
708 * which doesn't belong in the main xattr codepath.
709 */
710static int can_set_system_xattr(struct inode *inode, const char *name,
711 const void *value, size_t value_len)
712{
713#ifdef CONFIG_JFS_POSIX_ACL
714 struct posix_acl *acl;
715 int rc;
716
717 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
718 return -EPERM;
719
720 /*
721 * XATTR_NAME_ACL_ACCESS is tied to i_mode
722 */
723 if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) {
724 acl = posix_acl_from_xattr(value, value_len);
725 if (IS_ERR(acl)) {
726 rc = PTR_ERR(acl);
727 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
728 rc);
729 return rc;
730 }
731 if (acl) {
732 mode_t mode = inode->i_mode;
733 rc = posix_acl_equiv_mode(acl, &mode);
734 posix_acl_release(acl);
735 if (rc < 0) {
736 printk(KERN_ERR
737 "posix_acl_equiv_mode returned %d\n",
738 rc);
739 return rc;
740 }
741 inode->i_mode = mode;
742 mark_inode_dirty(inode);
743 }
744 /*
745 * We're changing the ACL. Get rid of the cached one
746 */
747 acl = JFS_IP(inode)->i_acl;
748 if (acl != JFS_ACL_NOT_CACHED)
749 posix_acl_release(acl);
750 JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED;
751
752 return 0;
753 } else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0) {
754 acl = posix_acl_from_xattr(value, value_len);
755 if (IS_ERR(acl)) {
756 rc = PTR_ERR(acl);
757 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
758 rc);
759 return rc;
760 }
761 posix_acl_release(acl);
762
763 /*
764 * We're changing the default ACL. Get rid of the cached one
765 */
766 acl = JFS_IP(inode)->i_default_acl;
767 if (acl && (acl != JFS_ACL_NOT_CACHED))
768 posix_acl_release(acl);
769 JFS_IP(inode)->i_default_acl = JFS_ACL_NOT_CACHED;
770
771 return 0;
772 }
773#endif /* CONFIG_JFS_POSIX_ACL */
774 return -EOPNOTSUPP;
775}
776
777static int can_set_xattr(struct inode *inode, const char *name,
778 const void *value, size_t value_len)
779{
780 if (IS_RDONLY(inode))
781 return -EROFS;
782
783 if (IS_IMMUTABLE(inode) || IS_APPEND(inode) || S_ISLNK(inode->i_mode))
784 return -EPERM;
785
786 if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
787 /*
788 * "system.*"
789 */
790 return can_set_system_xattr(inode, name, value, value_len);
791
792 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0)
793 return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
794
795#ifdef CONFIG_JFS_SECURITY
796 if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)
797 == 0)
798 return 0; /* Leave it to the security module */
799#endif
800
801 if ((strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0) &&
802 (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) != 0))
803 return -EOPNOTSUPP;
804
805 if (!S_ISREG(inode->i_mode) &&
806 (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
807 return -EPERM;
808
809 return permission(inode, MAY_WRITE, NULL);
810}
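/*
 * Policy summary for setting attributes, per the checks above:
 *
 *	system.*	handled by can_set_system_xattr() (ACLs)
 *	trusted.*	requires CAP_SYS_ADMIN
 *	security.*	allowed here when CONFIG_JFS_SECURITY is set; the
 *			security module makes the real decision
 *	user.*, os2.*	regular files and non-sticky directories only,
 *			subject to ordinary write permission
 *	anything else	rejected with -EOPNOTSUPP
 */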
811
812int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
813 size_t value_len, int flags)
814{
815 struct jfs_ea_list *ealist;
816 struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL;
817 struct ea_buffer ea_buf;
818 int old_ea_size = 0;
819 int xattr_size;
820 int new_size;
821 int namelen = strlen(name);
822 char *os2name = NULL;
823 int found = 0;
824 int rc;
825 int length;
826
827 if ((rc = can_set_xattr(inode, name, value, value_len)))
828 return rc;
829
830 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
831 os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
832 GFP_KERNEL);
833 if (!os2name)
834 return -ENOMEM;
835 strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
836 name = os2name;
837 namelen -= XATTR_OS2_PREFIX_LEN;
838 }
839
840 down_write(&JFS_IP(inode)->xattr_sem);
841
842 xattr_size = ea_get(inode, &ea_buf, 0);
843 if (xattr_size < 0) {
844 rc = xattr_size;
845 goto out;
846 }
847
848 again:
849 ealist = (struct jfs_ea_list *) ea_buf.xattr;
850 new_size = sizeof (struct jfs_ea_list);
851
852 if (xattr_size) {
853 for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist);
854 ea = NEXT_EA(ea)) {
855 if ((namelen == ea->namelen) &&
856 (memcmp(name, ea->name, namelen) == 0)) {
857 found = 1;
858 if (flags & XATTR_CREATE) {
859 rc = -EEXIST;
860 goto release;
861 }
862 old_ea = ea;
863 old_ea_size = EA_SIZE(ea);
864 next_ea = NEXT_EA(ea);
865 } else
866 new_size += EA_SIZE(ea);
867 }
868 }
869
870 if (!found) {
871 if (flags & XATTR_REPLACE) {
872 rc = -ENODATA;
873 goto release;
874 }
875 if (value == NULL) {
876 rc = 0;
877 goto release;
878 }
879 }
880 if (value)
881 new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len;
882
883 if (new_size > ea_buf.max_size) {
884 /*
885 * We need to allocate more space for the merged ea list.
886 * We should only loop back to again: once.
887 */
888 ea_release(inode, &ea_buf);
889 xattr_size = ea_get(inode, &ea_buf, new_size);
890 if (xattr_size < 0) {
891 rc = xattr_size;
892 goto out;
893 }
894 goto again;
895 }
896
897 /* Remove old ea of the same name */
898 if (found) {
899 /* number of bytes following target EA */
900 length = (char *) END_EALIST(ealist) - (char *) next_ea;
901 if (length > 0)
902 memmove(old_ea, next_ea, length);
903 xattr_size -= old_ea_size;
904 }
905
906 /* Add new entry to the end */
907 if (value) {
908 if (xattr_size == 0)
909 /* Completely new ea list */
910 xattr_size = sizeof (struct jfs_ea_list);
911
912 ea = (struct jfs_ea *) ((char *) ealist + xattr_size);
913 ea->flag = 0;
914 ea->namelen = namelen;
915 ea->valuelen = cpu_to_le16(value_len);
916 memcpy(ea->name, name, namelen);
917 ea->name[namelen] = 0;
918 if (value_len)
919 memcpy(&ea->name[namelen + 1], value, value_len);
920 xattr_size += EA_SIZE(ea);
921 }
922
923 /* DEBUG - If we did this right, these numbers match */
924 if (xattr_size != new_size) {
925 printk(KERN_ERR
926 "jfs_xsetattr: xattr_size = %d, new_size = %d\n",
927 xattr_size, new_size);
928
929 rc = -EINVAL;
930 goto release;
931 }
932
933 /*
934 * If we're left with an empty list, there's no ea
935 */
936 if (new_size == sizeof (struct jfs_ea_list))
937 new_size = 0;
938
939 ealist->size = cpu_to_le32(new_size);
940
941 rc = ea_put(inode, &ea_buf, new_size);
942
943 goto out;
944 release:
945 ea_release(inode, &ea_buf);
946 out:
947 up_write(&JFS_IP(inode)->xattr_sem);
948
949 if (os2name)
950 kfree(os2name);
951
952 return rc;
953}
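/*
 * Usage sketch (hypothetical attribute and value): a caller holding the
 * inode could do
 *
 *	rc = __jfs_setxattr(inode, "user.comment", "data", 4, XATTR_CREATE);
 *
 * which fails with -EEXIST if "user.comment" already exists, while
 * XATTR_REPLACE instead fails with -ENODATA if it does not.  A NULL
 * value together with XATTR_REPLACE removes the attribute, which is
 * what jfs_removexattr() below relies on.
 */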
954
955int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
956 size_t value_len, int flags)
957{
958 if (value == NULL) { /* empty EA, do not remove */
959 value = "";
960 value_len = 0;
961 }
962
963 return __jfs_setxattr(dentry->d_inode, name, value, value_len, flags);
964}
965
966static int can_get_xattr(struct inode *inode, const char *name)
967{
968#ifdef CONFIG_JFS_SECURITY
969 if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0)
970 return 0;
971#endif
972
973 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0)
974 return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
975
976 if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
977 return 0;
978
979 return permission(inode, MAY_READ, NULL);
980}
981
982ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
983 size_t buf_size)
984{
985 struct jfs_ea_list *ealist;
986 struct jfs_ea *ea;
987 struct ea_buffer ea_buf;
988 int xattr_size;
989 ssize_t size;
990 int namelen = strlen(name);
991 char *os2name = NULL;
992 int rc;
993 char *value;
994
995 if ((rc = can_get_xattr(inode, name)))
996 return rc;
997
998 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
999 os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
1000 GFP_KERNEL);
1001 if (!os2name)
1002 return -ENOMEM;
1003 strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
1004 name = os2name;
1005 namelen -= XATTR_OS2_PREFIX_LEN;
1006 }
1007
1008 down_read(&JFS_IP(inode)->xattr_sem);
1009
1010 xattr_size = ea_get(inode, &ea_buf, 0);
1011
1012 if (xattr_size < 0) {
1013 size = xattr_size;
1014 goto out;
1015 }
1016
1017 if (xattr_size == 0)
1018 goto not_found;
1019
1020 ealist = (struct jfs_ea_list *) ea_buf.xattr;
1021
1022 /* Find the named attribute */
1023 for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea))
1024 if ((namelen == ea->namelen) &&
1025 memcmp(name, ea->name, namelen) == 0) {
1026 /* Found it */
1027 size = le16_to_cpu(ea->valuelen);
1028 if (!data)
1029 goto release;
1030 else if (size > buf_size) {
1031 size = -ERANGE;
1032 goto release;
1033 }
1034 value = ((char *) &ea->name) + ea->namelen + 1;
1035 memcpy(data, value, size);
1036 goto release;
1037 }
1038 not_found:
1039 size = -ENODATA;
1040 release:
1041 ea_release(inode, &ea_buf);
1042 out:
1043 up_read(&JFS_IP(inode)->xattr_sem);
1044
1045 if (os2name)
1046 kfree(os2name);
1047
1048 return size;
1049}
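/*
 * Note: passing data == NULL returns the attribute's size without
 * copying anything, which is how callers size a buffer before making
 * the real call.
 */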
1050
1051ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
1052 size_t buf_size)
1053{
1054 int err;
1055
1056 err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
1057
1058 return err;
1059}
1060
1061/*
1062 * No special permissions are needed to list attributes except for trusted.*
1063 */
1064static inline int can_list(struct jfs_ea *ea)
1065{
1066 return (strncmp(ea->name, XATTR_TRUSTED_PREFIX,
1067 XATTR_TRUSTED_PREFIX_LEN) ||
1068 capable(CAP_SYS_ADMIN));
1069}
1070
1071ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
1072{
1073 struct inode *inode = dentry->d_inode;
1074 char *buffer;
1075 ssize_t size = 0;
1076 int xattr_size;
1077 struct jfs_ea_list *ealist;
1078 struct jfs_ea *ea;
1079 struct ea_buffer ea_buf;
1080
1081 down_read(&JFS_IP(inode)->xattr_sem);
1082
1083 xattr_size = ea_get(inode, &ea_buf, 0);
1084 if (xattr_size < 0) {
1085 size = xattr_size;
1086 goto out;
1087 }
1088
1089 if (xattr_size == 0)
1090 goto release;
1091
1092 ealist = (struct jfs_ea_list *) ea_buf.xattr;
1093
1094 /* compute required size of list */
1095 for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) {
1096 if (can_list(ea))
1097 size += name_size(ea) + 1;
1098 }
1099
1100 if (!data)
1101 goto release;
1102
1103 if (size > buf_size) {
1104 size = -ERANGE;
1105 goto release;
1106 }
1107
1108 /* Copy attribute names to buffer */
1109 buffer = data;
1110 for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) {
1111 if (can_list(ea)) {
1112 int namelen = copy_name(buffer, ea);
1113 buffer += namelen + 1;
1114 }
1115 }
1116
1117 release:
1118 ea_release(inode, &ea_buf);
1119 out:
1120 up_read(&JFS_IP(inode)->xattr_sem);
1121 return size;
1122}
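/*
 * The buffer filled in above uses the usual listxattr format: a packed
 * run of null-terminated names, e.g. "user.comment\0os2.ICON\0" (names
 * illustrative).  Passing data == NULL returns only the required size.
 */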
1123
1124int jfs_removexattr(struct dentry *dentry, const char *name)
1125{
1126 return __jfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
1127}