diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 18:20:36 -0400 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jfs |
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'fs/jfs')
44 files changed, 33077 insertions, 0 deletions
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile new file mode 100644 index 000000000000..6f1e0e95587a --- /dev/null +++ b/fs/jfs/Makefile | |||
@@ -0,0 +1,15 @@ | |||
1 | # | ||
2 | # Makefile for the Linux JFS filesystem routines. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_JFS_FS) += jfs.o | ||
6 | |||
7 | jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ | ||
8 | jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ | ||
9 | jfs_unicode.o jfs_dtree.o jfs_inode.o \ | ||
10 | jfs_extent.o symlink.o jfs_metapage.o \ | ||
11 | jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o resize.o xattr.o | ||
12 | |||
13 | jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o | ||
14 | |||
15 | EXTRA_CFLAGS += -D_JFS_4K | ||
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c new file mode 100644 index 000000000000..8d2a9ab981d4 --- /dev/null +++ b/fs/jfs/acl.c | |||
@@ -0,0 +1,234 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2002-2004 | ||
3 | * Copyright (C) Andreas Gruenbacher, 2001 | ||
4 | * Copyright (C) Linus Torvalds, 1991, 1992 | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
14 | * the GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
19 | */ | ||
20 | |||
21 | #include <linux/sched.h> | ||
22 | #include <linux/fs.h> | ||
23 | #include <linux/quotaops.h> | ||
24 | #include "jfs_incore.h" | ||
25 | #include "jfs_xattr.h" | ||
26 | #include "jfs_acl.h" | ||
27 | |||
/*
 * jfs_get_acl - fetch the POSIX ACL of the given type for an inode
 *
 * @inode: inode whose ACL is wanted
 * @type:  ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * The type selects both the extended-attribute name and the per-inode
 * cache slot (i_acl or i_default_acl).  If the slot holds anything other
 * than JFS_ACL_NOT_CACHED, posix_acl_dup() of the cached value is returned
 * without touching the extended attribute.  Otherwise the attribute is
 * read in two steps (size query with a NULL buffer, then the real read)
 * and converted with posix_acl_from_xattr(); on success a duplicated
 * reference is stored in the cache slot and the ACL is returned.
 *
 * Returns the ACL, NULL when the attribute read reports -ENODATA (NULL is
 * then cached too), or an ERR_PTR: -EINVAL for an unknown type, -ENOMEM
 * when the read buffer cannot be allocated, or the negative value returned
 * by __jfs_getxattr.  Callers in this file drop the returned reference
 * with posix_acl_release().
 */
28 | static struct posix_acl *jfs_get_acl(struct inode *inode, int type) | ||
29 | { | ||
30 | struct posix_acl *acl; | ||
31 | char *ea_name; | ||
32 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
33 | struct posix_acl **p_acl; | ||
34 | int size; | ||
35 | char *value = NULL; | ||
36 | |||
37 | switch(type) { | ||
38 | case ACL_TYPE_ACCESS: | ||
39 | ea_name = XATTR_NAME_ACL_ACCESS; | ||
40 | p_acl = &ji->i_acl; | ||
41 | break; | ||
42 | case ACL_TYPE_DEFAULT: | ||
43 | ea_name = XATTR_NAME_ACL_DEFAULT; | ||
44 | p_acl = &ji->i_default_acl; | ||
45 | break; | ||
46 | default: | ||
47 | return ERR_PTR(-EINVAL); | ||
48 | } | ||
49 | |||
/* Cache hit: hand back another reference to whatever is cached. */
50 | if (*p_acl != JFS_ACL_NOT_CACHED) | ||
51 | return posix_acl_dup(*p_acl); | ||
52 | |||
/* First call passes no buffer; its result is used as the buffer size. */
53 | size = __jfs_getxattr(inode, ea_name, NULL, 0); | ||
54 | |||
55 | if (size > 0) { | ||
56 | value = kmalloc(size, GFP_KERNEL); | ||
57 | if (!value) | ||
58 | return ERR_PTR(-ENOMEM); | ||
59 | size = __jfs_getxattr(inode, ea_name, value, size); | ||
60 | } | ||
61 | |||
62 | if (size < 0) { | ||
/* Only -ENODATA is cached (as NULL); other errors leave the slot alone. */
63 | if (size == -ENODATA) { | ||
64 | *p_acl = NULL; | ||
65 | acl = NULL; | ||
66 | } else | ||
67 | acl = ERR_PTR(size); | ||
68 | } else { | ||
69 | acl = posix_acl_from_xattr(value, size); | ||
70 | if (!IS_ERR(acl)) | ||
71 | *p_acl = posix_acl_dup(acl); | ||
72 | } | ||
/* NOTE(review): kfree(NULL) is normally a no-op, so this guard looks redundant — confirm. */
73 | if (value) | ||
74 | kfree(value); | ||
75 | return acl; | ||
76 | } | ||
77 | |||
/*
 * jfs_set_acl - store a POSIX ACL as an extended attribute and cache it
 *
 * @inode: inode to modify
 * @type:  ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 * @acl:   ACL to store; may be NULL
 *
 * A non-NULL acl is serialised with posix_acl_to_xattr() into a temporary
 * buffer of xattr_acl_size(acl->a_count) bytes and written with
 * __jfs_setxattr().  A NULL acl is written as a NULL value of size 0
 * (presumably removing the attribute — confirm against __jfs_setxattr).
 * When the write succeeds, any real pointer in the cache slot is released
 * and replaced by posix_acl_dup(acl).
 *
 * Returns 0 on success; -EOPNOTSUPP for symlinks; -EINVAL for an unknown
 * type; -EACCES when a default ACL is supplied for a non-directory (0 if
 * acl is NULL in that case); -ENOMEM; or the error from
 * posix_acl_to_xattr() / __jfs_setxattr().
 */
78 | static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
79 | { | ||
80 | char *ea_name; | ||
81 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
82 | struct posix_acl **p_acl; | ||
83 | int rc; | ||
84 | int size = 0; | ||
85 | char *value = NULL; | ||
86 | |||
87 | if (S_ISLNK(inode->i_mode)) | ||
88 | return -EOPNOTSUPP; | ||
89 | |||
90 | switch(type) { | ||
91 | case ACL_TYPE_ACCESS: | ||
92 | ea_name = XATTR_NAME_ACL_ACCESS; | ||
93 | p_acl = &ji->i_acl; | ||
94 | break; | ||
95 | case ACL_TYPE_DEFAULT: | ||
96 | ea_name = XATTR_NAME_ACL_DEFAULT; | ||
97 | p_acl = &ji->i_default_acl; | ||
/* Default ACLs are only accepted on directories. */
98 | if (!S_ISDIR(inode->i_mode)) | ||
99 | return acl ? -EACCES : 0; | ||
100 | break; | ||
101 | default: | ||
102 | return -EINVAL; | ||
103 | } | ||
104 | if (acl) { | ||
105 | size = xattr_acl_size(acl->a_count); | ||
106 | value = kmalloc(size, GFP_KERNEL); | ||
107 | if (!value) | ||
108 | return -ENOMEM; | ||
109 | rc = posix_acl_to_xattr(acl, value, size); | ||
110 | if (rc < 0) | ||
111 | goto out; | ||
112 | } | ||
113 | rc = __jfs_setxattr(inode, ea_name, value, size, 0); | ||
114 | out: | ||
115 | if (value) | ||
116 | kfree(value); | ||
117 | |||
/* Update the cache only after the attribute write succeeded. */
118 | if (!rc) { | ||
119 | if (*p_acl && (*p_acl != JFS_ACL_NOT_CACHED)) | ||
120 | posix_acl_release(*p_acl); | ||
121 | *p_acl = posix_acl_dup(acl); | ||
122 | } | ||
123 | return rc; | ||
124 | } | ||
125 | |||
/*
 * jfs_check_acl - ACL permission callback handed to generic_permission()
 *
 * If the access ACL is not cached yet, jfs_get_acl() is called purely to
 * populate ji->i_acl; the reference it returns is released immediately.
 * When an access ACL exists the decision is delegated to
 * posix_acl_permission(); otherwise -EAGAIN is returned (presumably
 * telling generic_permission() to fall back to the mode bits — confirm).
 * Errors from jfs_get_acl() are returned as-is.
 */
126 | static int jfs_check_acl(struct inode *inode, int mask) | ||
127 | { | ||
128 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
129 | |||
130 | if (ji->i_acl == JFS_ACL_NOT_CACHED) { | ||
131 | struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); | ||
132 | if (IS_ERR(acl)) | ||
133 | return PTR_ERR(acl); | ||
134 | posix_acl_release(acl); | ||
135 | } | ||
136 | |||
137 | if (ji->i_acl) | ||
138 | return posix_acl_permission(inode, ji->i_acl, mask); | ||
139 | return -EAGAIN; | ||
140 | } | ||
141 | |||
/*
 * jfs_permission - permission check for JFS inodes
 *
 * Thin wrapper: calls generic_permission() with jfs_check_acl as the ACL
 * callback and returns its result.  The nameidata argument is not used.
 */
142 | int jfs_permission(struct inode *inode, int mask, struct nameidata *nd) | ||
143 | { | ||
144 | return generic_permission(inode, mask, jfs_check_acl); | ||
145 | } | ||
146 | |||
/*
 * jfs_init_acl - set up the ACLs and mode of an inode from a directory
 *
 * @inode: inode being initialised
 * @dir:   directory whose default ACL is consulted
 *
 * Symlinks are left untouched (returns 0).  If @dir has no default ACL,
 * the current process umask is cleared from inode->i_mode.  Otherwise:
 *   - a directory inode receives the same ACL as its own default ACL;
 *   - a clone of the ACL is passed through posix_acl_create_masq()
 *     together with the inode mode; a non-negative result stores the
 *     adjusted mode, and a positive result additionally stores the clone
 *     as the access ACL.
 *
 * Returns 0 on success or a negative error from jfs_get_acl(),
 * jfs_set_acl(), posix_acl_create_masq(), or -ENOMEM if cloning fails.
 */
147 | int jfs_init_acl(struct inode *inode, struct inode *dir) | ||
148 | { | ||
149 | struct posix_acl *acl = NULL; | ||
150 | struct posix_acl *clone; | ||
151 | mode_t mode; | ||
152 | int rc = 0; | ||
153 | |||
154 | if (S_ISLNK(inode->i_mode)) | ||
155 | return 0; | ||
156 | |||
157 | acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); | ||
158 | if (IS_ERR(acl)) | ||
159 | return PTR_ERR(acl); | ||
160 | |||
161 | if (acl) { | ||
162 | if (S_ISDIR(inode->i_mode)) { | ||
163 | rc = jfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); | ||
164 | if (rc) | ||
165 | goto cleanup; | ||
166 | } | ||
167 | clone = posix_acl_clone(acl, GFP_KERNEL); | ||
168 | if (!clone) { | ||
169 | rc = -ENOMEM; | ||
170 | goto cleanup; | ||
171 | } | ||
172 | mode = inode->i_mode; | ||
173 | rc = posix_acl_create_masq(clone, &mode); | ||
174 | if (rc >= 0) { | ||
175 | inode->i_mode = mode; | ||
176 | if (rc > 0) | ||
177 | rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone); | ||
178 | } | ||
179 | posix_acl_release(clone); | ||
/* The label sits inside the if-block: every exit path here drops the dir ACL reference. */
180 | cleanup: | ||
181 | posix_acl_release(acl); | ||
182 | } else | ||
183 | inode->i_mode &= ~current->fs->umask; | ||
184 | |||
185 | return rc; | ||
186 | } | ||
187 | |||
/*
 * jfs_acl_chmod - bring the access ACL in line with a changed inode mode
 *
 * Returns -EOPNOTSUPP for symlinks.  Otherwise the access ACL is fetched;
 * if that fails or there is no ACL, PTR_ERR(acl) is returned (for a NULL
 * acl this presumably evaluates to 0, i.e. nothing to do — confirm).
 * A clone of the ACL is adjusted with posix_acl_chmod_masq() against
 * inode->i_mode and, if that succeeds, written back with jfs_set_acl().
 * Both the original reference and the clone are released before return.
 */
188 | static int jfs_acl_chmod(struct inode *inode) | ||
189 | { | ||
190 | struct posix_acl *acl, *clone; | ||
191 | int rc; | ||
192 | |||
193 | if (S_ISLNK(inode->i_mode)) | ||
194 | return -EOPNOTSUPP; | ||
195 | |||
196 | acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); | ||
197 | if (IS_ERR(acl) || !acl) | ||
198 | return PTR_ERR(acl); | ||
199 | |||
200 | clone = posix_acl_clone(acl, GFP_KERNEL); | ||
201 | posix_acl_release(acl); | ||
202 | if (!clone) | ||
203 | return -ENOMEM; | ||
204 | |||
205 | rc = posix_acl_chmod_masq(clone, inode->i_mode); | ||
206 | if (!rc) | ||
207 | rc = jfs_set_acl(inode, ACL_TYPE_ACCESS, clone); | ||
208 | |||
209 | posix_acl_release(clone); | ||
210 | return rc; | ||
211 | } | ||
212 | |||
/*
 * jfs_setattr - apply attribute changes to an inode
 *
 * Steps, in order:
 *   1. inode_change_ok() validates the request; its error is returned.
 *   2. If the request changes the owner or group to a different value,
 *      DQUOT_TRANSFER() is invoked; a non-zero result returns -EDQUOT.
 *   3. inode_setattr() applies the change.
 *   4. If that succeeded and the mode was part of the request,
 *      jfs_acl_chmod() is called and its result returned.
 */
213 | int jfs_setattr(struct dentry *dentry, struct iattr *iattr) | ||
214 | { | ||
215 | struct inode *inode = dentry->d_inode; | ||
216 | int rc; | ||
217 | |||
218 | rc = inode_change_ok(inode, iattr); | ||
219 | if (rc) | ||
220 | return rc; | ||
221 | |||
222 | if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || | ||
223 | (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { | ||
224 | if (DQUOT_TRANSFER(inode, iattr)) | ||
225 | return -EDQUOT; | ||
226 | } | ||
227 | |||
228 | rc = inode_setattr(inode, iattr); | ||
229 | |||
230 | if (!rc && (iattr->ia_valid & ATTR_MODE)) | ||
231 | rc = jfs_acl_chmod(inode); | ||
232 | |||
233 | return rc; | ||
234 | } | ||
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h new file mode 100644 index 000000000000..ab7cd0567c95 --- /dev/null +++ b/fs/jfs/endian24.h | |||
@@ -0,0 +1,49 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2001 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_ENDIAN24 | ||
19 | #define _H_ENDIAN24 | ||
20 | |||
21 | /* | ||
22 | * endian24.h: | ||
23 | * | ||
24 | * Endian conversion for 24-bit data | ||
25 | * | ||
26 | */ | ||
/*
 * __swab24(x): reverse the byte order of the low three bytes of x.
 * Byte 0 and byte 2 trade places, byte 1 stays where it is, and bits
 * 24-31 of the input do not appear in the result.  The argument is
 * copied into a local __u32 first, so it is evaluated only once.
 */
27 | #define __swab24(x) \ | ||
28 | ({ \ | ||
29 | __u32 __x = (x); \ | ||
30 | ((__u32)( \ | ||
31 | ((__x & (__u32)0x000000ffUL) << 16) | \ | ||
32 | (__x & (__u32)0x0000ff00UL) | \ | ||
33 | ((__x & (__u32)0x00ff0000UL) >> 16) )); \ | ||
34 | }) | ||
35 | |||
/*
 * CPU <-> little-endian conversion for 24-bit values.  When the build is
 * identified as little-endian (kernel __LITTLE_ENDIAN, or __BYTE_ORDER
 * equal to __LITTLE_ENDIAN) the conversion is just a cast to __u32;
 * otherwise both directions go through __swab24().
 */
36 | #if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) | ||
37 | #define __cpu_to_le24(x) ((__u32)(x)) | ||
38 | #define __le24_to_cpu(x) ((__u32)(x)) | ||
39 | #else | ||
40 | #define __cpu_to_le24(x) __swab24(x) | ||
41 | #define __le24_to_cpu(x) __swab24(x) | ||
42 | #endif | ||
43 | |||
/* The unprefixed names are only provided for kernel builds. */
44 | #ifdef __KERNEL__ | ||
45 | #define cpu_to_le24 __cpu_to_le24 | ||
46 | #define le24_to_cpu __le24_to_cpu | ||
47 | #endif | ||
48 | |||
49 | #endif /* !_H_ENDIAN24 */ | ||
diff --git a/fs/jfs/file.c b/fs/jfs/file.c new file mode 100644 index 000000000000..a87b06fa8ff8 --- /dev/null +++ b/fs/jfs/file.c | |||
@@ -0,0 +1,119 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * Portions Copyright (c) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include "jfs_incore.h" | ||
22 | #include "jfs_dmap.h" | ||
23 | #include "jfs_txnmgr.h" | ||
24 | #include "jfs_xattr.h" | ||
25 | #include "jfs_acl.h" | ||
26 | #include "jfs_debug.h" | ||
27 | |||
28 | |||
29 | extern int jfs_commit_inode(struct inode *, int); | ||
30 | extern void jfs_truncate(struct inode *); | ||
31 | |||
/*
 * jfs_fsync - fsync entry point for JFS files
 *
 * If the inode is not dirty (or, for a datasync request, carries no
 * I_DIRTY_DATASYNC state) there is nothing to commit: the journal is
 * flushed (second argument 1; presumably "wait" — confirm against
 * jfs_flush_journal) and 0 is returned.  Otherwise the inode is committed
 * synchronously via jfs_commit_inode(); any non-zero result is reported
 * to the caller as -EIO.  The file argument is not used.
 */
32 | int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) | ||
33 | { | ||
34 | struct inode *inode = dentry->d_inode; | ||
35 | int rc = 0; | ||
36 | |||
37 | if (!(inode->i_state & I_DIRTY) || | ||
38 | (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { | ||
39 | /* Make sure committed changes hit the disk */ | ||
40 | jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); | ||
41 | return rc; | ||
42 | } | ||
43 | |||
44 | rc |= jfs_commit_inode(inode, 1); | ||
45 | |||
46 | return rc ? -EIO : 0; | ||
47 | } | ||
48 | |||
/*
 * jfs_open - open handler for JFS files
 *
 * Runs generic_file_open() first and returns its error, if any.  For a
 * regular file opened for writing whose size is 0, and which has no
 * active allocation group yet (active_ag == -1), the inode's own group
 * (agno) becomes its active group and that group's db_active counter in
 * the block map is incremented.  The update is made under ag_lock.
 * Always returns 0 once generic_file_open() has succeeded.
 */
49 | static int jfs_open(struct inode *inode, struct file *file) | ||
50 | { | ||
51 | int rc; | ||
52 | |||
53 | if ((rc = generic_file_open(inode, file))) | ||
54 | return rc; | ||
55 | |||
56 | /* | ||
57 | * We attempt to allow only one "active" file open per aggregate | ||
58 | * group. Otherwise, appending to files in parallel can cause | ||
59 | * fragmentation within the files. | ||
60 | * | ||
61 | * If the file is empty, it was probably just created and going | ||
62 | * to be written to. If it has a size, we'll hold off until the | ||
63 | * file is actually grown. | ||
64 | */ | ||
65 | if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && | ||
66 | (inode->i_size == 0)) { | ||
67 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
68 | spin_lock_irq(&ji->ag_lock); | ||
69 | if (ji->active_ag == -1) { | ||
70 | ji->active_ag = ji->agno; | ||
71 | atomic_inc( | ||
72 | &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]); | ||
73 | } | ||
74 | spin_unlock_irq(&ji->ag_lock); | ||
75 | } | ||
76 | |||
77 | return 0; | ||
78 | } | ||
/*
 * jfs_release - release handler for JFS files
 *
 * Under ag_lock: if the inode currently has an active allocation group,
 * that group's db_active counter is decremented and active_ag is reset to
 * -1.  The file argument is not examined.  Always returns 0.
 */
79 | static int jfs_release(struct inode *inode, struct file *file) | ||
80 | { | ||
81 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
82 | |||
83 | spin_lock_irq(&ji->ag_lock); | ||
84 | if (ji->active_ag != -1) { | ||
85 | struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; | ||
86 | atomic_dec(&bmap->db_active[ji->active_ag]); | ||
87 | ji->active_ag = -1; | ||
88 | } | ||
89 | spin_unlock_irq(&ji->ag_lock); | ||
90 | |||
91 | return 0; | ||
92 | } | ||
93 | |||
/*
 * Inode operations for JFS files: truncate plus the four extended
 * attribute hooks.  The setattr and permission hooks are installed only
 * when CONFIG_JFS_POSIX_ACL is enabled.
 */
94 | struct inode_operations jfs_file_inode_operations = { | ||
95 | .truncate = jfs_truncate, | ||
96 | .setxattr = jfs_setxattr, | ||
97 | .getxattr = jfs_getxattr, | ||
98 | .listxattr = jfs_listxattr, | ||
99 | .removexattr = jfs_removexattr, | ||
100 | #ifdef CONFIG_JFS_POSIX_ACL | ||
101 | .setattr = jfs_setattr, | ||
102 | .permission = jfs_permission, | ||
103 | #endif | ||
104 | }; | ||
105 | |||
/*
 * File operations for JFS files.  Only open, fsync and release are
 * JFS-specific (jfs_open, jfs_fsync, jfs_release); every other entry
 * points at a generic_file_* helper.
 */
106 | struct file_operations jfs_file_operations = { | ||
107 | .open = jfs_open, | ||
108 | .llseek = generic_file_llseek, | ||
109 | .write = generic_file_write, | ||
110 | .read = generic_file_read, | ||
111 | .aio_read = generic_file_aio_read, | ||
112 | .aio_write = generic_file_aio_write, | ||
113 | .mmap = generic_file_mmap, | ||
114 | .readv = generic_file_readv, | ||
115 | .writev = generic_file_writev, | ||
116 | .sendfile = generic_file_sendfile, | ||
117 | .fsync = jfs_fsync, | ||
118 | .release = jfs_release, | ||
119 | }; | ||
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c new file mode 100644 index 000000000000..7bc906677b0d --- /dev/null +++ b/fs/jfs/inode.c | |||
@@ -0,0 +1,384 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include <linux/mpage.h> | ||
22 | #include <linux/buffer_head.h> | ||
23 | #include <linux/pagemap.h> | ||
24 | #include <linux/quotaops.h> | ||
25 | #include "jfs_incore.h" | ||
26 | #include "jfs_filsys.h" | ||
27 | #include "jfs_imap.h" | ||
28 | #include "jfs_extent.h" | ||
29 | #include "jfs_unicode.h" | ||
30 | #include "jfs_debug.h" | ||
31 | |||
32 | |||
33 | extern struct inode_operations jfs_dir_inode_operations; | ||
34 | extern struct inode_operations jfs_file_inode_operations; | ||
35 | extern struct inode_operations jfs_symlink_inode_operations; | ||
36 | extern struct file_operations jfs_dir_operations; | ||
37 | extern struct file_operations jfs_file_operations; | ||
38 | struct address_space_operations jfs_aops; | ||
39 | extern int freeZeroLink(struct inode *); | ||
40 | |||
/*
 * jfs_read_inode - fill in an inode and attach the matching operations
 *
 * diRead() loads the inode; on failure the inode is marked bad and
 * nothing else is done.  The operation tables are then chosen by type:
 *   - regular file: file inode/file operations and jfs_aops;
 *   - directory:    directory inode/file operations;
 *   - symlink:      page_symlink_inode_operations with jfs_aops when
 *                   i_size >= IDATASIZE, else jfs_symlink_inode_operations
 *                   (presumably the target is stored inline in that case
 *                   — confirm);
 *   - anything else: file inode operations plus init_special_inode().
 */
41 | void jfs_read_inode(struct inode *inode) | ||
42 | { | ||
43 | if (diRead(inode)) { | ||
44 | make_bad_inode(inode); | ||
45 | return; | ||
46 | } | ||
47 | |||
48 | if (S_ISREG(inode->i_mode)) { | ||
49 | inode->i_op = &jfs_file_inode_operations; | ||
50 | inode->i_fop = &jfs_file_operations; | ||
51 | inode->i_mapping->a_ops = &jfs_aops; | ||
52 | } else if (S_ISDIR(inode->i_mode)) { | ||
53 | inode->i_op = &jfs_dir_inode_operations; | ||
54 | inode->i_fop = &jfs_dir_operations; | ||
55 | } else if (S_ISLNK(inode->i_mode)) { | ||
56 | if (inode->i_size >= IDATASIZE) { | ||
57 | inode->i_op = &page_symlink_inode_operations; | ||
58 | inode->i_mapping->a_ops = &jfs_aops; | ||
59 | } else | ||
60 | inode->i_op = &jfs_symlink_inode_operations; | ||
61 | } else { | ||
62 | inode->i_op = &jfs_file_inode_operations; | ||
63 | init_special_inode(inode, inode->i_mode, inode->i_rdev); | ||
64 | } | ||
65 | } | ||
66 | |||
67 | /* | ||
68 | * Workhorse of both fsync & write_inode | ||
69 | */ | ||
/*
 * jfs_commit_inode - commit a dirty inode through a transaction
 *
 * @inode: inode to commit
 * @wait:  non-zero requests COMMIT_SYNC on the txCommit() call
 *
 * Returns 0 without doing anything when the inode has no links, is not
 * flagged COMMIT_Dirty, or lives on a read-only volume (in the last case
 * an error is logged for non-special files, at most five times over the
 * life of the static counter).  Otherwise a transaction is begun, the
 * inode's commit_sem is taken, the state is re-tested, and txCommit() is
 * called; its result is returned.
 */
70 | int jfs_commit_inode(struct inode *inode, int wait) | ||
71 | { | ||
72 | int rc = 0; | ||
73 | tid_t tid; | ||
74 | static int noisy = 5; | ||
75 | |||
76 | jfs_info("In jfs_commit_inode, inode = 0x%p", inode); | ||
77 | |||
78 | /* | ||
79 | * Don't commit if inode has been committed since last being | ||
80 | * marked dirty, or if it has been deleted. | ||
81 | */ | ||
82 | if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode)) | ||
83 | return 0; | ||
84 | |||
85 | if (isReadOnly(inode)) { | ||
86 | /* kernel allows writes to devices on read-only | ||
87 | * partitions and may think inode is dirty | ||
88 | */ | ||
89 | if (!special_file(inode->i_mode) && noisy) { | ||
90 | jfs_err("jfs_commit_inode(0x%p) called on " | ||
91 | "read-only volume", inode); | ||
92 | jfs_err("Is remount racy?"); | ||
93 | noisy--; | ||
94 | } | ||
95 | return 0; | ||
96 | } | ||
97 | |||
/* The transaction is started before commit_sem is taken. */
98 | tid = txBegin(inode->i_sb, COMMIT_INODE); | ||
99 | down(&JFS_IP(inode)->commit_sem); | ||
100 | |||
101 | /* | ||
102 | * Retest inode state after taking commit_sem | ||
103 | */ | ||
104 | if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode)) | ||
105 | rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0); | ||
106 | |||
107 | txEnd(tid); | ||
108 | up(&JFS_IP(inode)->commit_sem); | ||
109 | return rc; | ||
110 | } | ||
111 | |||
/*
 * jfs_write_inode - write back an inode
 *
 * Returns 0 immediately for inodes flagged COMMIT_Nolink.  If the inode
 * is not flagged COMMIT_Dirty, only the journal is flushed (passing
 * @wait through) and 0 is returned.  Otherwise jfs_commit_inode() does
 * the work; any failure is logged and reported as -EIO.
 */
112 | int jfs_write_inode(struct inode *inode, int wait) | ||
113 | { | ||
114 | if (test_cflag(COMMIT_Nolink, inode)) | ||
115 | return 0; | ||
116 | /* | ||
117 | * If COMMIT_Dirty is not set, the inode isn't really dirty. | ||
118 | * It has been committed since the last change, but was still | ||
119 | * on the dirty inode list. | ||
120 | */ | ||
121 | if (!test_cflag(COMMIT_Dirty, inode)) { | ||
122 | /* Make sure committed changes hit the disk */ | ||
123 | jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); | ||
124 | return 0; | ||
125 | } | ||
126 | |||
127 | if (jfs_commit_inode(inode, wait)) { | ||
128 | jfs_err("jfs_write_inode: jfs_commit_inode failed!"); | ||
129 | return -EIO; | ||
130 | } else | ||
131 | return 0; | ||
132 | } | ||
133 | |||
/*
 * jfs_delete_inode - final teardown of an inode
 *
 * Calls freeZeroLink() first when the inode is flagged COMMIT_Freewmap,
 * then diFree(), then releases the inode from quota accounting
 * (DQUOT_INIT / DQUOT_FREE_INODE / DQUOT_DROP) and finally clear_inode().
 * Return values of the helpers are not checked.
 */
134 | void jfs_delete_inode(struct inode *inode) | ||
135 | { | ||
136 | jfs_info("In jfs_delete_inode, inode = 0x%p", inode); | ||
137 | |||
138 | if (test_cflag(COMMIT_Freewmap, inode)) | ||
139 | freeZeroLink(inode); | ||
140 | |||
141 | diFree(inode); | ||
142 | |||
143 | /* | ||
144 | * Free the inode from the quota allocation. | ||
145 | */ | ||
146 | DQUOT_INIT(inode); | ||
147 | DQUOT_FREE_INODE(inode); | ||
148 | DQUOT_DROP(inode); | ||
149 | |||
150 | clear_inode(inode); | ||
151 | } | ||
152 | |||
/*
 * jfs_dirty_inode - note that an inode needs to be committed
 *
 * Sets the COMMIT_Dirty flag on the inode.  On a read-only volume the
 * flag is not set; for non-special files an error is logged instead, at
 * most five times over the life of the static counter.
 */
153 | void jfs_dirty_inode(struct inode *inode) | ||
154 | { | ||
155 | static int noisy = 5; | ||
156 | |||
157 | if (isReadOnly(inode)) { | ||
158 | if (!special_file(inode->i_mode) && noisy) { | ||
159 | /* kernel allows writes to devices on read-only | ||
160 | * partitions and may try to mark inode dirty | ||
161 | */ | ||
162 | jfs_err("jfs_dirty_inode called on read-only volume"); | ||
163 | jfs_err("Is remount racy?"); | ||
164 | noisy--; | ||
165 | } | ||
166 | return; | ||
167 | } | ||
168 | |||
169 | set_cflag(COMMIT_Dirty, inode); | ||
170 | } | ||
171 | |||
/*
 * jfs_get_blocks - map (and optionally allocate) blocks for a file offset
 *
 * @ip:         inode being mapped
 * @lblock:     starting logical block
 * @max_blocks: upper bound passed to xtLookup() / extAlloc()
 * @bh_result:  buffer head that receives the mapping
 * @create:     non-zero to allocate when nothing is mapped; also selects
 *              the write lock instead of the read lock
 *
 * Flow:
 *   1. If the block's byte offset lies below i_size and xtLookup() finds
 *      a non-empty extent, bh_result is mapped to it.  An extent flagged
 *      XAD_NOTRECORDED is left unmapped for reads; for writes it is first
 *      recorded via extRecord() and the buffer is marked new.
 *   2. Otherwise, a read returns with bh_result unmapped.
 *   3. Otherwise a new extent is allocated (extHint() then extAlloc()),
 *      and bh_result is marked new and mapped to it.
 *
 * The record and allocate paths exist only when _JFS_4K is defined (the
 * Makefile in this commit passes -D_JFS_4K); without it they hit BUG().
 * The inode lock is skipped when the inode's fileset is AGGREGATE_I.
 *
 * Returns 0, or the error from extRecord(), extHint() or extAlloc().
 */
172 | static int | ||
173 | jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, | ||
174 | struct buffer_head *bh_result, int create) | ||
175 | { | ||
176 | s64 lblock64 = lblock; | ||
177 | int rc = 0; | ||
178 | int take_locks; | ||
179 | xad_t xad; | ||
180 | s64 xaddr; | ||
181 | int xflag; | ||
182 | s32 xlen; | ||
183 | |||
184 | /* | ||
185 | * If this is a special inode (imap, dmap) | ||
186 | * the lock should already be taken | ||
187 | */ | ||
188 | take_locks = (JFS_IP(ip)->fileset != AGGREGATE_I); | ||
189 | |||
190 | /* | ||
191 | * Take appropriate lock on inode | ||
192 | */ | ||
193 | if (take_locks) { | ||
194 | if (create) | ||
195 | IWRITE_LOCK(ip); | ||
196 | else | ||
197 | IREAD_LOCK(ip); | ||
198 | } | ||
199 | |||
/* Only consult the extent tree for offsets inside the current file size. */
200 | if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && | ||
201 | (xtLookup(ip, lblock64, max_blocks, &xflag, &xaddr, &xlen, 0) | ||
202 | == 0) && xlen) { | ||
203 | if (xflag & XAD_NOTRECORDED) { | ||
204 | if (!create) | ||
205 | /* | ||
206 | * Allocated but not recorded, read treats | ||
207 | * this as a hole | ||
208 | */ | ||
209 | goto unlock; | ||
210 | #ifdef _JFS_4K | ||
211 | XADoffset(&xad, lblock64); | ||
212 | XADlength(&xad, xlen); | ||
213 | XADaddress(&xad, xaddr); | ||
214 | #else /* _JFS_4K */ | ||
215 | /* | ||
216 | * As long as block size = 4K, this isn't a problem. | ||
217 | * We should mark the whole page not ABNR, but how | ||
218 | * will we know to mark the other blocks BH_New? | ||
219 | */ | ||
220 | BUG(); | ||
221 | #endif /* _JFS_4K */ | ||
222 | rc = extRecord(ip, &xad); | ||
223 | if (rc) | ||
224 | goto unlock; | ||
225 | set_buffer_new(bh_result); | ||
226 | } | ||
227 | |||
/* Report the extent found; b_size is the extent length in bytes. */
228 | map_bh(bh_result, ip->i_sb, xaddr); | ||
229 | bh_result->b_size = xlen << ip->i_blkbits; | ||
230 | goto unlock; | ||
231 | } | ||
232 | if (!create) | ||
233 | goto unlock; | ||
234 | |||
235 | /* | ||
236 | * Allocate a new block | ||
237 | */ | ||
238 | #ifdef _JFS_4K | ||
239 | if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) | ||
240 | goto unlock; | ||
241 | rc = extAlloc(ip, max_blocks, lblock64, &xad, FALSE); | ||
242 | if (rc) | ||
243 | goto unlock; | ||
244 | |||
245 | set_buffer_new(bh_result); | ||
246 | map_bh(bh_result, ip->i_sb, addressXAD(&xad)); | ||
247 | bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits; | ||
248 | |||
249 | #else /* _JFS_4K */ | ||
250 | /* | ||
251 | * We need to do whatever it takes to keep all but the last buffers | ||
252 | * in 4K pages - see jfs_write.c | ||
253 | */ | ||
254 | BUG(); | ||
255 | #endif /* _JFS_4K */ | ||
256 | |||
257 | unlock: | ||
258 | /* | ||
259 | * Release lock on inode | ||
260 | */ | ||
261 | if (take_locks) { | ||
262 | if (create) | ||
263 | IWRITE_UNLOCK(ip); | ||
264 | else | ||
265 | IREAD_UNLOCK(ip); | ||
266 | } | ||
267 | return rc; | ||
268 | } | ||
269 | |||
/* Single-block variant: calls jfs_get_blocks() with max_blocks fixed at 1. */
270 | static int jfs_get_block(struct inode *ip, sector_t lblock, | ||
271 | struct buffer_head *bh_result, int create) | ||
272 | { | ||
273 | return jfs_get_blocks(ip, lblock, 1, bh_result, create); | ||
274 | } | ||
275 | |||
/* writepage hook: delegates to nobh_writepage() with jfs_get_block as the block mapper. */
276 | static int jfs_writepage(struct page *page, struct writeback_control *wbc) | ||
277 | { | ||
278 | return nobh_writepage(page, jfs_get_block, wbc); | ||
279 | } | ||
280 | |||
/* writepages hook: delegates to mpage_writepages() with jfs_get_block as the block mapper. */
281 | static int jfs_writepages(struct address_space *mapping, | ||
282 | struct writeback_control *wbc) | ||
283 | { | ||
284 | return mpage_writepages(mapping, wbc, jfs_get_block); | ||
285 | } | ||
286 | |||
/* readpage hook: delegates to mpage_readpage(); the file argument is not used. */
287 | static int jfs_readpage(struct file *file, struct page *page) | ||
288 | { | ||
289 | return mpage_readpage(page, jfs_get_block); | ||
290 | } | ||
291 | |||
/* readpages hook: delegates to mpage_readpages(); the file argument is not used. */
292 | static int jfs_readpages(struct file *file, struct address_space *mapping, | ||
293 | struct list_head *pages, unsigned nr_pages) | ||
294 | { | ||
295 | return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); | ||
296 | } | ||
297 | |||
/* prepare_write hook: delegates to nobh_prepare_write(); the file argument is not used. */
298 | static int jfs_prepare_write(struct file *file, | ||
299 | struct page *page, unsigned from, unsigned to) | ||
300 | { | ||
301 | return nobh_prepare_write(page, from, to, jfs_get_block); | ||
302 | } | ||
303 | |||
/* bmap hook: delegates to generic_block_bmap() with jfs_get_block as the block mapper. */
304 | static sector_t jfs_bmap(struct address_space *mapping, sector_t block) | ||
305 | { | ||
306 | return generic_block_bmap(mapping, block, jfs_get_block); | ||
307 | } | ||
308 | |||
/*
 * direct_IO hook: resolves the inode from the kiocb's file mapping and
 * hands the request to blockdev_direct_IO() on the superblock's block
 * device.  Unlike the other hooks here it passes the multi-block mapper
 * jfs_get_blocks; the final argument is NULL.
 */
309 | static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, | ||
310 | const struct iovec *iov, loff_t offset, unsigned long nr_segs) | ||
311 | { | ||
312 | struct file *file = iocb->ki_filp; | ||
313 | struct inode *inode = file->f_mapping->host; | ||
314 | |||
315 | return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, | ||
316 | offset, nr_segs, jfs_get_blocks, NULL); | ||
317 | } | ||
318 | |||
/*
 * Address-space operations used for JFS regular files and page-based
 * symlinks (see jfs_read_inode).  sync_page and commit_write use the
 * generic block_sync_page / nobh_commit_write helpers directly; the rest
 * are the jfs_* wrappers defined in this file.
 */
319 | struct address_space_operations jfs_aops = { | ||
320 | .readpage = jfs_readpage, | ||
321 | .readpages = jfs_readpages, | ||
322 | .writepage = jfs_writepage, | ||
323 | .writepages = jfs_writepages, | ||
324 | .sync_page = block_sync_page, | ||
325 | .prepare_write = jfs_prepare_write, | ||
326 | .commit_write = nobh_commit_write, | ||
327 | .bmap = jfs_bmap, | ||
328 | .direct_IO = jfs_direct_IO, | ||
329 | }; | ||
330 | |||
331 | /* | ||
332 | * Guts of jfs_truncate. Called with locks already held. Can be called | ||
333 | * with directory for truncating directory index table. | ||
334 | */ | ||
/*
 * @ip:     inode to truncate
 * @length: target size; asserted to be >= 0
 *
 * An inode flagged COMMIT_Nolink is truncated by a single xtTruncate()
 * call with tid 0 and COMMIT_WMAP, without any transaction.  Otherwise
 * the work is done in a loop: each pass begins a transaction, takes
 * commit_sem, calls xtTruncate(), updates mtime/ctime, marks the inode
 * dirty and commits.  The loop repeats while the size returned by
 * xtTruncate() is still greater than @length, and stops early if
 * xtTruncate() returns a negative value.  Nothing is returned to the
 * caller, so such a failure is not reported.
 */
335 | void jfs_truncate_nolock(struct inode *ip, loff_t length) | ||
336 | { | ||
337 | loff_t newsize; | ||
338 | tid_t tid; | ||
339 | |||
340 | ASSERT(length >= 0); | ||
341 | |||
342 | if (test_cflag(COMMIT_Nolink, ip)) { | ||
343 | xtTruncate(0, ip, length, COMMIT_WMAP); | ||
344 | return; | ||
345 | } | ||
346 | |||
347 | do { | ||
348 | tid = txBegin(ip->i_sb, 0); | ||
349 | |||
350 | /* | ||
351 | * The commit_sem cannot be taken before txBegin. | ||
352 | * txBegin may block and there is a chance the inode | ||
353 | * could be marked dirty and need to be committed | ||
354 | * before txBegin unblocks | ||
355 | */ | ||
356 | down(&JFS_IP(ip)->commit_sem); | ||
357 | |||
358 | newsize = xtTruncate(tid, ip, length, | ||
359 | COMMIT_TRUNCATE | COMMIT_PWMAP); | ||
360 | if (newsize < 0) { | ||
361 | txEnd(tid); | ||
362 | up(&JFS_IP(ip)->commit_sem); | ||
363 | break; | ||
364 | } | ||
365 | |||
366 | ip->i_mtime = ip->i_ctime = CURRENT_TIME; | ||
367 | mark_inode_dirty(ip); | ||
368 | |||
/* NOTE(review): the txCommit() result is discarded here — confirm that is intended. */
369 | txCommit(tid, 1, &ip, 0); | ||
370 | txEnd(tid); | ||
371 | up(&JFS_IP(ip)->commit_sem); | ||
372 | } while (newsize > length); /* Truncate isn't always atomic */ | ||
373 | } | ||
374 | |||
/*
 * jfs_truncate - truncate hook: shrink on-disk state to ip->i_size
 *
 * Calls nobh_truncate_page() for the page at the new end of file, then
 * runs jfs_truncate_nolock() with the inode write lock held.
 */
375 | void jfs_truncate(struct inode *ip) | ||
376 | { | ||
377 | jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); | ||
378 | |||
379 | nobh_truncate_page(ip->i_mapping, ip->i_size); | ||
380 | |||
381 | IWRITE_LOCK(ip); | ||
382 | jfs_truncate_nolock(ip, ip->i_size); | ||
383 | IWRITE_UNLOCK(ip); | ||
384 | } | ||
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h new file mode 100644 index 000000000000..d2ae430adecf --- /dev/null +++ b/fs/jfs/jfs_acl.h | |||
@@ -0,0 +1,30 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_ACL | ||
19 | #define _H_JFS_ACL | ||
20 | |||
21 | #ifdef CONFIG_JFS_POSIX_ACL | ||
22 | |||
23 | #include <linux/xattr_acl.h> | ||
24 | |||
/*
 * ACL entry points implemented in acl.c, which is built only when
 * CONFIG_JFS_POSIX_ACL is enabled; the same option guards these
 * declarations.
 */
25 | int jfs_permission(struct inode *, int, struct nameidata *); | ||
26 | int jfs_init_acl(struct inode *, struct inode *); | ||
27 | int jfs_setattr(struct dentry *, struct iattr *); | ||
28 | |||
29 | #endif /* CONFIG_JFS_POSIX_ACL */ | ||
30 | #endif /* _H_JFS_ACL */ | ||
diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h new file mode 100644 index 000000000000..7f3e9ac454ff --- /dev/null +++ b/fs/jfs/jfs_btree.h | |||
@@ -0,0 +1,172 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_BTREE | ||
19 | #define _H_JFS_BTREE | ||
20 | |||
21 | /* | ||
22 | * jfs_btree.h: B+-tree | ||
23 | * | ||
24 | * JFS B+-tree (dtree and xtree) common definitions | ||
25 | */ | ||
26 | |||
27 | /* | ||
28 | * basic btree page - btpage | ||
29 | * | ||
30 | struct btpage { | ||
31 | s64 next; right sibling bn | ||
32 | s64 prev; left sibling bn | ||
33 | |||
34 | u8 flag; | ||
35 | u8 rsrvd[7]; type specific | ||
36 | s64 self; self address | ||
37 | |||
38 | u8 entry[4064]; | ||
39 | }; */ | ||
40 | |||
41 | /* btpage flag */ | ||
42 | #define BT_TYPE 0x07 /* B+-tree index */ | ||
43 | #define BT_ROOT 0x01 /* root page */ | ||
44 | #define BT_LEAF 0x02 /* leaf page */ | ||
45 | #define BT_INTERNAL 0x04 /* internal page */ | ||
46 | #define BT_RIGHTMOST 0x10 /* rightmost page */ | ||
47 | #define BT_LEFTMOST 0x20 /* leftmost page */ | ||
48 | #define BT_SWAPPED 0x80 /* used by fsck for endian swapping */ | ||
49 | |||
50 | /* btorder (in inode) */ | ||
51 | #define BT_RANDOM 0x0000 | ||
52 | #define BT_SEQUENTIAL 0x0001 | ||
53 | #define BT_LOOKUP 0x0010 | ||
54 | #define BT_INSERT 0x0020 | ||
55 | #define BT_DELETE 0x0040 | ||
56 | |||
57 | /* | ||
58 | * btree page buffer cache access | ||
59 | */ | ||
/*
 * BT_IS_ROOT - true when the COMMIT_PAGE bit is clear in MP's xflag.
 *
 * BT_GETPAGE hands out, for block 0, a placeholder pointer that is really the
 * address of the inode's bxflag; this test is how the other BT_* macros tell
 * that placeholder from a metapage returned by read_metapage().
 * NOTE(review): this relies on xflag being the first member of struct
 * metapage and on real metapages carrying COMMIT_PAGE - confirm in
 * jfs_metapage.h.
 */
60 | #define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0) | ||
61 | |||
/*
 * BT_PAGE - yield a TYPE pointer to the page contents behind MP: the in-inode
 * member ROOT of JFS_IP(IP) for the root, otherwise MP's data buffer.
 * MP is expanded twice, so it must be free of side effects.
 */
62 | /* get page from buffer page */ | ||
63 | #define BT_PAGE(IP, MP, TYPE, ROOT)\ | ||
64 | (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data) | ||
65 | |||
/*
 * BT_GETPAGE - obtain the page at block address BN; sets MP, P and RC.
 *
 * BN == 0 selects the in-inode root: MP becomes the address of the inode's
 * bxflag cast to a metapage pointer (a placeholder, not a real metapage),
 * P points at JFS_IP(IP)->ROOT and RC is 0.
 * Any other BN is read with read_metapage(): on success RC is 0 and P is
 * MP->data; on failure MP is NULL, P is NULL, an error is logged and RC is
 * -EIO.
 *
 * After a failure MP must not be passed to BT_PUTPAGE or BT_MARK_DIRTY,
 * because BT_IS_ROOT() dereferences it.
 *
 * The expansion is a bare { } block rather than do { } while (0), so
 * "BT_GETPAGE(...);" cannot be used as the unbraced body of an if that has
 * an else.  TYPE is applied only on the root path; the non-root path assigns
 * MP->data to P without a cast.
 */
66 | /* get the page buffer and the page for specified block address */ | ||
67 | #define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\ | ||
68 | {\ | ||
69 | if ((BN) == 0)\ | ||
70 | {\ | ||
71 | MP = (struct metapage *)&JFS_IP(IP)->bxflag;\ | ||
72 | P = (TYPE *)&JFS_IP(IP)->ROOT;\ | ||
73 | RC = 0;\ | ||
74 | }\ | ||
75 | else\ | ||
76 | {\ | ||
77 | MP = read_metapage((IP), BN, SIZE, 1);\ | ||
78 | if (MP) {\ | ||
79 | RC = 0;\ | ||
80 | P = (MP)->data;\ | ||
81 | } else {\ | ||
82 | P = NULL;\ | ||
83 | jfs_err("bread failed!");\ | ||
84 | RC = -EIO;\ | ||
85 | }\ | ||
86 | }\ | ||
87 | } | ||
88 | |||
/*
 * BT_MARK_DIRTY - record that the page behind MP was modified.
 * The root is stored in the inode, so the inode IP is marked dirty for it;
 * every other page is marked dirty as a metapage.
 * Expands to a bare { } block (see the caveat about a trailing ';' before
 * an else).
 */
89 | #define BT_MARK_DIRTY(MP, IP)\ | ||
90 | {\ | ||
91 | if (BT_IS_ROOT(MP))\ | ||
92 | mark_inode_dirty(IP);\ | ||
93 | else\ | ||
94 | mark_metapage_dirty(MP);\ | ||
95 | } | ||
96 | |||
/*
 * BT_PUTPAGE - release MP with release_metapage() unless it is the root
 * placeholder, which BT_GETPAGE never obtained from read_metapage().
 * MP is dereferenced by BT_IS_ROOT(), so it must not be NULL.
 */
97 | /* put the page buffer */ | ||
98 | #define BT_PUTPAGE(MP)\ | ||
99 | {\ | ||
100 | if (! BT_IS_ROOT(MP)) \ | ||
101 | release_metapage(MP); \ | ||
102 | } | ||
103 | |||
104 | |||
105 | /* | ||
106 | * btree traversal stack | ||
107 | * | ||
108 | * records the path traversed during the search; | ||
109 | * the top frame records the leaf page/entry selected. | ||
110 | */ | ||
/*
 * One frame of the traversal stack.  BT_PUSH fills in only bn and index;
 * BT_GETSEARCH reads bn, index and mp back from the frame it is given.
 * The trailing size notes are for 32-bit/64-bit pointer builds.
 */
111 | struct btframe { /* stack frame */ | ||
112 | s64 bn; /* 8: */ | ||
113 | s16 index; /* 2: */ | ||
114 | s16 lastindex; /* 2: unused */ | ||
115 | struct metapage *mp; /* 4/8: */ | ||
116 | }; /* (16/24) */ | ||
117 | |||
/*
 * Fixed-depth traversal stack: top points at the current frame inside
 * stack[], which has room for MAXTREEHEIGHT frames.
 * NOTE(review): nsplit is not touched by any macro in this header; it
 * presumably counts the pages a pending insert will split - confirm in the
 * dtree/xtree code.
 */
118 | struct btstack { | ||
119 | struct btframe *top; | ||
120 | int nsplit; | ||
121 | struct btframe stack[MAXTREEHEIGHT]; | ||
122 | }; | ||
123 | |||
/*
 * BT_CLR - empty the stack by pointing top back at stack[0].
 * The expansion is an unparenthesized assignment expression; use it as a
 * statement of its own.
 */
124 | #define BT_CLR(btstack)\ | ||
125 | (btstack)->top = (btstack)->stack | ||
126 | |||
/* BT_STACK_FULL - true once top points at the last slot, stack[MAXTREEHEIGHT-1]. */
127 | #define BT_STACK_FULL(btstack)\ | ||
128 | ( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1])) | ||
129 | |||
/*
 * BT_PUSH - store (BN, INDEX) in the frame at top, then advance top.
 *
 * The assert fires if top already points at the last slot, so top may come to
 * rest on the last slot but BT_PUSH itself never writes that slot.  The
 * frame's mp and lastindex members are left as they were.
 * BTSTACK is expanded several times; BN and INDEX are expanded without
 * parentheses.  Expands to a bare { } block.
 */
130 | #define BT_PUSH(BTSTACK, BN, INDEX)\ | ||
131 | {\ | ||
132 | assert(!BT_STACK_FULL(BTSTACK));\ | ||
133 | (BTSTACK)->top->bn = BN;\ | ||
134 | (BTSTACK)->top->index = INDEX;\ | ||
135 | ++(BTSTACK)->top;\ | ||
136 | } | ||
137 | |||
/*
 * BT_POP - step top back by one frame and yield that frame, or yield NULL
 * (leaving top alone) when top is already at stack[0].
 */
138 | #define BT_POP(btstack)\ | ||
139 | ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top ) | ||
140 | |||
/* BT_STACK - yield top without moving it, or NULL when top is at stack[0]. */
141 | #define BT_STACK(btstack)\ | ||
142 | ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top ) | ||
143 | |||
144 | static inline void BT_STACK_DUMP(struct btstack *btstack) | ||
145 | { | ||
146 | int i; | ||
147 | printk("btstack dump:\n"); | ||
148 | for (i = 0; i < MAXTREEHEIGHT; i++) | ||
149 | printk(KERN_ERR "bn = %Lx, index = %d\n", | ||
150 | (long long)btstack->stack[i].bn, | ||
151 | btstack->stack[i].index); | ||
152 | } | ||
153 | |||
/*
 * BT_GETSEARCH - unpack the search result held in frame LEAF.
 *
 * BN, MP and INDEX are copied out of the frame.  P is MP->data when BN is
 * non-zero, otherwise the in-inode member ROOT of JFS_IP(IP).
 * BN, MP, P and INDEX are assigned to, so they must be lvalues; MP is also
 * used unparenthesized in the ->data access.  Expands to a bare { } block.
 */
154 | /* retrieve search results */ | ||
155 | #define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\ | ||
156 | {\ | ||
157 | BN = (LEAF)->bn;\ | ||
158 | MP = (LEAF)->mp;\ | ||
159 | if (BN)\ | ||
160 | P = (TYPE *)MP->data;\ | ||
161 | else\ | ||
162 | P = (TYPE *)&JFS_IP(IP)->ROOT;\ | ||
163 | INDEX = (LEAF)->index;\ | ||
164 | } | ||
165 | |||
/*
 * BT_PUTSEARCH - release the metapage recorded in the top frame of BTSTACK,
 * unless that frame refers to the root placeholder.
 * The top frame's mp is dereferenced by BT_IS_ROOT(), so it must be valid.
 */
166 | /* put the page buffer of search */ | ||
167 | #define BT_PUTSEARCH(BTSTACK)\ | ||
168 | {\ | ||
169 | if (! BT_IS_ROOT((BTSTACK)->top->mp))\ | ||
170 | release_metapage((BTSTACK)->top->mp);\ | ||
171 | } | ||
172 | #endif /* _H_JFS_BTREE */ | ||
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c new file mode 100644 index 000000000000..91a0a889ebc5 --- /dev/null +++ b/fs/jfs/jfs_debug.c | |||
@@ -0,0 +1,154 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include <linux/ctype.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/proc_fs.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | #include "jfs_incore.h" | ||
26 | #include "jfs_filsys.h" | ||
27 | #include "jfs_debug.h" | ||
28 | |||
29 | #ifdef CONFIG_JFS_DEBUG | ||
/*
 * dump_mem - print a hex + ASCII dump of a memory region to the kernel log.
 * @label:  text printed in the heading line
 * @data:   start of the region
 * @length: number of bytes to dump; only the heading is printed when it is
 *          zero or negative
 *
 * Each output line covers up to 16 bytes: up to four 32-bit words in hex
 * (in native byte order), followed by the same bytes as characters with
 * non-printable ones shown as '.'.
 *
 * Words are assembled with memcpy() instead of being loaded through an int
 * pointer, so an unaligned @data is harmless and a trailing partial word is
 * zero-filled rather than read from beyond the end of the region.
 */
void dump_mem(char *label, void *data, int length)
{
	const unsigned char *bytes = data;
	char buf[10], line[80];
	int i, j;

	printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length,
	       data);
	for (i = 0; i < length; i += 16) {
		line[0] = 0;
		/* hex column: at most four " %08x" fields = 36 characters */
		for (j = 0; (j < 4) && (i + j * 4 < length); j++) {
			unsigned int word = 0;
			int avail = length - (i + j * 4);

			if (avail > (int) sizeof(word))
				avail = sizeof(word);
			memcpy(&word, bytes + i + j * 4, avail);
			sprintf(buf, " %08x", word);
			strcat(line, buf);
		}
		/* character column: at most sixteen " c" pairs = 32 characters */
		buf[0] = ' ';
		buf[2] = 0;
		for (j = 0; (j < 16) && (i + j < length); j++) {
			buf[1] = isprint(bytes[i + j]) ? bytes[i + j] : '.';
			strcat(line, buf);
		}
		printk("%s\n", line);
	}
}
55 | #endif | ||
56 | |||
57 | #ifdef PROC_FS_JFS /* see jfs_debug.h */ | ||
58 | |||
59 | static struct proc_dir_entry *base; | ||
60 | #ifdef CONFIG_JFS_DEBUG | ||
61 | extern read_proc_t jfs_txanchor_read; | ||
62 | |||
63 | static int loglevel_read(char *page, char **start, off_t off, | ||
64 | int count, int *eof, void *data) | ||
65 | { | ||
66 | int len; | ||
67 | |||
68 | len = sprintf(page, "%d\n", jfsloglevel); | ||
69 | |||
70 | len -= off; | ||
71 | *start = page + off; | ||
72 | |||
73 | if (len > count) | ||
74 | len = count; | ||
75 | else | ||
76 | *eof = 1; | ||
77 | |||
78 | if (len < 0) | ||
79 | len = 0; | ||
80 | |||
81 | return len; | ||
82 | } | ||
83 | |||
84 | static int loglevel_write(struct file *file, const char __user *buffer, | ||
85 | unsigned long count, void *data) | ||
86 | { | ||
87 | char c; | ||
88 | |||
89 | if (get_user(c, buffer)) | ||
90 | return -EFAULT; | ||
91 | |||
92 | /* yes, I know this is an ASCIIism. --hch */ | ||
93 | if (c < '0' || c > '9') | ||
94 | return -EINVAL; | ||
95 | jfsloglevel = c - '0'; | ||
96 | return count; | ||
97 | } | ||
98 | #endif | ||
99 | |||
100 | |||
101 | #ifdef CONFIG_JFS_STATISTICS | ||
102 | extern read_proc_t jfs_lmstats_read; | ||
103 | extern read_proc_t jfs_txstats_read; | ||
104 | extern read_proc_t jfs_xtstat_read; | ||
105 | extern read_proc_t jfs_mpstat_read; | ||
106 | #endif | ||
107 | |||
108 | static struct { | ||
109 | const char *name; | ||
110 | read_proc_t *read_fn; | ||
111 | write_proc_t *write_fn; | ||
112 | } Entries[] = { | ||
113 | #ifdef CONFIG_JFS_STATISTICS | ||
114 | { "lmstats", jfs_lmstats_read, }, | ||
115 | { "txstats", jfs_txstats_read, }, | ||
116 | { "xtstat", jfs_xtstat_read, }, | ||
117 | { "mpstat", jfs_mpstat_read, }, | ||
118 | #endif | ||
119 | #ifdef CONFIG_JFS_DEBUG | ||
120 | { "TxAnchor", jfs_txanchor_read, }, | ||
121 | { "loglevel", loglevel_read, loglevel_write } | ||
122 | #endif | ||
123 | }; | ||
124 | #define NPROCENT (sizeof(Entries)/sizeof(Entries[0])) | ||
125 | |||
126 | void jfs_proc_init(void) | ||
127 | { | ||
128 | int i; | ||
129 | |||
130 | if (!(base = proc_mkdir("jfs", proc_root_fs))) | ||
131 | return; | ||
132 | base->owner = THIS_MODULE; | ||
133 | |||
134 | for (i = 0; i < NPROCENT; i++) { | ||
135 | struct proc_dir_entry *p; | ||
136 | if ((p = create_proc_entry(Entries[i].name, 0, base))) { | ||
137 | p->read_proc = Entries[i].read_fn; | ||
138 | p->write_proc = Entries[i].write_fn; | ||
139 | } | ||
140 | } | ||
141 | } | ||
142 | |||
143 | void jfs_proc_clean(void) | ||
144 | { | ||
145 | int i; | ||
146 | |||
147 | if (base) { | ||
148 | for (i = 0; i < NPROCENT; i++) | ||
149 | remove_proc_entry(Entries[i].name, base); | ||
150 | remove_proc_entry("jfs", proc_root_fs); | ||
151 | } | ||
152 | } | ||
153 | |||
154 | #endif /* PROC_FS_JFS */ | ||
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h new file mode 100644 index 000000000000..a38079ae1e00 --- /dev/null +++ b/fs/jfs/jfs_debug.h | |||
@@ -0,0 +1,122 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * Portions Copyright (c) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #ifndef _H_JFS_DEBUG | ||
20 | #define _H_JFS_DEBUG | ||
21 | |||
22 | /* | ||
23 | * jfs_debug.h | ||
24 | * | ||
25 | * global debug message, data structure/macro definitions | ||
26 | * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS; | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * Create /proc/fs/jfs if procfs is enabled and either | ||
31 | * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined | ||
32 | */ | ||
33 | #if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS)) | ||
34 | #define PROC_FS_JFS | ||
35 | #endif | ||
36 | |||
37 | /* | ||
38 | * assert with traditional printf/panic | ||
39 | */ | ||
/*
 * assert(p) is always active, independent of CONFIG_JFS_DEBUG (only the
 * upper-case ASSERT() compiles away when debugging is off).
 * With CONFIG_KERNEL_ASSERTS it forwards to KERNEL_ASSERT(); otherwise a
 * false p logs file, line and the expression text at KERN_CRIT and then
 * calls BUG().
 */
40 | #ifdef CONFIG_KERNEL_ASSERTS | ||
41 | /* kgdb stuff */ | ||
42 | #define assert(p) KERNEL_ASSERT(#p, p) | ||
43 | #else | ||
44 | #define assert(p) do { \ | ||
45 | if (!(p)) { \ | ||
46 | printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ | ||
47 | __FILE__, __LINE__, #p); \ | ||
48 | BUG(); \ | ||
49 | } \ | ||
50 | } while (0) | ||
51 | #endif | ||
52 | |||
53 | /* | ||
54 | * debug ON | ||
55 | * -------- | ||
56 | */ | ||
57 | #ifdef CONFIG_JFS_DEBUG | ||
58 | #define ASSERT(p) assert(p) | ||
59 | |||
60 | /* printk verbosity */ | ||
61 | #define JFS_LOGLEVEL_ERR 1 | ||
62 | #define JFS_LOGLEVEL_WARN 2 | ||
63 | #define JFS_LOGLEVEL_DEBUG 3 | ||
64 | #define JFS_LOGLEVEL_INFO 4 | ||
65 | |||
66 | extern int jfsloglevel; | ||
67 | |||
68 | /* dump memory contents */ | ||
69 | extern void dump_mem(char *label, void *data, int length); | ||
70 | |||
/*
 * Level-gated logging helpers.  Each one calls printk() at the matching
 * KERN_* level, with "\n" appended, but only when jfsloglevel is at least
 * the helper's JFS_LOGLEVEL_* threshold.
 * fmt is pasted next to other string literals, so it must itself be a
 * string literal.  "arg..." / "## arg" is the GNU named-variadic macro form.
 */
71 | /* information message: e.g., configuration, major event */ | ||
72 | #define jfs_info(fmt, arg...) do { \ | ||
73 | if (jfsloglevel >= JFS_LOGLEVEL_INFO) \ | ||
74 | printk(KERN_INFO fmt "\n", ## arg); \ | ||
75 | } while (0) | ||
76 | |||
77 | /* debug message: ad hoc */ | ||
78 | #define jfs_debug(fmt, arg...) do { \ | ||
79 | if (jfsloglevel >= JFS_LOGLEVEL_DEBUG) \ | ||
80 | printk(KERN_DEBUG fmt "\n", ## arg); \ | ||
81 | } while (0) | ||
82 | |||
83 | /* warn message: */ | ||
84 | #define jfs_warn(fmt, arg...) do { \ | ||
85 | if (jfsloglevel >= JFS_LOGLEVEL_WARN) \ | ||
86 | printk(KERN_WARNING fmt "\n", ## arg); \ | ||
87 | } while (0) | ||
88 | |||
89 | /* error event message: e.g., i/o error */ | ||
90 | #define jfs_err(fmt, arg...) do { \ | ||
91 | if (jfsloglevel >= JFS_LOGLEVEL_ERR) \ | ||
92 | printk(KERN_ERR fmt "\n", ## arg); \ | ||
93 | } while (0) | ||
94 | |||
95 | /* | ||
96 | * debug OFF | ||
97 | * --------- | ||
98 | */ | ||
99 | #else /* CONFIG_JFS_DEBUG */ | ||
/*
 * Stubs used when CONFIG_JFS_DEBUG is off: each expands to an empty
 * statement.  The arguments are dropped by the preprocessor and so are never
 * evaluated - call sites must not depend on side effects inside them.  This
 * includes jfs_err(), so error messages are silent in non-debug builds.
 */
100 | #define dump_mem(label,data,length) do {} while (0) | ||
101 | #define ASSERT(p) do {} while (0) | ||
102 | #define jfs_info(fmt, arg...) do {} while (0) | ||
103 | #define jfs_debug(fmt, arg...) do {} while (0) | ||
104 | #define jfs_warn(fmt, arg...) do {} while (0) | ||
105 | #define jfs_err(fmt, arg...) do {} while (0) | ||
106 | #endif /* CONFIG_JFS_DEBUG */ | ||
107 | |||
108 | /* | ||
109 | * statistics | ||
110 | * ---------- | ||
111 | */ | ||
/*
 * Statistics counter helpers.  HIGHWATERMARK(x, y) raises x to y when y is
 * larger; x appears twice in its expansion.
 * Without CONFIG_JFS_STATISTICS all three expand to nothing at all, so they
 * may only be used as stand-alone statements and their arguments are never
 * evaluated.
 */
112 | #ifdef CONFIG_JFS_STATISTICS | ||
113 | #define INCREMENT(x) ((x)++) | ||
114 | #define DECREMENT(x) ((x)--) | ||
115 | #define HIGHWATERMARK(x,y) ((x) = max((x), (y))) | ||
116 | #else | ||
117 | #define INCREMENT(x) | ||
118 | #define DECREMENT(x) | ||
119 | #define HIGHWATERMARK(x,y) | ||
120 | #endif /* CONFIG_JFS_STATISTICS */ | ||
121 | |||
122 | #endif /* _H_JFS_DEBUG */ | ||
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h new file mode 100644 index 000000000000..580a3258449b --- /dev/null +++ b/fs/jfs/jfs_dinode.h | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2001 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_DINODE | ||
19 | #define _H_JFS_DINODE | ||
20 | |||
21 | /* | ||
22 | * jfs_dinode.h: on-disk inode manager | ||
23 | */ | ||
24 | |||
25 | #define INODESLOTSIZE 128 | ||
26 | #define L2INODESLOTSIZE 7 | ||
27 | #define log2INODESIZE 9 /* log2(bytes per dinode) */ | ||
28 | |||
29 | |||
30 | /* | ||
31 | * on-disk inode : 512 bytes | ||
32 | * | ||
33 | * note: align 64-bit fields on 8-byte boundary. | ||
34 | */ | ||
/*
 * struct dinode - layout of an inode as stored on disk.
 *
 * Multi-byte scalar members are declared little-endian (__le32/__le64).
 * The leading number in each member comment is that member's size in bytes.
 * The di_* names #defined inside the union are shorthand paths to nested
 * members; they are ordinary macros and therefore visible file-wide, not
 * scoped to this struct.
 */
35 | struct dinode { | ||
36 | /* | ||
37 | * I. base area (128 bytes) | ||
38 | * ------------------------ | ||
39 | * | ||
40 | * define generic/POSIX attributes | ||
41 | */ | ||
42 | __le32 di_inostamp; /* 4: stamp to show inode belongs to fileset */ | ||
43 | __le32 di_fileset; /* 4: fileset number */ | ||
44 | __le32 di_number; /* 4: inode number, aka file serial number */ | ||
45 | __le32 di_gen; /* 4: inode generation number */ | ||
46 | |||
47 | pxd_t di_ixpxd; /* 8: inode extent descriptor */ | ||
48 | |||
49 | __le64 di_size; /* 8: size */ | ||
50 | __le64 di_nblocks; /* 8: number of blocks allocated */ | ||
51 | |||
52 | __le32 di_nlink; /* 4: number of links to the object */ | ||
53 | |||
54 | __le32 di_uid; /* 4: user id of owner */ | ||
55 | __le32 di_gid; /* 4: group id of owner */ | ||
56 | |||
57 | __le32 di_mode; /* 4: attribute, format and permission */ | ||
58 | |||
59 | struct timestruc_t di_atime; /* 8: time last data accessed */ | ||
60 | struct timestruc_t di_ctime; /* 8: time last status changed */ | ||
61 | struct timestruc_t di_mtime; /* 8: time last data modified */ | ||
62 | struct timestruc_t di_otime; /* 8: time created */ | ||
63 | |||
64 | dxd_t di_acl; /* 16: acl descriptor */ | ||
65 | |||
66 | dxd_t di_ea; /* 16: ea descriptor */ | ||
67 | |||
68 | __le32 di_next_index; /* 4: Next available dir_table index */ | ||
69 | |||
70 | __le32 di_acltype; /* 4: Type of ACL */ | ||
71 | |||
72 | /* | ||
73 | * Extension Areas. | ||
74 | * | ||
75 | * Historically, the inode was partitioned into 4 128-byte areas, | ||
76 | * the last 3 being defined as unions which could have multiple | ||
77 | * uses. The first 96 bytes had been completely unused until | ||
78 | * an index table was added to the directory. It is now more | ||
79 | * useful to describe the last 3/4 of the inode as a single | ||
80 | * union. We would probably be better off redesigning the | ||
81 | * entire structure from scratch, but we don't want to break | ||
82 | * commonality with OS/2's JFS at this time. | ||
83 | */ | ||
84 | union { | ||
/* directory view of the extension area */
85 | struct { | ||
86 | /* | ||
87 | * This table contains the information needed to | ||
88 | * find a directory entry from a 32-bit index. | ||
89 | * If the index is small enough, the table is inline, | ||
90 | * otherwise, an x-tree root overlays this table | ||
91 | */ | ||
92 | struct dir_table_slot _table[12]; /* 96: inline */ | ||
93 | |||
94 | dtroot_t _dtroot; /* 288: dtree root */ | ||
95 | } _dir; /* (384) */ | ||
96 | #define di_dirtable u._dir._table | ||
97 | #define di_dtroot u._dir._dtroot | ||
98 | #define di_parent di_dtroot.header.idotdot | ||
99 | #define di_DASD di_dtroot.header.DASD | ||
100 | |||
/* non-directory view of the extension area */
101 | struct { | ||
102 | union { | ||
103 | u8 _data[96]; /* 96: unused */ | ||
/*
 * NOTE(review): _imap is a pointer inside an on-disk layout.  Its size is 4
 * bytes only on 32-bit builds; with 8-byte pointers _gengen sits at a
 * different offset.  The enclosing union stays 96 bytes either way because of
 * _data[96].  Confirm whether di_gengen is relied on across architectures.
 */
104 | struct { | ||
105 | void *_imap; /* 4: unused */ | ||
106 | __le32 _gengen; /* 4: generator */ | ||
107 | } _imap; | ||
108 | } _u1; /* 96: */ | ||
109 | #define di_gengen u._file._u1._imap._gengen | ||
110 | |||
/* either an xtree root, or the "special" layout (descriptor, rdev/fast symlink, inline EA) */
111 | union { | ||
112 | xtpage_t _xtroot; | ||
113 | struct { | ||
114 | u8 unused[16]; /* 16: */ | ||
115 | dxd_t _dxd; /* 16: */ | ||
116 | union { | ||
117 | __le32 _rdev; /* 4: */ | ||
118 | u8 _fastsymlink[128]; | ||
119 | } _u; | ||
120 | u8 _inlineea[128]; | ||
121 | } _special; | ||
122 | } _u2; | ||
123 | } _file; | ||
124 | #define di_xtroot u._file._u2._xtroot | ||
125 | #define di_dxd u._file._u2._special._dxd | ||
126 | #define di_btroot di_xtroot | ||
127 | #define di_inlinedata u._file._u2._special._u | ||
128 | #define di_rdev u._file._u2._special._u._rdev | ||
129 | #define di_fastsymlink u._file._u2._special._u._fastsymlink | ||
130 | #define di_inlineea u._file._u2._special._inlineea | ||
131 | } u; | ||
132 | }; | ||
133 | |||
134 | /* extended mode bits (on-disk inode di_mode) */ | ||
135 | #define IFJOURNAL 0x00010000 /* journalled file */ | ||
136 | #define ISPARSE 0x00020000 /* sparse file enabled */ | ||
137 | #define INLINEEA 0x00040000 /* inline EA area free */ | ||
138 | #define ISWAPFILE 0x00800000 /* file open for pager swap space */ | ||
139 | |||
140 | /* more extended mode bits: attributes for OS/2 */ | ||
141 | #define IREADONLY 0x02000000 /* no write access to file */ | ||
142 | #define IARCHIVE 0x40000000 /* file archive bit */ | ||
143 | #define ISYSTEM 0x08000000 /* system file */ | ||
144 | #define IHIDDEN 0x04000000 /* hidden file */ | ||
145 | #define IRASH 0x4E000000 /* mask for changeable attributes */ | ||
146 | #define INEWNAME 0x80000000 /* non-8.3 filename format */ | ||
147 | #define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */ | ||
148 | #define ATTRSHIFT 25 /* bits to shift to move attribute | ||
149 | specification to mode position */ | ||
150 | |||
151 | #endif /*_H_JFS_DINODE */ | ||
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c new file mode 100644 index 000000000000..d86e467c6e42 --- /dev/null +++ b/fs/jfs/jfs_dmap.c | |||
@@ -0,0 +1,4272 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include "jfs_incore.h" | ||
21 | #include "jfs_superblock.h" | ||
22 | #include "jfs_dmap.h" | ||
23 | #include "jfs_imap.h" | ||
24 | #include "jfs_lock.h" | ||
25 | #include "jfs_metapage.h" | ||
26 | #include "jfs_debug.h" | ||
27 | |||
28 | /* | ||
29 | * Debug code for double-checking block map | ||
30 | */ | ||
31 | /* #define _JFS_DEBUG_DMAP 1 */ | ||
32 | |||
/*
 * Hooks for the optional block-map double-checking code.
 * With _JFS_DEBUG_DMAP defined each upper-case macro forwards to the static
 * function of the same (mixed-case) name declared below.  Otherwise the
 * macros expand to nothing, so their arguments are never evaluated and a
 * call site such as "DBALLOC(...);" is left as an empty statement.
 */
33 | #ifdef _JFS_DEBUG_DMAP | ||
34 | #define DBINITMAP(size,ipbmap,results) \ | ||
35 | DBinitmap(size,ipbmap,results) | ||
36 | #define DBALLOC(dbmap,mapsize,blkno,nblocks) \ | ||
37 | DBAlloc(dbmap,mapsize,blkno,nblocks) | ||
38 | #define DBFREE(dbmap,mapsize,blkno,nblocks) \ | ||
39 | DBFree(dbmap,mapsize,blkno,nblocks) | ||
40 | #define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \ | ||
41 | DBAllocCK(dbmap,mapsize,blkno,nblocks) | ||
42 | #define DBFREECK(dbmap,mapsize,blkno,nblocks) \ | ||
43 | DBFreeCK(dbmap,mapsize,blkno,nblocks) | ||
44 | |||
45 | static void DBinitmap(s64, struct inode *, u32 **); | ||
46 | static void DBAlloc(uint *, s64, s64, s64); | ||
47 | static void DBFree(uint *, s64, s64, s64); | ||
48 | static void DBAllocCK(uint *, s64, s64, s64); | ||
49 | static void DBFreeCK(uint *, s64, s64, s64); | ||
50 | #else | ||
51 | #define DBINITMAP(size,ipbmap,results) | ||
52 | #define DBALLOC(dbmap, mapsize, blkno, nblocks) | ||
53 | #define DBFREE(dbmap, mapsize, blkno, nblocks) | ||
54 | #define DBALLOCCK(dbmap, mapsize, blkno, nblocks) | ||
55 | #define DBFREECK(dbmap, mapsize, blkno, nblocks) | ||
56 | #endif /* _JFS_DEBUG_DMAP */ | ||
57 | |||
58 | /* | ||
59 | * SERIALIZATION of the Block Allocation Map. | ||
60 | * | ||
61 | * the working state of the block allocation map is accessed in | ||
62 | * two directions: | ||
63 | * | ||
64 | * 1) allocation and free requests that start at the dmap | ||
65 | * level and move up through the dmap control pages (i.e. | ||
66 | * the vast majority of requests). | ||
67 | * | ||
68 | * 2) allocation requests that start at dmap control page | ||
69 | * level and work down towards the dmaps. | ||
70 | * | ||
71 | * the serialization scheme used here is as follows. | ||
72 | * | ||
73 | * requests which start at the bottom are serialized against each | ||
74 | * other through buffers and each request holds onto its buffers | ||
75 | * as it works its way up from a single dmap to the required level | ||
76 | * of dmap control page. | ||
77 | * requests that start at the top are serialized against each other | ||
78 | * and requests that start from the bottom by the multiple read/single | ||
79 | * write inode lock of the bmap inode. requests starting at the top | ||
80 | * take this lock in write mode while requests starting at the bottom | ||
81 | * take the lock in read mode. a single top-down request may proceed | ||
82 | * exclusively while multiple bottom-up requests may proceed | ||
83 | * simultaneously (under the protection of busy buffers). | ||
84 | * | ||
85 | * in addition to information found in dmaps and dmap control pages, | ||
86 | * the working state of the block allocation map also includes read/ | ||
87 | * write information maintained in the bmap descriptor (i.e. total | ||
88 | * free block count, allocation group level free block counts). | ||
89 | * a single exclusive lock (BMAP_LOCK) is used to guard this information | ||
90 | * in the face of multiple bottom-up requests. | ||
91 | * (lock ordering: IREAD_LOCK, BMAP_LOCK); | ||
92 | * | ||
93 | * accesses to the persistent state of the block allocation map (limited | ||
94 | * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. | ||
95 | */ | ||
96 | |||
/*
 * BMAP_LOCK is the db_bmaplock semaphore of the bmap descriptor, initialised
 * with init_MUTEX() and taken/released with down()/up().
 * bmp is expanded without parentheses, so pass a plain pointer expression.
 */
97 | #define BMAP_LOCK_INIT(bmp) init_MUTEX(&bmp->db_bmaplock) | ||
98 | #define BMAP_LOCK(bmp) down(&bmp->db_bmaplock) | ||
99 | #define BMAP_UNLOCK(bmp) up(&bmp->db_bmaplock) | ||
100 | |||
101 | /* | ||
102 | * forward references | ||
103 | */ | ||
104 | static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
105 | int nblocks); | ||
106 | static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); | ||
107 | static void dbBackSplit(dmtree_t * tp, int leafno); | ||
108 | static void dbJoin(dmtree_t * tp, int leafno, int newval); | ||
109 | static void dbAdjTree(dmtree_t * tp, int leafno, int newval); | ||
110 | static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, | ||
111 | int level); | ||
112 | static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); | ||
113 | static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
114 | int nblocks); | ||
115 | static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
116 | int nblocks, | ||
117 | int l2nb, s64 * results); | ||
118 | static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
119 | int nblocks); | ||
120 | static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, | ||
121 | int l2nb, | ||
122 | s64 * results); | ||
123 | static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, | ||
124 | s64 * results); | ||
125 | static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, | ||
126 | s64 * results); | ||
127 | static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); | ||
128 | static int dbFindBits(u32 word, int l2nb); | ||
129 | static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); | ||
130 | static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); | ||
131 | static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
132 | int nblocks); | ||
133 | static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
134 | int nblocks); | ||
135 | static int dbMaxBud(u8 * cp); | ||
136 | s64 dbMapFileSizeToMapSize(struct inode *ipbmap); | ||
137 | static int blkstol2(s64 nb); | ||
138 | |||
139 | static int cntlz(u32 value); | ||
140 | static int cnttz(u32 word); | ||
141 | |||
142 | static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
143 | int nblocks); | ||
144 | static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); | ||
145 | static int dbInitDmapTree(struct dmap * dp); | ||
146 | static int dbInitTree(struct dmaptree * dtp); | ||
147 | static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); | ||
148 | static int dbGetL2AGSize(s64 nblocks); | ||
149 | |||
150 | /* | ||
151 | * buddy table | ||
152 | * | ||
153 | * table used for determining buddy sizes within characters of | ||
154 | * dmap bitmap words. the characters themselves serve as indexes | ||
155 | * into the table, with the table elements yielding the maximum | ||
156 | * binary buddy of free bits within the character. | ||
157 | */ | ||
158 | static s8 budtab[256] = { | ||
159 | 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | ||
160 | 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
161 | 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
162 | 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
163 | 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
164 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
165 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
166 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
167 | 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
168 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
169 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
170 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
171 | 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
172 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
173 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, | ||
174 | 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 | ||
175 | }; | ||
176 | |||
177 | |||
178 | /* | ||
179 | * NAME: dbMount() | ||
180 | * | ||
181 | * FUNCTION: initializate the block allocation map. | ||
182 | * | ||
183 | * memory is allocated for the in-core bmap descriptor and | ||
184 | * the in-core descriptor is initialized from disk. | ||
185 | * | ||
186 | * PARAMETERS: | ||
187 | * ipbmap - pointer to in-core inode for the block map. | ||
188 | * | ||
189 | * RETURN VALUES: | ||
190 | * 0 - success | ||
191 | * -ENOMEM - insufficient memory | ||
192 | * -EIO - i/o error | ||
193 | */ | ||
194 | int dbMount(struct inode *ipbmap) | ||
195 | { | ||
196 | struct bmap *bmp; | ||
197 | struct dbmap_disk *dbmp_le; | ||
198 | struct metapage *mp; | ||
199 | int i; | ||
200 | |||
201 | /* | ||
202 | * allocate/initialize the in-memory bmap descriptor | ||
203 | */ | ||
204 | /* allocate memory for the in-memory bmap descriptor */ | ||
205 | bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); | ||
206 | if (bmp == NULL) | ||
207 | return -ENOMEM; | ||
208 | |||
209 | /* read the on-disk bmap descriptor. */ | ||
210 | mp = read_metapage(ipbmap, | ||
211 | BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, | ||
212 | PSIZE, 0); | ||
213 | if (mp == NULL) { | ||
214 | kfree(bmp); | ||
215 | return -EIO; | ||
216 | } | ||
217 | |||
218 | /* copy the on-disk bmap descriptor to its in-memory version. */ | ||
219 | dbmp_le = (struct dbmap_disk *) mp->data; | ||
220 | bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); | ||
221 | bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); | ||
222 | bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); | ||
223 | bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); | ||
224 | bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); | ||
225 | bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); | ||
226 | bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); | ||
227 | bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); | ||
228 | bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); | ||
229 | bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); | ||
230 | bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); | ||
231 | bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); | ||
232 | for (i = 0; i < MAXAG; i++) | ||
233 | bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); | ||
234 | bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); | ||
235 | bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; | ||
236 | |||
237 | /* release the buffer. */ | ||
238 | release_metapage(mp); | ||
239 | |||
240 | /* bind the bmap inode and the bmap descriptor to each other. */ | ||
241 | bmp->db_ipbmap = ipbmap; | ||
242 | JFS_SBI(ipbmap->i_sb)->bmap = bmp; | ||
243 | |||
244 | memset(bmp->db_active, 0, sizeof(bmp->db_active)); | ||
245 | DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap); | ||
246 | |||
247 | /* | ||
248 | * allocate/initialize the bmap lock | ||
249 | */ | ||
250 | BMAP_LOCK_INIT(bmp); | ||
251 | |||
252 | return (0); | ||
253 | } | ||
254 | |||
255 | |||
256 | /* | ||
257 | * NAME: dbUnmount() | ||
258 | * | ||
259 | * FUNCTION: terminate the block allocation map in preparation for | ||
260 | * file system unmount. | ||
261 | * | ||
262 | * the in-core bmap descriptor is written to disk and | ||
263 | * the memory for this descriptor is freed. | ||
264 | * | ||
265 | * PARAMETERS: | ||
266 | * ipbmap - pointer to in-core inode for the block map. | ||
267 | * | ||
268 | * RETURN VALUES: | ||
269 | * 0 - success | ||
270 | * -EIO - i/o error | ||
271 | */ | ||
272 | int dbUnmount(struct inode *ipbmap, int mounterror) | ||
273 | { | ||
274 | struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; | ||
275 | int i; | ||
276 | |||
277 | if (!(mounterror || isReadOnly(ipbmap))) | ||
278 | dbSync(ipbmap); | ||
279 | |||
280 | /* | ||
281 | * Invalidate the page cache buffers | ||
282 | */ | ||
283 | truncate_inode_pages(ipbmap->i_mapping, 0); | ||
284 | |||
285 | /* | ||
286 | * Sanity Check | ||
287 | */ | ||
288 | for (i = 0; i < bmp->db_numag; i++) | ||
289 | if (atomic_read(&bmp->db_active[i])) | ||
290 | printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n", | ||
291 | i, atomic_read(&bmp->db_active[i])); | ||
292 | |||
293 | /* free the memory for the in-memory bmap. */ | ||
294 | kfree(bmp); | ||
295 | |||
296 | return (0); | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * dbSync() | ||
301 | */ | ||
302 | int dbSync(struct inode *ipbmap) | ||
303 | { | ||
304 | struct dbmap_disk *dbmp_le; | ||
305 | struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; | ||
306 | struct metapage *mp; | ||
307 | int i; | ||
308 | |||
309 | /* | ||
310 | * write bmap global control page | ||
311 | */ | ||
312 | /* get the buffer for the on-disk bmap descriptor. */ | ||
313 | mp = read_metapage(ipbmap, | ||
314 | BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, | ||
315 | PSIZE, 0); | ||
316 | if (mp == NULL) { | ||
317 | jfs_err("dbSync: read_metapage failed!"); | ||
318 | return -EIO; | ||
319 | } | ||
320 | /* copy the in-memory version of the bmap to the on-disk version */ | ||
321 | dbmp_le = (struct dbmap_disk *) mp->data; | ||
322 | dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); | ||
323 | dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); | ||
324 | dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); | ||
325 | dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); | ||
326 | dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); | ||
327 | dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); | ||
328 | dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); | ||
329 | dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); | ||
330 | dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); | ||
331 | dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); | ||
332 | dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); | ||
333 | dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); | ||
334 | for (i = 0; i < MAXAG; i++) | ||
335 | dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); | ||
336 | dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); | ||
337 | dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; | ||
338 | |||
339 | /* write the buffer */ | ||
340 | write_metapage(mp); | ||
341 | |||
342 | /* | ||
343 | * write out dirty pages of bmap | ||
344 | */ | ||
345 | filemap_fdatawrite(ipbmap->i_mapping); | ||
346 | filemap_fdatawait(ipbmap->i_mapping); | ||
347 | |||
348 | ipbmap->i_state |= I_DIRTY; | ||
349 | diWriteSpecial(ipbmap, 0); | ||
350 | |||
351 | return (0); | ||
352 | } | ||
353 | |||
354 | |||
355 | /* | ||
356 | * NAME: dbFree() | ||
357 | * | ||
358 | * FUNCTION: free the specified block range from the working block | ||
359 | * allocation map. | ||
360 | * | ||
361 | * the blocks will be free from the working map one dmap | ||
362 | * at a time. | ||
363 | * | ||
364 | * PARAMETERS: | ||
365 | * ip - pointer to in-core inode; | ||
366 | * blkno - starting block number to be freed. | ||
367 | * nblocks - number of blocks to be freed. | ||
368 | * | ||
369 | * RETURN VALUES: | ||
370 | * 0 - success | ||
371 | * -EIO - i/o error | ||
372 | */ | ||
373 | int dbFree(struct inode *ip, s64 blkno, s64 nblocks) | ||
374 | { | ||
375 | struct metapage *mp; | ||
376 | struct dmap *dp; | ||
377 | int nb, rc; | ||
378 | s64 lblkno, rem; | ||
379 | struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; | ||
380 | struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; | ||
381 | |||
382 | IREAD_LOCK(ipbmap); | ||
383 | |||
384 | /* block to be freed better be within the mapsize. */ | ||
385 | if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { | ||
386 | IREAD_UNLOCK(ipbmap); | ||
387 | printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", | ||
388 | (unsigned long long) blkno, | ||
389 | (unsigned long long) nblocks); | ||
390 | jfs_error(ip->i_sb, | ||
391 | "dbFree: block to be freed is outside the map"); | ||
392 | return -EIO; | ||
393 | } | ||
394 | |||
395 | /* | ||
396 | * free the blocks a dmap at a time. | ||
397 | */ | ||
398 | mp = NULL; | ||
399 | for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { | ||
400 | /* release previous dmap if any */ | ||
401 | if (mp) { | ||
402 | write_metapage(mp); | ||
403 | } | ||
404 | |||
405 | /* get the buffer for the current dmap. */ | ||
406 | lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); | ||
407 | mp = read_metapage(ipbmap, lblkno, PSIZE, 0); | ||
408 | if (mp == NULL) { | ||
409 | IREAD_UNLOCK(ipbmap); | ||
410 | return -EIO; | ||
411 | } | ||
412 | dp = (struct dmap *) mp->data; | ||
413 | |||
414 | /* determine the number of blocks to be freed from | ||
415 | * this dmap. | ||
416 | */ | ||
417 | nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); | ||
418 | |||
419 | DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); | ||
420 | |||
421 | /* free the blocks. */ | ||
422 | if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { | ||
423 | release_metapage(mp); | ||
424 | IREAD_UNLOCK(ipbmap); | ||
425 | return (rc); | ||
426 | } | ||
427 | |||
428 | DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); | ||
429 | } | ||
430 | |||
431 | /* write the last buffer. */ | ||
432 | write_metapage(mp); | ||
433 | |||
434 | IREAD_UNLOCK(ipbmap); | ||
435 | |||
436 | return (0); | ||
437 | } | ||
438 | |||
439 | |||
440 | /* | ||
441 | * NAME: dbUpdatePMap() | ||
442 | * | ||
443 | * FUNCTION: update the allocation state (free or allocate) of the | ||
444 | * specified block range in the persistent block allocation map. | ||
445 | * | ||
446 | * the blocks will be updated in the persistent map one | ||
447 | * dmap at a time. | ||
448 | * | ||
449 | * PARAMETERS: | ||
450 | * ipbmap - pointer to in-core inode for the block map. | ||
451 | * free - TRUE if block range is to be freed from the persistent | ||
452 | * map; FALSE if it is to be allocated. | ||
453 | * blkno - starting block number of the range. | ||
454 | * nblocks - number of contiguous blocks in the range. | ||
455 | * tblk - transaction block; | ||
456 | * | ||
457 | * RETURN VALUES: | ||
458 | * 0 - success | ||
459 | * -EIO - i/o error | ||
460 | */ | ||
461 | int | ||
462 | dbUpdatePMap(struct inode *ipbmap, | ||
463 | int free, s64 blkno, s64 nblocks, struct tblock * tblk) | ||
464 | { | ||
465 | int nblks, dbitno, wbitno, rbits; | ||
466 | int word, nbits, nwords; | ||
467 | struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; | ||
468 | s64 lblkno, rem, lastlblkno; | ||
469 | u32 mask; | ||
470 | struct dmap *dp; | ||
471 | struct metapage *mp; | ||
472 | struct jfs_log *log; | ||
473 | int lsn, difft, diffp; | ||
474 | |||
475 | /* the blocks better be within the mapsize. */ | ||
476 | if (blkno + nblocks > bmp->db_mapsize) { | ||
477 | printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", | ||
478 | (unsigned long long) blkno, | ||
479 | (unsigned long long) nblocks); | ||
480 | jfs_error(ipbmap->i_sb, | ||
481 | "dbUpdatePMap: blocks are outside the map"); | ||
482 | return -EIO; | ||
483 | } | ||
484 | |||
485 | /* compute delta of transaction lsn from log syncpt */ | ||
486 | lsn = tblk->lsn; | ||
487 | log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; | ||
488 | logdiff(difft, lsn, log); | ||
489 | |||
490 | /* | ||
491 | * update the block state a dmap at a time. | ||
492 | */ | ||
493 | mp = NULL; | ||
494 | lastlblkno = 0; | ||
495 | for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { | ||
496 | /* get the buffer for the current dmap. */ | ||
497 | lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); | ||
498 | if (lblkno != lastlblkno) { | ||
499 | if (mp) { | ||
500 | write_metapage(mp); | ||
501 | } | ||
502 | |||
503 | mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, | ||
504 | 0); | ||
505 | if (mp == NULL) | ||
506 | return -EIO; | ||
507 | } | ||
508 | dp = (struct dmap *) mp->data; | ||
509 | |||
510 | /* determine the bit number and word within the dmap of | ||
511 | * the starting block. also determine how many blocks | ||
512 | * are to be updated within this dmap. | ||
513 | */ | ||
514 | dbitno = blkno & (BPERDMAP - 1); | ||
515 | word = dbitno >> L2DBWORD; | ||
516 | nblks = min(rem, (s64)BPERDMAP - dbitno); | ||
517 | |||
518 | /* update the bits of the dmap words. the first and last | ||
519 | * words may only have a subset of their bits updated. if | ||
520 | * this is the case, we'll work against that word (i.e. | ||
521 | * partial first and/or last) only in a single pass. a | ||
522 | * single pass will also be used to update all words that | ||
523 | * are to have all their bits updated. | ||
524 | */ | ||
525 | for (rbits = nblks; rbits > 0; | ||
526 | rbits -= nbits, dbitno += nbits) { | ||
527 | /* determine the bit number within the word and | ||
528 | * the number of bits within the word. | ||
529 | */ | ||
530 | wbitno = dbitno & (DBWORD - 1); | ||
531 | nbits = min(rbits, DBWORD - wbitno); | ||
532 | |||
533 | /* check if only part of the word is to be updated. */ | ||
534 | if (nbits < DBWORD) { | ||
535 | /* update (free or allocate) the bits | ||
536 | * in this word. | ||
537 | */ | ||
538 | mask = | ||
539 | (ONES << (DBWORD - nbits) >> wbitno); | ||
540 | if (free) | ||
541 | dp->pmap[word] &= | ||
542 | cpu_to_le32(~mask); | ||
543 | else | ||
544 | dp->pmap[word] |= | ||
545 | cpu_to_le32(mask); | ||
546 | |||
547 | word += 1; | ||
548 | } else { | ||
549 | /* one or more words are to have all | ||
550 | * their bits updated. determine how | ||
551 | * many words and how many bits. | ||
552 | */ | ||
553 | nwords = rbits >> L2DBWORD; | ||
554 | nbits = nwords << L2DBWORD; | ||
555 | |||
556 | /* update (free or allocate) the bits | ||
557 | * in these words. | ||
558 | */ | ||
559 | if (free) | ||
560 | memset(&dp->pmap[word], 0, | ||
561 | nwords * 4); | ||
562 | else | ||
563 | memset(&dp->pmap[word], (int) ONES, | ||
564 | nwords * 4); | ||
565 | |||
566 | word += nwords; | ||
567 | } | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * update dmap lsn | ||
572 | */ | ||
573 | if (lblkno == lastlblkno) | ||
574 | continue; | ||
575 | |||
576 | lastlblkno = lblkno; | ||
577 | |||
578 | if (mp->lsn != 0) { | ||
579 | /* inherit older/smaller lsn */ | ||
580 | logdiff(diffp, mp->lsn, log); | ||
581 | if (difft < diffp) { | ||
582 | mp->lsn = lsn; | ||
583 | |||
584 | /* move bp after tblock in logsync list */ | ||
585 | LOGSYNC_LOCK(log); | ||
586 | list_move(&mp->synclist, &tblk->synclist); | ||
587 | LOGSYNC_UNLOCK(log); | ||
588 | } | ||
589 | |||
590 | /* inherit younger/larger clsn */ | ||
591 | LOGSYNC_LOCK(log); | ||
592 | logdiff(difft, tblk->clsn, log); | ||
593 | logdiff(diffp, mp->clsn, log); | ||
594 | if (difft > diffp) | ||
595 | mp->clsn = tblk->clsn; | ||
596 | LOGSYNC_UNLOCK(log); | ||
597 | } else { | ||
598 | mp->log = log; | ||
599 | mp->lsn = lsn; | ||
600 | |||
601 | /* insert bp after tblock in logsync list */ | ||
602 | LOGSYNC_LOCK(log); | ||
603 | |||
604 | log->count++; | ||
605 | list_add(&mp->synclist, &tblk->synclist); | ||
606 | |||
607 | mp->clsn = tblk->clsn; | ||
608 | LOGSYNC_UNLOCK(log); | ||
609 | } | ||
610 | } | ||
611 | |||
612 | /* write the last buffer. */ | ||
613 | if (mp) { | ||
614 | write_metapage(mp); | ||
615 | } | ||
616 | |||
617 | return (0); | ||
618 | } | ||
619 | |||
620 | |||
621 | /* | ||
622 | * NAME: dbNextAG() | ||
623 | * | ||
624 | * FUNCTION: find the preferred allocation group for new allocations. | ||
625 | * | ||
626 | * Within the allocation groups, we maintain a preferred | ||
627 | * allocation group which consists of a group with at least | ||
628 | * average free space. It is the preferred group that we target | ||
629 | * new inode allocation towards. The tie-in between inode | ||
630 | * allocation and block allocation occurs as we allocate the | ||
631 | * first (data) block of an inode and specify the inode (block) | ||
632 | * as the allocation hint for this block. | ||
633 | * | ||
634 | * We try to avoid having more than one open file growing in | ||
635 | * an allocation group, as this will lead to fragmentation. | ||
636 | * This differs from the old OS/2 method of trying to keep | ||
637 | * empty ags around for large allocations. | ||
638 | * | ||
639 | * PARAMETERS: | ||
640 | * ipbmap - pointer to in-core inode for the block map. | ||
641 | * | ||
642 | * RETURN VALUES: | ||
643 | * the preferred allocation group number. | ||
644 | */ | ||
645 | int dbNextAG(struct inode *ipbmap) | ||
646 | { | ||
647 | s64 avgfree; | ||
648 | int agpref; | ||
649 | s64 hwm = 0; | ||
650 | int i; | ||
651 | int next_best = -1; | ||
652 | struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; | ||
653 | |||
654 | BMAP_LOCK(bmp); | ||
655 | |||
656 | /* determine the average number of free blocks within the ags. */ | ||
657 | avgfree = (u32)bmp->db_nfree / bmp->db_numag; | ||
658 | |||
659 | /* | ||
660 | * if the current preferred ag does not have an active allocator | ||
661 | * and has at least average freespace, return it | ||
662 | */ | ||
663 | agpref = bmp->db_agpref; | ||
664 | if ((atomic_read(&bmp->db_active[agpref]) == 0) && | ||
665 | (bmp->db_agfree[agpref] >= avgfree)) | ||
666 | goto unlock; | ||
667 | |||
668 | /* From the last preferred ag, find the next one with at least | ||
669 | * average free space. | ||
670 | */ | ||
671 | for (i = 0 ; i < bmp->db_numag; i++, agpref++) { | ||
672 | if (agpref == bmp->db_numag) | ||
673 | agpref = 0; | ||
674 | |||
675 | if (atomic_read(&bmp->db_active[agpref])) | ||
676 | /* open file is currently growing in this ag */ | ||
677 | continue; | ||
678 | if (bmp->db_agfree[agpref] >= avgfree) { | ||
679 | /* Return this one */ | ||
680 | bmp->db_agpref = agpref; | ||
681 | goto unlock; | ||
682 | } else if (bmp->db_agfree[agpref] > hwm) { | ||
683 | /* Less than avg. freespace, but best so far */ | ||
684 | hwm = bmp->db_agfree[agpref]; | ||
685 | next_best = agpref; | ||
686 | } | ||
687 | } | ||
688 | |||
689 | /* | ||
690 | * If no inactive ag was found with average freespace, use the | ||
691 | * next best | ||
692 | */ | ||
693 | if (next_best != -1) | ||
694 | bmp->db_agpref = next_best; | ||
695 | /* else leave db_agpref unchanged */ | ||
696 | unlock: | ||
697 | BMAP_UNLOCK(bmp); | ||
698 | |||
699 | /* return the preferred group. | ||
700 | */ | ||
701 | return (bmp->db_agpref); | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * NAME: dbAlloc() | ||
706 | * | ||
707 | * FUNCTION: attempt to allocate a specified number of contiguous free | ||
708 | * blocks from the working allocation block map. | ||
709 | * | ||
710 | * the block allocation policy uses hints and a multi-step | ||
711 | * approach. | ||
712 | * | ||
713 | * for allocation requests smaller than the number of blocks | ||
714 | * per dmap, we first try to allocate the new blocks | ||
715 | * immediately following the hint. if these blocks are not | ||
716 | * available, we try to allocate blocks near the hint. if | ||
717 | * no blocks near the hint are available, we next try to | ||
718 | * allocate within the same dmap as contains the hint. | ||
719 | * | ||
720 | * if no blocks are available in the dmap or the allocation | ||
721 | * request is larger than the dmap size, we try to allocate | ||
722 | * within the same allocation group as contains the hint. if | ||
723 | * this does not succeed, we finally try to allocate anywhere | ||
724 | * within the aggregate. | ||
725 | * | ||
726 | * we also try to allocate anywhere within the aggregate for | ||
727 | * for allocation requests larger than the allocation group | ||
728 | * size or requests that specify no hint value. | ||
729 | * | ||
730 | * PARAMETERS: | ||
731 | * ip - pointer to in-core inode; | ||
732 | * hint - allocation hint. | ||
733 | * nblocks - number of contiguous blocks in the range. | ||
734 | * results - on successful return, set to the starting block number | ||
735 | * of the newly allocated contiguous range. | ||
736 | * | ||
737 | * RETURN VALUES: | ||
738 | * 0 - success | ||
739 | * -ENOSPC - insufficient disk resources | ||
740 | * -EIO - i/o error | ||
741 | */ | ||
742 | int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) | ||
743 | { | ||
744 | int rc, agno; | ||
745 | struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; | ||
746 | struct bmap *bmp; | ||
747 | struct metapage *mp; | ||
748 | s64 lblkno, blkno; | ||
749 | struct dmap *dp; | ||
750 | int l2nb; | ||
751 | s64 mapSize; | ||
752 | int writers; | ||
753 | |||
754 | /* assert that nblocks is valid */ | ||
755 | assert(nblocks > 0); | ||
756 | |||
757 | #ifdef _STILL_TO_PORT | ||
758 | /* DASD limit check F226941 */ | ||
759 | if (OVER_LIMIT(ip, nblocks)) | ||
760 | return -ENOSPC; | ||
761 | #endif /* _STILL_TO_PORT */ | ||
762 | |||
763 | /* get the log2 number of blocks to be allocated. | ||
764 | * if the number of blocks is not a log2 multiple, | ||
765 | * it will be rounded up to the next log2 multiple. | ||
766 | */ | ||
767 | l2nb = BLKSTOL2(nblocks); | ||
768 | |||
769 | bmp = JFS_SBI(ip->i_sb)->bmap; | ||
770 | |||
771 | //retry: /* serialize w.r.t.extendfs() */ | ||
772 | mapSize = bmp->db_mapsize; | ||
773 | |||
774 | /* the hint should be within the map */ | ||
775 | if (hint >= mapSize) { | ||
776 | jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); | ||
777 | return -EIO; | ||
778 | } | ||
779 | |||
780 | /* if the number of blocks to be allocated is greater than the | ||
781 | * allocation group size, try to allocate anywhere. | ||
782 | */ | ||
783 | if (l2nb > bmp->db_agl2size) { | ||
784 | IWRITE_LOCK(ipbmap); | ||
785 | |||
786 | rc = dbAllocAny(bmp, nblocks, l2nb, results); | ||
787 | if (rc == 0) { | ||
788 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, | ||
789 | nblocks); | ||
790 | } | ||
791 | |||
792 | goto write_unlock; | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * If no hint, let dbNextAG recommend an allocation group | ||
797 | */ | ||
798 | if (hint == 0) | ||
799 | goto pref_ag; | ||
800 | |||
801 | /* we would like to allocate close to the hint. adjust the | ||
802 | * hint to the block following the hint since the allocators | ||
803 | * will start looking for free space starting at this point. | ||
804 | */ | ||
805 | blkno = hint + 1; | ||
806 | |||
807 | if (blkno >= bmp->db_mapsize) | ||
808 | goto pref_ag; | ||
809 | |||
810 | agno = blkno >> bmp->db_agl2size; | ||
811 | |||
812 | /* check if blkno crosses over into a new allocation group. | ||
813 | * if so, check if we should allow allocations within this | ||
814 | * allocation group. | ||
815 | */ | ||
816 | if ((blkno & (bmp->db_agsize - 1)) == 0) | ||
817 | /* check if the AG is currenly being written to. | ||
818 | * if so, call dbNextAG() to find a non-busy | ||
819 | * AG with sufficient free space. | ||
820 | */ | ||
821 | if (atomic_read(&bmp->db_active[agno])) | ||
822 | goto pref_ag; | ||
823 | |||
824 | /* check if the allocation request size can be satisfied from a | ||
825 | * single dmap. if so, try to allocate from the dmap containing | ||
826 | * the hint using a tiered strategy. | ||
827 | */ | ||
828 | if (nblocks <= BPERDMAP) { | ||
829 | IREAD_LOCK(ipbmap); | ||
830 | |||
831 | /* get the buffer for the dmap containing the hint. | ||
832 | */ | ||
833 | rc = -EIO; | ||
834 | lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); | ||
835 | mp = read_metapage(ipbmap, lblkno, PSIZE, 0); | ||
836 | if (mp == NULL) | ||
837 | goto read_unlock; | ||
838 | |||
839 | dp = (struct dmap *) mp->data; | ||
840 | |||
841 | /* first, try to satisfy the allocation request with the | ||
842 | * blocks beginning at the hint. | ||
843 | */ | ||
844 | if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks)) | ||
845 | != -ENOSPC) { | ||
846 | if (rc == 0) { | ||
847 | *results = blkno; | ||
848 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, | ||
849 | *results, nblocks); | ||
850 | mark_metapage_dirty(mp); | ||
851 | } | ||
852 | |||
853 | release_metapage(mp); | ||
854 | goto read_unlock; | ||
855 | } | ||
856 | |||
857 | writers = atomic_read(&bmp->db_active[agno]); | ||
858 | if ((writers > 1) || | ||
859 | ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) { | ||
860 | /* | ||
861 | * Someone else is writing in this allocation | ||
862 | * group. To avoid fragmenting, try another ag | ||
863 | */ | ||
864 | release_metapage(mp); | ||
865 | IREAD_UNLOCK(ipbmap); | ||
866 | goto pref_ag; | ||
867 | } | ||
868 | |||
869 | /* next, try to satisfy the allocation request with blocks | ||
870 | * near the hint. | ||
871 | */ | ||
872 | if ((rc = | ||
873 | dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) | ||
874 | != -ENOSPC) { | ||
875 | if (rc == 0) { | ||
876 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, | ||
877 | *results, nblocks); | ||
878 | mark_metapage_dirty(mp); | ||
879 | } | ||
880 | |||
881 | release_metapage(mp); | ||
882 | goto read_unlock; | ||
883 | } | ||
884 | |||
885 | /* try to satisfy the allocation request with blocks within | ||
886 | * the same dmap as the hint. | ||
887 | */ | ||
888 | if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) | ||
889 | != -ENOSPC) { | ||
890 | if (rc == 0) { | ||
891 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, | ||
892 | *results, nblocks); | ||
893 | mark_metapage_dirty(mp); | ||
894 | } | ||
895 | |||
896 | release_metapage(mp); | ||
897 | goto read_unlock; | ||
898 | } | ||
899 | |||
900 | release_metapage(mp); | ||
901 | IREAD_UNLOCK(ipbmap); | ||
902 | } | ||
903 | |||
904 | /* try to satisfy the allocation request with blocks within | ||
905 | * the same allocation group as the hint. | ||
906 | */ | ||
907 | IWRITE_LOCK(ipbmap); | ||
908 | if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) | ||
909 | != -ENOSPC) { | ||
910 | if (rc == 0) | ||
911 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, | ||
912 | *results, nblocks); | ||
913 | goto write_unlock; | ||
914 | } | ||
915 | IWRITE_UNLOCK(ipbmap); | ||
916 | |||
917 | |||
918 | pref_ag: | ||
919 | /* | ||
920 | * Let dbNextAG recommend a preferred allocation group | ||
921 | */ | ||
922 | agno = dbNextAG(ipbmap); | ||
923 | IWRITE_LOCK(ipbmap); | ||
924 | |||
925 | /* Try to allocate within this allocation group. if that fails, try to | ||
926 | * allocate anywhere in the map. | ||
927 | */ | ||
928 | if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) | ||
929 | rc = dbAllocAny(bmp, nblocks, l2nb, results); | ||
930 | if (rc == 0) { | ||
931 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks); | ||
932 | } | ||
933 | |||
934 | write_unlock: | ||
935 | IWRITE_UNLOCK(ipbmap); | ||
936 | |||
937 | return (rc); | ||
938 | |||
939 | read_unlock: | ||
940 | IREAD_UNLOCK(ipbmap); | ||
941 | |||
942 | return (rc); | ||
943 | } | ||
944 | |||
#ifdef _NOTYET
/*
 * NAME:	dbAllocExact()
 *
 * FUNCTION:	try to allocate exactly the requested extent;
 *
 * PARAMETERS:
 *	ip	- pointer to in-core inode;
 *	blkno	- extent address;
 *	nblocks	- extent length;
 *
 * RETURN VALUES:
 *	0	- success
 *	-EINVAL	- the request is not positive, is larger than a dmap,
 *		  or starts outside the map
 *	-ENOSPC	- insufficient disk resources
 *	-EIO	- i/o error
 */
int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
{
	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
	struct metapage *mp;
	struct dmap *dp;
	s64 dmap_lblk;
	int rc;

	IREAD_LOCK(ipbmap);

	/*
	 * validate extent request:
	 *
	 * note: defragfs policy:
	 *  max 64 blocks will be moved.
	 *  allocation request size must be satisfied from a single dmap.
	 */
	if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
		rc = -EINVAL;
		goto unlock;
	}

	/* the free space is no longer available */
	if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
		rc = -ENOSPC;
		goto unlock;
	}

	/* read in the dmap covering the extent */
	dmap_lblk = BLKTODMAP(blkno, bmp->db_l2nbperpage);
	mp = read_metapage(ipbmap, dmap_lblk, PSIZE, 0);
	if (mp == NULL) {
		rc = -EIO;
		goto unlock;
	}
	dp = (struct dmap *) mp->data;

	/* try to allocate the requested extent */
	rc = dbAllocNext(bmp, dp, blkno, nblocks);

	IREAD_UNLOCK(ipbmap);

	/* the page is dirtied only when the extent was obtained */
	if (rc == 0) {
		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
		mark_metapage_dirty(mp);
	}
	release_metapage(mp);

	return (rc);

      unlock:
	IREAD_UNLOCK(ipbmap);

	return (rc);
}
#endif /* _NOTYET */
1013 | |||
1014 | /* | ||
1015 | * NAME: dbReAlloc() | ||
1016 | * | ||
1017 | * FUNCTION: attempt to extend a current allocation by a specified | ||
1018 | * number of blocks. | ||
1019 | * | ||
1020 | * this routine attempts to satisfy the allocation request | ||
1021 | * by first trying to extend the existing allocation in | ||
1022 | * place by allocating the additional blocks as the blocks | ||
1023 | * immediately following the current allocation. if these | ||
1024 | * blocks are not available, this routine will attempt to | ||
1025 | * allocate a new set of contiguous blocks large enough | ||
1026 | * to cover the existing allocation plus the additional | ||
1027 | * number of blocks required. | ||
1028 | * | ||
1029 | * PARAMETERS: | ||
1030 | * ip - pointer to in-core inode requiring allocation. | ||
1031 | * blkno - starting block of the current allocation. | ||
1032 | * nblocks - number of contiguous blocks within the current | ||
1033 | * allocation. | ||
1034 | * addnblocks - number of blocks to add to the allocation. | ||
1035 | * results - on successful return, set to the starting block number | ||
1036 | * of the existing allocation if the existing allocation | ||
1037 | * was extended in place or to a newly allocated contiguous | ||
1038 | * range if the existing allocation could not be extended | ||
1039 | * in place. | ||
1040 | * | ||
1041 | * RETURN VALUES: | ||
1042 | * 0 - success | ||
1043 | * -ENOSPC - insufficient disk resources | ||
1044 | * -EIO - i/o error | ||
1045 | */ | ||
1046 | int | ||
1047 | dbReAlloc(struct inode *ip, | ||
1048 | s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) | ||
1049 | { | ||
1050 | int rc; | ||
1051 | |||
1052 | /* try to extend the allocation in place. | ||
1053 | */ | ||
1054 | if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { | ||
1055 | *results = blkno; | ||
1056 | return (0); | ||
1057 | } else { | ||
1058 | if (rc != -ENOSPC) | ||
1059 | return (rc); | ||
1060 | } | ||
1061 | |||
1062 | /* could not extend the allocation in place, so allocate a | ||
1063 | * new set of blocks for the entire request (i.e. try to get | ||
1064 | * a range of contiguous blocks large enough to cover the | ||
1065 | * existing allocation plus the additional blocks.) | ||
1066 | */ | ||
1067 | return (dbAlloc | ||
1068 | (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); | ||
1069 | } | ||
1070 | |||
1071 | |||
1072 | /* | ||
1073 | * NAME: dbExtend() | ||
1074 | * | ||
1075 | * FUNCTION: attempt to extend a current allocation by a specified | ||
1076 | * number of blocks. | ||
1077 | * | ||
1078 | * this routine attempts to satisfy the allocation request | ||
1079 | * by first trying to extend the existing allocation in | ||
1080 | * place by allocating the additional blocks as the blocks | ||
1081 | * immediately following the current allocation. | ||
1082 | * | ||
1083 | * PARAMETERS: | ||
1084 | * ip - pointer to in-core inode requiring allocation. | ||
1085 | * blkno - starting block of the current allocation. | ||
1086 | * nblocks - number of contiguous blocks within the current | ||
1087 | * allocation. | ||
1088 | * addnblocks - number of blocks to add to the allocation. | ||
1089 | * | ||
1090 | * RETURN VALUES: | ||
1091 | * 0 - success | ||
1092 | * -ENOSPC - insufficient disk resources | ||
1093 | * -EIO - i/o error | ||
1094 | */ | ||
1095 | static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) | ||
1096 | { | ||
1097 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
1098 | s64 lblkno, lastblkno, extblkno; | ||
1099 | uint rel_block; | ||
1100 | struct metapage *mp; | ||
1101 | struct dmap *dp; | ||
1102 | int rc; | ||
1103 | struct inode *ipbmap = sbi->ipbmap; | ||
1104 | struct bmap *bmp; | ||
1105 | |||
1106 | /* | ||
1107 | * We don't want a non-aligned extent to cross a page boundary: if the current allocation does not start on a page boundary (rel_block != 0) and the extended extent would run past the end of that page (nbperpage blocks), refuse the in-place extension with -ENOSPC. | ||
1108 | */ | ||
1109 | if (((rel_block = blkno & (sbi->nbperpage - 1))) && | ||
1110 | (rel_block + nblocks + addnblocks > sbi->nbperpage)) | ||
1111 | return -ENOSPC; | ||
1112 | |||
1113 | /* get the last block of the current allocation */ | ||
1114 | lastblkno = blkno + nblocks - 1; | ||
1115 | |||
1116 | /* determine the block number of the block following | ||
1117 | * the existing allocation. the read lock on the block map inode taken just below is dropped again on every return path of this function. | ||
1118 | */ | ||
1119 | extblkno = lastblkno + 1; | ||
1120 | |||
1121 | IREAD_LOCK(ipbmap); | ||
1122 | |||
1123 | /* the current allocation better end within the file system (0 <= lastblkno < db_mapsize); if not, report a filesystem error and fail with -EIO */ | ||
1124 | bmp = sbi->bmap; | ||
1125 | if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { | ||
1126 | IREAD_UNLOCK(ipbmap); | ||
1127 | jfs_error(ip->i_sb, | ||
1128 | "dbExtend: the block is outside the filesystem"); | ||
1129 | return -EIO; | ||
1130 | } | ||
1131 | |||
1132 | /* we'll attempt to extend the current allocation in place by | ||
1133 | * allocating the additional blocks as the blocks immediately | ||
1134 | * following the current allocation. we only try to extend the | ||
1135 | * current allocation in place if the number of additional blocks | ||
1136 | * can fit into a dmap, the last block of the current allocation | ||
1137 | * is not the last block of the file system, and the start of the | ||
1138 | * inplace extension is not on an allocation group boundary. | ||
1139 | * any of these conditions failing yields -ENOSPC (not an error in the map itself). | ||
1140 | */ | ||
1141 | if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || | ||
1142 | (extblkno & (bmp->db_agsize - 1)) == 0) { | ||
1143 | IREAD_UNLOCK(ipbmap); | ||
1144 | return -ENOSPC; | ||
1145 | } | ||
1146 | |||
1147 | /* get the buffer for the dmap containing the first block | ||
1148 | * of the extension. a failed read is reported as -EIO. | ||
1149 | */ | ||
1150 | lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); | ||
1151 | mp = read_metapage(ipbmap, lblkno, PSIZE, 0); | ||
1152 | if (mp == NULL) { | ||
1153 | IREAD_UNLOCK(ipbmap); | ||
1154 | return -EIO; | ||
1155 | } | ||
1156 | |||
1157 | DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks); | ||
1158 | dp = (struct dmap *) mp->data; | ||
1159 | |||
1160 | /* try to allocate the blocks immediately following the | ||
1161 | * current allocation. addnblocks <= BPERDMAP was verified above, so the narrowing cast to int is presumably lossless (NOTE(review): assumes BPERDMAP fits in an int and addnblocks is not negative - confirm with callers). | ||
1162 | */ | ||
1163 | rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); | ||
1164 | |||
1165 | IREAD_UNLOCK(ipbmap); | ||
1166 | |||
1167 | /* were we successful ? if so the dmap page is handed to write_metapage(); otherwise it is only released. */ | ||
1168 | if (rc == 0) { | ||
1169 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, extblkno, | ||
1170 | addnblocks); | ||
1171 | write_metapage(mp); | ||
1172 | } else | ||
1173 | /* we were not successful */ | ||
1174 | release_metapage(mp); | ||
1175 | |||
1176 | |||
1177 | return (rc); | ||
1178 | } | ||
1178 | |||
1179 | |||
1180 | /* | ||
1181 | * NAME: dbAllocNext() | ||
1182 | * | ||
1183 | * FUNCTION: attempt to allocate the blocks of the specified block | ||
1184 | * range within a dmap. the range is first verified to be entirely free; only then is dbAllocDmap() called to allocate it. | ||
1185 | * | ||
1186 | * PARAMETERS: | ||
1187 | * bmp - pointer to bmap descriptor | ||
1188 | * dp - pointer to dmap. | ||
1189 | * blkno - starting block number of the range. | ||
1190 | * nblocks - number of contiguous free blocks of the range. | ||
1191 | * | ||
1192 | * RETURN VALUES: | ||
1193 | * 0 - success | ||
1194 | * -ENOSPC - insufficient disk resources (the range extends past the end of this dmap, or some block of the range is not free) | ||
1195 | * -EIO - i/o error (also returned when the dmap's leafidx is not LEAFIND, which is reported as a corrupt dmap page) | ||
1196 | * | ||
1197 | * serialization: IREAD_LOCK(ipbmap) held on entry/exit; | ||
1198 | */ | ||
1199 | static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
1200 | int nblocks) | ||
1201 | { | ||
1202 | int dbitno, word, rembits, nb, nwords, wbitno, nw; | ||
1203 | int l2size; | ||
1204 | s8 *leaf; | ||
1205 | u32 mask; | ||
1206 | |||
1207 | if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { | ||
1208 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1209 | "dbAllocNext: Corrupt dmap page"); | ||
1210 | return -EIO; | ||
1211 | } | ||
1212 | |||
1213 | /* pick up a pointer to the leaves of the dmap tree. | ||
1214 | */ | ||
1215 | leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); | ||
1216 | |||
1217 | /* determine the bit number and word within the dmap of the | ||
1218 | * starting block. | ||
1219 | */ | ||
1220 | dbitno = blkno & (BPERDMAP - 1); | ||
1221 | word = dbitno >> L2DBWORD; | ||
1222 | |||
1223 | /* check if the specified block range is contained within | ||
1224 | * this dmap. | ||
1225 | */ | ||
1226 | if (dbitno + nblocks > BPERDMAP) | ||
1227 | return -ENOSPC; | ||
1228 | |||
1229 | /* check if the starting leaf indicates that anything | ||
1230 | * is free. | ||
1231 | */ | ||
1232 | if (leaf[word] == NOFREE) | ||
1233 | return -ENOSPC; | ||
1234 | |||
1235 | /* check the dmaps words corresponding to block range to see | ||
1236 | * if the block range is free. not all bits of the first and | ||
1237 | * last words may be contained within the block range. if this | ||
1238 | * is the case, we'll work against those words (i.e. partial first | ||
1239 | * and/or last) on an individual basis (a single pass) and examine | ||
1240 | * the actual bits to determine if they are free. a single pass | ||
1241 | * will be used for all dmap words fully contained within the | ||
1242 | * specified range. within this pass, the leaves of the dmap | ||
1243 | * tree will be examined to determine if the blocks are free. a | ||
1244 | * single leaf may describe the free space of multiple dmap | ||
1245 | * words, so we may visit only a subset of the actual leaves | ||
1246 | * corresponding to the dmap words of the block range. | ||
1247 | */ | ||
1248 | for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { | ||
1249 | /* determine the bit number within the word and | ||
1250 | * the number of bits within the word. | ||
1251 | */ | ||
1252 | wbitno = dbitno & (DBWORD - 1); | ||
1253 | nb = min(rembits, DBWORD - wbitno); | ||
1254 | |||
1255 | /* check if only part of the word is to be examined. | ||
1256 | */ | ||
1257 | if (nb < DBWORD) { | ||
1258 | /* check if the bits are free: the mask selects nb bits beginning at bit wbitno of the word, and every selected bit must be zero in the working map word (presumably ONES is an all-ones word, so bit positions count from the most significant bit - TODO confirm). | ||
1259 | */ | ||
1260 | mask = (ONES << (DBWORD - nb) >> wbitno); | ||
1261 | if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask) | ||
1262 | return -ENOSPC; | ||
1263 | |||
1264 | word += 1; | ||
1265 | } else { | ||
1266 | /* one or more dmap words are fully contained | ||
1267 | * within the block range. determine how many | ||
1268 | * words and how many bits. | ||
1269 | */ | ||
1270 | nwords = rembits >> L2DBWORD; | ||
1271 | nb = nwords << L2DBWORD; | ||
1272 | |||
1273 | /* now examine the appropriate leaves to determine | ||
1274 | * if the blocks are free. | ||
1275 | */ | ||
1276 | while (nwords > 0) { | ||
1277 | /* does the leaf describe a completely free word ? a leaf value below BUDMIN means not all bits of the word are free, so the range cannot be free. | ||
1278 | */ | ||
1279 | if (leaf[word] < BUDMIN) | ||
1280 | return -ENOSPC; | ||
1281 | |||
1282 | /* determine the l2 number of bits provided | ||
1283 | * by this leaf, capped by what the remaining nwords words require. | ||
1284 | */ | ||
1285 | l2size = | ||
1286 | min((int)leaf[word], NLSTOL2BSZ(nwords)); | ||
1287 | |||
1288 | /* determine how many words were handled. | ||
1289 | */ | ||
1290 | nw = BUDSIZE(l2size, BUDMIN); | ||
1291 | |||
1292 | nwords -= nw; | ||
1293 | word += nw; | ||
1294 | } | ||
1295 | } | ||
1296 | } | ||
1297 | |||
1298 | /* allocate the blocks. the result of dbAllocDmap() is returned unchanged. | ||
1299 | */ | ||
1300 | return (dbAllocDmap(bmp, dp, blkno, nblocks)); | ||
1301 | } | ||
1302 | |||
1303 | |||
1304 | /* | ||
1305 | * NAME: dbAllocNear() | ||
1306 | * | ||
1307 | * FUNCTION: attempt to allocate a number of contiguous free blocks near | ||
1308 | * a specified block (hint) within a dmap. | ||
1309 | * | ||
1310 | * starting with the dmap leaf that covers the hint, we'll | ||
1311 | * check up to four contiguous leaves (the hint's leaf and the following three, clipped at LPERDMAP) for sufficient free | ||
1312 | * space. if sufficient free space is found, we'll allocate | ||
1313 | * the desired free space. | ||
1314 | * | ||
1315 | * PARAMETERS: | ||
1316 | * bmp - pointer to bmap descriptor | ||
1317 | * dp - pointer to dmap. | ||
1318 | * blkno - block number to allocate near. | ||
1319 | * nblocks - actual number of contiguous free blocks desired. | ||
1320 | * l2nb - log2 number of contiguous free blocks desired. | ||
1321 | * results - on successful return, set to the starting block number | ||
1322 | * of the newly allocated range. | ||
1323 | * | ||
1324 | * RETURN VALUES: | ||
1325 | * 0 - success | ||
1326 | * -ENOSPC - insufficient disk resources (none of the examined leaves describes enough free space) | ||
1327 | * -EIO - i/o error (also returned when the dmap's leafidx is not LEAFIND, which is reported as a corrupt dmap page) | ||
1328 | * | ||
1329 | * serialization: IREAD_LOCK(ipbmap) held on entry/exit; | ||
1330 | */ | ||
1331 | static int | ||
1332 | dbAllocNear(struct bmap * bmp, | ||
1333 | struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results) | ||
1334 | { | ||
1335 | int word, lword, rc; | ||
1336 | s8 *leaf; | ||
1337 | |||
1338 | if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { | ||
1339 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1340 | "dbAllocNear: Corrupt dmap page"); | ||
1341 | return -EIO; | ||
1342 | } | ||
1343 | |||
1344 | leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); | ||
1345 | |||
1346 | /* determine the word within the dmap that holds the hint | ||
1347 | * (i.e. blkno). also, determine the last word in the dmap | ||
1348 | * that we'll include in our examination (lword is exclusive: the loop below stops before it). | ||
1349 | */ | ||
1350 | word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; | ||
1351 | lword = min(word + 4, LPERDMAP); | ||
1352 | |||
1353 | /* examine the leaves for sufficient free space. | ||
1354 | */ | ||
1355 | for (; word < lword; word++) { | ||
1356 | /* does the leaf describe sufficient free space ? | ||
1357 | */ | ||
1358 | if (leaf[word] < l2nb) | ||
1359 | continue; | ||
1360 | |||
1361 | /* determine the block number within the file system | ||
1362 | * of the first block described by this dmap word. | ||
1363 | */ | ||
1364 | blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); | ||
1365 | |||
1366 | /* if not all bits of the dmap word are free, get the | ||
1367 | * starting bit number within the dmap word of the required | ||
1368 | * string of free bits and adjust the block number with the | ||
1369 | * value. | ||
1370 | */ | ||
1371 | if (leaf[word] < BUDMIN) | ||
1372 | blkno += | ||
1373 | dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); | ||
1374 | |||
1375 | /* allocate the blocks. the search ends at this first qualifying leaf whether or not dbAllocDmap() succeeds; *results is only set on success. | ||
1376 | */ | ||
1377 | if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) | ||
1378 | *results = blkno; | ||
1379 | |||
1380 | return (rc); | ||
1381 | } | ||
1382 | |||
1383 | return -ENOSPC; | ||
1384 | } | ||
1385 | |||
1386 | |||
1387 | /* | ||
1388 | * NAME: dbAllocAG() | ||
1389 | * | ||
1390 | * FUNCTION: attempt to allocate the specified number of contiguous | ||
1391 | * free blocks within the specified allocation group. | ||
1392 | * | ||
1393 | * unless the allocation group size is equal to the number | ||
1394 | * of blocks per dmap, the dmap control pages will be used to | ||
1395 | * find the required free space, if available. we start the | ||
1396 | * search at the highest dmap control page level which | ||
1397 | * distinctly describes the allocation group's free space | ||
1398 | * (i.e. the highest level at which the allocation group's | ||
1399 | * free space is not mixed in with that of any other group). | ||
1400 | * in addition, we start the search within this level at a | ||
1401 | * height of the dmapctl dmtree at which the nodes distinctly | ||
1402 | * describe the allocation group's free space. at this height, | ||
1403 | * the allocation group's free space may be represented by 1 | ||
1404 | * or two sub-trees, depending on the allocation group size. | ||
1405 | * we search the top nodes of these subtrees left to right for | ||
1406 | * sufficient free space. if sufficient free space is found, | ||
1407 | * the subtree is searched to find the leftmost leaf that | ||
1408 | * has free space. once we have made it to the leaf, we | ||
1409 | * move the search to the next lower level dmap control page | ||
1410 | * corresponding to this leaf. we continue down the dmap control | ||
1411 | * pages until we find the dmap that contains or starts the | ||
1412 | * sufficient free space and we allocate at this dmap. | ||
1413 | * | ||
1414 | * if the allocation group size is equal to the dmap size, | ||
1415 | * we'll start at the dmap corresponding to the allocation | ||
1416 | * group and attempt the allocation at this level. | ||
1417 | * | ||
1418 | * the dmap control page search is also not performed if the | ||
1419 | * allocation group is completely free and we go to the first | ||
1420 | * dmap of the allocation group to do the allocation. this is | ||
1421 | * done because the allocation group may be part (not the first | ||
1422 | * part) of a larger binary buddy system, causing the dmap | ||
1423 | * control pages to indicate no free space (NOFREE) within | ||
1424 | * the allocation group. | ||
1425 | * | ||
1426 | * PARAMETERS: | ||
1427 | * bmp - pointer to bmap descriptor | ||
1428 | * agno - allocation group number. | ||
1429 | * nblocks - actual number of contiguous free blocks desired. | ||
1430 | * l2nb - log2 number of contiguous free blocks desired. | ||
1431 | * results - on successful return, set to the starting block number | ||
1432 | * of the newly allocated range. | ||
1433 | * | ||
1434 | * RETURN VALUES: | ||
1435 | * 0 - success | ||
1436 | * -ENOSPC - insufficient disk resources | ||
1437 | * -EIO - i/o error. also returned when the request exceeds the allocation group size, when a control page looks corrupt, and when the control page promised space but a lower level or dbAllocCtl() then reported -ENOSPC. | ||
1438 | * | ||
1439 | * note: IWRITE_LOCK(ipbmap) held on entry/exit; | ||
1440 | */ | ||
1441 | static int | ||
1442 | dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) | ||
1443 | { | ||
1444 | struct metapage *mp; | ||
1445 | struct dmapctl *dcp; | ||
1446 | int rc, ti, i, k, m, n, agperlev; | ||
1447 | s64 blkno, lblkno; | ||
1448 | int budmin; | ||
1449 | |||
1450 | /* allocation request should not be for more than the | ||
1451 | * allocation group size. | ||
1452 | */ | ||
1453 | if (l2nb > bmp->db_agl2size) { | ||
1454 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1455 | "dbAllocAG: allocation request is larger than the " | ||
1456 | "allocation group size"); | ||
1457 | return -EIO; | ||
1458 | } | ||
1459 | |||
1460 | /* determine the starting block number of the allocation | ||
1461 | * group. | ||
1462 | */ | ||
1463 | blkno = (s64) agno << bmp->db_agl2size; | ||
1464 | |||
1465 | /* check if the allocation group size is the minimum allocation | ||
1466 | * group size or if the allocation group is completely free. if | ||
1467 | * the allocation group size is the minimum size of BPERDMAP (i.e. | ||
1468 | * 1 dmap), there is no need to search the dmap control page (below) | ||
1469 | * that fully describes the allocation group since the allocation | ||
1470 | * group is already fully described by a dmap. in this case, we | ||
1471 | * just call dbAllocCtl() to search the dmap tree and allocate the | ||
1472 | * required space if available. | ||
1473 | * | ||
1474 | * if the allocation group is completely free, dbAllocCtl() is | ||
1475 | * also called to allocate the required space. this is done for | ||
1476 | * two reasons. first, it makes no sense searching the dmap control | ||
1477 | * pages for free space when we know that free space exists. second, | ||
1478 | * the dmap control pages may indicate that the allocation group | ||
1479 | * has no free space if the allocation group is part (not the first | ||
1480 | * part) of a larger binary buddy system. | ||
1481 | * an -ENOSPC from a completely free group is logged as a filesystem error, but rc is still returned as is. | ||
1482 | */ | ||
1483 | if (bmp->db_agsize == BPERDMAP | ||
1484 | || bmp->db_agfree[agno] == bmp->db_agsize) { | ||
1485 | rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); | ||
1486 | if ((rc == -ENOSPC) && | ||
1487 | (bmp->db_agfree[agno] == bmp->db_agsize)) { | ||
1488 | printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n", | ||
1489 | (unsigned long long) blkno, | ||
1490 | (unsigned long long) nblocks); | ||
1491 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1492 | "dbAllocAG: dbAllocCtl failed in free AG"); | ||
1493 | } | ||
1494 | return (rc); | ||
1495 | } | ||
1496 | |||
1497 | /* the buffer for the dmap control page that fully describes the | ||
1498 | * allocation group. | ||
1499 | */ | ||
1500 | lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel); | ||
1501 | mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); | ||
1502 | if (mp == NULL) | ||
1503 | return -EIO; | ||
1504 | dcp = (struct dmapctl *) mp->data; | ||
1505 | budmin = dcp->budmin; | ||
1506 | |||
1507 | if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { | ||
1508 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1509 | "dbAllocAG: Corrupt dmapctl page"); | ||
1510 | release_metapage(mp); | ||
1511 | return -EIO; | ||
1512 | } | ||
1513 | |||
1514 | /* search the subtree(s) of the dmap control page that describes | ||
1515 | * the allocation group, looking for sufficient free space. to begin, | ||
1516 | * determine how many allocation groups are represented in a dmap | ||
1517 | * control page at the control page level (i.e. L0, L1, L2) that | ||
1518 | * fully describes an allocation group. next, determine the starting | ||
1519 | * tree index of this allocation group within the control page. | ||
1520 | */ | ||
1521 | agperlev = | ||
1522 | (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; | ||
1523 | ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); | ||
1524 | |||
1525 | /* dmap control page trees fan-out by 4 and a single allocation | ||
1526 | * group may be described by 1 or 2 subtrees within the ag level | ||
1527 | * dmap control page, depending upon the ag size. examine the ag's | ||
1528 | * subtrees for sufficient free space, starting with the leftmost | ||
1529 | * subtree. the first subtree with enough space decides the outcome: every path below it returns. | ||
1530 | */ | ||
1531 | for (i = 0; i < bmp->db_agwidth; i++, ti++) { | ||
1532 | /* is there sufficient free space ? | ||
1533 | */ | ||
1534 | if (l2nb > dcp->stree[ti]) | ||
1535 | continue; | ||
1536 | |||
1537 | /* sufficient free space found in a subtree. now search down | ||
1538 | * the subtree to find the leftmost leaf that describes this | ||
1539 | * free space. the children of node ti are nodes (ti << 2) + 1 through (ti << 2) + 4; if none of them has enough space the tree is inconsistent and -EIO is returned. | ||
1540 | */ | ||
1541 | for (k = bmp->db_agheigth; k > 0; k--) { | ||
1542 | for (n = 0, m = (ti << 2) + 1; n < 4; n++) { | ||
1543 | if (l2nb <= dcp->stree[m + n]) { | ||
1544 | ti = m + n; | ||
1545 | break; | ||
1546 | } | ||
1547 | } | ||
1548 | if (n == 4) { | ||
1549 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1550 | "dbAllocAG: failed descending stree"); | ||
1551 | release_metapage(mp); | ||
1552 | return -EIO; | ||
1553 | } | ||
1554 | } | ||
1555 | |||
1556 | /* determine the block number within the file system | ||
1557 | * that corresponds to this leaf: first round blkno down to the start of the range covered by this control page, then add the leaf's offset scaled by budmin. | ||
1558 | */ | ||
1559 | if (bmp->db_aglevel == 2) | ||
1560 | blkno = 0; | ||
1561 | else if (bmp->db_aglevel == 1) | ||
1562 | blkno &= ~(MAXL1SIZE - 1); | ||
1563 | else /* bmp->db_aglevel == 0 */ | ||
1564 | blkno &= ~(MAXL0SIZE - 1); | ||
1565 | |||
1566 | blkno += | ||
1567 | ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin; | ||
1568 | |||
1569 | /* release the buffer in preparation for going down | ||
1570 | * the next level of dmap control pages. | ||
1571 | */ | ||
1572 | release_metapage(mp); | ||
1573 | |||
1574 | /* check if we need to continue to search down the lower | ||
1575 | * level dmap control pages. we need to if the number of | ||
1576 | * blocks required is less than maximum number of blocks | ||
1577 | * described at the next lower level. | ||
1578 | */ | ||
1579 | if (l2nb < budmin) { | ||
1580 | |||
1581 | /* search the lower level dmap control pages to get | ||
1582 | * the starting block number of the dmap that | ||
1583 | * contains or starts off the free space. an -ENOSPC here contradicts the control page examined above, so it is reported as -EIO. | ||
1584 | */ | ||
1585 | if ((rc = | ||
1586 | dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, | ||
1587 | &blkno))) { | ||
1588 | if (rc == -ENOSPC) { | ||
1589 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1590 | "dbAllocAG: control page " | ||
1591 | "inconsistent"); | ||
1592 | return -EIO; | ||
1593 | } | ||
1594 | return (rc); | ||
1595 | } | ||
1596 | } | ||
1597 | |||
1598 | /* allocate the blocks. as above, -ENOSPC at this point is treated as an inconsistency and turned into -EIO. | ||
1599 | */ | ||
1600 | rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); | ||
1601 | if (rc == -ENOSPC) { | ||
1602 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1603 | "dbAllocAG: unable to allocate blocks"); | ||
1604 | rc = -EIO; | ||
1605 | } | ||
1606 | return (rc); | ||
1607 | } | ||
1608 | |||
1609 | /* no space in the allocation group. release the buffer and | ||
1610 | * return -ENOSPC. | ||
1611 | */ | ||
1612 | release_metapage(mp); | ||
1613 | |||
1614 | return -ENOSPC; | ||
1615 | } | ||
1615 | |||
1616 | |||
1617 | /* | ||
1618 | * NAME: dbAllocAny() | ||
1619 | * | ||
1620 | * FUNCTION: attempt to allocate the specified number of contiguous | ||
1621 | * free blocks anywhere in the file system. | ||
1622 | * | ||
1623 | * dbAllocAny() attempts to find the sufficient free space by | ||
1624 | * searching down the dmap control pages, starting with the | ||
1625 | * highest level (i.e. L0, L1, L2) control page. if free space | ||
1626 | * large enough to satisfy the desired free space is found, the | ||
1627 | * desired free space is allocated. | ||
1628 | * | ||
1629 | * PARAMETERS: | ||
1630 | * bmp - pointer to bmap descriptor | ||
1631 | * nblocks - actual number of contiguous free blocks desired. | ||
1632 | * l2nb - log2 number of contiguous free blocks desired. | ||
1633 | * results - on successful return, set to the starting block number | ||
1634 | * of the newly allocated range. | ||
1635 | * | ||
1636 | * RETURN VALUES: | ||
1637 | * 0 - success | ||
1638 | * -ENOSPC - insufficient disk resources (as reported by the dbFindCtl() search) | ||
1639 | * -EIO - i/o error. also returned when the search found space but dbAllocCtl() then reported -ENOSPC. | ||
1640 | * | ||
1641 | * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; | ||
1642 | */ | ||
1643 | static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) | ||
1644 | { | ||
1645 | int rc; | ||
1646 | s64 blkno = 0; | ||
1647 | |||
1648 | /* starting with the top level dmap control page, search | ||
1649 | * down the dmap control levels for sufficient free space. | ||
1650 | * if free space is found, dbFindCtl() returns the starting | ||
1651 | * block number of the dmap that contains or starts off the | ||
1652 | * range of free space. any failure of the search is returned unchanged. | ||
1653 | */ | ||
1654 | if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) | ||
1655 | return (rc); | ||
1656 | |||
1657 | /* allocate the blocks. the search above found space, so -ENOSPC here is logged as a filesystem error and reported as -EIO. | ||
1658 | */ | ||
1659 | rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); | ||
1660 | if (rc == -ENOSPC) { | ||
1661 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1662 | "dbAllocAny: unable to allocate blocks"); | ||
1663 | return -EIO; | ||
1664 | } | ||
1665 | return (rc); | ||
1666 | } | ||
1667 | |||
1668 | |||
1669 | /* | ||
1670 | * NAME: dbFindCtl() | ||
1671 | * | ||
1672 | * FUNCTION: starting at a specified dmap control page level and block | ||
1673 | * number, search down the dmap control levels for a range of | ||
1674 | * contiguous free blocks large enough to satisfy an allocation | ||
1675 | * request for the specified number of free blocks. | ||
1676 | * | ||
1677 | * if sufficient contiguous free blocks are found, this routine | ||
1678 | * returns the starting block number within a dmap page that | ||
1679 | * contains or starts a range of contiguous free blocks that | ||
1680 | * is sufficient in size. | ||
1681 | * | ||
1682 | * PARAMETERS: | ||
1683 | * bmp - pointer to bmap descriptor | ||
1684 | * l2nb - log2 number of contiguous free blocks desired. | ||
1685 | * level - starting dmap control page level. | ||
1686 | * *blkno - on entry, starting block number for conducting the search. | ||
1687 | * on successful return, the first block within a dmap page | ||
1688 | * that contains or starts a range of contiguous free blocks. left untouched on failure. | ||
1689 | * | ||
1690 | * RETURN VALUES: | ||
1691 | * 0 - success | ||
1692 | * -ENOSPC - insufficient disk resources (no space found at the starting level) | ||
1693 | * -EIO - i/o error. also returned for a corrupt dmapctl page, and when a level below the starting one has no space although the level above indicated it. | ||
1694 | * | ||
1695 | * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; | ||
1696 | */ | ||
1697 | static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) | ||
1698 | { | ||
1699 | int rc, leafidx, lev; | ||
1700 | s64 b, lblkno; | ||
1701 | struct dmapctl *dcp; | ||
1702 | int budmin; | ||
1703 | struct metapage *mp; | ||
1704 | |||
1705 | /* starting at the specified dmap control page level and block | ||
1706 | * number, search down the dmap control levels for the starting | ||
1707 | * block number of a dmap page that contains or starts off | ||
1708 | * sufficient free blocks. | ||
1709 | */ | ||
1710 | for (lev = level, b = *blkno; lev >= 0; lev--) { | ||
1711 | /* get the buffer of the dmap control page for the block | ||
1712 | * number and level (i.e. L0, L1, L2). | ||
1713 | */ | ||
1714 | lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev); | ||
1715 | mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); | ||
1716 | if (mp == NULL) | ||
1717 | return -EIO; | ||
1718 | dcp = (struct dmapctl *) mp->data; | ||
1719 | budmin = dcp->budmin; | ||
1720 | |||
1721 | if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { | ||
1722 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1723 | "dbFindCtl: Corrupt dmapctl page"); | ||
1724 | release_metapage(mp); | ||
1725 | return -EIO; | ||
1726 | } | ||
1727 | |||
1728 | /* search the tree within the dmap control page for | ||
1729 | * sufficient free space. if sufficient free space is found, | ||
1730 | * dbFindLeaf() returns the index of the leaf at which | ||
1731 | * free space was found. | ||
1732 | */ | ||
1733 | rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); | ||
1734 | |||
1735 | /* release the buffer. budmin was copied out above, so the page is not needed any more. | ||
1736 | */ | ||
1737 | release_metapage(mp); | ||
1738 | |||
1739 | /* space found ? a miss at the starting level simply means -ENOSPC; a miss at a lower level contradicts the level above and is reported as -EIO. | ||
1740 | */ | ||
1741 | if (rc) { | ||
1742 | if (lev != level) { | ||
1743 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1744 | "dbFindCtl: dmap inconsistent"); | ||
1745 | return -EIO; | ||
1746 | } | ||
1747 | return -ENOSPC; | ||
1748 | } | ||
1749 | |||
1750 | /* adjust the block number to reflect the location within | ||
1751 | * the dmap control page (i.e. the leaf) at which free | ||
1752 | * space was found. | ||
1753 | */ | ||
1754 | b += (((s64) leafidx) << budmin); | ||
1755 | |||
1756 | /* we stop the search at this dmap control page level if | ||
1757 | * the number of blocks required is greater than or equal | ||
1758 | * to the maximum number of blocks described at the next | ||
1759 | * (lower) level. | ||
1760 | */ | ||
1761 | if (l2nb >= budmin) | ||
1762 | break; | ||
1763 | } | ||
1764 | |||
1765 | *blkno = b; | ||
1766 | return (0); | ||
1767 | } | ||
1768 | |||
1769 | |||
1770 | /* | ||
1771 | * NAME: dbAllocCtl() | ||
1772 | * | ||
1773 | * FUNCTION: attempt to allocate a specified number of contiguous | ||
1774 | * blocks starting within a specific dmap. | ||
1775 | * | ||
1776 | * this routine is called by higher level routines that search | ||
1777 | * the dmap control pages above the actual dmaps for contiguous | ||
1778 | * free space. the result of successful searches by these | ||
1779 | * routines are the starting block numbers within dmaps, with | ||
1780 | * the dmaps themselves containing the desired contiguous free | ||
1781 | * space or starting a contiguous free space of desired size | ||
1782 | * that is made up of the blocks of one or more dmaps. these | ||
1783 | * calls should not fail due to insufficient resources. | ||
1784 | * | ||
1785 | * this routine is called in some cases where it is not known | ||
1786 | * whether it will fail due to insufficient resources. more | ||
1787 | * specifically, this occurs when allocating from an allocation | ||
1788 | * group whose size is equal to the number of blocks per dmap. | ||
1789 | * in this case, the dmap control pages are not examined prior | ||
1790 | * to calling this routine (to save pathlength) and the call | ||
1791 | * might fail. | ||
1792 | * | ||
1793 | * for a request size that fits within a dmap, this routine relies | ||
1794 | * upon the dmap's dmtree to find the requested contiguous free | ||
1795 | * space. for request sizes that are larger than a dmap, the | ||
1796 | * requested free space will start at the first block of the | ||
1797 | * first dmap (i.e. blkno). | ||
1798 | * | ||
1799 | * PARAMETERS: | ||
1800 | * bmp - pointer to bmap descriptor | ||
1801 | * nblocks - actual number of contiguous free blocks to allocate. | ||
1802 | * l2nb - log2 number of contiguous free blocks to allocate. | ||
1803 | * blkno - starting block number of the dmap to start the allocation | ||
1804 | * from. | ||
1805 | * results - on successful return, set to the starting block number | ||
1806 | * of the newly allocated range. | ||
1807 | * | ||
1808 | * RETURN VALUES: | ||
1809 | * 0 - success | ||
1810 | * -ENOSPC - insufficient disk resources | ||
1811 | * -EIO - i/o error. also returned when a dmap of a multi-dmap request is found not to be completely free. | ||
1812 | * | ||
1813 | * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; | ||
1814 | */ | ||
1815 | static int | ||
1816 | dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) | ||
1817 | { | ||
1818 | int rc, nb; | ||
1819 | s64 b, lblkno, n; | ||
1820 | struct metapage *mp; | ||
1821 | struct dmap *dp; | ||
1822 | |||
1823 | /* check if the allocation request is confined to a single dmap. | ||
1824 | */ | ||
1825 | if (l2nb <= L2BPERDMAP) { | ||
1826 | /* get the buffer for the dmap. | ||
1827 | */ | ||
1828 | lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); | ||
1829 | mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); | ||
1830 | if (mp == NULL) | ||
1831 | return -EIO; | ||
1832 | dp = (struct dmap *) mp->data; | ||
1833 | |||
1834 | /* try to allocate the blocks. the page is marked dirty only when the allocation succeeded, and is released either way. | ||
1835 | */ | ||
1836 | rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); | ||
1837 | if (rc == 0) | ||
1838 | mark_metapage_dirty(mp); | ||
1839 | |||
1840 | release_metapage(mp); | ||
1841 | |||
1842 | return (rc); | ||
1843 | } | ||
1844 | |||
1845 | /* allocation request involving multiple dmaps. it must start on | ||
1846 | * a dmap boundary. | ||
1847 | */ | ||
1848 | assert((blkno & (BPERDMAP - 1)) == 0); | ||
1849 | |||
1850 | /* allocate the blocks dmap by dmap. n counts the blocks still to be allocated and b is the first block of the dmap being processed. | ||
1851 | */ | ||
1852 | for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) { | ||
1853 | /* get the buffer for the dmap. | ||
1854 | */ | ||
1855 | lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); | ||
1856 | mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); | ||
1857 | if (mp == NULL) { | ||
1858 | rc = -EIO; | ||
1859 | goto backout; | ||
1860 | } | ||
1861 | dp = (struct dmap *) mp->data; | ||
1862 | |||
1863 | /* the dmap better be all free. | ||
1864 | */ | ||
1865 | if (dp->tree.stree[ROOT] != L2BPERDMAP) { | ||
1866 | release_metapage(mp); | ||
1867 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1868 | "dbAllocCtl: the dmap is not all free"); | ||
1869 | rc = -EIO; | ||
1870 | goto backout; | ||
1871 | } | ||
1872 | |||
1873 | /* determine how many blocks to allocate from this dmap. | ||
1874 | */ | ||
1875 | nb = min(n, (s64)BPERDMAP); | ||
1876 | |||
1877 | /* allocate the blocks from the dmap. | ||
1878 | */ | ||
1879 | if ((rc = dbAllocDmap(bmp, dp, b, nb))) { | ||
1880 | release_metapage(mp); | ||
1881 | goto backout; | ||
1882 | } | ||
1883 | |||
1884 | /* write the buffer. | ||
1885 | */ | ||
1886 | write_metapage(mp); | ||
1887 | } | ||
1888 | |||
1889 | /* set the results (starting block number) and return. | ||
1890 | */ | ||
1891 | *results = blkno; | ||
1892 | return (0); | ||
1893 | |||
1894 | /* something failed in handling an allocation request involving | ||
1895 | * multiple dmaps. we'll try to clean up by backing out any | ||
1896 | * allocation that has already happened for this request. if | ||
1897 | * we fail in backing out the allocation, we'll mark the file | ||
1898 | * system to indicate that blocks have been leaked. rc keeps the original failure code and is what gets returned. | ||
1899 | */ | ||
1900 | backout: | ||
1901 | |||
1902 | /* try to backout the allocations dmap by dmap. on entry n is the number of blocks not yet allocated, so nblocks - n is the amount allocated so far; every dmap handled before the failure was allocated in full. | ||
1903 | */ | ||
1904 | for (n = nblocks - n, b = blkno; n > 0; | ||
1905 | n -= BPERDMAP, b += BPERDMAP) { | ||
1906 | /* get the buffer for this dmap. | ||
1907 | */ | ||
1908 | lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); | ||
1909 | mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); | ||
1910 | if (mp == NULL) { | ||
1911 | /* could not back out. mark the file system | ||
1912 | * to indicate that we have leaked blocks, then carry on with the next dmap. | ||
1913 | */ | ||
1914 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1915 | "dbAllocCtl: I/O Error: Block Leakage."); | ||
1916 | continue; | ||
1917 | } | ||
1918 | dp = (struct dmap *) mp->data; | ||
1919 | |||
1920 | /* free the blocks in this dmap. | ||
1921 | */ | ||
1922 | if (dbFreeDmap(bmp, dp, b, BPERDMAP)) { | ||
1923 | /* could not back out. mark the file system | ||
1924 | * to indicate that we have leaked blocks. | ||
1925 | */ | ||
1926 | release_metapage(mp); | ||
1927 | jfs_error(bmp->db_ipbmap->i_sb, | ||
1928 | "dbAllocCtl: Block Leakage."); | ||
1929 | continue; | ||
1930 | } | ||
1931 | |||
1932 | /* write the buffer. | ||
1933 | */ | ||
1934 | write_metapage(mp); | ||
1935 | } | ||
1936 | |||
1937 | return (rc); | ||
1938 | } | ||
1938 | } | ||
1939 | |||
1940 | |||
1941 | /* | ||
1942 | * NAME: dbAllocDmapLev() | ||
1943 | * | ||
1944 | * FUNCTION: attempt to allocate a specified number of contiguous blocks | ||
1945 | * from a specified dmap. | ||
1946 | * | ||
1947 | * this routine checks if the contiguous blocks are available. | ||
1948 | * if so, nblocks of blocks are allocated; otherwise, ENOSPC is | ||
1949 | * returned. | ||
1950 | * | ||
1951 | * PARAMETERS: | ||
1952 | * mp - pointer to bmap descriptor | ||
1953 | * dp - pointer to dmap to attempt to allocate blocks from. | ||
1954 | * l2nb - log2 number of contiguous block desired. | ||
1955 | * nblocks - actual number of contiguous block desired. | ||
1956 | * results - on successful return, set to the starting block number | ||
1957 | * of the newly allocated range. | ||
1958 | * | ||
1959 | * RETURN VALUES: | ||
1960 | * 0 - success | ||
1961 | * -ENOSPC - insufficient disk resources | ||
1962 | * -EIO - i/o error | ||
1963 | * | ||
1964 | * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or | ||
1965 | * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; | ||
1966 | */ | ||
1967 | static int | ||
1968 | dbAllocDmapLev(struct bmap * bmp, | ||
1969 | struct dmap * dp, int nblocks, int l2nb, s64 * results) | ||
1970 | { | ||
1971 | s64 blkno; | ||
1972 | int leafidx, rc; | ||
1973 | |||
1974 | /* can't be more than a dmaps worth of blocks */ | ||
1975 | assert(l2nb <= L2BPERDMAP); | ||
1976 | |||
1977 | /* search the tree within the dmap page for sufficient | ||
1978 | * free space. if sufficient free space is found, dbFindLeaf() | ||
1979 | * returns the index of the leaf at which free space was found. | ||
1980 | */ | ||
1981 | if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) | ||
1982 | return -ENOSPC; | ||
1983 | |||
1984 | /* determine the block number within the file system corresponding | ||
1985 | * to the leaf at which free space was found. | ||
1986 | */ | ||
1987 | blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); | ||
1988 | |||
1989 | /* if not all bits of the dmap word are free, get the starting | ||
1990 | * bit number within the dmap word of the required string of free | ||
1991 | * bits and adjust the block number with this value. | ||
1992 | */ | ||
1993 | if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) | ||
1994 | blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); | ||
1995 | |||
1996 | /* allocate the blocks */ | ||
1997 | if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) | ||
1998 | *results = blkno; | ||
1999 | |||
2000 | return (rc); | ||
2001 | } | ||
2002 | |||
2003 | |||
2004 | /* | ||
2005 | * NAME: dbAllocDmap() | ||
2006 | * | ||
2007 | * FUNCTION: adjust the disk allocation map to reflect the allocation | ||
2008 | * of a specified block range within a dmap. | ||
2009 | * | ||
2010 | * this routine allocates the specified blocks from the dmap | ||
2011 | * through a call to dbAllocBits(). if the allocation of the | ||
2012 | * block range causes the maximum string of free blocks within | ||
2013 | * the dmap to change (i.e. the value of the root of the dmap's | ||
2014 | * dmtree), this routine will cause this change to be reflected | ||
2015 | * up through the appropriate levels of the dmap control pages | ||
2016 | * by a call to dbAdjCtl() for the L0 dmap control page that | ||
2017 | * covers this dmap. | ||
2018 | * | ||
2019 | * PARAMETERS: | ||
2020 | * bmp - pointer to bmap descriptor | ||
2021 | * dp - pointer to dmap to allocate the block range from. | ||
2022 | * blkno - starting block number of the block to be allocated. | ||
2023 | * nblocks - number of blocks to be allocated. | ||
2024 | * | ||
2025 | * RETURN VALUES: | ||
2026 | * 0 - success | ||
2027 | * -EIO - i/o error | ||
2028 | * | ||
2029 | * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; | ||
2030 | */ | ||
2031 | static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
2032 | int nblocks) | ||
2033 | { | ||
2034 | s8 oldroot; | ||
2035 | int rc; | ||
2036 | |||
2037 | /* save the current value of the root (i.e. maximum free string) | ||
2038 | * of the dmap tree. | ||
2039 | */ | ||
2040 | oldroot = dp->tree.stree[ROOT]; | ||
2041 | |||
2042 | /* allocate the specified (blocks) bits */ | ||
2043 | dbAllocBits(bmp, dp, blkno, nblocks); | ||
2044 | |||
2045 | /* if the root has not changed, done. */ | ||
2046 | if (dp->tree.stree[ROOT] == oldroot) | ||
2047 | return (0); | ||
2048 | |||
2049 | /* root changed. bubble the change up to the dmap control pages. | ||
2050 | * if the adjustment of the upper level control pages fails, | ||
2051 | * backout the bit allocation (thus making everything consistent). | ||
2052 | */ | ||
2053 | if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) | ||
2054 | dbFreeBits(bmp, dp, blkno, nblocks); | ||
2055 | |||
2056 | return (rc); | ||
2057 | } | ||
2058 | |||
2059 | |||
2060 | /* | ||
2061 | * NAME: dbFreeDmap() | ||
2062 | * | ||
2063 | * FUNCTION: adjust the disk allocation map to reflect the allocation | ||
2064 | * of a specified block range within a dmap. | ||
2065 | * | ||
2066 | * this routine frees the specified blocks from the dmap through | ||
2067 | * a call to dbFreeBits(). if the deallocation of the block range | ||
2068 | * causes the maximum string of free blocks within the dmap to | ||
2069 | * change (i.e. the value of the root of the dmap's dmtree), this | ||
2070 | * routine will cause this change to be reflected up through the | ||
2071 | * appropriate levels of the dmap control pages by a call to | ||
2072 | * dbAdjCtl() for the L0 dmap control page that covers this dmap. | ||
2073 | * | ||
2074 | * PARAMETERS: | ||
2075 | * bmp - pointer to bmap descriptor | ||
2076 | * dp - pointer to dmap to free the block range from. | ||
2077 | * blkno - starting block number of the block to be freed. | ||
2078 | * nblocks - number of blocks to be freed. | ||
2079 | * | ||
2080 | * RETURN VALUES: | ||
2081 | * 0 - success | ||
2082 | * -EIO - i/o error | ||
2083 | * | ||
2084 | * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; | ||
2085 | */ | ||
2086 | static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
2087 | int nblocks) | ||
2088 | { | ||
2089 | s8 oldroot; | ||
2090 | int rc, word; | ||
2091 | |||
2092 | /* save the current value of the root (i.e. maximum free string) | ||
2093 | * of the dmap tree. | ||
2094 | */ | ||
2095 | oldroot = dp->tree.stree[ROOT]; | ||
2096 | |||
2097 | /* free the specified (blocks) bits */ | ||
2098 | dbFreeBits(bmp, dp, blkno, nblocks); | ||
2099 | |||
2100 | /* if the root has not changed, done. */ | ||
2101 | if (dp->tree.stree[ROOT] == oldroot) | ||
2102 | return (0); | ||
2103 | |||
2104 | /* root changed. bubble the change up to the dmap control pages. | ||
2105 | * if the adjustment of the upper level control pages fails, | ||
2106 | * backout the deallocation. | ||
2107 | */ | ||
2108 | if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { | ||
2109 | word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; | ||
2110 | |||
2111 | /* as part of backing out the deallocation, we will have | ||
2112 | * to back split the dmap tree if the deallocation caused | ||
2113 | * the freed blocks to become part of a larger binary buddy | ||
2114 | * system. | ||
2115 | */ | ||
2116 | if (dp->tree.stree[word] == NOFREE) | ||
2117 | dbBackSplit((dmtree_t *) & dp->tree, word); | ||
2118 | |||
2119 | dbAllocBits(bmp, dp, blkno, nblocks); | ||
2120 | } | ||
2121 | |||
2122 | return (rc); | ||
2123 | } | ||
2124 | |||
2125 | |||
2126 | /* | ||
2127 | * NAME: dbAllocBits() | ||
2128 | * | ||
2129 | * FUNCTION: allocate a specified block range from a dmap. | ||
2130 | * | ||
2131 | * this routine updates the dmap to reflect the working | ||
2132 | * state allocation of the specified block range. it directly | ||
2133 | * updates the bits of the working map and causes the adjustment | ||
2134 | * of the binary buddy system described by the dmap's dmtree | ||
2135 | * leaves to reflect the bits allocated. it also causes the | ||
2136 | * dmap's dmtree, as a whole, to reflect the allocated range. | ||
2137 | * | ||
2138 | * PARAMETERS: | ||
2139 | * bmp - pointer to bmap descriptor | ||
2140 | * dp - pointer to dmap to allocate bits from. | ||
2141 | * blkno - starting block number of the bits to be allocated. | ||
2142 | * nblocks - number of bits to be allocated. | ||
2143 | * | ||
2144 | * RETURN VALUES: none | ||
2145 | * | ||
2146 | * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; | ||
2147 | */ | ||
2148 | static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
2149 | int nblocks) | ||
2150 | { | ||
2151 | int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; | ||
2152 | dmtree_t *tp = (dmtree_t *) & dp->tree; | ||
2153 | int size; | ||
2154 | s8 *leaf; | ||
2155 | |||
2156 | /* pick up a pointer to the leaves of the dmap tree */ | ||
2157 | leaf = dp->tree.stree + LEAFIND; | ||
2158 | |||
2159 | /* determine the bit number and word within the dmap of the | ||
2160 | * starting block. | ||
2161 | */ | ||
2162 | dbitno = blkno & (BPERDMAP - 1); | ||
2163 | word = dbitno >> L2DBWORD; | ||
2164 | |||
2165 | /* block range better be within the dmap */ | ||
2166 | assert(dbitno + nblocks <= BPERDMAP); | ||
2167 | |||
2168 | /* allocate the bits of the dmap's words corresponding to the block | ||
2169 | * range. not all bits of the first and last words may be contained | ||
2170 | * within the block range. if this is the case, we'll work against | ||
2171 | * those words (i.e. partial first and/or last) on an individual basis | ||
2172 | * (a single pass), allocating the bits of interest by hand and | ||
2173 | * updating the leaf corresponding to the dmap word. a single pass | ||
2174 | * will be used for all dmap words fully contained within the | ||
2175 | * specified range. within this pass, the bits of all fully contained | ||
2176 | * dmap words will be marked as free in a single shot and the leaves | ||
2177 | * will be updated. a single leaf may describe the free space of | ||
2178 | * multiple dmap words, so we may update only a subset of the actual | ||
2179 | * leaves corresponding to the dmap words of the block range. | ||
2180 | */ | ||
2181 | for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { | ||
2182 | /* determine the bit number within the word and | ||
2183 | * the number of bits within the word. | ||
2184 | */ | ||
2185 | wbitno = dbitno & (DBWORD - 1); | ||
2186 | nb = min(rembits, DBWORD - wbitno); | ||
2187 | |||
2188 | /* check if only part of a word is to be allocated. | ||
2189 | */ | ||
2190 | if (nb < DBWORD) { | ||
2191 | /* allocate (set to 1) the appropriate bits within | ||
2192 | * this dmap word. | ||
2193 | */ | ||
2194 | dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) | ||
2195 | >> wbitno); | ||
2196 | |||
2197 | /* update the leaf for this dmap word. in addition | ||
2198 | * to setting the leaf value to the binary buddy max | ||
2199 | * of the updated dmap word, dbSplit() will split | ||
2200 | * the binary system of the leaves if need be. | ||
2201 | */ | ||
2202 | dbSplit(tp, word, BUDMIN, | ||
2203 | dbMaxBud((u8 *) & dp->wmap[word])); | ||
2204 | |||
2205 | word += 1; | ||
2206 | } else { | ||
2207 | /* one or more dmap words are fully contained | ||
2208 | * within the block range. determine how many | ||
2209 | * words and allocate (set to 1) the bits of these | ||
2210 | * words. | ||
2211 | */ | ||
2212 | nwords = rembits >> L2DBWORD; | ||
2213 | memset(&dp->wmap[word], (int) ONES, nwords * 4); | ||
2214 | |||
2215 | /* determine how many bits. | ||
2216 | */ | ||
2217 | nb = nwords << L2DBWORD; | ||
2218 | |||
2219 | /* now update the appropriate leaves to reflect | ||
2220 | * the allocated words. | ||
2221 | */ | ||
2222 | for (; nwords > 0; nwords -= nw) { | ||
2223 | if (leaf[word] < BUDMIN) { | ||
2224 | jfs_error(bmp->db_ipbmap->i_sb, | ||
2225 | "dbAllocBits: leaf page " | ||
2226 | "corrupt"); | ||
2227 | break; | ||
2228 | } | ||
2229 | |||
2230 | /* determine what the leaf value should be | ||
2231 | * updated to as the minimum of the l2 number | ||
2232 | * of bits being allocated and the l2 number | ||
2233 | * of bits currently described by this leaf. | ||
2234 | */ | ||
2235 | size = min((int)leaf[word], NLSTOL2BSZ(nwords)); | ||
2236 | |||
2237 | /* update the leaf to reflect the allocation. | ||
2238 | * in addition to setting the leaf value to | ||
2239 | * NOFREE, dbSplit() will split the binary | ||
2240 | * system of the leaves to reflect the current | ||
2241 | * allocation (size). | ||
2242 | */ | ||
2243 | dbSplit(tp, word, size, NOFREE); | ||
2244 | |||
2245 | /* get the number of dmap words handled */ | ||
2246 | nw = BUDSIZE(size, BUDMIN); | ||
2247 | word += nw; | ||
2248 | } | ||
2249 | } | ||
2250 | } | ||
2251 | |||
2252 | /* update the free count for this dmap */ | ||
2253 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); | ||
2254 | |||
2255 | BMAP_LOCK(bmp); | ||
2256 | |||
2257 | /* if this allocation group is completely free, | ||
2258 | * update the maximum allocation group number if this allocation | ||
2259 | * group is the new max. | ||
2260 | */ | ||
2261 | agno = blkno >> bmp->db_agl2size; | ||
2262 | if (agno > bmp->db_maxag) | ||
2263 | bmp->db_maxag = agno; | ||
2264 | |||
2265 | /* update the free count for the allocation group and map */ | ||
2266 | bmp->db_agfree[agno] -= nblocks; | ||
2267 | bmp->db_nfree -= nblocks; | ||
2268 | |||
2269 | BMAP_UNLOCK(bmp); | ||
2270 | } | ||
2271 | |||
2272 | |||
2273 | /* | ||
2274 | * NAME: dbFreeBits() | ||
2275 | * | ||
2276 | * FUNCTION: free a specified block range from a dmap. | ||
2277 | * | ||
2278 | * this routine updates the dmap to reflect the working | ||
2279 | * state allocation of the specified block range. it directly | ||
2280 | * updates the bits of the working map and causes the adjustment | ||
2281 | * of the binary buddy system described by the dmap's dmtree | ||
2282 | * leaves to reflect the bits freed. it also causes the dmap's | ||
2283 | * dmtree, as a whole, to reflect the deallocated range. | ||
2284 | * | ||
2285 | * PARAMETERS: | ||
2286 | * bmp - pointer to bmap descriptor | ||
2287 | * dp - pointer to dmap to free bits from. | ||
2288 | * blkno - starting block number of the bits to be freed. | ||
2289 | * nblocks - number of bits to be freed. | ||
2290 | * | ||
2291 | * RETURN VALUES: none | ||
2292 | * | ||
2293 | * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; | ||
2294 | */ | ||
2295 | static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
2296 | int nblocks) | ||
2297 | { | ||
2298 | int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; | ||
2299 | dmtree_t *tp = (dmtree_t *) & dp->tree; | ||
2300 | int size; | ||
2301 | |||
2302 | /* determine the bit number and word within the dmap of the | ||
2303 | * starting block. | ||
2304 | */ | ||
2305 | dbitno = blkno & (BPERDMAP - 1); | ||
2306 | word = dbitno >> L2DBWORD; | ||
2307 | |||
2308 | /* block range better be within the dmap. | ||
2309 | */ | ||
2310 | assert(dbitno + nblocks <= BPERDMAP); | ||
2311 | |||
2312 | /* free the bits of the dmaps words corresponding to the block range. | ||
2313 | * not all bits of the first and last words may be contained within | ||
2314 | * the block range. if this is the case, we'll work against those | ||
2315 | * words (i.e. partial first and/or last) on an individual basis | ||
2316 | * (a single pass), freeing the bits of interest by hand and updating | ||
2317 | * the leaf corresponding to the dmap word. a single pass will be used | ||
2318 | * for all dmap words fully contained within the specified range. | ||
2319 | * within this pass, the bits of all fully contained dmap words will | ||
2320 | * be marked as free in a single shot and the leaves will be updated. a | ||
2321 | * single leaf may describe the free space of multiple dmap words, | ||
2322 | * so we may update only a subset of the actual leaves corresponding | ||
2323 | * to the dmap words of the block range. | ||
2324 | * | ||
2325 | * dbJoin() is used to update leaf values and will join the binary | ||
2326 | * buddy system of the leaves if the new leaf values indicate this | ||
2327 | * should be done. | ||
2328 | */ | ||
2329 | for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { | ||
2330 | /* determine the bit number within the word and | ||
2331 | * the number of bits within the word. | ||
2332 | */ | ||
2333 | wbitno = dbitno & (DBWORD - 1); | ||
2334 | nb = min(rembits, DBWORD - wbitno); | ||
2335 | |||
2336 | /* check if only part of a word is to be freed. | ||
2337 | */ | ||
2338 | if (nb < DBWORD) { | ||
2339 | /* free (zero) the appropriate bits within this | ||
2340 | * dmap word. | ||
2341 | */ | ||
2342 | dp->wmap[word] &= | ||
2343 | cpu_to_le32(~(ONES << (DBWORD - nb) | ||
2344 | >> wbitno)); | ||
2345 | |||
2346 | /* update the leaf for this dmap word. | ||
2347 | */ | ||
2348 | dbJoin(tp, word, | ||
2349 | dbMaxBud((u8 *) & dp->wmap[word])); | ||
2350 | |||
2351 | word += 1; | ||
2352 | } else { | ||
2353 | /* one or more dmap words are fully contained | ||
2354 | * within the block range. determine how many | ||
2355 | * words and free (zero) the bits of these words. | ||
2356 | */ | ||
2357 | nwords = rembits >> L2DBWORD; | ||
2358 | memset(&dp->wmap[word], 0, nwords * 4); | ||
2359 | |||
2360 | /* determine how many bits. | ||
2361 | */ | ||
2362 | nb = nwords << L2DBWORD; | ||
2363 | |||
2364 | /* now update the appropriate leaves to reflect | ||
2365 | * the freed words. | ||
2366 | */ | ||
2367 | for (; nwords > 0; nwords -= nw) { | ||
2368 | /* determine what the leaf value should be | ||
2369 | * updated to as the minimum of the l2 number | ||
2370 | * of bits being freed and the l2 (max) number | ||
2371 | * of bits that can be described by this leaf. | ||
2372 | */ | ||
2373 | size = | ||
2374 | min(LITOL2BSZ | ||
2375 | (word, L2LPERDMAP, BUDMIN), | ||
2376 | NLSTOL2BSZ(nwords)); | ||
2377 | |||
2378 | /* update the leaf. | ||
2379 | */ | ||
2380 | dbJoin(tp, word, size); | ||
2381 | |||
2382 | /* get the number of dmap words handled. | ||
2383 | */ | ||
2384 | nw = BUDSIZE(size, BUDMIN); | ||
2385 | word += nw; | ||
2386 | } | ||
2387 | } | ||
2388 | } | ||
2389 | |||
2390 | /* update the free count for this dmap. | ||
2391 | */ | ||
2392 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); | ||
2393 | |||
2394 | BMAP_LOCK(bmp); | ||
2395 | |||
2396 | /* update the free count for the allocation group and | ||
2397 | * map. | ||
2398 | */ | ||
2399 | agno = blkno >> bmp->db_agl2size; | ||
2400 | bmp->db_nfree += nblocks; | ||
2401 | bmp->db_agfree[agno] += nblocks; | ||
2402 | |||
2403 | /* check if this allocation group is not completely free and | ||
2404 | * if it is currently the maximum (rightmost) allocation group. | ||
2405 | * if so, establish the new maximum allocation group number by | ||
2406 | * searching left for the first allocation group with allocation. | ||
2407 | */ | ||
2408 | if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) || | ||
2409 | (agno == bmp->db_numag - 1 && | ||
2410 | bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) { | ||
2411 | while (bmp->db_maxag > 0) { | ||
2412 | bmp->db_maxag -= 1; | ||
2413 | if (bmp->db_agfree[bmp->db_maxag] != | ||
2414 | bmp->db_agsize) | ||
2415 | break; | ||
2416 | } | ||
2417 | |||
2418 | /* re-establish the allocation group preference if the | ||
2419 | * current preference is right of the maximum allocation | ||
2420 | * group. | ||
2421 | */ | ||
2422 | if (bmp->db_agpref > bmp->db_maxag) | ||
2423 | bmp->db_agpref = bmp->db_maxag; | ||
2424 | } | ||
2425 | |||
2426 | BMAP_UNLOCK(bmp); | ||
2427 | } | ||
2428 | |||
2429 | |||
2430 | /* | ||
2431 | * NAME: dbAdjCtl() | ||
2432 | * | ||
2433 | * FUNCTION: adjust a dmap control page at a specified level to reflect | ||
2434 | * the change in a lower level dmap or dmap control page's | ||
2435 | * maximum string of free blocks (i.e. a change in the root | ||
2436 | * of the lower level object's dmtree) due to the allocation | ||
2437 | * or deallocation of a range of blocks with a single dmap. | ||
2438 | * | ||
2439 | * on entry, this routine is provided with the new value of | ||
2440 | * the lower level dmap or dmap control page root and the | ||
2441 | * starting block number of the block range whose allocation | ||
2442 | * or deallocation resulted in the root change. this range | ||
2443 | * is respresented by a single leaf of the current dmapctl | ||
2444 | * and the leaf will be updated with this value, possibly | ||
2445 | * causing a binary buddy system within the leaves to be | ||
2446 | * split or joined. the update may also cause the dmapctl's | ||
2447 | * dmtree to be updated. | ||
2448 | * | ||
2449 | * if the adjustment of the dmap control page, itself, causes its | ||
2450 | * root to change, this change will be bubbled up to the next dmap | ||
2451 | * control level by a recursive call to this routine, specifying | ||
2452 | * the new root value and the next dmap control page level to | ||
2453 | * be adjusted. | ||
2454 | * PARAMETERS: | ||
2455 | * bmp - pointer to bmap descriptor | ||
2456 | * blkno - the first block of a block range within a dmap. it is | ||
2457 | * the allocation or deallocation of this block range that | ||
2458 | * requires the dmap control page to be adjusted. | ||
2459 | * newval - the new value of the lower level dmap or dmap control | ||
2460 | * page root. | ||
2461 | * alloc - TRUE if adjustment is due to an allocation. | ||
2462 | * level - current level of dmap control page (i.e. L0, L1, L2) to | ||
2463 | * be adjusted. | ||
2464 | * | ||
2465 | * RETURN VALUES: | ||
2466 | * 0 - success | ||
2467 | * -EIO - i/o error | ||
2468 | * | ||
2469 | * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; | ||
2470 | */ | ||
2471 | static int | ||
2472 | dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) | ||
2473 | { | ||
2474 | struct metapage *mp; | ||
2475 | s8 oldroot; | ||
2476 | int oldval; | ||
2477 | s64 lblkno; | ||
2478 | struct dmapctl *dcp; | ||
2479 | int rc, leafno, ti; | ||
2480 | |||
2481 | /* get the buffer for the dmap control page for the specified | ||
2482 | * block number and control page level. | ||
2483 | */ | ||
2484 | lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); | ||
2485 | mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); | ||
2486 | if (mp == NULL) | ||
2487 | return -EIO; | ||
2488 | dcp = (struct dmapctl *) mp->data; | ||
2489 | |||
2490 | if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { | ||
2491 | jfs_error(bmp->db_ipbmap->i_sb, | ||
2492 | "dbAdjCtl: Corrupt dmapctl page"); | ||
2493 | release_metapage(mp); | ||
2494 | return -EIO; | ||
2495 | } | ||
2496 | |||
2497 | /* determine the leaf number corresponding to the block and | ||
2498 | * the index within the dmap control tree. | ||
2499 | */ | ||
2500 | leafno = BLKTOCTLLEAF(blkno, dcp->budmin); | ||
2501 | ti = leafno + le32_to_cpu(dcp->leafidx); | ||
2502 | |||
2503 | /* save the current leaf value and the current root level (i.e. | ||
2504 | * maximum l2 free string described by this dmapctl). | ||
2505 | */ | ||
2506 | oldval = dcp->stree[ti]; | ||
2507 | oldroot = dcp->stree[ROOT]; | ||
2508 | |||
2509 | /* check if this is a control page update for an allocation. | ||
2510 | * if so, update the leaf to reflect the new leaf value using | ||
2511 | * dbSplit(); otherwise (deallocation), use dbJoin() to udpate | ||
2512 | * the leaf with the new value. in addition to updating the | ||
2513 | * leaf, dbSplit() will also split the binary buddy system of | ||
2514 | * the leaves, if required, and bubble new values within the | ||
2515 | * dmapctl tree, if required. similarly, dbJoin() will join | ||
2516 | * the binary buddy system of leaves and bubble new values up | ||
2517 | * the dmapctl tree as required by the new leaf value. | ||
2518 | */ | ||
2519 | if (alloc) { | ||
2520 | /* check if we are in the middle of a binary buddy | ||
2521 | * system. this happens when we are performing the | ||
2522 | * first allocation out of an allocation group that | ||
2523 | * is part (not the first part) of a larger binary | ||
2524 | * buddy system. if we are in the middle, back split | ||
2525 | * the system prior to calling dbSplit() which assumes | ||
2526 | * that it is at the front of a binary buddy system. | ||
2527 | */ | ||
2528 | if (oldval == NOFREE) { | ||
2529 | dbBackSplit((dmtree_t *) dcp, leafno); | ||
2530 | oldval = dcp->stree[ti]; | ||
2531 | } | ||
2532 | dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); | ||
2533 | } else { | ||
2534 | dbJoin((dmtree_t *) dcp, leafno, newval); | ||
2535 | } | ||
2536 | |||
2537 | /* check if the root of the current dmap control page changed due | ||
2538 | * to the update and if the current dmap control page is not at | ||
2539 | * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. | ||
2540 | * root changed and this is not the top level), call this routine | ||
2541 | * again (recursion) for the next higher level of the mapping to | ||
2542 | * reflect the change in root for the current dmap control page. | ||
2543 | */ | ||
2544 | if (dcp->stree[ROOT] != oldroot) { | ||
2545 | /* are we below the top level of the map. if so, | ||
2546 | * bubble the root up to the next higher level. | ||
2547 | */ | ||
2548 | if (level < bmp->db_maxlevel) { | ||
2549 | /* bubble up the new root of this dmap control page to | ||
2550 | * the next level. | ||
2551 | */ | ||
2552 | if ((rc = | ||
2553 | dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, | ||
2554 | level + 1))) { | ||
2555 | /* something went wrong in bubbling up the new | ||
2556 | * root value, so backout the changes to the | ||
2557 | * current dmap control page. | ||
2558 | */ | ||
2559 | if (alloc) { | ||
2560 | dbJoin((dmtree_t *) dcp, leafno, | ||
2561 | oldval); | ||
2562 | } else { | ||
2563 | /* the dbJoin() above might have | ||
2564 | * caused a larger binary buddy system | ||
2565 | * to form and we may now be in the | ||
2566 | * middle of it. if this is the case, | ||
2567 | * back split the buddies. | ||
2568 | */ | ||
2569 | if (dcp->stree[ti] == NOFREE) | ||
2570 | dbBackSplit((dmtree_t *) | ||
2571 | dcp, leafno); | ||
2572 | dbSplit((dmtree_t *) dcp, leafno, | ||
2573 | dcp->budmin, oldval); | ||
2574 | } | ||
2575 | |||
2576 | /* release the buffer and return the error. | ||
2577 | */ | ||
2578 | release_metapage(mp); | ||
2579 | return (rc); | ||
2580 | } | ||
2581 | } else { | ||
2582 | /* we're at the top level of the map. update | ||
2583 | * the bmap control page to reflect the size | ||
2584 | * of the maximum free buddy system. | ||
2585 | */ | ||
2586 | assert(level == bmp->db_maxlevel); | ||
2587 | if (bmp->db_maxfreebud != oldroot) { | ||
2588 | jfs_error(bmp->db_ipbmap->i_sb, | ||
2589 | "dbAdjCtl: the maximum free buddy is " | ||
2590 | "not the old root"); | ||
2591 | } | ||
2592 | bmp->db_maxfreebud = dcp->stree[ROOT]; | ||
2593 | } | ||
2594 | } | ||
2595 | |||
2596 | /* write the buffer. | ||
2597 | */ | ||
2598 | write_metapage(mp); | ||
2599 | |||
2600 | return (0); | ||
2601 | } | ||
2602 | |||
2603 | |||
2604 | /* | ||
2605 | * NAME: dbSplit() | ||
2606 | * | ||
2607 | * FUNCTION: update the leaf of a dmtree with a new value, splitting | ||
2608 | * the leaf from the binary buddy system of the dmtree's | ||
2609 | * leaves, as required. | ||
2610 | * | ||
2611 | * PARAMETERS: | ||
2612 | * tp - pointer to the tree containing the leaf. | ||
2613 | * leafno - the number of the leaf to be updated. | ||
2614 | * splitsz - the size the binary buddy system starting at the leaf | ||
2615 | * must be split to, specified as the log2 number of blocks. | ||
2616 | * newval - the new value for the leaf. | ||
2617 | * | ||
2618 | * RETURN VALUES: none | ||
2619 | * | ||
2620 | * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; | ||
2621 | */ | ||
2622 | static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) | ||
2623 | { | ||
2624 | int budsz; | ||
2625 | int cursz; | ||
2626 | s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); | ||
2627 | |||
2628 | /* check if the leaf needs to be split. | ||
2629 | */ | ||
2630 | if (leaf[leafno] > tp->dmt_budmin) { | ||
2631 | /* the split occurs by cutting the buddy system in half | ||
2632 | * at the specified leaf until we reach the specified | ||
2633 | * size. pick up the starting split size (current size | ||
2634 | * - 1 in l2) and the corresponding buddy size. | ||
2635 | */ | ||
2636 | cursz = leaf[leafno] - 1; | ||
2637 | budsz = BUDSIZE(cursz, tp->dmt_budmin); | ||
2638 | |||
2639 | /* split until we reach the specified size. | ||
2640 | */ | ||
2641 | while (cursz >= splitsz) { | ||
2642 | /* update the buddy's leaf with its new value. | ||
2643 | */ | ||
2644 | dbAdjTree(tp, leafno ^ budsz, cursz); | ||
2645 | |||
2646 | /* on to the next size and buddy. | ||
2647 | */ | ||
2648 | cursz -= 1; | ||
2649 | budsz >>= 1; | ||
2650 | } | ||
2651 | } | ||
2652 | |||
2653 | /* adjust the dmap tree to reflect the specified leaf's new | ||
2654 | * value. | ||
2655 | */ | ||
2656 | dbAdjTree(tp, leafno, newval); | ||
2657 | } | ||
2658 | |||
2659 | |||
2660 | /* | ||
2661 | * NAME: dbBackSplit() | ||
2662 | * | ||
2663 | * FUNCTION: back split the binary buddy system of dmtree leaves | ||
2664 | * that hold a specified leaf until the specified leaf | ||
2665 | * starts its own binary buddy system. | ||
2666 | * | ||
2667 | * the allocators typically perform allocations at the start | ||
2668 | * of binary buddy systems and dbSplit() is used to accomplish | ||
2669 | * any required splits. in some cases, however, allocation | ||
2670 | * may occur in the middle of a binary system and requires a | ||
2671 | * back split, with the split proceeding out from the middle of | ||
2672 | * the system (less efficient) rather than the start of the | ||
2673 | * system (more efficient). the cases in which a back split | ||
2674 | * is required are rare and are limited to the first allocation | ||
2675 | * within an allocation group which is a part (not first part) | ||
2676 | * of a larger binary buddy system and a few exception cases | ||
2677 | * in which a previous join operation must be backed out. | ||
2678 | * | ||
2679 | * PARAMETERS: | ||
2680 | * tp - pointer to the tree containing the leaf. | ||
2681 | * leafno - the number of the leaf to be updated. | ||
2682 | * | ||
2683 | * RETURN VALUES: none | ||
2684 | * | ||
2685 | * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; | ||
2686 | */ | ||
2687 | static void dbBackSplit(dmtree_t * tp, int leafno) | ||
2688 | { | ||
2689 | int budsz, bud, w, bsz, size; | ||
2690 | int cursz; | ||
2691 | s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); | ||
2692 | |||
2693 | /* leaf should be part (not first part) of a binary | ||
2694 | * buddy system. | ||
2695 | */ | ||
2696 | assert(leaf[leafno] == NOFREE); | ||
2697 | |||
2698 | /* the back split is accomplished by iteratively finding the leaf | ||
2699 | * that starts the buddy system that contains the specified leaf and | ||
2700 | * splitting that system in two. this iteration continues until | ||
2701 | * the specified leaf becomes the start of a buddy system. | ||
2702 | * | ||
2703 | * determine maximum possible l2 size for the specified leaf. | ||
2704 | */ | ||
2705 | size = | ||
2706 | LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs), | ||
2707 | tp->dmt_budmin); | ||
2708 | |||
2709 | /* determine the number of leaves covered by this size. this | ||
2710 | * is the buddy size that we will start with as we search for | ||
2711 | * the buddy system that contains the specified leaf. | ||
2712 | */ | ||
2713 | budsz = BUDSIZE(size, tp->dmt_budmin); | ||
2714 | |||
2715 | /* back split. | ||
2716 | */ | ||
2717 | while (leaf[leafno] == NOFREE) { | ||
2718 | /* find the leftmost buddy leaf. | ||
2719 | */ | ||
2720 | for (w = leafno, bsz = budsz;; bsz <<= 1, | ||
2721 | w = (w < bud) ? w : bud) { | ||
2722 | assert(bsz < le32_to_cpu(tp->dmt_nleafs)); | ||
2723 | |||
2724 | /* determine the buddy. | ||
2725 | */ | ||
2726 | bud = w ^ bsz; | ||
2727 | |||
2728 | /* check if this buddy is the start of the system. | ||
2729 | */ | ||
2730 | if (leaf[bud] != NOFREE) { | ||
2731 | /* split the leaf at the start of the | ||
2732 | * system in two. | ||
2733 | */ | ||
2734 | cursz = leaf[bud] - 1; | ||
2735 | dbSplit(tp, bud, cursz, cursz); | ||
2736 | break; | ||
2737 | } | ||
2738 | } | ||
2739 | } | ||
2740 | |||
2741 | assert(leaf[leafno] == size); | ||
2742 | } | ||
2743 | |||
2744 | |||
2745 | /* | ||
2746 | * NAME: dbJoin() | ||
2747 | * | ||
2748 | * FUNCTION: update the leaf of a dmtree with a new value, joining | ||
2749 | * the leaf with other leaves of the dmtree into a multi-leaf | ||
2750 | * binary buddy system, as required. | ||
2751 | * | ||
2752 | * PARAMETERS: | ||
2753 | * tp - pointer to the tree containing the leaf. | ||
2754 | * leafno - the number of the leaf to be updated. | ||
2755 | * newval - the new value for the leaf. | ||
2756 | * | ||
2757 | * RETURN VALUES: none | ||
2758 | */ | ||
2759 | static void dbJoin(dmtree_t * tp, int leafno, int newval) | ||
2760 | { | ||
2761 | int budsz, buddy; | ||
2762 | s8 *leaf; | ||
2763 | |||
2764 | /* can the new leaf value require a join with other leaves ? | ||
2765 | */ | ||
2766 | if (newval >= tp->dmt_budmin) { | ||
2767 | /* pickup a pointer to the leaves of the tree. | ||
2768 | */ | ||
2769 | leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); | ||
2770 | |||
2771 | /* try to join the specified leaf into a large binary | ||
2772 | * buddy system. the join proceeds by attempting to join | ||
2773 | * the specified leafno with its buddy (leaf) at new value. | ||
2774 | * if the join occurs, we attempt to join the left leaf | ||
2775 | * of the joined buddies with its buddy at new value + 1. | ||
2776 | * we continue to join until we find a buddy that cannot be | ||
2777 | * joined (does not have a value equal to the size of the | ||
2778 | * last join) or until all leaves have been joined into a | ||
2779 | * single system. | ||
2780 | * | ||
2781 | * get the buddy size (number of words covered) of | ||
2782 | * the new value. | ||
2783 | */ | ||
2784 | budsz = BUDSIZE(newval, tp->dmt_budmin); | ||
2785 | |||
2786 | /* try to join. | ||
2787 | */ | ||
2788 | while (budsz < le32_to_cpu(tp->dmt_nleafs)) { | ||
2789 | /* get the buddy leaf. | ||
2790 | */ | ||
2791 | buddy = leafno ^ budsz; | ||
2792 | |||
2793 | /* if the leaf's new value is greater than its | ||
2794 | * buddy's value, we join no more. | ||
2795 | */ | ||
2796 | if (newval > leaf[buddy]) | ||
2797 | break; | ||
2798 | |||
2799 | assert(newval == leaf[buddy]); | ||
2800 | |||
2801 | /* check which (leafno or buddy) is the left buddy. | ||
2802 | * the left buddy gets to claim the blocks resulting | ||
2803 | * from the join while the right gets to claim none. | ||
2804 | * the left buddy is also eligable to participate in | ||
2805 | * a join at the next higher level while the right | ||
2806 | * is not. | ||
2807 | * | ||
2808 | */ | ||
2809 | if (leafno < buddy) { | ||
2810 | /* leafno is the left buddy. | ||
2811 | */ | ||
2812 | dbAdjTree(tp, buddy, NOFREE); | ||
2813 | } else { | ||
2814 | /* buddy is the left buddy and becomes | ||
2815 | * leafno. | ||
2816 | */ | ||
2817 | dbAdjTree(tp, leafno, NOFREE); | ||
2818 | leafno = buddy; | ||
2819 | } | ||
2820 | |||
2821 | /* on to try the next join. | ||
2822 | */ | ||
2823 | newval += 1; | ||
2824 | budsz <<= 1; | ||
2825 | } | ||
2826 | } | ||
2827 | |||
2828 | /* update the leaf value. | ||
2829 | */ | ||
2830 | dbAdjTree(tp, leafno, newval); | ||
2831 | } | ||
2832 | |||
2833 | |||
2834 | /* | ||
2835 | * NAME: dbAdjTree() | ||
2836 | * | ||
2837 | * FUNCTION: update a leaf of a dmtree with a new value, adjusting | ||
2838 | * the dmtree, as required, to reflect the new leaf value. | ||
2839 | * the combination of any buddies must already be done before | ||
2840 | * this is called. | ||
2841 | * | ||
2842 | * PARAMETERS: | ||
2843 | * tp - pointer to the tree to be adjusted. | ||
2844 | * leafno - the number of the leaf to be updated. | ||
2845 | * newval - the new value for the leaf. | ||
2846 | * | ||
2847 | * RETURN VALUES: none | ||
2848 | */ | ||
2849 | static void dbAdjTree(dmtree_t * tp, int leafno, int newval) | ||
2850 | { | ||
2851 | int lp, pp, k; | ||
2852 | int max; | ||
2853 | |||
2854 | /* pick up the index of the leaf for this leafno. | ||
2855 | */ | ||
2856 | lp = leafno + le32_to_cpu(tp->dmt_leafidx); | ||
2857 | |||
2858 | /* is the current value the same as the old value ? if so, | ||
2859 | * there is nothing to do. | ||
2860 | */ | ||
2861 | if (tp->dmt_stree[lp] == newval) | ||
2862 | return; | ||
2863 | |||
2864 | /* set the new value. | ||
2865 | */ | ||
2866 | tp->dmt_stree[lp] = newval; | ||
2867 | |||
2868 | /* bubble the new value up the tree as required. | ||
2869 | */ | ||
2870 | for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { | ||
2871 | /* get the index of the first leaf of the 4 leaf | ||
2872 | * group containing the specified leaf (leafno). | ||
2873 | */ | ||
2874 | lp = ((lp - 1) & ~0x03) + 1; | ||
2875 | |||
2876 | /* get the index of the parent of this 4 leaf group. | ||
2877 | */ | ||
2878 | pp = (lp - 1) >> 2; | ||
2879 | |||
2880 | /* determine the maximum of the 4 leaves. | ||
2881 | */ | ||
2882 | max = TREEMAX(&tp->dmt_stree[lp]); | ||
2883 | |||
2884 | /* if the maximum of the 4 is the same as the | ||
2885 | * parent's value, we're done. | ||
2886 | */ | ||
2887 | if (tp->dmt_stree[pp] == max) | ||
2888 | break; | ||
2889 | |||
2890 | /* parent gets new value. | ||
2891 | */ | ||
2892 | tp->dmt_stree[pp] = max; | ||
2893 | |||
2894 | /* parent becomes leaf for next go-round. | ||
2895 | */ | ||
2896 | lp = pp; | ||
2897 | } | ||
2898 | } | ||
2899 | |||
2900 | |||
2901 | /* | ||
2902 | * NAME: dbFindLeaf() | ||
2903 | * | ||
2904 | * FUNCTION: search a dmtree_t for sufficient free blocks, returning | ||
2905 | * the index of a leaf describing the free blocks if | ||
2906 | * sufficient free blocks are found. | ||
2907 | * | ||
2908 | * the search starts at the top of the dmtree_t tree and | ||
2909 | * proceeds down the tree to the leftmost leaf with sufficient | ||
2910 | * free space. | ||
2911 | * | ||
2912 | * PARAMETERS: | ||
2913 | * tp - pointer to the tree to be searched. | ||
2914 | * l2nb - log2 number of free blocks to search for. | ||
2915 | * leafidx - return pointer to be set to the index of the leaf | ||
2916 | * describing at least l2nb free blocks if sufficient | ||
2917 | * free blocks are found. | ||
2918 | * | ||
2919 | * RETURN VALUES: | ||
2920 | * 0 - success | ||
2921 | * -ENOSPC - insufficient free blocks. | ||
2922 | */ | ||
2923 | static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) | ||
2924 | { | ||
2925 | int ti, n = 0, k, x = 0; | ||
2926 | |||
2927 | /* first check the root of the tree to see if there is | ||
2928 | * sufficient free space. | ||
2929 | */ | ||
2930 | if (l2nb > tp->dmt_stree[ROOT]) | ||
2931 | return -ENOSPC; | ||
2932 | |||
2933 | /* sufficient free space available. now search down the tree | ||
2934 | * starting at the next level for the leftmost leaf that | ||
2935 | * describes sufficient free space. | ||
2936 | */ | ||
2937 | for (k = le32_to_cpu(tp->dmt_height), ti = 1; | ||
2938 | k > 0; k--, ti = ((ti + n) << 2) + 1) { | ||
2939 | /* search the four nodes at this level, starting from | ||
2940 | * the left. | ||
2941 | */ | ||
2942 | for (x = ti, n = 0; n < 4; n++) { | ||
2943 | /* sufficient free space found. move to the next | ||
2944 | * level (or quit if this is the last level). | ||
2945 | */ | ||
2946 | if (l2nb <= tp->dmt_stree[x + n]) | ||
2947 | break; | ||
2948 | } | ||
2949 | |||
2950 | /* better have found something since the higher | ||
2951 | * levels of the tree said it was here. | ||
2952 | */ | ||
2953 | assert(n < 4); | ||
2954 | } | ||
2955 | |||
2956 | /* set the return to the leftmost leaf describing sufficient | ||
2957 | * free space. | ||
2958 | */ | ||
2959 | *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); | ||
2960 | |||
2961 | return (0); | ||
2962 | } | ||
2963 | |||
2964 | |||
/*
 * NAME:	dbFindBits()
 *
 * FUNCTION:	find a specified number of binary buddy free bits within a
 *		dmap bitmap word value.
 *
 *		this routine searches the bitmap value for (1 << l2nb) free
 *		bits at (1 << l2nb) alignments within the value.  bit 0 is
 *		the most significant bit of the word.
 *
 * PARAMETERS:
 *	word	- dmap bitmap word value (a 1 bit means allocated).
 *	l2nb	- number of free bits specified as a log2 number; the
 *		  resulting bit count must not exceed DBWORD.
 *
 * RETURN VALUES:
 *	starting bit number of free bits.  if the word holds no suitably
 *	aligned free run, the scan ends past the last bit and the ASSERT
 *	below reports it.
 */
static int dbFindBits(u32 word, int l2nb)
{
	int bitno, nb;
	u32 mask;

	/* get the number of bits.
	 */
	nb = 1 << l2nb;
	assert(nb <= DBWORD);

	/* complement the word so that 1s represent free bits, and build
	 * a mask of nb ones positioned at the high order end.
	 */
	word = ~word;
	mask = ONES << (DBWORD - nb);

	/* scan the word for nb free bits at nb alignments.
	 */
	for (bitno = 0; mask != 0; bitno += nb) {
		if ((mask & word) == mask)
			break;

		/* slide the mask to the next aligned position.  when nb
		 * covers the whole word there is no next position, and
		 * shifting a 32-bit value by 32 is undefined (in practice
		 * it can leave the mask unchanged and loop forever), so
		 * clear the mask explicitly in that case.
		 */
		mask = (nb < DBWORD) ? mask >> nb : 0;
	}

	ASSERT(bitno < 32);

	/* return the bit number.
	 */
	return (bitno);
}
3010 | |||
3011 | |||
3012 | /* | ||
3013 | * NAME: dbMaxBud(u8 *cp) | ||
3014 | * | ||
3015 | * FUNCTION: determine the largest binary buddy string of free | ||
3016 | * bits within 32-bits of the map. | ||
3017 | * | ||
3018 | * PARAMETERS: | ||
3019 | * cp - pointer to the 32-bit value. | ||
3020 | * | ||
3021 | * RETURN VALUES: | ||
3022 | * largest binary buddy of free bits within a dmap word. | ||
3023 | */ | ||
3024 | static int dbMaxBud(u8 * cp) | ||
3025 | { | ||
3026 | signed char tmp1, tmp2; | ||
3027 | |||
3028 | /* check if the wmap word is all free. if so, the | ||
3029 | * free buddy size is BUDMIN. | ||
3030 | */ | ||
3031 | if (*((uint *) cp) == 0) | ||
3032 | return (BUDMIN); | ||
3033 | |||
3034 | /* check if the wmap word is half free. if so, the | ||
3035 | * free buddy size is BUDMIN-1. | ||
3036 | */ | ||
3037 | if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) | ||
3038 | return (BUDMIN - 1); | ||
3039 | |||
3040 | /* not all free or half free. determine the free buddy | ||
3041 | * size thru table lookup using quarters of the wmap word. | ||
3042 | */ | ||
3043 | tmp1 = max(budtab[cp[2]], budtab[cp[3]]); | ||
3044 | tmp2 = max(budtab[cp[0]], budtab[cp[1]]); | ||
3045 | return (max(tmp1, tmp2)); | ||
3046 | } | ||
3047 | |||
3048 | |||
/*
 * NAME:	cnttz(uint word)
 *
 * FUNCTION:	determine the number of trailing zeros within a 32-bit
 *		value.
 *
 * PARAMETERS:
 *	word	- 32-bit value to be examined.
 *
 * RETURN VALUES:
 *	count of trailing zeros (32 when the value is zero)
 */
static int cnttz(u32 word)
{
	int count = 0;

	/* drop low order bits until a set bit reaches position 0 or
	 * all 32 positions have been examined.
	 */
	while (count < 32 && !(word & 0x01)) {
		word >>= 1;
		count++;
	}

	return count;
}
3072 | |||
3073 | |||
/*
 * NAME:	cntlz(u32 value)
 *
 * FUNCTION:	determine the number of leading zeros within a 32-bit
 *		value.
 *
 * PARAMETERS:
 *	value	- 32-bit value to be examined.
 *
 * RETURN VALUES:
 *	count of leading zeros (32 when the value is zero)
 */
static int cntlz(u32 value)
{
	int count = 0;

	/* push bits out of the top until the HIGHORDER bit is set or
	 * all 32 positions have been examined.
	 */
	while (count < 32 && !(value & HIGHORDER)) {
		value <<= 1;
		count++;
	}

	return count;
}
3096 | |||
3097 | |||
/*
 * NAME:	blkstol2(s64 nb)
 *
 * FUNCTION:	convert a block count to its log2 value.  if the block
 *		count is not a power of two, it is rounded up to the next
 *		larger power of two.
 *
 * PARAMETERS:
 *	nb	- number of blocks; must be non-zero.
 *
 * RETURN VALUES:
 *	log2 number of blocks
 */
int blkstol2(s64 nb)
{
	/* work on the bit pattern as an unsigned quantity: building the
	 * top-bit mask with a signed shift overflows, and sliding it back
	 * down would depend on how the compiler shifts negative values.
	 */
	unsigned long long bits = (unsigned long long) nb;
	unsigned long long lead;
	int l2nb;

	/* look for the most significant set bit.
	 */
	for (l2nb = 64 - 1; l2nb >= 0; l2nb--) {
		lead = 1ULL << l2nb;

		if (bits & lead) {
			/* any bit set below the leading one means the
			 * count is not a power of two: round up.
			 */
			if (bits & (lead - 1))
				l2nb++;

			return (l2nb);
		}
	}

	/* a zero block count has no log2 value */
	assert(0);
	return 0;		/* fix compiler warning */
}
3139 | |||
3140 | |||
3141 | /* | ||
3142 | * NAME: dbAllocBottomUp() | ||
3143 | * | ||
3144 | * FUNCTION: alloc the specified block range from the working block | ||
3145 | * allocation map. | ||
3146 | * | ||
3147 | * the blocks will be alloc from the working map one dmap | ||
3148 | * at a time. | ||
3149 | * | ||
3150 | * PARAMETERS: | ||
3151 | * ip - pointer to in-core inode; | ||
3152 | * blkno - starting block number to be freed. | ||
3153 | * nblocks - number of blocks to be freed. | ||
3154 | * | ||
3155 | * RETURN VALUES: | ||
3156 | * 0 - success | ||
3157 | * -EIO - i/o error | ||
3158 | */ | ||
3159 | int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) | ||
3160 | { | ||
3161 | struct metapage *mp; | ||
3162 | struct dmap *dp; | ||
3163 | int nb, rc; | ||
3164 | s64 lblkno, rem; | ||
3165 | struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; | ||
3166 | struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; | ||
3167 | |||
3168 | IREAD_LOCK(ipbmap); | ||
3169 | |||
3170 | /* block to be allocated better be within the mapsize. */ | ||
3171 | ASSERT(nblocks <= bmp->db_mapsize - blkno); | ||
3172 | |||
3173 | /* | ||
3174 | * allocate the blocks a dmap at a time. | ||
3175 | */ | ||
3176 | mp = NULL; | ||
3177 | for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { | ||
3178 | /* release previous dmap if any */ | ||
3179 | if (mp) { | ||
3180 | write_metapage(mp); | ||
3181 | } | ||
3182 | |||
3183 | /* get the buffer for the current dmap. */ | ||
3184 | lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); | ||
3185 | mp = read_metapage(ipbmap, lblkno, PSIZE, 0); | ||
3186 | if (mp == NULL) { | ||
3187 | IREAD_UNLOCK(ipbmap); | ||
3188 | return -EIO; | ||
3189 | } | ||
3190 | dp = (struct dmap *) mp->data; | ||
3191 | |||
3192 | /* determine the number of blocks to be allocated from | ||
3193 | * this dmap. | ||
3194 | */ | ||
3195 | nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); | ||
3196 | |||
3197 | DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); | ||
3198 | |||
3199 | /* allocate the blocks. */ | ||
3200 | if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { | ||
3201 | release_metapage(mp); | ||
3202 | IREAD_UNLOCK(ipbmap); | ||
3203 | return (rc); | ||
3204 | } | ||
3205 | |||
3206 | DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); | ||
3207 | } | ||
3208 | |||
3209 | /* write the last buffer. */ | ||
3210 | write_metapage(mp); | ||
3211 | |||
3212 | IREAD_UNLOCK(ipbmap); | ||
3213 | |||
3214 | return (0); | ||
3215 | } | ||
3216 | |||
3217 | |||
3218 | static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, | ||
3219 | int nblocks) | ||
3220 | { | ||
3221 | int rc; | ||
3222 | int dbitno, word, rembits, nb, nwords, wbitno, agno; | ||
3223 | s8 oldroot, *leaf; | ||
3224 | struct dmaptree *tp = (struct dmaptree *) & dp->tree; | ||
3225 | |||
3226 | /* save the current value of the root (i.e. maximum free string) | ||
3227 | * of the dmap tree. | ||
3228 | */ | ||
3229 | oldroot = tp->stree[ROOT]; | ||
3230 | |||
3231 | /* pick up a pointer to the leaves of the dmap tree */ | ||
3232 | leaf = tp->stree + LEAFIND; | ||
3233 | |||
3234 | /* determine the bit number and word within the dmap of the | ||
3235 | * starting block. | ||
3236 | */ | ||
3237 | dbitno = blkno & (BPERDMAP - 1); | ||
3238 | word = dbitno >> L2DBWORD; | ||
3239 | |||
3240 | /* block range better be within the dmap */ | ||
3241 | assert(dbitno + nblocks <= BPERDMAP); | ||
3242 | |||
3243 | /* allocate the bits of the dmap's words corresponding to the block | ||
3244 | * range. not all bits of the first and last words may be contained | ||
3245 | * within the block range. if this is the case, we'll work against | ||
3246 | * those words (i.e. partial first and/or last) on an individual basis | ||
3247 | * (a single pass), allocating the bits of interest by hand and | ||
3248 | * updating the leaf corresponding to the dmap word. a single pass | ||
3249 | * will be used for all dmap words fully contained within the | ||
3250 | * specified range. within this pass, the bits of all fully contained | ||
3251 | * dmap words will be marked as free in a single shot and the leaves | ||
3252 | * will be updated. a single leaf may describe the free space of | ||
3253 | * multiple dmap words, so we may update only a subset of the actual | ||
3254 | * leaves corresponding to the dmap words of the block range. | ||
3255 | */ | ||
3256 | for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { | ||
3257 | /* determine the bit number within the word and | ||
3258 | * the number of bits within the word. | ||
3259 | */ | ||
3260 | wbitno = dbitno & (DBWORD - 1); | ||
3261 | nb = min(rembits, DBWORD - wbitno); | ||
3262 | |||
3263 | /* check if only part of a word is to be allocated. | ||
3264 | */ | ||
3265 | if (nb < DBWORD) { | ||
3266 | /* allocate (set to 1) the appropriate bits within | ||
3267 | * this dmap word. | ||
3268 | */ | ||
3269 | dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) | ||
3270 | >> wbitno); | ||
3271 | |||
3272 | word++; | ||
3273 | } else { | ||
3274 | /* one or more dmap words are fully contained | ||
3275 | * within the block range. determine how many | ||
3276 | * words and allocate (set to 1) the bits of these | ||
3277 | * words. | ||
3278 | */ | ||
3279 | nwords = rembits >> L2DBWORD; | ||
3280 | memset(&dp->wmap[word], (int) ONES, nwords * 4); | ||
3281 | |||
3282 | /* determine how many bits */ | ||
3283 | nb = nwords << L2DBWORD; | ||
3284 | word += nwords; | ||
3285 | } | ||
3286 | } | ||
3287 | |||
3288 | /* update the free count for this dmap */ | ||
3289 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); | ||
3290 | |||
3291 | /* reconstruct summary tree */ | ||
3292 | dbInitDmapTree(dp); | ||
3293 | |||
3294 | BMAP_LOCK(bmp); | ||
3295 | |||
3296 | /* if this allocation group is completely free, | ||
3297 | * update the highest active allocation group number | ||
3298 | * if this allocation group is the new max. | ||
3299 | */ | ||
3300 | agno = blkno >> bmp->db_agl2size; | ||
3301 | if (agno > bmp->db_maxag) | ||
3302 | bmp->db_maxag = agno; | ||
3303 | |||
3304 | /* update the free count for the allocation group and map */ | ||
3305 | bmp->db_agfree[agno] -= nblocks; | ||
3306 | bmp->db_nfree -= nblocks; | ||
3307 | |||
3308 | BMAP_UNLOCK(bmp); | ||
3309 | |||
3310 | /* if the root has not changed, done. */ | ||
3311 | if (tp->stree[ROOT] == oldroot) | ||
3312 | return (0); | ||
3313 | |||
3314 | /* root changed. bubble the change up to the dmap control pages. | ||
3315 | * if the adjustment of the upper level control pages fails, | ||
3316 | * backout the bit allocation (thus making everything consistent). | ||
3317 | */ | ||
3318 | if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0))) | ||
3319 | dbFreeBits(bmp, dp, blkno, nblocks); | ||
3320 | |||
3321 | return (rc); | ||
3322 | } | ||
3323 | |||
3324 | |||
3325 | /* | ||
3326 | * NAME: dbExtendFS() | ||
3327 | * | ||
3328 | * FUNCTION: extend bmap from blkno for nblocks; | ||
3329 | * dbExtendFS() updates bmap ready for dbAllocBottomUp(); | ||
3330 | * | ||
3331 | * L2 | ||
3332 | * | | ||
3333 | * L1---------------------------------L1 | ||
3334 | * | | | ||
3335 | * L0---------L0---------L0 L0---------L0---------L0 | ||
3336 | * | | | | | | | ||
3337 | * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; | ||
3338 | * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm | ||
3339 | * | ||
3340 | * <---old---><----------------------------extend-----------------------> | ||
3341 | */ | ||
3342 | int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) | ||
3343 | { | ||
3344 | struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb); | ||
3345 | int nbperpage = sbi->nbperpage; | ||
3346 | int i, i0 = TRUE, j, j0 = TRUE, k, n; | ||
3347 | s64 newsize; | ||
3348 | s64 p; | ||
3349 | struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL; | ||
3350 | struct dmapctl *l2dcp, *l1dcp, *l0dcp; | ||
3351 | struct dmap *dp; | ||
3352 | s8 *l0leaf, *l1leaf, *l2leaf; | ||
3353 | struct bmap *bmp = sbi->bmap; | ||
3354 | int agno, l2agsize, oldl2agsize; | ||
3355 | s64 ag_rem; | ||
3356 | |||
3357 | newsize = blkno + nblocks; | ||
3358 | |||
3359 | jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld", | ||
3360 | (long long) blkno, (long long) nblocks, (long long) newsize); | ||
3361 | |||
3362 | /* | ||
3363 | * initialize bmap control page. | ||
3364 | * | ||
3365 | * all the data in bmap control page should exclude | ||
3366 | * the mkfs hidden dmap page. | ||
3367 | */ | ||
3368 | |||
3369 | /* update mapsize */ | ||
3370 | bmp->db_mapsize = newsize; | ||
3371 | bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize); | ||
3372 | |||
3373 | /* compute new AG size */ | ||
3374 | l2agsize = dbGetL2AGSize(newsize); | ||
3375 | oldl2agsize = bmp->db_agl2size; | ||
3376 | |||
3377 | bmp->db_agl2size = l2agsize; | ||
3378 | bmp->db_agsize = 1 << l2agsize; | ||
3379 | |||
3380 | /* compute new number of AG */ | ||
3381 | agno = bmp->db_numag; | ||
3382 | bmp->db_numag = newsize >> l2agsize; | ||
3383 | bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; | ||
3384 | |||
3385 | /* | ||
3386 | * reconfigure db_agfree[] | ||
3387 | * from old AG configuration to new AG configuration; | ||
3388 | * | ||
3389 | * coalesce contiguous k (newAGSize/oldAGSize) AGs; | ||
3390 | * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; | ||
3391 | * note: new AG size = old AG size * (2**x). | ||
3392 | */ | ||
3393 | if (l2agsize == oldl2agsize) | ||
3394 | goto extend; | ||
3395 | k = 1 << (l2agsize - oldl2agsize); | ||
3396 | ag_rem = bmp->db_agfree[0]; /* save agfree[0] */ | ||
3397 | for (i = 0, n = 0; i < agno; n++) { | ||
3398 | bmp->db_agfree[n] = 0; /* init collection point */ | ||
3399 | |||
3400 | /* coalesce cotiguous k AGs; */ | ||
3401 | for (j = 0; j < k && i < agno; j++, i++) { | ||
3402 | /* merge AGi to AGn */ | ||
3403 | bmp->db_agfree[n] += bmp->db_agfree[i]; | ||
3404 | } | ||
3405 | } | ||
3406 | bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */ | ||
3407 | |||
3408 | for (; n < MAXAG; n++) | ||
3409 | bmp->db_agfree[n] = 0; | ||
3410 | |||
3411 | /* | ||
3412 | * update highest active ag number | ||
3413 | */ | ||
3414 | |||
3415 | bmp->db_maxag = bmp->db_maxag / k; | ||
3416 | |||
3417 | /* | ||
3418 | * extend bmap | ||
3419 | * | ||
3420 | * update bit maps and corresponding level control pages; | ||
3421 | * global control page db_nfree, db_agfree[agno], db_maxfreebud; | ||
3422 | */ | ||
3423 | extend: | ||
3424 | /* get L2 page */ | ||
3425 | p = BMAPBLKNO + nbperpage; /* L2 page */ | ||
3426 | l2mp = read_metapage(ipbmap, p, PSIZE, 0); | ||
3427 | if (!l2mp) { | ||
3428 | jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); | ||
3429 | return -EIO; | ||
3430 | } | ||
3431 | l2dcp = (struct dmapctl *) l2mp->data; | ||
3432 | |||
3433 | /* compute start L1 */ | ||
3434 | k = blkno >> L2MAXL1SIZE; | ||
3435 | l2leaf = l2dcp->stree + CTLLEAFIND + k; | ||
3436 | p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */ | ||
3437 | |||
3438 | /* | ||
3439 | * extend each L1 in L2 | ||
3440 | */ | ||
3441 | for (; k < LPERCTL; k++, p += nbperpage) { | ||
3442 | /* get L1 page */ | ||
3443 | if (j0) { | ||
3444 | /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */ | ||
3445 | l1mp = read_metapage(ipbmap, p, PSIZE, 0); | ||
3446 | if (l1mp == NULL) | ||
3447 | goto errout; | ||
3448 | l1dcp = (struct dmapctl *) l1mp->data; | ||
3449 | |||
3450 | /* compute start L0 */ | ||
3451 | j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE; | ||
3452 | l1leaf = l1dcp->stree + CTLLEAFIND + j; | ||
3453 | p = BLKTOL0(blkno, sbi->l2nbperpage); | ||
3454 | j0 = FALSE; | ||
3455 | } else { | ||
3456 | /* assign/init L1 page */ | ||
3457 | l1mp = get_metapage(ipbmap, p, PSIZE, 0); | ||
3458 | if (l1mp == NULL) | ||
3459 | goto errout; | ||
3460 | |||
3461 | l1dcp = (struct dmapctl *) l1mp->data; | ||
3462 | |||
3463 | /* compute start L0 */ | ||
3464 | j = 0; | ||
3465 | l1leaf = l1dcp->stree + CTLLEAFIND; | ||
3466 | p += nbperpage; /* 1st L0 of L1.k */ | ||
3467 | } | ||
3468 | |||
3469 | /* | ||
3470 | * extend each L0 in L1 | ||
3471 | */ | ||
3472 | for (; j < LPERCTL; j++) { | ||
3473 | /* get L0 page */ | ||
3474 | if (i0) { | ||
3475 | /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */ | ||
3476 | |||
3477 | l0mp = read_metapage(ipbmap, p, PSIZE, 0); | ||
3478 | if (l0mp == NULL) | ||
3479 | goto errout; | ||
3480 | l0dcp = (struct dmapctl *) l0mp->data; | ||
3481 | |||
3482 | /* compute start dmap */ | ||
3483 | i = (blkno & (MAXL0SIZE - 1)) >> | ||
3484 | L2BPERDMAP; | ||
3485 | l0leaf = l0dcp->stree + CTLLEAFIND + i; | ||
3486 | p = BLKTODMAP(blkno, | ||
3487 | sbi->l2nbperpage); | ||
3488 | i0 = FALSE; | ||
3489 | } else { | ||
3490 | /* assign/init L0 page */ | ||
3491 | l0mp = get_metapage(ipbmap, p, PSIZE, 0); | ||
3492 | if (l0mp == NULL) | ||
3493 | goto errout; | ||
3494 | |||
3495 | l0dcp = (struct dmapctl *) l0mp->data; | ||
3496 | |||
3497 | /* compute start dmap */ | ||
3498 | i = 0; | ||
3499 | l0leaf = l0dcp->stree + CTLLEAFIND; | ||
3500 | p += nbperpage; /* 1st dmap of L0.j */ | ||
3501 | } | ||
3502 | |||
3503 | /* | ||
3504 | * extend each dmap in L0 | ||
3505 | */ | ||
3506 | for (; i < LPERCTL; i++) { | ||
3507 | /* | ||
3508 | * reconstruct the dmap page, and | ||
3509 | * initialize corresponding parent L0 leaf | ||
3510 | */ | ||
3511 | if ((n = blkno & (BPERDMAP - 1))) { | ||
3512 | /* read in dmap page: */ | ||
3513 | mp = read_metapage(ipbmap, p, | ||
3514 | PSIZE, 0); | ||
3515 | if (mp == NULL) | ||
3516 | goto errout; | ||
3517 | n = min(nblocks, (s64)BPERDMAP - n); | ||
3518 | } else { | ||
3519 | /* assign/init dmap page */ | ||
3520 | mp = read_metapage(ipbmap, p, | ||
3521 | PSIZE, 0); | ||
3522 | if (mp == NULL) | ||
3523 | goto errout; | ||
3524 | |||
3525 | n = min(nblocks, (s64)BPERDMAP); | ||
3526 | } | ||
3527 | |||
3528 | dp = (struct dmap *) mp->data; | ||
3529 | *l0leaf = dbInitDmap(dp, blkno, n); | ||
3530 | |||
3531 | bmp->db_nfree += n; | ||
3532 | agno = le64_to_cpu(dp->start) >> l2agsize; | ||
3533 | bmp->db_agfree[agno] += n; | ||
3534 | |||
3535 | write_metapage(mp); | ||
3536 | |||
3537 | l0leaf++; | ||
3538 | p += nbperpage; | ||
3539 | |||
3540 | blkno += n; | ||
3541 | nblocks -= n; | ||
3542 | if (nblocks == 0) | ||
3543 | break; | ||
3544 | } /* for each dmap in a L0 */ | ||
3545 | |||
3546 | /* | ||
3547 | * build current L0 page from its leaves, and | ||
3548 | * initialize corresponding parent L1 leaf | ||
3549 | */ | ||
3550 | *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); | ||
3551 | write_metapage(l0mp); | ||
3552 | l0mp = NULL; | ||
3553 | |||
3554 | if (nblocks) | ||
3555 | l1leaf++; /* continue for next L0 */ | ||
3556 | else { | ||
3557 | /* more than 1 L0 ? */ | ||
3558 | if (j > 0) | ||
3559 | break; /* build L1 page */ | ||
3560 | else { | ||
3561 | /* summarize in global bmap page */ | ||
3562 | bmp->db_maxfreebud = *l1leaf; | ||
3563 | release_metapage(l1mp); | ||
3564 | release_metapage(l2mp); | ||
3565 | goto finalize; | ||
3566 | } | ||
3567 | } | ||
3568 | } /* for each L0 in a L1 */ | ||
3569 | |||
3570 | /* | ||
3571 | * build current L1 page from its leaves, and | ||
3572 | * initialize corresponding parent L2 leaf | ||
3573 | */ | ||
3574 | *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); | ||
3575 | write_metapage(l1mp); | ||
3576 | l1mp = NULL; | ||
3577 | |||
3578 | if (nblocks) | ||
3579 | l2leaf++; /* continue for next L1 */ | ||
3580 | else { | ||
3581 | /* more than 1 L1 ? */ | ||
3582 | if (k > 0) | ||
3583 | break; /* build L2 page */ | ||
3584 | else { | ||
3585 | /* summarize in global bmap page */ | ||
3586 | bmp->db_maxfreebud = *l2leaf; | ||
3587 | release_metapage(l2mp); | ||
3588 | goto finalize; | ||
3589 | } | ||
3590 | } | ||
3591 | } /* for each L1 in a L2 */ | ||
3592 | |||
3593 | jfs_error(ipbmap->i_sb, | ||
3594 | "dbExtendFS: function has not returned as expected"); | ||
3595 | errout: | ||
3596 | if (l0mp) | ||
3597 | release_metapage(l0mp); | ||
3598 | if (l1mp) | ||
3599 | release_metapage(l1mp); | ||
3600 | release_metapage(l2mp); | ||
3601 | return -EIO; | ||
3602 | |||
3603 | /* | ||
3604 | * finalize bmap control page | ||
3605 | */ | ||
3606 | finalize: | ||
3607 | |||
3608 | return 0; | ||
3609 | } | ||
3610 | |||
3611 | |||
3612 | /* | ||
3613 | * dbFinalizeBmap() | ||
3614 | */ | ||
3615 | void dbFinalizeBmap(struct inode *ipbmap) | ||
3616 | { | ||
3617 | struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; | ||
3618 | int actags, inactags, l2nl; | ||
3619 | s64 ag_rem, actfree, inactfree, avgfree; | ||
3620 | int i, n; | ||
3621 | |||
3622 | /* | ||
3623 | * finalize bmap control page | ||
3624 | */ | ||
3625 | //finalize: | ||
3626 | /* | ||
3627 | * compute db_agpref: preferred ag to allocate from | ||
3628 | * (the leftmost ag with average free space in it); | ||
3629 | */ | ||
3630 | //agpref: | ||
3631 | /* get the number of active ags and inacitve ags */ | ||
3632 | actags = bmp->db_maxag + 1; | ||
3633 | inactags = bmp->db_numag - actags; | ||
3634 | ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ | ||
3635 | |||
3636 | /* determine how many blocks are in the inactive allocation | ||
3637 | * groups. in doing this, we must account for the fact that | ||
3638 | * the rightmost group might be a partial group (i.e. file | ||
3639 | * system size is not a multiple of the group size). | ||
3640 | */ | ||
3641 | inactfree = (inactags && ag_rem) ? | ||
3642 | ((inactags - 1) << bmp->db_agl2size) + ag_rem | ||
3643 | : inactags << bmp->db_agl2size; | ||
3644 | |||
3645 | /* determine how many free blocks are in the active | ||
3646 | * allocation groups plus the average number of free blocks | ||
3647 | * within the active ags. | ||
3648 | */ | ||
3649 | actfree = bmp->db_nfree - inactfree; | ||
3650 | avgfree = (u32) actfree / (u32) actags; | ||
3651 | |||
3652 | /* if the preferred allocation group has not average free space. | ||
3653 | * re-establish the preferred group as the leftmost | ||
3654 | * group with average free space. | ||
3655 | */ | ||
3656 | if (bmp->db_agfree[bmp->db_agpref] < avgfree) { | ||
3657 | for (bmp->db_agpref = 0; bmp->db_agpref < actags; | ||
3658 | bmp->db_agpref++) { | ||
3659 | if (bmp->db_agfree[bmp->db_agpref] >= avgfree) | ||
3660 | break; | ||
3661 | } | ||
3662 | if (bmp->db_agpref >= bmp->db_numag) { | ||
3663 | jfs_error(ipbmap->i_sb, | ||
3664 | "cannot find ag with average freespace"); | ||
3665 | } | ||
3666 | } | ||
3667 | |||
3668 | /* | ||
3669 | * compute db_aglevel, db_agheigth, db_width, db_agstart: | ||
3670 | * an ag is covered in aglevel dmapctl summary tree, | ||
3671 | * at agheight level height (from leaf) with agwidth number of nodes | ||
3672 | * each, which starts at agstart index node of the smmary tree node | ||
3673 | * array; | ||
3674 | */ | ||
3675 | bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); | ||
3676 | l2nl = | ||
3677 | bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); | ||
3678 | bmp->db_agheigth = l2nl >> 1; | ||
3679 | bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); | ||
3680 | for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; | ||
3681 | i--) { | ||
3682 | bmp->db_agstart += n; | ||
3683 | n <<= 2; | ||
3684 | } | ||
3685 | |||
3686 | } | ||
3687 | |||
3688 | |||
3689 | /* | ||
3690 | * NAME: dbInitDmap()/ujfs_idmap_page() | ||
3691 | * | ||
3692 | * FUNCTION: initialize working/persistent bitmap of the dmap page | ||
3693 | * for the specified number of blocks: | ||
3694 | * | ||
3695 | * at entry, the bitmaps had been initialized as free (ZEROS); | ||
3696 | * The number of blocks will only account for the actually | ||
3697 | * existing blocks. Blocks which don't actually exist in | ||
3698 | * the aggregate will be marked as allocated (ONES); | ||
3699 | * | ||
3700 | * PARAMETERS: | ||
3701 | * dp - pointer to page of map | ||
3702 | * nblocks - number of blocks this page | ||
3703 | * | ||
3704 | * RETURNS: NONE | ||
3705 | */ | ||
3706 | static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) | ||
3707 | { | ||
3708 | int blkno, w, b, r, nw, nb, i; | ||
3709 | |||
3710 | /* starting block number within the dmap */ | ||
3711 | blkno = Blkno & (BPERDMAP - 1); | ||
3712 | |||
3713 | if (blkno == 0) { | ||
3714 | dp->nblocks = dp->nfree = cpu_to_le32(nblocks); | ||
3715 | dp->start = cpu_to_le64(Blkno); | ||
3716 | |||
3717 | if (nblocks == BPERDMAP) { | ||
3718 | memset(&dp->wmap[0], 0, LPERDMAP * 4); | ||
3719 | memset(&dp->pmap[0], 0, LPERDMAP * 4); | ||
3720 | goto initTree; | ||
3721 | } | ||
3722 | } else { | ||
3723 | dp->nblocks = | ||
3724 | cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks); | ||
3725 | dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); | ||
3726 | } | ||
3727 | |||
3728 | /* word number containing start block number */ | ||
3729 | w = blkno >> L2DBWORD; | ||
3730 | |||
3731 | /* | ||
3732 | * free the bits corresponding to the block range (ZEROS): | ||
3733 | * note: not all bits of the first and last words may be contained | ||
3734 | * within the block range. | ||
3735 | */ | ||
3736 | for (r = nblocks; r > 0; r -= nb, blkno += nb) { | ||
3737 | /* number of bits preceding range to be freed in the word */ | ||
3738 | b = blkno & (DBWORD - 1); | ||
3739 | /* number of bits to free in the word */ | ||
3740 | nb = min(r, DBWORD - b); | ||
3741 | |||
3742 | /* is partial word to be freed ? */ | ||
3743 | if (nb < DBWORD) { | ||
3744 | /* free (set to 0) from the bitmap word */ | ||
3745 | dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) | ||
3746 | >> b)); | ||
3747 | dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) | ||
3748 | >> b)); | ||
3749 | |||
3750 | /* skip the word freed */ | ||
3751 | w++; | ||
3752 | } else { | ||
3753 | /* free (set to 0) contiguous bitmap words */ | ||
3754 | nw = r >> L2DBWORD; | ||
3755 | memset(&dp->wmap[w], 0, nw * 4); | ||
3756 | memset(&dp->pmap[w], 0, nw * 4); | ||
3757 | |||
3758 | /* skip the words freed */ | ||
3759 | nb = nw << L2DBWORD; | ||
3760 | w += nw; | ||
3761 | } | ||
3762 | } | ||
3763 | |||
3764 | /* | ||
3765 | * mark bits following the range to be freed (non-existing | ||
3766 | * blocks) as allocated (ONES) | ||
3767 | */ | ||
3768 | |||
3769 | if (blkno == BPERDMAP) | ||
3770 | goto initTree; | ||
3771 | |||
3772 | /* the first word beyond the end of existing blocks */ | ||
3773 | w = blkno >> L2DBWORD; | ||
3774 | |||
3775 | /* does nblocks fall on a 32-bit boundary ? */ | ||
3776 | b = blkno & (DBWORD - 1); | ||
3777 | if (b) { | ||
3778 | /* mark a partial word allocated */ | ||
3779 | dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); | ||
3780 | w++; | ||
3781 | } | ||
3782 | |||
3783 | /* set the rest of the words in the page to allocated (ONES) */ | ||
3784 | for (i = w; i < LPERDMAP; i++) | ||
3785 | dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES); | ||
3786 | |||
3787 | /* | ||
3788 | * init tree | ||
3789 | */ | ||
3790 | initTree: | ||
3791 | return (dbInitDmapTree(dp)); | ||
3792 | } | ||
3793 | |||
3794 | |||
3795 | /* | ||
3796 | * NAME: dbInitDmapTree()/ujfs_complete_dmap() | ||
3797 | * | ||
3798 | * FUNCTION: initialize summary tree of the specified dmap: | ||
3799 | * | ||
3800 | * at entry, bitmap of the dmap has been initialized; | ||
3801 | * | ||
3802 | * PARAMETERS: | ||
3803 | * dp - dmap to complete | ||
3804 | * blkno - starting block number for this dmap | ||
3805 | * treemax - will be filled in with max free for this dmap | ||
3806 | * | ||
3807 | * RETURNS: max free string at the root of the tree | ||
3808 | */ | ||
3809 | static int dbInitDmapTree(struct dmap * dp) | ||
3810 | { | ||
3811 | struct dmaptree *tp; | ||
3812 | s8 *cp; | ||
3813 | int i; | ||
3814 | |||
3815 | /* init fixed info of tree */ | ||
3816 | tp = &dp->tree; | ||
3817 | tp->nleafs = cpu_to_le32(LPERDMAP); | ||
3818 | tp->l2nleafs = cpu_to_le32(L2LPERDMAP); | ||
3819 | tp->leafidx = cpu_to_le32(LEAFIND); | ||
3820 | tp->height = cpu_to_le32(4); | ||
3821 | tp->budmin = BUDMIN; | ||
3822 | |||
3823 | /* init each leaf from corresponding wmap word: | ||
3824 | * note: leaf is set to NOFREE(-1) if all blocks of corresponding | ||
3825 | * bitmap word are allocated. | ||
3826 | */ | ||
3827 | cp = tp->stree + le32_to_cpu(tp->leafidx); | ||
3828 | for (i = 0; i < LPERDMAP; i++) | ||
3829 | *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); | ||
3830 | |||
3831 | /* build the dmap's binary buddy summary tree */ | ||
3832 | return (dbInitTree(tp)); | ||
3833 | } | ||
3834 | |||
3835 | |||
3836 | /* | ||
3837 | * NAME: dbInitTree()/ujfs_adjtree() | ||
3838 | * | ||
3839 | * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. | ||
3840 | * | ||
3841 | * at entry, the leaves of the tree has been initialized | ||
3842 | * from corresponding bitmap word or root of summary tree | ||
3843 | * of the child control page; | ||
3844 | * configure binary buddy system at the leaf level, then | ||
3845 | * bubble up the values of the leaf nodes up the tree. | ||
3846 | * | ||
3847 | * PARAMETERS: | ||
3848 | * cp - Pointer to the root of the tree | ||
3849 | * l2leaves- Number of leaf nodes as a power of 2 | ||
3850 | * l2min - Number of blocks that can be covered by a leaf | ||
3851 | * as a power of 2 | ||
3852 | * | ||
3853 | * RETURNS: max free string at the root of the tree | ||
3854 | */ | ||
3855 | static int dbInitTree(struct dmaptree * dtp) | ||
3856 | { | ||
3857 | int l2max, l2free, bsize, nextb, i; | ||
3858 | int child, parent, nparent; | ||
3859 | s8 *tp, *cp, *cp1; | ||
3860 | |||
3861 | tp = dtp->stree; | ||
3862 | |||
3863 | /* Determine the maximum free string possible for the leaves */ | ||
3864 | l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; | ||
3865 | |||
3866 | /* | ||
3867 | * configure the leaf levevl into binary buddy system | ||
3868 | * | ||
3869 | * Try to combine buddies starting with a buddy size of 1 | ||
3870 | * (i.e. two leaves). At a buddy size of 1 two buddy leaves | ||
3871 | * can be combined if both buddies have a maximum free of l2min; | ||
3872 | * the combination will result in the left-most buddy leaf having | ||
3873 | * a maximum free of l2min+1. | ||
3874 | * After processing all buddies for a given size, process buddies | ||
3875 | * at the next higher buddy size (i.e. current size * 2) and | ||
3876 | * the next maximum free (current free + 1). | ||
3877 | * This continues until the maximum possible buddy combination | ||
3878 | * yields maximum free. | ||
3879 | */ | ||
3880 | for (l2free = dtp->budmin, bsize = 1; l2free < l2max; | ||
3881 | l2free++, bsize = nextb) { | ||
3882 | /* get next buddy size == current buddy pair size */ | ||
3883 | nextb = bsize << 1; | ||
3884 | |||
3885 | /* scan each adjacent buddy pair at current buddy size */ | ||
3886 | for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); | ||
3887 | i < le32_to_cpu(dtp->nleafs); | ||
3888 | i += nextb, cp += nextb) { | ||
3889 | /* coalesce if both adjacent buddies are max free */ | ||
3890 | if (*cp == l2free && *(cp + bsize) == l2free) { | ||
3891 | *cp = l2free + 1; /* left take right */ | ||
3892 | *(cp + bsize) = -1; /* right give left */ | ||
3893 | } | ||
3894 | } | ||
3895 | } | ||
3896 | |||
3897 | /* | ||
3898 | * bubble summary information of leaves up the tree. | ||
3899 | * | ||
3900 | * Starting at the leaf node level, the four nodes described by | ||
3901 | * the higher level parent node are compared for a maximum free and | ||
3902 | * this maximum becomes the value of the parent node. | ||
3903 | * when all lower level nodes are processed in this fashion then | ||
3904 | * move up to the next level (parent becomes a lower level node) and | ||
3905 | * continue the process for that level. | ||
3906 | */ | ||
3907 | for (child = le32_to_cpu(dtp->leafidx), | ||
3908 | nparent = le32_to_cpu(dtp->nleafs) >> 2; | ||
3909 | nparent > 0; nparent >>= 2, child = parent) { | ||
3910 | /* get index of 1st node of parent level */ | ||
3911 | parent = (child - 1) >> 2; | ||
3912 | |||
3913 | /* set the value of the parent node as the maximum | ||
3914 | * of the four nodes of the current level. | ||
3915 | */ | ||
3916 | for (i = 0, cp = tp + child, cp1 = tp + parent; | ||
3917 | i < nparent; i++, cp += 4, cp1++) | ||
3918 | *cp1 = TREEMAX(cp); | ||
3919 | } | ||
3920 | |||
3921 | return (*tp); | ||
3922 | } | ||
3923 | |||
3924 | |||
3925 | /* | ||
3926 | * dbInitDmapCtl() | ||
3927 | * | ||
3928 | * function: initialize dmapctl page | ||
3929 | */ | ||
3930 | static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) | ||
3931 | { /* start leaf index not covered by range */ | ||
3932 | s8 *cp; | ||
3933 | |||
3934 | dcp->nleafs = cpu_to_le32(LPERCTL); | ||
3935 | dcp->l2nleafs = cpu_to_le32(L2LPERCTL); | ||
3936 | dcp->leafidx = cpu_to_le32(CTLLEAFIND); | ||
3937 | dcp->height = cpu_to_le32(5); | ||
3938 | dcp->budmin = L2BPERDMAP + L2LPERCTL * level; | ||
3939 | |||
3940 | /* | ||
3941 | * initialize the leaves of current level that were not covered | ||
3942 | * by the specified input block range (i.e. the leaves have no | ||
3943 | * low level dmapctl or dmap). | ||
3944 | */ | ||
3945 | cp = &dcp->stree[CTLLEAFIND + i]; | ||
3946 | for (; i < LPERCTL; i++) | ||
3947 | *cp++ = NOFREE; | ||
3948 | |||
3949 | /* build the dmap's binary buddy summary tree */ | ||
3950 | return (dbInitTree((struct dmaptree *) dcp)); | ||
3951 | } | ||
3952 | |||
3953 | |||
3954 | /* | ||
3955 | * NAME: dbGetL2AGSize()/ujfs_getagl2size() | ||
3956 | * | ||
3957 | * FUNCTION: Determine log2(allocation group size) from aggregate size | ||
3958 | * | ||
3959 | * PARAMETERS: | ||
3960 | * nblocks - Number of blocks in aggregate | ||
3961 | * | ||
3962 | * RETURNS: log2(allocation group size) in aggregate blocks | ||
3963 | */ | ||
3964 | static int dbGetL2AGSize(s64 nblocks) | ||
3965 | { | ||
3966 | s64 sz; | ||
3967 | s64 m; | ||
3968 | int l2sz; | ||
3969 | |||
3970 | if (nblocks < BPERDMAP * MAXAG) | ||
3971 | return (L2BPERDMAP); | ||
3972 | |||
3973 | /* round up aggregate size to power of 2 */ | ||
3974 | m = ((u64) 1 << (64 - 1)); | ||
3975 | for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { | ||
3976 | if (m & nblocks) | ||
3977 | break; | ||
3978 | } | ||
3979 | |||
3980 | sz = (s64) 1 << l2sz; | ||
3981 | if (sz < nblocks) | ||
3982 | l2sz += 1; | ||
3983 | |||
3984 | /* agsize = roundupSize/max_number_of_ag */ | ||
3985 | return (l2sz - L2MAXAG); | ||
3986 | } | ||
3987 | |||
3988 | |||
3989 | /* | ||
3990 | * NAME: dbMapFileSizeToMapSize() | ||
3991 | * | ||
3992 | * FUNCTION: compute number of blocks the block allocation map file | ||
3993 | * can cover from the map file size; | ||
3994 | * | ||
3995 | * RETURNS: Number of blocks which can be covered by this block map file; | ||
3996 | */ | ||
3997 | |||
3998 | /* | ||
3999 | * maximum number of map pages at each level including control pages | ||
4000 | */ | ||
4001 | #define MAXL0PAGES (1 + LPERCTL) | ||
4002 | #define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) | ||
4003 | #define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) | ||
4004 | |||
4005 | /* | ||
4006 | * convert number of map pages to the zero origin top dmapctl level | ||
4007 | */ | ||
4008 | #define BMAPPGTOLEV(npages) \ | ||
4009 | (((npages) <= 3 + MAXL0PAGES) ? 0 \ | ||
4010 | : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) | ||
4011 | |||
4012 | s64 dbMapFileSizeToMapSize(struct inode * ipbmap) | ||
4013 | { | ||
4014 | struct super_block *sb = ipbmap->i_sb; | ||
4015 | s64 nblocks; | ||
4016 | s64 npages, ndmaps; | ||
4017 | int level, i; | ||
4018 | int complete, factor; | ||
4019 | |||
4020 | nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; | ||
4021 | npages = nblocks >> JFS_SBI(sb)->l2nbperpage; | ||
4022 | level = BMAPPGTOLEV(npages); | ||
4023 | |||
4024 | /* At each level, accumulate the number of dmap pages covered by | ||
4025 | * the number of full child levels below it; | ||
4026 | * repeat for the last incomplete child level. | ||
4027 | */ | ||
4028 | ndmaps = 0; | ||
4029 | npages--; /* skip the first global control page */ | ||
4030 | /* skip higher level control pages above top level covered by map */ | ||
4031 | npages -= (2 - level); | ||
4032 | npages--; /* skip top level's control page */ | ||
4033 | for (i = level; i >= 0; i--) { | ||
4034 | factor = | ||
4035 | (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); | ||
4036 | complete = (u32) npages / factor; | ||
4037 | ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL | ||
4038 | : ((i == 1) ? LPERCTL : 1)); | ||
4039 | |||
4040 | /* pages in last/incomplete child */ | ||
4041 | npages = (u32) npages % factor; | ||
4042 | /* skip incomplete child's level control page */ | ||
4043 | npages--; | ||
4044 | } | ||
4045 | |||
4046 | /* convert the number of dmaps into the number of blocks | ||
4047 | * which can be covered by the dmaps; | ||
4048 | */ | ||
4049 | nblocks = ndmaps << L2BPERDMAP; | ||
4050 | |||
4051 | return (nblocks); | ||
4052 | } | ||
4053 | |||
4054 | |||
4055 | #ifdef _JFS_DEBUG_DMAP | ||
4056 | /* | ||
4057 | * DBinitmap() | ||
4058 | */ | ||
4059 | static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results) | ||
4060 | { | ||
4061 | int npages; | ||
4062 | u32 *dbmap, *d; | ||
4063 | int n; | ||
4064 | s64 lblkno, cur_block; | ||
4065 | struct dmap *dp; | ||
4066 | struct metapage *mp; | ||
4067 | |||
4068 | npages = size / 32768; | ||
4069 | npages += (size % 32768) ? 1 : 0; | ||
4070 | |||
4071 | dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap); | ||
4072 | if (dbmap == NULL) | ||
4073 | BUG(); /* Not robust since this is only unused debug code */ | ||
4074 | |||
4075 | for (n = 0, d = dbmap; n < npages; n++, d += 1024) | ||
4076 | bzero(d, 4096); | ||
4077 | |||
4078 | /* Need to initialize from disk map pages | ||
4079 | */ | ||
4080 | for (d = dbmap, cur_block = 0; cur_block < size; | ||
4081 | cur_block += BPERDMAP, d += LPERDMAP) { | ||
4082 | lblkno = BLKTODMAP(cur_block, | ||
4083 | JFS_SBI(ipbmap->i_sb)->bmap-> | ||
4084 | db_l2nbperpage); | ||
4085 | mp = read_metapage(ipbmap, lblkno, PSIZE, 0); | ||
4086 | if (mp == NULL) { | ||
4087 | jfs_error(ipbmap->i_sb, | ||
4088 | "DBinitmap: could not read disk map page"); | ||
4089 | continue; | ||
4090 | } | ||
4091 | dp = (struct dmap *) mp->data; | ||
4092 | |||
4093 | for (n = 0; n < LPERDMAP; n++) | ||
4094 | d[n] = le32_to_cpu(dp->wmap[n]); | ||
4095 | |||
4096 | release_metapage(mp); | ||
4097 | } | ||
4098 | |||
4099 | *results = dbmap; | ||
4100 | } | ||
4101 | |||
4102 | |||
4103 | /* | ||
4104 | * DBAlloc() | ||
4105 | */ | ||
4106 | void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) | ||
4107 | { | ||
4108 | int word, nb, bitno; | ||
4109 | u32 mask; | ||
4110 | |||
4111 | assert(blkno > 0 && blkno < mapsize); | ||
4112 | assert(nblocks > 0 && nblocks <= mapsize); | ||
4113 | |||
4114 | assert(blkno + nblocks <= mapsize); | ||
4115 | |||
4116 | dbmap += (blkno / 32); | ||
4117 | while (nblocks > 0) { | ||
4118 | bitno = blkno & (32 - 1); | ||
4119 | nb = min(nblocks, 32 - bitno); | ||
4120 | |||
4121 | mask = (0xffffffff << (32 - nb) >> bitno); | ||
4122 | assert((mask & *dbmap) == 0); | ||
4123 | *dbmap |= mask; | ||
4124 | |||
4125 | dbmap++; | ||
4126 | blkno += nb; | ||
4127 | nblocks -= nb; | ||
4128 | } | ||
4129 | } | ||
4130 | |||
4131 | |||
4132 | /* | ||
4133 | * DBFree() | ||
4134 | */ | ||
4135 | static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) | ||
4136 | { | ||
4137 | int word, nb, bitno; | ||
4138 | u32 mask; | ||
4139 | |||
4140 | assert(blkno > 0 && blkno < mapsize); | ||
4141 | assert(nblocks > 0 && nblocks <= mapsize); | ||
4142 | |||
4143 | assert(blkno + nblocks <= mapsize); | ||
4144 | |||
4145 | dbmap += (blkno / 32); | ||
4146 | while (nblocks > 0) { | ||
4147 | bitno = blkno & (32 - 1); | ||
4148 | nb = min(nblocks, 32 - bitno); | ||
4149 | |||
4150 | mask = (0xffffffff << (32 - nb) >> bitno); | ||
4151 | assert((mask & *dbmap) == mask); | ||
4152 | *dbmap &= ~mask; | ||
4153 | |||
4154 | dbmap++; | ||
4155 | blkno += nb; | ||
4156 | nblocks -= nb; | ||
4157 | } | ||
4158 | } | ||
4159 | |||
4160 | |||
4161 | /* | ||
4162 | * DBAllocCK() | ||
4163 | */ | ||
4164 | static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) | ||
4165 | { | ||
4166 | int word, nb, bitno; | ||
4167 | u32 mask; | ||
4168 | |||
4169 | assert(blkno > 0 && blkno < mapsize); | ||
4170 | assert(nblocks > 0 && nblocks <= mapsize); | ||
4171 | |||
4172 | assert(blkno + nblocks <= mapsize); | ||
4173 | |||
4174 | dbmap += (blkno / 32); | ||
4175 | while (nblocks > 0) { | ||
4176 | bitno = blkno & (32 - 1); | ||
4177 | nb = min(nblocks, 32 - bitno); | ||
4178 | |||
4179 | mask = (0xffffffff << (32 - nb) >> bitno); | ||
4180 | assert((mask & *dbmap) == mask); | ||
4181 | |||
4182 | dbmap++; | ||
4183 | blkno += nb; | ||
4184 | nblocks -= nb; | ||
4185 | } | ||
4186 | } | ||
4187 | |||
4188 | |||
4189 | /* | ||
4190 | * DBFreeCK() | ||
4191 | */ | ||
4192 | static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) | ||
4193 | { | ||
4194 | int word, nb, bitno; | ||
4195 | u32 mask; | ||
4196 | |||
4197 | assert(blkno > 0 && blkno < mapsize); | ||
4198 | assert(nblocks > 0 && nblocks <= mapsize); | ||
4199 | |||
4200 | assert(blkno + nblocks <= mapsize); | ||
4201 | |||
4202 | dbmap += (blkno / 32); | ||
4203 | while (nblocks > 0) { | ||
4204 | bitno = blkno & (32 - 1); | ||
4205 | nb = min(nblocks, 32 - bitno); | ||
4206 | |||
4207 | mask = (0xffffffff << (32 - nb) >> bitno); | ||
4208 | assert((mask & *dbmap) == 0); | ||
4209 | |||
4210 | dbmap++; | ||
4211 | blkno += nb; | ||
4212 | nblocks -= nb; | ||
4213 | } | ||
4214 | } | ||
4215 | |||
4216 | |||
4217 | /* | ||
4218 | * dbPrtMap() | ||
4219 | */ | ||
4220 | static void dbPrtMap(struct bmap * bmp) | ||
4221 | { | ||
4222 | printk(" mapsize: %d%d\n", bmp->db_mapsize); | ||
4223 | printk(" nfree: %d%d\n", bmp->db_nfree); | ||
4224 | printk(" numag: %d\n", bmp->db_numag); | ||
4225 | printk(" agsize: %d%d\n", bmp->db_agsize); | ||
4226 | printk(" agl2size: %d\n", bmp->db_agl2size); | ||
4227 | printk(" agwidth: %d\n", bmp->db_agwidth); | ||
4228 | printk(" agstart: %d\n", bmp->db_agstart); | ||
4229 | printk(" agheigth: %d\n", bmp->db_agheigth); | ||
4230 | printk(" aglevel: %d\n", bmp->db_aglevel); | ||
4231 | printk(" maxlevel: %d\n", bmp->db_maxlevel); | ||
4232 | printk(" maxag: %d\n", bmp->db_maxag); | ||
4233 | printk(" agpref: %d\n", bmp->db_agpref); | ||
4234 | printk(" l2nbppg: %d\n", bmp->db_l2nbperpage); | ||
4235 | } | ||
4236 | |||
4237 | |||
4238 | /* | ||
4239 | * dbPrtCtl() | ||
4240 | */ | ||
4241 | static void dbPrtCtl(struct dmapctl * dcp) | ||
4242 | { | ||
4243 | int i, j, n; | ||
4244 | |||
4245 | printk(" height: %08x\n", le32_to_cpu(dcp->height)); | ||
4246 | printk(" leafidx: %08x\n", le32_to_cpu(dcp->leafidx)); | ||
4247 | printk(" budmin: %08x\n", dcp->budmin); | ||
4248 | printk(" nleafs: %08x\n", le32_to_cpu(dcp->nleafs)); | ||
4249 | printk(" l2nleafs: %08x\n", le32_to_cpu(dcp->l2nleafs)); | ||
4250 | |||
4251 | printk("\n Tree:\n"); | ||
4252 | for (i = 0; i < CTLLEAFIND; i += 8) { | ||
4253 | n = min(8, CTLLEAFIND - i); | ||
4254 | |||
4255 | for (j = 0; j < n; j++) | ||
4256 | printf(" [%03x]: %02x", i + j, | ||
4257 | (char) dcp->stree[i + j]); | ||
4258 | printf("\n"); | ||
4259 | } | ||
4260 | |||
4261 | printk("\n Tree Leaves:\n"); | ||
4262 | for (i = 0; i < LPERCTL; i += 8) { | ||
4263 | n = min(8, LPERCTL - i); | ||
4264 | |||
4265 | for (j = 0; j < n; j++) | ||
4266 | printf(" [%03x]: %02x", | ||
4267 | i + j, | ||
4268 | (char) dcp->stree[i + j + CTLLEAFIND]); | ||
4269 | printf("\n"); | ||
4270 | } | ||
4271 | } | ||
4272 | #endif /* _JFS_DEBUG_DMAP */ | ||
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h new file mode 100644 index 000000000000..32e25884e7e8 --- /dev/null +++ b/fs/jfs/jfs_dmap.h | |||
@@ -0,0 +1,314 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_DMAP | ||
19 | #define _H_JFS_DMAP | ||
20 | |||
21 | #include "jfs_txnmgr.h" | ||
22 | |||
23 | #define BMAPVERSION 1 /* version number */ | ||
24 | #define TREESIZE (256+64+16+4+1) /* size of a dmap tree */ | ||
25 | #define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */ | ||
26 | #define LPERDMAP 256 /* num leaves per dmap tree */ | ||
27 | #define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */ | ||
28 | #define DBWORD 32 /* # of blks covered by a map word */ | ||
29 | #define L2DBWORD 5 /* l2 # of blks covered by a mword */ | ||
30 | #define BUDMIN L2DBWORD /* max free string in a map word */ | ||
31 | #define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */ | ||
32 | #define L2BPERDMAP 13 /* l2 num of blks per dmap */ | ||
33 | #define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */ | ||
34 | #define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */ | ||
35 | #define LPERCTL 1024 /* num of leaves per dmapctl tree */ | ||
36 | #define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */ | ||
37 | #define ROOT 0 /* index of the root of a tree */ | ||
38 | #define NOFREE ((s8) -1) /* no blocks free */ | ||
39 | #define MAXAG 128 /* max number of allocation groups */ | ||
40 | #define L2MAXAG 7 /* l2 max num of AG */ | ||
41 | #define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */ | ||
42 | #define BMAPBLKNO 0 /* lblkno of bmap within the map */ | ||
43 | |||
44 | /* | ||
45 | * maximum l2 number of disk blocks at the various dmapctl levels. | ||
46 | */ | ||
47 | #define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL) | ||
48 | #define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL) | ||
49 | #define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL) | ||
50 | |||
51 | /* | ||
52 | * maximum number of disk blocks at the various dmapctl levels. | ||
53 | */ | ||
54 | #define MAXL0SIZE ((s64)1 << L2MAXL0SIZE) | ||
55 | #define MAXL1SIZE ((s64)1 << L2MAXL1SIZE) | ||
56 | #define MAXL2SIZE ((s64)1 << L2MAXL2SIZE) | ||
57 | |||
58 | #define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */ | ||
59 | |||
/*
 * return the largest of the four consecutive (lower level) tree
 * nodes starting at cp, i.e. the maximum free string they describe.
 */
static __inline signed char TREEMAX(signed char *cp)
{
	signed char best = cp[0];
	int i;

	for (i = 1; i < 4; i++) {
		if (cp[i] > best)
			best = cp[i];
	}

	return best;
}
73 | |||
/*
 * convert disk block number to the logical block number of the dmap
 * describing the disk block.  s is the log2(number of logical blocks per page)
 *
 * The calculation figures out how many logical pages are in front of the dmap.
 *	- the number of dmaps preceding it
 *	- the number of L0 pages preceding its L0 page
 *	- the number of L1 pages preceding its L1 page
 *	- 3 is added to account for the L2, L1, and L0 page for this dmap
 *	- 1 is added to account for the control page of the map.
 */
#define BLKTODMAP(b,s)    \
        ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s))

/*
 * convert disk block number to the logical block number of the LEVEL 0
 * dmapctl describing the disk block.  s is the log2(number of logical blocks
 * per page)
 *
 * The calculation figures out how many logical pages are in front of the L0.
 *	- the number of dmap pages preceding it
 *	- the number of L0 pages preceding it
 *	- the number of L1 pages preceding its L1 page
 *	- 2 is added to account for the L2, and L1 page for this L0
 *	- 1 is added to account for the control page of the map.
 */
#define BLKTOL0(b,s)      \
        (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s))

/*
 * convert disk block number to the logical block number of the LEVEL 1
 * dmapctl describing the disk block.  s is the log2(number of logical blocks
 * per page)
 *
 * The calculation figures out how many logical pages are in front of the L1.
 *	- the number of dmap pages preceding it
 *	- the number of L0 pages preceding it
 *	- the number of L1 pages preceding it
 *	- 1 is added to account for the L2 page
 *	- 1 is added to account for the control page of the map.
 */
#define BLKTOL1(b,s)      \
     (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s))

/*
 * convert disk block number to the logical block number of the dmapctl
 * at the specified level which describes the disk block.
 *
 * The L2 dmapctl is the page right after the control page of the map
 * (page 1).  Like the other conversions, its page index must be shifted
 * by s to yield a logical block number; a bare 1 is only correct when
 * s == 0.
 */
#define BLKTOCTL(b,s,l)   \
        (((l) == 2) ? (1 << (s)) : \
	 ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
124 | |||
125 | /* | ||
126 | * convert aggregate map size to the zero origin dmapctl level of the | ||
127 | * top dmapctl. | ||
128 | */ | ||
129 | #define BMAPSZTOLEV(size) \ | ||
130 | (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2) | ||
131 | |||
132 | /* convert disk block number to allocation group number. | ||
133 | */ | ||
134 | #define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size)) | ||
135 | |||
136 | /* convert allocation group number to starting disk block | ||
137 | * number. | ||
138 | */ | ||
139 | #define AGTOBLK(a,ip) \ | ||
140 | ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size)) | ||
141 | |||
142 | /* | ||
143 | * dmap summary tree | ||
144 | * | ||
145 | * dmaptree must be consistent with dmapctl. | ||
146 | */ | ||
147 | struct dmaptree { | ||
148 | __le32 nleafs; /* 4: number of tree leafs */ | ||
149 | __le32 l2nleafs; /* 4: l2 number of tree leafs */ | ||
150 | __le32 leafidx; /* 4: index of first tree leaf */ | ||
151 | __le32 height; /* 4: height of the tree */ | ||
152 | s8 budmin; /* 1: min l2 tree leaf value to combine */ | ||
153 | s8 stree[TREESIZE]; /* TREESIZE: tree */ | ||
154 | u8 pad[2]; /* 2: pad to word boundary */ | ||
155 | }; /* - 360 - */ | ||
156 | |||
157 | /* | ||
158 | * dmap page per 8K blocks bitmap | ||
159 | */ | ||
160 | struct dmap { | ||
161 | __le32 nblocks; /* 4: num blks covered by this dmap */ | ||
162 | __le32 nfree; /* 4: num of free blks in this dmap */ | ||
163 | __le64 start; /* 8: starting blkno for this dmap */ | ||
164 | struct dmaptree tree; /* 360: dmap tree */ | ||
165 | u8 pad[1672]; /* 1672: pad to 2048 bytes */ | ||
166 | __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ | ||
167 | __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ | ||
168 | }; /* - 4096 - */ | ||
169 | |||
170 | /* | ||
171 | * disk map control page per level. | ||
172 | * | ||
173 | * dmapctl must be consistent with dmaptree. | ||
174 | */ | ||
175 | struct dmapctl { | ||
176 | __le32 nleafs; /* 4: number of tree leafs */ | ||
177 | __le32 l2nleafs; /* 4: l2 number of tree leafs */ | ||
178 | __le32 leafidx; /* 4: index of the first tree leaf */ | ||
179 | __le32 height; /* 4: height of tree */ | ||
180 | s8 budmin; /* 1: minimum l2 tree leaf value */ | ||
181 | s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ | ||
182 | u8 pad[2714]; /* 2714: pad to 4096 */ | ||
183 | }; /* - 4096 - */ | ||
184 | |||
185 | /* | ||
186 | * common definition for dmaptree within dmap and dmapctl | ||
187 | */ | ||
188 | typedef union dmtree { | ||
189 | struct dmaptree t1; | ||
190 | struct dmapctl t2; | ||
191 | } dmtree_t; | ||
192 | |||
193 | /* macros for accessing fields within dmtree */ | ||
194 | #define dmt_nleafs t1.nleafs | ||
195 | #define dmt_l2nleafs t1.l2nleafs | ||
196 | #define dmt_leafidx t1.leafidx | ||
197 | #define dmt_height t1.height | ||
198 | #define dmt_budmin t1.budmin | ||
199 | #define dmt_stree t1.stree | ||
200 | |||
201 | /* | ||
202 | * on-disk aggregate disk allocation map descriptor: little-endian fields padded to one 4096-byte page. | ||
203 | */ | ||
204 | struct dbmap_disk { | ||
205 | __le64 dn_mapsize; /* 8: number of blocks in aggregate */ | ||
206 | __le64 dn_nfree; /* 8: num free blks in aggregate map */ | ||
207 | __le32 dn_l2nbperpage; /* 4: l2 number of blks per page */ | ||
208 | __le32 dn_numag; /* 4: total number of ags */ | ||
209 | __le32 dn_maxlevel; /* 4: presumably the top dmapctl level; was "number of active ags" - TODO confirm */ | ||
210 | __le32 dn_maxag; /* 4: max active alloc group number */ | ||
211 | __le32 dn_agpref; /* 4: preferred alloc group (hint) */ | ||
212 | __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ | ||
213 | __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ | ||
214 | __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ | ||
215 | __le32 dn_agstart; /* 4: start tree index at AG height */ | ||
216 | __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ | ||
217 | __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ | ||
218 | __le64 dn_agsize; /* 8: num of blks per alloc group */ | ||
219 | s8 dn_maxfreebud; /* 1: max free buddy system */ | ||
220 | u8 pad[3007]; /* 3007: pad to 4096 */ | ||
221 | }; /* - 4096 - */ | ||
222 | |||
223 | struct dbmap { | ||
224 | s64 dn_mapsize; /* number of blocks in aggregate */ | ||
225 | s64 dn_nfree; /* num free blks in aggregate map */ | ||
226 | int dn_l2nbperpage; /* l2 number of blks per page */ | ||
227 | int dn_numag; /* total number of ags */ | ||
228 | int dn_maxlevel; /* presumably the top dmapctl level; was "number of active ags" - TODO confirm */ | ||
229 | int dn_maxag; /* max active alloc group number */ | ||
230 | int dn_agpref; /* preferred alloc group (hint) */ | ||
231 | int dn_aglevel; /* dmapctl level holding the AG */ | ||
232 | int dn_agheigth; /* height in dmapctl of the AG */ | ||
233 | int dn_agwidth; /* width in dmapctl of the AG */ | ||
234 | int dn_agstart; /* start tree index at AG height */ | ||
235 | int dn_agl2size; /* l2 num of blks per alloc group */ | ||
236 | s64 dn_agfree[MAXAG]; /* per AG free count */ | ||
237 | s64 dn_agsize; /* num of blks per alloc group */ | ||
238 | signed char dn_maxfreebud; /* max free buddy system */ | ||
239 | }; /* host-order counterpart of struct dbmap_disk; has no pad field, so it is not 4096 bytes */ | ||
240 | /* | ||
241 | * in-memory aggregate disk allocation map descriptor: embeds a struct dbmap plus runtime state (map inode, lock, per-AG activity counts). | ||
242 | */ | ||
243 | struct bmap { | ||
244 | struct dbmap db_bmap; /* on-disk aggregate map descriptor */ | ||
245 | struct inode *db_ipbmap; /* ptr to aggregate map incore inode */ | ||
246 | struct semaphore db_bmaplock; /* aggregate map lock */ | ||
247 | atomic_t db_active[MAXAG]; /* count of active, open files in AG */ | ||
248 | u32 *db_DBmap; | ||
249 | }; | ||
250 | |||
251 | /* macros for accessing fields within in-memory aggregate map descriptor (each db_X expands to db_bmap.dn_X of a struct bmap) */ | ||
252 | #define db_mapsize db_bmap.dn_mapsize | ||
253 | #define db_nfree db_bmap.dn_nfree | ||
254 | #define db_agfree db_bmap.dn_agfree | ||
255 | #define db_agsize db_bmap.dn_agsize | ||
256 | #define db_agl2size db_bmap.dn_agl2size | ||
257 | #define db_agwidth db_bmap.dn_agwidth | ||
258 | #define db_agheigth db_bmap.dn_agheigth | ||
259 | #define db_agstart db_bmap.dn_agstart | ||
260 | #define db_numag db_bmap.dn_numag | ||
261 | #define db_maxlevel db_bmap.dn_maxlevel | ||
262 | #define db_aglevel db_bmap.dn_aglevel | ||
263 | #define db_agpref db_bmap.dn_agpref | ||
264 | #define db_maxag db_bmap.dn_maxag | ||
265 | #define db_maxfreebud db_bmap.dn_maxfreebud | ||
266 | #define db_l2nbperpage db_bmap.dn_l2nbperpage | ||
267 | |||
268 | /* | ||
269 | * macros for various conversions needed by the allocators. | ||
270 | * blkstol2(), cntlz(), and cnttz() are operating system dependent functions. | ||
271 | */ | ||
272 | /* convert number of blocks to log2 number of blocks, rounding up to | ||
273 | * the next log2 value if blocks is not a l2 multiple. | ||
274 | */ | ||
275 | #define BLKSTOL2(d) (blkstol2(d)) | ||
276 | |||
277 | /* convert number of leafs to log2 leaf value */ | ||
278 | #define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN) | ||
279 | |||
280 | /* convert leaf index to log2 leaf value: uses m when the index n is 0, otherwise cnttz(n); b is added in both cases */ | ||
281 | #define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b)) | ||
282 | |||
283 | /* convert a block number to a dmap control leaf index: keeps the low (m + L2LPERCTL) bits of b, then shifts right by m */ | ||
284 | #define BLKTOCTLLEAF(b,m) \ | ||
285 | (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m)) | ||
286 | |||
287 | /* convert log2 leaf value to buddy size: 1 << (s - m), evaluated in int */ | ||
288 | #define BUDSIZE(s,m) (1 << ((s) - (m))) | ||
289 | |||
290 | /* | ||
291 | * external references. | ||
292 | */ | ||
293 | extern int dbMount(struct inode *ipbmap); | ||
294 | |||
295 | extern int dbUnmount(struct inode *ipbmap, int mounterror); | ||
296 | |||
297 | extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks); | ||
298 | |||
299 | extern int dbUpdatePMap(struct inode *ipbmap, | ||
300 | int free, s64 blkno, s64 nblocks, struct tblock * tblk); | ||
301 | |||
302 | extern int dbNextAG(struct inode *ipbmap); | ||
303 | |||
304 | extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results); | ||
305 | |||
306 | extern int dbReAlloc(struct inode *ipbmap, | ||
307 | s64 blkno, s64 nblocks, s64 addnblocks, s64 * results); | ||
308 | |||
309 | extern int dbSync(struct inode *ipbmap); | ||
310 | extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks); | ||
311 | extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); | ||
312 | extern void dbFinalizeBmap(struct inode *ipbmap); | ||
313 | extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); | ||
314 | #endif /* _H_JFS_DMAP */ | ||
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c new file mode 100644 index 000000000000..e357890adfb2 --- /dev/null +++ b/fs/jfs/jfs_dtree.c | |||
@@ -0,0 +1,4752 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * jfs_dtree.c: directory B+-tree manager | ||
21 | * | ||
22 | * B+-tree with variable length key directory: | ||
23 | * | ||
24 | * each directory page is structured as an array of 32-byte | ||
25 | * directory entry slots initialized as a freelist | ||
26 | * to avoid search/compaction of free space at insertion. | ||
27 | * when an entry is inserted, a number of slots are allocated | ||
28 | * from the freelist as required to store variable length data | ||
29 | * of the entry; when the entry is deleted, slots of the entry | ||
30 | * are returned to freelist. | ||
31 | * | ||
32 | * leaf entry stores full name as key and file serial number | ||
33 | * (aka inode number) as data. | ||
34 | * internal/router entry stores suffix compressed name | ||
35 | * as key and simple extent descriptor as data. | ||
36 | * | ||
37 | * each directory page maintains a sorted entry index table | ||
38 | * which stores the start slot index of sorted entries | ||
39 | * to allow binary search on the table. | ||
40 | * | ||
41 | * directory starts as a root/leaf page in on-disk inode | ||
42 | * inline data area. | ||
43 | * when it becomes full, it starts a leaf of an external extent | ||
44 | * of length of 1 block. each time the first leaf becomes full, | ||
45 | * it is extended rather than split (its size is doubled), | ||
46 | * until its length becomes 4 KBytes, from then the extent is split | ||
47 | * with new 4 Kbyte extent when it becomes full | ||
48 | * to reduce external fragmentation of small directories. | ||
49 | * | ||
50 | * blah, blah, blah, for linear scan of directory in pieces by | ||
51 | * readdir(). | ||
52 | * | ||
53 | * | ||
54 | * case-insensitive directory file system | ||
55 | * | ||
56 | * names are stored in case-sensitive way in leaf entry. | ||
57 | * but stored, searched and compared in case-insensitive (uppercase) order | ||
58 | * (i.e., both search key and entry key are folded for search/compare): | ||
59 | * (note that case-sensitive order is BROKEN in storage, e.g., | ||
60 | * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad | ||
61 | * | ||
62 | * entries which fold to the same key make up an equivalence class | ||
63 | * whose members are stored as contiguous cluster (may cross page boundary) | ||
64 | * but whose order is arbitrary and acts as duplicate, e.g., | ||
65 | * abc, Abc, aBc, abC) | ||
66 | * | ||
67 | * once match is found at leaf, requires scan forward/backward | ||
68 | * either for, in case-insensitive search, duplicate | ||
69 | * or for, in case-sensitive search, for exact match | ||
70 | * | ||
71 | * router entry must be created/stored in case-insensitive way | ||
72 | * in internal entry: | ||
73 | * (right most key of left page and left most key of right page | ||
74 | * are folded, and its suffix compression is propagated as router | ||
75 | * key in parent) | ||
76 | * (e.g., if split occurs <abc> and <aBd>, <ABD> rather than <aB> | ||
77 | * should be made the router key for the split) | ||
78 | * | ||
79 | * case-insensitive search: | ||
80 | * | ||
81 | * fold search key; | ||
82 | * | ||
83 | * case-insensitive search of B-tree: | ||
84 | * for internal entry, router key is already folded; | ||
85 | * for leaf entry, fold the entry key before comparison. | ||
86 | * | ||
87 | * if (leaf entry case-insensitive match found) | ||
88 | * if (next entry satisfies case-insensitive match) | ||
89 | * return EDUPLICATE; | ||
90 | * if (prev entry satisfies case-insensitive match) | ||
91 | * return EDUPLICATE; | ||
92 | * return match; | ||
93 | * else | ||
94 | * return no match; | ||
95 | * | ||
96 | * serialization: | ||
97 | * target directory inode lock is being held on entry/exit | ||
98 | * of all main directory service routines. | ||
99 | * | ||
100 | * log based recovery: | ||
101 | */ | ||
102 | |||
103 | #include <linux/fs.h> | ||
104 | #include <linux/quotaops.h> | ||
105 | #include "jfs_incore.h" | ||
106 | #include "jfs_superblock.h" | ||
107 | #include "jfs_filsys.h" | ||
108 | #include "jfs_metapage.h" | ||
109 | #include "jfs_dmap.h" | ||
110 | #include "jfs_unicode.h" | ||
111 | #include "jfs_debug.h" | ||
112 | |||
113 | /* dtree split parameter: describes the entry to insert when the target page lacks room */ | ||
114 | struct dtsplit { | ||
115 | struct metapage *mp; /* metapage of the page to split/extend */ | ||
116 | s16 index; /* index at which the new entry is to be inserted */ | ||
117 | s16 nslot; /* number of slots the new entry needs */ | ||
118 | struct component_name *key; /* name (key) of the new entry */ | ||
119 | ddata_t *data; /* data of the new entry */ | ||
120 | struct pxdlist *pxdlist; | ||
121 | }; | ||
122 | |||
123 | #define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) | ||
124 | |||
125 | /* get page buffer for specified block address; if the header fails the nextindex/maxslot sanity check the page is released, jfs_error() is called, MP is set to NULL and RC to -EIO */ | ||
126 | #define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ | ||
127 | {\ | ||
128 | BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ | ||
129 | if (!(RC))\ | ||
130 | {\ | ||
131 | if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ | ||
132 | ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ | ||
133 | {\ | ||
134 | BT_PUTPAGE(MP);\ | ||
135 | jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ | ||
136 | MP = NULL;\ | ||
137 | RC = -EIO;\ | ||
138 | }\ | ||
139 | }\ | ||
140 | } | ||
141 | |||
142 | /* for consistency: DT_PUTPAGE is a plain alias of BT_PUTPAGE */ | ||
143 | #define DT_PUTPAGE(MP) BT_PUTPAGE(MP) | ||
144 | |||
145 | #define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ | ||
146 | BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) | ||
147 | |||
148 | /* | ||
149 | * forward references | ||
150 | */ | ||
151 | static int dtSplitUp(tid_t tid, struct inode *ip, | ||
152 | struct dtsplit * split, struct btstack * btstack); | ||
153 | |||
154 | static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, | ||
155 | struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); | ||
156 | |||
157 | static int dtExtendPage(tid_t tid, struct inode *ip, | ||
158 | struct dtsplit * split, struct btstack * btstack); | ||
159 | |||
160 | static int dtSplitRoot(tid_t tid, struct inode *ip, | ||
161 | struct dtsplit * split, struct metapage ** rmpp); | ||
162 | |||
163 | static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, | ||
164 | dtpage_t * fp, struct btstack * btstack); | ||
165 | |||
166 | static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); | ||
167 | |||
168 | static int dtReadFirst(struct inode *ip, struct btstack * btstack); | ||
169 | |||
170 | static int dtReadNext(struct inode *ip, | ||
171 | loff_t * offset, struct btstack * btstack); | ||
172 | |||
173 | static int dtCompare(struct component_name * key, dtpage_t * p, int si); | ||
174 | |||
175 | static int ciCompare(struct component_name * key, dtpage_t * p, int si, | ||
176 | int flag); | ||
177 | |||
178 | static void dtGetKey(dtpage_t * p, int i, struct component_name * key, | ||
179 | int flag); | ||
180 | |||
181 | static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, | ||
182 | int ri, struct component_name * key, int flag); | ||
183 | |||
184 | static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, | ||
185 | ddata_t * data, struct dt_lock **); | ||
186 | |||
187 | static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, | ||
188 | struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, | ||
189 | int do_index); | ||
190 | |||
191 | static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); | ||
192 | |||
193 | static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); | ||
194 | |||
195 | static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); | ||
196 | |||
197 | #define ciToUpper(c) UniStrupr((c)->name) /* apply UniStrupr() to the name of component_name c */ | ||
198 | |||
199 | /* | ||
200 | * read_index_page() | ||
201 | * | ||
202 | * Reads a page of a directory's index table. | ||
203 | * Having metadata mapped into the directory inode's address space | ||
204 | * presents a multitude of problems. We avoid this by mapping to | ||
205 | * the absolute address space outside of the *_metapage routines | ||
206 | */ | ||
207 | static struct metapage *read_index_page(struct inode *inode, s64 blkno) | ||
208 | { | ||
209 | int rc; | ||
210 | s64 xaddr; | ||
211 | int xflag; | ||
212 | s32 xlen; | ||
213 | |||
214 | rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); | ||
215 | if (rc || (xlen == 0)) | ||
216 | return NULL; | ||
217 | |||
218 | return read_metapage(inode, xaddr, PSIZE, 1); | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * get_index_page() | ||
223 | * | ||
224 | * Same as get_index_page(), but get's a new page without reading | ||
225 | */ | ||
226 | static struct metapage *get_index_page(struct inode *inode, s64 blkno) | ||
227 | { | ||
228 | int rc; | ||
229 | s64 xaddr; | ||
230 | int xflag; | ||
231 | s32 xlen; | ||
232 | |||
233 | rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); | ||
234 | if (rc || (xlen == 0)) | ||
235 | return NULL; | ||
236 | |||
237 | return get_metapage(inode, xaddr, PSIZE, 1); | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * find_index() | ||
242 | * | ||
243 | * Returns dtree page containing directory table entry for specified | ||
244 | * index and pointer to its entry. | ||
245 | * | ||
246 | * mp must be released by caller. | ||
247 | */ | ||
248 | static struct dir_table_slot *find_index(struct inode *ip, u32 index, | ||
249 | struct metapage ** mp, s64 *lblock) | ||
250 | { | ||
251 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
252 | s64 blkno; | ||
253 | s64 offset; | ||
254 | int page_offset; | ||
255 | struct dir_table_slot *slot; | ||
256 | static int maxWarnings = 10; | ||
257 | |||
258 | if (index < 2) { | ||
259 | if (maxWarnings) { | ||
260 | jfs_warn("find_entry called with index = %d", index); | ||
261 | maxWarnings--; | ||
262 | } | ||
263 | return NULL; | ||
264 | } | ||
265 | |||
266 | if (index >= jfs_ip->next_index) { | ||
267 | jfs_warn("find_entry called with index >= next_index"); | ||
268 | return NULL; | ||
269 | } | ||
270 | |||
271 | if (jfs_dirtable_inline(ip)) { | ||
272 | /* | ||
273 | * Inline directory table | ||
274 | */ | ||
275 | *mp = NULL; | ||
276 | slot = &jfs_ip->i_dirtable[index - 2]; | ||
277 | } else { | ||
278 | offset = (index - 2) * sizeof(struct dir_table_slot); | ||
279 | page_offset = offset & (PSIZE - 1); | ||
280 | blkno = ((offset + 1) >> L2PSIZE) << | ||
281 | JFS_SBI(ip->i_sb)->l2nbperpage; | ||
282 | |||
283 | if (*mp && (*lblock != blkno)) { | ||
284 | release_metapage(*mp); | ||
285 | *mp = NULL; | ||
286 | } | ||
287 | if (*mp == 0) { | ||
288 | *lblock = blkno; | ||
289 | *mp = read_index_page(ip, blkno); | ||
290 | } | ||
291 | if (*mp == 0) { | ||
292 | jfs_err("free_index: error reading directory table"); | ||
293 | return NULL; | ||
294 | } | ||
295 | |||
296 | slot = | ||
297 | (struct dir_table_slot *) ((char *) (*mp)->data + | ||
298 | page_offset); | ||
299 | } | ||
300 | return slot; | ||
301 | } | ||
302 | |||
303 | static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, | ||
304 | u32 index) | ||
305 | { | ||
306 | struct tlock *tlck; | ||
307 | struct linelock *llck; | ||
308 | struct lv *lv; | ||
309 | |||
310 | tlck = txLock(tid, ip, mp, tlckDATA); | ||
311 | llck = (struct linelock *) tlck->lock; | ||
312 | |||
313 | if (llck->index >= llck->maxcnt) | ||
314 | llck = txLinelock(llck); | ||
315 | lv = &llck->lv[llck->index]; | ||
316 | |||
317 | /* | ||
318 | * Linelock slot size is twice the size of directory table | ||
319 | * slot size. 512 entries per page. | ||
320 | */ | ||
321 | lv->offset = ((index - 2) & 511) >> 1; | ||
322 | lv->length = 1; | ||
323 | llck->index++; | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * add_index() | ||
328 | * | ||
329 | * Adds an entry to the directory index table. This is used to provide | ||
330 | * each directory entry with a persistent index in which to resume | ||
331 | * directory traversals | ||
332 | */ | ||
333 | static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) | ||
334 | { | ||
335 | struct super_block *sb = ip->i_sb; | ||
336 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
337 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
338 | u64 blkno; | ||
339 | struct dir_table_slot *dirtab_slot; | ||
340 | u32 index; | ||
341 | struct linelock *llck; | ||
342 | struct lv *lv; | ||
343 | struct metapage *mp; | ||
344 | s64 offset; | ||
345 | uint page_offset; | ||
346 | struct tlock *tlck; | ||
347 | s64 xaddr; | ||
348 | |||
349 | ASSERT(DO_INDEX(ip)); | ||
350 | |||
351 | if (jfs_ip->next_index < 2) { | ||
352 | jfs_warn("add_index: next_index = %d. Resetting!", | ||
353 | jfs_ip->next_index); | ||
354 | jfs_ip->next_index = 2; | ||
355 | } | ||
356 | |||
357 | index = jfs_ip->next_index++; | ||
358 | |||
359 | if (index <= MAX_INLINE_DIRTABLE_ENTRY) { | ||
360 | /* | ||
361 | * i_size reflects size of index table, or 8 bytes per entry. | ||
362 | */ | ||
363 | ip->i_size = (loff_t) (index - 1) << 3; | ||
364 | |||
365 | /* | ||
366 | * dir table fits inline within inode | ||
367 | */ | ||
368 | dirtab_slot = &jfs_ip->i_dirtable[index-2]; | ||
369 | dirtab_slot->flag = DIR_INDEX_VALID; | ||
370 | dirtab_slot->slot = slot; | ||
371 | DTSaddress(dirtab_slot, bn); | ||
372 | |||
373 | set_cflag(COMMIT_Dirtable, ip); | ||
374 | |||
375 | return index; | ||
376 | } | ||
377 | if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { | ||
378 | struct dir_table_slot temp_table[12]; | ||
379 | |||
380 | /* | ||
381 | * It's time to move the inline table to an external | ||
382 | * page and begin to build the xtree | ||
383 | */ | ||
384 | if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage) || | ||
385 | dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) | ||
386 | goto clean_up; /* No space */ | ||
387 | |||
388 | /* | ||
389 | * Save the table, we're going to overwrite it with the | ||
390 | * xtree root | ||
391 | */ | ||
392 | memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); | ||
393 | |||
394 | /* | ||
395 | * Initialize empty x-tree | ||
396 | */ | ||
397 | xtInitRoot(tid, ip); | ||
398 | |||
399 | /* | ||
400 | * Allocate the first block & add it to the xtree | ||
401 | */ | ||
402 | if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { | ||
403 | /* This really shouldn't fail */ | ||
404 | jfs_warn("add_index: xtInsert failed!"); | ||
405 | memcpy(&jfs_ip->i_dirtable, temp_table, | ||
406 | sizeof (temp_table)); | ||
407 | goto clean_up; | ||
408 | } | ||
409 | ip->i_size = PSIZE; | ||
410 | |||
411 | if ((mp = get_index_page(ip, 0)) == 0) { | ||
412 | jfs_err("add_index: get_metapage failed!"); | ||
413 | xtTruncate(tid, ip, 0, COMMIT_PWMAP); | ||
414 | memcpy(&jfs_ip->i_dirtable, temp_table, | ||
415 | sizeof (temp_table)); | ||
416 | goto clean_up; | ||
417 | } | ||
418 | tlck = txLock(tid, ip, mp, tlckDATA); | ||
419 | llck = (struct linelock *) & tlck->lock; | ||
420 | ASSERT(llck->index == 0); | ||
421 | lv = &llck->lv[0]; | ||
422 | |||
423 | lv->offset = 0; | ||
424 | lv->length = 6; /* tlckDATA slot size is 16 bytes */ | ||
425 | llck->index++; | ||
426 | |||
427 | memcpy(mp->data, temp_table, sizeof(temp_table)); | ||
428 | |||
429 | mark_metapage_dirty(mp); | ||
430 | release_metapage(mp); | ||
431 | |||
432 | /* | ||
433 | * Logging is now directed by xtree tlocks | ||
434 | */ | ||
435 | clear_cflag(COMMIT_Dirtable, ip); | ||
436 | } | ||
437 | |||
438 | offset = (index - 2) * sizeof(struct dir_table_slot); | ||
439 | page_offset = offset & (PSIZE - 1); | ||
440 | blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; | ||
441 | if (page_offset == 0) { | ||
442 | /* | ||
443 | * This will be the beginning of a new page | ||
444 | */ | ||
445 | xaddr = 0; | ||
446 | if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { | ||
447 | jfs_warn("add_index: xtInsert failed!"); | ||
448 | goto clean_up; | ||
449 | } | ||
450 | ip->i_size += PSIZE; | ||
451 | |||
452 | if ((mp = get_index_page(ip, blkno))) | ||
453 | memset(mp->data, 0, PSIZE); /* Just looks better */ | ||
454 | else | ||
455 | xtTruncate(tid, ip, offset, COMMIT_PWMAP); | ||
456 | } else | ||
457 | mp = read_index_page(ip, blkno); | ||
458 | |||
459 | if (mp == 0) { | ||
460 | jfs_err("add_index: get/read_metapage failed!"); | ||
461 | goto clean_up; | ||
462 | } | ||
463 | |||
464 | lock_index(tid, ip, mp, index); | ||
465 | |||
466 | dirtab_slot = | ||
467 | (struct dir_table_slot *) ((char *) mp->data + page_offset); | ||
468 | dirtab_slot->flag = DIR_INDEX_VALID; | ||
469 | dirtab_slot->slot = slot; | ||
470 | DTSaddress(dirtab_slot, bn); | ||
471 | |||
472 | mark_metapage_dirty(mp); | ||
473 | release_metapage(mp); | ||
474 | |||
475 | return index; | ||
476 | |||
477 | clean_up: | ||
478 | |||
479 | jfs_ip->next_index--; | ||
480 | |||
481 | return 0; | ||
482 | } | ||
483 | |||
484 | /* | ||
485 | * free_index() | ||
486 | * | ||
487 | * Marks an entry to the directory index table as free. | ||
488 | */ | ||
489 | static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) | ||
490 | { | ||
491 | struct dir_table_slot *dirtab_slot; | ||
492 | s64 lblock; | ||
493 | struct metapage *mp = NULL; | ||
494 | |||
495 | dirtab_slot = find_index(ip, index, &mp, &lblock); | ||
496 | |||
497 | if (dirtab_slot == 0) | ||
498 | return; | ||
499 | |||
500 | dirtab_slot->flag = DIR_INDEX_FREE; | ||
501 | dirtab_slot->slot = dirtab_slot->addr1 = 0; | ||
502 | dirtab_slot->addr2 = cpu_to_le32(next); | ||
503 | |||
504 | if (mp) { | ||
505 | lock_index(tid, ip, mp, index); | ||
506 | mark_metapage_dirty(mp); | ||
507 | release_metapage(mp); | ||
508 | } else | ||
509 | set_cflag(COMMIT_Dirtable, ip); | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * modify_index() | ||
514 | * | ||
515 | * Changes an entry in the directory index table | ||
516 | */ | ||
517 | static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, | ||
518 | int slot, struct metapage ** mp, u64 *lblock) | ||
519 | { | ||
520 | struct dir_table_slot *dirtab_slot; | ||
521 | |||
522 | dirtab_slot = find_index(ip, index, mp, lblock); | ||
523 | |||
524 | if (dirtab_slot == 0) | ||
525 | return; | ||
526 | |||
527 | DTSaddress(dirtab_slot, bn); | ||
528 | dirtab_slot->slot = slot; | ||
529 | |||
530 | if (*mp) { | ||
531 | lock_index(tid, ip, *mp, index); | ||
532 | mark_metapage_dirty(*mp); | ||
533 | } else | ||
534 | set_cflag(COMMIT_Dirtable, ip); | ||
535 | } | ||
536 | |||
537 | /* | ||
538 | * read_index() | ||
539 | * | ||
540 | * reads a directory table slot | ||
541 | */ | ||
542 | static int read_index(struct inode *ip, u32 index, | ||
543 | struct dir_table_slot * dirtab_slot) | ||
544 | { | ||
545 | s64 lblock; | ||
546 | struct metapage *mp = NULL; | ||
547 | struct dir_table_slot *slot; | ||
548 | |||
549 | slot = find_index(ip, index, &mp, &lblock); | ||
550 | if (slot == 0) { | ||
551 | return -EIO; | ||
552 | } | ||
553 | |||
554 | memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); | ||
555 | |||
556 | if (mp) | ||
557 | release_metapage(mp); | ||
558 | |||
559 | return 0; | ||
560 | } | ||
561 | |||
562 | /* | ||
563 | * dtSearch() | ||
564 | * | ||
565 | * function: | ||
566 | * Search for the entry with specified key | ||
567 | * | ||
568 | * parameter: ip - directory inode; key - name searched for; data - in: inode number expected by JFS_REMOVE/JFS_RENAME, out: inode number found (0 on a leaf miss); | ||
569 | * btstack - traversal stack, its top frame receives (bn, index, mp) of the leaf; flag - JFS_LOOKUP, JFS_CREATE, JFS_REMOVE, JFS_RENAME or JFS_FINDDIR | ||
570 | * return: 0 - search result on stack, leaf page pinned (for a JFS_LOOKUP hit the page is released and only *data is returned); | ||
571 | * errno - -ENOMEM, -EEXIST (JFS_CREATE hit), -ESTALE (JFS_REMOVE/JFS_RENAME inode mismatch), -ENOENT (miss), -EIO (stack overrun) or the error set by DT_GETPAGE | ||
572 | */ | ||
573 | int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, | ||
574 | struct btstack * btstack, int flag) | ||
575 | { | ||
576 | int rc = 0; | ||
577 | int cmp = 1; /* init for empty page */ | ||
578 | s64 bn; | ||
579 | struct metapage *mp; | ||
580 | dtpage_t *p; | ||
581 | s8 *stbl; | ||
582 | int base, index, lim; | ||
583 | struct btframe *btsp; | ||
584 | pxd_t *pxd; | ||
585 | int psize = 288; /* initial in-line directory */ | ||
586 | ino_t inumber; | ||
587 | struct component_name ciKey; | ||
588 | struct super_block *sb = ip->i_sb; | ||
589 | |||
590 | ciKey.name = | ||
591 | (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), | ||
592 | GFP_NOFS); | ||
593 | if (ciKey.name == 0) { | ||
594 | rc = -ENOMEM; | ||
595 | goto dtSearch_Exit2; | ||
596 | } | ||
597 | |||
598 | |||
599 | /* uppercase search key for c-i directory */ | ||
600 | UniStrcpy(ciKey.name, key->name); | ||
601 | ciKey.namlen = key->namlen; | ||
602 | |||
603 | /* only uppercase if case-insensitive support is on */ | ||
604 | if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { | ||
605 | ciToUpper(&ciKey); | ||
606 | } | ||
607 | BT_CLR(btstack); /* reset stack */ | ||
608 | |||
609 | /* init level count for max pages to split */ | ||
610 | btstack->nsplit = 1; | ||
611 | |||
612 | /* | ||
613 | * search down tree from root: | ||
614 | * | ||
615 | * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of | ||
616 | * internal page, child page Pi contains entry with k, Ki <= K < Kj. | ||
617 | * | ||
618 | * if entry with search key K is not found | ||
619 | * internal page search find the entry with largest key Ki | ||
620 | * less than K which point to the child page to search; | ||
621 | * leaf page search find the entry with smallest key Kj | ||
622 | * greater than K so that the returned index is the position of | ||
623 | * the entry to be shifted right for insertion of new entry. | ||
624 | * for empty tree, search key is greater than any key of the tree. | ||
625 | * | ||
626 | * by convention, root bn = 0. | ||
627 | */ | ||
628 | for (bn = 0;;) { | ||
629 | /* get/pin the page to search */ | ||
630 | DT_GETPAGE(ip, bn, mp, psize, p, rc); | ||
631 | if (rc) | ||
632 | goto dtSearch_Exit1; | ||
633 | |||
634 | /* get sorted entry table of the page */ | ||
635 | stbl = DT_GETSTBL(p); | ||
636 | |||
637 | /* | ||
638 | * binary search with search key K on the current page. | ||
639 | */ | ||
640 | for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { | ||
641 | index = base + (lim >> 1); | ||
642 | |||
643 | if (p->header.flag & BT_LEAF) { | ||
644 | /* uppercase leaf name to compare */ | ||
645 | cmp = | ||
646 | ciCompare(&ciKey, p, stbl[index], | ||
647 | JFS_SBI(sb)->mntflag); | ||
648 | } else { | ||
649 | /* router key is in uppercase */ | ||
650 | |||
651 | cmp = dtCompare(&ciKey, p, stbl[index]); | ||
652 | |||
653 | |||
654 | } | ||
655 | if (cmp == 0) { | ||
656 | /* | ||
657 | * search hit | ||
658 | */ | ||
659 | /* search hit - leaf page: | ||
660 | * return the entry found | ||
661 | */ | ||
662 | if (p->header.flag & BT_LEAF) { | ||
663 | inumber = le32_to_cpu( | ||
664 | ((struct ldtentry *) & p->slot[stbl[index]])->inumber); | ||
665 | |||
666 | /* | ||
667 | * search for JFS_LOOKUP | ||
668 | */ | ||
669 | if (flag == JFS_LOOKUP) { | ||
670 | *data = inumber; | ||
671 | rc = 0; | ||
672 | goto out; | ||
673 | } | ||
674 | |||
675 | /* | ||
676 | * search for JFS_CREATE | ||
677 | */ | ||
678 | if (flag == JFS_CREATE) { | ||
679 | *data = inumber; | ||
680 | rc = -EEXIST; | ||
681 | goto out; | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * search for JFS_REMOVE or JFS_RENAME | ||
686 | */ | ||
687 | if ((flag == JFS_REMOVE || | ||
688 | flag == JFS_RENAME) && | ||
689 | *data != inumber) { | ||
690 | rc = -ESTALE; | ||
691 | goto out; | ||
692 | } | ||
693 | |||
694 | /* | ||
695 | * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME | ||
696 | */ | ||
697 | /* save search result */ | ||
698 | *data = inumber; | ||
699 | btsp = btstack->top; | ||
700 | btsp->bn = bn; | ||
701 | btsp->index = index; | ||
702 | btsp->mp = mp; | ||
703 | |||
704 | rc = 0; | ||
705 | goto dtSearch_Exit1; | ||
706 | } | ||
707 | |||
708 | /* search hit - internal page: | ||
709 | * descend/search its child page | ||
710 | */ | ||
711 | goto getChild; | ||
712 | } | ||
713 | |||
714 | if (cmp > 0) { | ||
715 | base = index + 1; | ||
716 | --lim; | ||
717 | } | ||
718 | } | ||
719 | |||
720 | /* | ||
721 | * search miss | ||
722 | * | ||
723 | * base is the smallest index with key (Kj) greater than | ||
724 | * search key (K) and may be zero or (maxindex + 1) index. | ||
725 | */ | ||
726 | /* | ||
727 | * search miss - leaf page | ||
728 | * | ||
729 | * return location of entry (base) where new entry with | ||
730 | * search key K is to be inserted. | ||
731 | */ | ||
732 | if (p->header.flag & BT_LEAF) { | ||
733 | /* | ||
734 | * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME | ||
735 | */ | ||
736 | if (flag == JFS_LOOKUP || flag == JFS_REMOVE || | ||
737 | flag == JFS_RENAME) { | ||
738 | rc = -ENOENT; | ||
739 | goto out; | ||
740 | } | ||
741 | |||
742 | /* | ||
743 | * search for JFS_CREATE|JFS_FINDDIR: | ||
744 | * | ||
745 | * save search result | ||
746 | */ | ||
747 | *data = 0; | ||
748 | btsp = btstack->top; | ||
749 | btsp->bn = bn; | ||
750 | btsp->index = base; | ||
751 | btsp->mp = mp; | ||
752 | |||
753 | rc = 0; | ||
754 | goto dtSearch_Exit1; | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * search miss - internal page | ||
759 | * | ||
760 | * if base is non-zero, decrement base by one to get the parent | ||
761 | * entry of the child page to search. | ||
762 | */ | ||
763 | index = base ? base - 1 : base; | ||
764 | |||
765 | /* | ||
766 | * go down to child page | ||
767 | */ | ||
768 | getChild: | ||
769 | /* update max. number of pages to split */ | ||
770 | if (BT_STACK_FULL(btstack)) { | ||
771 | /* Something's corrupted, mark filesystem dirty so | ||
772 | * chkdsk will fix it. | ||
773 | */ | ||
774 | jfs_error(sb, "stack overrun in dtSearch!"); | ||
775 | BT_STACK_DUMP(btstack); | ||
776 | rc = -EIO; | ||
777 | goto out; | ||
778 | } | ||
779 | btstack->nsplit++; | ||
780 | |||
781 | /* push (bn, index) of the parent page/entry */ | ||
782 | BT_PUSH(btstack, bn, index); | ||
783 | |||
784 | /* get the child page block number */ | ||
785 | pxd = (pxd_t *) & p->slot[stbl[index]]; | ||
786 | bn = addressPXD(pxd); | ||
787 | psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; | ||
788 | |||
789 | /* unpin the parent page */ | ||
790 | DT_PUTPAGE(mp); | ||
791 | } | ||
792 | |||
793 | out: | ||
794 | DT_PUTPAGE(mp); | ||
795 | |||
796 | dtSearch_Exit1: | ||
797 | |||
798 | kfree(ciKey.name); | ||
799 | |||
800 | dtSearch_Exit2: | ||
801 | |||
802 | return rc; | ||
803 | } | ||
804 | |||
805 | |||
806 | /* | ||
807 | * dtInsert() | ||
808 | * | ||
809 | * function: insert an entry to directory tree: the entry (name, *fsn) goes into the leaf page at the | ||
810 | * index recorded in btstack->top; dtSplitUp() is called when the leaf lacks free slots. | ||
811 | * parameter: tid - transaction id; ip - directory inode; name - key of the new entry; | ||
812 | * fsn - inode number stored in the entry; btstack - search result (see DT_GETSEARCH below) | ||
813 | * return: 0 - success; | ||
814 | * errno - failure (-EMLINK when an indexed directory has next_index == DIREND, else dtSplitUp() result); | ||
815 | */ | ||
816 | int dtInsert(tid_t tid, struct inode *ip, | ||
817 | struct component_name * name, ino_t * fsn, struct btstack * btstack) | ||
818 | { | ||
819 | int rc = 0; | ||
820 | struct metapage *mp; /* meta-page buffer */ | ||
821 | dtpage_t *p; /* base B+-tree index page */ | ||
822 | s64 bn; | ||
823 | int index; | ||
824 | struct dtsplit split; /* split information */ | ||
825 | ddata_t data; | ||
826 | struct dt_lock *dtlck; | ||
827 | int n; | ||
828 | struct tlock *tlck; | ||
829 | struct lv *lv; | ||
830 | |||
831 | /* | ||
832 | * retrieve search result | ||
833 | * | ||
834 | * dtSearch() returns (leaf page pinned, index at which to insert). | ||
835 | * n.b. dtSearch() may return index of (maxindex + 1) of | ||
836 | * the full page. | ||
837 | */ | ||
838 | DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); | ||
839 | |||
840 | /* | ||
841 | * insert entry for new key: fill in <data> and compute n, the slot count compared with freecnt below | ||
842 | */ | ||
843 | if (DO_INDEX(ip)) { | ||
844 | if (JFS_IP(ip)->next_index == DIREND) { | ||
845 | DT_PUTPAGE(mp); /* unpin the leaf before bailing out */ | ||
846 | return -EMLINK; | ||
847 | } | ||
848 | n = NDTLEAF(name->namlen); | ||
849 | data.leaf.tid = tid; | ||
850 | data.leaf.ip = ip; | ||
851 | } else { | ||
852 | n = NDTLEAF_LEGACY(name->namlen); | ||
853 | data.leaf.ip = NULL; /* signifies legacy directory format */ | ||
854 | } | ||
855 | data.leaf.ino = *fsn; | ||
856 | |||
857 | /* | ||
858 | * leaf page does not have enough room for new entry: | ||
859 | * | ||
860 | * extend/split the leaf page; | ||
861 | * | ||
862 | * dtSplitUp() will insert the entry and unpin the leaf page. | ||
863 | */ | ||
864 | if (n > p->header.freecnt) { | ||
865 | split.mp = mp; | ||
866 | split.index = index; | ||
867 | split.nslot = n; | ||
868 | split.key = name; | ||
869 | split.data = &data; | ||
870 | rc = dtSplitUp(tid, ip, &split, btstack); | ||
871 | return rc; | ||
872 | } | ||
873 | |||
874 | /* | ||
875 | * leaf page does have enough room for new entry: | ||
876 | * | ||
877 | * insert the new data entry into the leaf page; | ||
878 | */ | ||
879 | BT_MARK_DIRTY(mp, ip); | ||
880 | /* | ||
881 | * acquire a transaction lock on the leaf page | ||
882 | */ | ||
883 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); | ||
884 | dtlck = (struct dt_lock *) & tlck->lock; | ||
885 | ASSERT(dtlck->index == 0); | ||
886 | lv = & dtlck->lv[0]; | ||
887 | |||
888 | /* linelock header: the first lv covers slot 0, length 1 */ | ||
889 | lv->offset = 0; | ||
890 | lv->length = 1; | ||
891 | dtlck->index++; | ||
892 | |||
893 | dtInsertEntry(p, index, name, &data, &dtlck); | ||
894 | |||
895 | /* linelock stbl of non-root leaf page: from the stbl slot holding <index> through the one holding the last entry */ | ||
896 | if (!(p->header.flag & BT_ROOT)) { | ||
897 | if (dtlck->index >= dtlck->maxcnt) /* lv array exhausted: continue in the linelock returned by txLinelock() */ | ||
898 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
899 | lv = & dtlck->lv[dtlck->index]; | ||
900 | n = index >> L2DTSLOTSIZE; | ||
901 | lv->offset = p->header.stblindex + n; | ||
902 | lv->length = | ||
903 | ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; | ||
904 | dtlck->index++; | ||
905 | } | ||
906 | |||
907 | /* unpin the leaf page */ | ||
908 | DT_PUTPAGE(mp); | ||
909 | |||
910 | return 0; | ||
911 | } | ||
912 | |||
913 | |||
914 | /* | ||
915 | * dtSplitUp() | ||
916 | * | ||
917 | * function: propagate insertion bottom up; room for the entry described by <split> is made by a root leaf split | ||
918 | * (dtSplitRoot), by extending a first leaf page smaller than PSIZE (dtExtendPage), or by a page split | ||
919 | * (dtSplitPage) whose router entry is then inserted into the parents popped from <btstack>. | ||
920 | * parameter: tid - transaction id; ip - directory inode; split - mp/index/nslot/key/data of the insertion; btstack - traverse stack | ||
921 | * return: 0 - success; | ||
922 | * errno - failure; | ||
923 | * leaf page unpinned; | ||
924 | */ | ||
925 | static int dtSplitUp(tid_t tid, | ||
926 | struct inode *ip, struct dtsplit * split, struct btstack * btstack) | ||
927 | { | ||
928 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
929 | int rc = 0; | ||
930 | struct metapage *smp; | ||
931 | dtpage_t *sp; /* split page */ | ||
932 | struct metapage *rmp; | ||
933 | dtpage_t *rp; /* new right page split from sp */ | ||
934 | pxd_t rpxd; /* new right page extent descriptor */ | ||
935 | struct metapage *lmp; | ||
936 | dtpage_t *lp; /* left child page */ | ||
937 | int skip; /* index of entry of insertion */ | ||
938 | struct btframe *parent; /* parent page entry on traverse stack */ | ||
939 | s64 xaddr, nxaddr; | ||
940 | int xlen, xsize; | ||
941 | struct pxdlist pxdlist; | ||
942 | pxd_t *pxd; | ||
943 | struct component_name key = { 0, NULL }; | ||
944 | ddata_t *data = split->data; | ||
945 | int n; | ||
946 | struct dt_lock *dtlck; | ||
947 | struct tlock *tlck; | ||
948 | struct lv *lv; | ||
949 | int quota_allocation = 0; /* blocks charged to quota here; given back at exit when rc != 0 */ | ||
950 | |||
951 | /* get split page */ | ||
952 | smp = split->mp; | ||
953 | sp = DT_PAGE(ip, smp); | ||
954 | |||
955 | key.name = | ||
956 | (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), | ||
957 | GFP_NOFS); | ||
958 | if (key.name == NULL) { | ||
959 | DT_PUTPAGE(smp); | ||
960 | rc = -ENOMEM; | ||
961 | goto dtSplitUp_Exit; | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * split leaf page | ||
966 | * | ||
967 | * The split routines insert the new entry, and | ||
968 | * acquire txLock as appropriate. | ||
969 | */ | ||
970 | /* | ||
971 | * split root leaf page: | ||
972 | */ | ||
973 | if (sp->header.flag & BT_ROOT) { | ||
974 | /* | ||
975 | * allocate a single extent child page (a second block is added when one block cannot hold the new entry) | ||
976 | */ | ||
977 | xlen = 1; | ||
978 | n = sbi->bsize >> L2DTSLOTSIZE; | ||
979 | n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ | ||
980 | n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ | ||
981 | if (n <= split->nslot) | ||
982 | xlen++; | ||
983 | if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { | ||
984 | DT_PUTPAGE(smp); | ||
985 | goto freeKeyName; | ||
986 | } | ||
987 | |||
988 | pxdlist.maxnpxd = 1; | ||
989 | pxdlist.npxd = 0; | ||
990 | pxd = &pxdlist.pxd[0]; | ||
991 | PXDaddress(pxd, xaddr); | ||
992 | PXDlength(pxd, xlen); | ||
993 | split->pxdlist = &pxdlist; | ||
994 | rc = dtSplitRoot(tid, ip, split, &rmp); | ||
995 | |||
996 | if (rc) | ||
997 | dbFree(ip, xaddr, xlen); | ||
998 | else | ||
999 | DT_PUTPAGE(rmp); | ||
1000 | |||
1001 | DT_PUTPAGE(smp); | ||
1002 | |||
1003 | goto freeKeyName; | ||
1004 | } | ||
1005 | |||
1006 | /* | ||
1007 | * extend first leaf page | ||
1008 | * | ||
1009 | * extend the 1st extent if less than buffer page size | ||
1010 | * (dtExtendPage() returns leaf page unpinned) | ||
1011 | */ | ||
1012 | pxd = &sp->header.self; | ||
1013 | xlen = lengthPXD(pxd); | ||
1014 | xsize = xlen << sbi->l2bsize; | ||
1015 | if (xsize < PSIZE) { | ||
1016 | xaddr = addressPXD(pxd); | ||
1017 | n = xsize >> L2DTSLOTSIZE; | ||
1018 | n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ | ||
1019 | if ((n + sp->header.freecnt) <= split->nslot) | ||
1020 | n = xlen + (xlen << 1); /* grow by 3 * xlen blocks */ | ||
1021 | else | ||
1022 | n = xlen; /* grow by xlen blocks */ | ||
1023 | |||
1024 | /* Allocate blocks to quota. */ | ||
1025 | if (DQUOT_ALLOC_BLOCK(ip, n)) { | ||
1026 | rc = -EDQUOT; | ||
1027 | goto extendOut; | ||
1028 | } | ||
1029 | quota_allocation += n; | ||
1030 | |||
1031 | if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, | ||
1032 | (s64) n, &nxaddr))) | ||
1033 | goto extendOut; | ||
1034 | |||
1035 | pxdlist.maxnpxd = 1; | ||
1036 | pxdlist.npxd = 0; | ||
1037 | pxd = &pxdlist.pxd[0]; | ||
1038 | PXDaddress(pxd, nxaddr); | ||
1039 | PXDlength(pxd, xlen + n); | ||
1040 | split->pxdlist = &pxdlist; | ||
1041 | if ((rc = dtExtendPage(tid, ip, split, btstack))) { | ||
1042 | nxaddr = addressPXD(pxd); | ||
1043 | if (xaddr != nxaddr) { | ||
1044 | /* free relocated extent */ | ||
1045 | xlen = lengthPXD(pxd); | ||
1046 | dbFree(ip, nxaddr, (s64) xlen); | ||
1047 | } else { | ||
1048 | /* free extended delta */ | ||
1049 | xlen = lengthPXD(pxd) - n; | ||
1050 | xaddr = addressPXD(pxd) + xlen; | ||
1051 | dbFree(ip, xaddr, (s64) n); | ||
1052 | } | ||
1053 | } | ||
1054 | |||
1055 | extendOut: | ||
1056 | DT_PUTPAGE(smp); | ||
1057 | goto freeKeyName; | ||
1058 | } | ||
1059 | |||
1060 | /* | ||
1061 | * split leaf page <sp> into <sp> and a new right page <rp>. | ||
1062 | * | ||
1063 | * return <rp> pinned and its extent descriptor <rpxd> | ||
1064 | */ | ||
1065 | /* | ||
1066 | * allocate new directory page extent and | ||
1067 | * new index page(s) to cover page split(s): one extent of nbperpage blocks per btstack->nsplit | ||
1068 | * | ||
1069 | * allocation hint: ? | ||
1070 | */ | ||
1071 | n = btstack->nsplit; | ||
1072 | pxdlist.maxnpxd = pxdlist.npxd = 0; | ||
1073 | xlen = sbi->nbperpage; | ||
1074 | for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { | ||
1075 | if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { | ||
1076 | PXDaddress(pxd, xaddr); | ||
1077 | PXDlength(pxd, xlen); | ||
1078 | pxdlist.maxnpxd++; | ||
1079 | continue; | ||
1080 | } | ||
1081 | |||
1082 | DT_PUTPAGE(smp); | ||
1083 | |||
1084 | /* undo allocation */ | ||
1085 | goto splitOut; | ||
1086 | } | ||
1087 | |||
1088 | split->pxdlist = &pxdlist; | ||
1089 | if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { | ||
1090 | DT_PUTPAGE(smp); | ||
1091 | |||
1092 | /* undo allocation */ | ||
1093 | goto splitOut; | ||
1094 | } | ||
1095 | |||
1096 | /* | ||
1097 | * propagate up the router entry for the leaf page just split | ||
1098 | * | ||
1099 | * insert a router entry for the new page into the parent page, | ||
1100 | * propagate the insert/split up the tree by walking back the stack | ||
1101 | * of (bn of parent page, index of child page entry in parent page) | ||
1102 | * that were traversed during the search for the page that split. | ||
1103 | * | ||
1104 | * the propagation of insert/split up the tree stops if the root | ||
1105 | * splits or the page inserted into doesn't have to split to hold | ||
1106 | * the new entry. | ||
1107 | * | ||
1108 | * the parent entry for the split page remains the same, and | ||
1109 | * a new entry is inserted at its right with the first key and | ||
1110 | * block number of the new right page. | ||
1111 | * | ||
1112 | * There are a maximum of 4 pages pinned at any time: | ||
1113 | * two children, left parent and right parent (when the parent splits). | ||
1114 | * keep the child pages pinned while working on the parent. | ||
1115 | * make sure that all pins are released at exit. | ||
1116 | */ | ||
1117 | while ((parent = BT_POP(btstack)) != NULL) { | ||
1118 | /* parent page specified by stack frame <parent> */ | ||
1119 | |||
1120 | /* keep current child pages (<lp>, <rp>) pinned */ | ||
1121 | lmp = smp; | ||
1122 | lp = sp; | ||
1123 | |||
1124 | /* | ||
1125 | * insert router entry in parent for new right child page <rp> | ||
1126 | */ | ||
1127 | /* get the parent page <sp> */ | ||
1128 | DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); | ||
1129 | if (rc) { | ||
1130 | DT_PUTPAGE(lmp); | ||
1131 | DT_PUTPAGE(rmp); | ||
1132 | goto splitOut; | ||
1133 | } | ||
1134 | |||
1135 | /* | ||
1136 | * The new key entry goes ONE AFTER the index of parent entry, | ||
1137 | * because the split was to the right. | ||
1138 | */ | ||
1139 | skip = parent->index + 1; | ||
1140 | |||
1141 | /* | ||
1142 | * compute the key for the router entry | ||
1143 | * | ||
1144 | * key suffix compression: | ||
1145 | * for internal pages that have leaf pages as children, | ||
1146 | * retain only what's needed to distinguish between | ||
1147 | * the new entry and the entry on the page to its left. | ||
1148 | * If the keys compare equal, retain the entire key. | ||
1149 | * | ||
1150 | * note that compression is performed only at computing | ||
1151 | * router key at the lowest internal level. | ||
1152 | * further compression of the key between pairs of higher | ||
1153 | * level internal pages loses too much information and | ||
1154 | * the search may fail. | ||
1155 | * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} | ||
1156 | * results in two adjacent parent entries (a)(xx). | ||
1157 | * if split occurs between these two entries, and | ||
1158 | * if compression is applied, the router key of parent entry | ||
1159 | * of right page (x) will divert search for x into right | ||
1160 | * subtree and miss x in the left subtree.) | ||
1161 | * | ||
1162 | * the entire key must be retained for the next-to-leftmost | ||
1163 | * internal key at any level of the tree, or search may fail | ||
1164 | * (e.g., ?) | ||
1165 | */ | ||
1166 | switch (rp->header.flag & BT_TYPE) { | ||
1167 | case BT_LEAF: | ||
1168 | /* | ||
1169 | * compute the length of prefix for suffix compression | ||
1170 | * between last entry of left page and first entry | ||
1171 | * of right page (NOTE(review): the BT_ROOT clause below is implied by the trailing skip > 1 test) | ||
1172 | */ | ||
1173 | if ((sp->header.flag & BT_ROOT && skip > 1) || | ||
1174 | sp->header.prev != 0 || skip > 1) { | ||
1175 | /* compute uppercase router prefix key */ | ||
1176 | rc = ciGetLeafPrefixKey(lp, | ||
1177 | lp->header.nextindex-1, | ||
1178 | rp, 0, &key, | ||
1179 | sbi->mntflag); | ||
1180 | if (rc) { | ||
1181 | DT_PUTPAGE(lmp); | ||
1182 | DT_PUTPAGE(rmp); | ||
1183 | DT_PUTPAGE(smp); | ||
1184 | goto splitOut; | ||
1185 | } | ||
1186 | } else { | ||
1187 | /* next to leftmost entry of | ||
1188 | lowest internal level */ | ||
1189 | |||
1190 | /* compute uppercase router key */ | ||
1191 | dtGetKey(rp, 0, &key, sbi->mntflag); | ||
1192 | key.name[key.namlen] = 0; | ||
1193 | |||
1194 | if ((sbi->mntflag & JFS_OS2) == JFS_OS2) | ||
1195 | ciToUpper(&key); | ||
1196 | } | ||
1197 | |||
1198 | n = NDTINTERNAL(key.namlen); | ||
1199 | break; | ||
1200 | |||
1201 | case BT_INTERNAL: | ||
1202 | dtGetKey(rp, 0, &key, sbi->mntflag); | ||
1203 | n = NDTINTERNAL(key.namlen); | ||
1204 | break; | ||
1205 | |||
1206 | default: | ||
1207 | jfs_err("dtSplitUp(): UFO!"); | ||
1208 | break; /* NOTE(review): n and key are not recomputed on this path; n keeps its previous value */ | ||
1209 | } | ||
1210 | |||
1211 | /* unpin left child page */ | ||
1212 | DT_PUTPAGE(lmp); | ||
1213 | |||
1214 | /* | ||
1215 | * compute the data for the router entry | ||
1216 | */ | ||
1217 | data->xd = rpxd; /* child page xd */ | ||
1218 | |||
1219 | /* | ||
1220 | * parent page is full - split the parent page | ||
1221 | */ | ||
1222 | if (n > sp->header.freecnt) { | ||
1223 | /* init for parent page split */ | ||
1224 | split->mp = smp; | ||
1225 | split->index = skip; /* index at insert */ | ||
1226 | split->nslot = n; | ||
1227 | split->key = &key; | ||
1228 | /* split->data = data; */ | ||
1229 | |||
1230 | /* unpin right child page */ | ||
1231 | DT_PUTPAGE(rmp); | ||
1232 | |||
1233 | /* The split routines insert the new entry, | ||
1234 | * acquire txLock as appropriate. | ||
1235 | * return <rp> pinned and its block number <rbn>. | ||
1236 | */ | ||
1237 | rc = (sp->header.flag & BT_ROOT) ? | ||
1238 | dtSplitRoot(tid, ip, split, &rmp) : | ||
1239 | dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); | ||
1240 | if (rc) { | ||
1241 | DT_PUTPAGE(smp); | ||
1242 | goto splitOut; | ||
1243 | } | ||
1244 | |||
1245 | /* smp and rmp are pinned */ | ||
1246 | } | ||
1247 | /* | ||
1248 | * parent page is not full - insert router entry in parent page | ||
1249 | */ | ||
1250 | else { | ||
1251 | BT_MARK_DIRTY(smp, ip); | ||
1252 | /* | ||
1253 | * acquire a transaction lock on the parent page | ||
1254 | */ | ||
1255 | tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); | ||
1256 | dtlck = (struct dt_lock *) & tlck->lock; | ||
1257 | ASSERT(dtlck->index == 0); | ||
1258 | lv = & dtlck->lv[0]; | ||
1259 | |||
1260 | /* linelock header */ | ||
1261 | lv->offset = 0; | ||
1262 | lv->length = 1; | ||
1263 | dtlck->index++; | ||
1264 | |||
1265 | /* linelock stbl of non-root parent page */ | ||
1266 | if (!(sp->header.flag & BT_ROOT)) { | ||
1267 | lv++; | ||
1268 | n = skip >> L2DTSLOTSIZE; | ||
1269 | lv->offset = sp->header.stblindex + n; | ||
1270 | lv->length = | ||
1271 | ((sp->header.nextindex - | ||
1272 | 1) >> L2DTSLOTSIZE) - n + 1; | ||
1273 | dtlck->index++; | ||
1274 | } | ||
1275 | |||
1276 | dtInsertEntry(sp, skip, &key, data, &dtlck); | ||
1277 | |||
1278 | /* exit propagate up */ | ||
1279 | break; | ||
1280 | } | ||
1281 | } | ||
1282 | |||
1283 | /* unpin current split and its right page */ | ||
1284 | DT_PUTPAGE(smp); | ||
1285 | DT_PUTPAGE(rmp); | ||
1286 | |||
1287 | /* | ||
1288 | * free remaining extents allocated for split: entries npxd..maxnpxd-1 of pxdlist were allocated but not consumed | ||
1289 | */ | ||
1290 | splitOut: | ||
1291 | n = pxdlist.npxd; | ||
1292 | pxd = &pxdlist.pxd[n]; | ||
1293 | for (; n < pxdlist.maxnpxd; n++, pxd++) | ||
1294 | dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); | ||
1295 | |||
1296 | freeKeyName: | ||
1297 | kfree(key.name); | ||
1298 | |||
1299 | /* Rollback quota allocation */ | ||
1300 | if (rc && quota_allocation) | ||
1301 | DQUOT_FREE_BLOCK(ip, quota_allocation); | ||
1302 | |||
1303 | dtSplitUp_Exit: | ||
1304 | |||
1305 | return rc; | ||
1306 | } | ||
1307 | |||
1308 | |||
1309 | /* | ||
1310 | * dtSplitPage() | ||
1311 | * | ||
1312 | * function: Split a non-root page of a btree: the next extent of split->pxdlist becomes the new right sibling <rp>; | ||
1313 | * <rp> either stays empty (sequential append) or receives the entries from <nxt> on, then the new entry is inserted. | ||
1314 | * parameter: tid - transaction id; ip - directory inode; split - mp/index/nslot/key/data/pxdlist of the insertion; | ||
1315 | * rmpp, rpp, rpxdp - out: metapage, page and extent descriptor of the new right page | ||
1316 | * return: 0 - success; | ||
1317 | * errno - failure (-EIO: no metapage for the new page; -EDQUOT: quota refused; or the DT_GETPAGE error); | ||
1318 | * return split and new page pinned; | ||
1319 | */ | ||
1320 | static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, | ||
1321 | struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) | ||
1322 | { | ||
1323 | int rc = 0; | ||
1324 | struct metapage *smp; | ||
1325 | dtpage_t *sp; | ||
1326 | struct metapage *rmp; | ||
1327 | dtpage_t *rp; /* new right page allocated */ | ||
1328 | s64 rbn; /* new right page block number */ | ||
1329 | struct metapage *mp; | ||
1330 | dtpage_t *p; | ||
1331 | s64 nextbn; | ||
1332 | struct pxdlist *pxdlist; | ||
1333 | pxd_t *pxd; | ||
1334 | int skip, nextindex, half, left, nxt, off, si; | ||
1335 | struct ldtentry *ldtentry; | ||
1336 | struct idtentry *idtentry; | ||
1337 | u8 *stbl; | ||
1338 | struct dtslot *f; | ||
1339 | int fsi, stblsize; | ||
1340 | int n; | ||
1341 | struct dt_lock *sdtlck, *rdtlck; | ||
1342 | struct tlock *tlck; | ||
1343 | struct dt_lock *dtlck; | ||
1344 | struct lv *slv, *rlv, *lv; | ||
1345 | |||
1346 | /* get split page */ | ||
1347 | smp = split->mp; | ||
1348 | sp = DT_PAGE(ip, smp); | ||
1349 | |||
1350 | /* | ||
1351 | * allocate the new right page for the split: consume the next pre-allocated extent of the pxdlist | ||
1352 | */ | ||
1353 | pxdlist = split->pxdlist; | ||
1354 | pxd = &pxdlist->pxd[pxdlist->npxd]; | ||
1355 | pxdlist->npxd++; | ||
1356 | rbn = addressPXD(pxd); | ||
1357 | rmp = get_metapage(ip, rbn, PSIZE, 1); | ||
1358 | if (rmp == NULL) | ||
1359 | return -EIO; | ||
1360 | |||
1361 | /* Allocate blocks to quota. */ | ||
1362 | if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { | ||
1363 | release_metapage(rmp); | ||
1364 | return -EDQUOT; | ||
1365 | } | ||
1366 | |||
1367 | jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); | ||
1368 | |||
1369 | BT_MARK_DIRTY(rmp, ip); | ||
1370 | /* | ||
1371 | * acquire a transaction lock on the new right page | ||
1372 | */ | ||
1373 | tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); | ||
1374 | rdtlck = (struct dt_lock *) & tlck->lock; | ||
1375 | |||
1376 | rp = (dtpage_t *) rmp->data; | ||
1377 | *rpp = rp; | ||
1378 | rp->header.self = *pxd; | ||
1379 | |||
1380 | BT_MARK_DIRTY(smp, ip); | ||
1381 | /* | ||
1382 | * acquire a transaction lock on the split page | ||
1383 | * | ||
1384 | * action: | ||
1385 | */ | ||
1386 | tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); | ||
1387 | sdtlck = (struct dt_lock *) & tlck->lock; | ||
1388 | |||
1389 | /* linelock header of split page */ | ||
1390 | ASSERT(sdtlck->index == 0); | ||
1391 | slv = & sdtlck->lv[0]; | ||
1392 | slv->offset = 0; | ||
1393 | slv->length = 1; | ||
1394 | sdtlck->index++; | ||
1395 | |||
1396 | /* | ||
1397 | * initialize/update sibling pointers between sp and rp: rp is linked in between sp and sp's old next page | ||
1398 | */ | ||
1399 | nextbn = le64_to_cpu(sp->header.next); | ||
1400 | rp->header.next = cpu_to_le64(nextbn); | ||
1401 | rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); | ||
1402 | sp->header.next = cpu_to_le64(rbn); | ||
1403 | |||
1404 | /* | ||
1405 | * initialize new right page | ||
1406 | */ | ||
1407 | rp->header.flag = sp->header.flag; | ||
1408 | |||
1409 | /* compute sorted entry table at start of extent data area */ | ||
1410 | rp->header.nextindex = 0; | ||
1411 | rp->header.stblindex = 1; | ||
1412 | |||
1413 | n = PSIZE >> L2DTSLOTSIZE; | ||
1414 | rp->header.maxslot = n; | ||
1415 | stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ | ||
1416 | |||
1417 | /* init freelist: the first free slot follows the stbl */ | ||
1418 | fsi = rp->header.stblindex + stblsize; | ||
1419 | rp->header.freelist = fsi; | ||
1420 | rp->header.freecnt = rp->header.maxslot - fsi; | ||
1421 | |||
1422 | /* | ||
1423 | * sequential append at tail: append without split | ||
1424 | * | ||
1425 | * If splitting the last page on a level because of appending | ||
1426 | * an entry to it (skip is maxentry), it's likely that the access is | ||
1427 | * sequential. Adding an empty page on the side of the level is less | ||
1428 | * work and can push the fill factor much higher than normal. | ||
1429 | * If we're wrong it's no big deal, we'll just do the split the right | ||
1430 | * way next time. | ||
1431 | * (It may look like it's equally easy to do a similar hack for | ||
1432 | * reverse sorted data, that is, split the tree left, | ||
1433 | * but it's not. Be my guest.) | ||
1434 | */ | ||
1435 | if (nextbn == 0 && split->index == sp->header.nextindex) { | ||
1436 | /* linelock header + stbl (first slot) of new page */ | ||
1437 | rlv = & rdtlck->lv[rdtlck->index]; | ||
1438 | rlv->offset = 0; | ||
1439 | rlv->length = 2; | ||
1440 | rdtlck->index++; | ||
1441 | |||
1442 | /* | ||
1443 | * initialize freelist of new right page: chain every slot from fsi to maxslot - 1, terminated by -1 | ||
1444 | */ | ||
1445 | f = &rp->slot[fsi]; | ||
1446 | for (fsi++; fsi < rp->header.maxslot; f++, fsi++) | ||
1447 | f->next = fsi; | ||
1448 | f->next = -1; | ||
1449 | |||
1450 | /* insert entry at the first entry of the new right page */ | ||
1451 | dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); | ||
1452 | |||
1453 | goto out; | ||
1454 | } | ||
1455 | |||
1456 | /* | ||
1457 | * non-sequential insert (at possibly middle page) | ||
1458 | */ | ||
1459 | |||
1460 | /* | ||
1461 | * update prev pointer of previous right sibling page; | ||
1462 | */ | ||
1463 | if (nextbn != 0) { | ||
1464 | DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); | ||
1465 | if (rc) { | ||
1466 | discard_metapage(rmp); /* NOTE(review): the quota charge and npxd increment above are not undone in this function - confirm the caller/transaction abort covers them */ | ||
1467 | return rc; | ||
1468 | } | ||
1469 | |||
1470 | BT_MARK_DIRTY(mp, ip); | ||
1471 | /* | ||
1472 | * acquire a transaction lock on the next page | ||
1473 | */ | ||
1474 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); | ||
1475 | jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", | ||
1476 | tlck, ip, mp); | ||
1477 | dtlck = (struct dt_lock *) & tlck->lock; | ||
1478 | |||
1479 | /* linelock header of previous right sibling page */ | ||
1480 | lv = & dtlck->lv[dtlck->index]; | ||
1481 | lv->offset = 0; | ||
1482 | lv->length = 1; | ||
1483 | dtlck->index++; | ||
1484 | |||
1485 | p->header.prev = cpu_to_le64(rbn); | ||
1486 | |||
1487 | DT_PUTPAGE(mp); | ||
1488 | } | ||
1489 | |||
1490 | /* | ||
1491 | * split the data between the split and right pages. | ||
1492 | */ | ||
1493 | skip = split->index; | ||
1494 | half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ | ||
1495 | left = 0; | ||
1496 | |||
1497 | /* | ||
1498 | * compute fill factor for split pages: accumulate slot counts in <left> until it reaches <half> | ||
1499 | * | ||
1500 | * <nxt> traces the next entry to move to rp | ||
1501 | * <off> traces the next entry to stay in sp | ||
1502 | */ | ||
1503 | stbl = (u8 *) & sp->slot[sp->header.stblindex]; | ||
1504 | nextindex = sp->header.nextindex; | ||
1505 | for (nxt = off = 0; nxt < nextindex; ++off) { | ||
1506 | if (off == skip) | ||
1507 | /* check for fill factor with new entry size */ | ||
1508 | n = split->nslot; | ||
1509 | else { | ||
1510 | si = stbl[nxt]; | ||
1511 | switch (sp->header.flag & BT_TYPE) { | ||
1512 | case BT_LEAF: | ||
1513 | ldtentry = (struct ldtentry *) & sp->slot[si]; | ||
1514 | if (DO_INDEX(ip)) | ||
1515 | n = NDTLEAF(ldtentry->namlen); | ||
1516 | else | ||
1517 | n = NDTLEAF_LEGACY(ldtentry-> | ||
1518 | namlen); | ||
1519 | break; | ||
1520 | |||
1521 | case BT_INTERNAL: | ||
1522 | idtentry = (struct idtentry *) & sp->slot[si]; | ||
1523 | n = NDTINTERNAL(idtentry->namlen); | ||
1524 | break; | ||
1525 | |||
1526 | default: | ||
1527 | break; /* NOTE(review): n is left unchanged for an unexpected page type */ | ||
1528 | } | ||
1529 | |||
1530 | ++nxt; /* advance to next entry to move in sp */ | ||
1531 | } | ||
1532 | |||
1533 | left += n; | ||
1534 | if (left >= half) | ||
1535 | break; | ||
1536 | } | ||
1537 | |||
1538 | /* <nxt> points to the 1st entry to move */ | ||
1539 | |||
1540 | /* | ||
1541 | * move entries to right page | ||
1542 | * | ||
1543 | * dtMoveEntry() initializes rp and reserves entry for insertion | ||
1544 | * | ||
1545 | * split page moved out entries are linelocked; | ||
1546 | * new/right page moved in entries are linelocked; | ||
1547 | */ | ||
1548 | /* linelock header + stbl of new right page */ | ||
1549 | rlv = & rdtlck->lv[rdtlck->index]; | ||
1550 | rlv->offset = 0; | ||
1551 | rlv->length = 5; | ||
1552 | rdtlck->index++; | ||
1553 | |||
1554 | dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); | ||
1555 | |||
1556 | sp->header.nextindex = nxt; | ||
1557 | |||
1558 | /* | ||
1559 | * finalize freelist of new right page: chain the slots from header.freelist to maxslot - 1, terminated by -1 | ||
1560 | */ | ||
1561 | fsi = rp->header.freelist; | ||
1562 | f = &rp->slot[fsi]; | ||
1563 | for (fsi++; fsi < rp->header.maxslot; f++, fsi++) | ||
1564 | f->next = fsi; | ||
1565 | f->next = -1; | ||
1566 | |||
1567 | /* | ||
1568 | * Update directory index table for entries now in right page | ||
1569 | */ | ||
1570 | if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { | ||
1571 | s64 lblock; | ||
1572 | |||
1573 | mp = NULL; | ||
1574 | stbl = DT_GETSTBL(rp); | ||
1575 | for (n = 0; n < rp->header.nextindex; n++) { | ||
1576 | ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; | ||
1577 | modify_index(tid, ip, le32_to_cpu(ldtentry->index), | ||
1578 | rbn, n, &mp, &lblock); | ||
1579 | } | ||
1580 | if (mp) | ||
1581 | release_metapage(mp); | ||
1582 | } | ||
1583 | |||
1584 | /* | ||
1585 | * the skipped index was on the left page, | ||
1586 | */ | ||
1587 | if (skip <= off) { | ||
1588 | /* insert the new entry in the split page */ | ||
1589 | dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); | ||
1590 | |||
1591 | /* linelock stbl of split page */ | ||
1592 | if (sdtlck->index >= sdtlck->maxcnt) /* lv array exhausted: continue in the linelock returned by txLinelock() */ | ||
1593 | sdtlck = (struct dt_lock *) txLinelock(sdtlck); | ||
1594 | slv = & sdtlck->lv[sdtlck->index]; | ||
1595 | n = skip >> L2DTSLOTSIZE; | ||
1596 | slv->offset = sp->header.stblindex + n; | ||
1597 | slv->length = | ||
1598 | ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; | ||
1599 | sdtlck->index++; | ||
1600 | } | ||
1601 | /* | ||
1602 | * the skipped index was on the right page, | ||
1603 | */ | ||
1604 | else { | ||
1605 | /* adjust the skip index to reflect the new position */ | ||
1606 | skip -= nxt; | ||
1607 | |||
1608 | /* insert the new entry in the right page */ | ||
1609 | dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); | ||
1610 | } | ||
1611 | |||
1612 | out: | ||
1613 | *rmpp = rmp; | ||
1614 | *rpxdp = *pxd; | ||
1615 | |||
1616 | return rc; | ||
1617 | } | ||
1618 | |||
1619 | |||
1620 | /* | ||
1621 | * dtExtendPage() | ||
1622 | * | ||
1623 | * function: extend 1st/only directory leaf page into the extent split->pxdlist->pxd[npxd] (same address: in-place | ||
1624 | * extension; different address: relocation), rebuild stbl and freelist, insert the new entry, update the parent's pxd | ||
1625 | * parameter: tid - transaction id; ip - directory inode; split - mp/index/key/data/pxdlist of the insertion; | ||
1626 | * btstack - traverse stack whose top frame names the parent/root page | ||
1627 | * return: 0 - success; | ||
1628 | * errno - failure (DT_GETPAGE error on the parent page); | ||
1629 | * return extended page pinned; | ||
1630 | */ | ||
1631 | static int dtExtendPage(tid_t tid, | ||
1632 | struct inode *ip, struct dtsplit * split, struct btstack * btstack) | ||
1633 | { | ||
1634 | struct super_block *sb = ip->i_sb; | ||
1635 | int rc; | ||
1636 | struct metapage *smp, *pmp, *mp; | ||
1637 | dtpage_t *sp, *pp; | ||
1638 | struct pxdlist *pxdlist; | ||
1639 | pxd_t *pxd, *tpxd; | ||
1640 | int xlen, xsize; | ||
1641 | int newstblindex, newstblsize; | ||
1642 | int oldstblindex, oldstblsize; | ||
1643 | int fsi, last; | ||
1644 | struct dtslot *f; | ||
1645 | struct btframe *parent; | ||
1646 | int n; | ||
1647 | struct dt_lock *dtlck; | ||
1648 | s64 xaddr, txaddr; | ||
1649 | struct tlock *tlck; | ||
1650 | struct pxd_lock *pxdlock; | ||
1651 | struct lv *lv; | ||
1652 | uint type; | ||
1653 | struct ldtentry *ldtentry; | ||
1654 | u8 *stbl; | ||
1655 | |||
1656 | /* get page to extend */ | ||
1657 | smp = split->mp; | ||
1658 | sp = DT_PAGE(ip, smp); | ||
1659 | |||
1660 | /* get parent/root page */ | ||
1661 | parent = BT_POP(btstack); /* NOTE(review): dereferenced below without a NULL check - confirm the stack cannot be empty here */ | ||
1662 | DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); | ||
1663 | if (rc) | ||
1664 | return (rc); | ||
1665 | |||
1666 | /* | ||
1667 | * extend the extent: consume the next extent descriptor of the pxdlist | ||
1668 | */ | ||
1669 | pxdlist = split->pxdlist; | ||
1670 | pxd = &pxdlist->pxd[pxdlist->npxd]; | ||
1671 | pxdlist->npxd++; | ||
1672 | |||
1673 | xaddr = addressPXD(pxd); | ||
1674 | tpxd = &sp->header.self; | ||
1675 | txaddr = addressPXD(tpxd); | ||
1676 | /* in-place extension */ | ||
1677 | if (xaddr == txaddr) { | ||
1678 | type = tlckEXTEND; | ||
1679 | } | ||
1680 | /* relocation */ | ||
1681 | else { | ||
1682 | type = tlckNEW; | ||
1683 | |||
1684 | /* save moved extent descriptor for later free */ | ||
1685 | tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); | ||
1686 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
1687 | pxdlock->flag = mlckFREEPXD; | ||
1688 | pxdlock->pxd = sp->header.self; | ||
1689 | pxdlock->index = 1; | ||
1690 | |||
1691 | /* | ||
1692 | * Update directory index table to reflect new page address | ||
1693 | */ | ||
1694 | if (DO_INDEX(ip)) { | ||
1695 | s64 lblock; | ||
1696 | |||
1697 | mp = NULL; | ||
1698 | stbl = DT_GETSTBL(sp); | ||
1699 | for (n = 0; n < sp->header.nextindex; n++) { | ||
1700 | ldtentry = | ||
1701 | (struct ldtentry *) & sp->slot[stbl[n]]; | ||
1702 | modify_index(tid, ip, | ||
1703 | le32_to_cpu(ldtentry->index), | ||
1704 | xaddr, n, &mp, &lblock); | ||
1705 | } | ||
1706 | if (mp) | ||
1707 | release_metapage(mp); | ||
1708 | } | ||
1709 | } | ||
1710 | |||
1711 | /* | ||
1712 | * extend the page | ||
1713 | */ | ||
1714 | sp->header.self = *pxd; | ||
1715 | |||
1716 | jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); | ||
1717 | |||
1718 | BT_MARK_DIRTY(smp, ip); | ||
1719 | /* | ||
1720 | * acquire a transaction lock on the extended/leaf page | ||
1721 | */ | ||
1722 | tlck = txLock(tid, ip, smp, tlckDTREE | type); | ||
1723 | dtlck = (struct dt_lock *) & tlck->lock; | ||
1724 | lv = & dtlck->lv[0]; | ||
1725 | |||
1726 | /* update buffer extent descriptor of extended page */ | ||
1727 | xlen = lengthPXD(pxd); | ||
1728 | xsize = xlen << JFS_SBI(sb)->l2bsize; | ||
1729 | #ifdef _STILL_TO_PORT | ||
1730 | bmSetXD(smp, xaddr, xsize); | ||
1731 | #endif /* _STILL_TO_PORT */ | ||
1732 | |||
1733 | /* | ||
1734 | * copy old stbl to new stbl at start of extended area (the old maxslot); nextindex bytes are copied | ||
1735 | */ | ||
1736 | oldstblindex = sp->header.stblindex; | ||
1737 | oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; | ||
1738 | newstblindex = sp->header.maxslot; | ||
1739 | n = xsize >> L2DTSLOTSIZE; | ||
1740 | newstblsize = (n + 31) >> L2DTSLOTSIZE; | ||
1741 | memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], | ||
1742 | sp->header.nextindex); | ||
1743 | |||
1744 | /* | ||
1745 | * in-line extension: linelock old area of extended page | ||
1746 | */ | ||
1747 | if (type == tlckEXTEND) { | ||
1748 | /* linelock header */ | ||
1749 | lv->offset = 0; | ||
1750 | lv->length = 1; | ||
1751 | dtlck->index++; | ||
1752 | lv++; | ||
1753 | |||
1754 | /* linelock new stbl of extended page */ | ||
1755 | lv->offset = newstblindex; | ||
1756 | lv->length = newstblsize; | ||
1757 | } | ||
1758 | /* | ||
1759 | * relocation: linelock whole relocated area | ||
1760 | */ | ||
1761 | else { | ||
1762 | lv->offset = 0; | ||
1763 | lv->length = sp->header.maxslot + newstblsize; | ||
1764 | } | ||
1765 | |||
1766 | dtlck->index++; | ||
1767 | |||
1768 | sp->header.maxslot = n; | ||
1769 | sp->header.stblindex = newstblindex; | ||
1770 | /* sp->header.nextindex remains the same */ | ||
1771 | |||
1772 | /* | ||
1773 | * add old stbl region at head of freelist: each old stbl slot is pushed in front of the current head | ||
1774 | */ | ||
1775 | fsi = oldstblindex; | ||
1776 | f = &sp->slot[fsi]; | ||
1777 | last = sp->header.freelist; | ||
1778 | for (n = 0; n < oldstblsize; n++, fsi++, f++) { | ||
1779 | f->next = last; | ||
1780 | last = fsi; | ||
1781 | } | ||
1782 | sp->header.freelist = last; | ||
1783 | sp->header.freecnt += oldstblsize; | ||
1784 | |||
1785 | /* | ||
1786 | * append free region of newly extended area at tail of freelist | ||
1787 | */ | ||
1788 | /* init free region of newly extended area: slots after the new stbl up to maxslot - 1, terminated by -1 */ | ||
1789 | fsi = n = newstblindex + newstblsize; | ||
1790 | f = &sp->slot[fsi]; | ||
1791 | for (fsi++; fsi < sp->header.maxslot; f++, fsi++) | ||
1792 | f->next = fsi; | ||
1793 | f->next = -1; | ||
1794 | |||
1795 | /* append new free region at tail of old freelist: walk to the slot whose next is -1 and link it to n */ | ||
1796 | fsi = sp->header.freelist; | ||
1797 | if (fsi == -1) | ||
1798 | sp->header.freelist = n; | ||
1799 | else { | ||
1800 | do { | ||
1801 | f = &sp->slot[fsi]; | ||
1802 | fsi = f->next; | ||
1803 | } while (fsi != -1); | ||
1804 | |||
1805 | f->next = n; | ||
1806 | } | ||
1807 | |||
1808 | sp->header.freecnt += sp->header.maxslot - n; | ||
1809 | |||
1810 | /* | ||
1811 | * insert the new entry | ||
1812 | */ | ||
1813 | dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); | ||
1814 | |||
1815 | BT_MARK_DIRTY(pmp, ip); | ||
1816 | /* | ||
1817 | * linelock any freeslots residing in old extent | ||
1818 | */ | ||
1819 | if (type == tlckEXTEND) { | ||
1820 | n = sp->header.maxslot >> 2; /* NOTE(review): a quarter of the new maxslot, presumably the slot count of the old extent - confirm */ | ||
1821 | if (sp->header.freelist < n) | ||
1822 | dtLinelockFreelist(sp, n, &dtlck); | ||
1823 | } | ||
1824 | |||
1825 | /* | ||
1826 | * update parent entry on the parent/root page | ||
1827 | */ | ||
1828 | /* | ||
1829 | * acquire a transaction lock on the parent/root page | ||
1830 | */ | ||
1831 | tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); | ||
1832 | dtlck = (struct dt_lock *) & tlck->lock; | ||
1833 | lv = & dtlck->lv[dtlck->index]; | ||
1834 | |||
1835 | /* linelock parent entry - 1st slot */ | ||
1836 | lv->offset = 1; | ||
1837 | lv->length = 1; | ||
1838 | dtlck->index++; | ||
1839 | |||
1840 | /* update the parent pxd for page extension: slot 1 of the parent is overwritten with the new descriptor */ | ||
1841 | tpxd = (pxd_t *) & pp->slot[1]; | ||
1842 | *tpxd = *pxd; | ||
1843 | |||
1844 | DT_PUTPAGE(pmp); | ||
1845 | return 0; | ||
1846 | } | ||
1847 | |||
1848 | |||
1849 | /* | ||
1850 | * dtSplitRoot() | ||
1851 | * | ||
1852 | * function: | ||
1853 | * split the full root page into | ||
1854 | * original/root/split page and new right page | ||
1855 | * i.e., root remains fixed in tree anchor (inode) and | ||
1856 | * the root is copied to a single new right child page | ||
1857 | * since root page << non-root page, and | ||
1858 | * the split root page contains a single entry for the | ||
1859 | * new right child page. | ||
1860 | * | ||
1861 | * parameter: | ||
1862 | * | ||
1863 | * return: 0 - success; | ||
1864 | * errno - failure; | ||
1865 | * return new page pinned; | ||
1866 | */ | ||
1867 | static int dtSplitRoot(tid_t tid, | ||
1868 | struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) | ||
1869 | { | ||
1870 | struct super_block *sb = ip->i_sb; | ||
1871 | struct metapage *smp; | ||
1872 | dtroot_t *sp; | ||
1873 | struct metapage *rmp; | ||
1874 | dtpage_t *rp; | ||
1875 | s64 rbn; | ||
1876 | int xlen; | ||
1877 | int xsize; | ||
1878 | struct dtslot *f; | ||
1879 | s8 *stbl; | ||
1880 | int fsi, stblsize, n; | ||
1881 | struct idtentry *s; | ||
1882 | pxd_t *ppxd; | ||
1883 | struct pxdlist *pxdlist; | ||
1884 | pxd_t *pxd; | ||
1885 | struct dt_lock *dtlck; | ||
1886 | struct tlock *tlck; | ||
1887 | struct lv *lv; | ||
1888 | |||
1889 | /* get split root page */ | ||
1890 | smp = split->mp; | ||
1891 | sp = &JFS_IP(ip)->i_dtroot; | ||
1892 | |||
1893 | /* | ||
1894 | * allocate/initialize a single (right) child page | ||
1895 | * | ||
1896 | * N.B. at first split, a one (or two) block to fit new entry | ||
1897 | * is allocated; at subsequent split, a full page is allocated; | ||
1898 | */ | ||
1899 | pxdlist = split->pxdlist; | ||
1900 | pxd = &pxdlist->pxd[pxdlist->npxd]; | ||
1901 | pxdlist->npxd++; | ||
1902 | rbn = addressPXD(pxd); | ||
1903 | xlen = lengthPXD(pxd); | ||
1904 | xsize = xlen << JFS_SBI(sb)->l2bsize; | ||
1905 | rmp = get_metapage(ip, rbn, xsize, 1); | ||
1906 | if (!rmp) | ||
1907 | return -EIO; | ||
1908 | |||
1909 | rp = rmp->data; | ||
1910 | |||
1911 | /* Allocate blocks to quota. */ | ||
1912 | if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { | ||
1913 | release_metapage(rmp); | ||
1914 | return -EDQUOT; | ||
1915 | } | ||
1916 | |||
1917 | BT_MARK_DIRTY(rmp, ip); | ||
1918 | /* | ||
1919 | * acquire a transaction lock on the new right page | ||
1920 | */ | ||
1921 | tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); | ||
1922 | dtlck = (struct dt_lock *) & tlck->lock; | ||
1923 | |||
1924 | rp->header.flag = | ||
1925 | (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; | ||
1926 | rp->header.self = *pxd; | ||
1927 | |||
1928 | /* initialize sibling pointers */ | ||
1929 | rp->header.next = 0; | ||
1930 | rp->header.prev = 0; | ||
1931 | |||
1932 | /* | ||
1933 | * move in-line root page into new right page extent | ||
1934 | */ | ||
1935 | /* linelock header + copied entries + new stbl (1st slot) in new page */ | ||
1936 | ASSERT(dtlck->index == 0); | ||
1937 | lv = & dtlck->lv[0]; | ||
1938 | lv->offset = 0; | ||
1939 | lv->length = 10; /* 1 + 8 + 1 */ | ||
1940 | dtlck->index++; | ||
1941 | |||
1942 | n = xsize >> L2DTSLOTSIZE; | ||
1943 | rp->header.maxslot = n; | ||
1944 | stblsize = (n + 31) >> L2DTSLOTSIZE; | ||
1945 | |||
1946 | /* copy old stbl to new stbl at start of extended area */ | ||
1947 | rp->header.stblindex = DTROOTMAXSLOT; | ||
1948 | stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; | ||
1949 | memcpy(stbl, sp->header.stbl, sp->header.nextindex); | ||
1950 | rp->header.nextindex = sp->header.nextindex; | ||
1951 | |||
1952 | /* copy old data area to start of new data area */ | ||
1953 | memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); | ||
1954 | |||
1955 | /* | ||
1956 | * append free region of newly extended area at tail of freelist | ||
1957 | */ | ||
1958 | /* init free region of newly extended area */ | ||
1959 | fsi = n = DTROOTMAXSLOT + stblsize; | ||
1960 | f = &rp->slot[fsi]; | ||
1961 | for (fsi++; fsi < rp->header.maxslot; f++, fsi++) | ||
1962 | f->next = fsi; | ||
1963 | f->next = -1; | ||
1964 | |||
1965 | /* append new free region at tail of old freelist */ | ||
1966 | fsi = sp->header.freelist; | ||
1967 | if (fsi == -1) | ||
1968 | rp->header.freelist = n; | ||
1969 | else { | ||
1970 | rp->header.freelist = fsi; | ||
1971 | |||
1972 | do { | ||
1973 | f = &rp->slot[fsi]; | ||
1974 | fsi = f->next; | ||
1975 | } while (fsi != -1); | ||
1976 | |||
1977 | f->next = n; | ||
1978 | } | ||
1979 | |||
1980 | rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; | ||
1981 | |||
1982 | /* | ||
1983 | * Update directory index table for entries now in right page | ||
1984 | */ | ||
1985 | if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { | ||
1986 | s64 lblock; | ||
1987 | struct metapage *mp = NULL; | ||
1988 | struct ldtentry *ldtentry; | ||
1989 | |||
1990 | stbl = DT_GETSTBL(rp); | ||
1991 | for (n = 0; n < rp->header.nextindex; n++) { | ||
1992 | ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; | ||
1993 | modify_index(tid, ip, le32_to_cpu(ldtentry->index), | ||
1994 | rbn, n, &mp, &lblock); | ||
1995 | } | ||
1996 | if (mp) | ||
1997 | release_metapage(mp); | ||
1998 | } | ||
1999 | /* | ||
2000 | * insert the new entry into the new right/child page | ||
2001 | * (skip index in the new right page will not change) | ||
2002 | */ | ||
2003 | dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); | ||
2004 | |||
2005 | /* | ||
2006 | * reset parent/root page | ||
2007 | * | ||
2008 | * set the 1st entry offset to 0, which force the left-most key | ||
2009 | * at any level of the tree to be less than any search key. | ||
2010 | * | ||
2011 | * The btree comparison code guarantees that the left-most key on any | ||
2012 | * level of the tree is never used, so it doesn't need to be filled in. | ||
2013 | */ | ||
2014 | BT_MARK_DIRTY(smp, ip); | ||
2015 | /* | ||
2016 | * acquire a transaction lock on the root page (in-memory inode) | ||
2017 | */ | ||
2018 | tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); | ||
2019 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2020 | |||
2021 | /* linelock root */ | ||
2022 | ASSERT(dtlck->index == 0); | ||
2023 | lv = & dtlck->lv[0]; | ||
2024 | lv->offset = 0; | ||
2025 | lv->length = DTROOTMAXSLOT; | ||
2026 | dtlck->index++; | ||
2027 | |||
2028 | /* update page header of root */ | ||
2029 | if (sp->header.flag & BT_LEAF) { | ||
2030 | sp->header.flag &= ~BT_LEAF; | ||
2031 | sp->header.flag |= BT_INTERNAL; | ||
2032 | } | ||
2033 | |||
2034 | /* init the first entry */ | ||
2035 | s = (struct idtentry *) & sp->slot[DTENTRYSTART]; | ||
2036 | ppxd = (pxd_t *) s; | ||
2037 | *ppxd = *pxd; | ||
2038 | s->next = -1; | ||
2039 | s->namlen = 0; | ||
2040 | |||
2041 | stbl = sp->header.stbl; | ||
2042 | stbl[0] = DTENTRYSTART; | ||
2043 | sp->header.nextindex = 1; | ||
2044 | |||
2045 | /* init freelist */ | ||
2046 | fsi = DTENTRYSTART + 1; | ||
2047 | f = &sp->slot[fsi]; | ||
2048 | |||
2049 | /* init free region of remaining area */ | ||
2050 | for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) | ||
2051 | f->next = fsi; | ||
2052 | f->next = -1; | ||
2053 | |||
2054 | sp->header.freelist = DTENTRYSTART + 1; | ||
2055 | sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); | ||
2056 | |||
2057 | *rmpp = rmp; | ||
2058 | |||
2059 | return 0; | ||
2060 | } | ||
2061 | |||
2062 | |||
2063 | /* | ||
2064 | * dtDelete() | ||
2065 | * | ||
2066 | * function: delete the entry(s) referenced by a key. | ||
2067 | * | ||
2068 | * parameter: | ||
2069 | * | ||
2070 | * return: | ||
2071 | */ | ||
2072 | int dtDelete(tid_t tid, | ||
2073 | struct inode *ip, struct component_name * key, ino_t * ino, int flag) | ||
2074 | { | ||
2075 | int rc = 0; | ||
2076 | s64 bn; | ||
2077 | struct metapage *mp, *imp; | ||
2078 | dtpage_t *p; | ||
2079 | int index; | ||
2080 | struct btstack btstack; | ||
2081 | struct dt_lock *dtlck; | ||
2082 | struct tlock *tlck; | ||
2083 | struct lv *lv; | ||
2084 | int i; | ||
2085 | struct ldtentry *ldtentry; | ||
2086 | u8 *stbl; | ||
2087 | u32 table_index, next_index; | ||
2088 | struct metapage *nmp; | ||
2089 | dtpage_t *np; | ||
2090 | |||
2091 | /* | ||
2092 | * search for the entry to delete: | ||
2093 | * | ||
2094 | * dtSearch() returns (leaf page pinned, index at which to delete). | ||
2095 | */ | ||
2096 | if ((rc = dtSearch(ip, key, ino, &btstack, flag))) | ||
2097 | return rc; | ||
2098 | |||
2099 | /* retrieve search result */ | ||
2100 | DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
2101 | |||
2102 | /* | ||
2103 | * We need to find put the index of the next entry into the | ||
2104 | * directory index table in order to resume a readdir from this | ||
2105 | * entry. | ||
2106 | */ | ||
2107 | if (DO_INDEX(ip)) { | ||
2108 | stbl = DT_GETSTBL(p); | ||
2109 | ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; | ||
2110 | table_index = le32_to_cpu(ldtentry->index); | ||
2111 | if (index == (p->header.nextindex - 1)) { | ||
2112 | /* | ||
2113 | * Last entry in this leaf page | ||
2114 | */ | ||
2115 | if ((p->header.flag & BT_ROOT) | ||
2116 | || (p->header.next == 0)) | ||
2117 | next_index = -1; | ||
2118 | else { | ||
2119 | /* Read next leaf page */ | ||
2120 | DT_GETPAGE(ip, le64_to_cpu(p->header.next), | ||
2121 | nmp, PSIZE, np, rc); | ||
2122 | if (rc) | ||
2123 | next_index = -1; | ||
2124 | else { | ||
2125 | stbl = DT_GETSTBL(np); | ||
2126 | ldtentry = | ||
2127 | (struct ldtentry *) & np-> | ||
2128 | slot[stbl[0]]; | ||
2129 | next_index = | ||
2130 | le32_to_cpu(ldtentry->index); | ||
2131 | DT_PUTPAGE(nmp); | ||
2132 | } | ||
2133 | } | ||
2134 | } else { | ||
2135 | ldtentry = | ||
2136 | (struct ldtentry *) & p->slot[stbl[index + 1]]; | ||
2137 | next_index = le32_to_cpu(ldtentry->index); | ||
2138 | } | ||
2139 | free_index(tid, ip, table_index, next_index); | ||
2140 | } | ||
2141 | /* | ||
2142 | * the leaf page becomes empty, delete the page | ||
2143 | */ | ||
2144 | if (p->header.nextindex == 1) { | ||
2145 | /* delete empty page */ | ||
2146 | rc = dtDeleteUp(tid, ip, mp, p, &btstack); | ||
2147 | } | ||
2148 | /* | ||
2149 | * the leaf page has other entries remaining: | ||
2150 | * | ||
2151 | * delete the entry from the leaf page. | ||
2152 | */ | ||
2153 | else { | ||
2154 | BT_MARK_DIRTY(mp, ip); | ||
2155 | /* | ||
2156 | * acquire a transaction lock on the leaf page | ||
2157 | */ | ||
2158 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); | ||
2159 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2160 | |||
2161 | /* | ||
2162 | * Do not assume that dtlck->index will be zero. During a | ||
2163 | * rename within a directory, this transaction may have | ||
2164 | * modified this page already when adding the new entry. | ||
2165 | */ | ||
2166 | |||
2167 | /* linelock header */ | ||
2168 | if (dtlck->index >= dtlck->maxcnt) | ||
2169 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
2170 | lv = & dtlck->lv[dtlck->index]; | ||
2171 | lv->offset = 0; | ||
2172 | lv->length = 1; | ||
2173 | dtlck->index++; | ||
2174 | |||
2175 | /* linelock stbl of non-root leaf page */ | ||
2176 | if (!(p->header.flag & BT_ROOT)) { | ||
2177 | if (dtlck->index >= dtlck->maxcnt) | ||
2178 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
2179 | lv = & dtlck->lv[dtlck->index]; | ||
2180 | i = index >> L2DTSLOTSIZE; | ||
2181 | lv->offset = p->header.stblindex + i; | ||
2182 | lv->length = | ||
2183 | ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - | ||
2184 | i + 1; | ||
2185 | dtlck->index++; | ||
2186 | } | ||
2187 | |||
2188 | /* free the leaf entry */ | ||
2189 | dtDeleteEntry(p, index, &dtlck); | ||
2190 | |||
2191 | /* | ||
2192 | * Update directory index table for entries moved in stbl | ||
2193 | */ | ||
2194 | if (DO_INDEX(ip) && index < p->header.nextindex) { | ||
2195 | s64 lblock; | ||
2196 | |||
2197 | imp = NULL; | ||
2198 | stbl = DT_GETSTBL(p); | ||
2199 | for (i = index; i < p->header.nextindex; i++) { | ||
2200 | ldtentry = | ||
2201 | (struct ldtentry *) & p->slot[stbl[i]]; | ||
2202 | modify_index(tid, ip, | ||
2203 | le32_to_cpu(ldtentry->index), | ||
2204 | bn, i, &imp, &lblock); | ||
2205 | } | ||
2206 | if (imp) | ||
2207 | release_metapage(imp); | ||
2208 | } | ||
2209 | |||
2210 | DT_PUTPAGE(mp); | ||
2211 | } | ||
2212 | |||
2213 | return rc; | ||
2214 | } | ||
2215 | |||
2216 | |||
2217 | /* | ||
2218 | * dtDeleteUp() | ||
2219 | * | ||
2220 | * function: | ||
2221 | * free empty pages as propagating deletion up the tree | ||
2222 | * | ||
2223 | * parameter: | ||
2224 | * | ||
2225 | * return: | ||
2226 | */ | ||
2227 | static int dtDeleteUp(tid_t tid, struct inode *ip, | ||
2228 | struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) | ||
2229 | { | ||
2230 | int rc = 0; | ||
2231 | struct metapage *mp; | ||
2232 | dtpage_t *p; | ||
2233 | int index, nextindex; | ||
2234 | int xlen; | ||
2235 | struct btframe *parent; | ||
2236 | struct dt_lock *dtlck; | ||
2237 | struct tlock *tlck; | ||
2238 | struct lv *lv; | ||
2239 | struct pxd_lock *pxdlock; | ||
2240 | int i; | ||
2241 | |||
2242 | /* | ||
2243 | * keep the root leaf page which has become empty | ||
2244 | */ | ||
2245 | if (BT_IS_ROOT(fmp)) { | ||
2246 | /* | ||
2247 | * reset the root | ||
2248 | * | ||
2249 | * dtInitRoot() acquires txlock on the root | ||
2250 | */ | ||
2251 | dtInitRoot(tid, ip, PARENT(ip)); | ||
2252 | |||
2253 | DT_PUTPAGE(fmp); | ||
2254 | |||
2255 | return 0; | ||
2256 | } | ||
2257 | |||
2258 | /* | ||
2259 | * free the non-root leaf page | ||
2260 | */ | ||
2261 | /* | ||
2262 | * acquire a transaction lock on the page | ||
2263 | * | ||
2264 | * write FREEXTENT|NOREDOPAGE log record | ||
2265 | * N.B. linelock is overlaid as freed extent descriptor, and | ||
2266 | * the buffer page is freed; | ||
2267 | */ | ||
2268 | tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); | ||
2269 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
2270 | pxdlock->flag = mlckFREEPXD; | ||
2271 | pxdlock->pxd = fp->header.self; | ||
2272 | pxdlock->index = 1; | ||
2273 | |||
2274 | /* update sibling pointers */ | ||
2275 | if ((rc = dtRelink(tid, ip, fp))) { | ||
2276 | BT_PUTPAGE(fmp); | ||
2277 | return rc; | ||
2278 | } | ||
2279 | |||
2280 | xlen = lengthPXD(&fp->header.self); | ||
2281 | |||
2282 | /* Free quota allocation. */ | ||
2283 | DQUOT_FREE_BLOCK(ip, xlen); | ||
2284 | |||
2285 | /* free/invalidate its buffer page */ | ||
2286 | discard_metapage(fmp); | ||
2287 | |||
2288 | /* | ||
2289 | * propagate page deletion up the directory tree | ||
2290 | * | ||
2291 | * If the delete from the parent page makes it empty, | ||
2292 | * continue all the way up the tree. | ||
2293 | * stop if the root page is reached (which is never deleted) or | ||
2294 | * if the entry deletion does not empty the page. | ||
2295 | */ | ||
2296 | while ((parent = BT_POP(btstack)) != NULL) { | ||
2297 | /* pin the parent page <sp> */ | ||
2298 | DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); | ||
2299 | if (rc) | ||
2300 | return rc; | ||
2301 | |||
2302 | /* | ||
2303 | * free the extent of the child page deleted | ||
2304 | */ | ||
2305 | index = parent->index; | ||
2306 | |||
2307 | /* | ||
2308 | * delete the entry for the child page from parent | ||
2309 | */ | ||
2310 | nextindex = p->header.nextindex; | ||
2311 | |||
2312 | /* | ||
2313 | * the parent has the single entry being deleted: | ||
2314 | * | ||
2315 | * free the parent page which has become empty. | ||
2316 | */ | ||
2317 | if (nextindex == 1) { | ||
2318 | /* | ||
2319 | * keep the root internal page which has become empty | ||
2320 | */ | ||
2321 | if (p->header.flag & BT_ROOT) { | ||
2322 | /* | ||
2323 | * reset the root | ||
2324 | * | ||
2325 | * dtInitRoot() acquires txlock on the root | ||
2326 | */ | ||
2327 | dtInitRoot(tid, ip, PARENT(ip)); | ||
2328 | |||
2329 | DT_PUTPAGE(mp); | ||
2330 | |||
2331 | return 0; | ||
2332 | } | ||
2333 | /* | ||
2334 | * free the parent page | ||
2335 | */ | ||
2336 | else { | ||
2337 | /* | ||
2338 | * acquire a transaction lock on the page | ||
2339 | * | ||
2340 | * write FREEXTENT|NOREDOPAGE log record | ||
2341 | */ | ||
2342 | tlck = | ||
2343 | txMaplock(tid, ip, | ||
2344 | tlckDTREE | tlckFREE); | ||
2345 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
2346 | pxdlock->flag = mlckFREEPXD; | ||
2347 | pxdlock->pxd = p->header.self; | ||
2348 | pxdlock->index = 1; | ||
2349 | |||
2350 | /* update sibling pointers */ | ||
2351 | if ((rc = dtRelink(tid, ip, p))) { | ||
2352 | DT_PUTPAGE(mp); | ||
2353 | return rc; | ||
2354 | } | ||
2355 | |||
2356 | xlen = lengthPXD(&p->header.self); | ||
2357 | |||
2358 | /* Free quota allocation */ | ||
2359 | DQUOT_FREE_BLOCK(ip, xlen); | ||
2360 | |||
2361 | /* free/invalidate its buffer page */ | ||
2362 | discard_metapage(mp); | ||
2363 | |||
2364 | /* propagate up */ | ||
2365 | continue; | ||
2366 | } | ||
2367 | } | ||
2368 | |||
2369 | /* | ||
2370 | * the parent has other entries remaining: | ||
2371 | * | ||
2372 | * delete the router entry from the parent page. | ||
2373 | */ | ||
2374 | BT_MARK_DIRTY(mp, ip); | ||
2375 | /* | ||
2376 | * acquire a transaction lock on the page | ||
2377 | * | ||
2378 | * action: router entry deletion | ||
2379 | */ | ||
2380 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); | ||
2381 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2382 | |||
2383 | /* linelock header */ | ||
2384 | if (dtlck->index >= dtlck->maxcnt) | ||
2385 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
2386 | lv = & dtlck->lv[dtlck->index]; | ||
2387 | lv->offset = 0; | ||
2388 | lv->length = 1; | ||
2389 | dtlck->index++; | ||
2390 | |||
2391 | /* linelock stbl of non-root leaf page */ | ||
2392 | if (!(p->header.flag & BT_ROOT)) { | ||
2393 | if (dtlck->index < dtlck->maxcnt) | ||
2394 | lv++; | ||
2395 | else { | ||
2396 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
2397 | lv = & dtlck->lv[0]; | ||
2398 | } | ||
2399 | i = index >> L2DTSLOTSIZE; | ||
2400 | lv->offset = p->header.stblindex + i; | ||
2401 | lv->length = | ||
2402 | ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - | ||
2403 | i + 1; | ||
2404 | dtlck->index++; | ||
2405 | } | ||
2406 | |||
2407 | /* free the router entry */ | ||
2408 | dtDeleteEntry(p, index, &dtlck); | ||
2409 | |||
2410 | /* reset key of new leftmost entry of level (for consistency) */ | ||
2411 | if (index == 0 && | ||
2412 | ((p->header.flag & BT_ROOT) || p->header.prev == 0)) | ||
2413 | dtTruncateEntry(p, 0, &dtlck); | ||
2414 | |||
2415 | /* unpin the parent page */ | ||
2416 | DT_PUTPAGE(mp); | ||
2417 | |||
2418 | /* exit propagation up */ | ||
2419 | break; | ||
2420 | } | ||
2421 | |||
2422 | return 0; | ||
2423 | } | ||
2424 | |||
2425 | #ifdef _NOTYET | ||
2426 | /* | ||
2427 | * NAME: dtRelocate() | ||
2428 | * | ||
2429 | * FUNCTION: relocate dtpage (internal or leaf) of directory; | ||
2430 | * This function is mainly used by defragfs utility. | ||
2431 | */ | ||
2432 | int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, | ||
2433 | s64 nxaddr) | ||
2434 | { | ||
2435 | int rc = 0; | ||
2436 | struct metapage *mp, *pmp, *lmp, *rmp; | ||
2437 | dtpage_t *p, *pp, *rp = 0, *lp= 0; | ||
2438 | s64 bn; | ||
2439 | int index; | ||
2440 | struct btstack btstack; | ||
2441 | pxd_t *pxd; | ||
2442 | s64 oxaddr, nextbn, prevbn; | ||
2443 | int xlen, xsize; | ||
2444 | struct tlock *tlck; | ||
2445 | struct dt_lock *dtlck; | ||
2446 | struct pxd_lock *pxdlock; | ||
2447 | s8 *stbl; | ||
2448 | struct lv *lv; | ||
2449 | |||
2450 | oxaddr = addressPXD(opxd); | ||
2451 | xlen = lengthPXD(opxd); | ||
2452 | |||
2453 | jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d", | ||
2454 | (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr, | ||
2455 | xlen); | ||
2456 | |||
2457 | /* | ||
2458 | * 1. get the internal parent dtpage covering | ||
2459 | * router entry for the tartget page to be relocated; | ||
2460 | */ | ||
2461 | rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); | ||
2462 | if (rc) | ||
2463 | return rc; | ||
2464 | |||
2465 | /* retrieve search result */ | ||
2466 | DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); | ||
2467 | jfs_info("dtRelocate: parent router entry validated."); | ||
2468 | |||
2469 | /* | ||
2470 | * 2. relocate the target dtpage | ||
2471 | */ | ||
2472 | /* read in the target page from src extent */ | ||
2473 | DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); | ||
2474 | if (rc) { | ||
2475 | /* release the pinned parent page */ | ||
2476 | DT_PUTPAGE(pmp); | ||
2477 | return rc; | ||
2478 | } | ||
2479 | |||
2480 | /* | ||
2481 | * read in sibling pages if any to update sibling pointers; | ||
2482 | */ | ||
2483 | rmp = NULL; | ||
2484 | if (p->header.next) { | ||
2485 | nextbn = le64_to_cpu(p->header.next); | ||
2486 | DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); | ||
2487 | if (rc) { | ||
2488 | DT_PUTPAGE(mp); | ||
2489 | DT_PUTPAGE(pmp); | ||
2490 | return (rc); | ||
2491 | } | ||
2492 | } | ||
2493 | |||
2494 | lmp = NULL; | ||
2495 | if (p->header.prev) { | ||
2496 | prevbn = le64_to_cpu(p->header.prev); | ||
2497 | DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); | ||
2498 | if (rc) { | ||
2499 | DT_PUTPAGE(mp); | ||
2500 | DT_PUTPAGE(pmp); | ||
2501 | if (rmp) | ||
2502 | DT_PUTPAGE(rmp); | ||
2503 | return (rc); | ||
2504 | } | ||
2505 | } | ||
2506 | |||
2507 | /* at this point, all xtpages to be updated are in memory */ | ||
2508 | |||
2509 | /* | ||
2510 | * update sibling pointers of sibling dtpages if any; | ||
2511 | */ | ||
2512 | if (lmp) { | ||
2513 | tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); | ||
2514 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2515 | /* linelock header */ | ||
2516 | ASSERT(dtlck->index == 0); | ||
2517 | lv = & dtlck->lv[0]; | ||
2518 | lv->offset = 0; | ||
2519 | lv->length = 1; | ||
2520 | dtlck->index++; | ||
2521 | |||
2522 | lp->header.next = cpu_to_le64(nxaddr); | ||
2523 | DT_PUTPAGE(lmp); | ||
2524 | } | ||
2525 | |||
2526 | if (rmp) { | ||
2527 | tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); | ||
2528 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2529 | /* linelock header */ | ||
2530 | ASSERT(dtlck->index == 0); | ||
2531 | lv = & dtlck->lv[0]; | ||
2532 | lv->offset = 0; | ||
2533 | lv->length = 1; | ||
2534 | dtlck->index++; | ||
2535 | |||
2536 | rp->header.prev = cpu_to_le64(nxaddr); | ||
2537 | DT_PUTPAGE(rmp); | ||
2538 | } | ||
2539 | |||
2540 | /* | ||
2541 | * update the target dtpage to be relocated | ||
2542 | * | ||
2543 | * write LOG_REDOPAGE of LOG_NEW type for dst page | ||
2544 | * for the whole target page (logredo() will apply | ||
2545 | * after image and update bmap for allocation of the | ||
2546 | * dst extent), and update bmap for allocation of | ||
2547 | * the dst extent; | ||
2548 | */ | ||
2549 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); | ||
2550 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2551 | /* linelock header */ | ||
2552 | ASSERT(dtlck->index == 0); | ||
2553 | lv = & dtlck->lv[0]; | ||
2554 | |||
2555 | /* update the self address in the dtpage header */ | ||
2556 | pxd = &p->header.self; | ||
2557 | PXDaddress(pxd, nxaddr); | ||
2558 | |||
2559 | /* the dst page is the same as the src page, i.e., | ||
2560 | * linelock for afterimage of the whole page; | ||
2561 | */ | ||
2562 | lv->offset = 0; | ||
2563 | lv->length = p->header.maxslot; | ||
2564 | dtlck->index++; | ||
2565 | |||
2566 | /* update the buffer extent descriptor of the dtpage */ | ||
2567 | xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; | ||
2568 | #ifdef _STILL_TO_PORT | ||
2569 | bmSetXD(mp, nxaddr, xsize); | ||
2570 | #endif /* _STILL_TO_PORT */ | ||
2571 | /* unpin the relocated page */ | ||
2572 | DT_PUTPAGE(mp); | ||
2573 | jfs_info("dtRelocate: target dtpage relocated."); | ||
2574 | |||
2575 | /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec | ||
2576 | * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec | ||
2577 | * will also force a bmap update ). | ||
2578 | */ | ||
2579 | |||
2580 | /* | ||
2581 | * 3. acquire maplock for the source extent to be freed; | ||
2582 | */ | ||
2583 | /* for dtpage relocation, write a LOG_NOREDOPAGE record | ||
2584 | * for the source dtpage (logredo() will init NoRedoPage | ||
2585 | * filter and will also update bmap for free of the source | ||
2586 | * dtpage), and upadte bmap for free of the source dtpage; | ||
2587 | */ | ||
2588 | tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); | ||
2589 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
2590 | pxdlock->flag = mlckFREEPXD; | ||
2591 | PXDaddress(&pxdlock->pxd, oxaddr); | ||
2592 | PXDlength(&pxdlock->pxd, xlen); | ||
2593 | pxdlock->index = 1; | ||
2594 | |||
2595 | /* | ||
2596 | * 4. update the parent router entry for relocation; | ||
2597 | * | ||
2598 | * acquire tlck for the parent entry covering the target dtpage; | ||
2599 | * write LOG_REDOPAGE to apply after image only; | ||
2600 | */ | ||
2601 | jfs_info("dtRelocate: update parent router entry."); | ||
2602 | tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); | ||
2603 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2604 | lv = & dtlck->lv[dtlck->index]; | ||
2605 | |||
2606 | /* update the PXD with the new address */ | ||
2607 | stbl = DT_GETSTBL(pp); | ||
2608 | pxd = (pxd_t *) & pp->slot[stbl[index]]; | ||
2609 | PXDaddress(pxd, nxaddr); | ||
2610 | lv->offset = stbl[index]; | ||
2611 | lv->length = 1; | ||
2612 | dtlck->index++; | ||
2613 | |||
2614 | /* unpin the parent dtpage */ | ||
2615 | DT_PUTPAGE(pmp); | ||
2616 | |||
2617 | return rc; | ||
2618 | } | ||
2619 | |||
2620 | /* | ||
2621 | * NAME: dtSearchNode() | ||
2622 | * | ||
2623 | * FUNCTION: Search for an dtpage containing a specified address | ||
2624 | * This function is mainly used by defragfs utility. | ||
2625 | * | ||
2626 | * NOTE: Search result on stack, the found page is pinned at exit. | ||
2627 | * The result page must be an internal dtpage. | ||
2628 | * lmxaddr give the address of the left most page of the | ||
2629 | * dtree level, in which the required dtpage resides. | ||
2630 | */ | ||
2631 | static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, | ||
2632 | struct btstack * btstack) | ||
2633 | { | ||
2634 | int rc = 0; | ||
2635 | s64 bn; | ||
2636 | struct metapage *mp; | ||
2637 | dtpage_t *p; | ||
2638 | int psize = 288; /* initial in-line directory */ | ||
2639 | s8 *stbl; | ||
2640 | int i; | ||
2641 | pxd_t *pxd; | ||
2642 | struct btframe *btsp; | ||
2643 | |||
2644 | BT_CLR(btstack); /* reset stack */ | ||
2645 | |||
2646 | /* | ||
2647 | * descend tree to the level with specified leftmost page | ||
2648 | * | ||
2649 | * by convention, root bn = 0. | ||
2650 | */ | ||
2651 | for (bn = 0;;) { | ||
2652 | /* get/pin the page to search */ | ||
2653 | DT_GETPAGE(ip, bn, mp, psize, p, rc); | ||
2654 | if (rc) | ||
2655 | return rc; | ||
2656 | |||
2657 | /* does the xaddr of leftmost page of the levevl | ||
2658 | * matches levevl search key ? | ||
2659 | */ | ||
2660 | if (p->header.flag & BT_ROOT) { | ||
2661 | if (lmxaddr == 0) | ||
2662 | break; | ||
2663 | } else if (addressPXD(&p->header.self) == lmxaddr) | ||
2664 | break; | ||
2665 | |||
2666 | /* | ||
2667 | * descend down to leftmost child page | ||
2668 | */ | ||
2669 | if (p->header.flag & BT_LEAF) { | ||
2670 | DT_PUTPAGE(mp); | ||
2671 | return -ESTALE; | ||
2672 | } | ||
2673 | |||
2674 | /* get the leftmost entry */ | ||
2675 | stbl = DT_GETSTBL(p); | ||
2676 | pxd = (pxd_t *) & p->slot[stbl[0]]; | ||
2677 | |||
2678 | /* get the child page block address */ | ||
2679 | bn = addressPXD(pxd); | ||
2680 | psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; | ||
2681 | /* unpin the parent page */ | ||
2682 | DT_PUTPAGE(mp); | ||
2683 | } | ||
2684 | |||
2685 | /* | ||
2686 | * search each page at the current levevl | ||
2687 | */ | ||
2688 | loop: | ||
2689 | stbl = DT_GETSTBL(p); | ||
2690 | for (i = 0; i < p->header.nextindex; i++) { | ||
2691 | pxd = (pxd_t *) & p->slot[stbl[i]]; | ||
2692 | |||
2693 | /* found the specified router entry */ | ||
2694 | if (addressPXD(pxd) == addressPXD(kpxd) && | ||
2695 | lengthPXD(pxd) == lengthPXD(kpxd)) { | ||
2696 | btsp = btstack->top; | ||
2697 | btsp->bn = bn; | ||
2698 | btsp->index = i; | ||
2699 | btsp->mp = mp; | ||
2700 | |||
2701 | return 0; | ||
2702 | } | ||
2703 | } | ||
2704 | |||
2705 | /* get the right sibling page if any */ | ||
2706 | if (p->header.next) | ||
2707 | bn = le64_to_cpu(p->header.next); | ||
2708 | else { | ||
2709 | DT_PUTPAGE(mp); | ||
2710 | return -ESTALE; | ||
2711 | } | ||
2712 | |||
2713 | /* unpin current page */ | ||
2714 | DT_PUTPAGE(mp); | ||
2715 | |||
2716 | /* get the right sibling page */ | ||
2717 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
2718 | if (rc) | ||
2719 | return rc; | ||
2720 | |||
2721 | goto loop; | ||
2722 | } | ||
2723 | #endif /* _NOTYET */ | ||
2724 | |||
2725 | /* | ||
2726 | * dtRelink() | ||
2727 | * | ||
2728 | * function: | ||
2729 | * link around a freed page. | ||
2730 | * | ||
2731 | * parameter: | ||
2732 | * fp: page to be freed | ||
2733 | * | ||
2734 | * return: | ||
2735 | */ | ||
2736 | static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) | ||
2737 | { | ||
2738 | int rc; | ||
2739 | struct metapage *mp; | ||
2740 | s64 nextbn, prevbn; | ||
2741 | struct tlock *tlck; | ||
2742 | struct dt_lock *dtlck; | ||
2743 | struct lv *lv; | ||
2744 | |||
2745 | nextbn = le64_to_cpu(p->header.next); | ||
2746 | prevbn = le64_to_cpu(p->header.prev); | ||
2747 | |||
2748 | /* update prev pointer of the next page */ | ||
2749 | if (nextbn != 0) { | ||
2750 | DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); | ||
2751 | if (rc) | ||
2752 | return rc; | ||
2753 | |||
2754 | BT_MARK_DIRTY(mp, ip); | ||
2755 | /* | ||
2756 | * acquire a transaction lock on the next page | ||
2757 | * | ||
2758 | * action: update prev pointer; | ||
2759 | */ | ||
2760 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); | ||
2761 | jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", | ||
2762 | tlck, ip, mp); | ||
2763 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2764 | |||
2765 | /* linelock header */ | ||
2766 | if (dtlck->index >= dtlck->maxcnt) | ||
2767 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
2768 | lv = & dtlck->lv[dtlck->index]; | ||
2769 | lv->offset = 0; | ||
2770 | lv->length = 1; | ||
2771 | dtlck->index++; | ||
2772 | |||
2773 | p->header.prev = cpu_to_le64(prevbn); | ||
2774 | DT_PUTPAGE(mp); | ||
2775 | } | ||
2776 | |||
2777 | /* update next pointer of the previous page */ | ||
2778 | if (prevbn != 0) { | ||
2779 | DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); | ||
2780 | if (rc) | ||
2781 | return rc; | ||
2782 | |||
2783 | BT_MARK_DIRTY(mp, ip); | ||
2784 | /* | ||
2785 | * acquire a transaction lock on the prev page | ||
2786 | * | ||
2787 | * action: update next pointer; | ||
2788 | */ | ||
2789 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); | ||
2790 | jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", | ||
2791 | tlck, ip, mp); | ||
2792 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2793 | |||
2794 | /* linelock header */ | ||
2795 | if (dtlck->index >= dtlck->maxcnt) | ||
2796 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
2797 | lv = & dtlck->lv[dtlck->index]; | ||
2798 | lv->offset = 0; | ||
2799 | lv->length = 1; | ||
2800 | dtlck->index++; | ||
2801 | |||
2802 | p->header.next = cpu_to_le64(nextbn); | ||
2803 | DT_PUTPAGE(mp); | ||
2804 | } | ||
2805 | |||
2806 | return 0; | ||
2807 | } | ||
2808 | |||
2809 | |||
2810 | /* | ||
2811 | * dtInitRoot() | ||
2812 | * | ||
2813 | * initialize directory root (inline in inode) | ||
2814 | */ | ||
2815 | void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) | ||
2816 | { | ||
2817 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
2818 | dtroot_t *p; | ||
2819 | int fsi; | ||
2820 | struct dtslot *f; | ||
2821 | struct tlock *tlck; | ||
2822 | struct dt_lock *dtlck; | ||
2823 | struct lv *lv; | ||
2824 | u16 xflag_save; | ||
2825 | |||
2826 | /* | ||
2827 | * If this was previously an non-empty directory, we need to remove | ||
2828 | * the old directory table. | ||
2829 | */ | ||
2830 | if (DO_INDEX(ip)) { | ||
2831 | if (!jfs_dirtable_inline(ip)) { | ||
2832 | struct tblock *tblk = tid_to_tblock(tid); | ||
2833 | /* | ||
2834 | * We're playing games with the tid's xflag. If | ||
2835 | * we're removing a regular file, the file's xtree | ||
2836 | * is committed with COMMIT_PMAP, but we always | ||
2837 | * commit the directories xtree with COMMIT_PWMAP. | ||
2838 | */ | ||
2839 | xflag_save = tblk->xflag; | ||
2840 | tblk->xflag = 0; | ||
2841 | /* | ||
2842 | * xtTruncate isn't guaranteed to fully truncate | ||
2843 | * the xtree. The caller needs to check i_size | ||
2844 | * after committing the transaction to see if | ||
2845 | * additional truncation is needed. The | ||
2846 | * COMMIT_Stale flag tells caller that we | ||
2847 | * initiated the truncation. | ||
2848 | */ | ||
2849 | xtTruncate(tid, ip, 0, COMMIT_PWMAP); | ||
2850 | set_cflag(COMMIT_Stale, ip); | ||
2851 | |||
2852 | tblk->xflag = xflag_save; | ||
2853 | } else | ||
2854 | ip->i_size = 1; | ||
2855 | |||
2856 | jfs_ip->next_index = 2; | ||
2857 | } else | ||
2858 | ip->i_size = IDATASIZE; | ||
2859 | |||
2860 | /* | ||
2861 | * acquire a transaction lock on the root | ||
2862 | * | ||
2863 | * action: directory initialization; | ||
2864 | */ | ||
2865 | tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, | ||
2866 | tlckDTREE | tlckENTRY | tlckBTROOT); | ||
2867 | dtlck = (struct dt_lock *) & tlck->lock; | ||
2868 | |||
2869 | /* linelock root */ | ||
2870 | ASSERT(dtlck->index == 0); | ||
2871 | lv = & dtlck->lv[0]; | ||
2872 | lv->offset = 0; | ||
2873 | lv->length = DTROOTMAXSLOT; | ||
2874 | dtlck->index++; | ||
2875 | |||
2876 | p = &jfs_ip->i_dtroot; | ||
2877 | |||
2878 | p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; | ||
2879 | |||
2880 | p->header.nextindex = 0; | ||
2881 | |||
2882 | /* init freelist */ | ||
2883 | fsi = 1; | ||
2884 | f = &p->slot[fsi]; | ||
2885 | |||
2886 | /* init data area of root */ | ||
2887 | for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) | ||
2888 | f->next = fsi; | ||
2889 | f->next = -1; | ||
2890 | |||
2891 | p->header.freelist = 1; | ||
2892 | p->header.freecnt = 8; | ||
2893 | |||
2894 | /* init '..' entry */ | ||
2895 | p->header.idotdot = cpu_to_le32(idotdot); | ||
2896 | |||
2897 | return; | ||
2898 | } | ||
2899 | |||
2900 | /* | ||
2901 | * add_missing_indices() | ||
2902 | * | ||
2903 | * function: Fix dtree page in which one or more entries has an invalid index. | ||
2904 | * fsck.jfs should really fix this, but it currently does not. | ||
2905 | * Called from jfs_readdir when bad index is detected. | ||
2906 | */ | ||
2907 | static void add_missing_indices(struct inode *inode, s64 bn) | ||
2908 | { | ||
2909 | struct ldtentry *d; | ||
2910 | struct dt_lock *dtlck; | ||
2911 | int i; | ||
2912 | uint index; | ||
2913 | struct lv *lv; | ||
2914 | struct metapage *mp; | ||
2915 | dtpage_t *p; | ||
2916 | int rc; | ||
2917 | s8 *stbl; | ||
2918 | tid_t tid; | ||
2919 | struct tlock *tlck; | ||
2920 | |||
2921 | tid = txBegin(inode->i_sb, 0); | ||
2922 | |||
2923 | DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); | ||
2924 | |||
2925 | if (rc) { | ||
2926 | printk(KERN_ERR "DT_GETPAGE failed!\n"); | ||
2927 | goto end; | ||
2928 | } | ||
2929 | BT_MARK_DIRTY(mp, inode); | ||
2930 | |||
2931 | ASSERT(p->header.flag & BT_LEAF); | ||
2932 | |||
2933 | tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); | ||
2934 | dtlck = (struct dt_lock *) &tlck->lock; | ||
2935 | |||
2936 | stbl = DT_GETSTBL(p); | ||
2937 | for (i = 0; i < p->header.nextindex; i++) { | ||
2938 | d = (struct ldtentry *) &p->slot[stbl[i]]; | ||
2939 | index = le32_to_cpu(d->index); | ||
2940 | if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { | ||
2941 | d->index = cpu_to_le32(add_index(tid, inode, bn, i)); | ||
2942 | if (dtlck->index >= dtlck->maxcnt) | ||
2943 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
2944 | lv = &dtlck->lv[dtlck->index]; | ||
2945 | lv->offset = stbl[i]; | ||
2946 | lv->length = 1; | ||
2947 | dtlck->index++; | ||
2948 | } | ||
2949 | } | ||
2950 | |||
2951 | DT_PUTPAGE(mp); | ||
2952 | (void) txCommit(tid, 1, &inode, 0); | ||
2953 | end: | ||
2954 | txEnd(tid); | ||
2955 | } | ||
2956 | |||
2957 | /* | ||
2958 | * Buffer to hold directory entry info while traversing a dtree page | ||
2959 | * before being fed to the filldir function | ||
2960 | */ | ||
2961 | struct jfs_dirent { | ||
2962 | loff_t position; | ||
2963 | int ino; | ||
2964 | u16 name_len; | ||
2965 | char name[0]; | ||
2966 | }; | ||
2967 | |||
2968 | /* | ||
2969 | * function to determine next variable-sized jfs_dirent in buffer | ||
2970 | */ | ||
2971 | static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) | ||
2972 | { | ||
2973 | return (struct jfs_dirent *) | ||
2974 | ((char *)dirent + | ||
2975 | ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + | ||
2976 | sizeof (loff_t) - 1) & | ||
2977 | ~(sizeof (loff_t) - 1))); | ||
2978 | } | ||
2979 | |||
2980 | /* | ||
2981 | * jfs_readdir() | ||
2982 | * | ||
2983 | * function: read directory entries sequentially | ||
2984 | * from the specified entry offset | ||
2985 | * | ||
2986 | * parameter: | ||
2987 | * | ||
2988 | * return: offset = (pn, index) of start entry | ||
2989 | * of next jfs_readdir()/dtRead() | ||
2990 | */ | ||
2991 | int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | ||
2992 | { | ||
2993 | struct inode *ip = filp->f_dentry->d_inode; | ||
2994 | struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; | ||
2995 | int rc = 0; | ||
2996 | loff_t dtpos; /* legacy OS/2 style position */ | ||
2997 | struct dtoffset { | ||
2998 | s16 pn; | ||
2999 | s16 index; | ||
3000 | s32 unused; | ||
3001 | } *dtoffset = (struct dtoffset *) &dtpos; | ||
3002 | s64 bn; | ||
3003 | struct metapage *mp; | ||
3004 | dtpage_t *p; | ||
3005 | int index; | ||
3006 | s8 *stbl; | ||
3007 | struct btstack btstack; | ||
3008 | int i, next; | ||
3009 | struct ldtentry *d; | ||
3010 | struct dtslot *t; | ||
3011 | int d_namleft, len, outlen; | ||
3012 | unsigned long dirent_buf; | ||
3013 | char *name_ptr; | ||
3014 | u32 dir_index; | ||
3015 | int do_index = 0; | ||
3016 | uint loop_count = 0; | ||
3017 | struct jfs_dirent *jfs_dirent; | ||
3018 | int jfs_dirents; | ||
3019 | int overflow, fix_page, page_fixed = 0; | ||
3020 | static int unique_pos = 2; /* If we can't fix broken index */ | ||
3021 | |||
3022 | if (filp->f_pos == DIREND) | ||
3023 | return 0; | ||
3024 | |||
3025 | if (DO_INDEX(ip)) { | ||
3026 | /* | ||
3027 | * persistent index is stored in directory entries. | ||
3028 | * Special cases: 0 = . | ||
3029 | * 1 = .. | ||
3030 | * -1 = End of directory | ||
3031 | */ | ||
3032 | do_index = 1; | ||
3033 | |||
3034 | dir_index = (u32) filp->f_pos; | ||
3035 | |||
3036 | if (dir_index > 1) { | ||
3037 | struct dir_table_slot dirtab_slot; | ||
3038 | |||
3039 | if (dtEmpty(ip) || | ||
3040 | (dir_index >= JFS_IP(ip)->next_index)) { | ||
3041 | /* Stale position. Directory has shrunk */ | ||
3042 | filp->f_pos = DIREND; | ||
3043 | return 0; | ||
3044 | } | ||
3045 | repeat: | ||
3046 | rc = read_index(ip, dir_index, &dirtab_slot); | ||
3047 | if (rc) { | ||
3048 | filp->f_pos = DIREND; | ||
3049 | return rc; | ||
3050 | } | ||
3051 | if (dirtab_slot.flag == DIR_INDEX_FREE) { | ||
3052 | if (loop_count++ > JFS_IP(ip)->next_index) { | ||
3053 | jfs_err("jfs_readdir detected " | ||
3054 | "infinite loop!"); | ||
3055 | filp->f_pos = DIREND; | ||
3056 | return 0; | ||
3057 | } | ||
3058 | dir_index = le32_to_cpu(dirtab_slot.addr2); | ||
3059 | if (dir_index == -1) { | ||
3060 | filp->f_pos = DIREND; | ||
3061 | return 0; | ||
3062 | } | ||
3063 | goto repeat; | ||
3064 | } | ||
3065 | bn = addressDTS(&dirtab_slot); | ||
3066 | index = dirtab_slot.slot; | ||
3067 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3068 | if (rc) { | ||
3069 | filp->f_pos = DIREND; | ||
3070 | return 0; | ||
3071 | } | ||
3072 | if (p->header.flag & BT_INTERNAL) { | ||
3073 | jfs_err("jfs_readdir: bad index table"); | ||
3074 | DT_PUTPAGE(mp); | ||
3075 | filp->f_pos = -1; | ||
3076 | return 0; | ||
3077 | } | ||
3078 | } else { | ||
3079 | if (dir_index == 0) { | ||
3080 | /* | ||
3081 | * self "." | ||
3082 | */ | ||
3083 | filp->f_pos = 0; | ||
3084 | if (filldir(dirent, ".", 1, 0, ip->i_ino, | ||
3085 | DT_DIR)) | ||
3086 | return 0; | ||
3087 | } | ||
3088 | /* | ||
3089 | * parent ".." | ||
3090 | */ | ||
3091 | filp->f_pos = 1; | ||
3092 | if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) | ||
3093 | return 0; | ||
3094 | |||
3095 | /* | ||
3096 | * Find first entry of left-most leaf | ||
3097 | */ | ||
3098 | if (dtEmpty(ip)) { | ||
3099 | filp->f_pos = DIREND; | ||
3100 | return 0; | ||
3101 | } | ||
3102 | |||
3103 | if ((rc = dtReadFirst(ip, &btstack))) | ||
3104 | return rc; | ||
3105 | |||
3106 | DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
3107 | } | ||
3108 | } else { | ||
3109 | /* | ||
3110 | * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 | ||
3111 | * | ||
3112 | * pn = index = 0: First entry "." | ||
3113 | * pn = 0; index = 1: Second entry ".." | ||
3114 | * pn > 0: Real entries, pn=1 -> leftmost page | ||
3115 | * pn = index = -1: No more entries | ||
3116 | */ | ||
3117 | dtpos = filp->f_pos; | ||
3118 | if (dtpos == 0) { | ||
3119 | /* build "." entry */ | ||
3120 | |||
3121 | if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino, | ||
3122 | DT_DIR)) | ||
3123 | return 0; | ||
3124 | dtoffset->index = 1; | ||
3125 | filp->f_pos = dtpos; | ||
3126 | } | ||
3127 | |||
3128 | if (dtoffset->pn == 0) { | ||
3129 | if (dtoffset->index == 1) { | ||
3130 | /* build ".." entry */ | ||
3131 | |||
3132 | if (filldir(dirent, "..", 2, filp->f_pos, | ||
3133 | PARENT(ip), DT_DIR)) | ||
3134 | return 0; | ||
3135 | } else { | ||
3136 | jfs_err("jfs_readdir called with " | ||
3137 | "invalid offset!"); | ||
3138 | } | ||
3139 | dtoffset->pn = 1; | ||
3140 | dtoffset->index = 0; | ||
3141 | filp->f_pos = dtpos; | ||
3142 | } | ||
3143 | |||
3144 | if (dtEmpty(ip)) { | ||
3145 | filp->f_pos = DIREND; | ||
3146 | return 0; | ||
3147 | } | ||
3148 | |||
3149 | if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { | ||
3150 | jfs_err("jfs_readdir: unexpected rc = %d " | ||
3151 | "from dtReadNext", rc); | ||
3152 | filp->f_pos = DIREND; | ||
3153 | return 0; | ||
3154 | } | ||
3155 | /* get start leaf page and index */ | ||
3156 | DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
3157 | |||
3158 | /* offset beyond directory eof ? */ | ||
3159 | if (bn < 0) { | ||
3160 | filp->f_pos = DIREND; | ||
3161 | return 0; | ||
3162 | } | ||
3163 | } | ||
3164 | |||
3165 | dirent_buf = __get_free_page(GFP_KERNEL); | ||
3166 | if (dirent_buf == 0) { | ||
3167 | DT_PUTPAGE(mp); | ||
3168 | jfs_warn("jfs_readdir: __get_free_page failed!"); | ||
3169 | filp->f_pos = DIREND; | ||
3170 | return -ENOMEM; | ||
3171 | } | ||
3172 | |||
3173 | while (1) { | ||
3174 | jfs_dirent = (struct jfs_dirent *) dirent_buf; | ||
3175 | jfs_dirents = 0; | ||
3176 | overflow = fix_page = 0; | ||
3177 | |||
3178 | stbl = DT_GETSTBL(p); | ||
3179 | |||
3180 | for (i = index; i < p->header.nextindex; i++) { | ||
3181 | d = (struct ldtentry *) & p->slot[stbl[i]]; | ||
3182 | |||
3183 | if (((long) jfs_dirent + d->namlen + 1) > | ||
3184 | (dirent_buf + PSIZE)) { | ||
3185 | /* DBCS codepages could overrun dirent_buf */ | ||
3186 | index = i; | ||
3187 | overflow = 1; | ||
3188 | break; | ||
3189 | } | ||
3190 | |||
3191 | d_namleft = d->namlen; | ||
3192 | name_ptr = jfs_dirent->name; | ||
3193 | jfs_dirent->ino = le32_to_cpu(d->inumber); | ||
3194 | |||
3195 | if (do_index) { | ||
3196 | len = min(d_namleft, DTLHDRDATALEN); | ||
3197 | jfs_dirent->position = le32_to_cpu(d->index); | ||
3198 | /* | ||
3199 | * d->index should always be valid, but it | ||
3200 | * isn't. fsck.jfs doesn't create the | ||
3201 | * directory index for the lost+found | ||
3202 | * directory. Rather than let it go, | ||
3203 | * we can try to fix it. | ||
3204 | */ | ||
3205 | if ((jfs_dirent->position < 2) || | ||
3206 | (jfs_dirent->position >= | ||
3207 | JFS_IP(ip)->next_index)) { | ||
3208 | if (!page_fixed && !isReadOnly(ip)) { | ||
3209 | fix_page = 1; | ||
3210 | /* | ||
3211 | * setting overflow and setting | ||
3212 | * index to i will cause the | ||
3213 | * same page to be processed | ||
3214 | * again starting here | ||
3215 | */ | ||
3216 | overflow = 1; | ||
3217 | index = i; | ||
3218 | break; | ||
3219 | } | ||
3220 | jfs_dirent->position = unique_pos++; | ||
3221 | } | ||
3222 | } else { | ||
3223 | jfs_dirent->position = dtpos; | ||
3224 | len = min(d_namleft, DTLHDRDATALEN_LEGACY); | ||
3225 | } | ||
3226 | |||
3227 | /* copy the name of head/only segment */ | ||
3228 | outlen = jfs_strfromUCS_le(name_ptr, d->name, len, | ||
3229 | codepage); | ||
3230 | jfs_dirent->name_len = outlen; | ||
3231 | |||
3232 | /* copy name in the additional segment(s) */ | ||
3233 | next = d->next; | ||
3234 | while (next >= 0) { | ||
3235 | t = (struct dtslot *) & p->slot[next]; | ||
3236 | name_ptr += outlen; | ||
3237 | d_namleft -= len; | ||
3238 | /* Sanity Check */ | ||
3239 | if (d_namleft == 0) { | ||
3240 | jfs_error(ip->i_sb, | ||
3241 | "JFS:Dtree error: ino = " | ||
3242 | "%ld, bn=%Ld, index = %d", | ||
3243 | (long)ip->i_ino, | ||
3244 | (long long)bn, | ||
3245 | i); | ||
3246 | goto skip_one; | ||
3247 | } | ||
3248 | len = min(d_namleft, DTSLOTDATALEN); | ||
3249 | outlen = jfs_strfromUCS_le(name_ptr, t->name, | ||
3250 | len, codepage); | ||
3251 | jfs_dirent->name_len += outlen; | ||
3252 | |||
3253 | next = t->next; | ||
3254 | } | ||
3255 | |||
3256 | jfs_dirents++; | ||
3257 | jfs_dirent = next_jfs_dirent(jfs_dirent); | ||
3258 | skip_one: | ||
3259 | if (!do_index) | ||
3260 | dtoffset->index++; | ||
3261 | } | ||
3262 | |||
3263 | if (!overflow) { | ||
3264 | /* Point to next leaf page */ | ||
3265 | if (p->header.flag & BT_ROOT) | ||
3266 | bn = 0; | ||
3267 | else { | ||
3268 | bn = le64_to_cpu(p->header.next); | ||
3269 | index = 0; | ||
3270 | /* update offset (pn:index) for new page */ | ||
3271 | if (!do_index) { | ||
3272 | dtoffset->pn++; | ||
3273 | dtoffset->index = 0; | ||
3274 | } | ||
3275 | } | ||
3276 | page_fixed = 0; | ||
3277 | } | ||
3278 | |||
3279 | /* unpin previous leaf page */ | ||
3280 | DT_PUTPAGE(mp); | ||
3281 | |||
3282 | jfs_dirent = (struct jfs_dirent *) dirent_buf; | ||
3283 | while (jfs_dirents--) { | ||
3284 | filp->f_pos = jfs_dirent->position; | ||
3285 | if (filldir(dirent, jfs_dirent->name, | ||
3286 | jfs_dirent->name_len, filp->f_pos, | ||
3287 | jfs_dirent->ino, DT_UNKNOWN)) | ||
3288 | goto out; | ||
3289 | jfs_dirent = next_jfs_dirent(jfs_dirent); | ||
3290 | } | ||
3291 | |||
3292 | if (fix_page) { | ||
3293 | add_missing_indices(ip, bn); | ||
3294 | page_fixed = 1; | ||
3295 | } | ||
3296 | |||
3297 | if (!overflow && (bn == 0)) { | ||
3298 | filp->f_pos = DIREND; | ||
3299 | break; | ||
3300 | } | ||
3301 | |||
3302 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3303 | if (rc) { | ||
3304 | free_page(dirent_buf); | ||
3305 | return rc; | ||
3306 | } | ||
3307 | } | ||
3308 | |||
3309 | out: | ||
3310 | free_page(dirent_buf); | ||
3311 | |||
3312 | return rc; | ||
3313 | } | ||
3314 | |||
3315 | |||
3316 | /* | ||
3317 | * dtReadFirst() | ||
3318 | * | ||
3319 | * function: get the leftmost page of the directory | ||
3320 | */ | ||
3321 | static int dtReadFirst(struct inode *ip, struct btstack * btstack) | ||
3322 | { | ||
3323 | int rc = 0; | ||
3324 | s64 bn; | ||
3325 | int psize = 288; /* initial in-line directory */ | ||
3326 | struct metapage *mp; | ||
3327 | dtpage_t *p; | ||
3328 | s8 *stbl; | ||
3329 | struct btframe *btsp; | ||
3330 | pxd_t *xd; | ||
3331 | |||
3332 | BT_CLR(btstack); /* reset stack */ | ||
3333 | |||
3334 | /* | ||
3335 | * descend leftmost path of the tree | ||
3336 | * | ||
3337 | * by convention, root bn = 0. | ||
3338 | */ | ||
3339 | for (bn = 0;;) { | ||
3340 | DT_GETPAGE(ip, bn, mp, psize, p, rc); | ||
3341 | if (rc) | ||
3342 | return rc; | ||
3343 | |||
3344 | /* | ||
3345 | * leftmost leaf page | ||
3346 | */ | ||
3347 | if (p->header.flag & BT_LEAF) { | ||
3348 | /* return leftmost entry */ | ||
3349 | btsp = btstack->top; | ||
3350 | btsp->bn = bn; | ||
3351 | btsp->index = 0; | ||
3352 | btsp->mp = mp; | ||
3353 | |||
3354 | return 0; | ||
3355 | } | ||
3356 | |||
3357 | /* | ||
3358 | * descend down to leftmost child page | ||
3359 | */ | ||
3360 | if (BT_STACK_FULL(btstack)) { | ||
3361 | DT_PUTPAGE(mp); | ||
3362 | jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); | ||
3363 | BT_STACK_DUMP(btstack); | ||
3364 | return -EIO; | ||
3365 | } | ||
3366 | /* push (bn, index) of the parent page/entry */ | ||
3367 | BT_PUSH(btstack, bn, 0); | ||
3368 | |||
3369 | /* get the leftmost entry */ | ||
3370 | stbl = DT_GETSTBL(p); | ||
3371 | xd = (pxd_t *) & p->slot[stbl[0]]; | ||
3372 | |||
3373 | /* get the child page block address */ | ||
3374 | bn = addressPXD(xd); | ||
3375 | psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; | ||
3376 | |||
3377 | /* unpin the parent page */ | ||
3378 | DT_PUTPAGE(mp); | ||
3379 | } | ||
3380 | } | ||
3381 | |||
3382 | |||
3383 | /* | ||
3384 | * dtReadNext() | ||
3385 | * | ||
3386 | * function: get the page of the specified offset (pn:index) | ||
3387 | * | ||
3388 | * return: if (offset > eof), bn = -1; | ||
3389 | * | ||
3390 | * note: if index > nextindex of the target leaf page, | ||
3391 | * start with 1st entry of next leaf page; | ||
3392 | */ | ||
3393 | static int dtReadNext(struct inode *ip, loff_t * offset, | ||
3394 | struct btstack * btstack) | ||
3395 | { | ||
3396 | int rc = 0; | ||
3397 | struct dtoffset { | ||
3398 | s16 pn; | ||
3399 | s16 index; | ||
3400 | s32 unused; | ||
3401 | } *dtoffset = (struct dtoffset *) offset; | ||
3402 | s64 bn; | ||
3403 | struct metapage *mp; | ||
3404 | dtpage_t *p; | ||
3405 | int index; | ||
3406 | int pn; | ||
3407 | s8 *stbl; | ||
3408 | struct btframe *btsp, *parent; | ||
3409 | pxd_t *xd; | ||
3410 | |||
3411 | /* | ||
3412 | * get leftmost leaf page pinned | ||
3413 | */ | ||
3414 | if ((rc = dtReadFirst(ip, btstack))) | ||
3415 | return rc; | ||
3416 | |||
3417 | /* get leaf page */ | ||
3418 | DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); | ||
3419 | |||
3420 | /* get the start offset (pn:index) */ | ||
3421 | pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ | ||
3422 | index = dtoffset->index; | ||
3423 | |||
3424 | /* start at leftmost page ? */ | ||
3425 | if (pn == 0) { | ||
3426 | /* offset beyond eof ? */ | ||
3427 | if (index < p->header.nextindex) | ||
3428 | goto out; | ||
3429 | |||
3430 | if (p->header.flag & BT_ROOT) { | ||
3431 | bn = -1; | ||
3432 | goto out; | ||
3433 | } | ||
3434 | |||
3435 | /* start with 1st entry of next leaf page */ | ||
3436 | dtoffset->pn++; | ||
3437 | dtoffset->index = index = 0; | ||
3438 | goto a; | ||
3439 | } | ||
3440 | |||
3441 | /* start at non-leftmost page: scan parent pages for large pn */ | ||
3442 | if (p->header.flag & BT_ROOT) { | ||
3443 | bn = -1; | ||
3444 | goto out; | ||
3445 | } | ||
3446 | |||
3447 | /* start after next leaf page ? */ | ||
3448 | if (pn > 1) | ||
3449 | goto b; | ||
3450 | |||
3451 | /* get leaf page pn = 1 */ | ||
3452 | a: | ||
3453 | bn = le64_to_cpu(p->header.next); | ||
3454 | |||
3455 | /* unpin leaf page */ | ||
3456 | DT_PUTPAGE(mp); | ||
3457 | |||
3458 | /* offset beyond eof ? */ | ||
3459 | if (bn == 0) { | ||
3460 | bn = -1; | ||
3461 | goto out; | ||
3462 | } | ||
3463 | |||
3464 | goto c; | ||
3465 | |||
3466 | /* | ||
3467 | * scan last internal page level to get target leaf page | ||
3468 | */ | ||
3469 | b: | ||
3470 | /* unpin leftmost leaf page */ | ||
3471 | DT_PUTPAGE(mp); | ||
3472 | |||
3473 | /* get left most parent page */ | ||
3474 | btsp = btstack->top; | ||
3475 | parent = btsp - 1; | ||
3476 | bn = parent->bn; | ||
3477 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3478 | if (rc) | ||
3479 | return rc; | ||
3480 | |||
3481 | /* scan parent pages at last internal page level */ | ||
3482 | while (pn >= p->header.nextindex) { | ||
3483 | pn -= p->header.nextindex; | ||
3484 | |||
3485 | /* get next parent page address */ | ||
3486 | bn = le64_to_cpu(p->header.next); | ||
3487 | |||
3488 | /* unpin current parent page */ | ||
3489 | DT_PUTPAGE(mp); | ||
3490 | |||
3491 | /* offset beyond eof ? */ | ||
3492 | if (bn == 0) { | ||
3493 | bn = -1; | ||
3494 | goto out; | ||
3495 | } | ||
3496 | |||
3497 | /* get next parent page */ | ||
3498 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3499 | if (rc) | ||
3500 | return rc; | ||
3501 | |||
3502 | /* update parent page stack frame */ | ||
3503 | parent->bn = bn; | ||
3504 | } | ||
3505 | |||
3506 | /* get leaf page address */ | ||
3507 | stbl = DT_GETSTBL(p); | ||
3508 | xd = (pxd_t *) & p->slot[stbl[pn]]; | ||
3509 | bn = addressPXD(xd); | ||
3510 | |||
3511 | /* unpin parent page */ | ||
3512 | DT_PUTPAGE(mp); | ||
3513 | |||
3514 | /* | ||
3515 | * get target leaf page | ||
3516 | */ | ||
3517 | c: | ||
3518 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3519 | if (rc) | ||
3520 | return rc; | ||
3521 | |||
3522 | /* | ||
3523 | * leaf page has been completed: | ||
3524 | * start with 1st entry of next leaf page | ||
3525 | */ | ||
3526 | if (index >= p->header.nextindex) { | ||
3527 | bn = le64_to_cpu(p->header.next); | ||
3528 | |||
3529 | /* unpin leaf page */ | ||
3530 | DT_PUTPAGE(mp); | ||
3531 | |||
3532 | /* offset beyond eof ? */ | ||
3533 | if (bn == 0) { | ||
3534 | bn = -1; | ||
3535 | goto out; | ||
3536 | } | ||
3537 | |||
3538 | /* get next leaf page */ | ||
3539 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3540 | if (rc) | ||
3541 | return rc; | ||
3542 | |||
3543 | /* start with 1st entry of next leaf page */ | ||
3544 | dtoffset->pn++; | ||
3545 | dtoffset->index = 0; | ||
3546 | } | ||
3547 | |||
3548 | out: | ||
3549 | /* return target leaf page pinned */ | ||
3550 | btsp = btstack->top; | ||
3551 | btsp->bn = bn; | ||
3552 | btsp->index = dtoffset->index; | ||
3553 | btsp->mp = mp; | ||
3554 | |||
3555 | return 0; | ||
3556 | } | ||
3557 | |||
3558 | |||
3559 | /* | ||
3560 | * dtCompare() | ||
3561 | * | ||
3562 | * function: compare search key with an internal entry | ||
3563 | * | ||
3564 | * return: | ||
3565 | * < 0 if k is < record | ||
3566 | * = 0 if k is = record | ||
3567 | * > 0 if k is > record | ||
3568 | */ | ||
3569 | static int dtCompare(struct component_name * key, /* search key */ | ||
3570 | dtpage_t * p, /* directory page */ | ||
3571 | int si) | ||
3572 | { /* entry slot index */ | ||
3573 | wchar_t *kname; | ||
3574 | __le16 *name; | ||
3575 | int klen, namlen, len, rc; | ||
3576 | struct idtentry *ih; | ||
3577 | struct dtslot *t; | ||
3578 | |||
3579 | /* | ||
3580 | * force the left-most key on internal pages, at any level of | ||
3581 | * the tree, to be less than any search key. | ||
3582 | * this obviates having to update the leftmost key on an internal | ||
3583 | * page when the user inserts a new key in the tree smaller than | ||
3584 | * anything that has been stored. | ||
3585 | * | ||
3586 | * (? if/when dtSearch() narrows down to 1st entry (index = 0), | ||
3587 | * at any internal page at any level of the tree, | ||
3588 | * it descends to child of the entry anyway - | ||
3589 | * ? make the entry as min size dummy entry) | ||
3590 | * | ||
3591 | * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) | ||
3592 | * return (1); | ||
3593 | */ | ||
3594 | |||
3595 | kname = key->name; | ||
3596 | klen = key->namlen; | ||
3597 | |||
3598 | ih = (struct idtentry *) & p->slot[si]; | ||
3599 | si = ih->next; | ||
3600 | name = ih->name; | ||
3601 | namlen = ih->namlen; | ||
3602 | len = min(namlen, DTIHDRDATALEN); | ||
3603 | |||
3604 | /* compare with head/only segment */ | ||
3605 | len = min(klen, len); | ||
3606 | if ((rc = UniStrncmp_le(kname, name, len))) | ||
3607 | return rc; | ||
3608 | |||
3609 | klen -= len; | ||
3610 | namlen -= len; | ||
3611 | |||
3612 | /* compare with additional segment(s) */ | ||
3613 | kname += len; | ||
3614 | while (klen > 0 && namlen > 0) { | ||
3615 | /* compare with next name segment */ | ||
3616 | t = (struct dtslot *) & p->slot[si]; | ||
3617 | len = min(namlen, DTSLOTDATALEN); | ||
3618 | len = min(klen, len); | ||
3619 | name = t->name; | ||
3620 | if ((rc = UniStrncmp_le(kname, name, len))) | ||
3621 | return rc; | ||
3622 | |||
3623 | klen -= len; | ||
3624 | namlen -= len; | ||
3625 | kname += len; | ||
3626 | si = t->next; | ||
3627 | } | ||
3628 | |||
3629 | return (klen - namlen); | ||
3630 | } | ||
3631 | |||
3632 | |||
3633 | |||
3634 | |||
3635 | /* | ||
3636 | * ciCompare() | ||
3637 | * | ||
3638 | * function: compare search key with an (leaf/internal) entry | ||
3639 | * | ||
3640 | * return: | ||
3641 | * < 0 if k is < record | ||
3642 | * = 0 if k is = record | ||
3643 | * > 0 if k is > record | ||
3644 | */ | ||
3645 | static int ciCompare(struct component_name * key, /* search key */ | ||
3646 | dtpage_t * p, /* directory page */ | ||
3647 | int si, /* entry slot index */ | ||
3648 | int flag) | ||
3649 | { | ||
3650 | wchar_t *kname, x; | ||
3651 | __le16 *name; | ||
3652 | int klen, namlen, len, rc; | ||
3653 | struct ldtentry *lh; | ||
3654 | struct idtentry *ih; | ||
3655 | struct dtslot *t; | ||
3656 | int i; | ||
3657 | |||
3658 | /* | ||
3659 | * force the left-most key on internal pages, at any level of | ||
3660 | * the tree, to be less than any search key. | ||
3661 | * this obviates having to update the leftmost key on an internal | ||
3662 | * page when the user inserts a new key in the tree smaller than | ||
3663 | * anything that has been stored. | ||
3664 | * | ||
3665 | * (? if/when dtSearch() narrows down to 1st entry (index = 0), | ||
3666 | * at any internal page at any level of the tree, | ||
3667 | * it descends to child of the entry anyway - | ||
3668 | * ? make the entry as min size dummy entry) | ||
3669 | * | ||
3670 | * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) | ||
3671 | * return (1); | ||
3672 | */ | ||
3673 | |||
3674 | kname = key->name; | ||
3675 | klen = key->namlen; | ||
3676 | |||
3677 | /* | ||
3678 | * leaf page entry | ||
3679 | */ | ||
3680 | if (p->header.flag & BT_LEAF) { | ||
3681 | lh = (struct ldtentry *) & p->slot[si]; | ||
3682 | si = lh->next; | ||
3683 | name = lh->name; | ||
3684 | namlen = lh->namlen; | ||
3685 | if (flag & JFS_DIR_INDEX) | ||
3686 | len = min(namlen, DTLHDRDATALEN); | ||
3687 | else | ||
3688 | len = min(namlen, DTLHDRDATALEN_LEGACY); | ||
3689 | } | ||
3690 | /* | ||
3691 | * internal page entry | ||
3692 | */ | ||
3693 | else { | ||
3694 | ih = (struct idtentry *) & p->slot[si]; | ||
3695 | si = ih->next; | ||
3696 | name = ih->name; | ||
3697 | namlen = ih->namlen; | ||
3698 | len = min(namlen, DTIHDRDATALEN); | ||
3699 | } | ||
3700 | |||
3701 | /* compare with head/only segment */ | ||
3702 | len = min(klen, len); | ||
3703 | for (i = 0; i < len; i++, kname++, name++) { | ||
3704 | /* only uppercase if case-insensitive support is on */ | ||
3705 | if ((flag & JFS_OS2) == JFS_OS2) | ||
3706 | x = UniToupper(le16_to_cpu(*name)); | ||
3707 | else | ||
3708 | x = le16_to_cpu(*name); | ||
3709 | if ((rc = *kname - x)) | ||
3710 | return rc; | ||
3711 | } | ||
3712 | |||
3713 | klen -= len; | ||
3714 | namlen -= len; | ||
3715 | |||
3716 | /* compare with additional segment(s) */ | ||
3717 | while (klen > 0 && namlen > 0) { | ||
3718 | /* compare with next name segment */ | ||
3719 | t = (struct dtslot *) & p->slot[si]; | ||
3720 | len = min(namlen, DTSLOTDATALEN); | ||
3721 | len = min(klen, len); | ||
3722 | name = t->name; | ||
3723 | for (i = 0; i < len; i++, kname++, name++) { | ||
3724 | /* only uppercase if case-insensitive support is on */ | ||
3725 | if ((flag & JFS_OS2) == JFS_OS2) | ||
3726 | x = UniToupper(le16_to_cpu(*name)); | ||
3727 | else | ||
3728 | x = le16_to_cpu(*name); | ||
3729 | |||
3730 | if ((rc = *kname - x)) | ||
3731 | return rc; | ||
3732 | } | ||
3733 | |||
3734 | klen -= len; | ||
3735 | namlen -= len; | ||
3736 | si = t->next; | ||
3737 | } | ||
3738 | |||
3739 | return (klen - namlen); | ||
3740 | } | ||
3741 | |||
3742 | |||
3743 | /* | ||
3744 | * ciGetLeafPrefixKey() | ||
3745 | * | ||
3746 | * function: compute prefix of suffix compression | ||
3747 | * from two adjacent leaf entries | ||
3748 | * across page boundary | ||
3749 | * | ||
3750 | * return: non-zero on error | ||
3751 | * | ||
3752 | */ | ||
3753 | static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, | ||
3754 | int ri, struct component_name * key, int flag) | ||
3755 | { | ||
3756 | int klen, namlen; | ||
3757 | wchar_t *pl, *pr, *kname; | ||
3758 | struct component_name lkey; | ||
3759 | struct component_name rkey; | ||
3760 | |||
3761 | lkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), | ||
3762 | GFP_KERNEL); | ||
3763 | if (lkey.name == NULL) | ||
3764 | return -ENOSPC; | ||
3765 | |||
3766 | rkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), | ||
3767 | GFP_KERNEL); | ||
3768 | if (rkey.name == NULL) { | ||
3769 | kfree(lkey.name); | ||
3770 | return -ENOSPC; | ||
3771 | } | ||
3772 | |||
3773 | /* get left and right key */ | ||
3774 | dtGetKey(lp, li, &lkey, flag); | ||
3775 | lkey.name[lkey.namlen] = 0; | ||
3776 | |||
3777 | if ((flag & JFS_OS2) == JFS_OS2) | ||
3778 | ciToUpper(&lkey); | ||
3779 | |||
3780 | dtGetKey(rp, ri, &rkey, flag); | ||
3781 | rkey.name[rkey.namlen] = 0; | ||
3782 | |||
3783 | |||
3784 | if ((flag & JFS_OS2) == JFS_OS2) | ||
3785 | ciToUpper(&rkey); | ||
3786 | |||
3787 | /* compute prefix */ | ||
3788 | klen = 0; | ||
3789 | kname = key->name; | ||
3790 | namlen = min(lkey.namlen, rkey.namlen); | ||
3791 | for (pl = lkey.name, pr = rkey.name; | ||
3792 | namlen; pl++, pr++, namlen--, klen++, kname++) { | ||
3793 | *kname = *pr; | ||
3794 | if (*pl != *pr) { | ||
3795 | key->namlen = klen + 1; | ||
3796 | goto free_names; | ||
3797 | } | ||
3798 | } | ||
3799 | |||
3800 | /* l->namlen <= r->namlen since l <= r */ | ||
3801 | if (lkey.namlen < rkey.namlen) { | ||
3802 | *kname = *pr; | ||
3803 | key->namlen = klen + 1; | ||
3804 | } else /* l->namelen == r->namelen */ | ||
3805 | key->namlen = klen; | ||
3806 | |||
3807 | free_names: | ||
3808 | kfree(lkey.name); | ||
3809 | kfree(rkey.name); | ||
3810 | return 0; | ||
3811 | } | ||
3812 | |||
3813 | |||
3814 | |||
3815 | /* | ||
3816 | * dtGetKey() | ||
3817 | * | ||
3818 | * function: get key of the entry | ||
3819 | */ | ||
3820 | static void dtGetKey(dtpage_t * p, int i, /* entry index */ | ||
3821 | struct component_name * key, int flag) | ||
3822 | { | ||
3823 | int si; | ||
3824 | s8 *stbl; | ||
3825 | struct ldtentry *lh; | ||
3826 | struct idtentry *ih; | ||
3827 | struct dtslot *t; | ||
3828 | int namlen, len; | ||
3829 | wchar_t *kname; | ||
3830 | __le16 *name; | ||
3831 | |||
3832 | /* get entry */ | ||
3833 | stbl = DT_GETSTBL(p); | ||
3834 | si = stbl[i]; | ||
3835 | if (p->header.flag & BT_LEAF) { | ||
3836 | lh = (struct ldtentry *) & p->slot[si]; | ||
3837 | si = lh->next; | ||
3838 | namlen = lh->namlen; | ||
3839 | name = lh->name; | ||
3840 | if (flag & JFS_DIR_INDEX) | ||
3841 | len = min(namlen, DTLHDRDATALEN); | ||
3842 | else | ||
3843 | len = min(namlen, DTLHDRDATALEN_LEGACY); | ||
3844 | } else { | ||
3845 | ih = (struct idtentry *) & p->slot[si]; | ||
3846 | si = ih->next; | ||
3847 | namlen = ih->namlen; | ||
3848 | name = ih->name; | ||
3849 | len = min(namlen, DTIHDRDATALEN); | ||
3850 | } | ||
3851 | |||
3852 | key->namlen = namlen; | ||
3853 | kname = key->name; | ||
3854 | |||
3855 | /* | ||
3856 | * move head/only segment | ||
3857 | */ | ||
3858 | UniStrncpy_from_le(kname, name, len); | ||
3859 | |||
3860 | /* | ||
3861 | * move additional segment(s) | ||
3862 | */ | ||
3863 | while (si >= 0) { | ||
3864 | /* get next segment */ | ||
3865 | t = &p->slot[si]; | ||
3866 | kname += len; | ||
3867 | namlen -= len; | ||
3868 | len = min(namlen, DTSLOTDATALEN); | ||
3869 | UniStrncpy_from_le(kname, t->name, len); | ||
3870 | |||
3871 | si = t->next; | ||
3872 | } | ||
3873 | } | ||
3874 | |||
3875 | |||
/*
 *	dtInsertEntry()
 *
 * function: allocate free slot(s) and
 *	     write a leaf/internal entry
 *
 * parameters:
 *	p	- directory page receiving the entry
 *	index	- position in the sorted entry table (stbl) of the new entry
 *	key	- entry name (key->name, key->namlen)
 *	data	- entry payload: data->leaf on a BT_LEAF page,
 *		  data->xd otherwise
 *	dtlock	- in/out: current linelock; rewritten on exit because
 *		  txLinelock() may hand back a different one
 *
 * return: none; the head slot index of the new entry is stored
 *	   in stbl[index]
 *
 * NOTE(review): nothing here checks that the page freelist holds enough
 * slots for the entry - presumably the caller guarantees it; confirm.
 */
static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
			  ddata_t * data, struct dt_lock ** dtlock)
{
	struct dtslot *h, *t;
	struct ldtentry *lh = NULL;
	struct idtentry *ih = NULL;
	int hsi, fsi, klen, len, nextindex;
	wchar_t *kname;
	__le16 *name;
	s8 *stbl;
	pxd_t *xd;
	struct dt_lock *dtlck = *dtlock;
	struct lv *lv;
	int xsi, n;
	s64 bn = 0;
	struct metapage *mp = NULL;

	klen = key->namlen;
	kname = key->name;

	/* allocate a free slot */
	hsi = fsi = p->header.freelist;
	h = &p->slot[fsi];
	p->header.freelist = h->next;
	--p->header.freecnt;

	/* open new linelock */
	if (dtlck->index >= dtlck->maxcnt)
		dtlck = (struct dt_lock *) txLinelock(dtlck);

	lv = & dtlck->lv[dtlck->index];
	lv->offset = hsi;

	/*
	 * write head/only segment
	 *
	 * lh/ih are typed views of the very same slot as h, so h->next
	 * is carried over into the entry before the other header fields
	 * are stored through the typed view.
	 */
	if (p->header.flag & BT_LEAF) {
		lh = (struct ldtentry *) h;
		lh->next = h->next;
		lh->inumber = cpu_to_le32(data->leaf.ino);
		lh->namlen = klen;
		name = lh->name;
		if (data->leaf.ip) {
			/* indexed layout: entry also carries an index */
			len = min(klen, DTLHDRDATALEN);
			/* bn stays 0 when the page is the root */
			if (!(p->header.flag & BT_ROOT))
				bn = addressPXD(&p->header.self);
			lh->index = cpu_to_le32(add_index(data->leaf.tid,
							  data->leaf.ip,
							  bn, index));
		} else
			len = min(klen, DTLHDRDATALEN_LEGACY);
	} else {
		ih = (struct idtentry *) h;
		ih->next = h->next;
		/* the extent descriptor is stored at the start of the entry */
		xd = (pxd_t *) ih;
		*xd = data->xd;
		ih->namlen = klen;
		name = ih->name;
		len = min(klen, DTIHDRDATALEN);
	}

	UniStrncpy_to_le(name, kname, len);

	/* n: slots covered by the current lv; xsi: last slot written */
	n = 1;
	xsi = hsi;

	/*
	 * write additional segment(s)
	 *
	 * Each continuation slot is popped from the head of the freelist
	 * and its next field is left alone, so the freelist links double
	 * as the entry's segment chain; only the last segment is
	 * terminated below.
	 */
	t = h;
	klen -= len;
	while (klen) {
		/* get free slot */
		fsi = p->header.freelist;
		t = &p->slot[fsi];
		p->header.freelist = t->next;
		--p->header.freecnt;

		/* is next slot contiguous ? */
		if (fsi != xsi + 1) {
			/* close current linelock */
			lv->length = n;
			dtlck->index++;

			/* open new linelock */
			if (dtlck->index < dtlck->maxcnt)
				lv++;
			else {
				dtlck = (struct dt_lock *) txLinelock(dtlck);
				lv = & dtlck->lv[0];
			}

			lv->offset = fsi;
			n = 0;
		}

		kname += len;
		len = min(klen, DTSLOTDATALEN);
		UniStrncpy_to_le(t->name, kname, len);

		n++;
		xsi = fsi;
		klen -= len;
	}

	/* close current linelock */
	lv->length = n;
	dtlck->index++;

	/* hand the (possibly replaced) linelock back to the caller */
	*dtlock = dtlck;

	/* terminate last/only segment */
	if (h == t) {
		/* single segment entry */
		if (p->header.flag & BT_LEAF)
			lh->next = -1;
		else
			ih->next = -1;
	} else
		/* multi-segment entry */
		t->next = -1;

	/* if insert into middle, shift right succeeding entries in stbl */
	stbl = DT_GETSTBL(p);
	nextindex = p->header.nextindex;
	if (index < nextindex) {
		/* stbl entries are one byte each, so the count is in bytes */
		memmove(stbl + index + 1, stbl + index, nextindex - index);

		if ((p->header.flag & BT_LEAF) && data->leaf.ip) {
			s64 lblock;

			/*
			 * Need to update slot number for entries that moved
			 * in the stbl
			 */
			mp = NULL;
			for (n = index + 1; n <= nextindex; n++) {
				lh = (struct ldtentry *) & (p->slot[stbl[n]]);
				modify_index(data->leaf.tid, data->leaf.ip,
					     le32_to_cpu(lh->index), bn, n,
					     &mp, &lblock);
			}
			/* release whatever page modify_index() left in mp */
			if (mp)
				release_metapage(mp);
		}
	}

	stbl[index] = hsi;

	/* advance next available entry index of stbl */
	++p->header.nextindex;
}
4032 | |||
4033 | |||
/*
 *	dtMoveEntry()
 *
 * function: move entries from split/left page to new/right page
 *
 *	nextindex of dst page and freelist/freecnt of both pages
 *	are updated.
 *
 * parameters:
 *	sp	- source page
 *	si	- index in the source stbl of the first entry to move; all
 *		  entries from si up to sp->header.nextindex are moved
 *	dp	- destination page
 *	sdtlock	- in/out: linelock describing the source slots touched
 *	ddtlock	- in/out: linelock describing the destination slots written
 *	do_index - non-zero: leaf entries use the indexed layout
 *		  (DTLHDRDATALEN name chars plus an index field);
 *		  zero: legacy layout (DTLHDRDATALEN_LEGACY)
 *
 * Destination slots are taken consecutively starting at
 * dp->header.freelist (dsi++/d++), i.e. the destination free slots are
 * assumed to be contiguous.  sp->header.nextindex is not changed here.
 *
 * NOTE(review): the first lv of each linelock is taken at lv[index]
 * without checking index against maxcnt, and the destination side is
 * logged as a single lv - presumably the caller guarantees room; confirm.
 */
static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
			struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
			int do_index)
{
	int ssi, next;		/* src slot index */
	int di;			/* dst entry index */
	int dsi;		/* dst slot index */
	s8 *sstbl, *dstbl;	/* sorted entry table */
	int snamlen, len;
	struct ldtentry *slh, *dlh = NULL;
	struct idtentry *sih, *dih = NULL;
	struct dtslot *h, *s, *d;
	struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock;
	struct lv *slv, *dlv;
	int xssi, ns, nd;
	int sfsi;

	sstbl = (s8 *) & sp->slot[sp->header.stblindex];
	dstbl = (s8 *) & dp->slot[dp->header.stblindex];

	dsi = dp->header.freelist;	/* first (whole page) free slot */
	sfsi = sp->header.freelist;	/* running head of src freelist */

	/* linelock destination entry slot */
	dlv = & ddtlck->lv[ddtlck->index];
	dlv->offset = dsi;

	/* linelock source entry slot */
	slv = & sdtlck->lv[sdtlck->index];
	slv->offset = sstbl[si];
	/* primed so that the first source slot counts as contiguous */
	xssi = slv->offset - 1;

	/*
	 * move entries
	 *
	 * ns: source slots covered by the current source lv
	 * nd: total slots moved (all of them land in the single dst lv)
	 */
	ns = nd = 0;
	for (di = 0; si < sp->header.nextindex; si++, di++) {
		ssi = sstbl[si];
		dstbl[di] = dsi;

		/* is next slot contiguous ? */
		if (ssi != xssi + 1) {
			/* close current linelock */
			slv->length = ns;
			sdtlck->index++;

			/* open new linelock */
			if (sdtlck->index < sdtlck->maxcnt)
				slv++;
			else {
				sdtlck = (struct dt_lock *) txLinelock(sdtlck);
				slv = & sdtlck->lv[0];
			}

			slv->offset = ssi;
			ns = 0;
		}

		/*
		 * move head/only segment of an entry
		 */
		/* get dst slot */
		h = d = &dp->slot[dsi];

		/* get src slot and move */
		s = &sp->slot[ssi];
		if (sp->header.flag & BT_LEAF) {
			/* get source entry */
			slh = (struct ldtentry *) s;
			dlh = (struct ldtentry *) h;
			snamlen = slh->namlen;

			if (do_index) {
				len = min(snamlen, DTLHDRDATALEN);
				dlh->index = slh->index; /* little-endian */
			} else
				len = min(snamlen, DTLHDRDATALEN_LEGACY);

			/*
			 * fixed leading fields plus len 2-byte characters;
			 * NOTE(review): 6 is presumably the size of the
			 * fields preceding name[] in struct ldtentry - confirm
			 */
			memcpy(dlh, slh, 6 + len * 2);

			next = slh->next;

			/* update dst head/only segment next field */
			dsi++;
			dlh->next = dsi;
		} else {
			sih = (struct idtentry *) s;
			snamlen = sih->namlen;

			len = min(snamlen, DTIHDRDATALEN);
			dih = (struct idtentry *) h;
			/*
			 * NOTE(review): 10 is presumably the size of the
			 * fields preceding name[] in struct idtentry - confirm
			 */
			memcpy(dih, sih, 10 + len * 2);
			next = sih->next;

			dsi++;
			dih->next = dsi;
		}

		/* free src head/only segment: push it on the src freelist */
		s->next = sfsi;
		s->cnt = 1;
		sfsi = ssi;

		ns++;
		nd++;
		xssi = ssi;

		/*
		 * move additional segment(s) of the entry
		 */
		snamlen -= len;
		while ((ssi = next) >= 0) {
			/* is next slot contiguous ? */
			if (ssi != xssi + 1) {
				/* close current linelock */
				slv->length = ns;
				sdtlck->index++;

				/* open new linelock */
				if (sdtlck->index < sdtlck->maxcnt)
					slv++;
				else {
					sdtlck =
					    (struct dt_lock *)
					    txLinelock(sdtlck);
					slv = & sdtlck->lv[0];
				}

				slv->offset = ssi;
				ns = 0;
			}

			/* get next source segment */
			s = &sp->slot[ssi];

			/* get next destination free slot */
			d++;

			len = min(snamlen, DTSLOTDATALEN);
			UniStrncpy_le(d->name, s->name, len);

			ns++;
			nd++;
			xssi = ssi;

			dsi++;
			d->next = dsi;

			/* free source segment */
			next = s->next;
			s->next = sfsi;
			s->cnt = 1;
			sfsi = ssi;

			snamlen -= len;
		}		/* end while */

		/* terminate dst last/only segment */
		if (h == d) {
			/* single segment entry */
			if (dp->header.flag & BT_LEAF)
				dlh->next = -1;
			else
				dih->next = -1;
		} else
			/* multi-segment entry */
			d->next = -1;
	}			/* end for */

	/* close current linelock */
	slv->length = ns;
	sdtlck->index++;
	*sdtlock = sdtlck;

	dlv->length = nd;
	ddtlck->index++;
	*ddtlock = ddtlck;

	/* update source header */
	sp->header.freelist = sfsi;
	sp->header.freecnt += nd;

	/* update destination header */
	dp->header.nextindex = di;

	dp->header.freelist = dsi;
	dp->header.freecnt -= nd;
}
4230 | |||
4231 | |||
4232 | /* | ||
4233 | * dtDeleteEntry() | ||
4234 | * | ||
4235 | * function: free a (leaf/internal) entry | ||
4236 | * | ||
4237 | * log freelist header, stbl, and each segment slot of entry | ||
4238 | * (even though last/only segment next field is modified, | ||
4239 | * physical image logging requires all segment slots of | ||
4240 | * the entry logged to avoid applying previous updates | ||
4241 | * to the same slots) | ||
4242 | */ | ||
4243 | static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) | ||
4244 | { | ||
4245 | int fsi; /* free entry slot index */ | ||
4246 | s8 *stbl; | ||
4247 | struct dtslot *t; | ||
4248 | int si, freecnt; | ||
4249 | struct dt_lock *dtlck = *dtlock; | ||
4250 | struct lv *lv; | ||
4251 | int xsi, n; | ||
4252 | |||
4253 | /* get free entry slot index */ | ||
4254 | stbl = DT_GETSTBL(p); | ||
4255 | fsi = stbl[fi]; | ||
4256 | |||
4257 | /* open new linelock */ | ||
4258 | if (dtlck->index >= dtlck->maxcnt) | ||
4259 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
4260 | lv = & dtlck->lv[dtlck->index]; | ||
4261 | |||
4262 | lv->offset = fsi; | ||
4263 | |||
4264 | /* get the head/only segment */ | ||
4265 | t = &p->slot[fsi]; | ||
4266 | if (p->header.flag & BT_LEAF) | ||
4267 | si = ((struct ldtentry *) t)->next; | ||
4268 | else | ||
4269 | si = ((struct idtentry *) t)->next; | ||
4270 | t->next = si; | ||
4271 | t->cnt = 1; | ||
4272 | |||
4273 | n = freecnt = 1; | ||
4274 | xsi = fsi; | ||
4275 | |||
4276 | /* find the last/only segment */ | ||
4277 | while (si >= 0) { | ||
4278 | /* is next slot contiguous ? */ | ||
4279 | if (si != xsi + 1) { | ||
4280 | /* close current linelock */ | ||
4281 | lv->length = n; | ||
4282 | dtlck->index++; | ||
4283 | |||
4284 | /* open new linelock */ | ||
4285 | if (dtlck->index < dtlck->maxcnt) | ||
4286 | lv++; | ||
4287 | else { | ||
4288 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
4289 | lv = & dtlck->lv[0]; | ||
4290 | } | ||
4291 | |||
4292 | lv->offset = si; | ||
4293 | n = 0; | ||
4294 | } | ||
4295 | |||
4296 | n++; | ||
4297 | xsi = si; | ||
4298 | freecnt++; | ||
4299 | |||
4300 | t = &p->slot[si]; | ||
4301 | t->cnt = 1; | ||
4302 | si = t->next; | ||
4303 | } | ||
4304 | |||
4305 | /* close current linelock */ | ||
4306 | lv->length = n; | ||
4307 | dtlck->index++; | ||
4308 | |||
4309 | *dtlock = dtlck; | ||
4310 | |||
4311 | /* update freelist */ | ||
4312 | t->next = p->header.freelist; | ||
4313 | p->header.freelist = fsi; | ||
4314 | p->header.freecnt += freecnt; | ||
4315 | |||
4316 | /* if delete from middle, | ||
4317 | * shift left the succedding entries in the stbl | ||
4318 | */ | ||
4319 | si = p->header.nextindex; | ||
4320 | if (fi < si - 1) | ||
4321 | memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); | ||
4322 | |||
4323 | p->header.nextindex--; | ||
4324 | } | ||
4325 | |||
4326 | |||
4327 | /* | ||
4328 | * dtTruncateEntry() | ||
4329 | * | ||
4330 | * function: truncate a (leaf/internal) entry | ||
4331 | * | ||
4332 | * log freelist header, stbl, and each segment slot of entry | ||
4333 | * (even though last/only segment next field is modified, | ||
4334 | * physical image logging requires all segment slots of | ||
4335 | * the entry logged to avoid applying previous updates | ||
4336 | * to the same slots) | ||
4337 | */ | ||
4338 | static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) | ||
4339 | { | ||
4340 | int tsi; /* truncate entry slot index */ | ||
4341 | s8 *stbl; | ||
4342 | struct dtslot *t; | ||
4343 | int si, freecnt; | ||
4344 | struct dt_lock *dtlck = *dtlock; | ||
4345 | struct lv *lv; | ||
4346 | int fsi, xsi, n; | ||
4347 | |||
4348 | /* get free entry slot index */ | ||
4349 | stbl = DT_GETSTBL(p); | ||
4350 | tsi = stbl[ti]; | ||
4351 | |||
4352 | /* open new linelock */ | ||
4353 | if (dtlck->index >= dtlck->maxcnt) | ||
4354 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
4355 | lv = & dtlck->lv[dtlck->index]; | ||
4356 | |||
4357 | lv->offset = tsi; | ||
4358 | |||
4359 | /* get the head/only segment */ | ||
4360 | t = &p->slot[tsi]; | ||
4361 | ASSERT(p->header.flag & BT_INTERNAL); | ||
4362 | ((struct idtentry *) t)->namlen = 0; | ||
4363 | si = ((struct idtentry *) t)->next; | ||
4364 | ((struct idtentry *) t)->next = -1; | ||
4365 | |||
4366 | n = 1; | ||
4367 | freecnt = 0; | ||
4368 | fsi = si; | ||
4369 | xsi = tsi; | ||
4370 | |||
4371 | /* find the last/only segment */ | ||
4372 | while (si >= 0) { | ||
4373 | /* is next slot contiguous ? */ | ||
4374 | if (si != xsi + 1) { | ||
4375 | /* close current linelock */ | ||
4376 | lv->length = n; | ||
4377 | dtlck->index++; | ||
4378 | |||
4379 | /* open new linelock */ | ||
4380 | if (dtlck->index < dtlck->maxcnt) | ||
4381 | lv++; | ||
4382 | else { | ||
4383 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
4384 | lv = & dtlck->lv[0]; | ||
4385 | } | ||
4386 | |||
4387 | lv->offset = si; | ||
4388 | n = 0; | ||
4389 | } | ||
4390 | |||
4391 | n++; | ||
4392 | xsi = si; | ||
4393 | freecnt++; | ||
4394 | |||
4395 | t = &p->slot[si]; | ||
4396 | t->cnt = 1; | ||
4397 | si = t->next; | ||
4398 | } | ||
4399 | |||
4400 | /* close current linelock */ | ||
4401 | lv->length = n; | ||
4402 | dtlck->index++; | ||
4403 | |||
4404 | *dtlock = dtlck; | ||
4405 | |||
4406 | /* update freelist */ | ||
4407 | if (freecnt == 0) | ||
4408 | return; | ||
4409 | t->next = p->header.freelist; | ||
4410 | p->header.freelist = fsi; | ||
4411 | p->header.freecnt += freecnt; | ||
4412 | } | ||
4413 | |||
4414 | |||
4415 | /* | ||
4416 | * dtLinelockFreelist() | ||
4417 | */ | ||
4418 | static void dtLinelockFreelist(dtpage_t * p, /* directory page */ | ||
4419 | int m, /* max slot index */ | ||
4420 | struct dt_lock ** dtlock) | ||
4421 | { | ||
4422 | int fsi; /* free entry slot index */ | ||
4423 | struct dtslot *t; | ||
4424 | int si; | ||
4425 | struct dt_lock *dtlck = *dtlock; | ||
4426 | struct lv *lv; | ||
4427 | int xsi, n; | ||
4428 | |||
4429 | /* get free entry slot index */ | ||
4430 | fsi = p->header.freelist; | ||
4431 | |||
4432 | /* open new linelock */ | ||
4433 | if (dtlck->index >= dtlck->maxcnt) | ||
4434 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
4435 | lv = & dtlck->lv[dtlck->index]; | ||
4436 | |||
4437 | lv->offset = fsi; | ||
4438 | |||
4439 | n = 1; | ||
4440 | xsi = fsi; | ||
4441 | |||
4442 | t = &p->slot[fsi]; | ||
4443 | si = t->next; | ||
4444 | |||
4445 | /* find the last/only segment */ | ||
4446 | while (si < m && si >= 0) { | ||
4447 | /* is next slot contiguous ? */ | ||
4448 | if (si != xsi + 1) { | ||
4449 | /* close current linelock */ | ||
4450 | lv->length = n; | ||
4451 | dtlck->index++; | ||
4452 | |||
4453 | /* open new linelock */ | ||
4454 | if (dtlck->index < dtlck->maxcnt) | ||
4455 | lv++; | ||
4456 | else { | ||
4457 | dtlck = (struct dt_lock *) txLinelock(dtlck); | ||
4458 | lv = & dtlck->lv[0]; | ||
4459 | } | ||
4460 | |||
4461 | lv->offset = si; | ||
4462 | n = 0; | ||
4463 | } | ||
4464 | |||
4465 | n++; | ||
4466 | xsi = si; | ||
4467 | |||
4468 | t = &p->slot[si]; | ||
4469 | si = t->next; | ||
4470 | } | ||
4471 | |||
4472 | /* close current linelock */ | ||
4473 | lv->length = n; | ||
4474 | dtlck->index++; | ||
4475 | |||
4476 | *dtlock = dtlck; | ||
4477 | } | ||
4478 | |||
4479 | |||
4480 | /* | ||
4481 | * NAME: dtModify | ||
4482 | * | ||
4483 | * FUNCTION: Modify the inode number part of a directory entry | ||
4484 | * | ||
4485 | * PARAMETERS: | ||
4486 | * tid - Transaction id | ||
4487 | * ip - Inode of parent directory | ||
4488 | * key - Name of entry to be modified | ||
4489 | * orig_ino - Original inode number expected in entry | ||
4490 | * new_ino - New inode number to put into entry | ||
4491 | * flag - JFS_RENAME | ||
4492 | * | ||
4493 | * RETURNS: | ||
4494 | * -ESTALE - If entry found does not match orig_ino passed in | ||
4495 | * -ENOENT - If no entry can be found to match key | ||
4496 | * 0 - If successfully modified entry | ||
4497 | */ | ||
4498 | int dtModify(tid_t tid, struct inode *ip, | ||
4499 | struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) | ||
4500 | { | ||
4501 | int rc; | ||
4502 | s64 bn; | ||
4503 | struct metapage *mp; | ||
4504 | dtpage_t *p; | ||
4505 | int index; | ||
4506 | struct btstack btstack; | ||
4507 | struct tlock *tlck; | ||
4508 | struct dt_lock *dtlck; | ||
4509 | struct lv *lv; | ||
4510 | s8 *stbl; | ||
4511 | int entry_si; /* entry slot index */ | ||
4512 | struct ldtentry *entry; | ||
4513 | |||
4514 | /* | ||
4515 | * search for the entry to modify: | ||
4516 | * | ||
4517 | * dtSearch() returns (leaf page pinned, index at which to modify). | ||
4518 | */ | ||
4519 | if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) | ||
4520 | return rc; | ||
4521 | |||
4522 | /* retrieve search result */ | ||
4523 | DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
4524 | |||
4525 | BT_MARK_DIRTY(mp, ip); | ||
4526 | /* | ||
4527 | * acquire a transaction lock on the leaf page of named entry | ||
4528 | */ | ||
4529 | tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); | ||
4530 | dtlck = (struct dt_lock *) & tlck->lock; | ||
4531 | |||
4532 | /* get slot index of the entry */ | ||
4533 | stbl = DT_GETSTBL(p); | ||
4534 | entry_si = stbl[index]; | ||
4535 | |||
4536 | /* linelock entry */ | ||
4537 | ASSERT(dtlck->index == 0); | ||
4538 | lv = & dtlck->lv[0]; | ||
4539 | lv->offset = entry_si; | ||
4540 | lv->length = 1; | ||
4541 | dtlck->index++; | ||
4542 | |||
4543 | /* get the head/only segment */ | ||
4544 | entry = (struct ldtentry *) & p->slot[entry_si]; | ||
4545 | |||
4546 | /* substitute the inode number of the entry */ | ||
4547 | entry->inumber = cpu_to_le32(new_ino); | ||
4548 | |||
4549 | /* unpin the leaf page */ | ||
4550 | DT_PUTPAGE(mp); | ||
4551 | |||
4552 | return 0; | ||
4553 | } | ||
4554 | |||
4555 | #ifdef _JFS_DEBUG_DTREE | ||
4556 | /* | ||
4557 | * dtDisplayTree() | ||
4558 | * | ||
4559 | * function: traverse forward | ||
4560 | */ | ||
4561 | int dtDisplayTree(struct inode *ip) | ||
4562 | { | ||
4563 | int rc; | ||
4564 | struct metapage *mp; | ||
4565 | dtpage_t *p; | ||
4566 | s64 bn, pbn; | ||
4567 | int index, lastindex, v, h; | ||
4568 | pxd_t *xd; | ||
4569 | struct btstack btstack; | ||
4570 | struct btframe *btsp; | ||
4571 | struct btframe *parent; | ||
4572 | u8 *stbl; | ||
4573 | int psize = 256; | ||
4574 | |||
4575 | printk("display B+-tree.\n"); | ||
4576 | |||
4577 | /* clear stack */ | ||
4578 | btsp = btstack.stack; | ||
4579 | |||
4580 | /* | ||
4581 | * start with root | ||
4582 | * | ||
4583 | * root resides in the inode | ||
4584 | */ | ||
4585 | bn = 0; | ||
4586 | v = h = 0; | ||
4587 | |||
4588 | /* | ||
4589 | * first access of each page: | ||
4590 | */ | ||
4591 | newPage: | ||
4592 | DT_GETPAGE(ip, bn, mp, psize, p, rc); | ||
4593 | if (rc) | ||
4594 | return rc; | ||
4595 | |||
4596 | /* process entries forward from first index */ | ||
4597 | index = 0; | ||
4598 | lastindex = p->header.nextindex - 1; | ||
4599 | |||
4600 | if (p->header.flag & BT_INTERNAL) { | ||
4601 | /* | ||
4602 | * first access of each internal page | ||
4603 | */ | ||
4604 | printf("internal page "); | ||
4605 | dtDisplayPage(ip, bn, p); | ||
4606 | |||
4607 | goto getChild; | ||
4608 | } else { /* (p->header.flag & BT_LEAF) */ | ||
4609 | |||
4610 | /* | ||
4611 | * first access of each leaf page | ||
4612 | */ | ||
4613 | printf("leaf page "); | ||
4614 | dtDisplayPage(ip, bn, p); | ||
4615 | |||
4616 | /* | ||
4617 | * process leaf page entries | ||
4618 | * | ||
4619 | for ( ; index <= lastindex; index++) | ||
4620 | { | ||
4621 | } | ||
4622 | */ | ||
4623 | |||
4624 | /* unpin the leaf page */ | ||
4625 | DT_PUTPAGE(mp); | ||
4626 | } | ||
4627 | |||
4628 | /* | ||
4629 | * go back up to the parent page | ||
4630 | */ | ||
4631 | getParent: | ||
4632 | /* pop/restore parent entry for the current child page */ | ||
4633 | if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL) | ||
4634 | /* current page must have been root */ | ||
4635 | return; | ||
4636 | |||
4637 | /* | ||
4638 | * parent page scan completed | ||
4639 | */ | ||
4640 | if ((index = parent->index) == (lastindex = parent->lastindex)) { | ||
4641 | /* go back up to the parent page */ | ||
4642 | goto getParent; | ||
4643 | } | ||
4644 | |||
4645 | /* | ||
4646 | * parent page has entries remaining | ||
4647 | */ | ||
4648 | /* get back the parent page */ | ||
4649 | bn = parent->bn; | ||
4650 | /* v = parent->level; */ | ||
4651 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4652 | if (rc) | ||
4653 | return rc; | ||
4654 | |||
4655 | /* get next parent entry */ | ||
4656 | index++; | ||
4657 | |||
4658 | /* | ||
4659 | * internal page: go down to child page of current entry | ||
4660 | */ | ||
4661 | getChild: | ||
4662 | /* push/save current parent entry for the child page */ | ||
4663 | btsp->bn = pbn = bn; | ||
4664 | btsp->index = index; | ||
4665 | btsp->lastindex = lastindex; | ||
4666 | /* btsp->level = v; */ | ||
4667 | /* btsp->node = h; */ | ||
4668 | ++btsp; | ||
4669 | |||
4670 | /* get current entry for the child page */ | ||
4671 | stbl = DT_GETSTBL(p); | ||
4672 | xd = (pxd_t *) & p->slot[stbl[index]]; | ||
4673 | |||
4674 | /* | ||
4675 | * first access of each internal entry: | ||
4676 | */ | ||
4677 | |||
4678 | /* get child page */ | ||
4679 | bn = addressPXD(xd); | ||
4680 | psize = lengthPXD(xd) << ip->i_ipmnt->i_l2bsize; | ||
4681 | |||
4682 | printk("traverse down 0x%Lx[%d]->0x%Lx\n", pbn, index, bn); | ||
4683 | v++; | ||
4684 | h = index; | ||
4685 | |||
4686 | /* release parent page */ | ||
4687 | DT_PUTPAGE(mp); | ||
4688 | |||
4689 | /* process the child page */ | ||
4690 | goto newPage; | ||
4691 | } | ||
4692 | |||
4693 | |||
4694 | /* | ||
4695 | * dtDisplayPage() | ||
4696 | * | ||
4697 | * function: display page | ||
4698 | */ | ||
4699 | int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p) | ||
4700 | { | ||
4701 | int rc; | ||
4702 | struct metapage *mp; | ||
4703 | struct ldtentry *lh; | ||
4704 | struct idtentry *ih; | ||
4705 | pxd_t *xd; | ||
4706 | int i, j; | ||
4707 | u8 *stbl; | ||
4708 | wchar_t name[JFS_NAME_MAX + 1]; | ||
4709 | struct component_name key = { 0, name }; | ||
4710 | int freepage = 0; | ||
4711 | |||
4712 | if (p == NULL) { | ||
4713 | freepage = 1; | ||
4714 | DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4715 | if (rc) | ||
4716 | return rc; | ||
4717 | } | ||
4718 | |||
4719 | /* display page control */ | ||
4720 | printk("bn:0x%Lx flag:0x%08x nextindex:%d\n", | ||
4721 | bn, p->header.flag, p->header.nextindex); | ||
4722 | |||
4723 | /* display entries */ | ||
4724 | stbl = DT_GETSTBL(p); | ||
4725 | for (i = 0, j = 1; i < p->header.nextindex; i++, j++) { | ||
4726 | dtGetKey(p, i, &key, JFS_SBI(ip->i_sb)->mntflag); | ||
4727 | key.name[key.namlen] = '\0'; | ||
4728 | if (p->header.flag & BT_LEAF) { | ||
4729 | lh = (struct ldtentry *) & p->slot[stbl[i]]; | ||
4730 | printf("\t[%d] %s:%d", i, key.name, | ||
4731 | le32_to_cpu(lh->inumber)); | ||
4732 | } else { | ||
4733 | ih = (struct idtentry *) & p->slot[stbl[i]]; | ||
4734 | xd = (pxd_t *) ih; | ||
4735 | bn = addressPXD(xd); | ||
4736 | printf("\t[%d] %s:0x%Lx", i, key.name, bn); | ||
4737 | } | ||
4738 | |||
4739 | if (j == 4) { | ||
4740 | printf("\n"); | ||
4741 | j = 0; | ||
4742 | } | ||
4743 | } | ||
4744 | |||
4745 | printf("\n"); | ||
4746 | |||
4747 | if (freepage) | ||
4748 | DT_PUTPAGE(mp); | ||
4749 | |||
4750 | return 0; | ||
4751 | } | ||
4752 | #endif /* _JFS_DEBUG_DTREE */ | ||
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h new file mode 100644 index 000000000000..273a80130c9d --- /dev/null +++ b/fs/jfs/jfs_dtree.h | |||
@@ -0,0 +1,279 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_DTREE | ||
19 | #define _H_JFS_DTREE | ||
20 | |||
21 | /* | ||
22 | * jfs_dtree.h: directory B+-tree manager | ||
23 | */ | ||
24 | |||
25 | #include "jfs_btree.h" | ||
26 | |||
/*
 * Data associated with a directory entry.  The two members share storage:
 * either the "leaf" triple (transaction id, inode pointer, inode number)
 * or an extent descriptor.
 * NOTE(review): presumably leaf is used for leaf-page entries and xd for
 * internal-page entries - confirm against the callers in jfs_dtree.c.
 */
typedef union {
	struct {
		tid_t tid;
		struct inode *ip;
		u32 ino;
	} leaf;
	pxd_t xd;
} ddata_t;
35 | |||
36 | |||
/*
 *	entry segment/slot
 *
 * an entry consists of type dependent head/only segment/slot and
 * additional segments/slots linked via next field;
 * N.B. last/only segment of entry is terminated by next = -1;
 */
/*
 *	directory page slot
 */
struct dtslot {
	s8 next;		/*  1: index of next slot of the entry, -1 ends it */
	s8 cnt;			/*  1: */
	__le16 name[15];	/* 30: name characters, 16-bit little-endian */
};				/* (32) */


/*
 * slot geometry; the L2* values are the log2 of the matching size
 * (16 = 1 << 4, 32 = 1 << 5)
 */
#define DATASLOTSIZE	16
#define L2DATASLOTSIZE	4
#define	DTSLOTSIZE	32	/* size of struct dtslot in bytes */
#define	L2DTSLOTSIZE	5
#define DTSLOTHDRSIZE	2	/* bytes of next + cnt */
#define DTSLOTDATASIZE	30	/* bytes of name[] */
#define DTSLOTDATALEN	15	/* elements of name[] */
61 | |||
/*
 *	 internal node entry head/only segment
 */
struct idtentry {
	pxd_t xd;		/*  8: child extent descriptor */

	s8 next;		/*  1: next slot of the entry, -1 if none */
	u8 namlen;		/*  1: */
	__le16 name[11];	/* 22: 2-byte aligned */
};				/* (32) */

#define DTIHDRSIZE	10	/* bytes ahead of name[]: xd + next + namlen */
#define DTIHDRDATALEN	11	/* elements of name[] in the head segment */

/*
 * compute number of slots for entry: (klen + 4) divided by 15, rounded up.
 * The 4 matches the difference between the 15 name characters of a plain
 * dtslot and the 11 held by the head segment.
 */
#define	NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 )
78 | |||
79 | |||
/*
 *	leaf node entry head/only segment
 *
 * 	For legacy filesystems, name contains 13 wchars -- no index field
 */
struct ldtentry {
	__le32 inumber;		/*  4: 4-byte aligned */
	s8 next;		/*  1: next slot of the entry, -1 if none */
	u8 namlen;		/*  1: */
	__le16 name[11];	/* 22: 2-byte aligned */
	__le32 index;		/*  4: index into dir_table */
};				/* (32) */

#define DTLHDRSIZE	6	/* bytes ahead of name[]: inumber + next + namlen */
#define DTLHDRDATALEN_LEGACY	13	/* Old (OS/2) format */
#define DTLHDRDATALEN	11	/* elements of name[] when index is present */
96 | |||
/*
 * dir_table used for directory traversal during readdir
 */

/*
 * Keep persistent index for directory entries
 *
 * Non-zero when the filesystem holding INODE has JFS_DIR_INDEX set in its
 * mount flags.
 */
#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX)

/*
 * Maximum entry in inline directory table
 */
#define MAX_INLINE_DIRTABLE_ENTRY 13

/*
 * One dir_table entry.  addr1/addr2 together hold the leaf page address
 * (see DTSaddress()/addressDTS()); addr2 is stored little-endian.
 */
struct dir_table_slot {
	u8 rsrvd;		/* 1: */
	u8 flag;		/* 1: 0 if free */
	u8 slot;		/* 1: slot within leaf page of entry */
	u8 addr1;		/* 1: upper 8 bits of leaf page address */
	__le32 addr2;		/* 4: lower 32 bits of leaf page address -OR-
				   index of next entry when this entry was deleted */
};				/* (8) */

/*
 * flag values
 */
#define DIR_INDEX_VALID 1
#define DIR_INDEX_FREE 0
125 | |||
/*
 * Store a leaf page address in a dir_table_slot: the bits above bit 31 go
 * into the 8-bit addr1 field, the low 32 bits go into addr2 in
 * little-endian order.  address64 is evaluated twice.
 */
#define DTSaddress(dir_table_slot, address64)\
{\
	(dir_table_slot)->addr1 = ((u64)(address64)) >> 32;\
	(dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
}

/* rebuild the s64 address stored by DTSaddress() */
#define addressDTS(dts)\
	( (((s64)((dts)->addr1)) << 32) | __le32_to_cpu((dts)->addr2) )

/* compute number of slots for entry */
#define NDTLEAF_LEGACY(klen)	( ((2 + (klen)) + (15 - 1)) / 15 )
#define	NDTLEAF	NDTINTERNAL
138 | |||
139 | |||
/*
 *	directory root page (in-line in on-disk inode):
 *
 * cf. dtpage_t below.
 *
 * The 32-byte header overlays slot[0]; the whole root is 9 slots
 * (DTROOTMAXSLOT) of 32 bytes.
 */
typedef union {
	struct {
		struct dasd DASD; /* 16: DASD limit/usage info */

		u8 flag;	/*  1: */
		u8 nextindex;	/*  1: next free entry in stbl */
		s8 freecnt;	/*  1: free count */
		s8 freelist;	/*  1: freelist header */

		__le32 idotdot;	/*  4: parent inode number */

		s8 stbl[8];	/*  8: sorted entry index table */
	} header;		/* (32) */

	struct dtslot slot[9];
} dtroot_t;

/* parent inode number recorded in the in-inode directory root, CPU order */
#define PARENT(IP) \
	(le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot))

#define DTROOTMAXSLOT	9	/* number of elements in dtroot_t.slot[] */

/* true when the root's sorted entry table holds no entries */
#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0)
168 | |||
169 | |||
/*
 *	directory regular page:
 *
 *	entry slot array of 32 byte slot
 *
 * sorted entry slot index table (stbl):
 * contiguous slots at slot specified by stblindex,
 * 1-byte per entry
 *   512 byte block:  16 entry tbl (1 slot)
 *  1024 byte block:  32 entry tbl (1 slot)
 *  2048 byte block:  64 entry tbl (2 slot)
 *  4096 byte block: 128 entry tbl (4 slot)
 *
 * data area:
 *   512 byte block:  16 - 2 =  14 slot
 *  1024 byte block:  32 - 2 =  30 slot
 *  2048 byte block:  64 - 3 =  61 slot
 *  4096 byte block: 128 - 5 = 123 slot
 *
 * N.B. index is 0-based; index fields refer to slot index
 * except nextindex which refers to entry index in stbl;
 * end of entry slot list or freelist is marked with -1.
 */
typedef union {
	struct {
		__le64 next;	/*  8: next sibling */
		__le64 prev;	/*  8: previous sibling */

		u8 flag;	/*  1: */
		u8 nextindex;	/*  1: next entry index in stbl */
		s8 freecnt;	/*  1: */
		s8 freelist;	/*  1: slot index of head of freelist */

		u8 maxslot;	/*  1: number of slots in page slot[] */
		u8 stblindex;	/*  1: slot index of start of stbl */
		u8 rsrvd[2];	/*  2: */

		pxd_t self;	/*  8: self pxd */
	} header;		/* (32) */

	struct dtslot slot[128];
} dtpage_t;

#define DTPAGEMAXSLOT        128	/* number of elements in dtpage_t.slot[] */

/*
 * Per page size: size in bytes (*BYTES), slots taken by the sorted entry
 * table (*TSLOTS) and total number of 32-byte slots (*SLOTS); the values
 * match the table in the comment above dtpage_t.
 */
#define DT8THPGNODEBYTES     512
#define DT8THPGNODETSLOTS      1
#define DT8THPGNODESLOTS      16

#define DTQTRPGNODEBYTES    1024
#define DTQTRPGNODETSLOTS      1
#define DTQTRPGNODESLOTS      32

#define DTHALFPGNODEBYTES   2048
#define DTHALFPGNODETSLOTS     2
#define DTHALFPGNODESLOTS     64

#define DTFULLPGNODEBYTES   4096
#define DTFULLPGNODETSLOTS     4
#define DTFULLPGNODESLOTS    128

#define DTENTRYSTART	1
232 | |||
/*
 * get sorted entry table of the page
 *
 * For a page flagged BT_ROOT the table is the stbl[] array inside the
 * dtroot_t header; otherwise it starts at the slot named by
 * header.stblindex.  Either way the result is an s8 pointer, and p is
 * evaluated more than once.
 */
#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\
	((dtroot_t *)(p))->header.stbl : \
	(s8 *)&(p)->slot[(p)->header.stblindex] )
237 | |||
/*
 * Flags for dtSearch
 */
#define JFS_CREATE 1
#define JFS_LOOKUP 2
#define JFS_REMOVE 3
#define JFS_RENAME 4

/*
 * sizeof(struct dirent) with the (JFS_NAME_MAX + 1) two-byte name
 * characters replaced by (namlen + 1) of them, rounded up to a multiple
 * of 4 bytes.
 */
#define DIRENTSIZ(namlen) \
    ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )

/*
 * Maximum file offset for directories.
 */
#define DIREND	INT_MAX
253 | |||
254 | /* | ||
255 | * external declarations | ||
256 | */ | ||
257 | extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot); | ||
258 | |||
259 | extern int dtSearch(struct inode *ip, struct component_name * key, | ||
260 | ino_t * data, struct btstack * btstack, int flag); | ||
261 | |||
262 | extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key, | ||
263 | ino_t * ino, struct btstack * btstack); | ||
264 | |||
265 | extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, | ||
266 | ino_t * data, int flag); | ||
267 | |||
268 | extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, | ||
269 | ino_t * orig_ino, ino_t new_ino, int flag); | ||
270 | |||
271 | extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); | ||
272 | |||
273 | #ifdef _JFS_DEBUG_DTREE | ||
274 | extern int dtDisplayTree(struct inode *ip); | ||
275 | |||
276 | extern int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p); | ||
277 | #endif /* _JFS_DEBUG_DTREE */ | ||
278 | |||
279 | #endif /* !_H_JFS_DTREE */ | ||
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c new file mode 100644 index 000000000000..1953acb79266 --- /dev/null +++ b/fs/jfs/jfs_extent.c | |||
@@ -0,0 +1,668 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/quotaops.h> | ||
21 | #include "jfs_incore.h" | ||
22 | #include "jfs_superblock.h" | ||
23 | #include "jfs_dmap.h" | ||
24 | #include "jfs_extent.h" | ||
25 | #include "jfs_debug.h" | ||
26 | |||
27 | /* | ||
28 | * forward references | ||
29 | */ | ||
30 | static int extBalloc(struct inode *, s64, s64 *, s64 *); | ||
31 | #ifdef _NOTYET | ||
32 | static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); | ||
33 | #endif | ||
34 | static s64 extRoundDown(s64 nb); | ||
35 | |||
36 | /* | ||
37 | * external references | ||
38 | */ | ||
39 | extern int jfs_commit_inode(struct inode *, int); | ||
40 | |||
41 | |||
/*
 * Debug print helpers.
 *
 * The DPD/DPC/DPL/DPX families print the source text of their argument
 * (through preprocessor stringification) in parentheses, followed by its
 * value; the variants ending in 1 finish with a blank instead of a newline.
 *
 *	DPD, DPD1	- int, decimal
 *	DPC		- character
 *	DPL, DPL1	- 64-bit value, hexadecimal
 *	DPX, DPX1	- 32-bit value, zero-padded hexadecimal
 *	DPS, DPS1	- string
 *	DPE, DPE1	- "ENTERING: <string>" marker
 *
 * Every macro is a single expression and is meant to be used as a
 * statement followed by a semicolon.
 */
#define DPD(a)	(printk("(" #a "): %d\n",(a)))
#define DPC(a)	(printk("(" #a "): %c\n",(a)))
#define DPL1(a)	(printk("(" #a "): %llx ",(unsigned long long)(a)))
#define DPL(a)	(printk("(" #a "): %llx\n",(unsigned long long)(a)))

#define DPD1(a)	(printk("(" #a "): %d  ",(a)))
#define DPX(a)	(printk("(" #a "): %08x\n",(a)))
#define DPX1(a)	(printk("(" #a "): %08x  ",(a)))
#define DPS(a)	(printk("%s\n",(a)))
#define DPE(a)	(printk("\nENTERING: %s\n",(a)))
#define DPE1(a)	(printk("\nENTERING: %s",(a)))
#define DPS1(a)	(printk("  %s  ",(a)))
66 | |||
67 | |||
68 | /* | ||
69 | * NAME: extAlloc() | ||
70 | * | ||
71 | * FUNCTION: allocate an extent for a specified page range within a | ||
72 | * file. | ||
73 | * | ||
74 | * PARAMETERS: | ||
75 | * ip - the inode of the file. | ||
76 | * xlen - requested extent length. | ||
77 | * pno - the starting page number with the file. | ||
78 | * xp - pointer to an xad. on entry, xad describes an | ||
79 | * extent that is used as an allocation hint if the | ||
80 | * xaddr of the xad is non-zero. on successful exit, | ||
81 | * the xad describes the newly allocated extent. | ||
82 | * abnr - boolean_t indicating whether the newly allocated extent | ||
83 | * should be marked as allocated but not recorded. | ||
84 | * | ||
85 | * RETURN VALUES: | ||
86 | * 0 - success | ||
87 | * -EIO - i/o error. | ||
88 | * -ENOSPC - insufficient disk resources. | ||
89 | */ | ||
90 | int | ||
91 | extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr) | ||
92 | { | ||
93 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
94 | s64 nxlen, nxaddr, xoff, hint, xaddr = 0; | ||
95 | int rc; | ||
96 | int xflag; | ||
97 | |||
98 | /* This blocks if we are low on resources */ | ||
99 | txBeginAnon(ip->i_sb); | ||
100 | |||
101 | /* Avoid race with jfs_commit_inode() */ | ||
102 | down(&JFS_IP(ip)->commit_sem); | ||
103 | |||
104 | /* validate extent length */ | ||
105 | if (xlen > MAXXLEN) | ||
106 | xlen = MAXXLEN; | ||
107 | |||
108 | /* get the page's starting extent offset */ | ||
109 | xoff = pno << sbi->l2nbperpage; | ||
110 | |||
111 | /* check if an allocation hint was provided */ | ||
112 | if ((hint = addressXAD(xp))) { | ||
113 | /* get the size of the extent described by the hint */ | ||
114 | nxlen = lengthXAD(xp); | ||
115 | |||
116 | /* check if the hint is for the portion of the file | ||
117 | * immediately previous to the current allocation | ||
118 | * request and if hint extent has the same abnr | ||
119 | * value as the current request. if so, we can | ||
120 | * extend the hint extent to include the current | ||
121 | * extent if we can allocate the blocks immediately | ||
122 | * following the hint extent. | ||
123 | */ | ||
124 | if (offsetXAD(xp) + nxlen == xoff && | ||
125 | abnr == ((xp->flag & XAD_NOTRECORDED) ? TRUE : FALSE)) | ||
126 | xaddr = hint + nxlen; | ||
127 | |||
128 | /* adjust the hint to the last block of the extent */ | ||
129 | hint += (nxlen - 1); | ||
130 | } | ||
131 | |||
132 | /* allocate the disk blocks for the extent. initially, extBalloc() | ||
133 | * will try to allocate disk blocks for the requested size (xlen). | ||
134 | * if this fails (xlen contigious free blocks not avaliable), it'll | ||
135 | * try to allocate a smaller number of blocks (producing a smaller | ||
136 | * extent), with this smaller number of blocks consisting of the | ||
137 | * requested number of blocks rounded down to the next smaller | ||
138 | * power of 2 number (i.e. 16 -> 8). it'll continue to round down | ||
139 | * and retry the allocation until the number of blocks to allocate | ||
140 | * is smaller than the number of blocks per page. | ||
141 | */ | ||
142 | nxlen = xlen; | ||
143 | if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) { | ||
144 | up(&JFS_IP(ip)->commit_sem); | ||
145 | return (rc); | ||
146 | } | ||
147 | |||
148 | /* Allocate blocks to quota. */ | ||
149 | if (DQUOT_ALLOC_BLOCK(ip, nxlen)) { | ||
150 | dbFree(ip, nxaddr, (s64) nxlen); | ||
151 | up(&JFS_IP(ip)->commit_sem); | ||
152 | return -EDQUOT; | ||
153 | } | ||
154 | |||
155 | /* determine the value of the extent flag */ | ||
156 | xflag = (abnr == TRUE) ? XAD_NOTRECORDED : 0; | ||
157 | |||
158 | /* if we can extend the hint extent to cover the current request, | ||
159 | * extend it. otherwise, insert a new extent to | ||
160 | * cover the current request. | ||
161 | */ | ||
162 | if (xaddr && xaddr == nxaddr) | ||
163 | rc = xtExtend(0, ip, xoff, (int) nxlen, 0); | ||
164 | else | ||
165 | rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0); | ||
166 | |||
167 | /* if the extend or insert failed, | ||
168 | * free the newly allocated blocks and return the error. | ||
169 | */ | ||
170 | if (rc) { | ||
171 | dbFree(ip, nxaddr, nxlen); | ||
172 | DQUOT_FREE_BLOCK(ip, nxlen); | ||
173 | up(&JFS_IP(ip)->commit_sem); | ||
174 | return (rc); | ||
175 | } | ||
176 | |||
177 | /* set the results of the extent allocation */ | ||
178 | XADaddress(xp, nxaddr); | ||
179 | XADlength(xp, nxlen); | ||
180 | XADoffset(xp, xoff); | ||
181 | xp->flag = xflag; | ||
182 | |||
183 | mark_inode_dirty(ip); | ||
184 | |||
185 | up(&JFS_IP(ip)->commit_sem); | ||
186 | /* | ||
187 | * COMMIT_SyncList flags an anonymous tlock on page that is on | ||
188 | * sync list. | ||
189 | * We need to commit the inode to get the page written disk. | ||
190 | */ | ||
191 | if (test_and_clear_cflag(COMMIT_Synclist,ip)) | ||
192 | jfs_commit_inode(ip, 0); | ||
193 | |||
194 | return (0); | ||
195 | } | ||
196 | |||
197 | |||
#ifdef _NOTYET
/*
 * NAME:	extRealloc()
 *
 * FUNCTION:    extend the allocation of a file extent containing a
 *		partial back last page.
 *
 * PARAMETERS:
 *	ip	- the inode of the file.
 *	nxlen	- request size of the resulting extent (clamped to MAXXLEN).
 *	xp	- pointer to an xad. on entry, the xad describes the extent
 *		  to be extended. on successful exit, the xad
 *		  describes the newly allocated extent.
 *	abnr	- boolean_t indicating whether the newly allocated extent
 *		  should be marked as allocated but not recorded.
 *
 * RETURN VALUES:
 *      0       - success
 *      -EDQUOT	- the quota allocation was refused.
 *      other	- error returned by xtUpdate(), extBrealloc(), xtExtend()
 *		  or xtTailgate().
 *
 * This function is only compiled when _NOTYET is defined.
 */
int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
{
	struct super_block *sb = ip->i_sb;
	s64 xaddr, xlen, nxaddr, delta, xoff;
	s64 ntail, nextend, ninsert;
	int rc, nbperpage = JFS_SBI(sb)->nbperpage;
	int xflag;

	/* This blocks if we are low on resources */
	txBeginAnon(ip->i_sb);

	down(&JFS_IP(ip)->commit_sem);
	/* validate extent length */
	if (nxlen > MAXXLEN)
		nxlen = MAXXLEN;

	/* get the extend (partial) page's disk block address and
	 * number of blocks.
	 */
	xaddr = addressXAD(xp);
	xlen = lengthXAD(xp);
	xoff = offsetXAD(xp);

	/* if the extend page is abnr and if the request is for
	 * the extent to be allocated and recorded,
	 * make the page allocated and recorded.
	 */
	if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
		xp->flag = 0;
		if ((rc = xtUpdate(0, ip, xp)))
			goto exit;
	}

	/* try to allocate the requested number of blocks for the
	 * extent.  extBrealloc() first tries to satisfy the request
	 * by extending the allocation in place.  otherwise, it will
	 * try to allocate a new set of blocks large enough for the
	 * request.  in satisfying a request, extBrealloc() may allocate
	 * less than what was requested but will always allocate enough
	 * space as to satisfy the extend page.
	 */
	if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr)))
		goto exit;

	/* Allocate blocks to quota. */
	/*
	 * NOTE(review): quota is charged for the full nxlen rather than for
	 * the nxlen - xlen blocks that are new, and on failure the whole
	 * range [nxaddr, nxaddr + nxlen) is freed; after an in-place
	 * extension (nxaddr == xaddr) that range starts with the xlen
	 * blocks the extent already owned - confirm before enabling.
	 */
	if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
		dbFree(ip, nxaddr, (s64) nxlen);
		up(&JFS_IP(ip)->commit_sem);
		return -EDQUOT;
	}

	/* number of blocks gained over the original extent */
	delta = nxlen - xlen;

	/* check if the extend page is not abnr but the request is abnr
	 * and the allocated disk space is for more than one page.  if this
	 * is the case, there is a mismatch of abnr between the extend page
	 * and the one or more pages following the extend page.  as a result,
	 * two extents will have to be manipulated. the first will be that
	 * of the extent of the extend page and will be manipulated thru
	 * an xtExtend() or an xtTailgate(), depending upon whether the
	 * disk allocation occurred as an inplace extension.  the second
	 * extent will be manipulated (created) through an xtInsert() and
	 * will be for the pages following the extend page.
	 */
	if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) {
		ntail = nbperpage;
		nextend = ntail - xlen;
		ninsert = nxlen - nbperpage;

		xflag = XAD_NOTRECORDED;
	} else {
		ntail = nxlen;
		nextend = delta;
		ninsert = 0;

		xflag = xp->flag;
	}

	/* if we were able to extend the disk allocation in place,
	 * extend the extent.  otherwise, move the extent to a
	 * new disk location.
	 */
	if (xaddr == nxaddr) {
		/* extend the extent */
		if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
			/* give back only the blocks added behind the extent */
			dbFree(ip, xaddr + xlen, delta);
			DQUOT_FREE_BLOCK(ip, nxlen);
			goto exit;
		}
	} else {
		/*
		 * move the extent to a new location:
		 *
		 * xtTailgate() accounts for relocated tail extent;
		 */
		if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
			dbFree(ip, nxaddr, nxlen);
			DQUOT_FREE_BLOCK(ip, nxlen);
			goto exit;
		}
	}


	/* check if we need to also insert a new extent */
	if (ninsert) {
		/* perform the insert.  if it fails, free the blocks
		 * to be inserted and make it appear that we only did
		 * the xtExtend() or xtTailgate() above.
		 */
		/*
		 * NOTE(review): the quota charged for the ninsert blocks is
		 * not released on this failure path, and rc is left at 0 -
		 * confirm that this is intended.
		 */
		xaddr = nxaddr + ntail;
		if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert,
			      &xaddr, 0)) {
			dbFree(ip, xaddr, (s64) ninsert);
			delta = nextend;
			nxlen = ntail;
			xflag = 0;
		}
	}

	/* set the return results */
	XADaddress(xp, nxaddr);
	XADlength(xp, nxlen);
	XADoffset(xp, xoff);
	xp->flag = xflag;

	mark_inode_dirty(ip);
exit:
	/* common exit: drop the commit semaphore taken at the top */
	up(&JFS_IP(ip)->commit_sem);
	return (rc);
}
#endif			/* _NOTYET */
350 | |||
351 | |||
352 | /* | ||
353 | * NAME: extHint() | ||
354 | * | ||
355 | * FUNCTION: produce an extent allocation hint for a file offset. | ||
356 | * | ||
357 | * PARAMETERS: | ||
358 | * ip - the inode of the file. | ||
359 | * offset - file offset for which the hint is needed. | ||
360 | * xp - pointer to the xad that is to be filled in with | ||
361 | * the hint. | ||
362 | * | ||
363 | * RETURN VALUES: | ||
364 | * 0 - success | ||
365 | * -EIO - i/o error. | ||
366 | */ | ||
367 | int extHint(struct inode *ip, s64 offset, xad_t * xp) | ||
368 | { | ||
369 | struct super_block *sb = ip->i_sb; | ||
370 | struct xadlist xadl; | ||
371 | struct lxdlist lxdl; | ||
372 | lxd_t lxd; | ||
373 | s64 prev; | ||
374 | int rc, nbperpage = JFS_SBI(sb)->nbperpage; | ||
375 | |||
376 | /* init the hint as "no hint provided" */ | ||
377 | XADaddress(xp, 0); | ||
378 | |||
379 | /* determine the starting extent offset of the page previous | ||
380 | * to the page containing the offset. | ||
381 | */ | ||
382 | prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage; | ||
383 | |||
384 | /* if the offsets in the first page of the file, | ||
385 | * no hint provided. | ||
386 | */ | ||
387 | if (prev < 0) | ||
388 | return (0); | ||
389 | |||
390 | /* prepare to lookup the previous page's extent info */ | ||
391 | lxdl.maxnlxd = 1; | ||
392 | lxdl.nlxd = 1; | ||
393 | lxdl.lxd = &lxd; | ||
394 | LXDoffset(&lxd, prev) | ||
395 | LXDlength(&lxd, nbperpage); | ||
396 | |||
397 | xadl.maxnxad = 1; | ||
398 | xadl.nxad = 0; | ||
399 | xadl.xad = xp; | ||
400 | |||
401 | /* perform the lookup */ | ||
402 | if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) | ||
403 | return (rc); | ||
404 | |||
405 | /* check if not extent exists for the previous page. | ||
406 | * this is possible for sparse files. | ||
407 | */ | ||
408 | if (xadl.nxad == 0) { | ||
409 | // assert(ISSPARSE(ip)); | ||
410 | return (0); | ||
411 | } | ||
412 | |||
413 | /* only preserve the abnr flag within the xad flags | ||
414 | * of the returned hint. | ||
415 | */ | ||
416 | xp->flag &= XAD_NOTRECORDED; | ||
417 | |||
418 | if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { | ||
419 | jfs_error(ip->i_sb, "extHint: corrupt xtree"); | ||
420 | return -EIO; | ||
421 | } | ||
422 | |||
423 | return (0); | ||
424 | } | ||
425 | |||
426 | |||
427 | /* | ||
428 | * NAME: extRecord() | ||
429 | * | ||
430 | * FUNCTION: change a page with a file from not recorded to recorded. | ||
431 | * | ||
432 | * PARAMETERS: | ||
433 | * ip - inode of the file. | ||
434 | * cp - cbuf of the file page. | ||
435 | * | ||
436 | * RETURN VALUES: | ||
437 | * 0 - success | ||
438 | * -EIO - i/o error. | ||
439 | * -ENOSPC - insufficient disk resources. | ||
440 | */ | ||
441 | int extRecord(struct inode *ip, xad_t * xp) | ||
442 | { | ||
443 | int rc; | ||
444 | |||
445 | txBeginAnon(ip->i_sb); | ||
446 | |||
447 | down(&JFS_IP(ip)->commit_sem); | ||
448 | |||
449 | /* update the extent */ | ||
450 | rc = xtUpdate(0, ip, xp); | ||
451 | |||
452 | up(&JFS_IP(ip)->commit_sem); | ||
453 | return rc; | ||
454 | } | ||
455 | |||
456 | |||
#ifdef _NOTYET
/*
 * NAME:        extFill()
 *
 * FUNCTION:    allocate disk space for a file page that represents
 *		a file hole.
 *
 * PARAMETERS:
 *	ip	- the inode of the file.
 *	xp	- xad whose offset names the page to be backed; on
 *		  successful exit it describes the extent allocated by
 *		  extAlloc().
 *
 * RETURN VALUES:
 *      0       - success
 *      other	- error returned by extAlloc().
 *
 * This function is only compiled when _NOTYET is defined.
 */
int extFill(struct inode *ip, xad_t * xp)
{
	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
	int rc, nbperpage = sbi->nbperpage;
	/*
	 * extAlloc() takes a page number and turns it into a block offset
	 * with "pno << l2nbperpage"; apply the inverse to the xad's block
	 * offset.
	 */
	s64 pno = offsetXAD(xp) >> sbi->l2nbperpage;

//      assert(ISSPARSE(ip));

	/* initialize the extent allocation hint */
	XADaddress(xp, 0);

	/* allocate an extent to fill the hole */
	if ((rc = extAlloc(ip, nbperpage, pno, xp, FALSE)))
		return (rc);

	/* xp is an xad, so its length is read with lengthXAD() */
	assert(lengthXAD(xp) == nbperpage);

	return (0);
}
#endif			/* _NOTYET */
492 | |||
493 | |||
494 | /* | ||
495 | * NAME: extBalloc() | ||
496 | * | ||
497 | * FUNCTION: allocate disk blocks to form an extent. | ||
498 | * | ||
499 | * initially, we will try to allocate disk blocks for the | ||
500 | * requested size (nblocks). if this fails (nblocks | ||
501 | * contigious free blocks not avaliable), we'll try to allocate | ||
502 | * a smaller number of blocks (producing a smaller extent), with | ||
503 | * this smaller number of blocks consisting of the requested | ||
504 | * number of blocks rounded down to the next smaller power of 2 | ||
505 | * number (i.e. 16 -> 8). we'll continue to round down and | ||
506 | * retry the allocation until the number of blocks to allocate | ||
507 | * is smaller than the number of blocks per page. | ||
508 | * | ||
509 | * PARAMETERS: | ||
510 | * ip - the inode of the file. | ||
511 | * hint - disk block number to be used as an allocation hint. | ||
512 | * *nblocks - pointer to an s64 value. on entry, this value specifies | ||
513 | * the desired number of block to be allocated. on successful | ||
514 | * exit, this value is set to the number of blocks actually | ||
515 | * allocated. | ||
516 | * blkno - pointer to a block address that is filled in on successful | ||
517 | * return with the starting block number of the newly | ||
518 | * allocated block range. | ||
519 | * | ||
520 | * RETURN VALUES: | ||
521 | * 0 - success | ||
522 | * -EIO - i/o error. | ||
523 | * -ENOSPC - insufficient disk resources. | ||
524 | */ | ||
525 | static int | ||
526 | extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) | ||
527 | { | ||
528 | struct jfs_inode_info *ji = JFS_IP(ip); | ||
529 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
530 | s64 nb, nblks, daddr, max; | ||
531 | int rc, nbperpage = sbi->nbperpage; | ||
532 | struct bmap *bmp = sbi->bmap; | ||
533 | int ag; | ||
534 | |||
535 | /* get the number of blocks to initially attempt to allocate. | ||
536 | * we'll first try the number of blocks requested unless this | ||
537 | * number is greater than the maximum number of contigious free | ||
538 | * blocks in the map. in that case, we'll start off with the | ||
539 | * maximum free. | ||
540 | */ | ||
541 | max = (s64) 1 << bmp->db_maxfreebud; | ||
542 | if (*nblocks >= max && *nblocks > nbperpage) | ||
543 | nb = nblks = (max > nbperpage) ? max : nbperpage; | ||
544 | else | ||
545 | nb = nblks = *nblocks; | ||
546 | |||
547 | /* try to allocate blocks */ | ||
548 | while ((rc = dbAlloc(ip, hint, nb, &daddr)) != 0) { | ||
549 | /* if something other than an out of space error, | ||
550 | * stop and return this error. | ||
551 | */ | ||
552 | if (rc != -ENOSPC) | ||
553 | return (rc); | ||
554 | |||
555 | /* decrease the allocation request size */ | ||
556 | nb = min(nblks, extRoundDown(nb)); | ||
557 | |||
558 | /* give up if we cannot cover a page */ | ||
559 | if (nb < nbperpage) | ||
560 | return (rc); | ||
561 | } | ||
562 | |||
563 | *nblocks = nb; | ||
564 | *blkno = daddr; | ||
565 | |||
566 | if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) { | ||
567 | ag = BLKTOAG(daddr, sbi); | ||
568 | spin_lock_irq(&ji->ag_lock); | ||
569 | if (ji->active_ag == -1) { | ||
570 | atomic_inc(&bmp->db_active[ag]); | ||
571 | ji->active_ag = ag; | ||
572 | } else if (ji->active_ag != ag) { | ||
573 | atomic_dec(&bmp->db_active[ji->active_ag]); | ||
574 | atomic_inc(&bmp->db_active[ag]); | ||
575 | ji->active_ag = ag; | ||
576 | } | ||
577 | spin_unlock_irq(&ji->ag_lock); | ||
578 | } | ||
579 | |||
580 | return (0); | ||
581 | } | ||
582 | |||
583 | |||
#ifdef _NOTYET
/*
 * NAME:	extBrealloc()
 *
 * FUNCTION:    attempt to extend an extent's allocation.
 *
 *		the extension is first tried in place with dbExtend().
 *		if that reports -ENOSPC, a new block range is requested
 *		from extBalloc(), which starts with the requested size
 *		(*newnblks) and falls back to smaller power-of-2 sizes
 *		as described there.
 *
 * PARAMETERS:
 *	ip	 - the inode of the file.
 *	blkno    - starting block number of the extent's current allocation.
 *	nblks    - number of blocks within the extent's current allocation.
 *	newnblks - pointer to a s64 value.  on entry, the new desired
 *		   extent size (number of blocks).  on exit after a move,
 *		   the size extBalloc() allocated; left untouched by an
 *		   in-place extension.
 *	newblkno - set to the starting block number of the extent's new
 *		   allocation.
 *
 * RETURN VALUES:
 *      0       - success
 *      other	- error from dbExtend() (other than -ENOSPC) or from
 *		  extBalloc().
 *
 * This function is only compiled when _NOTYET is defined.
 */
static int
extBrealloc(struct inode *ip,
	    s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno)
{
	int rc;

	/* try to extend in place */
	rc = dbExtend(ip, blkno, nblks, *newnblks - nblks);
	if (rc == 0) {
		*newblkno = blkno;
		return (0);
	}

	/* anything but "no space behind the extent" is final */
	if (rc != -ENOSPC)
		return (rc);

	/* in place extension not possible.
	 * try to move the extent to a new set of blocks.
	 */
	return (extBalloc(ip, blkno, newnblks, newblkno));
}
#endif			/* _NOTYET */
639 | |||
640 | |||
641 | /* | ||
642 | * NAME: extRoundDown() | ||
643 | * | ||
644 | * FUNCTION: round down a specified number of blocks to the next | ||
645 | * smallest power of 2 number. | ||
646 | * | ||
647 | * PARAMETERS: | ||
648 | * nb - the inode of the file. | ||
649 | * | ||
650 | * RETURN VALUES: | ||
651 | * next smallest power of 2 number. | ||
652 | */ | ||
653 | static s64 extRoundDown(s64 nb) | ||
654 | { | ||
655 | int i; | ||
656 | u64 m, k; | ||
657 | |||
658 | for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) { | ||
659 | if (m & nb) | ||
660 | break; | ||
661 | } | ||
662 | |||
663 | i = 63 - i; | ||
664 | k = (u64) 1 << i; | ||
665 | k = ((k - 1) & nb) ? k : k >> 1; | ||
666 | |||
667 | return (k); | ||
668 | } | ||
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h new file mode 100644 index 000000000000..e80fc7ced87d --- /dev/null +++ b/fs/jfs/jfs_extent.h | |||
@@ -0,0 +1,31 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2001 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_EXTENT | ||
19 | #define _H_JFS_EXTENT | ||
20 | |||
21 | /* get block allocation hint as location of disk inode: address + length - 1 of JFS_IP(ip)->ixpxd */ | ||
22 | #define INOHINT(ip) \ | ||
23 | (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) | ||
24 | |||
25 | extern int extAlloc(struct inode *, s64, s64, xad_t *, boolean_t); | ||
26 | extern int extFill(struct inode *, xad_t *); | ||
27 | extern int extHint(struct inode *, s64, xad_t *); | ||
28 | extern int extRealloc(struct inode *, s64, xad_t *, boolean_t); | ||
29 | extern int extRecord(struct inode *, xad_t *); | ||
30 | |||
31 | #endif /* _H_JFS_EXTENT */ | ||
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h new file mode 100644 index 000000000000..86ccac80f0ab --- /dev/null +++ b/fs/jfs/jfs_filsys.h | |||
@@ -0,0 +1,280 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2003 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_FILSYS | ||
19 | #define _H_JFS_FILSYS | ||
20 | |||
21 | /* | ||
22 | * jfs_filsys.h | ||
23 | * | ||
24 | * file system (implementation-dependent) constants | ||
25 | * | ||
26 | * refer to <limits.h> for system-wide implementation-dependent constants | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * file system options (superblock flags) | ||
31 | */ | ||
32 | /* mount time flag to disable journaling to disk */ | ||
33 | #define JFS_NOINTEGRITY 0x00000010 | ||
34 | |||
35 | /* mount time flags for error handling */ | ||
36 | #define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ | ||
37 | #define JFS_ERR_CONTINUE 0x00000004 /* continue */ | ||
38 | #define JFS_ERR_PANIC 0x00000008 /* panic */ | ||
39 | |||
40 | /* platform options (conditional compilation) */ | ||
41 | #define JFS_AIX 0x80000000 /* AIX support */ | ||
42 | /* POSIX name/directory support */ | ||
43 | |||
44 | #define JFS_OS2 0x40000000 /* OS/2 support */ | ||
45 | /* case-insensitive name/directory support */ | ||
46 | |||
47 | #define JFS_DFS 0x20000000 /* DCE DFS LFS support */ | ||
48 | |||
49 | #define JFS_LINUX 0x10000000 /* Linux support */ | ||
50 | /* case-sensitive name/directory support */ | ||
51 | |||
52 | /* directory option */ | ||
53 | #define JFS_UNICODE 0x00000001 /* unicode name */ | ||
54 | |||
55 | /* commit options */ | ||
56 | #define JFS_COMMIT 0x00000f00 /* commit option mask */ | ||
57 | #define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ | ||
58 | #define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ | ||
59 | #define JFS_TMPFS 0x00000400 /* temporary file system - | ||
60 | * do not log/commit | ||
61 | */ | ||
62 | |||
63 | /* log logical volume options */ | ||
64 | #define JFS_INLINELOG 0x00000800 /* inline log within file system */ | ||
65 | #define JFS_INLINEMOVE 0x00001000 /* inline log being moved */ | ||
66 | |||
67 | /* Secondary aggregate inode table */ | ||
68 | #define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */ | ||
69 | |||
70 | /* sparse regular file support */ | ||
71 | #define JFS_SPARSE 0x00020000 /* sparse regular file */ | ||
72 | |||
73 | /* DASD Limits F226941 */ | ||
74 | #define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ | ||
75 | #define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ | ||
76 | |||
77 | /* big endian flag */ | ||
78 | #define JFS_SWAP_BYTES 0x00100000 /* running on a big endian computer */ | ||
79 | |||
80 | /* Directory index */ | ||
81 | #define JFS_DIR_INDEX 0x00200000 /* Persistent index for */ | ||
82 | /* directory entries */ | ||
83 | |||
84 | |||
85 | /* | ||
86 | * buffer cache configuration | ||
87 | */ | ||
88 | /* page size: drop any earlier PSIZE definition so the value below is the one used */ | ||
89 | #ifdef PSIZE | ||
90 | #undef PSIZE | ||
91 | #endif | ||
92 | #define PSIZE 4096 /* page size (in bytes) */ | ||
93 | #define L2PSIZE 12 /* log2(PSIZE) */ | ||
94 | #define POFFSET 4095 /* mask for byte offset within page (PSIZE - 1) */ | ||
95 | |||
96 | /* buffer page size */ | ||
97 | #define BPSIZE PSIZE | ||
98 | |||
99 | /* | ||
100 | * fs fundamental size | ||
101 | * | ||
102 | * PSIZE >= file system block size >= PBSIZE >= DISIZE | ||
103 | */ | ||
104 | #define PBSIZE 512 /* physical block size (in bytes) */ | ||
105 | #define L2PBSIZE 9 /* log2(PBSIZE) */ | ||
106 | |||
107 | #define DISIZE 512 /* on-disk inode size (in bytes) */ | ||
108 | #define L2DISIZE 9 /* log2(DISIZE) */ | ||
109 | |||
110 | #define IDATASIZE 256 /* inode inline data size */ | ||
111 | #define IXATTRSIZE 128 /* inode inline extended attribute size */ | ||
112 | |||
113 | #define XTPAGE_SIZE 4096 | ||
114 | #define log2_PAGESIZE 12 | ||
115 | |||
116 | #define IAG_SIZE 4096 | ||
117 | #define IAG_EXTENT_SIZE 4096 | ||
118 | #define INOSPERIAG 4096 /* number of disk inodes per iag */ | ||
119 | #define L2INOSPERIAG 12 /* log2(INOSPERIAG) */ | ||
120 | #define INOSPEREXT 32 /* number of disk inodes per extent */ | ||
121 | #define L2INOSPEREXT 5 /* log2(INOSPEREXT) */ | ||
122 | #define IXSIZE (DISIZE * INOSPEREXT) /* inode extent size (in bytes) */ | ||
123 | #define INOSPERPAGE 8 /* number of disk inodes per 4K page */ | ||
124 | #define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */ | ||
125 | |||
126 | #define IAGFREELIST_LWM 64 | ||
127 | |||
128 | #define INODE_EXTENT_SIZE IXSIZE /* inode extent size */ | ||
129 | #define NUM_INODE_PER_EXTENT INOSPEREXT | ||
130 | #define NUM_INODE_PER_IAG INOSPERIAG | ||
131 | |||
132 | #define MINBLOCKSIZE 512 | ||
133 | #define MAXBLOCKSIZE 4096 | ||
134 | #define MAXFILESIZE ((s64)1 << 52) | ||
135 | |||
136 | #define JFS_LINK_MAX 0xffffffff | ||
137 | |||
138 | /* Minimum number of bytes supported for a JFS partition (0x1000000 = 16M) */ | ||
139 | #define MINJFS (0x1000000) | ||
140 | #define MINJFSTEXT "16" | ||
141 | |||
142 | /* | ||
143 | * file system block size -> physical block size | ||
144 | */ | ||
145 | #define LBOFFSET(x) ((x) & (PBSIZE - 1)) | ||
146 | #define LBNUMBER(x) ((x) >> L2PBSIZE) | ||
147 | #define LBLK2PBLK(sb,b) ((b) << (sb->s_blocksize_bits - L2PBSIZE)) | ||
148 | #define PBLK2LBLK(sb,b) ((b) >> (sb->s_blocksize_bits - L2PBSIZE)) | ||
149 | /* size in bytes -> last page number */ | ||
150 | #define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) ) | ||
151 | /* size in bytes -> last file system block number */ | ||
152 | #define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) ) | ||
153 | |||
154 | /* | ||
155 | * fixed physical block address (physical block size = 512 bytes) | ||
156 | * | ||
157 | * NOTE: since we can't guarantee a physical block size of 512 bytes the use of | ||
158 | * these macros should be removed and the byte offset macros used instead. | ||
159 | */ | ||
160 | #define SUPER1_B 64 /* primary superblock */ | ||
161 | #define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map */ | ||
162 | #define AITBL_B (AIMAP_B + 16) /* | ||
163 | * 1st extent of aggregate inode table | ||
164 | */ | ||
165 | #define SUPER2_B (AITBL_B + 32) /* secondary superblock pbn */ | ||
166 | #define BMAP_B (SUPER2_B + 8) /* block allocation map */ | ||
167 | |||
168 | /* | ||
169 | * SIZE_OF_SUPER defines the total amount of space reserved on disk for the | ||
170 | * superblock. This is not the same as the superblock structure, since all of | ||
171 | * this space is not currently being used. | ||
172 | */ | ||
173 | #define SIZE_OF_SUPER PSIZE | ||
174 | |||
175 | /* | ||
176 | * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table | ||
177 | */ | ||
178 | #define SIZE_OF_AG_TABLE PSIZE | ||
179 | |||
180 | /* | ||
181 | * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of | ||
182 | * the inode allocation map (to hold iag) | ||
183 | */ | ||
184 | #define SIZE_OF_MAP_PAGE PSIZE | ||
185 | |||
186 | /* | ||
187 | * fixed byte offset address | ||
188 | */ | ||
189 | #define SUPER1_OFF 0x8000 /* primary superblock */ | ||
190 | #define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER) | ||
191 | /* | ||
192 | * Control page of aggregate inode map | ||
193 | * followed by 1st extent of map | ||
194 | */ | ||
195 | #define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1)) | ||
196 | /* | ||
197 | * 1st extent of aggregate inode table | ||
198 | */ | ||
199 | #define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE) | ||
200 | /* | ||
201 | * secondary superblock | ||
202 | */ | ||
203 | #define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER) | ||
204 | /* | ||
205 | * block allocation map | ||
206 | */ | ||
207 | |||
208 | /* | ||
209 | * The following macro is used to indicate the number of reserved disk blocks at | ||
210 | * the front of an aggregate, in terms of 512-byte physical blocks. This value | ||
211 | * is currently 64 blocks (32K bytes). This turns out to be the same as the | ||
212 | * primary superblock's address, since it directly follows the reserved blocks. | ||
213 | */ | ||
214 | #define AGGR_RSVD_BLOCKS SUPER1_B | ||
215 | |||
216 | /* | ||
217 | * The following macro is used to indicate the number of reserved bytes at the | ||
218 | * front of an aggregate. This value is currently defined to be 32K. This | ||
219 | * turns out to be the same as the primary superblock's byte offset, since it | ||
220 | * directly follows the reserved blocks. | ||
221 | */ | ||
222 | #define AGGR_RSVD_BYTES SUPER1_OFF | ||
223 | |||
224 | /* | ||
225 | * The following macro defines the byte offset for the first inode extent in | ||
226 | * the aggregate inode table. This allows us to find the self inode to find the | ||
227 | * rest of the table. Currently this value is 44K (0x8000 + 4K + 8K). | ||
228 | */ | ||
229 | #define AGGR_INODE_TABLE_START AITBL_OFF | ||
230 | |||
231 | /* | ||
232 | * fixed reserved inode numbers | ||
233 | */ | ||
234 | /* aggregate inodes */ | ||
235 | #define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */ | ||
236 | #define AGGREGATE_I 1 /* aggregate inode map inode */ | ||
237 | #define BMAP_I 2 /* aggregate block allocation map inode */ | ||
238 | #define LOG_I 3 /* aggregate inline log inode */ | ||
239 | #define BADBLOCK_I 4 /* aggregate bad block inode */ | ||
240 | #define FILESYSTEM_I 16 /* 1st/only fileset inode in ait: | ||
241 | * fileset inode map inode | ||
242 | */ | ||
243 | |||
244 | /* per-fileset inodes */ | ||
245 | #define FILESET_RSVD_I 0 /* fileset inode (reserved) */ | ||
246 | #define FILESET_EXT_I 1 /* fileset inode extension */ | ||
247 | #define ROOT_I 2 /* fileset root inode */ | ||
248 | #define ACL_I 3 /* fileset ACL inode */ | ||
249 | |||
250 | #define FILESET_OBJECT_I 4 /* the first fileset inode available for a file | ||
251 | * or directory or link... | ||
252 | */ | ||
253 | #define FIRST_FILESET_INO 16 /* the first aggregate inode which describes | ||
254 | * an inode. (To fsck this is also the first | ||
255 | * inode in part 2 of the agg inode table.) | ||
256 | */ | ||
257 | |||
258 | /* | ||
259 | * directory configuration | ||
260 | */ | ||
261 | #define JFS_NAME_MAX 255 | ||
262 | #define JFS_PATH_MAX BPSIZE | ||
263 | |||
264 | |||
265 | /* | ||
266 | * file system state (superblock state) | ||
267 | */ | ||
268 | #define FM_CLEAN 0x00000000 /* file system is unmounted and clean */ | ||
269 | #define FM_MOUNT 0x00000001 /* file system is mounted cleanly */ | ||
270 | #define FM_DIRTY 0x00000002 /* file system was not unmounted and clean | ||
271 | * when mounted, or a | ||
272 | * commit failure occurred while being mounted: | ||
273 | * fsck() must be run to repair | ||
274 | */ | ||
275 | #define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed: | ||
276 | * fsck() must be run to repair | ||
277 | */ | ||
278 | #define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ | ||
279 | |||
280 | #endif /* _H_JFS_FILSYS */ | ||
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c new file mode 100644 index 000000000000..783831301625 --- /dev/null +++ b/fs/jfs/jfs_imap.c | |||
@@ -0,0 +1,3270 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * jfs_imap.c: inode allocation map manager | ||
21 | * | ||
22 | * Serialization: | ||
23 | * Each AG has a simple lock which is used to control the serialization of | ||
24 | * the AG level lists. This lock should be taken first whenever an AG | ||
25 | * level list will be modified or accessed. | ||
26 | * | ||
27 | * Each IAG is locked by obtaining the buffer for the IAG page. | ||
28 | * | ||
29 | * There is also an inode lock for the inode map inode. A read lock needs to | ||
30 | * be taken whenever an IAG is read from the map or the global level | ||
31 | * information is read. A write lock needs to be taken whenever the global | ||
32 | * level information is modified or an atomic operation needs to be used. | ||
33 | * | ||
34 | * If more than one IAG is read at one time, the read lock may not | ||
35 | * be given up until all of the IAG's are read. Otherwise, a deadlock | ||
36 | * may occur when trying to obtain the read lock while another thread | ||
37 | * holding the read lock is waiting on the IAG already being held. | ||
38 | * | ||
39 | * The control page of the inode map is read into memory by diMount(). | ||
40 | * Thereafter it should only be modified in memory and then it will be | ||
41 | * written out when the filesystem is unmounted by diUnmount(). | ||
42 | */ | ||
43 | |||
44 | #include <linux/fs.h> | ||
45 | #include <linux/buffer_head.h> | ||
46 | #include <linux/pagemap.h> | ||
47 | #include <linux/quotaops.h> | ||
48 | |||
49 | #include "jfs_incore.h" | ||
50 | #include "jfs_filsys.h" | ||
51 | #include "jfs_dinode.h" | ||
52 | #include "jfs_dmap.h" | ||
53 | #include "jfs_imap.h" | ||
54 | #include "jfs_metapage.h" | ||
55 | #include "jfs_superblock.h" | ||
56 | #include "jfs_debug.h" | ||
57 | |||
58 | /* | ||
59 | * imap locks (thin wrappers around init_MUTEX()/down()/up()) | ||
60 | */ | ||
61 | /* iag free list lock: imap->im_freelock */ | ||
62 | #define IAGFREE_LOCK_INIT(imap) init_MUTEX(&imap->im_freelock) | ||
63 | #define IAGFREE_LOCK(imap) down(&imap->im_freelock) | ||
64 | #define IAGFREE_UNLOCK(imap) up(&imap->im_freelock) | ||
65 | |||
66 | /* per ag iag list locks: one entry of imap->im_aglock[] per ag */ | ||
67 | #define AG_LOCK_INIT(imap,index) init_MUTEX(&(imap->im_aglock[index])) | ||
68 | #define AG_LOCK(imap,agno) down(&imap->im_aglock[agno]) | ||
69 | #define AG_UNLOCK(imap,agno) up(&imap->im_aglock[agno]) | ||
70 | |||
71 | /* | ||
72 | * external references | ||
73 | */ | ||
74 | extern struct address_space_operations jfs_aops; | ||
75 | |||
76 | /* | ||
77 | * forward references (static helpers declared ahead of their use) | ||
78 | */ | ||
79 | static int diAllocAG(struct inomap *, int, boolean_t, struct inode *); | ||
80 | static int diAllocAny(struct inomap *, int, boolean_t, struct inode *); | ||
81 | static int diAllocBit(struct inomap *, struct iag *, int); | ||
82 | static int diAllocExt(struct inomap *, int, struct inode *); | ||
83 | static int diAllocIno(struct inomap *, int, struct inode *); | ||
84 | static int diFindFree(u32, int); | ||
85 | static int diNewExt(struct inomap *, struct iag *, int); | ||
86 | static int diNewIAG(struct inomap *, int *, int, struct metapage **); | ||
87 | static void duplicateIXtree(struct super_block *, s64, int, s64 *); | ||
88 | |||
89 | static int diIAGRead(struct inomap * imap, int, struct metapage **); | ||
90 | static int copy_from_dinode(struct dinode *, struct inode *); | ||
91 | static void copy_to_dinode(struct dinode *, struct inode *); | ||
92 | |||
93 | /* | ||
94 | * debug code for double-checking inode map; disabled unless _JFS_DEBUG_IMAP is defined | ||
95 | */ | ||
96 | /* #define _JFS_DEBUG_IMAP 1 */ | ||
97 | |||
98 | #ifdef _JFS_DEBUG_IMAP | ||
99 | #define DBG_DIINIT(imap) DBGdiInit(imap) | ||
100 | #define DBG_DIALLOC(imap, ino) DBGdiAlloc(imap, ino) | ||
101 | #define DBG_DIFREE(imap, ino) DBGdiFree(imap, ino) | ||
102 | |||
103 | static void *DBGdiInit(struct inomap * imap); | ||
104 | static void DBGdiAlloc(struct inomap * imap, ino_t ino); | ||
105 | static void DBGdiFree(struct inomap * imap, ino_t ino); | ||
106 | #else | ||
107 | #define DBG_DIINIT(imap) | ||
108 | #define DBG_DIALLOC(imap, ino) | ||
109 | #define DBG_DIFREE(imap, ino) | ||
110 | #endif /* _JFS_DEBUG_IMAP */ | ||
111 | |||
112 | /* | ||
113 | * NAME: diMount() | ||
114 | * | ||
115 | * FUNCTION: initialize the incore inode map control structures for | ||
116 | * a fileset or aggregate init time. | ||
117 | * | ||
118 | * the inode map's control structure (dinomap) is | ||
119 | * brought in from disk and placed in virtual memory. | ||
120 | * | ||
121 | * PARAMETERS: | ||
122 | * ipimap - pointer to inode map inode for the aggregate or fileset. | ||
123 | * | ||
124 | * RETURN VALUES: | ||
125 | * 0 - success | ||
126 | * -ENOMEM - insufficient free virtual memory. | ||
127 | * -EIO - i/o error. | ||
128 | */ | ||
129 | int diMount(struct inode *ipimap) | ||
130 | { | ||
131 | struct inomap *imap; | ||
132 | struct metapage *mp; | ||
133 | int index; | ||
134 | struct dinomap_disk *dinom_le; | ||
135 | |||
136 | /* | ||
137 | * allocate/initialize the in-memory inode map control structure | ||
138 | */ | ||
139 | /* allocate the in-memory inode map control structure. */ | ||
140 | imap = (struct inomap *) kmalloc(sizeof(struct inomap), GFP_KERNEL); | ||
141 | if (imap == NULL) { | ||
142 | jfs_err("diMount: kmalloc returned NULL!"); | ||
143 | return -ENOMEM; | ||
144 | } | ||
145 | |||
146 | /* read the on-disk inode map control structure. */ | ||
147 | |||
148 | mp = read_metapage(ipimap, | ||
149 | IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, | ||
150 | PSIZE, 0); | ||
151 | if (mp == NULL) { | ||
152 | kfree(imap); | ||
153 | return -EIO; | ||
154 | } | ||
155 | |||
156 | /* copy the on-disk version to the in-memory version. */ | ||
157 | dinom_le = (struct dinomap_disk *) mp->data; | ||
158 | imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag); | ||
159 | imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag); | ||
160 | atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos)); | ||
161 | atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree)); | ||
162 | imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext); | ||
163 | imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext); | ||
164 | for (index = 0; index < MAXAG; index++) { | ||
165 | imap->im_agctl[index].inofree = | ||
166 | le32_to_cpu(dinom_le->in_agctl[index].inofree); | ||
167 | imap->im_agctl[index].extfree = | ||
168 | le32_to_cpu(dinom_le->in_agctl[index].extfree); | ||
169 | imap->im_agctl[index].numinos = | ||
170 | le32_to_cpu(dinom_le->in_agctl[index].numinos); | ||
171 | imap->im_agctl[index].numfree = | ||
172 | le32_to_cpu(dinom_le->in_agctl[index].numfree); | ||
173 | } | ||
174 | |||
175 | /* release the buffer. */ | ||
176 | release_metapage(mp); | ||
177 | |||
178 | /* | ||
179 | * allocate/initialize inode allocation map locks | ||
180 | */ | ||
181 | /* allocate and init iag free list lock */ | ||
182 | IAGFREE_LOCK_INIT(imap); | ||
183 | |||
184 | /* allocate and init ag list locks */ | ||
185 | for (index = 0; index < MAXAG; index++) { | ||
186 | AG_LOCK_INIT(imap, index); | ||
187 | } | ||
188 | |||
189 | /* bind the inode map inode and inode map control structure | ||
190 | * to each other. | ||
191 | */ | ||
192 | imap->im_ipimap = ipimap; | ||
193 | JFS_IP(ipimap)->i_imap = imap; | ||
194 | |||
195 | // DBG_DIINIT(imap); | ||
196 | |||
197 | return (0); | ||
198 | } | ||
199 | |||
200 | |||
201 | /* | ||
202 | * NAME: diUnmount() | ||
203 | * | ||
204 | * FUNCTION: write to disk the incore inode map control structures for | ||
205 | * a fileset or aggregate at unmount time. | ||
206 | * | ||
207 | * PARAMETERS: | ||
208 | * ipimap - pointer to inode map inode for the aggregate or fileset. | ||
209 | * | ||
210 | * RETURN VALUES: | ||
211 | * 0 - success | ||
212 | * -ENOMEM - insufficient free virtual memory. | ||
213 | * -EIO - i/o error. | ||
214 | */ | ||
215 | int diUnmount(struct inode *ipimap, int mounterror) | ||
216 | { | ||
217 | struct inomap *imap = JFS_IP(ipimap)->i_imap; | ||
218 | |||
219 | /* | ||
220 | * update the on-disk inode map control structure | ||
221 | */ | ||
222 | |||
223 | if (!(mounterror || isReadOnly(ipimap))) | ||
224 | diSync(ipimap); | ||
225 | |||
226 | /* | ||
227 | * Invalidate the page cache buffers | ||
228 | */ | ||
229 | truncate_inode_pages(ipimap->i_mapping, 0); | ||
230 | |||
231 | /* | ||
232 | * free in-memory control structure | ||
233 | */ | ||
234 | kfree(imap); | ||
235 | |||
236 | return (0); | ||
237 | } | ||
238 | |||
239 | |||
240 | /* | ||
241 | * diSync() | ||
242 | */ | ||
243 | int diSync(struct inode *ipimap) | ||
244 | { | ||
245 | struct dinomap_disk *dinom_le; | ||
246 | struct inomap *imp = JFS_IP(ipimap)->i_imap; | ||
247 | struct metapage *mp; | ||
248 | int index; | ||
249 | |||
250 | /* | ||
251 | * write imap global conrol page | ||
252 | */ | ||
253 | /* read the on-disk inode map control structure */ | ||
254 | mp = get_metapage(ipimap, | ||
255 | IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, | ||
256 | PSIZE, 0); | ||
257 | if (mp == NULL) { | ||
258 | jfs_err("diSync: get_metapage failed!"); | ||
259 | return -EIO; | ||
260 | } | ||
261 | |||
262 | /* copy the in-memory version to the on-disk version */ | ||
263 | dinom_le = (struct dinomap_disk *) mp->data; | ||
264 | dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag); | ||
265 | dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag); | ||
266 | dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos)); | ||
267 | dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree)); | ||
268 | dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext); | ||
269 | dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext); | ||
270 | for (index = 0; index < MAXAG; index++) { | ||
271 | dinom_le->in_agctl[index].inofree = | ||
272 | cpu_to_le32(imp->im_agctl[index].inofree); | ||
273 | dinom_le->in_agctl[index].extfree = | ||
274 | cpu_to_le32(imp->im_agctl[index].extfree); | ||
275 | dinom_le->in_agctl[index].numinos = | ||
276 | cpu_to_le32(imp->im_agctl[index].numinos); | ||
277 | dinom_le->in_agctl[index].numfree = | ||
278 | cpu_to_le32(imp->im_agctl[index].numfree); | ||
279 | } | ||
280 | |||
281 | /* write out the control structure */ | ||
282 | write_metapage(mp); | ||
283 | |||
284 | /* | ||
285 | * write out dirty pages of imap | ||
286 | */ | ||
287 | filemap_fdatawrite(ipimap->i_mapping); | ||
288 | filemap_fdatawait(ipimap->i_mapping); | ||
289 | |||
290 | diWriteSpecial(ipimap, 0); | ||
291 | |||
292 | return (0); | ||
293 | } | ||
294 | |||
295 | |||
296 | /* | ||
297 | * NAME: diRead() | ||
298 | * | ||
299 | * FUNCTION: initialize an incore inode from disk. | ||
300 | * | ||
301 | * on entry, the specifed incore inode should itself | ||
302 | * specify the disk inode number corresponding to the | ||
303 | * incore inode (i.e. i_number should be initialized). | ||
304 | * | ||
305 | * this routine handles incore inode initialization for | ||
306 | * both "special" and "regular" inodes. special inodes | ||
307 | * are those required early in the mount process and | ||
308 | * require special handling since much of the file system | ||
309 | * is not yet initialized. these "special" inodes are | ||
310 | * identified by a NULL inode map inode pointer and are | ||
311 | * actually initialized by a call to diReadSpecial(). | ||
312 | * | ||
313 | * for regular inodes, the iag describing the disk inode | ||
314 | * is read from disk to determine the inode extent address | ||
315 | * for the disk inode. with the inode extent address in | ||
316 | * hand, the page of the extent that contains the disk | ||
317 | * inode is read and the disk inode is copied to the | ||
318 | * incore inode. | ||
319 | * | ||
320 | * PARAMETERS: | ||
321 | * ip - pointer to incore inode to be initialized from disk. | ||
322 | * | ||
323 | * RETURN VALUES: | ||
324 | * 0 - success | ||
325 | * -EIO - i/o error. | ||
326 | * -ENOMEM - insufficient memory | ||
327 | * | ||
328 | */ | ||
329 | int diRead(struct inode *ip) | ||
330 | { | ||
331 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
332 | int iagno, ino, extno, rc; | ||
333 | struct inode *ipimap; | ||
334 | struct dinode *dp; | ||
335 | struct iag *iagp; | ||
336 | struct metapage *mp; | ||
337 | s64 blkno, agstart; | ||
338 | struct inomap *imap; | ||
339 | int block_offset; | ||
340 | int inodes_left; | ||
341 | uint pageno; | ||
342 | int rel_inode; | ||
343 | |||
344 | jfs_info("diRead: ino = %ld", ip->i_ino); | ||
345 | |||
346 | ipimap = sbi->ipimap; | ||
347 | JFS_IP(ip)->ipimap = ipimap; | ||
348 | |||
349 | /* determine the iag number for this inode (number) */ | ||
350 | iagno = INOTOIAG(ip->i_ino); | ||
351 | |||
352 | /* read the iag */ | ||
353 | imap = JFS_IP(ipimap)->i_imap; | ||
354 | IREAD_LOCK(ipimap); | ||
355 | rc = diIAGRead(imap, iagno, &mp); | ||
356 | IREAD_UNLOCK(ipimap); | ||
357 | if (rc) { | ||
358 | jfs_err("diRead: diIAGRead returned %d", rc); | ||
359 | return (rc); | ||
360 | } | ||
361 | |||
362 | iagp = (struct iag *) mp->data; | ||
363 | |||
364 | /* determine inode extent that holds the disk inode */ | ||
365 | ino = ip->i_ino & (INOSPERIAG - 1); | ||
366 | extno = ino >> L2INOSPEREXT; | ||
367 | |||
368 | if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) || | ||
369 | (addressPXD(&iagp->inoext[extno]) == 0)) { | ||
370 | release_metapage(mp); | ||
371 | return -ESTALE; | ||
372 | } | ||
373 | |||
374 | /* get disk block number of the page within the inode extent | ||
375 | * that holds the disk inode. | ||
376 | */ | ||
377 | blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage); | ||
378 | |||
379 | /* get the ag for the iag */ | ||
380 | agstart = le64_to_cpu(iagp->agstart); | ||
381 | |||
382 | release_metapage(mp); | ||
383 | |||
384 | rel_inode = (ino & (INOSPERPAGE - 1)); | ||
385 | pageno = blkno >> sbi->l2nbperpage; | ||
386 | |||
387 | if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { | ||
388 | /* | ||
389 | * OS/2 didn't always align inode extents on page boundaries | ||
390 | */ | ||
391 | inodes_left = | ||
392 | (sbi->nbperpage - block_offset) << sbi->l2niperblk; | ||
393 | |||
394 | if (rel_inode < inodes_left) | ||
395 | rel_inode += block_offset << sbi->l2niperblk; | ||
396 | else { | ||
397 | pageno += 1; | ||
398 | rel_inode -= inodes_left; | ||
399 | } | ||
400 | } | ||
401 | |||
402 | /* read the page of disk inode */ | ||
403 | mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); | ||
404 | if (mp == 0) { | ||
405 | jfs_err("diRead: read_metapage failed"); | ||
406 | return -EIO; | ||
407 | } | ||
408 | |||
409 | /* locate the the disk inode requested */ | ||
410 | dp = (struct dinode *) mp->data; | ||
411 | dp += rel_inode; | ||
412 | |||
413 | if (ip->i_ino != le32_to_cpu(dp->di_number)) { | ||
414 | jfs_error(ip->i_sb, "diRead: i_ino != di_number"); | ||
415 | rc = -EIO; | ||
416 | } else if (le32_to_cpu(dp->di_nlink) == 0) | ||
417 | rc = -ESTALE; | ||
418 | else | ||
419 | /* copy the disk inode to the in-memory inode */ | ||
420 | rc = copy_from_dinode(dp, ip); | ||
421 | |||
422 | release_metapage(mp); | ||
423 | |||
424 | /* set the ag for the inode */ | ||
425 | JFS_IP(ip)->agno = BLKTOAG(agstart, sbi); | ||
426 | JFS_IP(ip)->active_ag = -1; | ||
427 | |||
428 | return (rc); | ||
429 | } | ||
430 | |||
431 | |||
432 | /* | ||
433 | * NAME: diReadSpecial() | ||
434 | * | ||
435 | * FUNCTION: initialize a 'special' inode from disk. | ||
436 | * | ||
437 | * this routines handles aggregate level inodes. The | ||
438 | * inode cache cannot differentiate between the | ||
439 | * aggregate inodes and the filesystem inodes, so we | ||
440 | * handle these here. We don't actually use the aggregate | ||
441 | * inode map, since these inodes are at a fixed location | ||
442 | * and in some cases the aggregate inode map isn't initialized | ||
443 | * yet. | ||
444 | * | ||
445 | * PARAMETERS: | ||
446 | * sb - filesystem superblock | ||
447 | * inum - aggregate inode number | ||
448 | * secondary - 1 if secondary aggregate inode table | ||
449 | * | ||
450 | * RETURN VALUES: | ||
451 | * new inode - success | ||
452 | * NULL - i/o error. | ||
453 | */ | ||
454 | struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) | ||
455 | { | ||
456 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
457 | uint address; | ||
458 | struct dinode *dp; | ||
459 | struct inode *ip; | ||
460 | struct metapage *mp; | ||
461 | |||
462 | ip = new_inode(sb); | ||
463 | if (ip == NULL) { | ||
464 | jfs_err("diReadSpecial: new_inode returned NULL!"); | ||
465 | return ip; | ||
466 | } | ||
467 | |||
468 | if (secondary) { | ||
469 | address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; | ||
470 | JFS_IP(ip)->ipimap = sbi->ipaimap2; | ||
471 | } else { | ||
472 | address = AITBL_OFF >> L2PSIZE; | ||
473 | JFS_IP(ip)->ipimap = sbi->ipaimap; | ||
474 | } | ||
475 | |||
476 | ASSERT(inum < INOSPEREXT); | ||
477 | |||
478 | ip->i_ino = inum; | ||
479 | |||
480 | address += inum >> 3; /* 8 inodes per 4K page */ | ||
481 | |||
482 | /* read the page of fixed disk inode (AIT) in raw mode */ | ||
483 | mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); | ||
484 | if (mp == NULL) { | ||
485 | ip->i_nlink = 1; /* Don't want iput() deleting it */ | ||
486 | iput(ip); | ||
487 | return (NULL); | ||
488 | } | ||
489 | |||
490 | /* get the pointer to the disk inode of interest */ | ||
491 | dp = (struct dinode *) (mp->data); | ||
492 | dp += inum % 8; /* 8 inodes per 4K page */ | ||
493 | |||
494 | /* copy on-disk inode to in-memory inode */ | ||
495 | if ((copy_from_dinode(dp, ip)) != 0) { | ||
496 | /* handle bad return by returning NULL for ip */ | ||
497 | ip->i_nlink = 1; /* Don't want iput() deleting it */ | ||
498 | iput(ip); | ||
499 | /* release the page */ | ||
500 | release_metapage(mp); | ||
501 | return (NULL); | ||
502 | |||
503 | } | ||
504 | |||
505 | ip->i_mapping->a_ops = &jfs_aops; | ||
506 | mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS); | ||
507 | |||
508 | /* Allocations to metadata inodes should not affect quotas */ | ||
509 | ip->i_flags |= S_NOQUOTA; | ||
510 | |||
511 | if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) { | ||
512 | sbi->gengen = le32_to_cpu(dp->di_gengen); | ||
513 | sbi->inostamp = le32_to_cpu(dp->di_inostamp); | ||
514 | } | ||
515 | |||
516 | /* release the page */ | ||
517 | release_metapage(mp); | ||
518 | |||
519 | return (ip); | ||
520 | } | ||
521 | |||
522 | /* | ||
523 | * NAME: diWriteSpecial() | ||
524 | * | ||
525 | * FUNCTION: Write the special inode to disk | ||
526 | * | ||
527 | * PARAMETERS: | ||
528 | * ip - special inode | ||
529 | * secondary - 1 if secondary aggregate inode table | ||
530 | * | ||
531 | * RETURN VALUES: none | ||
532 | */ | ||
533 | |||
534 | void diWriteSpecial(struct inode *ip, int secondary) | ||
535 | { | ||
536 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
537 | uint address; | ||
538 | struct dinode *dp; | ||
539 | ino_t inum = ip->i_ino; | ||
540 | struct metapage *mp; | ||
541 | |||
542 | ip->i_state &= ~I_DIRTY; | ||
543 | |||
544 | if (secondary) | ||
545 | address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; | ||
546 | else | ||
547 | address = AITBL_OFF >> L2PSIZE; | ||
548 | |||
549 | ASSERT(inum < INOSPEREXT); | ||
550 | |||
551 | address += inum >> 3; /* 8 inodes per 4K page */ | ||
552 | |||
553 | /* read the page of fixed disk inode (AIT) in raw mode */ | ||
554 | mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); | ||
555 | if (mp == NULL) { | ||
556 | jfs_err("diWriteSpecial: failed to read aggregate inode " | ||
557 | "extent!"); | ||
558 | return; | ||
559 | } | ||
560 | |||
561 | /* get the pointer to the disk inode of interest */ | ||
562 | dp = (struct dinode *) (mp->data); | ||
563 | dp += inum % 8; /* 8 inodes per 4K page */ | ||
564 | |||
565 | /* copy on-disk inode to in-memory inode */ | ||
566 | copy_to_dinode(dp, ip); | ||
567 | memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288); | ||
568 | |||
569 | if (inum == FILESYSTEM_I) | ||
570 | dp->di_gengen = cpu_to_le32(sbi->gengen); | ||
571 | |||
572 | /* write the page */ | ||
573 | write_metapage(mp); | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * NAME: diFreeSpecial() | ||
578 | * | ||
579 | * FUNCTION: Free allocated space for special inode | ||
580 | */ | ||
581 | void diFreeSpecial(struct inode *ip) | ||
582 | { | ||
583 | if (ip == NULL) { | ||
584 | jfs_err("diFreeSpecial called with NULL ip!"); | ||
585 | return; | ||
586 | } | ||
587 | filemap_fdatawrite(ip->i_mapping); | ||
588 | filemap_fdatawait(ip->i_mapping); | ||
589 | truncate_inode_pages(ip->i_mapping, 0); | ||
590 | iput(ip); | ||
591 | } | ||
592 | |||
593 | |||
594 | |||
595 | /* | ||
596 | * NAME: diWrite() | ||
597 | * | ||
598 | * FUNCTION: write the on-disk inode portion of the in-memory inode | ||
599 | * to its corresponding on-disk inode. | ||
600 | * | ||
601 | * on entry, the specifed incore inode should itself | ||
602 | * specify the disk inode number corresponding to the | ||
603 | * incore inode (i.e. i_number should be initialized). | ||
604 | * | ||
605 | * the inode contains the inode extent address for the disk | ||
606 | * inode. with the inode extent address in hand, the | ||
607 | * page of the extent that contains the disk inode is | ||
608 | * read and the disk inode portion of the incore inode | ||
609 | * is copied to the disk inode. | ||
610 | * | ||
611 | * PARAMETERS: | ||
612 | * tid - transacation id | ||
613 | * ip - pointer to incore inode to be written to the inode extent. | ||
614 | * | ||
615 | * RETURN VALUES: | ||
616 | * 0 - success | ||
617 | * -EIO - i/o error. | ||
618 | */ | ||
619 | int diWrite(tid_t tid, struct inode *ip) | ||
620 | { | ||
621 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
622 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
623 | int rc = 0; | ||
624 | s32 ino; | ||
625 | struct dinode *dp; | ||
626 | s64 blkno; | ||
627 | int block_offset; | ||
628 | int inodes_left; | ||
629 | struct metapage *mp; | ||
630 | uint pageno; | ||
631 | int rel_inode; | ||
632 | int dioffset; | ||
633 | struct inode *ipimap; | ||
634 | uint type; | ||
635 | lid_t lid; | ||
636 | struct tlock *ditlck, *tlck; | ||
637 | struct linelock *dilinelock, *ilinelock; | ||
638 | struct lv *lv; | ||
639 | int n; | ||
640 | |||
641 | ipimap = jfs_ip->ipimap; | ||
642 | |||
643 | ino = ip->i_ino & (INOSPERIAG - 1); | ||
644 | |||
645 | if (!addressPXD(&(jfs_ip->ixpxd)) || | ||
646 | (lengthPXD(&(jfs_ip->ixpxd)) != | ||
647 | JFS_IP(ipimap)->i_imap->im_nbperiext)) { | ||
648 | jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); | ||
649 | return -EIO; | ||
650 | } | ||
651 | |||
652 | /* | ||
653 | * read the page of disk inode containing the specified inode: | ||
654 | */ | ||
655 | /* compute the block address of the page */ | ||
656 | blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); | ||
657 | |||
658 | rel_inode = (ino & (INOSPERPAGE - 1)); | ||
659 | pageno = blkno >> sbi->l2nbperpage; | ||
660 | |||
661 | if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { | ||
662 | /* | ||
663 | * OS/2 didn't always align inode extents on page boundaries | ||
664 | */ | ||
665 | inodes_left = | ||
666 | (sbi->nbperpage - block_offset) << sbi->l2niperblk; | ||
667 | |||
668 | if (rel_inode < inodes_left) | ||
669 | rel_inode += block_offset << sbi->l2niperblk; | ||
670 | else { | ||
671 | pageno += 1; | ||
672 | rel_inode -= inodes_left; | ||
673 | } | ||
674 | } | ||
675 | /* read the page of disk inode */ | ||
676 | retry: | ||
677 | mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); | ||
678 | if (mp == 0) | ||
679 | return -EIO; | ||
680 | |||
681 | /* get the pointer to the disk inode */ | ||
682 | dp = (struct dinode *) mp->data; | ||
683 | dp += rel_inode; | ||
684 | |||
685 | dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; | ||
686 | |||
687 | /* | ||
688 | * acquire transaction lock on the on-disk inode; | ||
689 | * N.B. tlock is acquired on ipimap not ip; | ||
690 | */ | ||
691 | if ((ditlck = | ||
692 | txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) | ||
693 | goto retry; | ||
694 | dilinelock = (struct linelock *) & ditlck->lock; | ||
695 | |||
696 | /* | ||
697 | * copy btree root from in-memory inode to on-disk inode | ||
698 | * | ||
699 | * (tlock is taken from inline B+-tree root in in-memory | ||
700 | * inode when the B+-tree root is updated, which is pointed | ||
701 | * by jfs_ip->blid as well as being on tx tlock list) | ||
702 | * | ||
703 | * further processing of btree root is based on the copy | ||
704 | * in in-memory inode, where txLog() will log from, and, | ||
705 | * for xtree root, txUpdateMap() will update map and reset | ||
706 | * XAD_NEW bit; | ||
707 | */ | ||
708 | |||
709 | if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { | ||
710 | /* | ||
711 | * This is the special xtree inside the directory for storing | ||
712 | * the directory table | ||
713 | */ | ||
714 | xtpage_t *p, *xp; | ||
715 | xad_t *xad; | ||
716 | |||
717 | jfs_ip->xtlid = 0; | ||
718 | tlck = lid_to_tlock(lid); | ||
719 | assert(tlck->type & tlckXTREE); | ||
720 | tlck->type |= tlckBTROOT; | ||
721 | tlck->mp = mp; | ||
722 | ilinelock = (struct linelock *) & tlck->lock; | ||
723 | |||
724 | /* | ||
725 | * copy xtree root from inode to dinode: | ||
726 | */ | ||
727 | p = &jfs_ip->i_xtroot; | ||
728 | xp = (xtpage_t *) &dp->di_dirtable; | ||
729 | lv = ilinelock->lv; | ||
730 | for (n = 0; n < ilinelock->index; n++, lv++) { | ||
731 | memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], | ||
732 | lv->length << L2XTSLOTSIZE); | ||
733 | } | ||
734 | |||
735 | /* reset on-disk (metadata page) xtree XAD_NEW bit */ | ||
736 | xad = &xp->xad[XTENTRYSTART]; | ||
737 | for (n = XTENTRYSTART; | ||
738 | n < le16_to_cpu(xp->header.nextindex); n++, xad++) | ||
739 | if (xad->flag & (XAD_NEW | XAD_EXTENDED)) | ||
740 | xad->flag &= ~(XAD_NEW | XAD_EXTENDED); | ||
741 | } | ||
742 | |||
743 | if ((lid = jfs_ip->blid) == 0) | ||
744 | goto inlineData; | ||
745 | jfs_ip->blid = 0; | ||
746 | |||
747 | tlck = lid_to_tlock(lid); | ||
748 | type = tlck->type; | ||
749 | tlck->type |= tlckBTROOT; | ||
750 | tlck->mp = mp; | ||
751 | ilinelock = (struct linelock *) & tlck->lock; | ||
752 | |||
753 | /* | ||
754 | * regular file: 16 byte (XAD slot) granularity | ||
755 | */ | ||
756 | if (type & tlckXTREE) { | ||
757 | xtpage_t *p, *xp; | ||
758 | xad_t *xad; | ||
759 | |||
760 | /* | ||
761 | * copy xtree root from inode to dinode: | ||
762 | */ | ||
763 | p = &jfs_ip->i_xtroot; | ||
764 | xp = &dp->di_xtroot; | ||
765 | lv = ilinelock->lv; | ||
766 | for (n = 0; n < ilinelock->index; n++, lv++) { | ||
767 | memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], | ||
768 | lv->length << L2XTSLOTSIZE); | ||
769 | } | ||
770 | |||
771 | /* reset on-disk (metadata page) xtree XAD_NEW bit */ | ||
772 | xad = &xp->xad[XTENTRYSTART]; | ||
773 | for (n = XTENTRYSTART; | ||
774 | n < le16_to_cpu(xp->header.nextindex); n++, xad++) | ||
775 | if (xad->flag & (XAD_NEW | XAD_EXTENDED)) | ||
776 | xad->flag &= ~(XAD_NEW | XAD_EXTENDED); | ||
777 | } | ||
778 | /* | ||
779 | * directory: 32 byte (directory entry slot) granularity | ||
780 | */ | ||
781 | else if (type & tlckDTREE) { | ||
782 | dtpage_t *p, *xp; | ||
783 | |||
784 | /* | ||
785 | * copy dtree root from inode to dinode: | ||
786 | */ | ||
787 | p = (dtpage_t *) &jfs_ip->i_dtroot; | ||
788 | xp = (dtpage_t *) & dp->di_dtroot; | ||
789 | lv = ilinelock->lv; | ||
790 | for (n = 0; n < ilinelock->index; n++, lv++) { | ||
791 | memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], | ||
792 | lv->length << L2DTSLOTSIZE); | ||
793 | } | ||
794 | } else { | ||
795 | jfs_err("diWrite: UFO tlock"); | ||
796 | } | ||
797 | |||
798 | inlineData: | ||
799 | /* | ||
800 | * copy inline symlink from in-memory inode to on-disk inode | ||
801 | */ | ||
802 | if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { | ||
803 | lv = & dilinelock->lv[dilinelock->index]; | ||
804 | lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; | ||
805 | lv->length = 2; | ||
806 | memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); | ||
807 | dilinelock->index++; | ||
808 | } | ||
809 | /* | ||
810 | * copy inline data from in-memory inode to on-disk inode: | ||
811 | * 128 byte slot granularity | ||
812 | */ | ||
813 | if (test_cflag(COMMIT_Inlineea, ip)) { | ||
814 | lv = & dilinelock->lv[dilinelock->index]; | ||
815 | lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; | ||
816 | lv->length = 1; | ||
817 | memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE); | ||
818 | dilinelock->index++; | ||
819 | |||
820 | clear_cflag(COMMIT_Inlineea, ip); | ||
821 | } | ||
822 | |||
823 | /* | ||
824 | * lock/copy inode base: 128 byte slot granularity | ||
825 | */ | ||
826 | // baseDinode: | ||
827 | lv = & dilinelock->lv[dilinelock->index]; | ||
828 | lv->offset = dioffset >> L2INODESLOTSIZE; | ||
829 | copy_to_dinode(dp, ip); | ||
830 | if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { | ||
831 | lv->length = 2; | ||
832 | memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); | ||
833 | } else | ||
834 | lv->length = 1; | ||
835 | dilinelock->index++; | ||
836 | |||
837 | #ifdef _JFS_FASTDASD | ||
838 | /* | ||
839 | * We aren't logging changes to the DASD used in directory inodes, | ||
840 | * but we need to write them to disk. If we don't unmount cleanly, | ||
841 | * mount will recalculate the DASD used. | ||
842 | */ | ||
843 | if (S_ISDIR(ip->i_mode) | ||
844 | && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED)) | ||
845 | memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd)); | ||
846 | #endif /* _JFS_FASTDASD */ | ||
847 | |||
848 | /* release the buffer holding the updated on-disk inode. | ||
849 | * the buffer will be later written by commit processing. | ||
850 | */ | ||
851 | write_metapage(mp); | ||
852 | |||
853 | return (rc); | ||
854 | } | ||
855 | |||
856 | |||
857 | /* | ||
858 | * NAME: diFree(ip) | ||
859 | * | ||
860 | * FUNCTION: free a specified inode from the inode working map | ||
861 | * for a fileset or aggregate. | ||
862 | * | ||
863 | * if the inode to be freed represents the first (only) | ||
864 | * free inode within the iag, the iag will be placed on | ||
865 | * the ag free inode list. | ||
866 | * | ||
867 | * freeing the inode will cause the inode extent to be | ||
868 | * freed if the inode is the only allocated inode within | ||
869 | * the extent. in this case all the disk resource backing | ||
870 | * up the inode extent will be freed. in addition, the iag | ||
871 | * will be placed on the ag extent free list if the extent | ||
872 | * is the first free extent in the iag. if freeing the | ||
873 | * extent also means that no free inodes will exist for | ||
874 | * the iag, the iag will also be removed from the ag free | ||
875 | * inode list. | ||
876 | * | ||
877 | * the iag describing the inode will be freed if the extent | ||
878 | * is to be freed and it is the only backed extent within | ||
879 | * the iag. in this case, the iag will be removed from the | ||
880 | * ag free extent list and ag free inode list and placed on | ||
881 | * the inode map's free iag list. | ||
882 | * | ||
883 | * a careful update approach is used to provide consistency | ||
884 | * in the face of updates to multiple buffers. under this | ||
885 | * approach, all required buffers are obtained before making | ||
886 | * any updates and are held until all updates are complete. | ||
887 | * | ||
888 | * PARAMETERS: | ||
889 | * ip - inode to be freed. | ||
890 | * | ||
891 | * RETURN VALUES: | ||
892 | * 0 - success | ||
893 | * -EIO - i/o error. | ||
894 | */ | ||
895 | int diFree(struct inode *ip) | ||
896 | { | ||
897 | int rc; | ||
898 | ino_t inum = ip->i_ino; | ||
899 | struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp; | ||
900 | struct metapage *mp, *amp, *bmp, *cmp, *dmp; | ||
901 | int iagno, ino, extno, bitno, sword, agno; | ||
902 | int back, fwd; | ||
903 | u32 bitmap, mask; | ||
904 | struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap; | ||
905 | struct inomap *imap = JFS_IP(ipimap)->i_imap; | ||
906 | pxd_t freepxd; | ||
907 | tid_t tid; | ||
908 | struct inode *iplist[3]; | ||
909 | struct tlock *tlck; | ||
910 | struct pxd_lock *pxdlock; | ||
911 | |||
912 | /* | ||
913 | * This is just to suppress compiler warnings. The same logic that | ||
914 | * references these variables is used to initialize them. | ||
915 | */ | ||
916 | aiagp = biagp = ciagp = diagp = NULL; | ||
917 | |||
918 | /* get the iag number containing the inode. | ||
919 | */ | ||
920 | iagno = INOTOIAG(inum); | ||
921 | |||
922 | /* make sure that the iag is contained within | ||
923 | * the map. | ||
924 | */ | ||
925 | if (iagno >= imap->im_nextiag) { | ||
926 | dump_mem("imap", imap, 32); | ||
927 | jfs_error(ip->i_sb, | ||
928 | "diFree: inum = %d, iagno = %d, nextiag = %d", | ||
929 | (uint) inum, iagno, imap->im_nextiag); | ||
930 | return -EIO; | ||
931 | } | ||
932 | |||
933 | /* get the allocation group for this ino. | ||
934 | */ | ||
935 | agno = JFS_IP(ip)->agno; | ||
936 | |||
937 | /* Lock the AG specific inode map information | ||
938 | */ | ||
939 | AG_LOCK(imap, agno); | ||
940 | |||
941 | /* Obtain read lock in imap inode. Don't release it until we have | ||
942 | * read all of the IAG's that we are going to. | ||
943 | */ | ||
944 | IREAD_LOCK(ipimap); | ||
945 | |||
946 | /* read the iag. | ||
947 | */ | ||
948 | if ((rc = diIAGRead(imap, iagno, &mp))) { | ||
949 | IREAD_UNLOCK(ipimap); | ||
950 | AG_UNLOCK(imap, agno); | ||
951 | return (rc); | ||
952 | } | ||
953 | iagp = (struct iag *) mp->data; | ||
954 | |||
955 | /* get the inode number and extent number of the inode within | ||
956 | * the iag and the inode number within the extent. | ||
957 | */ | ||
958 | ino = inum & (INOSPERIAG - 1); | ||
959 | extno = ino >> L2INOSPEREXT; | ||
960 | bitno = ino & (INOSPEREXT - 1); | ||
961 | mask = HIGHORDER >> bitno; | ||
962 | |||
963 | if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { | ||
964 | jfs_error(ip->i_sb, | ||
965 | "diFree: wmap shows inode already free"); | ||
966 | } | ||
967 | |||
968 | if (!addressPXD(&iagp->inoext[extno])) { | ||
969 | release_metapage(mp); | ||
970 | IREAD_UNLOCK(ipimap); | ||
971 | AG_UNLOCK(imap, agno); | ||
972 | jfs_error(ip->i_sb, "diFree: invalid inoext"); | ||
973 | return -EIO; | ||
974 | } | ||
975 | |||
976 | /* compute the bitmap for the extent reflecting the freed inode. | ||
977 | */ | ||
978 | bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask; | ||
979 | |||
980 | if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) { | ||
981 | release_metapage(mp); | ||
982 | IREAD_UNLOCK(ipimap); | ||
983 | AG_UNLOCK(imap, agno); | ||
984 | jfs_error(ip->i_sb, "diFree: numfree > numinos"); | ||
985 | return -EIO; | ||
986 | } | ||
987 | /* | ||
988 | * inode extent still has some inodes or below low water mark: | ||
989 | * keep the inode extent; | ||
990 | */ | ||
991 | if (bitmap || | ||
992 | imap->im_agctl[agno].numfree < 96 || | ||
993 | (imap->im_agctl[agno].numfree < 288 && | ||
994 | (((imap->im_agctl[agno].numfree * 100) / | ||
995 | imap->im_agctl[agno].numinos) <= 25))) { | ||
996 | /* if the iag currently has no free inodes (i.e., | ||
997 | * the inode being freed is the first free inode of iag), | ||
998 | * insert the iag at head of the inode free list for the ag. | ||
999 | */ | ||
1000 | if (iagp->nfreeinos == 0) { | ||
1001 | /* check if there are any iags on the ag inode | ||
1002 | * free list. if so, read the first one so that | ||
1003 | * we can link the current iag onto the list at | ||
1004 | * the head. | ||
1005 | */ | ||
1006 | if ((fwd = imap->im_agctl[agno].inofree) >= 0) { | ||
1007 | /* read the iag that currently is the head | ||
1008 | * of the list. | ||
1009 | */ | ||
1010 | if ((rc = diIAGRead(imap, fwd, &))) { | ||
1011 | IREAD_UNLOCK(ipimap); | ||
1012 | AG_UNLOCK(imap, agno); | ||
1013 | release_metapage(mp); | ||
1014 | return (rc); | ||
1015 | } | ||
1016 | aiagp = (struct iag *) amp->data; | ||
1017 | |||
1018 | /* make current head point back to the iag. | ||
1019 | */ | ||
1020 | aiagp->inofreeback = cpu_to_le32(iagno); | ||
1021 | |||
1022 | write_metapage(amp); | ||
1023 | } | ||
1024 | |||
1025 | /* iag points forward to current head and iag | ||
1026 | * becomes the new head of the list. | ||
1027 | */ | ||
1028 | iagp->inofreefwd = | ||
1029 | cpu_to_le32(imap->im_agctl[agno].inofree); | ||
1030 | iagp->inofreeback = cpu_to_le32(-1); | ||
1031 | imap->im_agctl[agno].inofree = iagno; | ||
1032 | } | ||
1033 | IREAD_UNLOCK(ipimap); | ||
1034 | |||
1035 | /* update the free inode summary map for the extent if | ||
1036 | * freeing the inode means the extent will now have free | ||
1037 | * inodes (i.e., the inode being freed is the first free | ||
1038 | * inode of extent), | ||
1039 | */ | ||
1040 | if (iagp->wmap[extno] == cpu_to_le32(ONES)) { | ||
1041 | sword = extno >> L2EXTSPERSUM; | ||
1042 | bitno = extno & (EXTSPERSUM - 1); | ||
1043 | iagp->inosmap[sword] &= | ||
1044 | cpu_to_le32(~(HIGHORDER >> bitno)); | ||
1045 | } | ||
1046 | |||
1047 | /* update the bitmap. | ||
1048 | */ | ||
1049 | iagp->wmap[extno] = cpu_to_le32(bitmap); | ||
1050 | DBG_DIFREE(imap, inum); | ||
1051 | |||
1052 | /* update the free inode counts at the iag, ag and | ||
1053 | * map level. | ||
1054 | */ | ||
1055 | iagp->nfreeinos = | ||
1056 | cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1); | ||
1057 | imap->im_agctl[agno].numfree += 1; | ||
1058 | atomic_inc(&imap->im_numfree); | ||
1059 | |||
1060 | /* release the AG inode map lock | ||
1061 | */ | ||
1062 | AG_UNLOCK(imap, agno); | ||
1063 | |||
1064 | /* write the iag */ | ||
1065 | write_metapage(mp); | ||
1066 | |||
1067 | return (0); | ||
1068 | } | ||
1069 | |||
1070 | |||
1071 | /* | ||
1072 | * inode extent has become free and above low water mark: | ||
1073 | * free the inode extent; | ||
1074 | */ | ||
1075 | |||
1076 | /* | ||
1077 | * prepare to update iag list(s) (careful update step 1) | ||
1078 | */ | ||
1079 | amp = bmp = cmp = dmp = NULL; | ||
1080 | fwd = back = -1; | ||
1081 | |||
1082 | /* check if the iag currently has no free extents. if so, | ||
1083 | * it will be placed on the head of the ag extent free list. | ||
1084 | */ | ||
1085 | if (iagp->nfreeexts == 0) { | ||
1086 | /* check if the ag extent free list has any iags. | ||
1087 | * if so, read the iag at the head of the list now. | ||
1088 | * this (head) iag will be updated later to reflect | ||
1089 | * the addition of the current iag at the head of | ||
1090 | * the list. | ||
1091 | */ | ||
1092 | if ((fwd = imap->im_agctl[agno].extfree) >= 0) { | ||
1093 | if ((rc = diIAGRead(imap, fwd, &))) | ||
1094 | goto error_out; | ||
1095 | aiagp = (struct iag *) amp->data; | ||
1096 | } | ||
1097 | } else { | ||
1098 | /* iag has free extents. check if the addition of a free | ||
1099 | * extent will cause all extents to be free within this | ||
1100 | * iag. if so, the iag will be removed from the ag extent | ||
1101 | * free list and placed on the inode map's free iag list. | ||
1102 | */ | ||
1103 | if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { | ||
1104 | /* in preparation for removing the iag from the | ||
1105 | * ag extent free list, read the iags preceeding | ||
1106 | * and following the iag on the ag extent free | ||
1107 | * list. | ||
1108 | */ | ||
1109 | if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { | ||
1110 | if ((rc = diIAGRead(imap, fwd, &))) | ||
1111 | goto error_out; | ||
1112 | aiagp = (struct iag *) amp->data; | ||
1113 | } | ||
1114 | |||
1115 | if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { | ||
1116 | if ((rc = diIAGRead(imap, back, &bmp))) | ||
1117 | goto error_out; | ||
1118 | biagp = (struct iag *) bmp->data; | ||
1119 | } | ||
1120 | } | ||
1121 | } | ||
1122 | |||
1123 | /* remove the iag from the ag inode free list if freeing | ||
1124 | * this extent cause the iag to have no free inodes. | ||
1125 | */ | ||
1126 | if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { | ||
1127 | int inofreeback = le32_to_cpu(iagp->inofreeback); | ||
1128 | int inofreefwd = le32_to_cpu(iagp->inofreefwd); | ||
1129 | |||
1130 | /* in preparation for removing the iag from the | ||
1131 | * ag inode free list, read the iags preceeding | ||
1132 | * and following the iag on the ag inode free | ||
1133 | * list. before reading these iags, we must make | ||
1134 | * sure that we already don't have them in hand | ||
1135 | * from up above, since re-reading an iag (buffer) | ||
1136 | * we are currently holding would cause a deadlock. | ||
1137 | */ | ||
1138 | if (inofreefwd >= 0) { | ||
1139 | |||
1140 | if (inofreefwd == fwd) | ||
1141 | ciagp = (struct iag *) amp->data; | ||
1142 | else if (inofreefwd == back) | ||
1143 | ciagp = (struct iag *) bmp->data; | ||
1144 | else { | ||
1145 | if ((rc = | ||
1146 | diIAGRead(imap, inofreefwd, &cmp))) | ||
1147 | goto error_out; | ||
1148 | ciagp = (struct iag *) cmp->data; | ||
1149 | } | ||
1150 | assert(ciagp != NULL); | ||
1151 | } | ||
1152 | |||
1153 | if (inofreeback >= 0) { | ||
1154 | if (inofreeback == fwd) | ||
1155 | diagp = (struct iag *) amp->data; | ||
1156 | else if (inofreeback == back) | ||
1157 | diagp = (struct iag *) bmp->data; | ||
1158 | else { | ||
1159 | if ((rc = | ||
1160 | diIAGRead(imap, inofreeback, &dmp))) | ||
1161 | goto error_out; | ||
1162 | diagp = (struct iag *) dmp->data; | ||
1163 | } | ||
1164 | assert(diagp != NULL); | ||
1165 | } | ||
1166 | } | ||
1167 | |||
1168 | IREAD_UNLOCK(ipimap); | ||
1169 | |||
1170 | /* | ||
1171 | * invalidate any page of the inode extent freed from buffer cache; | ||
1172 | */ | ||
1173 | freepxd = iagp->inoext[extno]; | ||
1174 | invalidate_pxd_metapages(ip, freepxd); | ||
1175 | |||
1176 | /* | ||
1177 | * update iag list(s) (careful update step 2) | ||
1178 | */ | ||
1179 | /* add the iag to the ag extent free list if this is the | ||
1180 | * first free extent for the iag. | ||
1181 | */ | ||
1182 | if (iagp->nfreeexts == 0) { | ||
1183 | if (fwd >= 0) | ||
1184 | aiagp->extfreeback = cpu_to_le32(iagno); | ||
1185 | |||
1186 | iagp->extfreefwd = | ||
1187 | cpu_to_le32(imap->im_agctl[agno].extfree); | ||
1188 | iagp->extfreeback = cpu_to_le32(-1); | ||
1189 | imap->im_agctl[agno].extfree = iagno; | ||
1190 | } else { | ||
1191 | /* remove the iag from the ag extent list if all extents | ||
1192 | * are now free and place it on the inode map iag free list. | ||
1193 | */ | ||
1194 | if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { | ||
1195 | if (fwd >= 0) | ||
1196 | aiagp->extfreeback = iagp->extfreeback; | ||
1197 | |||
1198 | if (back >= 0) | ||
1199 | biagp->extfreefwd = iagp->extfreefwd; | ||
1200 | else | ||
1201 | imap->im_agctl[agno].extfree = | ||
1202 | le32_to_cpu(iagp->extfreefwd); | ||
1203 | |||
1204 | iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); | ||
1205 | |||
1206 | IAGFREE_LOCK(imap); | ||
1207 | iagp->iagfree = cpu_to_le32(imap->im_freeiag); | ||
1208 | imap->im_freeiag = iagno; | ||
1209 | IAGFREE_UNLOCK(imap); | ||
1210 | } | ||
1211 | } | ||
1212 | |||
1213 | /* remove the iag from the ag inode free list if freeing | ||
1214 | * this extent causes the iag to have no free inodes. | ||
1215 | */ | ||
1216 | if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { | ||
1217 | if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) | ||
1218 | ciagp->inofreeback = iagp->inofreeback; | ||
1219 | |||
1220 | if ((int) le32_to_cpu(iagp->inofreeback) >= 0) | ||
1221 | diagp->inofreefwd = iagp->inofreefwd; | ||
1222 | else | ||
1223 | imap->im_agctl[agno].inofree = | ||
1224 | le32_to_cpu(iagp->inofreefwd); | ||
1225 | |||
1226 | iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); | ||
1227 | } | ||
1228 | |||
1229 | /* update the inode extent address and working map | ||
1230 | * to reflect the free extent. | ||
1231 | * the permanent map should have been updated already | ||
1232 | * for the inode being freed. | ||
1233 | */ | ||
1234 | if (iagp->pmap[extno] != 0) { | ||
1235 | jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); | ||
1236 | } | ||
1237 | iagp->wmap[extno] = 0; | ||
1238 | DBG_DIFREE(imap, inum); | ||
1239 | PXDlength(&iagp->inoext[extno], 0); | ||
1240 | PXDaddress(&iagp->inoext[extno], 0); | ||
1241 | |||
1242 | /* update the free extent and free inode summary maps | ||
1243 | * to reflect the freed extent. | ||
1244 | * the inode summary map is marked to indicate no inodes | ||
1245 | * available for the freed extent. | ||
1246 | */ | ||
1247 | sword = extno >> L2EXTSPERSUM; | ||
1248 | bitno = extno & (EXTSPERSUM - 1); | ||
1249 | mask = HIGHORDER >> bitno; | ||
1250 | iagp->inosmap[sword] |= cpu_to_le32(mask); | ||
1251 | iagp->extsmap[sword] &= cpu_to_le32(~mask); | ||
1252 | |||
1253 | /* update the number of free inodes and number of free extents | ||
1254 | * for the iag. | ||
1255 | */ | ||
1256 | iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - | ||
1257 | (INOSPEREXT - 1)); | ||
1258 | iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1); | ||
1259 | |||
1260 | /* update the number of free inodes and backed inodes | ||
1261 | * at the ag and inode map level. | ||
1262 | */ | ||
1263 | imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); | ||
1264 | imap->im_agctl[agno].numinos -= INOSPEREXT; | ||
1265 | atomic_sub(INOSPEREXT - 1, &imap->im_numfree); | ||
1266 | atomic_sub(INOSPEREXT, &imap->im_numinos); | ||
1267 | |||
1268 | if (amp) | ||
1269 | write_metapage(amp); | ||
1270 | if (bmp) | ||
1271 | write_metapage(bmp); | ||
1272 | if (cmp) | ||
1273 | write_metapage(cmp); | ||
1274 | if (dmp) | ||
1275 | write_metapage(dmp); | ||
1276 | |||
1277 | /* | ||
1278 | * start transaction to update block allocation map | ||
1279 | * for the inode extent freed; | ||
1280 | * | ||
1281 | * N.B. AG_LOCK is released and iag will be released below, and | ||
1282 | * other thread may allocate inode from/reusing the ixad freed | ||
1283 | * BUT with new/different backing inode extent from the extent | ||
1284 | * to be freed by the transaction; | ||
1285 | */ | ||
1286 | tid = txBegin(ipimap->i_sb, COMMIT_FORCE); | ||
1287 | down(&JFS_IP(ipimap)->commit_sem); | ||
1288 | |||
1289 | /* acquire tlock of the iag page of the freed ixad | ||
1290 | * to force the page NOHOMEOK (even though no data is | ||
1291 | * logged from the iag page) until NOREDOPAGE|FREEXTENT log | ||
1292 | * for the free of the extent is committed; | ||
1293 | * write FREEXTENT|NOREDOPAGE log record | ||
1294 | * N.B. linelock is overlaid as freed extent descriptor; | ||
1295 | */ | ||
1296 | tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); | ||
1297 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
1298 | pxdlock->flag = mlckFREEPXD; | ||
1299 | pxdlock->pxd = freepxd; | ||
1300 | pxdlock->index = 1; | ||
1301 | |||
1302 | write_metapage(mp); | ||
1303 | |||
1304 | iplist[0] = ipimap; | ||
1305 | |||
1306 | /* | ||
1307 | * logredo needs the IAG number and IAG extent index in order | ||
1308 | * to ensure that the IMap is consistent. The least disruptive | ||
1309 | * way to pass these values through to the transaction manager | ||
1310 | * is in the iplist array. | ||
1311 | * | ||
1312 | * It's not pretty, but it works. | ||
1313 | */ | ||
1314 | iplist[1] = (struct inode *) (size_t)iagno; | ||
1315 | iplist[2] = (struct inode *) (size_t)extno; | ||
1316 | |||
1317 | rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); | ||
1318 | |||
1319 | txEnd(tid); | ||
1320 | up(&JFS_IP(ipimap)->commit_sem); | ||
1321 | |||
1322 | /* unlock the AG inode map information */ | ||
1323 | AG_UNLOCK(imap, agno); | ||
1324 | |||
1325 | return (0); | ||
1326 | |||
1327 | error_out: | ||
1328 | IREAD_UNLOCK(ipimap); | ||
1329 | |||
1330 | if (amp) | ||
1331 | release_metapage(amp); | ||
1332 | if (bmp) | ||
1333 | release_metapage(bmp); | ||
1334 | if (cmp) | ||
1335 | release_metapage(cmp); | ||
1336 | if (dmp) | ||
1337 | release_metapage(dmp); | ||
1338 | |||
1339 | AG_UNLOCK(imap, agno); | ||
1340 | |||
1341 | release_metapage(mp); | ||
1342 | |||
1343 | return (rc); | ||
1344 | } | ||
1345 | |||
1346 | /* | ||
1347 | * There are several places in the diAlloc* routines where we initialize | ||
1348 | * the inode. | ||
1349 | */ | ||
1350 | static inline void | ||
1351 | diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) | ||
1352 | { | ||
1353 | struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); | ||
1354 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
1355 | |||
1356 | ip->i_ino = (iagno << L2INOSPERIAG) + ino; | ||
1357 | DBG_DIALLOC(JFS_IP(ipimap)->i_imap, ip->i_ino); | ||
1358 | jfs_ip->ixpxd = iagp->inoext[extno]; | ||
1359 | jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); | ||
1360 | jfs_ip->active_ag = -1; | ||
1361 | } | ||
1362 | |||
1363 | |||
1364 | /* | ||
1365 | * NAME: diAlloc(pip,dir,ip) | ||
1366 | * | ||
1367 | * FUNCTION: allocate a disk inode from the inode working map | ||
1368 | * for a fileset or aggregate. | ||
1369 | * | ||
1370 | * PARAMETERS: | ||
1371 | * pip - pointer to incore inode for the parent inode. | ||
1372 | * dir - TRUE if the new disk inode is for a directory. | ||
1373 | * ip - pointer to a new inode | ||
1374 | * | ||
1375 | * RETURN VALUES: | ||
1376 | * 0 - success. | ||
1377 | * -ENOSPC - insufficient disk resources. | ||
1378 | * -EIO - i/o error. | ||
1379 | */ | ||
1380 | int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip) | ||
1381 | { | ||
1382 | int rc, ino, iagno, addext, extno, bitno, sword; | ||
1383 | int nwords, rem, i, agno; | ||
1384 | u32 mask, inosmap, extsmap; | ||
1385 | struct inode *ipimap; | ||
1386 | struct metapage *mp; | ||
1387 | ino_t inum; | ||
1388 | struct iag *iagp; | ||
1389 | struct inomap *imap; | ||
1390 | |||
1391 | /* get the pointers to the inode map inode and the | ||
1392 | * corresponding imap control structure. | ||
1393 | */ | ||
1394 | ipimap = JFS_SBI(pip->i_sb)->ipimap; | ||
1395 | imap = JFS_IP(ipimap)->i_imap; | ||
1396 | JFS_IP(ip)->ipimap = ipimap; | ||
1397 | JFS_IP(ip)->fileset = FILESYSTEM_I; | ||
1398 | |||
1399 | /* for a directory, the allocation policy is to start | ||
1400 | * at the ag level using the preferred ag. | ||
1401 | */ | ||
1402 | if (dir == TRUE) { | ||
1403 | agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); | ||
1404 | AG_LOCK(imap, agno); | ||
1405 | goto tryag; | ||
1406 | } | ||
1407 | |||
1408 | /* for files, the policy starts off by trying to allocate from | ||
1409 | * the same iag containing the parent disk inode: | ||
1410 | * try to allocate the new disk inode close to the parent disk | ||
1411 | * inode, using parent disk inode number + 1 as the allocation | ||
1412 | * hint. (we use a left-to-right policy to attempt to avoid | ||
1413 | * moving backward on the disk.) compute the hint within the | ||
1414 | * file system and the iag. | ||
1415 | */ | ||
1416 | |||
1417 | /* get the ag number of this iag */ | ||
1418 | agno = JFS_IP(pip)->agno; | ||
1419 | |||
1420 | if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { | ||
1421 | /* | ||
1422 | * There is an open file actively growing. We want to | ||
1423 | * allocate new inodes from a different ag to avoid | ||
1424 | * fragmentation problems. | ||
1425 | */ | ||
1426 | agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); | ||
1427 | AG_LOCK(imap, agno); | ||
1428 | goto tryag; | ||
1429 | } | ||
1430 | |||
1431 | inum = pip->i_ino + 1; | ||
1432 | ino = inum & (INOSPERIAG - 1); | ||
1433 | |||
1434 | /* back off the the hint if it is outside of the iag */ | ||
1435 | if (ino == 0) | ||
1436 | inum = pip->i_ino; | ||
1437 | |||
1438 | /* lock the AG inode map information */ | ||
1439 | AG_LOCK(imap, agno); | ||
1440 | |||
1441 | /* Get read lock on imap inode */ | ||
1442 | IREAD_LOCK(ipimap); | ||
1443 | |||
1444 | /* get the iag number and read the iag */ | ||
1445 | iagno = INOTOIAG(inum); | ||
1446 | if ((rc = diIAGRead(imap, iagno, &mp))) { | ||
1447 | IREAD_UNLOCK(ipimap); | ||
1448 | AG_UNLOCK(imap, agno); | ||
1449 | return (rc); | ||
1450 | } | ||
1451 | iagp = (struct iag *) mp->data; | ||
1452 | |||
1453 | /* determine if new inode extent is allowed to be added to the iag. | ||
1454 | * new inode extent can be added to the iag if the ag | ||
1455 | * has less than 32 free disk inodes and the iag has free extents. | ||
1456 | */ | ||
1457 | addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); | ||
1458 | |||
1459 | /* | ||
1460 | * try to allocate from the IAG | ||
1461 | */ | ||
1462 | /* check if the inode may be allocated from the iag | ||
1463 | * (i.e. the inode has free inodes or new extent can be added). | ||
1464 | */ | ||
1465 | if (iagp->nfreeinos || addext) { | ||
1466 | /* determine the extent number of the hint. | ||
1467 | */ | ||
1468 | extno = ino >> L2INOSPEREXT; | ||
1469 | |||
1470 | /* check if the extent containing the hint has backed | ||
1471 | * inodes. if so, try to allocate within this extent. | ||
1472 | */ | ||
1473 | if (addressPXD(&iagp->inoext[extno])) { | ||
1474 | bitno = ino & (INOSPEREXT - 1); | ||
1475 | if ((bitno = | ||
1476 | diFindFree(le32_to_cpu(iagp->wmap[extno]), | ||
1477 | bitno)) | ||
1478 | < INOSPEREXT) { | ||
1479 | ino = (extno << L2INOSPEREXT) + bitno; | ||
1480 | |||
1481 | /* a free inode (bit) was found within this | ||
1482 | * extent, so allocate it. | ||
1483 | */ | ||
1484 | rc = diAllocBit(imap, iagp, ino); | ||
1485 | IREAD_UNLOCK(ipimap); | ||
1486 | if (rc) { | ||
1487 | assert(rc == -EIO); | ||
1488 | } else { | ||
1489 | /* set the results of the allocation | ||
1490 | * and write the iag. | ||
1491 | */ | ||
1492 | diInitInode(ip, iagno, ino, extno, | ||
1493 | iagp); | ||
1494 | mark_metapage_dirty(mp); | ||
1495 | } | ||
1496 | release_metapage(mp); | ||
1497 | |||
1498 | /* free the AG lock and return. | ||
1499 | */ | ||
1500 | AG_UNLOCK(imap, agno); | ||
1501 | return (rc); | ||
1502 | } | ||
1503 | |||
1504 | if (!addext) | ||
1505 | extno = | ||
1506 | (extno == | ||
1507 | EXTSPERIAG - 1) ? 0 : extno + 1; | ||
1508 | } | ||
1509 | |||
1510 | /* | ||
1511 | * no free inodes within the extent containing the hint. | ||
1512 | * | ||
1513 | * try to allocate from the backed extents following | ||
1514 | * hint or, if appropriate (i.e. addext is true), allocate | ||
1515 | * an extent of free inodes at or following the extent | ||
1516 | * containing the hint. | ||
1517 | * | ||
1518 | * the free inode and free extent summary maps are used | ||
1519 | * here, so determine the starting summary map position | ||
1520 | * and the number of words we'll have to examine. again, | ||
1521 | * the approach is to allocate following the hint, so we | ||
1522 | * might have to initially ignore prior bits of the summary | ||
1523 | * map that represent extents prior to the extent containing | ||
1524 | * the hint and later revisit these bits. | ||
1525 | */ | ||
1526 | bitno = extno & (EXTSPERSUM - 1); | ||
1527 | nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; | ||
1528 | sword = extno >> L2EXTSPERSUM; | ||
1529 | |||
1530 | /* mask any prior bits for the starting words of the | ||
1531 | * summary map. | ||
1532 | */ | ||
1533 | mask = ONES << (EXTSPERSUM - bitno); | ||
1534 | inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; | ||
1535 | extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; | ||
1536 | |||
1537 | /* scan the free inode and free extent summary maps for | ||
1538 | * free resources. | ||
1539 | */ | ||
1540 | for (i = 0; i < nwords; i++) { | ||
1541 | /* check if this word of the free inode summary | ||
1542 | * map describes an extent with free inodes. | ||
1543 | */ | ||
1544 | if (~inosmap) { | ||
1545 | /* an extent with free inodes has been | ||
1546 | * found. determine the extent number | ||
1547 | * and the inode number within the extent. | ||
1548 | */ | ||
1549 | rem = diFindFree(inosmap, 0); | ||
1550 | extno = (sword << L2EXTSPERSUM) + rem; | ||
1551 | rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), | ||
1552 | 0); | ||
1553 | if (rem >= INOSPEREXT) { | ||
1554 | IREAD_UNLOCK(ipimap); | ||
1555 | release_metapage(mp); | ||
1556 | AG_UNLOCK(imap, agno); | ||
1557 | jfs_error(ip->i_sb, | ||
1558 | "diAlloc: can't find free bit " | ||
1559 | "in wmap"); | ||
1560 | return EIO; | ||
1561 | } | ||
1562 | |||
1563 | /* determine the inode number within the | ||
1564 | * iag and allocate the inode from the | ||
1565 | * map. | ||
1566 | */ | ||
1567 | ino = (extno << L2INOSPEREXT) + rem; | ||
1568 | rc = diAllocBit(imap, iagp, ino); | ||
1569 | IREAD_UNLOCK(ipimap); | ||
1570 | if (rc) | ||
1571 | assert(rc == -EIO); | ||
1572 | else { | ||
1573 | /* set the results of the allocation | ||
1574 | * and write the iag. | ||
1575 | */ | ||
1576 | diInitInode(ip, iagno, ino, extno, | ||
1577 | iagp); | ||
1578 | mark_metapage_dirty(mp); | ||
1579 | } | ||
1580 | release_metapage(mp); | ||
1581 | |||
1582 | /* free the AG lock and return. | ||
1583 | */ | ||
1584 | AG_UNLOCK(imap, agno); | ||
1585 | return (rc); | ||
1586 | |||
1587 | } | ||
1588 | |||
1589 | /* check if we may allocate an extent of free | ||
1590 | * inodes and whether this word of the free | ||
1591 | * extents summary map describes a free extent. | ||
1592 | */ | ||
1593 | if (addext && ~extsmap) { | ||
1594 | /* a free extent has been found. determine | ||
1595 | * the extent number. | ||
1596 | */ | ||
1597 | rem = diFindFree(extsmap, 0); | ||
1598 | extno = (sword << L2EXTSPERSUM) + rem; | ||
1599 | |||
1600 | /* allocate an extent of free inodes. | ||
1601 | */ | ||
1602 | if ((rc = diNewExt(imap, iagp, extno))) { | ||
1603 | /* if there is no disk space for a | ||
1604 | * new extent, try to allocate the | ||
1605 | * disk inode from somewhere else. | ||
1606 | */ | ||
1607 | if (rc == -ENOSPC) | ||
1608 | break; | ||
1609 | |||
1610 | assert(rc == -EIO); | ||
1611 | } else { | ||
1612 | /* set the results of the allocation | ||
1613 | * and write the iag. | ||
1614 | */ | ||
1615 | diInitInode(ip, iagno, | ||
1616 | extno << L2INOSPEREXT, | ||
1617 | extno, iagp); | ||
1618 | mark_metapage_dirty(mp); | ||
1619 | } | ||
1620 | release_metapage(mp); | ||
1621 | /* free the imap inode & the AG lock & return. | ||
1622 | */ | ||
1623 | IREAD_UNLOCK(ipimap); | ||
1624 | AG_UNLOCK(imap, agno); | ||
1625 | return (rc); | ||
1626 | } | ||
1627 | |||
1628 | /* move on to the next set of summary map words. | ||
1629 | */ | ||
1630 | sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; | ||
1631 | inosmap = le32_to_cpu(iagp->inosmap[sword]); | ||
1632 | extsmap = le32_to_cpu(iagp->extsmap[sword]); | ||
1633 | } | ||
1634 | } | ||
1635 | /* unlock imap inode */ | ||
1636 | IREAD_UNLOCK(ipimap); | ||
1637 | |||
1638 | /* nothing doing in this iag, so release it. */ | ||
1639 | release_metapage(mp); | ||
1640 | |||
1641 | tryag: | ||
1642 | /* | ||
1643 | * try to allocate anywhere within the same AG as the parent inode. | ||
1644 | */ | ||
1645 | rc = diAllocAG(imap, agno, dir, ip); | ||
1646 | |||
1647 | AG_UNLOCK(imap, agno); | ||
1648 | |||
1649 | if (rc != -ENOSPC) | ||
1650 | return (rc); | ||
1651 | |||
1652 | /* | ||
1653 | * try to allocate in any AG. | ||
1654 | */ | ||
1655 | return (diAllocAny(imap, agno, dir, ip)); | ||
1656 | } | ||
1657 | |||
1658 | |||
1659 | /* | ||
1660 | * NAME: diAllocAG(imap,agno,dir,ip) | ||
1661 | * | ||
1662 | * FUNCTION: allocate a disk inode from the allocation group. | ||
1663 | * | ||
1664 | * this routine first determines if a new extent of free | ||
1665 | * inodes should be added for the allocation group, with | ||
1666 | * the current request satisfied from this extent. if this | ||
1667 | * is the case, an attempt will be made to do just that. if | ||
1668 | * this attempt fails or it has been determined that a new | ||
1669 | * extent should not be added, an attempt is made to satisfy | ||
1670 | * the request by allocating an existing (backed) free inode | ||
1671 | * from the allocation group. | ||
1672 | * | ||
1673 | * PRE CONDITION: Already have the AG lock for this AG. | ||
1674 | * | ||
1675 | * PARAMETERS: | ||
1676 | * imap - pointer to inode map control structure. | ||
1677 | * agno - allocation group to allocate from. | ||
1678 | * dir - TRUE if the new disk inode is for a directory. | ||
1679 | * ip - pointer to the new inode to be filled in on successful return | ||
1680 | * with the disk inode number allocated, its extent address | ||
1681 | * and the start of the ag. | ||
1682 | * | ||
1683 | * RETURN VALUES: | ||
1684 | * 0 - success. | ||
1685 | * -ENOSPC - insufficient disk resources. | ||
1686 | * -EIO - i/o error. | ||
1687 | */ | ||
1688 | static int | ||
1689 | diAllocAG(struct inomap * imap, int agno, boolean_t dir, struct inode *ip) | ||
1690 | { | ||
1691 | int rc, addext, numfree, numinos; | ||
1692 | |||
1693 | /* get the number of free and the number of backed disk | ||
1694 | * inodes currently within the ag. | ||
1695 | */ | ||
1696 | numfree = imap->im_agctl[agno].numfree; | ||
1697 | numinos = imap->im_agctl[agno].numinos; | ||
1698 | |||
1699 | if (numfree > numinos) { | ||
1700 | jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); | ||
1701 | return -EIO; | ||
1702 | } | ||
1703 | |||
1704 | /* determine if we should allocate a new extent of free inodes | ||
1705 | * within the ag: for directory inodes, add a new extent | ||
1706 | * if there are a small number of free inodes or number of free | ||
1707 | * inodes is a small percentage of the number of backed inodes. | ||
1708 | */ | ||
1709 | if (dir == TRUE) | ||
1710 | addext = (numfree < 64 || | ||
1711 | (numfree < 256 | ||
1712 | && ((numfree * 100) / numinos) <= 20)); | ||
1713 | else | ||
1714 | addext = (numfree == 0); | ||
1715 | |||
1716 | /* | ||
1717 | * try to allocate a new extent of free inodes. | ||
1718 | */ | ||
1719 | if (addext) { | ||
1720 | /* if free space is not avaliable for this new extent, try | ||
1721 | * below to allocate a free and existing (already backed) | ||
1722 | * inode from the ag. | ||
1723 | */ | ||
1724 | if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC) | ||
1725 | return (rc); | ||
1726 | } | ||
1727 | |||
1728 | /* | ||
1729 | * try to allocate an existing free inode from the ag. | ||
1730 | */ | ||
1731 | return (diAllocIno(imap, agno, ip)); | ||
1732 | } | ||
1733 | |||
1734 | |||
1735 | /* | ||
1736 | * NAME: diAllocAny(imap,agno,dir,iap) | ||
1737 | * | ||
1738 | * FUNCTION: allocate a disk inode from any other allocation group. | ||
1739 | * | ||
1740 | * this routine is called when an allocation attempt within | ||
1741 | * the primary allocation group has failed. if attempts to | ||
1742 | * allocate an inode from any allocation group other than the | ||
1743 | * specified primary group. | ||
1744 | * | ||
1745 | * PARAMETERS: | ||
1746 | * imap - pointer to inode map control structure. | ||
1747 | * agno - primary allocation group (to avoid). | ||
1748 | * dir - TRUE if the new disk inode is for a directory. | ||
1749 | * ip - pointer to a new inode to be filled in on successful return | ||
1750 | * with the disk inode number allocated, its extent address | ||
1751 | * and the start of the ag. | ||
1752 | * | ||
1753 | * RETURN VALUES: | ||
1754 | * 0 - success. | ||
1755 | * -ENOSPC - insufficient disk resources. | ||
1756 | * -EIO - i/o error. | ||
1757 | */ | ||
1758 | static int | ||
1759 | diAllocAny(struct inomap * imap, int agno, boolean_t dir, struct inode *ip) | ||
1760 | { | ||
1761 | int ag, rc; | ||
1762 | int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; | ||
1763 | |||
1764 | |||
1765 | /* try to allocate from the ags following agno up to | ||
1766 | * the maximum ag number. | ||
1767 | */ | ||
1768 | for (ag = agno + 1; ag <= maxag; ag++) { | ||
1769 | AG_LOCK(imap, ag); | ||
1770 | |||
1771 | rc = diAllocAG(imap, ag, dir, ip); | ||
1772 | |||
1773 | AG_UNLOCK(imap, ag); | ||
1774 | |||
1775 | if (rc != -ENOSPC) | ||
1776 | return (rc); | ||
1777 | } | ||
1778 | |||
1779 | /* try to allocate from the ags in front of agno. | ||
1780 | */ | ||
1781 | for (ag = 0; ag < agno; ag++) { | ||
1782 | AG_LOCK(imap, ag); | ||
1783 | |||
1784 | rc = diAllocAG(imap, ag, dir, ip); | ||
1785 | |||
1786 | AG_UNLOCK(imap, ag); | ||
1787 | |||
1788 | if (rc != -ENOSPC) | ||
1789 | return (rc); | ||
1790 | } | ||
1791 | |||
1792 | /* no free disk inodes. | ||
1793 | */ | ||
1794 | return -ENOSPC; | ||
1795 | } | ||
1796 | |||
1797 | |||
1798 | /* | ||
1799 | * NAME: diAllocIno(imap,agno,ip) | ||
1800 | * | ||
1801 | * FUNCTION: allocate a disk inode from the allocation group's free | ||
1802 | * inode list, returning an error if this free list is | ||
1803 | * empty (i.e. no iags on the list). | ||
1804 | * | ||
1805 | * allocation occurs from the first iag on the list using | ||
1806 | * the iag's free inode summary map to find the leftmost | ||
1807 | * free inode in the iag. | ||
1808 | * | ||
1809 | * PRE CONDITION: Already have AG lock for this AG. | ||
1810 | * | ||
1811 | * PARAMETERS: | ||
1812 | * imap - pointer to inode map control structure. | ||
1813 | * agno - allocation group. | ||
1814 | * ip - pointer to new inode to be filled in on successful return | ||
1815 | * with the disk inode number allocated, its extent address | ||
1816 | * and the start of the ag. | ||
1817 | * | ||
1818 | * RETURN VALUES: | ||
1819 | * 0 - success. | ||
1820 | * -ENOSPC - insufficient disk resources. | ||
1821 | * -EIO - i/o error. | ||
1822 | */ | ||
1823 | static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) | ||
1824 | { | ||
1825 | int iagno, ino, rc, rem, extno, sword; | ||
1826 | struct metapage *mp; | ||
1827 | struct iag *iagp; | ||
1828 | |||
1829 | /* check if there are iags on the ag's free inode list. | ||
1830 | */ | ||
1831 | if ((iagno = imap->im_agctl[agno].inofree) < 0) | ||
1832 | return -ENOSPC; | ||
1833 | |||
1834 | /* obtain read lock on imap inode */ | ||
1835 | IREAD_LOCK(imap->im_ipimap); | ||
1836 | |||
1837 | /* read the iag at the head of the list. | ||
1838 | */ | ||
1839 | if ((rc = diIAGRead(imap, iagno, &mp))) { | ||
1840 | IREAD_UNLOCK(imap->im_ipimap); | ||
1841 | return (rc); | ||
1842 | } | ||
1843 | iagp = (struct iag *) mp->data; | ||
1844 | |||
1845 | /* better be free inodes in this iag if it is on the | ||
1846 | * list. | ||
1847 | */ | ||
1848 | if (!iagp->nfreeinos) { | ||
1849 | IREAD_UNLOCK(imap->im_ipimap); | ||
1850 | release_metapage(mp); | ||
1851 | jfs_error(ip->i_sb, | ||
1852 | "diAllocIno: nfreeinos = 0, but iag on freelist"); | ||
1853 | return -EIO; | ||
1854 | } | ||
1855 | |||
1856 | /* scan the free inode summary map to find an extent | ||
1857 | * with free inodes. | ||
1858 | */ | ||
1859 | for (sword = 0;; sword++) { | ||
1860 | if (sword >= SMAPSZ) { | ||
1861 | IREAD_UNLOCK(imap->im_ipimap); | ||
1862 | release_metapage(mp); | ||
1863 | jfs_error(ip->i_sb, | ||
1864 | "diAllocIno: free inode not found in summary map"); | ||
1865 | return -EIO; | ||
1866 | } | ||
1867 | |||
1868 | if (~iagp->inosmap[sword]) | ||
1869 | break; | ||
1870 | } | ||
1871 | |||
1872 | /* found a extent with free inodes. determine | ||
1873 | * the extent number. | ||
1874 | */ | ||
1875 | rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); | ||
1876 | if (rem >= EXTSPERSUM) { | ||
1877 | IREAD_UNLOCK(imap->im_ipimap); | ||
1878 | release_metapage(mp); | ||
1879 | jfs_error(ip->i_sb, "diAllocIno: no free extent found"); | ||
1880 | return -EIO; | ||
1881 | } | ||
1882 | extno = (sword << L2EXTSPERSUM) + rem; | ||
1883 | |||
1884 | /* find the first free inode in the extent. | ||
1885 | */ | ||
1886 | rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); | ||
1887 | if (rem >= INOSPEREXT) { | ||
1888 | IREAD_UNLOCK(imap->im_ipimap); | ||
1889 | release_metapage(mp); | ||
1890 | jfs_error(ip->i_sb, "diAllocIno: free inode not found"); | ||
1891 | return -EIO; | ||
1892 | } | ||
1893 | |||
1894 | /* compute the inode number within the iag. | ||
1895 | */ | ||
1896 | ino = (extno << L2INOSPEREXT) + rem; | ||
1897 | |||
1898 | /* allocate the inode. | ||
1899 | */ | ||
1900 | rc = diAllocBit(imap, iagp, ino); | ||
1901 | IREAD_UNLOCK(imap->im_ipimap); | ||
1902 | if (rc) { | ||
1903 | release_metapage(mp); | ||
1904 | return (rc); | ||
1905 | } | ||
1906 | |||
1907 | /* set the results of the allocation and write the iag. | ||
1908 | */ | ||
1909 | diInitInode(ip, iagno, ino, extno, iagp); | ||
1910 | write_metapage(mp); | ||
1911 | |||
1912 | return (0); | ||
1913 | } | ||
1914 | |||
1915 | |||
1916 | /* | ||
1917 | * NAME: diAllocExt(imap,agno,ip) | ||
1918 | * | ||
1919 | * FUNCTION: add a new extent of free inodes to an iag, allocating | ||
1920 | * an inode from this extent to satisfy the current allocation | ||
1921 | * request. | ||
1922 | * | ||
1923 | * this routine first tries to find an existing iag with free | ||
1924 | * extents through the ag free extent list. if list is not | ||
1925 | * empty, the head of the list will be selected as the home | ||
1926 | * of the new extent of free inodes. otherwise (the list is | ||
1927 | * empty), a new iag will be allocated for the ag to contain | ||
1928 | * the extent. | ||
1929 | * | ||
1930 | * once an iag has been selected, the free extent summary map | ||
1931 | * is used to locate a free extent within the iag and diNewExt() | ||
1932 | * is called to initialize the extent, with initialization | ||
1933 | * including the allocation of the first inode of the extent | ||
1934 | * for the purpose of satisfying this request. | ||
1935 | * | ||
1936 | * PARAMETERS: | ||
1937 | * imap - pointer to inode map control structure. | ||
1938 | * agno - allocation group number. | ||
1939 | * ip - pointer to new inode to be filled in on successful return | ||
1940 | * with the disk inode number allocated, its extent address | ||
1941 | * and the start of the ag. | ||
1942 | * | ||
1943 | * RETURN VALUES: | ||
1944 | * 0 - success. | ||
1945 | * -ENOSPC - insufficient disk resources. | ||
1946 | * -EIO - i/o error. | ||
1947 | */ | ||
1948 | static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) | ||
1949 | { | ||
1950 | int rem, iagno, sword, extno, rc; | ||
1951 | struct metapage *mp; | ||
1952 | struct iag *iagp; | ||
1953 | |||
1954 | /* check if the ag has any iags with free extents. if not, | ||
1955 | * allocate a new iag for the ag. | ||
1956 | */ | ||
1957 | if ((iagno = imap->im_agctl[agno].extfree) < 0) { | ||
1958 | /* If successful, diNewIAG will obtain the read lock on the | ||
1959 | * imap inode. | ||
1960 | */ | ||
1961 | if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { | ||
1962 | return (rc); | ||
1963 | } | ||
1964 | iagp = (struct iag *) mp->data; | ||
1965 | |||
1966 | /* set the ag number if this a brand new iag | ||
1967 | */ | ||
1968 | iagp->agstart = | ||
1969 | cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); | ||
1970 | } else { | ||
1971 | /* read the iag. | ||
1972 | */ | ||
1973 | IREAD_LOCK(imap->im_ipimap); | ||
1974 | if ((rc = diIAGRead(imap, iagno, &mp))) { | ||
1975 | IREAD_UNLOCK(imap->im_ipimap); | ||
1976 | jfs_error(ip->i_sb, "diAllocExt: error reading iag"); | ||
1977 | return rc; | ||
1978 | } | ||
1979 | iagp = (struct iag *) mp->data; | ||
1980 | } | ||
1981 | |||
1982 | /* using the free extent summary map, find a free extent. | ||
1983 | */ | ||
1984 | for (sword = 0;; sword++) { | ||
1985 | if (sword >= SMAPSZ) { | ||
1986 | release_metapage(mp); | ||
1987 | IREAD_UNLOCK(imap->im_ipimap); | ||
1988 | jfs_error(ip->i_sb, | ||
1989 | "diAllocExt: free ext summary map not found"); | ||
1990 | return -EIO; | ||
1991 | } | ||
1992 | if (~iagp->extsmap[sword]) | ||
1993 | break; | ||
1994 | } | ||
1995 | |||
1996 | /* determine the extent number of the free extent. | ||
1997 | */ | ||
1998 | rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); | ||
1999 | if (rem >= EXTSPERSUM) { | ||
2000 | release_metapage(mp); | ||
2001 | IREAD_UNLOCK(imap->im_ipimap); | ||
2002 | jfs_error(ip->i_sb, "diAllocExt: free extent not found"); | ||
2003 | return -EIO; | ||
2004 | } | ||
2005 | extno = (sword << L2EXTSPERSUM) + rem; | ||
2006 | |||
2007 | /* initialize the new extent. | ||
2008 | */ | ||
2009 | rc = diNewExt(imap, iagp, extno); | ||
2010 | IREAD_UNLOCK(imap->im_ipimap); | ||
2011 | if (rc) { | ||
2012 | /* something bad happened. if a new iag was allocated, | ||
2013 | * place it back on the inode map's iag free list, and | ||
2014 | * clear the ag number information. | ||
2015 | */ | ||
2016 | if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { | ||
2017 | IAGFREE_LOCK(imap); | ||
2018 | iagp->iagfree = cpu_to_le32(imap->im_freeiag); | ||
2019 | imap->im_freeiag = iagno; | ||
2020 | IAGFREE_UNLOCK(imap); | ||
2021 | } | ||
2022 | write_metapage(mp); | ||
2023 | return (rc); | ||
2024 | } | ||
2025 | |||
2026 | /* set the results of the allocation and write the iag. | ||
2027 | */ | ||
2028 | diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); | ||
2029 | |||
2030 | write_metapage(mp); | ||
2031 | |||
2032 | return (0); | ||
2033 | } | ||
2034 | |||
2035 | |||
2036 | /* | ||
2037 | * NAME: diAllocBit(imap,iagp,ino) | ||
2038 | * | ||
2039 | * FUNCTION: allocate a backed inode from an iag. | ||
2040 | * | ||
2041 | * this routine performs the mechanics of allocating a | ||
2042 | * specified inode from a backed extent. | ||
2043 | * | ||
2044 | * if the inode to be allocated represents the last free | ||
2045 | * inode within the iag, the iag will be removed from the | ||
2046 | * ag free inode list. | ||
2047 | * | ||
2048 | * a careful update approach is used to provide consistency | ||
2049 | * in the face of updates to multiple buffers. under this | ||
2050 | * approach, all required buffers are obtained before making | ||
2051 | * any updates and are held all are updates are complete. | ||
2052 | * | ||
2053 | * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on | ||
2054 | * this AG. Must have read lock on imap inode. | ||
2055 | * | ||
2056 | * PARAMETERS: | ||
2057 | * imap - pointer to inode map control structure. | ||
2058 | * iagp - pointer to iag. | ||
2059 | * ino - inode number to be allocated within the iag. | ||
2060 | * | ||
2061 | * RETURN VALUES: | ||
2062 | * 0 - success. | ||
2063 | * -ENOSPC - insufficient disk resources. | ||
2064 | * -EIO - i/o error. | ||
2065 | */ | ||
2066 | static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) | ||
2067 | { | ||
2068 | int extno, bitno, agno, sword, rc; | ||
2069 | struct metapage *amp = NULL, *bmp = NULL; | ||
2070 | struct iag *aiagp = NULL, *biagp = NULL; | ||
2071 | u32 mask; | ||
2072 | |||
2073 | /* check if this is the last free inode within the iag. | ||
2074 | * if so, it will have to be removed from the ag free | ||
2075 | * inode list, so get the iags preceeding and following | ||
2076 | * it on the list. | ||
2077 | */ | ||
2078 | if (iagp->nfreeinos == cpu_to_le32(1)) { | ||
2079 | if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { | ||
2080 | if ((rc = | ||
2081 | diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), | ||
2082 | &))) | ||
2083 | return (rc); | ||
2084 | aiagp = (struct iag *) amp->data; | ||
2085 | } | ||
2086 | |||
2087 | if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { | ||
2088 | if ((rc = | ||
2089 | diIAGRead(imap, | ||
2090 | le32_to_cpu(iagp->inofreeback), | ||
2091 | &bmp))) { | ||
2092 | if (amp) | ||
2093 | release_metapage(amp); | ||
2094 | return (rc); | ||
2095 | } | ||
2096 | biagp = (struct iag *) bmp->data; | ||
2097 | } | ||
2098 | } | ||
2099 | |||
2100 | /* get the ag number, extent number, inode number within | ||
2101 | * the extent. | ||
2102 | */ | ||
2103 | agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); | ||
2104 | extno = ino >> L2INOSPEREXT; | ||
2105 | bitno = ino & (INOSPEREXT - 1); | ||
2106 | |||
2107 | /* compute the mask for setting the map. | ||
2108 | */ | ||
2109 | mask = HIGHORDER >> bitno; | ||
2110 | |||
2111 | /* the inode should be free and backed. | ||
2112 | */ | ||
2113 | if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || | ||
2114 | ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || | ||
2115 | (addressPXD(&iagp->inoext[extno]) == 0)) { | ||
2116 | if (amp) | ||
2117 | release_metapage(amp); | ||
2118 | if (bmp) | ||
2119 | release_metapage(bmp); | ||
2120 | |||
2121 | jfs_error(imap->im_ipimap->i_sb, | ||
2122 | "diAllocBit: iag inconsistent"); | ||
2123 | return -EIO; | ||
2124 | } | ||
2125 | |||
2126 | /* mark the inode as allocated in the working map. | ||
2127 | */ | ||
2128 | iagp->wmap[extno] |= cpu_to_le32(mask); | ||
2129 | |||
2130 | /* check if all inodes within the extent are now | ||
2131 | * allocated. if so, update the free inode summary | ||
2132 | * map to reflect this. | ||
2133 | */ | ||
2134 | if (iagp->wmap[extno] == cpu_to_le32(ONES)) { | ||
2135 | sword = extno >> L2EXTSPERSUM; | ||
2136 | bitno = extno & (EXTSPERSUM - 1); | ||
2137 | iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); | ||
2138 | } | ||
2139 | |||
2140 | /* if this was the last free inode in the iag, remove the | ||
2141 | * iag from the ag free inode list. | ||
2142 | */ | ||
2143 | if (iagp->nfreeinos == cpu_to_le32(1)) { | ||
2144 | if (amp) { | ||
2145 | aiagp->inofreeback = iagp->inofreeback; | ||
2146 | write_metapage(amp); | ||
2147 | } | ||
2148 | |||
2149 | if (bmp) { | ||
2150 | biagp->inofreefwd = iagp->inofreefwd; | ||
2151 | write_metapage(bmp); | ||
2152 | } else { | ||
2153 | imap->im_agctl[agno].inofree = | ||
2154 | le32_to_cpu(iagp->inofreefwd); | ||
2155 | } | ||
2156 | iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); | ||
2157 | } | ||
2158 | |||
2159 | /* update the free inode count at the iag, ag, inode | ||
2160 | * map levels. | ||
2161 | */ | ||
2162 | iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1); | ||
2163 | imap->im_agctl[agno].numfree -= 1; | ||
2164 | atomic_dec(&imap->im_numfree); | ||
2165 | |||
2166 | return (0); | ||
2167 | } | ||
2168 | |||
2169 | |||
2170 | /* | ||
2171 | * NAME: diNewExt(imap,iagp,extno) | ||
2172 | * | ||
2173 | * FUNCTION: initialize a new extent of inodes for an iag, allocating | ||
2174 | * the first inode of the extent for use for the current | ||
2175 | * allocation request. | ||
2176 | * | ||
2177 | * disk resources are allocated for the new extent of inodes | ||
2178 | * and the inodes themselves are initialized to reflect their | ||
2179 | * existence within the extent (i.e. their inode numbers and | ||
2180 | * inode extent addresses are set) and their initial state | ||
2181 | * (mode and link count are set to zero). | ||
2182 | * | ||
2183 | * if the iag is new, it is not yet on an ag extent free list | ||
2184 | * but will now be placed on this list. | ||
2185 | * | ||
2186 | * if the allocation of the new extent causes the iag to | ||
2187 | * have no free extent, the iag will be removed from the | ||
2188 | * ag extent free list. | ||
2189 | * | ||
2190 | * if the iag has no free backed inodes, it will be placed | ||
2191 | * on the ag free inode list, since the addition of the new | ||
2192 | * extent will now cause it to have free inodes. | ||
2193 | * | ||
2194 | * a careful update approach is used to provide consistency | ||
2195 | * (i.e. list consistency) in the face of updates to multiple | ||
2196 | * buffers. under this approach, all required buffers are | ||
2197 | * obtained before making any updates and are held until all | ||
2198 | * updates are complete. | ||
2199 | * | ||
2200 | * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on | ||
2201 | * this AG. Must have read lock on imap inode. | ||
2202 | * | ||
2203 | * PARAMETERS: | ||
2204 | * imap - pointer to inode map control structure. | ||
2205 | * iagp - pointer to iag. | ||
2206 | * extno - extent number. | ||
2207 | * | ||
2208 | * RETURN VALUES: | ||
2209 | * 0 - success. | ||
2210 | * -ENOSPC - insufficient disk resources. | ||
2211 | * -EIO - i/o error. | ||
2212 | */ | ||
2213 | static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) | ||
2214 | { | ||
2215 | int agno, iagno, fwd, back, freei = 0, sword, rc; | ||
2216 | struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL; | ||
2217 | struct metapage *amp, *bmp, *cmp, *dmp; | ||
2218 | struct inode *ipimap; | ||
2219 | s64 blkno, hint; | ||
2220 | int i, j; | ||
2221 | u32 mask; | ||
2222 | ino_t ino; | ||
2223 | struct dinode *dp; | ||
2224 | struct jfs_sb_info *sbi; | ||
2225 | |||
2226 | /* better have free extents. | ||
2227 | */ | ||
2228 | if (!iagp->nfreeexts) { | ||
2229 | jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); | ||
2230 | return -EIO; | ||
2231 | } | ||
2232 | |||
2233 | /* get the inode map inode. | ||
2234 | */ | ||
2235 | ipimap = imap->im_ipimap; | ||
2236 | sbi = JFS_SBI(ipimap->i_sb); | ||
2237 | |||
2238 | amp = bmp = cmp = NULL; | ||
2239 | |||
2240 | /* get the ag and iag numbers for this iag. | ||
2241 | */ | ||
2242 | agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); | ||
2243 | iagno = le32_to_cpu(iagp->iagnum); | ||
2244 | |||
2245 | /* check if this is the last free extent within the | ||
2246 | * iag. if so, the iag must be removed from the ag | ||
2247 | * free extent list, so get the iags preceeding and | ||
2248 | * following the iag on this list. | ||
2249 | */ | ||
2250 | if (iagp->nfreeexts == cpu_to_le32(1)) { | ||
2251 | if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { | ||
2252 | if ((rc = diIAGRead(imap, fwd, &))) | ||
2253 | return (rc); | ||
2254 | aiagp = (struct iag *) amp->data; | ||
2255 | } | ||
2256 | |||
2257 | if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { | ||
2258 | if ((rc = diIAGRead(imap, back, &bmp))) | ||
2259 | goto error_out; | ||
2260 | biagp = (struct iag *) bmp->data; | ||
2261 | } | ||
2262 | } else { | ||
2263 | /* the iag has free extents. if all extents are free | ||
2264 | * (as is the case for a newly allocated iag), the iag | ||
2265 | * must be added to the ag free extent list, so get | ||
2266 | * the iag at the head of the list in preparation for | ||
2267 | * adding this iag to this list. | ||
2268 | */ | ||
2269 | fwd = back = -1; | ||
2270 | if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { | ||
2271 | if ((fwd = imap->im_agctl[agno].extfree) >= 0) { | ||
2272 | if ((rc = diIAGRead(imap, fwd, &))) | ||
2273 | goto error_out; | ||
2274 | aiagp = (struct iag *) amp->data; | ||
2275 | } | ||
2276 | } | ||
2277 | } | ||
2278 | |||
2279 | /* check if the iag has no free inodes. if so, the iag | ||
2280 | * will have to be added to the ag free inode list, so get | ||
2281 | * the iag at the head of the list in preparation for | ||
2282 | * adding this iag to this list. in doing this, we must | ||
2283 | * check if we already have the iag at the head of | ||
2284 | * the list in hand. | ||
2285 | */ | ||
2286 | if (iagp->nfreeinos == 0) { | ||
2287 | freei = imap->im_agctl[agno].inofree; | ||
2288 | |||
2289 | if (freei >= 0) { | ||
2290 | if (freei == fwd) { | ||
2291 | ciagp = aiagp; | ||
2292 | } else if (freei == back) { | ||
2293 | ciagp = biagp; | ||
2294 | } else { | ||
2295 | if ((rc = diIAGRead(imap, freei, &cmp))) | ||
2296 | goto error_out; | ||
2297 | ciagp = (struct iag *) cmp->data; | ||
2298 | } | ||
2299 | if (ciagp == NULL) { | ||
2300 | jfs_error(imap->im_ipimap->i_sb, | ||
2301 | "diNewExt: ciagp == NULL"); | ||
2302 | rc = -EIO; | ||
2303 | goto error_out; | ||
2304 | } | ||
2305 | } | ||
2306 | } | ||
2307 | |||
2308 | /* allocate disk space for the inode extent. | ||
2309 | */ | ||
2310 | if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) | ||
2311 | hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; | ||
2312 | else | ||
2313 | hint = addressPXD(&iagp->inoext[extno - 1]) + | ||
2314 | lengthPXD(&iagp->inoext[extno - 1]) - 1; | ||
2315 | |||
2316 | if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) | ||
2317 | goto error_out; | ||
2318 | |||
2319 | /* compute the inode number of the first inode within the | ||
2320 | * extent. | ||
2321 | */ | ||
2322 | ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); | ||
2323 | |||
2324 | /* initialize the inodes within the newly allocated extent a | ||
2325 | * page at a time. | ||
2326 | */ | ||
2327 | for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { | ||
2328 | /* get a buffer for this page of disk inodes. | ||
2329 | */ | ||
2330 | dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); | ||
2331 | if (dmp == NULL) { | ||
2332 | rc = -EIO; | ||
2333 | goto error_out; | ||
2334 | } | ||
2335 | dp = (struct dinode *) dmp->data; | ||
2336 | |||
2337 | /* initialize the inode number, mode, link count and | ||
2338 | * inode extent address. | ||
2339 | */ | ||
2340 | for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { | ||
2341 | dp->di_inostamp = cpu_to_le32(sbi->inostamp); | ||
2342 | dp->di_number = cpu_to_le32(ino); | ||
2343 | dp->di_fileset = cpu_to_le32(FILESYSTEM_I); | ||
2344 | dp->di_mode = 0; | ||
2345 | dp->di_nlink = 0; | ||
2346 | PXDaddress(&(dp->di_ixpxd), blkno); | ||
2347 | PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); | ||
2348 | } | ||
2349 | write_metapage(dmp); | ||
2350 | } | ||
2351 | |||
2352 | /* if this is the last free extent within the iag, remove the | ||
2353 | * iag from the ag free extent list. | ||
2354 | */ | ||
2355 | if (iagp->nfreeexts == cpu_to_le32(1)) { | ||
2356 | if (fwd >= 0) | ||
2357 | aiagp->extfreeback = iagp->extfreeback; | ||
2358 | |||
2359 | if (back >= 0) | ||
2360 | biagp->extfreefwd = iagp->extfreefwd; | ||
2361 | else | ||
2362 | imap->im_agctl[agno].extfree = | ||
2363 | le32_to_cpu(iagp->extfreefwd); | ||
2364 | |||
2365 | iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); | ||
2366 | } else { | ||
2367 | /* if the iag has all free extents (newly allocated iag), | ||
2368 | * add the iag to the ag free extent list. | ||
2369 | */ | ||
2370 | if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { | ||
2371 | if (fwd >= 0) | ||
2372 | aiagp->extfreeback = cpu_to_le32(iagno); | ||
2373 | |||
2374 | iagp->extfreefwd = cpu_to_le32(fwd); | ||
2375 | iagp->extfreeback = cpu_to_le32(-1); | ||
2376 | imap->im_agctl[agno].extfree = iagno; | ||
2377 | } | ||
2378 | } | ||
2379 | |||
2380 | /* if the iag has no free inodes, add the iag to the | ||
2381 | * ag free inode list. | ||
2382 | */ | ||
2383 | if (iagp->nfreeinos == 0) { | ||
2384 | if (freei >= 0) | ||
2385 | ciagp->inofreeback = cpu_to_le32(iagno); | ||
2386 | |||
2387 | iagp->inofreefwd = | ||
2388 | cpu_to_le32(imap->im_agctl[agno].inofree); | ||
2389 | iagp->inofreeback = cpu_to_le32(-1); | ||
2390 | imap->im_agctl[agno].inofree = iagno; | ||
2391 | } | ||
2392 | |||
2393 | /* initialize the extent descriptor of the extent. */ | ||
2394 | PXDlength(&iagp->inoext[extno], imap->im_nbperiext); | ||
2395 | PXDaddress(&iagp->inoext[extno], blkno); | ||
2396 | |||
2397 | /* initialize the working and persistent map of the extent. | ||
2398 | * the working map will be initialized such that | ||
2399 | * it indicates the first inode of the extent is allocated. | ||
2400 | */ | ||
2401 | iagp->wmap[extno] = cpu_to_le32(HIGHORDER); | ||
2402 | iagp->pmap[extno] = 0; | ||
2403 | |||
2404 | /* update the free inode and free extent summary maps | ||
2405 | * for the extent to indicate the extent has free inodes | ||
2406 | * and no longer represents a free extent. | ||
2407 | */ | ||
2408 | sword = extno >> L2EXTSPERSUM; | ||
2409 | mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); | ||
2410 | iagp->extsmap[sword] |= cpu_to_le32(mask); | ||
2411 | iagp->inosmap[sword] &= cpu_to_le32(~mask); | ||
2412 | |||
2413 | /* update the free inode and free extent counts for the | ||
2414 | * iag. | ||
2415 | */ | ||
2416 | iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + | ||
2417 | (INOSPEREXT - 1)); | ||
2418 | iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1); | ||
2419 | |||
2420 | /* update the free and backed inode counts for the ag. | ||
2421 | */ | ||
2422 | imap->im_agctl[agno].numfree += (INOSPEREXT - 1); | ||
2423 | imap->im_agctl[agno].numinos += INOSPEREXT; | ||
2424 | |||
2425 | /* update the free and backed inode counts for the inode map. | ||
2426 | */ | ||
2427 | atomic_add(INOSPEREXT - 1, &imap->im_numfree); | ||
2428 | atomic_add(INOSPEREXT, &imap->im_numinos); | ||
2429 | |||
2430 | /* write the iags. | ||
2431 | */ | ||
2432 | if (amp) | ||
2433 | write_metapage(amp); | ||
2434 | if (bmp) | ||
2435 | write_metapage(bmp); | ||
2436 | if (cmp) | ||
2437 | write_metapage(cmp); | ||
2438 | |||
2439 | return (0); | ||
2440 | |||
2441 | error_out: | ||
2442 | |||
2443 | /* release the iags. | ||
2444 | */ | ||
2445 | if (amp) | ||
2446 | release_metapage(amp); | ||
2447 | if (bmp) | ||
2448 | release_metapage(bmp); | ||
2449 | if (cmp) | ||
2450 | release_metapage(cmp); | ||
2451 | |||
2452 | return (rc); | ||
2453 | } | ||
2454 | |||
2455 | |||
2456 | /* | ||
2457 | * NAME: diNewIAG(imap,iagnop,agno) | ||
2458 | * | ||
2459 | * FUNCTION: allocate a new iag for an allocation group. | ||
2460 | * | ||
2461 | * first tries to allocate the iag from the inode map | ||
2462 | * iagfree list: | ||
2463 | * if the list has free iags, the head of the list is removed | ||
2464 | * and returned to satisfy the request. | ||
2465 | * if the inode map's iag free list is empty, the inode map | ||
2466 | * is extended to hold a new iag. this new iag is initialized | ||
2467 | * and returned to satisfy the request. | ||
2468 | * | ||
2469 | * PARAMETERS: | ||
2470 | * imap - pointer to inode map control structure. | ||
2471 | * iagnop - pointer to an iag number set with the number of the | ||
2472 | * newly allocated iag upon successful return. | ||
2473 | * agno - allocation group number. | ||
2474 | * bpp - Buffer pointer to be filled in with new IAG's buffer | ||
2475 | * | ||
2476 | * RETURN VALUES: | ||
2477 | * 0 - success. | ||
2478 | * -ENOSPC - insufficient disk resources. | ||
2479 | * -EIO - i/o error. | ||
2480 | * | ||
2481 | * serialization: | ||
2482 | * AG lock held on entry/exit; | ||
2483 | * write lock on the map is held inside; | ||
2484 | * read lock on the map is held on successful completion; | ||
2485 | * | ||
2486 | * note: new iag transaction: | ||
2487 | * . synchronously write iag; | ||
2488 | * . write log of xtree and inode of imap; | ||
2489 | * . commit; | ||
2490 | * . synchronous write of xtree (right to left, bottom to top); | ||
2491 | * . at start of logredo(): init in-memory imap with one additional iag page; | ||
2492 | * . at end of logredo(): re-read imap inode to determine | ||
2493 | * new imap size; | ||
2494 | */ | ||
2495 | static int | ||
2496 | diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) | ||
2497 | { | ||
2498 | int rc; | ||
2499 | int iagno, i, xlen; | ||
2500 | struct inode *ipimap; | ||
2501 | struct super_block *sb; | ||
2502 | struct jfs_sb_info *sbi; | ||
2503 | struct metapage *mp; | ||
2504 | struct iag *iagp; | ||
2505 | s64 xaddr = 0; | ||
2506 | s64 blkno; | ||
2507 | tid_t tid; | ||
2508 | #ifdef _STILL_TO_PORT | ||
2509 | xad_t xad; | ||
2510 | #endif /* _STILL_TO_PORT */ | ||
2511 | struct inode *iplist[1]; | ||
2512 | |||
2513 | /* pick up pointers to the inode map and mount inodes */ | ||
2514 | ipimap = imap->im_ipimap; | ||
2515 | sb = ipimap->i_sb; | ||
2516 | sbi = JFS_SBI(sb); | ||
2517 | |||
2518 | /* acquire the free iag lock */ | ||
2519 | IAGFREE_LOCK(imap); | ||
2520 | |||
2521 | /* if there are any iags on the inode map free iag list, | ||
2522 | * allocate the iag from the head of the list. | ||
2523 | */ | ||
2524 | if (imap->im_freeiag >= 0) { | ||
2525 | /* pick up the iag number at the head of the list */ | ||
2526 | iagno = imap->im_freeiag; | ||
2527 | |||
2528 | /* determine the logical block number of the iag */ | ||
2529 | blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); | ||
2530 | } else { | ||
2531 | /* no free iags. the inode map will have to be extented | ||
2532 | * to include a new iag. | ||
2533 | */ | ||
2534 | |||
2535 | /* acquire inode map lock */ | ||
2536 | IWRITE_LOCK(ipimap); | ||
2537 | |||
2538 | if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { | ||
2539 | IWRITE_UNLOCK(ipimap); | ||
2540 | IAGFREE_UNLOCK(imap); | ||
2541 | jfs_error(imap->im_ipimap->i_sb, | ||
2542 | "diNewIAG: ipimap->i_size is wrong"); | ||
2543 | return -EIO; | ||
2544 | } | ||
2545 | |||
2546 | |||
2547 | /* get the next avaliable iag number */ | ||
2548 | iagno = imap->im_nextiag; | ||
2549 | |||
2550 | /* make sure that we have not exceeded the maximum inode | ||
2551 | * number limit. | ||
2552 | */ | ||
2553 | if (iagno > (MAXIAGS - 1)) { | ||
2554 | /* release the inode map lock */ | ||
2555 | IWRITE_UNLOCK(ipimap); | ||
2556 | |||
2557 | rc = -ENOSPC; | ||
2558 | goto out; | ||
2559 | } | ||
2560 | |||
2561 | /* | ||
2562 | * synchronously append new iag page. | ||
2563 | */ | ||
2564 | /* determine the logical address of iag page to append */ | ||
2565 | blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); | ||
2566 | |||
2567 | /* Allocate extent for new iag page */ | ||
2568 | xlen = sbi->nbperpage; | ||
2569 | if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { | ||
2570 | /* release the inode map lock */ | ||
2571 | IWRITE_UNLOCK(ipimap); | ||
2572 | |||
2573 | goto out; | ||
2574 | } | ||
2575 | |||
2576 | /* assign a buffer for the page */ | ||
2577 | mp = get_metapage(ipimap, xaddr, PSIZE, 1); | ||
2578 | if (!mp) { | ||
2579 | /* Free the blocks allocated for the iag since it was | ||
2580 | * not successfully added to the inode map | ||
2581 | */ | ||
2582 | dbFree(ipimap, xaddr, (s64) xlen); | ||
2583 | |||
2584 | /* release the inode map lock */ | ||
2585 | IWRITE_UNLOCK(ipimap); | ||
2586 | |||
2587 | rc = -EIO; | ||
2588 | goto out; | ||
2589 | } | ||
2590 | iagp = (struct iag *) mp->data; | ||
2591 | |||
2592 | /* init the iag */ | ||
2593 | memset(iagp, 0, sizeof(struct iag)); | ||
2594 | iagp->iagnum = cpu_to_le32(iagno); | ||
2595 | iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); | ||
2596 | iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); | ||
2597 | iagp->iagfree = cpu_to_le32(-1); | ||
2598 | iagp->nfreeinos = 0; | ||
2599 | iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); | ||
2600 | |||
2601 | /* initialize the free inode summary map (free extent | ||
2602 | * summary map initialization handled by bzero). | ||
2603 | */ | ||
2604 | for (i = 0; i < SMAPSZ; i++) | ||
2605 | iagp->inosmap[i] = cpu_to_le32(ONES); | ||
2606 | |||
2607 | /* | ||
2608 | * Invalidate the page after writing and syncing it. | ||
2609 | * After it's initialized, we access it in a different | ||
2610 | * address space | ||
2611 | */ | ||
2612 | set_bit(META_discard, &mp->flag); | ||
2613 | flush_metapage(mp); | ||
2614 | |||
2615 | /* | ||
2616 | * start tyransaction of update of the inode map | ||
2617 | * addressing structure pointing to the new iag page; | ||
2618 | */ | ||
2619 | tid = txBegin(sb, COMMIT_FORCE); | ||
2620 | down(&JFS_IP(ipimap)->commit_sem); | ||
2621 | |||
2622 | /* update the inode map addressing structure to point to it */ | ||
2623 | if ((rc = | ||
2624 | xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { | ||
2625 | txEnd(tid); | ||
2626 | up(&JFS_IP(ipimap)->commit_sem); | ||
2627 | /* Free the blocks allocated for the iag since it was | ||
2628 | * not successfully added to the inode map | ||
2629 | */ | ||
2630 | dbFree(ipimap, xaddr, (s64) xlen); | ||
2631 | |||
2632 | /* release the inode map lock */ | ||
2633 | IWRITE_UNLOCK(ipimap); | ||
2634 | |||
2635 | goto out; | ||
2636 | } | ||
2637 | |||
2638 | /* update the inode map's inode to reflect the extension */ | ||
2639 | ipimap->i_size += PSIZE; | ||
2640 | inode_add_bytes(ipimap, PSIZE); | ||
2641 | |||
2642 | /* | ||
2643 | * txCommit(COMMIT_FORCE) will synchronously write address | ||
2644 | * index pages and inode after commit in careful update order | ||
2645 | * of address index pages (right to left, bottom up); | ||
2646 | */ | ||
2647 | iplist[0] = ipimap; | ||
2648 | rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); | ||
2649 | |||
2650 | txEnd(tid); | ||
2651 | up(&JFS_IP(ipimap)->commit_sem); | ||
2652 | |||
2653 | duplicateIXtree(sb, blkno, xlen, &xaddr); | ||
2654 | |||
2655 | /* update the next avaliable iag number */ | ||
2656 | imap->im_nextiag += 1; | ||
2657 | |||
2658 | /* Add the iag to the iag free list so we don't lose the iag | ||
2659 | * if a failure happens now. | ||
2660 | */ | ||
2661 | imap->im_freeiag = iagno; | ||
2662 | |||
2663 | /* Until we have logredo working, we want the imap inode & | ||
2664 | * control page to be up to date. | ||
2665 | */ | ||
2666 | diSync(ipimap); | ||
2667 | |||
2668 | /* release the inode map lock */ | ||
2669 | IWRITE_UNLOCK(ipimap); | ||
2670 | } | ||
2671 | |||
2672 | /* obtain read lock on map */ | ||
2673 | IREAD_LOCK(ipimap); | ||
2674 | |||
2675 | /* read the iag */ | ||
2676 | if ((rc = diIAGRead(imap, iagno, &mp))) { | ||
2677 | IREAD_UNLOCK(ipimap); | ||
2678 | rc = -EIO; | ||
2679 | goto out; | ||
2680 | } | ||
2681 | iagp = (struct iag *) mp->data; | ||
2682 | |||
2683 | /* remove the iag from the iag free list */ | ||
2684 | imap->im_freeiag = le32_to_cpu(iagp->iagfree); | ||
2685 | iagp->iagfree = cpu_to_le32(-1); | ||
2686 | |||
2687 | /* set the return iag number and buffer pointer */ | ||
2688 | *iagnop = iagno; | ||
2689 | *mpp = mp; | ||
2690 | |||
2691 | out: | ||
2692 | /* release the iag free lock */ | ||
2693 | IAGFREE_UNLOCK(imap); | ||
2694 | |||
2695 | return (rc); | ||
2696 | } | ||
2697 | |||
2698 | /* | ||
2699 | * NAME: diIAGRead() | ||
2700 | * | ||
2701 | * FUNCTION: get the buffer for the specified iag within a fileset | ||
2702 | * or aggregate inode map. | ||
2703 | * | ||
2704 | * PARAMETERS: | ||
2705 | * imap - pointer to inode map control structure. | ||
2706 | * iagno - iag number. | ||
2707 | * bpp - point to buffer pointer to be filled in on successful | ||
2708 | * exit. | ||
2709 | * | ||
2710 | * SERIALIZATION: | ||
2711 | * must have read lock on imap inode | ||
2712 | * (When called by diExtendFS, the filesystem is quiesced, therefore | ||
2713 | * the read lock is unnecessary.) | ||
2714 | * | ||
2715 | * RETURN VALUES: | ||
2716 | * 0 - success. | ||
2717 | * -EIO - i/o error. | ||
2718 | */ | ||
2719 | static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) | ||
2720 | { | ||
2721 | struct inode *ipimap = imap->im_ipimap; | ||
2722 | s64 blkno; | ||
2723 | |||
2724 | /* compute the logical block number of the iag. */ | ||
2725 | blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); | ||
2726 | |||
2727 | /* read the iag. */ | ||
2728 | *mpp = read_metapage(ipimap, blkno, PSIZE, 0); | ||
2729 | if (*mpp == NULL) { | ||
2730 | return -EIO; | ||
2731 | } | ||
2732 | |||
2733 | return (0); | ||
2734 | } | ||
2735 | |||
2736 | /* | ||
2737 | * NAME: diFindFree() | ||
2738 | * | ||
2739 | * FUNCTION: find the first free bit in a word starting at | ||
2740 | * the specified bit position. | ||
2741 | * | ||
2742 | * PARAMETERS: | ||
2743 | * word - word to be examined. | ||
2744 | * start - starting bit position. | ||
2745 | * | ||
2746 | * RETURN VALUES: | ||
2747 | * bit position of first free bit in the word or 32 if | ||
2748 | * no free bits were found. | ||
2749 | */ | ||
2750 | static int diFindFree(u32 word, int start) | ||
2751 | { | ||
2752 | int bitno; | ||
2753 | assert(start < 32); | ||
2754 | /* scan the word for the first free bit. */ | ||
2755 | for (word <<= start, bitno = start; bitno < 32; | ||
2756 | bitno++, word <<= 1) { | ||
2757 | if ((word & HIGHORDER) == 0) | ||
2758 | break; | ||
2759 | } | ||
2760 | return (bitno); | ||
2761 | } | ||
2762 | |||
2763 | /* | ||
2764 | * NAME: diUpdatePMap() | ||
2765 | * | ||
2766 | * FUNCTION: Update the persistent map in an IAG for the allocation or | ||
2767 | * freeing of the specified inode. | ||
2768 | * | ||
2769 | * PRE CONDITIONS: Working map has already been updated for allocate. | ||
2770 | * | ||
2771 | * PARAMETERS: | ||
2772 | * ipimap - Incore inode map inode | ||
2773 | * inum - Number of inode to mark in permanent map | ||
2774 | * is_free - If TRUE indicates inode should be marked freed, otherwise | ||
2775 | * indicates inode should be marked allocated. | ||
2776 | * | ||
2777 | * RETURN VALUES: | ||
2778 | * 0 for success | ||
2779 | */ | ||
2780 | int | ||
2781 | diUpdatePMap(struct inode *ipimap, | ||
2782 | unsigned long inum, boolean_t is_free, struct tblock * tblk) | ||
2783 | { | ||
2784 | int rc; | ||
2785 | struct iag *iagp; | ||
2786 | struct metapage *mp; | ||
2787 | int iagno, ino, extno, bitno; | ||
2788 | struct inomap *imap; | ||
2789 | u32 mask; | ||
2790 | struct jfs_log *log; | ||
2791 | int lsn, difft, diffp; | ||
2792 | |||
2793 | imap = JFS_IP(ipimap)->i_imap; | ||
2794 | /* get the iag number containing the inode */ | ||
2795 | iagno = INOTOIAG(inum); | ||
2796 | /* make sure that the iag is contained within the map */ | ||
2797 | if (iagno >= imap->im_nextiag) { | ||
2798 | jfs_error(ipimap->i_sb, | ||
2799 | "diUpdatePMap: the iag is outside the map"); | ||
2800 | return -EIO; | ||
2801 | } | ||
2802 | /* read the iag */ | ||
2803 | IREAD_LOCK(ipimap); | ||
2804 | rc = diIAGRead(imap, iagno, &mp); | ||
2805 | IREAD_UNLOCK(ipimap); | ||
2806 | if (rc) | ||
2807 | return (rc); | ||
2808 | iagp = (struct iag *) mp->data; | ||
2809 | /* get the inode number and extent number of the inode within | ||
2810 | * the iag and the inode number within the extent. | ||
2811 | */ | ||
2812 | ino = inum & (INOSPERIAG - 1); | ||
2813 | extno = ino >> L2INOSPEREXT; | ||
2814 | bitno = ino & (INOSPEREXT - 1); | ||
2815 | mask = HIGHORDER >> bitno; | ||
2816 | /* | ||
2817 | * mark the inode free in persistent map: | ||
2818 | */ | ||
2819 | if (is_free == TRUE) { | ||
2820 | /* The inode should have been allocated both in working | ||
2821 | * map and in persistent map; | ||
2822 | * the inode will be freed from working map at the release | ||
2823 | * of last reference release; | ||
2824 | */ | ||
2825 | if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { | ||
2826 | jfs_error(ipimap->i_sb, | ||
2827 | "diUpdatePMap: inode %ld not marked as " | ||
2828 | "allocated in wmap!", inum); | ||
2829 | } | ||
2830 | if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { | ||
2831 | jfs_error(ipimap->i_sb, | ||
2832 | "diUpdatePMap: inode %ld not marked as " | ||
2833 | "allocated in pmap!", inum); | ||
2834 | } | ||
2835 | /* update the bitmap for the extent of the freed inode */ | ||
2836 | iagp->pmap[extno] &= cpu_to_le32(~mask); | ||
2837 | } | ||
2838 | /* | ||
2839 | * mark the inode allocated in persistent map: | ||
2840 | */ | ||
2841 | else { | ||
2842 | /* The inode should be already allocated in the working map | ||
2843 | * and should be free in persistent map; | ||
2844 | */ | ||
2845 | if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { | ||
2846 | release_metapage(mp); | ||
2847 | jfs_error(ipimap->i_sb, | ||
2848 | "diUpdatePMap: the inode is not allocated in " | ||
2849 | "the working map"); | ||
2850 | return -EIO; | ||
2851 | } | ||
2852 | if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { | ||
2853 | release_metapage(mp); | ||
2854 | jfs_error(ipimap->i_sb, | ||
2855 | "diUpdatePMap: the inode is not free in the " | ||
2856 | "persistent map"); | ||
2857 | return -EIO; | ||
2858 | } | ||
2859 | /* update the bitmap for the extent of the allocated inode */ | ||
2860 | iagp->pmap[extno] |= cpu_to_le32(mask); | ||
2861 | } | ||
2862 | /* | ||
2863 | * update iag lsn | ||
2864 | */ | ||
2865 | lsn = tblk->lsn; | ||
2866 | log = JFS_SBI(tblk->sb)->log; | ||
2867 | if (mp->lsn != 0) { | ||
2868 | /* inherit older/smaller lsn */ | ||
2869 | logdiff(difft, lsn, log); | ||
2870 | logdiff(diffp, mp->lsn, log); | ||
2871 | if (difft < diffp) { | ||
2872 | mp->lsn = lsn; | ||
2873 | /* move mp after tblock in logsync list */ | ||
2874 | LOGSYNC_LOCK(log); | ||
2875 | list_move(&mp->synclist, &tblk->synclist); | ||
2876 | LOGSYNC_UNLOCK(log); | ||
2877 | } | ||
2878 | /* inherit younger/larger clsn */ | ||
2879 | LOGSYNC_LOCK(log); | ||
2880 | assert(mp->clsn); | ||
2881 | logdiff(difft, tblk->clsn, log); | ||
2882 | logdiff(diffp, mp->clsn, log); | ||
2883 | if (difft > diffp) | ||
2884 | mp->clsn = tblk->clsn; | ||
2885 | LOGSYNC_UNLOCK(log); | ||
2886 | } else { | ||
2887 | mp->log = log; | ||
2888 | mp->lsn = lsn; | ||
2889 | /* insert mp after tblock in logsync list */ | ||
2890 | LOGSYNC_LOCK(log); | ||
2891 | log->count++; | ||
2892 | list_add(&mp->synclist, &tblk->synclist); | ||
2893 | mp->clsn = tblk->clsn; | ||
2894 | LOGSYNC_UNLOCK(log); | ||
2895 | } | ||
2896 | write_metapage(mp); | ||
2897 | return (0); | ||
2898 | } | ||
2899 | |||
2900 | /* | ||
2901 | * diExtendFS() | ||
2902 | * | ||
2903 | * function: update imap for extendfs(); | ||
2904 | * | ||
2905 | * note: AG size has been increased s.t. each k old contiguous AGs are | ||
2906 | * coalesced into a new AG; | ||
2907 | */ | ||
2908 | int diExtendFS(struct inode *ipimap, struct inode *ipbmap) | ||
2909 | { | ||
2910 | int rc, rcx = 0; | ||
2911 | struct inomap *imap = JFS_IP(ipimap)->i_imap; | ||
2912 | struct iag *iagp = NULL, *hiagp = NULL; | ||
2913 | struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap; | ||
2914 | struct metapage *bp, *hbp; | ||
2915 | int i, n, head; | ||
2916 | int numinos, xnuminos = 0, xnumfree = 0; | ||
2917 | s64 agstart; | ||
2918 | |||
2919 | jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d", | ||
2920 | imap->im_nextiag, atomic_read(&imap->im_numinos), | ||
2921 | atomic_read(&imap->im_numfree)); | ||
2922 | |||
2923 | /* | ||
2924 | * reconstruct imap | ||
2925 | * | ||
2926 | * coalesce contiguous k (newAGSize/oldAGSize) AGs; | ||
2927 | * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; | ||
2928 | * note: new AG size = old AG size * (2**x). | ||
2929 | */ | ||
2930 | |||
2931 | /* init per AG control information im_agctl[] */ | ||
2932 | for (i = 0; i < MAXAG; i++) { | ||
2933 | imap->im_agctl[i].inofree = -1; | ||
2934 | imap->im_agctl[i].extfree = -1; | ||
2935 | imap->im_agctl[i].numinos = 0; /* number of backed inodes */ | ||
2936 | imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ | ||
2937 | } | ||
2938 | |||
2939 | /* | ||
2940 | * process each iag page of the map. | ||
2941 | * | ||
2942 | * rebuild AG Free Inode List, AG Free Inode Extent List; | ||
2943 | */ | ||
2944 | for (i = 0; i < imap->im_nextiag; i++) { | ||
2945 | if ((rc = diIAGRead(imap, i, &bp))) { | ||
2946 | rcx = rc; | ||
2947 | continue; | ||
2948 | } | ||
2949 | iagp = (struct iag *) bp->data; | ||
2950 | if (le32_to_cpu(iagp->iagnum) != i) { | ||
2951 | release_metapage(bp); | ||
2952 | jfs_error(ipimap->i_sb, | ||
2953 | "diExtendFs: unexpected value of iagnum"); | ||
2954 | return -EIO; | ||
2955 | } | ||
2956 | |||
2957 | /* leave free iag in the free iag list */ | ||
2958 | if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { | ||
2959 | release_metapage(bp); | ||
2960 | continue; | ||
2961 | } | ||
2962 | |||
2963 | /* agstart that computes to the same ag is treated as same; */ | ||
2964 | agstart = le64_to_cpu(iagp->agstart); | ||
2965 | /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */ | ||
2966 | n = agstart >> mp->db_agl2size; | ||
2967 | |||
2968 | /* compute backed inodes */ | ||
2969 | numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) | ||
2970 | << L2INOSPEREXT; | ||
2971 | if (numinos > 0) { | ||
2972 | /* merge AG backed inodes */ | ||
2973 | imap->im_agctl[n].numinos += numinos; | ||
2974 | xnuminos += numinos; | ||
2975 | } | ||
2976 | |||
2977 | /* if any backed free inodes, insert at AG free inode list */ | ||
2978 | if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { | ||
2979 | if ((head = imap->im_agctl[n].inofree) == -1) { | ||
2980 | iagp->inofreefwd = cpu_to_le32(-1); | ||
2981 | iagp->inofreeback = cpu_to_le32(-1); | ||
2982 | } else { | ||
2983 | if ((rc = diIAGRead(imap, head, &hbp))) { | ||
2984 | rcx = rc; | ||
2985 | goto nextiag; | ||
2986 | } | ||
2987 | hiagp = (struct iag *) hbp->data; | ||
2988 | hiagp->inofreeback = iagp->iagnum; | ||
2989 | iagp->inofreefwd = cpu_to_le32(head); | ||
2990 | iagp->inofreeback = cpu_to_le32(-1); | ||
2991 | write_metapage(hbp); | ||
2992 | } | ||
2993 | |||
2994 | imap->im_agctl[n].inofree = | ||
2995 | le32_to_cpu(iagp->iagnum); | ||
2996 | |||
2997 | /* merge AG backed free inodes */ | ||
2998 | imap->im_agctl[n].numfree += | ||
2999 | le32_to_cpu(iagp->nfreeinos); | ||
3000 | xnumfree += le32_to_cpu(iagp->nfreeinos); | ||
3001 | } | ||
3002 | |||
3003 | /* if any free extents, insert at AG free extent list */ | ||
3004 | if (le32_to_cpu(iagp->nfreeexts) > 0) { | ||
3005 | if ((head = imap->im_agctl[n].extfree) == -1) { | ||
3006 | iagp->extfreefwd = cpu_to_le32(-1); | ||
3007 | iagp->extfreeback = cpu_to_le32(-1); | ||
3008 | } else { | ||
3009 | if ((rc = diIAGRead(imap, head, &hbp))) { | ||
3010 | rcx = rc; | ||
3011 | goto nextiag; | ||
3012 | } | ||
3013 | hiagp = (struct iag *) hbp->data; | ||
3014 | hiagp->extfreeback = iagp->iagnum; | ||
3015 | iagp->extfreefwd = cpu_to_le32(head); | ||
3016 | iagp->extfreeback = cpu_to_le32(-1); | ||
3017 | write_metapage(hbp); | ||
3018 | } | ||
3019 | |||
3020 | imap->im_agctl[n].extfree = | ||
3021 | le32_to_cpu(iagp->iagnum); | ||
3022 | } | ||
3023 | |||
3024 | nextiag: | ||
3025 | write_metapage(bp); | ||
3026 | } | ||
3027 | |||
3028 | if (xnuminos != atomic_read(&imap->im_numinos) || | ||
3029 | xnumfree != atomic_read(&imap->im_numfree)) { | ||
3030 | jfs_error(ipimap->i_sb, | ||
3031 | "diExtendFs: numinos or numfree incorrect"); | ||
3032 | return -EIO; | ||
3033 | } | ||
3034 | |||
3035 | return rcx; | ||
3036 | } | ||
3037 | |||
3038 | |||
3039 | /* | ||
3040 | * duplicateIXtree() | ||
3041 | * | ||
3042 | * serialization: IWRITE_LOCK held on entry/exit | ||
3043 | * | ||
3044 | * note: shadow page with regular inode (rel.2); | ||
3045 | */ | ||
3046 | static void duplicateIXtree(struct super_block *sb, s64 blkno, | ||
3047 | int xlen, s64 *xaddr) | ||
3048 | { | ||
3049 | struct jfs_superblock *j_sb; | ||
3050 | struct buffer_head *bh; | ||
3051 | struct inode *ip; | ||
3052 | tid_t tid; | ||
3053 | |||
3054 | /* if AIT2 ipmap2 is bad, do not try to update it */ | ||
3055 | if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */ | ||
3056 | return; | ||
3057 | ip = diReadSpecial(sb, FILESYSTEM_I, 1); | ||
3058 | if (ip == NULL) { | ||
3059 | JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; | ||
3060 | if (readSuper(sb, &bh)) | ||
3061 | return; | ||
3062 | j_sb = (struct jfs_superblock *)bh->b_data; | ||
3063 | j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); | ||
3064 | |||
3065 | mark_buffer_dirty(bh); | ||
3066 | sync_dirty_buffer(bh); | ||
3067 | brelse(bh); | ||
3068 | return; | ||
3069 | } | ||
3070 | |||
3071 | /* start transaction */ | ||
3072 | tid = txBegin(sb, COMMIT_FORCE); | ||
3073 | /* update the inode map addressing structure to point to it */ | ||
3074 | if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) { | ||
3075 | JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; | ||
3076 | txAbort(tid, 1); | ||
3077 | goto cleanup; | ||
3078 | |||
3079 | } | ||
3080 | /* update the inode map's inode to reflect the extension */ | ||
3081 | ip->i_size += PSIZE; | ||
3082 | inode_add_bytes(ip, PSIZE); | ||
3083 | txCommit(tid, 1, &ip, COMMIT_FORCE); | ||
3084 | cleanup: | ||
3085 | txEnd(tid); | ||
3086 | diFreeSpecial(ip); | ||
3087 | } | ||
3088 | |||
3089 | /* | ||
3090 | * NAME: copy_from_dinode() | ||
3091 | * | ||
3092 | * FUNCTION: Copies inode info from disk inode to in-memory inode | ||
3093 | * | ||
3094 | * RETURN VALUES: | ||
3095 | * 0 - success | ||
3096 | * -ENOMEM - insufficient memory | ||
3097 | */ | ||
3098 | static int copy_from_dinode(struct dinode * dip, struct inode *ip) | ||
3099 | { | ||
3100 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
3101 | |||
3102 | jfs_ip->fileset = le32_to_cpu(dip->di_fileset); | ||
3103 | jfs_ip->mode2 = le32_to_cpu(dip->di_mode); | ||
3104 | |||
3105 | ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; | ||
3106 | ip->i_nlink = le32_to_cpu(dip->di_nlink); | ||
3107 | ip->i_uid = le32_to_cpu(dip->di_uid); | ||
3108 | ip->i_gid = le32_to_cpu(dip->di_gid); | ||
3109 | ip->i_size = le64_to_cpu(dip->di_size); | ||
3110 | ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); | ||
3111 | ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); | ||
3112 | ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); | ||
3113 | ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); | ||
3114 | ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); | ||
3115 | ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); | ||
3116 | ip->i_blksize = ip->i_sb->s_blocksize; | ||
3117 | ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); | ||
3118 | ip->i_generation = le32_to_cpu(dip->di_gen); | ||
3119 | |||
3120 | jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ | ||
3121 | jfs_ip->acl = dip->di_acl; /* as are dxd's */ | ||
3122 | jfs_ip->ea = dip->di_ea; | ||
3123 | jfs_ip->next_index = le32_to_cpu(dip->di_next_index); | ||
3124 | jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); | ||
3125 | jfs_ip->acltype = le32_to_cpu(dip->di_acltype); | ||
3126 | |||
3127 | if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) { | ||
3128 | jfs_ip->dev = le32_to_cpu(dip->di_rdev); | ||
3129 | ip->i_rdev = new_decode_dev(jfs_ip->dev); | ||
3130 | } | ||
3131 | |||
3132 | if (S_ISDIR(ip->i_mode)) { | ||
3133 | memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); | ||
3134 | } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { | ||
3135 | memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); | ||
3136 | } else | ||
3137 | memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128); | ||
3138 | |||
3139 | /* Zero the in-memory-only stuff */ | ||
3140 | jfs_ip->cflag = 0; | ||
3141 | jfs_ip->btindex = 0; | ||
3142 | jfs_ip->btorder = 0; | ||
3143 | jfs_ip->bxflag = 0; | ||
3144 | jfs_ip->blid = 0; | ||
3145 | jfs_ip->atlhead = 0; | ||
3146 | jfs_ip->atltail = 0; | ||
3147 | jfs_ip->xtlid = 0; | ||
3148 | return (0); | ||
3149 | } | ||
3150 | |||
3151 | /* | ||
3152 | * NAME: copy_to_dinode() | ||
3153 | * | ||
3154 | * FUNCTION: Copies inode info from in-memory inode to disk inode | ||
3155 | */ | ||
3156 | static void copy_to_dinode(struct dinode * dip, struct inode *ip) | ||
3157 | { | ||
3158 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
3159 | |||
3160 | dip->di_fileset = cpu_to_le32(jfs_ip->fileset); | ||
3161 | dip->di_inostamp = cpu_to_le32(JFS_SBI(ip->i_sb)->inostamp); | ||
3162 | dip->di_number = cpu_to_le32(ip->i_ino); | ||
3163 | dip->di_gen = cpu_to_le32(ip->i_generation); | ||
3164 | dip->di_size = cpu_to_le64(ip->i_size); | ||
3165 | dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); | ||
3166 | dip->di_nlink = cpu_to_le32(ip->i_nlink); | ||
3167 | dip->di_uid = cpu_to_le32(ip->i_uid); | ||
3168 | dip->di_gid = cpu_to_le32(ip->i_gid); | ||
3169 | /* | ||
3170 | * mode2 is only needed for storing the higher order bits. | ||
3171 | * Trust i_mode for the lower order ones | ||
3172 | */ | ||
3173 | dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | ip->i_mode); | ||
3174 | dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); | ||
3175 | dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); | ||
3176 | dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); | ||
3177 | dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); | ||
3178 | dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); | ||
3179 | dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); | ||
3180 | dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ | ||
3181 | dip->di_acl = jfs_ip->acl; /* as are dxd's */ | ||
3182 | dip->di_ea = jfs_ip->ea; | ||
3183 | dip->di_next_index = cpu_to_le32(jfs_ip->next_index); | ||
3184 | dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); | ||
3185 | dip->di_otime.tv_nsec = 0; | ||
3186 | dip->di_acltype = cpu_to_le32(jfs_ip->acltype); | ||
3187 | if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) | ||
3188 | dip->di_rdev = cpu_to_le32(jfs_ip->dev); | ||
3189 | } | ||
3190 | |||
3191 | #ifdef _JFS_DEBUG_IMAP | ||
3192 | /* | ||
3193 | * DBGdiInit() | ||
3194 | */ | ||
3195 | static void *DBGdiInit(struct inomap * imap) | ||
3196 | { | ||
3197 | u32 *dimap; | ||
3198 | int size; | ||
3199 | size = 64 * 1024; | ||
3200 | if ((dimap = (u32 *) xmalloc(size, L2PSIZE, kernel_heap)) == NULL) | ||
3201 | assert(0); | ||
3202 | bzero((void *) dimap, size); | ||
3203 | imap->im_DBGdimap = dimap; | ||
3204 | } | ||
3205 | |||
3206 | /* | ||
3207 | * DBGdiAlloc() | ||
3208 | */ | ||
3209 | static void DBGdiAlloc(struct inomap * imap, ino_t ino) | ||
3210 | { | ||
3211 | u32 *dimap = imap->im_DBGdimap; | ||
3212 | int w, b; | ||
3213 | u32 m; | ||
3214 | w = ino >> 5; | ||
3215 | b = ino & 31; | ||
3216 | m = 0x80000000 >> b; | ||
3217 | assert(w < 64 * 256); | ||
3218 | if (dimap[w] & m) { | ||
3219 | printk("DEBUG diAlloc: duplicate alloc ino:0x%x\n", ino); | ||
3220 | } | ||
3221 | dimap[w] |= m; | ||
3222 | } | ||
3223 | |||
3224 | /* | ||
3225 | * DBGdiFree() | ||
3226 | */ | ||
3227 | static void DBGdiFree(struct inomap * imap, ino_t ino) | ||
3228 | { | ||
3229 | u32 *dimap = imap->im_DBGdimap; | ||
3230 | int w, b; | ||
3231 | u32 m; | ||
3232 | w = ino >> 5; | ||
3233 | b = ino & 31; | ||
3234 | m = 0x80000000 >> b; | ||
3235 | assert(w < 64 * 256); | ||
3236 | if ((dimap[w] & m) == 0) { | ||
3237 | printk("DEBUG diFree: duplicate free ino:0x%x\n", ino); | ||
3238 | } | ||
3239 | dimap[w] &= ~m; | ||
3240 | } | ||
3241 | |||
3242 | static void dump_cp(struct inomap * ipimap, char *function, int line) | ||
3243 | { | ||
3244 | printk("\n* ********* *\nControl Page %s %d\n", function, line); | ||
3245 | printk("FreeIAG %d\tNextIAG %d\n", ipimap->im_freeiag, | ||
3246 | ipimap->im_nextiag); | ||
3247 | printk("NumInos %d\tNumFree %d\n", | ||
3248 | atomic_read(&ipimap->im_numinos), | ||
3249 | atomic_read(&ipimap->im_numfree)); | ||
3250 | printk("AG InoFree %d\tAG ExtFree %d\n", | ||
3251 | ipimap->im_agctl[0].inofree, ipimap->im_agctl[0].extfree); | ||
3252 | printk("AG NumInos %d\tAG NumFree %d\n", | ||
3253 | ipimap->im_agctl[0].numinos, ipimap->im_agctl[0].numfree); | ||
3254 | } | ||
3255 | |||
3256 | static void dump_iag(struct iag * iag, char *function, int line) | ||
3257 | { | ||
3258 | printk("\n* ********* *\nIAG %s %d\n", function, line); | ||
3259 | printk("IagNum %d\tIAG Free %d\n", le32_to_cpu(iag->iagnum), | ||
3260 | le32_to_cpu(iag->iagfree)); | ||
3261 | printk("InoFreeFwd %d\tInoFreeBack %d\n", | ||
3262 | le32_to_cpu(iag->inofreefwd), | ||
3263 | le32_to_cpu(iag->inofreeback)); | ||
3264 | printk("ExtFreeFwd %d\tExtFreeBack %d\n", | ||
3265 | le32_to_cpu(iag->extfreefwd), | ||
3266 | le32_to_cpu(iag->extfreeback)); | ||
3267 | printk("NFreeInos %d\tNFreeExts %d\n", le32_to_cpu(iag->nfreeinos), | ||
3268 | le32_to_cpu(iag->nfreeexts)); | ||
3269 | } | ||
3270 | #endif /* _JFS_DEBUG_IMAP */ | ||
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h new file mode 100644 index 000000000000..6b59adec036a --- /dev/null +++ b/fs/jfs/jfs_imap.h | |||
@@ -0,0 +1,175 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_IMAP | ||
19 | #define _H_JFS_IMAP | ||
20 | |||
21 | #include "jfs_txnmgr.h" | ||
22 | |||
23 | /* | ||
24 | * jfs_imap.h: disk inode manager | ||
25 | */ | ||
26 | |||
/* geometry of the inode allocation map */
#define	EXTSPERIAG	128		/* disk inode extents per iag */
#define	IMAPBLKNO	0		/* lblkno of the dinomap within the inode map */
#define	SMAPSZ		4		/* words per summary map */
#define	EXTSPERSUM	32		/* extents per summary map entry */
#define	L2EXTSPERSUM	5		/* log2 of EXTSPERSUM */
#define	PGSPERIEXT	4		/* 4K pages per dinode extent */
#define	MAXIAGS		((1<<20)-1)	/* maximum number of iags */
#define	MAXAG		128		/* maximum number of allocation groups */

/* byte sizes of the maps inside an IAG */
#define	AMAPSIZE	512		/* allocation maps */
#define	SMAPSIZE	16		/* summary maps */
38 | |||
/* inode number -> number of the iag that covers it */
#define	INOTOIAG(ino)	((ino) >> L2INOSPERIAG)

/*
 * iag number -> logical block number of that iag's page;
 * the "+ 1" skips one page ahead of iag 0
 */
#define	IAGTOLBLK(iagno,l2nbperpg)	(((iagno) + 1) << (l2nbperpg))

/*
 * starting block number of the 4K page, within the inode extent
 * described by pxd, that contains inode ino
 */
#define	INOPBLK(pxd,ino,l2nbperpg)					\
	(addressPXD((pxd)) +						\
	 ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg)))
50 | |||
51 | /* | ||
52 | * inode allocation map: | ||
53 | * | ||
54 | * inode allocation map consists of | ||
55 | * . the inode map control page and | ||
56 | * . inode allocation group pages (per 4096 inodes) | ||
57 | * which are addressed by standard JFS xtree. | ||
58 | */ | ||
59 | /* | ||
60 | * inode allocation group page (per 4096 inodes of an AG) | ||
61 | */ | ||
62 | struct iag { | ||
63 | __le64 agstart; /* 8: starting block of ag */ | ||
64 | __le32 iagnum; /* 4: inode allocation group number */ | ||
65 | __le32 inofreefwd; /* 4: ag inode free list forward */ | ||
66 | __le32 inofreeback; /* 4: ag inode free list back */ | ||
67 | __le32 extfreefwd; /* 4: ag inode extent free list forward */ | ||
68 | __le32 extfreeback; /* 4: ag inode extent free list back */ | ||
69 | __le32 iagfree; /* 4: iag free list */ | ||
70 | |||
71 | /* summary map: 1 bit per inode extent */ | ||
72 | __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; | ||
73 | * note: this indicates free and backed | ||
74 | * inodes, if the extent is not backed the | ||
75 | * value will be 1. if the extent is | ||
76 | * backed but all inodes are being used the | ||
77 | * value will be 1. if the extent is | ||
78 | * backed but at least one of the inodes is | ||
79 | * free the value will be 0. | ||
80 | */ | ||
81 | __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ | ||
82 | __le32 nfreeinos; /* 4: number of free inodes */ | ||
83 | __le32 nfreeexts; /* 4: number of free extents */ | ||
84 | /* (72) */ | ||
85 | u8 pad[1976]; /* 1976: pad to 2048 bytes */ | ||
86 | /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ | ||
87 | __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ | ||
88 | __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ | ||
89 | pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ | ||
90 | }; /* (4096) */ | ||
91 | |||
92 | /* | ||
93 | * per AG control information (in inode map control page) | ||
94 | */ | ||
95 | struct iagctl_disk { | ||
96 | __le32 inofree; /* 4: free inode list anchor */ | ||
97 | __le32 extfree; /* 4: free extent list anchor */ | ||
98 | __le32 numinos; /* 4: number of backed inodes */ | ||
99 | __le32 numfree; /* 4: number of free inodes */ | ||
100 | }; /* (16) */ | ||
101 | |||
/* in-memory form of the per AG control information, in CPU byte order */
struct iagctl {
	int inofree;		/* anchor of the free inode list */
	int extfree;		/* anchor of the free extent list */
	int numinos;		/* count of backed inodes */
	int numfree;		/* count of free inodes */
};
108 | |||
109 | /* | ||
110 | * per fileset/aggregate inode map control page | ||
111 | */ | ||
112 | struct dinomap_disk { | ||
113 | __le32 in_freeiag; /* 4: free iag list anchor */ | ||
114 | __le32 in_nextiag; /* 4: next free iag number */ | ||
115 | __le32 in_numinos; /* 4: num of backed inodes */ | ||
116 | __le32 in_numfree; /* 4: num of free backed inodes */ | ||
117 | __le32 in_nbperiext; /* 4: num of blocks per inode extent */ | ||
118 | __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ | ||
119 | __le32 in_diskblock; /* 4: for standalone test driver */ | ||
120 | __le32 in_maxag; /* 4: for standalone test driver */ | ||
121 | u8 pad[2016]; /* 2016: pad to 2048 */ | ||
122 | struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ | ||
123 | }; /* (4096) */ | ||
124 | |||
125 | struct dinomap { | ||
126 | int in_freeiag; /* free iag list anchor */ | ||
127 | int in_nextiag; /* next free iag number */ | ||
128 | int in_numinos; /* num of backed inodes */ | ||
129 | int in_numfree; /* num of free backed inodes */ | ||
130 | int in_nbperiext; /* num of blocks per inode extent */ | ||
131 | int in_l2nbperiext; /* l2 of in_nbperiext */ | ||
132 | int in_diskblock; /* for standalone test driver */ | ||
133 | int in_maxag; /* for standalone test driver */ | ||
134 | struct iagctl in_agctl[MAXAG]; /* AG control information */ | ||
135 | }; | ||
136 | |||
137 | /* | ||
138 | * In-core inode map control page | ||
139 | */ | ||
140 | struct inomap { | ||
141 | struct dinomap im_imap; /* 4096: inode allocation control */ | ||
142 | struct inode *im_ipimap; /* 4: ptr to inode for imap */ | ||
143 | struct semaphore im_freelock; /* 4: iag free list lock */ | ||
144 | struct semaphore im_aglock[MAXAG]; /* 512: per AG locks */ | ||
145 | u32 *im_DBGdimap; | ||
146 | atomic_t im_numinos; /* num of backed inodes */ | ||
147 | atomic_t im_numfree; /* num of free backed inodes */ | ||
148 | }; | ||
149 | |||
/* shorthand for reaching the embedded struct dinomap through a struct inomap */
#define	im_freeiag	im_imap.in_freeiag
#define	im_nextiag	im_imap.in_nextiag
#define	im_agctl	im_imap.in_agctl
#define	im_nbperiext	im_imap.in_nbperiext
#define	im_l2nbperiext	im_imap.in_l2nbperiext

/*
 * for standalone testdriver
 */
#define	im_diskblock	im_imap.in_diskblock
#define	im_maxag	im_imap.in_maxag
160 | |||
161 | extern int diFree(struct inode *); | ||
162 | extern int diAlloc(struct inode *, boolean_t, struct inode *); | ||
163 | extern int diSync(struct inode *); | ||
164 | /* external references */ | ||
165 | extern int diUpdatePMap(struct inode *ipimap, unsigned long inum, | ||
166 | boolean_t is_free, struct tblock * tblk); | ||
167 | extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap); | ||
168 | extern int diMount(struct inode *); | ||
169 | extern int diUnmount(struct inode *, int); | ||
170 | extern int diRead(struct inode *); | ||
171 | extern struct inode *diReadSpecial(struct super_block *, ino_t, int); | ||
172 | extern void diWriteSpecial(struct inode *, int); | ||
173 | extern void diFreeSpecial(struct inode *); | ||
174 | extern int diWrite(tid_t tid, struct inode *); | ||
175 | #endif /* _H_JFS_IMAP */ | ||
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h new file mode 100644 index 000000000000..ebd77c1bed66 --- /dev/null +++ b/fs/jfs/jfs_incore.h | |||
@@ -0,0 +1,197 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #ifndef _H_JFS_INCORE | ||
20 | #define _H_JFS_INCORE | ||
21 | |||
22 | #include <linux/rwsem.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/bitops.h> | ||
25 | #include "jfs_types.h" | ||
26 | #include "jfs_xtree.h" | ||
27 | #include "jfs_dtree.h" | ||
28 | |||
/*
 * JFS magic number: the bytes 'J' 'F' 'S' '1' (0x4a 0x46 0x53 0x31),
 * least significant byte first
 */
#define	JFS_SUPER_MAGIC	0x3153464a	/* "JFS1" */
33 | |||
34 | /* | ||
35 | * JFS-private inode information | ||
36 | */ | ||
37 | struct jfs_inode_info { | ||
38 | int fileset; /* fileset number (always 16)*/ | ||
39 | uint mode2; /* jfs-specific mode */ | ||
40 | pxd_t ixpxd; /* inode extent descriptor */ | ||
41 | dxd_t acl; /* dxd describing acl */ | ||
42 | dxd_t ea; /* dxd describing ea */ | ||
43 | time_t otime; /* time created */ | ||
44 | uint next_index; /* next available directory entry index */ | ||
45 | int acltype; /* Type of ACL */ | ||
46 | short btorder; /* access order */ | ||
47 | short btindex; /* btpage entry index*/ | ||
48 | struct inode *ipimap; /* inode map */ | ||
49 | long cflag; /* commit flags */ | ||
50 | u16 bxflag; /* xflag of pseudo buffer? */ | ||
51 | unchar agno; /* ag number */ | ||
52 | signed char active_ag; /* ag currently allocating from */ | ||
53 | lid_t blid; /* lid of pseudo buffer? */ | ||
54 | lid_t atlhead; /* anonymous tlock list head */ | ||
55 | lid_t atltail; /* anonymous tlock list tail */ | ||
56 | spinlock_t ag_lock; /* protects active_ag */ | ||
57 | struct list_head anon_inode_list; /* inodes having anonymous txns */ | ||
58 | /* | ||
59 | * rdwrlock serializes xtree between reads & writes and synchronizes | ||
60 | * changes to special inodes. It's use would be redundant on | ||
61 | * directories since the i_sem taken in the VFS is sufficient. | ||
62 | */ | ||
63 | struct rw_semaphore rdwrlock; | ||
64 | /* | ||
65 | * commit_sem serializes transaction processing on an inode. | ||
66 | * It must be taken after beginning a transaction (txBegin), since | ||
67 | * dirty inodes may be committed while a new transaction on the | ||
68 | * inode is blocked in txBegin or TxBeginAnon | ||
69 | */ | ||
70 | struct semaphore commit_sem; | ||
71 | /* xattr_sem allows us to access the xattrs without taking i_sem */ | ||
72 | struct rw_semaphore xattr_sem; | ||
73 | lid_t xtlid; /* lid of xtree lock on directory */ | ||
74 | #ifdef CONFIG_JFS_POSIX_ACL | ||
75 | struct posix_acl *i_acl; | ||
76 | struct posix_acl *i_default_acl; | ||
77 | #endif | ||
78 | union { | ||
79 | struct { | ||
80 | xtpage_t _xtroot; /* 288: xtree root */ | ||
81 | struct inomap *_imap; /* 4: inode map header */ | ||
82 | } file; | ||
83 | struct { | ||
84 | struct dir_table_slot _table[12]; /* 96: dir index */ | ||
85 | dtroot_t _dtroot; /* 288: dtree root */ | ||
86 | } dir; | ||
87 | struct { | ||
88 | unchar _unused[16]; /* 16: */ | ||
89 | dxd_t _dxd; /* 16: */ | ||
90 | unchar _inline[128]; /* 128: inline symlink */ | ||
91 | /* _inline_ea may overlay the last part of | ||
92 | * file._xtroot if maxentry = XTROOTINITSLOT | ||
93 | */ | ||
94 | unchar _inline_ea[128]; /* 128: inline extended attr */ | ||
95 | } link; | ||
96 | } u; | ||
97 | u32 dev; /* will die when we get wide dev_t */ | ||
98 | struct inode vfs_inode; | ||
99 | }; | ||
/* shorthand for the members of the union u in struct jfs_inode_info */
#define	i_xtroot	u.file._xtroot
#define	i_imap		u.file._imap
#define	i_dirtable	u.dir._table
#define	i_dtroot	u.dir._dtroot
#define	i_inline	u.link._inline
#define	i_inline_ea	u.link._inline_ea
106 | |||
/*
 * all-ones pointer value
 * NOTE(review): presumably marks an ACL pointer whose ACL has not been
 * loaded yet -- confirm against the ACL code
 */
#define	JFS_ACL_NOT_CACHED	((void *)-1)

/* take and release the inode's rdwrlock for reading or for writing */
#define	IREAD_LOCK(ip)		down_read(&JFS_IP(ip)->rdwrlock)
#define	IREAD_UNLOCK(ip)	up_read(&JFS_IP(ip)->rdwrlock)
#define	IWRITE_LOCK(ip)		down_write(&JFS_IP(ip)->rdwrlock)
#define	IWRITE_UNLOCK(ip)	up_write(&JFS_IP(ip)->rdwrlock)
113 | |||
/*
 * cflag
 *
 * Bit numbers within jfs_inode_info.cflag, as used by the
 * set_cflag()/clear_cflag()/test_cflag() macros.
 */
enum cflags {
	COMMIT_Nolink = 0,	/* inode committed with zero link count */
	COMMIT_Inlineea = 1,	/* commit inode inline EA */
	COMMIT_Freewmap = 2,	/* free WMAP at iClose() */
	COMMIT_Dirty = 3,	/* inode is really dirty */
	COMMIT_Dirtable = 4,	/* commit changes to di_dirtable */
	COMMIT_Stale = 5,	/* data extent is no longer valid */
	COMMIT_Synclist = 6,	/* metadata pages on group commit synclist */
};
126 | |||
/* bit operations on the cflag word of the JFS inode behind ip */
#define	set_cflag(flag, ip)	set_bit(flag, &JFS_IP(ip)->cflag)
#define	clear_cflag(flag, ip)	clear_bit(flag, &JFS_IP(ip)->cflag)
#define	test_cflag(flag, ip)	test_bit(flag, &JFS_IP(ip)->cflag)
#define	test_and_clear_cflag(flag, ip)	\
	test_and_clear_bit(flag, &JFS_IP(ip)->cflag)
132 | /* | ||
133 | * JFS-private superblock information. | ||
134 | */ | ||
135 | struct jfs_sb_info { | ||
136 | struct super_block *sb; /* Point back to vfs super block */ | ||
137 | unsigned long mntflag; /* aggregate attributes */ | ||
138 | struct inode *ipbmap; /* block map inode */ | ||
139 | struct inode *ipaimap; /* aggregate inode map inode */ | ||
140 | struct inode *ipaimap2; /* secondary aimap inode */ | ||
141 | struct inode *ipimap; /* aggregate inode map inode */ | ||
142 | struct jfs_log *log; /* log */ | ||
143 | struct list_head log_list; /* volumes associated with a journal */ | ||
144 | short bsize; /* logical block size */ | ||
145 | short l2bsize; /* log2 logical block size */ | ||
146 | short nbperpage; /* blocks per page */ | ||
147 | short l2nbperpage; /* log2 blocks per page */ | ||
148 | short l2niperblk; /* log2 inodes per page */ | ||
149 | dev_t logdev; /* external log device */ | ||
150 | uint aggregate; /* volume identifier in log record */ | ||
151 | pxd_t logpxd; /* pxd describing log */ | ||
152 | pxd_t fsckpxd; /* pxd describing fsck wkspc */ | ||
153 | pxd_t ait2; /* pxd describing AIT copy */ | ||
154 | char uuid[16]; /* 128-bit uuid for volume */ | ||
155 | char loguuid[16]; /* 128-bit uuid for log */ | ||
156 | /* | ||
157 | * commit_state is used for synchronization of the jfs_commit | ||
158 | * threads. It is protected by LAZY_LOCK(). | ||
159 | */ | ||
160 | int commit_state; /* commit state */ | ||
161 | /* Formerly in ipimap */ | ||
162 | uint gengen; /* inode generation generator*/ | ||
163 | uint inostamp; /* shows inode belongs to fileset*/ | ||
164 | |||
165 | /* Formerly in ipbmap */ | ||
166 | struct bmap *bmap; /* incore bmap descriptor */ | ||
167 | struct nls_table *nls_tab; /* current codepage */ | ||
168 | uint state; /* mount/recovery state */ | ||
169 | unsigned long flag; /* mount time flags */ | ||
170 | uint p_state; /* state prior to going no integrity */ | ||
171 | }; | ||
172 | |||
/* value for the commit_state member of struct jfs_sb_info */
#define	IN_LAZYCOMMIT	1
175 | |||
176 | static inline struct jfs_inode_info *JFS_IP(struct inode *inode) | ||
177 | { | ||
178 | return list_entry(inode, struct jfs_inode_info, vfs_inode); | ||
179 | } | ||
180 | |||
181 | static inline int jfs_dirtable_inline(struct inode *inode) | ||
182 | { | ||
183 | return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)); | ||
184 | } | ||
185 | |||
186 | static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb) | ||
187 | { | ||
188 | return sb->s_fs_info; | ||
189 | } | ||
190 | |||
191 | static inline int isReadOnly(struct inode *inode) | ||
192 | { | ||
193 | if (JFS_SBI(inode->i_sb)->log) | ||
194 | return 0; | ||
195 | return 1; | ||
196 | } | ||
197 | #endif /* _H_JFS_INCORE */ | ||
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c new file mode 100644 index 000000000000..84f2459b2191 --- /dev/null +++ b/fs/jfs/jfs_inode.c | |||
@@ -0,0 +1,104 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/quotaops.h> | ||
21 | #include "jfs_incore.h" | ||
22 | #include "jfs_filsys.h" | ||
23 | #include "jfs_imap.h" | ||
24 | #include "jfs_dinode.h" | ||
25 | #include "jfs_debug.h" | ||
26 | |||
27 | /* | ||
28 | * NAME: ialloc() | ||
29 | * | ||
30 | * FUNCTION: Allocate a new inode | ||
31 | * | ||
32 | */ | ||
33 | struct inode *ialloc(struct inode *parent, umode_t mode) | ||
34 | { | ||
35 | struct super_block *sb = parent->i_sb; | ||
36 | struct inode *inode; | ||
37 | struct jfs_inode_info *jfs_inode; | ||
38 | int rc; | ||
39 | |||
40 | inode = new_inode(sb); | ||
41 | if (!inode) { | ||
42 | jfs_warn("ialloc: new_inode returned NULL!"); | ||
43 | return inode; | ||
44 | } | ||
45 | |||
46 | jfs_inode = JFS_IP(inode); | ||
47 | |||
48 | rc = diAlloc(parent, S_ISDIR(mode), inode); | ||
49 | if (rc) { | ||
50 | jfs_warn("ialloc: diAlloc returned %d!", rc); | ||
51 | make_bad_inode(inode); | ||
52 | iput(inode); | ||
53 | return NULL; | ||
54 | } | ||
55 | |||
56 | inode->i_uid = current->fsuid; | ||
57 | if (parent->i_mode & S_ISGID) { | ||
58 | inode->i_gid = parent->i_gid; | ||
59 | if (S_ISDIR(mode)) | ||
60 | mode |= S_ISGID; | ||
61 | } else | ||
62 | inode->i_gid = current->fsgid; | ||
63 | |||
64 | /* | ||
65 | * Allocate inode to quota. | ||
66 | */ | ||
67 | if (DQUOT_ALLOC_INODE(inode)) { | ||
68 | DQUOT_DROP(inode); | ||
69 | inode->i_flags |= S_NOQUOTA; | ||
70 | inode->i_nlink = 0; | ||
71 | iput(inode); | ||
72 | return NULL; | ||
73 | } | ||
74 | |||
75 | inode->i_mode = mode; | ||
76 | if (S_ISDIR(mode)) | ||
77 | jfs_inode->mode2 = IDIRECTORY | mode; | ||
78 | else | ||
79 | jfs_inode->mode2 = INLINEEA | ISPARSE | mode; | ||
80 | inode->i_blksize = sb->s_blocksize; | ||
81 | inode->i_blocks = 0; | ||
82 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | ||
83 | jfs_inode->otime = inode->i_ctime.tv_sec; | ||
84 | inode->i_generation = JFS_SBI(sb)->gengen++; | ||
85 | |||
86 | jfs_inode->cflag = 0; | ||
87 | |||
88 | /* Zero remaining fields */ | ||
89 | memset(&jfs_inode->acl, 0, sizeof(dxd_t)); | ||
90 | memset(&jfs_inode->ea, 0, sizeof(dxd_t)); | ||
91 | jfs_inode->next_index = 0; | ||
92 | jfs_inode->acltype = 0; | ||
93 | jfs_inode->btorder = 0; | ||
94 | jfs_inode->btindex = 0; | ||
95 | jfs_inode->bxflag = 0; | ||
96 | jfs_inode->blid = 0; | ||
97 | jfs_inode->atlhead = 0; | ||
98 | jfs_inode->atltail = 0; | ||
99 | jfs_inode->xtlid = 0; | ||
100 | |||
101 | jfs_info("ialloc returns inode = 0x%p\n", inode); | ||
102 | |||
103 | return inode; | ||
104 | } | ||
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h new file mode 100644 index 000000000000..3df91fbfe781 --- /dev/null +++ b/fs/jfs/jfs_inode.h | |||
@@ -0,0 +1,23 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2001 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_INODE | ||
19 | #define _H_JFS_INODE | ||
20 | |||
21 | extern struct inode *ialloc(struct inode *, umode_t); | ||
22 | |||
23 | #endif /* _H_JFS_INODE */ | ||
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h new file mode 100644 index 000000000000..10ad1d086685 --- /dev/null +++ b/fs/jfs/jfs_lock.h | |||
@@ -0,0 +1,51 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2001 | ||
3 | * Portions Copyright (c) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #ifndef _H_JFS_LOCK | ||
20 | #define _H_JFS_LOCK | ||
21 | |||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/sched.h> | ||
24 | |||
25 | /* | ||
26 | * jfs_lock.h | ||
27 | */ | ||
28 | |||
/*
 *	Conditional sleep where condition is protected by spinlock
 *
 * wq		- wait queue head to sleep on (an lvalue; the macro takes
 *		  its address)
 * cond		- expression tested with the lock held; the sleep ends
 *		  once it is true
 * lock_cmd and unlock_cmd take and release the spinlock
 *
 * The caller holds the lock on entry and holds it again on exit.  The
 * task state is set before cond is tested, so a wakeup that arrives
 * between the test and schedule() is not lost.
 */
#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd)	\
do {							\
	DECLARE_WAITQUEUE(__wait, current);		\
							\
	add_wait_queue(&(wq), &__wait);			\
	for (;;) {					\
		set_current_state(TASK_UNINTERRUPTIBLE);\
		if (cond)				\
			break;				\
		unlock_cmd;				\
		schedule();				\
		lock_cmd;				\
	}						\
	__set_current_state(TASK_RUNNING);		\
	remove_wait_queue(&(wq), &__wait);		\
} while (0)
50 | |||
51 | #endif /* _H_JFS_LOCK */ | ||
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c new file mode 100644 index 000000000000..b6a6869ebb4f --- /dev/null +++ b/fs/jfs/jfs_logmgr.c | |||
@@ -0,0 +1,2524 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * jfs_logmgr.c: log manager | ||
22 | * | ||
23 | * for related information, see transaction manager (jfs_txnmgr.c), and | ||
24 | * recovery manager (jfs_logredo.c). | ||
25 | * | ||
26 | * note: for detail, RTFS. | ||
27 | * | ||
28 | * log buffer manager: | ||
29 | * special purpose buffer manager supporting log i/o requirements. | ||
30 | * per log serial pageout of logpage | ||
31 | * queuing i/o requests and redrive i/o at iodone | ||
32 | * maintain current logpage buffer | ||
33 | * no caching since append only | ||
34 | * appropriate jfs buffer cache buffers as needed | ||
35 | * | ||
36 | * group commit: | ||
37 | * transactions which wrote COMMIT records in the same in-memory | ||
38 | * log page during the pageout of previous/current log page(s) are | ||
39 | * committed together by the pageout of the page. | ||
40 | * | ||
41 | * TBD lazy commit: | ||
42 | * transactions are committed asynchronously when the log page | ||
43 | * containing its COMMIT record is paged out once it becomes full; | ||
44 | * | ||
45 | * serialization: | ||
46 | * . a per-log lock serializes log writes. | ||
47 | * . a per-log lock serializes group commit. | ||
48 | * . a per-log lock serializes log open/close; | ||
49 | * | ||
50 | * TBD log integrity: | ||
51 | * careful-write (ping-pong) of last logpage to recover from crash | ||
52 | * in overwrite. | ||
53 | * detection of split (out-of-order) write of physical sectors | ||
54 | * of last logpage via timestamp at end of each sector | ||
55 | * (with its mirror data array at trailer). | ||
56 | * | ||
57 | * alternatives: | ||
58 | * lsn - 64-bit monotonically increasing integer vs | ||
59 | * 32-bit lspn and page eor. | ||
60 | */ | ||
61 | |||
62 | #include <linux/fs.h> | ||
63 | #include <linux/blkdev.h> | ||
64 | #include <linux/interrupt.h> | ||
65 | #include <linux/smp_lock.h> | ||
66 | #include <linux/completion.h> | ||
67 | #include <linux/buffer_head.h> /* for sync_blockdev() */ | ||
68 | #include <linux/bio.h> | ||
69 | #include <linux/suspend.h> | ||
70 | #include <linux/delay.h> | ||
71 | #include "jfs_incore.h" | ||
72 | #include "jfs_filsys.h" | ||
73 | #include "jfs_metapage.h" | ||
74 | #include "jfs_txnmgr.h" | ||
75 | #include "jfs_debug.h" | ||
76 | |||
77 | |||
78 | /* | ||
79 | * List of lbufs ready to be redriven; protected by log_redrive_lock | ||
   | * (jfsIO thread). | ||
   | * | ||
   | * jfs_IO_thread_wait is declared without "static", so it has external | ||
   | * linkage; presumably the jfsIO thread sleeps on it - confirm against | ||
   | * the thread's main loop. | ||
80 | */ | ||
81 | static struct lbuf *log_redrive_list; | ||
82 | static DEFINE_SPINLOCK(log_redrive_lock); | ||
83 | DECLARE_WAIT_QUEUE_HEAD(jfs_IO_thread_wait); | ||
84 | |||
85 | |||
86 | /* | ||
87 | * log read/write serialization (per log) | ||
   | * | ||
   | * (log)->loglock is set up with init_MUTEX() and is acquired and | ||
   | * released with down() and up(). | ||
88 | */ | ||
89 | #define LOG_LOCK_INIT(log) init_MUTEX(&(log)->loglock) | ||
90 | #define LOG_LOCK(log) down(&((log)->loglock)) | ||
91 | #define LOG_UNLOCK(log) up(&((log)->loglock)) | ||
92 | |||
93 | |||
94 | /* | ||
95 | * log group commit serialization (per log) | ||
   | * | ||
   | * (log)->gclock is a spinlock taken with spin_lock_irq() and released | ||
   | * with spin_unlock_irq(). LOGGC_WAKEUP() wakes every waiter on the | ||
   | * transaction block's gcwait queue (wake_up_all). | ||
96 | */ | ||
97 | |||
98 | #define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) | ||
99 | #define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) | ||
100 | #define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) | ||
101 | #define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait) | ||
102 | |||
103 | /* | ||
104 | * log sync serialization (per log) | ||
    | * | ||
    | * LOGSYNC_DELTA: the smaller of one eighth of logsize and | ||
    | * 128 * LOGPSIZE. | ||
    | * LOGSYNC_BARRIER: one quarter of logsize. | ||
105 | */ | ||
106 | #define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) | ||
107 | #define LOGSYNC_BARRIER(logsize) ((logsize)/4) | ||
108 | /* | ||
    | * Disabled alternative values (twice the thresholds above): | ||
    | * | ||
109 | #define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) | ||
110 | #define LOGSYNC_BARRIER(logsize) ((logsize)/2) | ||
111 | */ | ||
112 | |||
113 | |||
114 | /* | ||
115 | * log buffer cache synchronization | ||
    | * | ||
    | * jfsLCacheLock is a single file-wide spinlock; LCACHE_LOCK() and | ||
    | * LCACHE_UNLOCK() use the irqsave/irqrestore variants, so the caller | ||
    | * supplies the flags variable. | ||
116 | */ | ||
117 | static DEFINE_SPINLOCK(jfsLCacheLock); | ||
118 | |||
119 | #define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) | ||
120 | #define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) | ||
121 | |||
122 | /* | ||
123 | * See __SLEEP_COND in jfs_lock.h | ||
    | * | ||
    | * cond is tested once up front so that the wait-queue setup in | ||
    | * __SLEEP_COND is skipped entirely when it already holds. | ||
124 | */ | ||
125 | #define LCACHE_SLEEP_COND(wq, cond, flags) \ | ||
126 | do { \ | ||
127 | if (cond) \ | ||
128 | break; \ | ||
129 | __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ | ||
130 | } while (0) | ||
131 | |||
132 | #define LCACHE_WAKEUP(event) wake_up(event) | ||
133 | |||
134 | |||
135 | /* | ||
136 | * lbuf buffer cache (lCache) control | ||
    | * | ||
    | * Bit flags; combinations of them are passed as the flag argument of | ||
    | * lbmWrite(), and lbmERROR is tested in bp->l_flag by lmPostGC(). | ||
137 | */ | ||
138 | /* log buffer manager pageout control (cumulative, inclusive) */ | ||
139 | #define lbmREAD 0x0001 | ||
140 | #define lbmWRITE 0x0002 /* enqueue at tail of write queue; | ||
141 | * init pageout if at head of queue; | ||
142 | */ | ||
143 | #define lbmRELEASE 0x0004 /* remove from write queue | ||
144 | * at completion of pageout; | ||
145 | * do not free/recycle it yet: | ||
146 | * caller will free it; | ||
147 | */ | ||
148 | #define lbmSYNC 0x0008 /* do not return to freelist | ||
149 | * when removed from write queue; | ||
150 | */ | ||
151 | #define lbmFREE 0x0010 /* return to freelist | ||
152 | * at completion of pageout; | ||
153 | * the buffer may be recycled; | ||
154 | */ | ||
155 | #define lbmDONE 0x0020 | ||
156 | #define lbmERROR 0x0040 | ||
157 | #define lbmGC 0x0080 /* lbmIODone to perform post-GC processing | ||
158 | * of log page | ||
159 | */ | ||
160 | #define lbmDIRECT 0x0100 | ||
161 | |||
162 | /* | ||
163 | * Global list of active external journals | ||
    | * | ||
    | * dummy_log starts out NULL. | ||
    | * NOTE(review): jfs_log_sem presumably guards this list and dummy_log | ||
    | * during log open/close - confirm against the open/close routines. | ||
164 | */ | ||
165 | static LIST_HEAD(jfs_external_logs); | ||
166 | static struct jfs_log *dummy_log = NULL; | ||
167 | static DECLARE_MUTEX(jfs_log_sem); | ||
168 | |||
169 | /* | ||
170 | * external references | ||
    | * | ||
    | * Symbols defined outside this file. Within the code visible here, | ||
    | * txLazyUnlock() is called from lmPostGC() and jfs_tlocks_low is | ||
    | * tested in lmGroupCommit() and lmPostGC(). | ||
171 | */ | ||
172 | extern void txLazyUnlock(struct tblock * tblk); | ||
173 | extern int jfs_stop_threads; | ||
174 | extern struct completion jfsIOwait; | ||
175 | extern int jfs_tlocks_low; | ||
176 | |||
177 | /* | ||
178 | * forward references | ||
    | * | ||
    | * Prototypes of file-local (static) helpers, so that they can be | ||
    | * called before their definitions. | ||
179 | */ | ||
180 | static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, | ||
181 | struct lrd * lrd, struct tlock * tlck); | ||
182 | |||
183 | static int lmNextPage(struct jfs_log * log); | ||
184 | static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, | ||
185 | int activate); | ||
186 | |||
187 | static int open_inline_log(struct super_block *sb); | ||
188 | static int open_dummy_log(struct super_block *sb); | ||
189 | static int lbmLogInit(struct jfs_log * log); | ||
190 | static void lbmLogShutdown(struct jfs_log * log); | ||
191 | static struct lbuf *lbmAllocate(struct jfs_log * log, int); | ||
192 | static void lbmFree(struct lbuf * bp); | ||
193 | static void lbmfree(struct lbuf * bp); | ||
194 | static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp); | ||
195 | static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block); | ||
196 | static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag); | ||
197 | static int lbmIOWait(struct lbuf * bp, int flag); | ||
198 | static bio_end_io_t lbmIODone; | ||
199 | static void lbmStartIO(struct lbuf * bp); | ||
200 | static void lmGCwrite(struct jfs_log * log, int cant_block); | ||
201 | static int lmLogSync(struct jfs_log * log, int nosyncwait); | ||
202 | |||
203 | |||
204 | |||
205 | /* | ||
206 | * statistics | ||
    | * | ||
    | * Counters exist only when CONFIG_JFS_STATISTICS is defined; they are | ||
    | * bumped through INCREMENT() (e.g. lmStat.commit in lmWriteRecord(), | ||
    | * lmStat.full_page / lmStat.partial_page in lmGCwrite()). | ||
207 | */ | ||
208 | #ifdef CONFIG_JFS_STATISTICS | ||
209 | static struct lmStat { | ||
210 | uint commit; /* # of commit */ | ||
211 | uint pagedone; /* # of page written */ | ||
212 | uint submitted; /* # of pages submitted */ | ||
213 | uint full_page; /* # of full pages submitted */ | ||
214 | uint partial_page; /* # of partial pages submitted */ | ||
215 | } lmStat; | ||
216 | #endif | ||
217 | |||
218 | |||
219 | /* | ||
220 | * NAME: lmLog() | ||
221 | * | ||
222 | * FUNCTION: write a log record; | ||
    | * takes LOG_LOCK for the whole call, updates the recovery lsn of the | ||
    | * logged page and of the transaction (under LOGSYNC_LOCK), appends | ||
    | * the record via lmWriteRecord() and, once the syncpt trigger is | ||
    | * reached, calls lmLogSync(). | ||
223 | * | ||
224 | * PARAMETER: | ||
    | * log - log to append the record to | ||
    | * tblk - transaction block, or NULL for a record written outside | ||
    | * of a transaction | ||
    | * lrd - log record descriptor, handed on to lmWriteRecord() | ||
    | * tlck - transaction lock describing the logged data; may be NULL | ||
225 | * | ||
226 | * RETURN: lsn - offset to the next log record to write (end-of-log); | ||
    | * this is the value returned by lmWriteRecord(), or the value | ||
    | * returned by lmLogSync() when log->nextsync has been reached; | ||
227 | * -1 - error; | ||
    | * (NOTE(review): no statement in this function produces -1 itself) | ||
228 | * | ||
229 | * note: todo: log error handler | ||
230 | */ | ||
231 | int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
232 | struct tlock * tlck) | ||
233 | { | ||
234 | int lsn; | ||
235 | int diffp, difft; | ||
236 | struct metapage *mp = NULL; | ||
237 | |||
238 | jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p", | ||
239 | log, tblk, lrd, tlck); | ||
240 | |||
241 | LOG_LOCK(log); | ||
242 | |||
243 | /* logged by JFS outside of any transaction ? then skip the lsn bookkeeping */ | ||
244 | if (tblk == NULL) | ||
245 | goto writeRecord; | ||
246 | |||
247 | /* log from page ? (no tlock, a tlckBTROOT tlock, or no metapage: skip too) */ | ||
248 | if (tlck == NULL || | ||
249 | tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) | ||
250 | goto writeRecord; | ||
251 | |||
252 | /* | ||
253 | * initialize/update page/transaction recovery lsn | ||
    | * (log->lsn is the end-of-log lsn stored by the previous lmLog() call) | ||
254 | */ | ||
255 | lsn = log->lsn; | ||
256 | |||
257 | LOGSYNC_LOCK(log); | ||
258 | |||
259 | /* | ||
260 | * initialize page lsn if first log write of the page | ||
261 | */ | ||
262 | if (mp->lsn == 0) { | ||
263 | mp->log = log; | ||
264 | mp->lsn = lsn; | ||
265 | log->count++; | ||
266 | |||
267 | /* insert page at tail of logsynclist */ | ||
268 | list_add_tail(&mp->synclist, &log->synclist); | ||
269 | } | ||
270 | |||
271 | /* | ||
272 | * initialize/update lsn of tblock of the page | ||
273 | * | ||
274 | * transaction inherits oldest lsn of pages associated | ||
275 | * with allocation/deallocation of resources (their | ||
276 | * log records are used to reconstruct allocation map | ||
277 | * at recovery time: inode for inode allocation map, | ||
278 | * B+-tree index of extent descriptors for block | ||
279 | * allocation map); | ||
280 | * allocation map pages inherit transaction lsn at | ||
281 | * commit time to allow forwarding log syncpt past log | ||
282 | * records associated with allocation/deallocation of | ||
283 | * resources only after persistent map of these map pages | ||
284 | * have been updated and propagated to home. | ||
285 | */ | ||
286 | /* | ||
287 | * initialize transaction lsn: | ||
288 | */ | ||
289 | if (tblk->lsn == 0) { | ||
290 | /* inherit lsn of its first page logged */ | ||
291 | tblk->lsn = mp->lsn; | ||
292 | log->count++; | ||
293 | |||
294 | /* insert tblock after the page on logsynclist */ | ||
295 | list_add(&tblk->synclist, &mp->synclist); | ||
296 | } | ||
297 | /* | ||
298 | * update transaction lsn: | ||
299 | */ | ||
300 | else { | ||
301 | /* inherit oldest/smallest lsn of page */ | ||
302 | logdiff(diffp, mp->lsn, log); | ||
303 | logdiff(difft, tblk->lsn, log); | ||
304 | if (diffp < difft) { | ||
305 | /* update tblock lsn with page lsn */ | ||
306 | tblk->lsn = mp->lsn; | ||
307 | |||
308 | /* move tblock after page on logsynclist */ | ||
309 | list_move(&tblk->synclist, &mp->synclist); | ||
310 | } | ||
311 | } | ||
312 | |||
313 | LOGSYNC_UNLOCK(log); | ||
314 | |||
315 | /* | ||
316 | * write the log record | ||
317 | */ | ||
318 | writeRecord: | ||
319 | lsn = lmWriteRecord(log, tblk, lrd, tlck); | ||
320 | |||
321 | /* | ||
322 | * forward log syncpt if log reached next syncpt trigger | ||
    | * (lsn is then replaced by the value lmLogSync() returns) | ||
323 | */ | ||
324 | logdiff(diffp, lsn, log); | ||
325 | if (diffp >= log->nextsync) | ||
326 | lsn = lmLogSync(log, 0); | ||
327 | |||
328 | /* update end-of-log lsn */ | ||
329 | log->lsn = lsn; | ||
330 | |||
331 | LOG_UNLOCK(log); | ||
332 | |||
333 | /* return end-of-log address */ | ||
334 | return lsn; | ||
335 | } | ||
336 | |||
337 | |||
338 | /* | ||
339 | * NAME: lmWriteRecord() | ||
340 | * | ||
341 | * FUNCTION: move the log record to current log page | ||
    | * copies the log vector data (if any), a descriptor after each | ||
    | * vector, and finally the log record descriptor into log->bp, | ||
    | * switching to a new page with lmNextPage() whenever the current | ||
    | * page fills up. A record carrying LOG_COMMIT in lrd->type also | ||
    | * queues tblk at the tail of log->cqueue. | ||
342 | * | ||
343 | * PARAMETER: log - log whose current page (log->bp, log->eor) is written | ||
    | * tblk - transaction block; dereferenced only for LOG_COMMIT records | ||
    | * lrd - log record descriptor; lrd->length is filled in here | ||
    | * tlck - tlock describing the data to log, or NULL if the record | ||
    | * carries no data | ||
344 | * | ||
345 | * RETURN: end-of-log address, (log->page << L2LOGPSIZE) + offset in page; | ||
    | * 0 if tlck is of a kind this function does not recognize | ||
346 | * | ||
347 | * serialization: LOG_LOCK() held on entry/exit | ||
348 | */ | ||
349 | static int | ||
350 | lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
351 | struct tlock * tlck) | ||
352 | { | ||
353 | int lsn = 0; /* end-of-log address */ | ||
354 | struct lbuf *bp; /* dst log page buffer */ | ||
355 | struct logpage *lp; /* dst log page */ | ||
356 | caddr_t dst; /* destination address in log page */ | ||
357 | int dstoffset; /* end-of-log offset in log page */ | ||
358 | int freespace; /* free space in log page */ | ||
359 | caddr_t p; /* src meta-data page */ | ||
360 | caddr_t src; | ||
361 | int srclen; | ||
362 | int nbytes; /* number of bytes to move */ | ||
363 | int i; | ||
364 | int len; | ||
365 | struct linelock *linelock; | ||
366 | struct lv *lv; | ||
367 | struct lvd *lvd; | ||
368 | int l2linesize; | ||
369 | |||
370 | len = 0; | ||
371 | |||
372 | /* retrieve destination log page to write */ | ||
373 | bp = (struct lbuf *) log->bp; | ||
374 | lp = (struct logpage *) bp->l_ldata; | ||
375 | dstoffset = log->eor; | ||
376 | |||
377 | /* any log data to write ? (none: only the record descriptor is moved) */ | ||
378 | if (tlck == NULL) | ||
379 | goto moveLrd; | ||
380 | |||
381 | /* | ||
382 | * move log record data | ||
383 | */ | ||
384 | /* retrieve source meta-data page to log */ | ||
385 | if (tlck->flag & tlckPAGELOCK) { | ||
386 | p = (caddr_t) (tlck->mp->data); | ||
387 | linelock = (struct linelock *) & tlck->lock; | ||
388 | } | ||
389 | /* retrieve source in-memory inode to log */ | ||
390 | else if (tlck->flag & tlckINODELOCK) { | ||
391 | if (tlck->type & tlckDTREE) | ||
392 | p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; | ||
393 | else | ||
394 | p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; | ||
395 | linelock = (struct linelock *) & tlck->lock; | ||
396 | } | ||
397 | #ifdef _JFS_WIP | ||
398 | else if (tlck->flag & tlckINLINELOCK) { | ||
399 | |||
400 | inlinelock = (struct inlinelock *) & tlck; | ||
401 | p = (caddr_t) & inlinelock->pxd; | ||
402 | linelock = (struct linelock *) & tlck; | ||
403 | } | ||
404 | #endif /* _JFS_WIP */ | ||
405 | else { | ||
406 | jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); | ||
407 | return 0; /* Probably should trap */ | ||
408 | } | ||
409 | l2linesize = linelock->l2linesize; | ||
410 | |||
411 | moveData: | ||
412 | ASSERT(linelock->index <= linelock->maxcnt); | ||
413 | |||
414 | lv = linelock->lv; | ||
415 | for (i = 0; i < linelock->index; i++, lv++) { | ||
416 | if (lv->length == 0) | ||
417 | continue; | ||
418 | |||
419 | /* is page full ? */ | ||
420 | if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { | ||
421 | /* page become full: move on to next page */ | ||
422 | lmNextPage(log); | ||
423 | |||
424 | bp = log->bp; | ||
425 | lp = (struct logpage *) bp->l_ldata; | ||
426 | dstoffset = LOGPHDRSIZE; | ||
427 | } | ||
428 | |||
429 | /* | ||
430 | * move log vector data | ||
    | * (lv->offset and lv->length are in units of 1 << l2linesize bytes) | ||
431 | */ | ||
432 | src = (u8 *) p + (lv->offset << l2linesize); | ||
433 | srclen = lv->length << l2linesize; | ||
434 | len += srclen; | ||
435 | while (srclen > 0) { | ||
436 | freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; | ||
437 | nbytes = min(freespace, srclen); | ||
438 | dst = (caddr_t) lp + dstoffset; | ||
439 | memcpy(dst, src, nbytes); | ||
440 | dstoffset += nbytes; | ||
441 | |||
442 | /* is page not full ? then nbytes == srclen: the whole vector has been copied */ | ||
443 | if (dstoffset < LOGPSIZE - LOGPTLRSIZE) | ||
444 | break; | ||
445 | |||
446 | /* page become full: move on to next page */ | ||
447 | lmNextPage(log); | ||
448 | |||
449 | bp = (struct lbuf *) log->bp; | ||
450 | lp = (struct logpage *) bp->l_ldata; | ||
451 | dstoffset = LOGPHDRSIZE; | ||
452 | |||
453 | srclen -= nbytes; | ||
454 | src += nbytes; | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * move log vector descriptor | ||
    | * (accounted as 4 bytes: little-endian 16-bit offset and length) | ||
459 | */ | ||
460 | len += 4; | ||
461 | lvd = (struct lvd *) ((caddr_t) lp + dstoffset); | ||
462 | lvd->offset = cpu_to_le16(lv->offset); | ||
463 | lvd->length = cpu_to_le16(lv->length); | ||
464 | dstoffset += 4; | ||
465 | jfs_info("lmWriteRecord: lv offset:%d length:%d", | ||
466 | lv->offset, lv->length); | ||
467 | } | ||
468 | |||
469 | if ((i = linelock->next)) { | ||
470 | linelock = (struct linelock *) lid_to_tlock(i); | ||
471 | goto moveData; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * move log record descriptor | ||
    | * (LOGRDSIZE bytes; may be split across two log pages) | ||
476 | */ | ||
477 | moveLrd: | ||
478 | lrd->length = cpu_to_le16(len); | ||
479 | |||
480 | src = (caddr_t) lrd; | ||
481 | srclen = LOGRDSIZE; | ||
482 | |||
483 | while (srclen > 0) { | ||
484 | freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; | ||
485 | nbytes = min(freespace, srclen); | ||
486 | dst = (caddr_t) lp + dstoffset; | ||
487 | memcpy(dst, src, nbytes); | ||
488 | |||
489 | dstoffset += nbytes; | ||
490 | srclen -= nbytes; | ||
491 | |||
492 | /* are there more to move than freespace of page ? */ | ||
493 | if (srclen) | ||
494 | goto pageFull; | ||
495 | |||
496 | /* | ||
497 | * end of log record descriptor | ||
498 | */ | ||
499 | |||
500 | /* update last log record eor */ | ||
501 | log->eor = dstoffset; | ||
502 | bp->l_eor = dstoffset; | ||
503 | lsn = (log->page << L2LOGPSIZE) + dstoffset; | ||
504 | |||
505 | if (lrd->type & cpu_to_le16(LOG_COMMIT)) { | ||
506 | tblk->clsn = lsn; | ||
507 | jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn, | ||
508 | bp->l_eor); | ||
509 | |||
510 | INCREMENT(lmStat.commit); /* # of commit */ | ||
511 | |||
512 | /* | ||
513 | * enqueue tblock for group commit: | ||
514 | * | ||
515 | * enqueue tblock of non-trivial/synchronous COMMIT | ||
516 | * at tail of group commit queue | ||
517 | * (trivial/asynchronous COMMITs are ignored by | ||
518 | * group commit.) | ||
519 | */ | ||
520 | LOGGC_LOCK(log); | ||
521 | |||
522 | /* init tblock gc state */ | ||
523 | tblk->flag = tblkGC_QUEUE; | ||
524 | tblk->bp = log->bp; | ||
525 | tblk->pn = log->page; | ||
526 | tblk->eor = log->eor; | ||
527 | |||
528 | /* enqueue transaction to commit queue */ | ||
529 | list_add_tail(&tblk->cqueue, &log->cqueue); | ||
530 | |||
531 | LOGGC_UNLOCK(log); | ||
532 | } | ||
533 | |||
534 | jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x", | ||
535 | le16_to_cpu(lrd->type), log->bp, log->page, dstoffset); | ||
536 | |||
537 | /* page not full ? */ | ||
538 | if (dstoffset < LOGPSIZE - LOGPTLRSIZE) | ||
539 | return lsn; | ||
540 | |||
541 | pageFull: | ||
542 | /* page become full: move on to next page */ | ||
543 | lmNextPage(log); | ||
544 | |||
545 | bp = (struct lbuf *) log->bp; | ||
546 | lp = (struct logpage *) bp->l_ldata; | ||
547 | dstoffset = LOGPHDRSIZE; | ||
548 | src += nbytes; | ||
549 | } | ||
550 | |||
551 | return lsn; | ||
552 | } | ||
553 | |||
554 | |||
555 | /* | ||
556 | * NAME: lmNextPage() | ||
557 | * | ||
558 | * FUNCTION: write current page and allocate next page. | ||
    | * the full page is either handed to group commit (when the tail | ||
    | * tblk of the commit queue has its COMMIT record on it) or written | ||
    | * out directly; log->page, log->eor and log->bp are then advanced | ||
    | * to a freshly allocated and initialized page. | ||
559 | * | ||
560 | * PARAMETER: log | ||
561 | * | ||
562 | * RETURN: 0 | ||
563 | * | ||
564 | * serialization: LOG_LOCK() held on entry/exit | ||
    | * (LOGGC_LOCK is taken and released inside) | ||
565 | */ | ||
566 | static int lmNextPage(struct jfs_log * log) | ||
567 | { | ||
568 | struct logpage *lp; | ||
569 | int lspn; /* log sequence page number */ | ||
570 | int pn; /* current page number */ | ||
571 | struct lbuf *bp; | ||
572 | struct lbuf *nextbp; | ||
573 | struct tblock *tblk; | ||
574 | |||
575 | /* get current log page number and log sequence page number */ | ||
576 | pn = log->page; | ||
577 | bp = log->bp; | ||
578 | lp = (struct logpage *) bp->l_ldata; | ||
579 | lspn = le32_to_cpu(lp->h.page); | ||
580 | |||
581 | LOGGC_LOCK(log); | ||
582 | |||
583 | /* | ||
584 | * write or queue the full page at the tail of write queue | ||
585 | */ | ||
586 | /* get the tail tblk on commit queue */ | ||
587 | if (list_empty(&log->cqueue)) | ||
588 | tblk = NULL; | ||
589 | else | ||
590 | tblk = list_entry(log->cqueue.prev, struct tblock, cqueue); | ||
591 | |||
592 | /* every tblk who has COMMIT record on the current page, | ||
593 | * and has not been committed, must be on commit queue | ||
594 | * since tblk is queued at commit queue at the time | ||
595 | * of writing its COMMIT record on the page before | ||
596 | * page becomes full (even though the tblk thread | ||
597 | * who wrote COMMIT record may have been suspended | ||
598 | * currently); | ||
599 | */ | ||
600 | |||
601 | /* is page bound with outstanding tail tblk ? */ | ||
602 | if (tblk && tblk->pn == pn) { | ||
603 | /* mark tblk for end-of-page */ | ||
604 | tblk->flag |= tblkGC_EOP; | ||
605 | |||
606 | if (log->cflag & logGC_PAGEOUT) { | ||
607 | /* if page is not already on write queue, | ||
608 | * just enqueue (no lbmWRITE to prevent redrive) | ||
609 | * buffer to wqueue to ensure correct serial order | ||
610 | * of the pages since log pages will be added | ||
611 | * continuously | ||
612 | */ | ||
613 | if (bp->l_wqnext == NULL) | ||
614 | lbmWrite(log, bp, 0, 0); | ||
615 | } else { | ||
616 | /* | ||
617 | * No current GC leader, initiate group commit | ||
618 | */ | ||
619 | log->cflag |= logGC_PAGEOUT; | ||
620 | lmGCwrite(log, 0); | ||
621 | } | ||
622 | } | ||
623 | /* page is not bound with outstanding tblk: | ||
624 | * init write or mark it to be redriven (lbmWRITE) | ||
625 | */ | ||
626 | else { | ||
627 | /* finalize the page: committed eor = eor, stored in header and trailer */ | ||
628 | bp->l_ceor = bp->l_eor; | ||
629 | lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); | ||
630 | lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); | ||
631 | } | ||
632 | LOGGC_UNLOCK(log); | ||
633 | |||
634 | /* | ||
635 | * allocate/initialize next page | ||
636 | */ | ||
637 | /* if log wraps, the first data page of log is 2 | ||
638 | * (0 never used, 1 is superblock). | ||
639 | */ | ||
640 | log->page = (pn == log->size - 1) ? 2 : pn + 1; | ||
641 | log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ | ||
642 | |||
643 | /* allocate/initialize next log page buffer */ | ||
644 | nextbp = lbmAllocate(log, log->page); | ||
645 | nextbp->l_eor = log->eor; | ||
646 | log->bp = nextbp; | ||
647 | |||
648 | /* initialize next log page: sequence number lspn + 1, empty (eor = LOGPHDRSIZE) */ | ||
649 | lp = (struct logpage *) nextbp->l_ldata; | ||
650 | lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); | ||
651 | lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); | ||
652 | |||
653 | return 0; | ||
654 | } | ||
655 | |||
656 | |||
657 | /* | ||
658 | * NAME: lmGroupCommit() | ||
659 | * | ||
660 | * FUNCTION: group commit | ||
661 | * initiate pageout of the pages with COMMIT in the order of | ||
662 | * page number - redrive pageout of the page at the head of | ||
663 | * pageout queue until full page has been written. | ||
664 | * | ||
    | * PARAMETER: log - log the transaction was written to | ||
    | * tblk - transaction block to commit | ||
    | * | ||
665 | * RETURN: 0 - tblk is committed, or tblk is a COMMIT_LAZY transaction | ||
    | * that is not committed yet (lazy transactions do not wait); | ||
    | * -EIO - tblk is committed and has tblkGC_ERROR set | ||
666 | * | ||
667 | * NOTE: | ||
668 | * LOGGC_LOCK serializes log group commit queue, and | ||
669 | * transaction blocks on the commit queue. | ||
670 | * N.B. LOG_LOCK is NOT held during lmGroupCommit(). | ||
671 | */ | ||
672 | int lmGroupCommit(struct jfs_log * log, struct tblock * tblk) | ||
673 | { | ||
674 | int rc = 0; | ||
675 | |||
676 | LOGGC_LOCK(log); | ||
677 | |||
678 | /* group committed already ? */ | ||
679 | if (tblk->flag & tblkGC_COMMITTED) { | ||
680 | if (tblk->flag & tblkGC_ERROR) | ||
681 | rc = -EIO; | ||
682 | |||
683 | LOGGC_UNLOCK(log); | ||
684 | return rc; | ||
685 | } | ||
686 | jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc); | ||
687 | |||
688 | if (tblk->xflag & COMMIT_LAZY) | ||
689 | tblk->flag |= tblkGC_LAZY; | ||
690 | |||
691 | if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) && | ||
692 | (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag) | ||
693 | || jfs_tlocks_low)) { | ||
694 | /* | ||
695 | * No pageout in progress | ||
696 | * | ||
697 | * start group commit as its group leader. | ||
    | * (a lazy transaction leads only while the log is being | ||
    | * flushed or jfs_tlocks_low is set) | ||
698 | */ | ||
699 | log->cflag |= logGC_PAGEOUT; | ||
700 | |||
701 | lmGCwrite(log, 0); | ||
702 | } | ||
703 | |||
704 | if (tblk->xflag & COMMIT_LAZY) { | ||
705 | /* | ||
706 | * Lazy transactions can leave now | ||
707 | */ | ||
708 | LOGGC_UNLOCK(log); | ||
709 | return 0; | ||
710 | } | ||
711 | |||
712 | /* lmGCwrite gives up LOGGC_LOCK, check again */ | ||
713 | |||
714 | if (tblk->flag & tblkGC_COMMITTED) { | ||
715 | if (tblk->flag & tblkGC_ERROR) | ||
716 | rc = -EIO; | ||
717 | |||
718 | LOGGC_UNLOCK(log); | ||
719 | return rc; | ||
720 | } | ||
721 | |||
722 | /* upcount transaction waiting for completion | ||
    | * (lmPostGC() decrements gcrtc again for tblkGC_READY transactions) | ||
723 | */ | ||
724 | log->gcrtc++; | ||
725 | tblk->flag |= tblkGC_READY; | ||
726 | |||
727 | __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), | ||
728 | LOGGC_LOCK(log), LOGGC_UNLOCK(log)); | ||
729 | |||
730 | /* removed from commit queue */ | ||
731 | if (tblk->flag & tblkGC_ERROR) | ||
732 | rc = -EIO; | ||
733 | |||
734 | LOGGC_UNLOCK(log); | ||
735 | return rc; | ||
736 | } | ||
737 | |||
738 | /* | ||
739 | * NAME: lmGCwrite() | ||
740 | * | ||
741 | * FUNCTION: group commit write | ||
742 | * initiate write of log page, building a group of all transactions | ||
743 | * with commit records on that page. | ||
    | * | ||
    | * PARAMETER: log - log whose commit queue (log->cqueue) is scanned; | ||
    | * the queue must not be empty | ||
    | * cant_block - passed through unchanged as the last | ||
    | * argument of lbmWrite() | ||
744 | * | ||
745 | * RETURN: None | ||
746 | * | ||
747 | * NOTE: | ||
748 | * LOGGC_LOCK must be held by caller. | ||
749 | * N.B. LOG_LOCK is NOT necessarily held during lmGCwrite(). | ||
750 | */ | ||
751 | static void lmGCwrite(struct jfs_log * log, int cant_block) | ||
752 | { | ||
753 | struct lbuf *bp; | ||
754 | struct logpage *lp; | ||
755 | int gcpn; /* group commit page number */ | ||
756 | struct tblock *tblk; | ||
757 | struct tblock *xtblk = NULL; | ||
758 | |||
759 | /* | ||
760 | * build the commit group of a log page | ||
761 | * | ||
762 | * scan commit queue and make a commit group of all | ||
763 | * transactions with COMMIT records on the same log page. | ||
764 | */ | ||
765 | /* get the head tblk on the commit queue */ | ||
766 | gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn; | ||
767 | |||
768 | list_for_each_entry(tblk, &log->cqueue, cqueue) { | ||
769 | if (tblk->pn != gcpn) | ||
770 | break; | ||
771 | |||
772 | xtblk = tblk; | ||
773 | |||
774 | /* state transition: (QUEUE, READY) -> COMMIT */ | ||
775 | tblk->flag |= tblkGC_COMMIT; | ||
776 | } | ||
777 | tblk = xtblk; /* last tblk of the page */ | ||
778 | |||
779 | /* | ||
780 | * pageout to commit transactions on the log page. | ||
781 | */ | ||
782 | bp = (struct lbuf *) tblk->bp; | ||
783 | lp = (struct logpage *) bp->l_ldata; | ||
784 | /* is page already full ? */ | ||
785 | if (tblk->flag & tblkGC_EOP) { | ||
786 | /* mark page to free at end of group commit of the page */ | ||
787 | tblk->flag &= ~tblkGC_EOP; | ||
788 | tblk->flag |= tblkGC_FREE; | ||
789 | bp->l_ceor = bp->l_eor; | ||
790 | lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); | ||
791 | lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, | ||
792 | cant_block); | ||
793 | INCREMENT(lmStat.full_page); | ||
794 | } | ||
795 | /* page is not yet full: commit only up to the last tblk's eor */ | ||
796 | else { | ||
797 | bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ | ||
798 | lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); | ||
799 | lbmWrite(log, bp, lbmWRITE | lbmGC, cant_block); | ||
800 | INCREMENT(lmStat.partial_page); | ||
801 | } | ||
802 | } | ||
803 | |||
804 | /* | ||
805 | * NAME: lmPostGC() | ||
806 | * | ||
807 | * FUNCTION: group commit post-processing | ||
808 | * Processes transactions after their commit records have been written | ||
809 | * to disk, redriving log I/O if necessary. | ||
810 | * | ||
    | * PARAMETER: bp - log page buffer whose pageout completed; the log is | ||
    | * taken from bp->l_log | ||
    | * | ||
811 | * RETURN: None | ||
812 | * | ||
813 | * NOTE: | ||
814 | * This routine is called at interrupt time by lbmIODone | ||
815 | */ | ||
816 | static void lmPostGC(struct lbuf * bp) | ||
817 | { | ||
818 | unsigned long flags; | ||
819 | struct jfs_log *log = bp->l_log; | ||
820 | struct logpage *lp; | ||
821 | struct tblock *tblk, *temp; | ||
822 | |||
823 | // same lock as LOGGC_LOCK(log), taken with the irqsave variant here | ||
824 | spin_lock_irqsave(&log->gclock, flags); | ||
825 | /* | ||
826 | * current pageout of group commit completed. | ||
827 | * | ||
828 | * remove/wakeup transactions from commit queue who were | ||
829 | * group committed with the current log page | ||
    | * (the scan stops at the first tblk without tblkGC_COMMIT) | ||
830 | */ | ||
831 | list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) { | ||
832 | if (!(tblk->flag & tblkGC_COMMIT)) | ||
833 | break; | ||
834 | /* if transaction was marked GC_COMMIT then | ||
835 | * it has been shipped in the current pageout | ||
836 | * and made it to disk - it is committed. | ||
837 | */ | ||
838 | |||
839 | if (bp->l_flag & lbmERROR) | ||
840 | tblk->flag |= tblkGC_ERROR; | ||
841 | |||
842 | /* remove it from the commit queue */ | ||
843 | list_del(&tblk->cqueue); | ||
844 | tblk->flag &= ~tblkGC_QUEUE; | ||
845 | |||
846 | if (tblk == log->flush_tblk) { | ||
847 | /* we can stop flushing the log now */ | ||
848 | clear_bit(log_FLUSH, &log->flag); | ||
849 | log->flush_tblk = NULL; | ||
850 | } | ||
851 | |||
852 | jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk, | ||
853 | tblk->flag); | ||
854 | |||
855 | if (!(tblk->xflag & COMMIT_FORCE)) | ||
856 | /* | ||
857 | * Hand tblk over to lazy commit thread | ||
858 | */ | ||
859 | txLazyUnlock(tblk); | ||
860 | else { | ||
861 | /* state transition: COMMIT -> COMMITTED */ | ||
862 | tblk->flag |= tblkGC_COMMITTED; | ||
863 | |||
864 | if (tblk->flag & tblkGC_READY) | ||
865 | log->gcrtc--; | ||
866 | |||
867 | LOGGC_WAKEUP(tblk); | ||
868 | } | ||
869 | |||
870 | /* was page full before pageout ? | ||
871 | * (and this is the last tblk bound with the page) | ||
872 | */ | ||
873 | if (tblk->flag & tblkGC_FREE) | ||
874 | lbmFree(bp); | ||
875 | /* did page become full after pageout ? | ||
876 | * (and this is the last tblk bound with the page) | ||
877 | */ | ||
878 | else if (tblk->flag & tblkGC_EOP) { | ||
879 | /* finalize the page */ | ||
880 | lp = (struct logpage *) bp->l_ldata; | ||
881 | bp->l_ceor = bp->l_eor; | ||
882 | lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); | ||
883 | jfs_info("lmPostGC: calling lbmWrite"); | ||
884 | lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, | ||
885 | 1); | ||
886 | } | ||
887 | |||
888 | } | ||
889 | |||
890 | /* are there any transactions who have entered lmGroupCommit() | ||
891 | * (whose COMMITs are after that of the last log page written) ? | ||
892 | * They are waiting for new group commit (in __SLEEP_COND in | ||
    | * lmGroupCommit()), | ||
893 | * or lazy transactions are on a full (queued) log page: | ||
894 | * select the latest ready transaction as new group leader and | ||
895 | * wake her up to lead her group. | ||
    | * | ||
    | * tblk is only dereferenced below when the commit queue is not | ||
    | * empty, i.e. when the loop above stopped at a queued tblk. | ||
896 | */ | ||
897 | if ((!list_empty(&log->cqueue)) && | ||
898 | ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) || | ||
899 | test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) | ||
900 | /* | ||
901 | * Call lmGCwrite with new group leader | ||
902 | */ | ||
903 | lmGCwrite(log, 1); | ||
904 | |||
905 | /* no transaction are ready yet (transactions are only just | ||
906 | * queued (GC_QUEUE) and not entered for group commit yet). | ||
907 | * the first transaction entering group commit | ||
908 | * will elect herself as new group leader. | ||
909 | */ | ||
910 | else | ||
911 | log->cflag &= ~logGC_PAGEOUT; | ||
912 | |||
913 | // same lock as LOGGC_UNLOCK(log), released with the irqrestore variant | ||
914 | spin_unlock_irqrestore(&log->gclock, flags); | ||
915 | return; | ||
916 | } | ||
917 | |||
918 | /* | ||
919 | * NAME: lmLogSync() | ||
920 | * | ||
921 | * FUNCTION: write log SYNCPT record for specified log | ||
922 | * if new sync address is available | ||
923 | * (normally the case if sync() is executed by back-ground | ||
924 | * process). | ||
925 | * if not, explicitly run jfs_blogsync() to initiate | ||
926 | * getting of new sync address. | ||
927 | * calculate new value of i_nextsync which determines when | ||
928 | * this code is called again. | ||
929 | * | ||
930 | * this is called only from lmLog(). | ||
931 | * | ||
932 | * PARAMETER: ip - pointer to logs inode. | ||
933 | * | ||
934 | * RETURN: 0 | ||
935 | * | ||
936 | * serialization: LOG_LOCK() held on entry/exit | ||
937 | */ | ||
938 | static int lmLogSync(struct jfs_log * log, int nosyncwait) | ||
939 | { | ||
940 | int logsize; | ||
941 | int written; /* written since last syncpt */ | ||
942 | int free; /* free space left available */ | ||
943 | int delta; /* additional delta to write normally */ | ||
944 | int more; /* additional write granted */ | ||
945 | struct lrd lrd; | ||
946 | int lsn; | ||
947 | struct logsyncblk *lp; | ||
948 | |||
949 | /* | ||
950 | * forward syncpt | ||
951 | */ | ||
952 | /* if last sync is same as last syncpt, | ||
953 | * invoke sync point forward processing to update sync. | ||
954 | */ | ||
955 | |||
956 | if (log->sync == log->syncpt) { | ||
957 | LOGSYNC_LOCK(log); | ||
958 | /* ToDo: push dirty metapages out to disk */ | ||
959 | // bmLogSync(log); | ||
960 | |||
961 | if (list_empty(&log->synclist)) | ||
962 | log->sync = log->lsn; | ||
963 | else { | ||
964 | lp = list_entry(log->synclist.next, | ||
965 | struct logsyncblk, synclist); | ||
966 | log->sync = lp->lsn; | ||
967 | } | ||
968 | LOGSYNC_UNLOCK(log); | ||
969 | |||
970 | } | ||
971 | |||
972 | /* if sync is different from last syncpt, | ||
973 | * write a SYNCPT record with syncpt = sync. | ||
974 | * reset syncpt = sync | ||
975 | */ | ||
976 | if (log->sync != log->syncpt) { | ||
977 | struct jfs_sb_info *sbi; | ||
978 | |||
979 | /* | ||
980 | * We need to make sure all of the "written" metapages | ||
981 | * actually make it to disk | ||
982 | */ | ||
983 | list_for_each_entry(sbi, &log->sb_list, log_list) { | ||
984 | if (sbi->flag & JFS_NOINTEGRITY) | ||
985 | continue; | ||
986 | filemap_fdatawrite(sbi->ipbmap->i_mapping); | ||
987 | filemap_fdatawrite(sbi->ipimap->i_mapping); | ||
988 | filemap_fdatawrite(sbi->sb->s_bdev->bd_inode->i_mapping); | ||
989 | } | ||
990 | list_for_each_entry(sbi, &log->sb_list, log_list) { | ||
991 | if (sbi->flag & JFS_NOINTEGRITY) | ||
992 | continue; | ||
993 | filemap_fdatawait(sbi->ipbmap->i_mapping); | ||
994 | filemap_fdatawait(sbi->ipimap->i_mapping); | ||
995 | filemap_fdatawait(sbi->sb->s_bdev->bd_inode->i_mapping); | ||
996 | } | ||
997 | |||
998 | lrd.logtid = 0; | ||
999 | lrd.backchain = 0; | ||
1000 | lrd.type = cpu_to_le16(LOG_SYNCPT); | ||
1001 | lrd.length = 0; | ||
1002 | lrd.log.syncpt.sync = cpu_to_le32(log->sync); | ||
1003 | lsn = lmWriteRecord(log, NULL, &lrd, NULL); | ||
1004 | |||
1005 | log->syncpt = log->sync; | ||
1006 | } else | ||
1007 | lsn = log->lsn; | ||
1008 | |||
1009 | /* | ||
1010 | * setup next syncpt trigger (SWAG) | ||
1011 | */ | ||
1012 | logsize = log->logsize; | ||
1013 | |||
1014 | logdiff(written, lsn, log); | ||
1015 | free = logsize - written; | ||
1016 | delta = LOGSYNC_DELTA(logsize); | ||
1017 | more = min(free / 2, delta); | ||
1018 | if (more < 2 * LOGPSIZE) { | ||
1019 | jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); | ||
1020 | /* | ||
1021 | * log wrapping | ||
1022 | * | ||
1023 | * option 1 - panic ? No.! | ||
1024 | * option 2 - shutdown file systems | ||
1025 | * associated with log ? | ||
1026 | * option 3 - extend log ? | ||
1027 | */ | ||
1028 | /* | ||
1029 | * option 4 - second chance | ||
1030 | * | ||
1031 | * mark log wrapped, and continue. | ||
1032 | * when all active transactions are completed, | ||
1033 | * mark log vaild for recovery. | ||
1034 | * if crashed during invalid state, log state | ||
1035 | * implies invald log, forcing fsck(). | ||
1036 | */ | ||
1037 | /* mark log state log wrap in log superblock */ | ||
1038 | /* log->state = LOGWRAP; */ | ||
1039 | |||
1040 | /* reset sync point computation */ | ||
1041 | log->syncpt = log->sync = lsn; | ||
1042 | log->nextsync = delta; | ||
1043 | } else | ||
1044 | /* next syncpt trigger = written + more */ | ||
1045 | log->nextsync = written + more; | ||
1046 | |||
1047 | /* return if lmLogSync() from outside of transaction, e.g., sync() */ | ||
1048 | if (nosyncwait) | ||
1049 | return lsn; | ||
1050 | |||
1051 | /* if number of bytes written from last sync point is more | ||
1052 | * than 1/4 of the log size, stop new transactions from | ||
1053 | * starting until all current transactions are completed | ||
1054 | * by setting syncbarrier flag. | ||
1055 | */ | ||
1056 | if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) { | ||
1057 | set_bit(log_SYNCBARRIER, &log->flag); | ||
1058 | jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, | ||
1059 | log->syncpt); | ||
1060 | /* | ||
1061 | * We may have to initiate group commit | ||
1062 | */ | ||
1063 | jfs_flush_journal(log, 0); | ||
1064 | } | ||
1065 | |||
1066 | return lsn; | ||
1067 | } | ||
1068 | |||
1069 | |||
1070 | /* | ||
1071 | * NAME: lmLogOpen() | ||
1072 | * | ||
1073 | * FUNCTION: open the log on first open; | ||
1074 | * insert filesystem in the active list of the log. | ||
1075 | * | ||
1076 | * PARAMETER: ipmnt - file system mount inode | ||
1077 | * iplog - log inode (out) | ||
1078 | * | ||
1079 | * RETURN: | ||
1080 | * | ||
1081 | * serialization: | ||
1082 | */ | ||
1083 | int lmLogOpen(struct super_block *sb) | ||
1084 | { | ||
1085 | int rc; | ||
1086 | struct block_device *bdev; | ||
1087 | struct jfs_log *log; | ||
1088 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
1089 | |||
1090 | if (sbi->flag & JFS_NOINTEGRITY) | ||
1091 | return open_dummy_log(sb); | ||
1092 | |||
1093 | if (sbi->mntflag & JFS_INLINELOG) | ||
1094 | return open_inline_log(sb); | ||
1095 | |||
1096 | down(&jfs_log_sem); | ||
1097 | list_for_each_entry(log, &jfs_external_logs, journal_list) { | ||
1098 | if (log->bdev->bd_dev == sbi->logdev) { | ||
1099 | if (memcmp(log->uuid, sbi->loguuid, | ||
1100 | sizeof(log->uuid))) { | ||
1101 | jfs_warn("wrong uuid on JFS journal\n"); | ||
1102 | up(&jfs_log_sem); | ||
1103 | return -EINVAL; | ||
1104 | } | ||
1105 | /* | ||
1106 | * add file system to log active file system list | ||
1107 | */ | ||
1108 | if ((rc = lmLogFileSystem(log, sbi, 1))) { | ||
1109 | up(&jfs_log_sem); | ||
1110 | return rc; | ||
1111 | } | ||
1112 | goto journal_found; | ||
1113 | } | ||
1114 | } | ||
1115 | |||
1116 | if (!(log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL))) { | ||
1117 | up(&jfs_log_sem); | ||
1118 | return -ENOMEM; | ||
1119 | } | ||
1120 | memset(log, 0, sizeof(struct jfs_log)); | ||
1121 | INIT_LIST_HEAD(&log->sb_list); | ||
1122 | init_waitqueue_head(&log->syncwait); | ||
1123 | |||
1124 | /* | ||
1125 | * external log as separate logical volume | ||
1126 | * | ||
1127 | * file systems to log may have n-to-1 relationship; | ||
1128 | */ | ||
1129 | |||
1130 | bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); | ||
1131 | if (IS_ERR(bdev)) { | ||
1132 | rc = -PTR_ERR(bdev); | ||
1133 | goto free; | ||
1134 | } | ||
1135 | |||
1136 | if ((rc = bd_claim(bdev, log))) { | ||
1137 | goto close; | ||
1138 | } | ||
1139 | |||
1140 | log->bdev = bdev; | ||
1141 | memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); | ||
1142 | |||
1143 | /* | ||
1144 | * initialize log: | ||
1145 | */ | ||
1146 | if ((rc = lmLogInit(log))) | ||
1147 | goto unclaim; | ||
1148 | |||
1149 | list_add(&log->journal_list, &jfs_external_logs); | ||
1150 | |||
1151 | /* | ||
1152 | * add file system to log active file system list | ||
1153 | */ | ||
1154 | if ((rc = lmLogFileSystem(log, sbi, 1))) | ||
1155 | goto shutdown; | ||
1156 | |||
1157 | journal_found: | ||
1158 | LOG_LOCK(log); | ||
1159 | list_add(&sbi->log_list, &log->sb_list); | ||
1160 | sbi->log = log; | ||
1161 | LOG_UNLOCK(log); | ||
1162 | |||
1163 | up(&jfs_log_sem); | ||
1164 | return 0; | ||
1165 | |||
1166 | /* | ||
1167 | * unwind on error | ||
1168 | */ | ||
1169 | shutdown: /* unwind lbmLogInit() */ | ||
1170 | list_del(&log->journal_list); | ||
1171 | lbmLogShutdown(log); | ||
1172 | |||
1173 | unclaim: | ||
1174 | bd_release(bdev); | ||
1175 | |||
1176 | close: /* close external log device */ | ||
1177 | blkdev_put(bdev); | ||
1178 | |||
1179 | free: /* free log descriptor */ | ||
1180 | up(&jfs_log_sem); | ||
1181 | kfree(log); | ||
1182 | |||
1183 | jfs_warn("lmLogOpen: exit(%d)", rc); | ||
1184 | return rc; | ||
1185 | } | ||
1186 | |||
1187 | static int open_inline_log(struct super_block *sb) | ||
1188 | { | ||
1189 | struct jfs_log *log; | ||
1190 | int rc; | ||
1191 | |||
1192 | if (!(log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL))) | ||
1193 | return -ENOMEM; | ||
1194 | memset(log, 0, sizeof(struct jfs_log)); | ||
1195 | INIT_LIST_HEAD(&log->sb_list); | ||
1196 | init_waitqueue_head(&log->syncwait); | ||
1197 | |||
1198 | set_bit(log_INLINELOG, &log->flag); | ||
1199 | log->bdev = sb->s_bdev; | ||
1200 | log->base = addressPXD(&JFS_SBI(sb)->logpxd); | ||
1201 | log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> | ||
1202 | (L2LOGPSIZE - sb->s_blocksize_bits); | ||
1203 | log->l2bsize = sb->s_blocksize_bits; | ||
1204 | ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); | ||
1205 | |||
1206 | /* | ||
1207 | * initialize log. | ||
1208 | */ | ||
1209 | if ((rc = lmLogInit(log))) { | ||
1210 | kfree(log); | ||
1211 | jfs_warn("lmLogOpen: exit(%d)", rc); | ||
1212 | return rc; | ||
1213 | } | ||
1214 | |||
1215 | list_add(&JFS_SBI(sb)->log_list, &log->sb_list); | ||
1216 | JFS_SBI(sb)->log = log; | ||
1217 | |||
1218 | return rc; | ||
1219 | } | ||
1220 | |||
1221 | static int open_dummy_log(struct super_block *sb) | ||
1222 | { | ||
1223 | int rc; | ||
1224 | |||
1225 | down(&jfs_log_sem); | ||
1226 | if (!dummy_log) { | ||
1227 | dummy_log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL); | ||
1228 | if (!dummy_log) { | ||
1229 | up(&jfs_log_sem); | ||
1230 | return -ENOMEM; | ||
1231 | } | ||
1232 | memset(dummy_log, 0, sizeof(struct jfs_log)); | ||
1233 | INIT_LIST_HEAD(&dummy_log->sb_list); | ||
1234 | init_waitqueue_head(&dummy_log->syncwait); | ||
1235 | dummy_log->no_integrity = 1; | ||
1236 | /* Make up some stuff */ | ||
1237 | dummy_log->base = 0; | ||
1238 | dummy_log->size = 1024; | ||
1239 | rc = lmLogInit(dummy_log); | ||
1240 | if (rc) { | ||
1241 | kfree(dummy_log); | ||
1242 | dummy_log = NULL; | ||
1243 | up(&jfs_log_sem); | ||
1244 | return rc; | ||
1245 | } | ||
1246 | } | ||
1247 | |||
1248 | LOG_LOCK(dummy_log); | ||
1249 | list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list); | ||
1250 | JFS_SBI(sb)->log = dummy_log; | ||
1251 | LOG_UNLOCK(dummy_log); | ||
1252 | up(&jfs_log_sem); | ||
1253 | |||
1254 | return 0; | ||
1255 | } | ||
1256 | |||
1257 | /* | ||
1258 | * NAME: lmLogInit() | ||
1259 | * | ||
1260 | * FUNCTION: log initialization at first log open. | ||
1261 | * | ||
1262 | * logredo() (or logformat()) should have been run previously. | ||
1263 | * initialize the log from log superblock. | ||
1264 | * set the log state in the superblock to LOGMOUNT and | ||
1265 | * write SYNCPT log record. | ||
1266 | * | ||
1267 | * PARAMETER: log - log structure | ||
1268 | * | ||
1269 | * RETURN: 0 - if ok | ||
1270 | * -EINVAL - bad log magic number or superblock dirty | ||
1271 | * error returned from logwait() | ||
1272 | * | ||
1273 | * serialization: single first open thread | ||
1274 | */ | ||
1275 | int lmLogInit(struct jfs_log * log) | ||
1276 | { | ||
1277 | int rc = 0; | ||
1278 | struct lrd lrd; | ||
1279 | struct logsuper *logsuper; | ||
1280 | struct lbuf *bpsuper; | ||
1281 | struct lbuf *bp; | ||
1282 | struct logpage *lp; | ||
1283 | int lsn = 0; | ||
1284 | |||
1285 | jfs_info("lmLogInit: log:0x%p", log); | ||
1286 | |||
1287 | /* initialize the group commit serialization lock */ | ||
1288 | LOGGC_LOCK_INIT(log); | ||
1289 | |||
1290 | /* allocate/initialize the log write serialization lock */ | ||
1291 | LOG_LOCK_INIT(log); | ||
1292 | |||
1293 | LOGSYNC_LOCK_INIT(log); | ||
1294 | |||
1295 | INIT_LIST_HEAD(&log->synclist); | ||
1296 | |||
1297 | INIT_LIST_HEAD(&log->cqueue); | ||
1298 | log->flush_tblk = NULL; | ||
1299 | |||
1300 | log->count = 0; | ||
1301 | |||
1302 | /* | ||
1303 | * initialize log i/o | ||
1304 | */ | ||
1305 | if ((rc = lbmLogInit(log))) | ||
1306 | return rc; | ||
1307 | |||
1308 | if (!test_bit(log_INLINELOG, &log->flag)) | ||
1309 | log->l2bsize = L2LOGPSIZE; | ||
1310 | |||
1311 | /* check for disabled journaling to disk */ | ||
1312 | if (log->no_integrity) { | ||
1313 | /* | ||
1314 | * Journal pages will still be filled. When the time comes | ||
1315 | * to actually do the I/O, the write is not done, and the | ||
1316 | * endio routine is called directly. | ||
1317 | */ | ||
1318 | bp = lbmAllocate(log , 0); | ||
1319 | log->bp = bp; | ||
1320 | bp->l_pn = bp->l_eor = 0; | ||
1321 | } else { | ||
1322 | /* | ||
1323 | * validate log superblock | ||
1324 | */ | ||
1325 | if ((rc = lbmRead(log, 1, &bpsuper))) | ||
1326 | goto errout10; | ||
1327 | |||
1328 | logsuper = (struct logsuper *) bpsuper->l_ldata; | ||
1329 | |||
1330 | if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { | ||
1331 | jfs_warn("*** Log Format Error ! ***"); | ||
1332 | rc = -EINVAL; | ||
1333 | goto errout20; | ||
1334 | } | ||
1335 | |||
1336 | /* logredo() should have been run successfully. */ | ||
1337 | if (logsuper->state != cpu_to_le32(LOGREDONE)) { | ||
1338 | jfs_warn("*** Log Is Dirty ! ***"); | ||
1339 | rc = -EINVAL; | ||
1340 | goto errout20; | ||
1341 | } | ||
1342 | |||
1343 | /* initialize log from log superblock */ | ||
1344 | if (test_bit(log_INLINELOG,&log->flag)) { | ||
1345 | if (log->size != le32_to_cpu(logsuper->size)) { | ||
1346 | rc = -EINVAL; | ||
1347 | goto errout20; | ||
1348 | } | ||
1349 | jfs_info("lmLogInit: inline log:0x%p base:0x%Lx " | ||
1350 | "size:0x%x", log, | ||
1351 | (unsigned long long) log->base, log->size); | ||
1352 | } else { | ||
1353 | if (memcmp(logsuper->uuid, log->uuid, 16)) { | ||
1354 | jfs_warn("wrong uuid on JFS log device"); | ||
1355 | goto errout20; | ||
1356 | } | ||
1357 | log->size = le32_to_cpu(logsuper->size); | ||
1358 | log->l2bsize = le32_to_cpu(logsuper->l2bsize); | ||
1359 | jfs_info("lmLogInit: external log:0x%p base:0x%Lx " | ||
1360 | "size:0x%x", log, | ||
1361 | (unsigned long long) log->base, log->size); | ||
1362 | } | ||
1363 | |||
1364 | log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; | ||
1365 | log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); | ||
1366 | |||
1367 | /* | ||
1368 | * initialize for log append write mode | ||
1369 | */ | ||
1370 | /* establish current/end-of-log page/buffer */ | ||
1371 | if ((rc = lbmRead(log, log->page, &bp))) | ||
1372 | goto errout20; | ||
1373 | |||
1374 | lp = (struct logpage *) bp->l_ldata; | ||
1375 | |||
1376 | jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d", | ||
1377 | le32_to_cpu(logsuper->end), log->page, log->eor, | ||
1378 | le16_to_cpu(lp->h.eor)); | ||
1379 | |||
1380 | log->bp = bp; | ||
1381 | bp->l_pn = log->page; | ||
1382 | bp->l_eor = log->eor; | ||
1383 | |||
1384 | /* if current page is full, move on to next page */ | ||
1385 | if (log->eor >= LOGPSIZE - LOGPTLRSIZE) | ||
1386 | lmNextPage(log); | ||
1387 | |||
1388 | /* | ||
1389 | * initialize log syncpoint | ||
1390 | */ | ||
1391 | /* | ||
1392 | * write the first SYNCPT record with syncpoint = 0 | ||
1393 | * (i.e., log redo up to HERE !); | ||
1394 | * remove current page from lbm write queue at end of pageout | ||
1395 | * (to write log superblock update), but do not release to | ||
1396 | * freelist; | ||
1397 | */ | ||
1398 | lrd.logtid = 0; | ||
1399 | lrd.backchain = 0; | ||
1400 | lrd.type = cpu_to_le16(LOG_SYNCPT); | ||
1401 | lrd.length = 0; | ||
1402 | lrd.log.syncpt.sync = 0; | ||
1403 | lsn = lmWriteRecord(log, NULL, &lrd, NULL); | ||
1404 | bp = log->bp; | ||
1405 | bp->l_ceor = bp->l_eor; | ||
1406 | lp = (struct logpage *) bp->l_ldata; | ||
1407 | lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); | ||
1408 | lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); | ||
1409 | if ((rc = lbmIOWait(bp, 0))) | ||
1410 | goto errout30; | ||
1411 | |||
1412 | /* | ||
1413 | * update/write superblock | ||
1414 | */ | ||
1415 | logsuper->state = cpu_to_le32(LOGMOUNT); | ||
1416 | log->serial = le32_to_cpu(logsuper->serial) + 1; | ||
1417 | logsuper->serial = cpu_to_le32(log->serial); | ||
1418 | lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); | ||
1419 | if ((rc = lbmIOWait(bpsuper, lbmFREE))) | ||
1420 | goto errout30; | ||
1421 | } | ||
1422 | |||
1423 | /* initialize logsync parameters */ | ||
1424 | log->logsize = (log->size - 2) << L2LOGPSIZE; | ||
1425 | log->lsn = lsn; | ||
1426 | log->syncpt = lsn; | ||
1427 | log->sync = log->syncpt; | ||
1428 | log->nextsync = LOGSYNC_DELTA(log->logsize); | ||
1429 | |||
1430 | jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x", | ||
1431 | log->lsn, log->syncpt, log->sync); | ||
1432 | |||
1433 | /* | ||
1434 | * initialize for lazy/group commit | ||
1435 | */ | ||
1436 | log->clsn = lsn; | ||
1437 | |||
1438 | return 0; | ||
1439 | |||
1440 | /* | ||
1441 | * unwind on error | ||
1442 | */ | ||
1443 | errout30: /* release log page */ | ||
1444 | log->wqueue = NULL; | ||
1445 | bp->l_wqnext = NULL; | ||
1446 | lbmFree(bp); | ||
1447 | |||
1448 | errout20: /* release log superblock */ | ||
1449 | lbmFree(bpsuper); | ||
1450 | |||
1451 | errout10: /* unwind lbmLogInit() */ | ||
1452 | lbmLogShutdown(log); | ||
1453 | |||
1454 | jfs_warn("lmLogInit: exit(%d)", rc); | ||
1455 | return rc; | ||
1456 | } | ||
1457 | |||
1458 | |||
1459 | /* | ||
1460 | * NAME: lmLogClose() | ||
1461 | * | ||
1462 | * FUNCTION: remove file system <ipmnt> from active list of log <iplog> | ||
1463 | * and close it on last close. | ||
1464 | * | ||
1465 | * PARAMETER: sb - superblock | ||
1466 | * | ||
1467 | * RETURN: errors from subroutines | ||
1468 | * | ||
1469 | * serialization: | ||
1470 | */ | ||
1471 | int lmLogClose(struct super_block *sb) | ||
1472 | { | ||
1473 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
1474 | struct jfs_log *log = sbi->log; | ||
1475 | struct block_device *bdev; | ||
1476 | int rc = 0; | ||
1477 | |||
1478 | jfs_info("lmLogClose: log:0x%p", log); | ||
1479 | |||
1480 | down(&jfs_log_sem); | ||
1481 | LOG_LOCK(log); | ||
1482 | list_del(&sbi->log_list); | ||
1483 | LOG_UNLOCK(log); | ||
1484 | sbi->log = NULL; | ||
1485 | |||
1486 | /* | ||
1487 | * We need to make sure all of the "written" metapages | ||
1488 | * actually make it to disk | ||
1489 | */ | ||
1490 | sync_blockdev(sb->s_bdev); | ||
1491 | |||
1492 | if (test_bit(log_INLINELOG, &log->flag)) { | ||
1493 | /* | ||
1494 | * in-line log in host file system | ||
1495 | */ | ||
1496 | rc = lmLogShutdown(log); | ||
1497 | kfree(log); | ||
1498 | goto out; | ||
1499 | } | ||
1500 | |||
1501 | if (!log->no_integrity) | ||
1502 | lmLogFileSystem(log, sbi, 0); | ||
1503 | |||
1504 | if (!list_empty(&log->sb_list)) | ||
1505 | goto out; | ||
1506 | |||
1507 | /* | ||
1508 | * TODO: ensure that the dummy_log is in a state to allow | ||
1509 | * lbmLogShutdown to deallocate all the buffers and call | ||
1510 | * kfree against dummy_log. For now, leave dummy_log & its | ||
1511 | * buffers in memory, and resuse if another no-integrity mount | ||
1512 | * is requested. | ||
1513 | */ | ||
1514 | if (log->no_integrity) | ||
1515 | goto out; | ||
1516 | |||
1517 | /* | ||
1518 | * external log as separate logical volume | ||
1519 | */ | ||
1520 | list_del(&log->journal_list); | ||
1521 | bdev = log->bdev; | ||
1522 | rc = lmLogShutdown(log); | ||
1523 | |||
1524 | bd_release(bdev); | ||
1525 | blkdev_put(bdev); | ||
1526 | |||
1527 | kfree(log); | ||
1528 | |||
1529 | out: | ||
1530 | up(&jfs_log_sem); | ||
1531 | jfs_info("lmLogClose: exit(%d)", rc); | ||
1532 | return rc; | ||
1533 | } | ||
1534 | |||
1535 | |||
1536 | /* | ||
1537 | * NAME: jfs_flush_journal() | ||
1538 | * | ||
1539 | * FUNCTION: initiate write of any outstanding transactions to the journal | ||
1540 | * and optionally wait until they are all written to disk | ||
1541 | * | ||
1542 | * wait == 0 flush until latest txn is committed, don't wait | ||
1543 | * wait == 1 flush until latest txn is committed, wait | ||
1544 | * wait > 1 flush until all txn's are complete, wait | ||
1545 | */ | ||
1546 | void jfs_flush_journal(struct jfs_log *log, int wait) | ||
1547 | { | ||
1548 | int i; | ||
1549 | struct tblock *target = NULL; | ||
1550 | |||
1551 | /* jfs_write_inode may call us during read-only mount */ | ||
1552 | if (!log) | ||
1553 | return; | ||
1554 | |||
1555 | jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait); | ||
1556 | |||
1557 | LOGGC_LOCK(log); | ||
1558 | |||
1559 | if (!list_empty(&log->cqueue)) { | ||
1560 | /* | ||
1561 | * This ensures that we will keep writing to the journal as long | ||
1562 | * as there are unwritten commit records | ||
1563 | */ | ||
1564 | target = list_entry(log->cqueue.prev, struct tblock, cqueue); | ||
1565 | |||
1566 | if (test_bit(log_FLUSH, &log->flag)) { | ||
1567 | /* | ||
1568 | * We're already flushing. | ||
1569 | * if flush_tblk is NULL, we are flushing everything, | ||
1570 | * so leave it that way. Otherwise, update it to the | ||
1571 | * latest transaction | ||
1572 | */ | ||
1573 | if (log->flush_tblk) | ||
1574 | log->flush_tblk = target; | ||
1575 | } else { | ||
1576 | /* Only flush until latest transaction is committed */ | ||
1577 | log->flush_tblk = target; | ||
1578 | set_bit(log_FLUSH, &log->flag); | ||
1579 | |||
1580 | /* | ||
1581 | * Initiate I/O on outstanding transactions | ||
1582 | */ | ||
1583 | if (!(log->cflag & logGC_PAGEOUT)) { | ||
1584 | log->cflag |= logGC_PAGEOUT; | ||
1585 | lmGCwrite(log, 0); | ||
1586 | } | ||
1587 | } | ||
1588 | } | ||
1589 | if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) { | ||
1590 | /* Flush until all activity complete */ | ||
1591 | set_bit(log_FLUSH, &log->flag); | ||
1592 | log->flush_tblk = NULL; | ||
1593 | } | ||
1594 | |||
1595 | if (wait && target && !(target->flag & tblkGC_COMMITTED)) { | ||
1596 | DECLARE_WAITQUEUE(__wait, current); | ||
1597 | |||
1598 | add_wait_queue(&target->gcwait, &__wait); | ||
1599 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1600 | LOGGC_UNLOCK(log); | ||
1601 | schedule(); | ||
1602 | current->state = TASK_RUNNING; | ||
1603 | LOGGC_LOCK(log); | ||
1604 | remove_wait_queue(&target->gcwait, &__wait); | ||
1605 | } | ||
1606 | LOGGC_UNLOCK(log); | ||
1607 | |||
1608 | if (wait < 2) | ||
1609 | return; | ||
1610 | |||
1611 | /* | ||
1612 | * If there was recent activity, we may need to wait | ||
1613 | * for the lazycommit thread to catch up | ||
1614 | */ | ||
1615 | if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { | ||
1616 | for (i = 0; i < 800; i++) { /* Too much? */ | ||
1617 | msleep(250); | ||
1618 | if (list_empty(&log->cqueue) && | ||
1619 | list_empty(&log->synclist)) | ||
1620 | break; | ||
1621 | } | ||
1622 | } | ||
1623 | assert(list_empty(&log->cqueue)); | ||
1624 | assert(list_empty(&log->synclist)); | ||
1625 | clear_bit(log_FLUSH, &log->flag); | ||
1626 | } | ||
1627 | |||
1628 | /* | ||
1629 | * NAME: lmLogShutdown() | ||
1630 | * | ||
1631 | * FUNCTION: log shutdown at last LogClose(). | ||
1632 | * | ||
1633 | * write log syncpt record. | ||
1634 | * update super block to set redone flag to 0. | ||
1635 | * | ||
1636 | * PARAMETER: log - log inode | ||
1637 | * | ||
1638 | * RETURN: 0 - success | ||
1639 | * | ||
1640 | * serialization: single last close thread | ||
1641 | */ | ||
1642 | int lmLogShutdown(struct jfs_log * log) | ||
1643 | { | ||
1644 | int rc; | ||
1645 | struct lrd lrd; | ||
1646 | int lsn; | ||
1647 | struct logsuper *logsuper; | ||
1648 | struct lbuf *bpsuper; | ||
1649 | struct lbuf *bp; | ||
1650 | struct logpage *lp; | ||
1651 | |||
1652 | jfs_info("lmLogShutdown: log:0x%p", log); | ||
1653 | |||
1654 | jfs_flush_journal(log, 2); | ||
1655 | |||
1656 | /* | ||
1657 | * write the last SYNCPT record with syncpoint = 0 | ||
1658 | * (i.e., log redo up to HERE !) | ||
1659 | */ | ||
1660 | lrd.logtid = 0; | ||
1661 | lrd.backchain = 0; | ||
1662 | lrd.type = cpu_to_le16(LOG_SYNCPT); | ||
1663 | lrd.length = 0; | ||
1664 | lrd.log.syncpt.sync = 0; | ||
1665 | |||
1666 | lsn = lmWriteRecord(log, NULL, &lrd, NULL); | ||
1667 | bp = log->bp; | ||
1668 | lp = (struct logpage *) bp->l_ldata; | ||
1669 | lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); | ||
1670 | lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); | ||
1671 | lbmIOWait(log->bp, lbmFREE); | ||
1672 | |||
1673 | /* | ||
1674 | * synchronous update log superblock | ||
1675 | * mark log state as shutdown cleanly | ||
1676 | * (i.e., Log does not need to be replayed). | ||
1677 | */ | ||
1678 | if ((rc = lbmRead(log, 1, &bpsuper))) | ||
1679 | goto out; | ||
1680 | |||
1681 | logsuper = (struct logsuper *) bpsuper->l_ldata; | ||
1682 | logsuper->state = cpu_to_le32(LOGREDONE); | ||
1683 | logsuper->end = cpu_to_le32(lsn); | ||
1684 | lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); | ||
1685 | rc = lbmIOWait(bpsuper, lbmFREE); | ||
1686 | |||
1687 | jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", | ||
1688 | lsn, log->page, log->eor); | ||
1689 | |||
1690 | out: | ||
1691 | /* | ||
1692 | * shutdown per log i/o | ||
1693 | */ | ||
1694 | lbmLogShutdown(log); | ||
1695 | |||
1696 | if (rc) { | ||
1697 | jfs_warn("lmLogShutdown: exit(%d)", rc); | ||
1698 | } | ||
1699 | return rc; | ||
1700 | } | ||
1701 | |||
1702 | |||
1703 | /* | ||
1704 | * NAME: lmLogFileSystem() | ||
1705 | * | ||
1706 | * FUNCTION: insert (<activate> = true)/remove (<activate> = false) | ||
1707 | * file system into/from log active file system list. | ||
1708 | * | ||
1709 | * PARAMETE: log - pointer to logs inode. | ||
1710 | * fsdev - kdev_t of filesystem. | ||
1711 | * serial - pointer to returned log serial number | ||
1712 | * activate - insert/remove device from active list. | ||
1713 | * | ||
1714 | * RETURN: 0 - success | ||
1715 | * errors returned by vms_iowait(). | ||
1716 | */ | ||
1717 | static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, | ||
1718 | int activate) | ||
1719 | { | ||
1720 | int rc = 0; | ||
1721 | int i; | ||
1722 | struct logsuper *logsuper; | ||
1723 | struct lbuf *bpsuper; | ||
1724 | char *uuid = sbi->uuid; | ||
1725 | |||
1726 | /* | ||
1727 | * insert/remove file system device to log active file system list. | ||
1728 | */ | ||
1729 | if ((rc = lbmRead(log, 1, &bpsuper))) | ||
1730 | return rc; | ||
1731 | |||
1732 | logsuper = (struct logsuper *) bpsuper->l_ldata; | ||
1733 | if (activate) { | ||
1734 | for (i = 0; i < MAX_ACTIVE; i++) | ||
1735 | if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) { | ||
1736 | memcpy(logsuper->active[i].uuid, uuid, 16); | ||
1737 | sbi->aggregate = i; | ||
1738 | break; | ||
1739 | } | ||
1740 | if (i == MAX_ACTIVE) { | ||
1741 | jfs_warn("Too many file systems sharing journal!"); | ||
1742 | lbmFree(bpsuper); | ||
1743 | return -EMFILE; /* Is there a better rc? */ | ||
1744 | } | ||
1745 | } else { | ||
1746 | for (i = 0; i < MAX_ACTIVE; i++) | ||
1747 | if (!memcmp(logsuper->active[i].uuid, uuid, 16)) { | ||
1748 | memcpy(logsuper->active[i].uuid, NULL_UUID, 16); | ||
1749 | break; | ||
1750 | } | ||
1751 | if (i == MAX_ACTIVE) { | ||
1752 | jfs_warn("Somebody stomped on the journal!"); | ||
1753 | lbmFree(bpsuper); | ||
1754 | return -EIO; | ||
1755 | } | ||
1756 | |||
1757 | } | ||
1758 | |||
1759 | /* | ||
1760 | * synchronous write log superblock: | ||
1761 | * | ||
1762 | * write sidestream bypassing write queue: | ||
1763 | * at file system mount, log super block is updated for | ||
1764 | * activation of the file system before any log record | ||
1765 | * (MOUNT record) of the file system, and at file system | ||
1766 | * unmount, all meta data for the file system has been | ||
1767 | * flushed before log super block is updated for deactivation | ||
1768 | * of the file system. | ||
1769 | */ | ||
1770 | lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); | ||
1771 | rc = lbmIOWait(bpsuper, lbmFREE); | ||
1772 | |||
1773 | return rc; | ||
1774 | } | ||
1775 | |||
1776 | /* | ||
1777 | * log buffer manager (lbm) | ||
1778 | * ------------------------ | ||
1779 | * | ||
1780 | * special purpose buffer manager supporting log i/o requirements. | ||
1781 | * | ||
1782 | * per log write queue: | ||
1783 | * log pageout occurs in serial order via a fifo write queue, | ||
1784 | * restricted to a single i/o in progress at any one time. | ||
1785 | * the queue is a circular singly-linked list | ||
1786 | * (log->wqueue points to the tail, and buffers are linked via | ||
1787 | * the bp->l_wqnext field); it holds the log pages that are | ||
1788 | * in pageout or waiting for pageout, in serial pageout order. | ||
1789 | */ | ||
1790 | |||
1791 | /* | ||
1792 | * lbmLogInit() | ||
1793 | * | ||
1794 | * initialize per log I/O setup at lmLogInit() | ||
1795 | */ | ||
1796 | static int lbmLogInit(struct jfs_log * log) | ||
1797 | { /* log inode */ | ||
1798 | int i; | ||
1799 | struct lbuf *lbuf; | ||
1800 | |||
1801 | jfs_info("lbmLogInit: log:0x%p", log); | ||
1802 | |||
1803 | /* initialize current buffer cursor */ | ||
1804 | log->bp = NULL; | ||
1805 | |||
1806 | /* initialize log device write queue */ | ||
1807 | log->wqueue = NULL; | ||
1808 | |||
1809 | /* | ||
1810 | * Each log has its own buffer pages allocated to it. These are | ||
1811 | * not managed by the page cache. This ensures that a transaction | ||
1812 | * writing to the log does not block trying to allocate a page from | ||
1813 | * the page cache (for the log). This would be bad, since page | ||
1814 | * allocation waits on the kswapd thread that may be committing inodes | ||
1815 | * which would cause log activity. Was that clear? I'm trying to | ||
1816 | * avoid deadlock here. | ||
1817 | */ | ||
1818 | init_waitqueue_head(&log->free_wait); | ||
1819 | |||
1820 | log->lbuf_free = NULL; | ||
1821 | |||
1822 | for (i = 0; i < LOGPAGES; i++) { | ||
1823 | lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); | ||
1824 | if (lbuf == 0) | ||
1825 | goto error; | ||
1826 | lbuf->l_ldata = (char *) get_zeroed_page(GFP_KERNEL); | ||
1827 | if (lbuf->l_ldata == 0) { | ||
1828 | kfree(lbuf); | ||
1829 | goto error; | ||
1830 | } | ||
1831 | lbuf->l_log = log; | ||
1832 | init_waitqueue_head(&lbuf->l_ioevent); | ||
1833 | |||
1834 | lbuf->l_freelist = log->lbuf_free; | ||
1835 | log->lbuf_free = lbuf; | ||
1836 | } | ||
1837 | |||
1838 | return (0); | ||
1839 | |||
1840 | error: | ||
1841 | lbmLogShutdown(log); | ||
1842 | return -ENOMEM; | ||
1843 | } | ||
1844 | |||
1845 | |||
1846 | /* | ||
1847 | * lbmLogShutdown() | ||
1848 | * | ||
1849 | * finalize per log I/O setup at lmLogShutdown() | ||
1850 | */ | ||
1851 | static void lbmLogShutdown(struct jfs_log * log) | ||
1852 | { | ||
1853 | struct lbuf *lbuf; | ||
1854 | |||
1855 | jfs_info("lbmLogShutdown: log:0x%p", log); | ||
1856 | |||
1857 | lbuf = log->lbuf_free; | ||
1858 | while (lbuf) { | ||
1859 | struct lbuf *next = lbuf->l_freelist; | ||
1860 | free_page((unsigned long) lbuf->l_ldata); | ||
1861 | kfree(lbuf); | ||
1862 | lbuf = next; | ||
1863 | } | ||
1864 | |||
1865 | log->bp = NULL; | ||
1866 | } | ||
1867 | |||
1868 | |||
1869 | /* | ||
1870 | * lbmAllocate() | ||
1871 | * | ||
1872 | * allocate an empty log buffer | ||
1873 | */ | ||
1874 | static struct lbuf *lbmAllocate(struct jfs_log * log, int pn) | ||
1875 | { | ||
1876 | struct lbuf *bp; | ||
1877 | unsigned long flags; | ||
1878 | |||
1879 | /* | ||
1880 | * recycle from log buffer freelist if any | ||
1881 | */ | ||
1882 | LCACHE_LOCK(flags); | ||
1883 | LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); | ||
1884 | log->lbuf_free = bp->l_freelist; | ||
1885 | LCACHE_UNLOCK(flags); | ||
1886 | |||
1887 | bp->l_flag = 0; | ||
1888 | |||
1889 | bp->l_wqnext = NULL; | ||
1890 | bp->l_freelist = NULL; | ||
1891 | |||
1892 | bp->l_pn = pn; | ||
1893 | bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); | ||
1894 | bp->l_ceor = 0; | ||
1895 | |||
1896 | return bp; | ||
1897 | } | ||
1898 | |||
1899 | |||
/*
 *		lbmFree()
 *
 * release a log buffer to freelist
 *
 * Locked variant: brackets lbmfree() with LCACHE_LOCK/LCACHE_UNLOCK.
 */
static void lbmFree(struct lbuf * bp)
{
	unsigned long irqflags;

	LCACHE_LOCK(irqflags);
	lbmfree(bp);
	LCACHE_UNLOCK(irqflags);
}
1915 | |||
1916 | static void lbmfree(struct lbuf * bp) | ||
1917 | { | ||
1918 | struct jfs_log *log = bp->l_log; | ||
1919 | |||
1920 | assert(bp->l_wqnext == NULL); | ||
1921 | |||
1922 | /* | ||
1923 | * return the buffer to head of freelist | ||
1924 | */ | ||
1925 | bp->l_freelist = log->lbuf_free; | ||
1926 | log->lbuf_free = bp; | ||
1927 | |||
1928 | wake_up(&log->free_wait); | ||
1929 | return; | ||
1930 | } | ||
1931 | |||
1932 | |||
1933 | /* | ||
1934 | * NAME: lbmRedrive | ||
1935 | * | ||
1936 | * FUNCTION: add a log buffer to the the log redrive list | ||
1937 | * | ||
1938 | * PARAMETER: | ||
1939 | * bp - log buffer | ||
1940 | * | ||
1941 | * NOTES: | ||
1942 | * Takes log_redrive_lock. | ||
1943 | */ | ||
1944 | static inline void lbmRedrive(struct lbuf *bp) | ||
1945 | { | ||
1946 | unsigned long flags; | ||
1947 | |||
1948 | spin_lock_irqsave(&log_redrive_lock, flags); | ||
1949 | bp->l_redrive_next = log_redrive_list; | ||
1950 | log_redrive_list = bp; | ||
1951 | spin_unlock_irqrestore(&log_redrive_lock, flags); | ||
1952 | |||
1953 | wake_up(&jfs_IO_thread_wait); | ||
1954 | } | ||
1955 | |||
1956 | |||
1957 | /* | ||
1958 | * lbmRead() | ||
1959 | */ | ||
1960 | static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) | ||
1961 | { | ||
1962 | struct bio *bio; | ||
1963 | struct lbuf *bp; | ||
1964 | |||
1965 | /* | ||
1966 | * allocate a log buffer | ||
1967 | */ | ||
1968 | *bpp = bp = lbmAllocate(log, pn); | ||
1969 | jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); | ||
1970 | |||
1971 | bp->l_flag |= lbmREAD; | ||
1972 | |||
1973 | bio = bio_alloc(GFP_NOFS, 1); | ||
1974 | |||
1975 | bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); | ||
1976 | bio->bi_bdev = log->bdev; | ||
1977 | bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata); | ||
1978 | bio->bi_io_vec[0].bv_len = LOGPSIZE; | ||
1979 | bio->bi_io_vec[0].bv_offset = 0; | ||
1980 | |||
1981 | bio->bi_vcnt = 1; | ||
1982 | bio->bi_idx = 0; | ||
1983 | bio->bi_size = LOGPSIZE; | ||
1984 | |||
1985 | bio->bi_end_io = lbmIODone; | ||
1986 | bio->bi_private = bp; | ||
1987 | submit_bio(READ_SYNC, bio); | ||
1988 | |||
1989 | wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); | ||
1990 | |||
1991 | return 0; | ||
1992 | } | ||
1993 | |||
1994 | |||
/*
 *		lbmWrite()
 *
 * queue a log buffer for pageout and, if it is at the head of the
 * write queue and flagged lbmWRITE, start (or schedule) its i/o.
 *
 * PARAMETERS:
 *	log		- log the buffer belongs to
 *	bp		- log buffer to write
 *	flag		- new value for bp->l_flag (lbmWRITE, lbmSYNC, ...)
 *	cant_block	- non-zero: do not start the i/o here, hand the
 *			  buffer to the redrive list via lbmRedrive()
 *
 * buffer at head of pageout queue stays after completion of
 * partial-page pageout and redriven by explicit initiation of
 * pageout by caller until full-page pageout is completed and
 * released.
 *
 * device driver i/o done redrives pageout of new buffer at
 * head of pageout queue when current buffer at head of pageout
 * queue is released at the completion of its full-page pageout.
 *
 * The write queue is a circular singly linked list threaded through
 * l_wqnext: log->wqueue points at the tail and tail->l_wqnext is the
 * head.
 *
 * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
 * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
 */
static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
		     int cant_block)
{
	struct lbuf *tail;
	unsigned long flags;

	jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);

	/* map the logical block address to physical block address */
	bp->l_blkno =
	    log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));

	LCACHE_LOCK(flags);		/* disable+lock */

	/*
	 * initialize buffer for device driver
	 */
	bp->l_flag = flag;

	/*
	 *	insert bp at tail of write queue associated with log
	 *
	 * (request is either for bp already/currently at head of queue
	 * or new bp to be inserted at tail)
	 */
	tail = log->wqueue;

	/* is buffer not already on write queue ? */
	if (bp->l_wqnext == NULL) {
		/* insert at tail of wqueue */
		if (tail == NULL) {
			/* empty queue: bp becomes both head and tail */
			log->wqueue = bp;
			bp->l_wqnext = bp;
		} else {
			/* bp becomes the new tail and links to the old head */
			log->wqueue = bp;
			bp->l_wqnext = tail->l_wqnext;
			tail->l_wqnext = bp;
		}

		tail = bp;
	}

	/* is buffer at head of wqueue and for write ? */
	if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
		/* no: leave it queued; nothing is started from here */
		LCACHE_UNLOCK(flags);	/* unlock+enable */
		return;
	}

	LCACHE_UNLOCK(flags);	/* unlock+enable */

	if (cant_block)
		/* defer the i/o to the jfsIO thread */
		lbmRedrive(bp);
	else if (flag & lbmSYNC)
		lbmStartIO(bp);
	else {
		/*
		 * drop LOGGC_LOCK around the i/o start and retake it
		 * afterwards (the header comment implies the caller
		 * holds it -- verify against callers)
		 */
		LOGGC_UNLOCK(log);
		lbmStartIO(bp);
		LOGGC_LOCK(log);
	}
}
2070 | |||
2071 | |||
2072 | /* | ||
2073 | * lbmDirectWrite() | ||
2074 | * | ||
2075 | * initiate pageout bypassing write queue for sidestream | ||
2076 | * (e.g., log superblock) write; | ||
2077 | */ | ||
2078 | static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) | ||
2079 | { | ||
2080 | jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x", | ||
2081 | bp, flag, bp->l_pn); | ||
2082 | |||
2083 | /* | ||
2084 | * initialize buffer for device driver | ||
2085 | */ | ||
2086 | bp->l_flag = flag | lbmDIRECT; | ||
2087 | |||
2088 | /* map the logical block address to physical block address */ | ||
2089 | bp->l_blkno = | ||
2090 | log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); | ||
2091 | |||
2092 | /* | ||
2093 | * initiate pageout of the page | ||
2094 | */ | ||
2095 | lbmStartIO(bp); | ||
2096 | } | ||
2097 | |||
2098 | |||
2099 | /* | ||
2100 | * NAME: lbmStartIO() | ||
2101 | * | ||
2102 | * FUNCTION: Interface to DD strategy routine | ||
2103 | * | ||
2104 | * RETURN: none | ||
2105 | * | ||
2106 | * serialization: LCACHE_LOCK() is NOT held during log i/o; | ||
2107 | */ | ||
2108 | static void lbmStartIO(struct lbuf * bp) | ||
2109 | { | ||
2110 | struct bio *bio; | ||
2111 | struct jfs_log *log = bp->l_log; | ||
2112 | |||
2113 | jfs_info("lbmStartIO\n"); | ||
2114 | |||
2115 | bio = bio_alloc(GFP_NOFS, 1); | ||
2116 | bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); | ||
2117 | bio->bi_bdev = log->bdev; | ||
2118 | bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata); | ||
2119 | bio->bi_io_vec[0].bv_len = LOGPSIZE; | ||
2120 | bio->bi_io_vec[0].bv_offset = 0; | ||
2121 | |||
2122 | bio->bi_vcnt = 1; | ||
2123 | bio->bi_idx = 0; | ||
2124 | bio->bi_size = LOGPSIZE; | ||
2125 | |||
2126 | bio->bi_end_io = lbmIODone; | ||
2127 | bio->bi_private = bp; | ||
2128 | |||
2129 | /* check if journaling to disk has been disabled */ | ||
2130 | if (!log->no_integrity) { | ||
2131 | submit_bio(WRITE_SYNC, bio); | ||
2132 | INCREMENT(lmStat.submitted); | ||
2133 | } | ||
2134 | else { | ||
2135 | bio->bi_size = 0; | ||
2136 | lbmIODone(bio, 0, 0); /* 2nd argument appears to not be used => 0 | ||
2137 | * 3rd argument appears to not be used => 0 | ||
2138 | */ | ||
2139 | } | ||
2140 | } | ||
2141 | |||
2142 | |||
2143 | /* | ||
2144 | * lbmIOWait() | ||
2145 | */ | ||
2146 | static int lbmIOWait(struct lbuf * bp, int flag) | ||
2147 | { | ||
2148 | unsigned long flags; | ||
2149 | int rc = 0; | ||
2150 | |||
2151 | jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); | ||
2152 | |||
2153 | LCACHE_LOCK(flags); /* disable+lock */ | ||
2154 | |||
2155 | LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); | ||
2156 | |||
2157 | rc = (bp->l_flag & lbmERROR) ? -EIO : 0; | ||
2158 | |||
2159 | if (flag & lbmFREE) | ||
2160 | lbmfree(bp); | ||
2161 | |||
2162 | LCACHE_UNLOCK(flags); /* unlock+enable */ | ||
2163 | |||
2164 | jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); | ||
2165 | return rc; | ||
2166 | } | ||
2167 | |||
/*
 *		lbmIODone()
 *
 * bio completion handler (bi_end_io) for log page reads and writes
 *
 * PARAMETERS:
 *	bio		- completed bio; bi_private is the struct lbuf
 *	bytes_done	- not referenced in this function
 *	error		- not referenced in this function; failure is
 *			  detected from the BIO_UPTODATE bit instead
 *
 * RETURN:	1 - bio->bi_size is still non-zero, nothing was done
 *		0 - completion was processed
 *
 * executed at INTIODONE level
 */
static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
{
	struct lbuf *bp = bio->bi_private;
	struct lbuf *nextbp, *tail;
	struct jfs_log *log;
	unsigned long flags;

	/* presumably a partial completion: wait for the final call */
	if (bio->bi_size)
		return 1;

	/*
	 * get back jfs buffer bound to the i/o buffer
	 */
	jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);

	LCACHE_LOCK(flags);		/* disable+lock */

	bp->l_flag |= lbmDONE;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		bp->l_flag |= lbmERROR;

		jfs_err("lbmIODone: I/O error in JFS log");
	}

	/* bio is not referenced again below */
	bio_put(bio);

	/*
	 *	pagein completion
	 */
	if (bp->l_flag & lbmREAD) {
		bp->l_flag &= ~lbmREAD;

		LCACHE_UNLOCK(flags);	/* unlock+enable */

		/* wakeup I/O initiator */
		LCACHE_WAKEUP(&bp->l_ioevent);

		return 0;
	}

	/*
	 *	pageout completion
	 *
	 * the bp at the head of write queue has completed pageout.
	 *
	 * if single-commit/full-page pageout, remove the current buffer
	 * from head of pageout queue, and redrive pageout with
	 * the new buffer at head of pageout queue;
	 * otherwise, the partial-page pageout buffer stays at
	 * the head of pageout queue to be redriven for pageout
	 * by lmGroupCommit() until full-page pageout is completed.
	 */
	bp->l_flag &= ~lbmWRITE;
	INCREMENT(lmStat.pagedone);

	/* update committed lsn */
	log = bp->l_log;
	log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;

	/* direct (sidestream) write: never on the write queue */
	if (bp->l_flag & lbmDIRECT) {
		LCACHE_WAKEUP(&bp->l_ioevent);
		LCACHE_UNLOCK(flags);
		return 0;
	}

	/* log->wqueue is the tail; tail->l_wqnext is the head */
	tail = log->wqueue;

	/* single element queue */
	if (bp == tail) {
		/* remove head buffer of full-page pageout
		 * from log device write queue
		 */
		if (bp->l_flag & lbmRELEASE) {
			log->wqueue = NULL;
			bp->l_wqnext = NULL;
		}
	}
	/* multi element queue */
	else {
		/* remove head buffer of full-page pageout
		 * from log device write queue
		 */
		if (bp->l_flag & lbmRELEASE) {
			/* unlink bp: its successor becomes the new head */
			nextbp = tail->l_wqnext = bp->l_wqnext;
			bp->l_wqnext = NULL;

			/*
			 * redrive pageout of next page at head of write queue:
			 * redrive next page without any bound tblk
			 * (i.e., page w/o any COMMIT records), or
			 * first page of new group commit which has been
			 * queued after current page (subsequent pageout
			 * is performed synchronously, except page without
			 * any COMMITs) by lmGroupCommit() as indicated
			 * by lbmWRITE flag;
			 */
			if (nextbp->l_flag & lbmWRITE) {
				/*
				 * We can't do the I/O at interrupt time.
				 * The jfsIO thread can do it
				 */
				lbmRedrive(nextbp);
			}
		}
	}

	/*
	 *	synchronous pageout:
	 *
	 * buffer has not necessarily been removed from write queue
	 * (e.g., synchronous write of partial-page with COMMIT):
	 * leave buffer for i/o initiator to dispose
	 */
	if (bp->l_flag & lbmSYNC) {
		LCACHE_UNLOCK(flags);	/* unlock+enable */

		/* wakeup I/O initiator */
		LCACHE_WAKEUP(&bp->l_ioevent);
	}

	/*
	 *	Group Commit pageout:
	 */
	else if (bp->l_flag & lbmGC) {
		/* lmPostGC() is called with LCACHE_LOCK released */
		LCACHE_UNLOCK(flags);
		lmPostGC(bp);
	}

	/*
	 *	asynchronous pageout:
	 *
	 * buffer must have been removed from write queue:
	 * insert buffer at head of freelist where it can be recycled
	 */
	else {
		assert(bp->l_flag & lbmRELEASE);
		assert(bp->l_flag & lbmFREE);
		/* lbmfree() is called with LCACHE_LOCK still held */
		lbmfree(bp);

		LCACHE_UNLOCK(flags);	/* unlock+enable */
	}

	return 0;
}
2318 | |||
/*
 *		jfsIOWait()
 *
 * body of the "jfsIO" kernel thread: starts the log i/o that
 * lbmRedrive() queued on log_redrive_list.
 *
 * PARAMETER:
 *	arg	- not referenced
 *
 * Signals jfsIOwait once at startup and again (complete_and_exit)
 * when jfs_stop_threads makes the loop terminate.
 */
int jfsIOWait(void *arg)
{
	struct lbuf *bp;

	daemonize("jfsIO");

	/* tell whoever waits on jfsIOwait that the thread is running */
	complete(&jfsIOwait);

	do {
		DECLARE_WAITQUEUE(wq, current);

		spin_lock_irq(&log_redrive_lock);
		while ((bp = log_redrive_list) != 0) {
			/* unlink the head entry ... */
			log_redrive_list = bp->l_redrive_next;
			bp->l_redrive_next = NULL;
			/* ... and start its i/o with the lock dropped */
			spin_unlock_irq(&log_redrive_lock);
			lbmStartIO(bp);
			spin_lock_irq(&log_redrive_lock);
		}
		if (current->flags & PF_FREEZE) {
			spin_unlock_irq(&log_redrive_lock);
			refrigerator(PF_FREEZE);
		} else {
			/*
			 * join the wait queue and mark ourselves sleeping
			 * before dropping the lock, so a wake_up() issued
			 * by lbmRedrive() after it queues a buffer is not
			 * missed
			 */
			add_wait_queue(&jfs_IO_thread_wait, &wq);
			set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&log_redrive_lock);
			schedule();
			current->state = TASK_RUNNING;
			remove_wait_queue(&jfs_IO_thread_wait, &wq);
		}
	} while (!jfs_stop_threads);

	jfs_info("jfsIOWait being killed!");
	complete_and_exit(&jfsIOwait, 0);
}
2354 | |||
2355 | /* | ||
2356 | * NAME: lmLogFormat()/jfs_logform() | ||
2357 | * | ||
2358 | * FUNCTION: format file system log | ||
2359 | * | ||
2360 | * PARAMETERS: | ||
2361 | * log - volume log | ||
2362 | * logAddress - start address of log space in FS block | ||
2363 | * logSize - length of log space in FS block; | ||
2364 | * | ||
2365 | * RETURN: 0 - success | ||
2366 | * -EIO - i/o error | ||
2367 | * | ||
2368 | * XXX: We're synchronously writing one page at a time. This needs to | ||
2369 | * be improved by writing multiple pages at once. | ||
2370 | */ | ||
2371 | int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) | ||
2372 | { | ||
2373 | int rc = -EIO; | ||
2374 | struct jfs_sb_info *sbi; | ||
2375 | struct logsuper *logsuper; | ||
2376 | struct logpage *lp; | ||
2377 | int lspn; /* log sequence page number */ | ||
2378 | struct lrd *lrd_ptr; | ||
2379 | int npages = 0; | ||
2380 | struct lbuf *bp; | ||
2381 | |||
2382 | jfs_info("lmLogFormat: logAddress:%Ld logSize:%d", | ||
2383 | (long long)logAddress, logSize); | ||
2384 | |||
2385 | sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list); | ||
2386 | |||
2387 | /* allocate a log buffer */ | ||
2388 | bp = lbmAllocate(log, 1); | ||
2389 | |||
2390 | npages = logSize >> sbi->l2nbperpage; | ||
2391 | |||
2392 | /* | ||
2393 | * log space: | ||
2394 | * | ||
2395 | * page 0 - reserved; | ||
2396 | * page 1 - log superblock; | ||
2397 | * page 2 - log data page: A SYNC log record is written | ||
2398 | * into this page at logform time; | ||
2399 | * pages 3-N - log data page: set to empty log data pages; | ||
2400 | */ | ||
2401 | /* | ||
2402 | * init log superblock: log page 1 | ||
2403 | */ | ||
2404 | logsuper = (struct logsuper *) bp->l_ldata; | ||
2405 | |||
2406 | logsuper->magic = cpu_to_le32(LOGMAGIC); | ||
2407 | logsuper->version = cpu_to_le32(LOGVERSION); | ||
2408 | logsuper->state = cpu_to_le32(LOGREDONE); | ||
2409 | logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */ | ||
2410 | logsuper->size = cpu_to_le32(npages); | ||
2411 | logsuper->bsize = cpu_to_le32(sbi->bsize); | ||
2412 | logsuper->l2bsize = cpu_to_le32(sbi->l2bsize); | ||
2413 | logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); | ||
2414 | |||
2415 | bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; | ||
2416 | bp->l_blkno = logAddress + sbi->nbperpage; | ||
2417 | lbmStartIO(bp); | ||
2418 | if ((rc = lbmIOWait(bp, 0))) | ||
2419 | goto exit; | ||
2420 | |||
2421 | /* | ||
2422 | * init pages 2 to npages-1 as log data pages: | ||
2423 | * | ||
2424 | * log page sequence number (lpsn) initialization: | ||
2425 | * | ||
2426 | * pn: 0 1 2 3 n-1 | ||
2427 | * +-----+-----+=====+=====+===.....===+=====+ | ||
2428 | * lspn: N-1 0 1 N-2 | ||
2429 | * <--- N page circular file ----> | ||
2430 | * | ||
2431 | * the N (= npages-2) data pages of the log is maintained as | ||
2432 | * a circular file for the log records; | ||
2433 | * lpsn grows by 1 monotonically as each log page is written | ||
2434 | * to the circular file of the log; | ||
2435 | * and setLogpage() will not reset the page number even if | ||
2436 | * the eor is equal to LOGPHDRSIZE. In order for binary search | ||
2437 | * still work in find log end process, we have to simulate the | ||
2438 | * log wrap situation at the log format time. | ||
2439 | * The 1st log page written will have the highest lpsn. Then | ||
2440 | * the succeeding log pages will have ascending order of | ||
2441 | * the lspn starting from 0, ... (N-2) | ||
2442 | */ | ||
2443 | lp = (struct logpage *) bp->l_ldata; | ||
2444 | /* | ||
2445 | * initialize 1st log page to be written: lpsn = N - 1, | ||
2446 | * write a SYNCPT log record is written to this page | ||
2447 | */ | ||
2448 | lp->h.page = lp->t.page = cpu_to_le32(npages - 3); | ||
2449 | lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); | ||
2450 | |||
2451 | lrd_ptr = (struct lrd *) &lp->data; | ||
2452 | lrd_ptr->logtid = 0; | ||
2453 | lrd_ptr->backchain = 0; | ||
2454 | lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); | ||
2455 | lrd_ptr->length = 0; | ||
2456 | lrd_ptr->log.syncpt.sync = 0; | ||
2457 | |||
2458 | bp->l_blkno += sbi->nbperpage; | ||
2459 | bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; | ||
2460 | lbmStartIO(bp); | ||
2461 | if ((rc = lbmIOWait(bp, 0))) | ||
2462 | goto exit; | ||
2463 | |||
2464 | /* | ||
2465 | * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) | ||
2466 | */ | ||
2467 | for (lspn = 0; lspn < npages - 3; lspn++) { | ||
2468 | lp->h.page = lp->t.page = cpu_to_le32(lspn); | ||
2469 | lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); | ||
2470 | |||
2471 | bp->l_blkno += sbi->nbperpage; | ||
2472 | bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; | ||
2473 | lbmStartIO(bp); | ||
2474 | if ((rc = lbmIOWait(bp, 0))) | ||
2475 | goto exit; | ||
2476 | } | ||
2477 | |||
2478 | rc = 0; | ||
2479 | exit: | ||
2480 | /* | ||
2481 | * finalize log | ||
2482 | */ | ||
2483 | /* release the buffer */ | ||
2484 | lbmFree(bp); | ||
2485 | |||
2486 | return rc; | ||
2487 | } | ||
2488 | |||
#ifdef CONFIG_JFS_STATISTICS
/*
 *		jfs_lmstats_read()
 *
 * format the log manager counters held in lmStat as text
 *
 * PARAMETERS:
 *	buffer	- destination for the formatted text
 *	start	- out: set to buffer + offset
 *	offset	- number of bytes of the text to skip
 *	length	- maximum number of bytes to report
 *	eof	- out: set to 1 when the remaining text fits in length
 *	data	- not referenced
 *
 * RETURN:	number of bytes available at *start, clamped to the
 *		range 0..length
 */
int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
		      int *eof, void *data)
{
	int avail;

	/* the whole report is always generated at the start of buffer */
	avail = sprintf(buffer,
		       "JFS Logmgr stats\n"
		       "================\n"
		       "commits = %d\n"
		       "writes submitted = %d\n"
		       "writes completed = %d\n"
		       "full pages submitted = %d\n"
		       "partial pages submitted = %d\n",
		       lmStat.commit,
		       lmStat.submitted,
		       lmStat.pagedone,
		       lmStat.full_page,
		       lmStat.partial_page);

	/* skip what the reader has already consumed */
	*start = buffer + offset;
	avail -= offset;

	if (avail > length)
		avail = length;
	else
		*eof = 1;

	/* offset beyond the end of the text: nothing left to return */
	return (avail < 0) ? 0 : avail;
}
#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h new file mode 100644 index 000000000000..141ad74010c9 --- /dev/null +++ b/fs/jfs/jfs_logmgr.h | |||
@@ -0,0 +1,510 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #ifndef _H_JFS_LOGMGR | ||
20 | #define _H_JFS_LOGMGR | ||
21 | |||
22 | #include "jfs_filsys.h" | ||
23 | #include "jfs_lock.h" | ||
24 | |||
25 | /* | ||
26 | * log manager configuration parameters | ||
27 | */ | ||
28 | |||
29 | /* log page size */ | ||
30 | #define LOGPSIZE 4096 | ||
31 | #define L2LOGPSIZE 12 | ||
32 | |||
33 | #define LOGPAGES 16 /* Log pages per mounted file system */ | ||
34 | |||
35 | /* | ||
36 | * log logical volume | ||
37 | * | ||
38 | * a log is used to make the commit operation on journalled | ||
39 | * files within the same logical volume group atomic. | ||
40 | * a log is implemented with a logical volume. | ||
41 | * there is one log per logical volume group. | ||
42 | * | ||
43 | * block 0 of the log logical volume is not used (ipl etc). | ||
44 | * block 1 contains a log "superblock" and is used by logFormat(), | ||
45 | * lmLogInit(), lmLogShutdown(), and logRedo() to record status | ||
46 | * of the log but is not otherwise used during normal processing. | ||
47 | * blocks 2 - (N-1) are used to contain log records. | ||
48 | * | ||
49 | * when a volume group is varied-on-line, logRedo() must have | ||
50 | * been executed before the file systems (logical volumes) in | ||
51 | * the volume group can be mounted. | ||
52 | */ | ||
53 | /* | ||
54 | * log superblock (block 1 of logical volume) | ||
55 | */ | ||
56 | #define LOGSUPER_B 1 | ||
57 | #define LOGSTART_B 2 | ||
58 | |||
59 | #define LOGMAGIC 0x87654321 | ||
60 | #define LOGVERSION 1 | ||
61 | |||
62 | #define MAX_ACTIVE 128 /* Max active file systems sharing log */ | ||
63 | |||
/*
 * Log superblock layout.  All multi-byte integer fields are declared
 * little-endian (__le32); lmLogFormat() fills them with cpu_to_le32().
 * The number at the start of each field comment is the field size in
 * bytes.
 */
struct logsuper {
	__le32 magic;		/* 4: log lv identifier */
	__le32 version;		/* 4: version number */
	__le32 serial;		/* 4: log open/mount counter */
	__le32 size;		/* 4: size in number of LOGPSIZE blocks */
	__le32 bsize;		/* 4: logical block size in byte */
	__le32 l2bsize;		/* 4: log2 of bsize */

	__le32 flag;		/* 4: option */
	__le32 state;		/* 4: state - see below */

	__le32 end;		/* 4: addr of last log record set by logredo */
	char uuid[16];		/* 16: 128-bit journal uuid */
	char label[16];		/* 16: journal label */
	struct {
		char uuid[16];	/* 16: uuid of one active file system */
	} active[MAX_ACTIVE];	/* 2048: active file systems list */
};
82 | |||
83 | #define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" | ||
84 | |||
85 | /* log flag: commit option (see jfs_filsys.h) */ | ||
86 | |||
87 | /* log state */ | ||
88 | #define LOGMOUNT 0 /* log mounted by lmLogInit() */ | ||
89 | #define LOGREDONE 1 /* log shutdown by lmLogShutdown(). | ||
90 | * log redo completed by logredo(). | ||
91 | */ | ||
92 | #define LOGWRAP 2 /* log wrapped */ | ||
93 | #define LOGREADERR 3 /* log read error detected in logredo() */ | ||
94 | |||
95 | |||
96 | /* | ||
97 | * log logical page | ||
98 | * | ||
99 | * (this comment should be rewritten !) | ||
100 | * the header and trailer structures (h,t) will normally have | ||
101 | * the same page and eor value. | ||
102 | * An exception to this occurs when a complete page write is not | ||
103 | * accomplished on a power failure. Since the hardware may "split write" | ||
104 | * sectors in the page, any out of order sequence may occur during powerfail | ||
105 | * and needs to be recognized during log replay. The xor value is | ||
106 | * an "exclusive or" of all log words in the page up to eor. This | ||
107 | * 32 bit eor is stored with the top 16 bits in the header and the | ||
108 | * bottom 16 bits in the trailer. logredo can easily recognize pages | ||
109 | * that were not completed by reconstructing this eor and checking | ||
110 | * the log page. | ||
111 | * | ||
112 | * Previous versions of the operating system did not allow split | ||
113 | * writes and detected partially written records in logredo by | ||
114 | * ordering the updates to the header, trailer, and the move of data | ||
115 | * into the logdata area. The order: (1) data is moved (2) header | ||
116 | * is updated (3) trailer is updated. In logredo, when the header | ||
117 | * differed from the trailer, the header and trailer were reconciled | ||
118 | * as follows: if h.page != t.page they were set to the smaller of | ||
119 | * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) | ||
120 | * h.eor != t.eor they were set to the smaller of their two values. | ||
121 | */ | ||
/*
 * One log page: an 8-byte header, the log record area, and an 8-byte
 * trailer that normally mirrors the header.  The record area is
 * LOGPSIZE / 4 - 4 32-bit words, i.e. LOGPSIZE minus the 16 bytes of
 * header and trailer.
 */
struct logpage {
	struct {		/* header */
		__le32 page;	/* 4: log sequence page number */
		__le16 rsrvd;	/* 2: */
		__le16 eor;	/* 2: end-of-log offset of last record write */
	} h;

	__le32 data[LOGPSIZE / 4 - 4];	/* log record area */

	struct {		/* trailer */
		__le32 page;	/* 4: normally the same as h.page */
		__le16 rsrvd;	/* 2: */
		__le16 eor;	/* 2: normally the same as h.eor */
	} t;
};
137 | |||
138 | #define LOGPHDRSIZE 8 /* log page header size */ | ||
139 | #define LOGPTLRSIZE 8 /* log page trailer size */ | ||
140 | |||
141 | |||
142 | /* | ||
143 | * log record | ||
144 | * | ||
145 | * (this comment should be rewritten !) | ||
146 | * jfs uses only "after" log records (only a single writer is allowed | ||
147 | * in a page, pages are written to temporary paging space if | ||
148 | * they must be written to disk before commit, and i/o is | ||
149 | * scheduled for modified pages to their home location after | ||
150 | * the log records containing the after values and the commit | ||
151 | * record is written to the log on disk, undo discards the copy | ||
152 | * in main-memory.) | ||
153 | * | ||
154 | * a log record consists of a data area of variable length followed by | ||
155 | * a descriptor of fixed size LOGRDSIZE bytes. | ||
156 | * the data area is rounded up to an integral number of 4-bytes and | ||
157 | * must be no longer than LOGPSIZE. | ||
158 | * the descriptor is of size of multiple of 4-bytes and aligned on a | ||
159 | * 4-byte boundary. | ||
160 | * records are packed one after the other in the data area of log pages. | ||
161 | * (sometimes a DUMMY record is inserted so that at least one record ends | ||
162 | * on every page or the longest record is placed on at most two pages). | ||
163 | * the field eor in page header/trailer points to the byte following | ||
164 | * the last record on a page. | ||
165 | */ | ||
166 | |||
167 | /* log record types */ | ||
168 | #define LOG_COMMIT 0x8000 | ||
169 | #define LOG_SYNCPT 0x4000 | ||
170 | #define LOG_MOUNT 0x2000 | ||
171 | #define LOG_REDOPAGE 0x0800 | ||
172 | #define LOG_NOREDOPAGE 0x0080 | ||
173 | #define LOG_NOREDOINOEXT 0x0040 | ||
174 | #define LOG_UPDATEMAP 0x0008 | ||
175 | #define LOG_NOREDOFILE 0x0001 | ||
176 | |||
177 | /* REDOPAGE/NOREDOPAGE log record data type */ | ||
178 | #define LOG_INODE 0x0001 | ||
179 | #define LOG_XTREE 0x0002 | ||
180 | #define LOG_DTREE 0x0004 | ||
181 | #define LOG_BTROOT 0x0010 | ||
182 | #define LOG_EA 0x0020 | ||
183 | #define LOG_ACL 0x0040 | ||
184 | #define LOG_DATA 0x0080 | ||
185 | #define LOG_NEW 0x0100 | ||
186 | #define LOG_EXTEND 0x0200 | ||
187 | #define LOG_RELOCATE 0x0400 | ||
188 | #define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */ | ||
189 | |||
190 | /* UPDATEMAP log record descriptor type */ | ||
191 | #define LOG_ALLOCXADLIST 0x0080 | ||
192 | #define LOG_ALLOCPXDLIST 0x0040 | ||
193 | #define LOG_ALLOCXAD 0x0020 | ||
194 | #define LOG_ALLOCPXD 0x0010 | ||
195 | #define LOG_FREEXADLIST 0x0008 | ||
196 | #define LOG_FREEPXDLIST 0x0004 | ||
197 | #define LOG_FREEXAD 0x0002 | ||
198 | #define LOG_FREEPXD 0x0001 | ||
199 | |||
200 | |||
201 | struct lrd { | ||
202 | /* | ||
203 | * type independent area | ||
204 | */ | ||
205 | __le32 logtid; /* 4: log transaction identifier */ | ||
206 | __le32 backchain; /* 4: ptr to prev record of same transaction */ | ||
207 | __le16 type; /* 2: record type */ | ||
208 | __le16 length; /* 2: length of data in record (in byte) */ | ||
209 | __le32 aggregate; /* 4: file system lv/aggregate */ | ||
210 | /* (16) */ | ||
211 | |||
212 | /* | ||
213 | * type dependent area (20) | ||
214 | */ | ||
215 | union { | ||
216 | |||
217 | /* | ||
218 | * COMMIT: commit | ||
219 | * | ||
220 | * transaction commit: no type-dependent information; | ||
221 | */ | ||
222 | |||
223 | /* | ||
224 | * REDOPAGE: after-image | ||
225 | * | ||
226 | * apply after-image; | ||
227 | * | ||
228 | * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; | ||
229 | */ | ||
230 | struct { | ||
231 | __le32 fileset; /* 4: fileset number */ | ||
232 | __le32 inode; /* 4: inode number */ | ||
233 | __le16 type; /* 2: REDOPAGE record type */ | ||
234 | __le16 l2linesize; /* 2: log2 of line size */ | ||
235 | pxd_t pxd; /* 8: on-disk page pxd */ | ||
236 | } redopage; /* (20) */ | ||
237 | |||
238 | /* | ||
239 | * NOREDOPAGE: the page is freed | ||
240 | * | ||
241 | * do not apply after-image records which precede this record | ||
242 | * in the log with the same page block number to this page. | ||
243 | * | ||
244 | * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; | ||
245 | */ | ||
246 | struct { | ||
247 | __le32 fileset; /* 4: fileset number */ | ||
248 | __le32 inode; /* 4: inode number */ | ||
249 | __le16 type; /* 2: NOREDOPAGE record type */ | ||
250 | __le16 rsrvd; /* 2: reserved */ | ||
251 | pxd_t pxd; /* 8: on-disk page pxd */ | ||
252 | } noredopage; /* (20) */ | ||
253 | |||
254 | /* | ||
255 | * UPDATEMAP: update block allocation map | ||
256 | * | ||
257 | * either in-line PXD, | ||
258 | * or out-of-line XADLIST; | ||
259 | * | ||
260 | * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; | ||
261 | */ | ||
262 | struct { | ||
263 | __le32 fileset; /* 4: fileset number */ | ||
264 | __le32 inode; /* 4: inode number */ | ||
265 | __le16 type; /* 2: UPDATEMAP record type */ | ||
266 | __le16 nxd; /* 2: number of extents */ | ||
267 | pxd_t pxd; /* 8: pxd */ | ||
268 | } updatemap; /* (20) */ | ||
269 | |||
270 | /* | ||
271 | * NOREDOINOEXT: the inode extent is freed | ||
272 | * | ||
273 | * do not apply after-image records which precede this | ||
274 | * record in the log with any of the 4 page block | ||
275 | * numbers in this inode extent. | ||
276 | * | ||
277 | * NOTE: The fileset and pxd fields MUST remain in | ||
278 | * the same fields in the REDOPAGE record format. | ||
279 | * | ||
280 | */ | ||
281 | struct { | ||
282 | __le32 fileset; /* 4: fileset number */ | ||
283 | __le32 iagnum; /* 4: IAG number */ | ||
284 | __le32 inoext_idx; /* 4: inode extent index */ | ||
285 | pxd_t pxd; /* 8: on-disk page pxd */ | ||
286 | } noredoinoext; /* (20) */ | ||
287 | |||
288 | /* | ||
289 | * SYNCPT: log sync point | ||
290 | * | ||
291 | * replay log upto syncpt address specified; | ||
292 | */ | ||
293 | struct { | ||
294 | __le32 sync; /* 4: syncpt address (0 = here) */ | ||
295 | } syncpt; | ||
296 | |||
297 | /* | ||
298 | * MOUNT: file system mount | ||
299 | * | ||
300 | * file system mount: no type-dependent information; | ||
301 | */ | ||
302 | |||
303 | /* | ||
304 | * ? FREEXTENT: free specified extent(s) | ||
305 | * | ||
306 | * free specified extent(s) from block allocation map | ||
307 | * N.B.: nextents should be length of data/sizeof(xad_t) | ||
308 | */ | ||
309 | struct { | ||
310 | __le32 type; /* 4: FREEXTENT record type */ | ||
311 | __le32 nextent; /* 4: number of extents */ | ||
312 | |||
313 | /* data: PXD or XAD list */ | ||
314 | } freextent; | ||
315 | |||
316 | /* | ||
317 | * ? NOREDOFILE: this file is freed | ||
318 | * | ||
319 | * do not apply records which precede this record in the log | ||
320 | * with the same inode number. | ||
321 | * | ||
322 | * NOREDOFILE must be the first to be written at commit | ||
323 | * (last to be read in logredo()) - it prevents | ||
324 | * replay of preceding updates of all preceding generations | ||
325 | * of the inumber esp. the on-disk inode itself, | ||
326 | * but does NOT prevent | ||
327 | * replay of the | ||
328 | */ | ||
329 | struct { | ||
330 | __le32 fileset; /* 4: fileset number */ | ||
331 | __le32 inode; /* 4: inode number */ | ||
332 | } noredofile; | ||
333 | |||
334 | /* | ||
335 | * ? NEWPAGE: | ||
336 | * | ||
337 | * metadata type dependent | ||
338 | */ | ||
339 | struct { | ||
340 | __le32 fileset; /* 4: fileset number */ | ||
341 | __le32 inode; /* 4: inode number */ | ||
342 | __le32 type; /* 4: NEWPAGE record type */ | ||
343 | pxd_t pxd; /* 8: on-disk page pxd */ | ||
344 | } newpage; | ||
345 | |||
346 | /* | ||
347 | * ? DUMMY: filler | ||
348 | * | ||
349 | * no type-dependent information | ||
350 | */ | ||
351 | } log; | ||
352 | }; /* (36) */ | ||
353 | |||
/* Size in bytes of one log record descriptor (struct lrd). */
#define LOGRDSIZE (sizeof(struct lrd))
355 | |||
/*
 *	line vector descriptor
 *
 * A little-endian (offset, length) pair, 16 bits each.
 * NOTE(review): presumably describes a run of lines within a logged page,
 * in units of the record's line size -- confirm against the log writer.
 */
struct lvd {
	__le16 offset;
	__le16 length;
};
363 | |||
364 | |||
/*
 *	log logical volume
 *
 * In-memory state of one journal.  The per-field byte counts that used to
 * prefix these comments assumed 32-bit pointers and longs, so they have
 * been dropped rather than left to mislead.
 */
struct jfs_log {

	struct list_head sb_list;/*  This is used to sync metadata
				 *    before writing syncpt.
				 */
	struct list_head journal_list; /* Global list */
	struct block_device *bdev; /* log lv pointer */
	int serial;		/* log mount serial number */

	s64 base;		/* log extent address (inline log ) */
	int size;		/* log size in log page (in page) */
	int l2bsize;		/* log2 of bsize */

	long flag;		/* log flags; see the log_* values below */

	struct lbuf *lbuf_free;	/* free lbufs */
	wait_queue_head_t free_wait;	/* presumably waiters for a free lbuf
					 * -- TODO confirm
					 */

	/* log write */
	int logtid;		/* log tid */
	int page;		/* page number of eol page */
	int eor;		/* eor of last record in eol page */
	struct lbuf *bp;	/* current log page buffer */

	struct semaphore loglock;	/* log write serialization lock */

	/* syncpt */
	int nextsync;		/* bytes to write before next syncpt */
	int active;		/* (purpose not documented in this header) */
	wait_queue_head_t syncwait;	/* (purpose not documented here) */

	/* commit */
	uint cflag;		/* presumably logGC_* group commit flags
				 * -- TODO confirm
				 */
	struct list_head cqueue; /* FIFO commit queue */
	struct tblock *flush_tblk; /* tblk we're waiting on for flush */
	int gcrtc;		/* GC_READY transaction count */
	struct tblock *gclrt;	/* latest GC_READY transaction */
	spinlock_t gclock;	/* group commit lock */
	int logsize;		/* log data area size in byte */
	int lsn;		/* end-of-log */
	int clsn;		/* clsn */
	int syncpt;		/* addr of last syncpt record */
	int sync;		/* addr from last logsync() */
	struct list_head synclist;	/* logsynclist anchor */
	spinlock_t synclock;	/* synclist lock (see LOGSYNC_LOCK) */
	struct lbuf *wqueue;	/* log pageout queue */
	int count;		/* count; decremented when an entry is taken
				 * off the logsynclist
				 */
	char uuid[16];		/* 128-bit uuid of log device */

	int no_integrity;	/* flag to disable journaling to disk */
};
419 | |||
/*
 * Log flag
 *
 * These are small consecutive integers rather than masks.
 * NOTE(review): presumably bit numbers for jfs_log.flag used with the
 * bitops helpers -- confirm against the users in jfs_logmgr.c.
 */
#define log_INLINELOG	1
#define log_SYNCBARRIER	2
#define log_QUIESCE	3
#define log_FLUSH	4

/*
 * group commit flag
 *
 * Unlike the log_* values above, these are bit masks.
 */
/* jfs_log */
#define logGC_PAGEOUT	0x00000001

/* tblock/lbuf */
#define tblkGC_QUEUE		0x0001
#define tblkGC_READY		0x0002
#define tblkGC_COMMIT		0x0004
#define tblkGC_COMMITTED	0x0008
#define tblkGC_EOP		0x0010
#define tblkGC_FREE		0x0020
#define tblkGC_LEADER		0x0040
#define tblkGC_ERROR		0x0080
#define tblkGC_LAZY		0x0100	// D230860
#define tblkGC_UNLOCKED		0x0200	// D230860
445 | |||
/*
 *		log cache buffer header
 *
 * One lbuf describes one log page held in memory.  The byte counts that
 * used to prefix the field comments assumed a 32-bit build and are omitted.
 */
struct lbuf {
	struct jfs_log *l_log;	/* log associated with buffer */

	/*
	 * data buffer base area
	 */
	uint l_flag;		/* pageout control flags */

	struct lbuf *l_wqnext;	/* write queue link */
	struct lbuf *l_freelist;	/* freelistlink */

	int l_pn;		/* log page number */
	int l_eor;		/* log record eor */
	int l_ceor;		/* committed log record eor */

	s64 l_blkno;		/* log page block number */
	caddr_t l_ldata;	/* data page */

	wait_queue_head_t l_ioevent;	/* i/o done event */
	struct page *l_page;	/* The page itself */
};

/* Reuse l_freelist for redrive list */
#define l_redrive_next l_freelist
473 | |||
/*
 *	logsynclist block
 *
 * common logsyncblk prefix for jbuf_t and tblock
 *
 * struct metapage (jfs_metapage.h) starts with the same five fields in the
 * same order, so any change here must be mirrored there.
 */
struct logsyncblk {
	u16 xflag;		/* flags */
	u16 flag;		/* only meaningful in tblock */
	lid_t lid;		/* lock id */
	s32 lsn;		/* log sequence number */
	struct list_head synclist;	/* log sync list link */
};
486 | |||
/*
 *	logsynclist serialization (per log)
 *
 * Thin wrappers around the spinlock jfs_log.synclock.  `log` is a
 * struct jfs_log pointer.
 */

#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock)
#define LOGSYNC_LOCK(log) spin_lock(&(log)->synclock)
#define LOGSYNC_UNLOCK(log) spin_unlock(&(log)->synclock)
494 | |||
/*
 * logdiff - compute the difference in bytes of lsn from sync point
 *
 * @diff: signed integer lvalue that receives the result
 * @lsn:  log sequence number to measure
 * @log:  pointer to an object with `syncpt` and `logsize` members
 *
 * The log is circular, so an lsn that sits numerically below the sync
 * point has wrapped; in that case logsize is added to keep the result
 * non-negative.
 *
 * Wrapped in do { } while (0) so the macro acts as a single statement and
 * is safe in an unbraced if/else.  Invoke it with a trailing semicolon.
 */
#define logdiff(diff, lsn, log)\
do {\
	(diff) = (lsn) - (log)->syncpt;\
	if ((diff) < 0)\
		(diff) += (log)->logsize;\
} while (0)
502 | |||
/*
 * Log manager entry points.  The definitions are not in this header.
 * NOTE(review): presumably they live in jfs_logmgr.c and return 0 or a
 * negative errno -- confirm there.
 */
extern int lmLogOpen(struct super_block *sb);
extern int lmLogClose(struct super_block *sb);
extern int lmLogShutdown(struct jfs_log * log);
extern int lmLogInit(struct jfs_log * log);
extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize);
extern void jfs_flush_journal(struct jfs_log * log, int wait);
509 | |||
510 | #endif /* _H_JFS_LOGMGR */ | ||
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c new file mode 100644 index 000000000000..4c0a3ac75c08 --- /dev/null +++ b/fs/jfs/jfs_metapage.c | |||
@@ -0,0 +1,580 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2003 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/buffer_head.h> | ||
23 | #include <linux/mempool.h> | ||
24 | #include <linux/delay.h> | ||
25 | #include "jfs_incore.h" | ||
26 | #include "jfs_superblock.h" | ||
27 | #include "jfs_filsys.h" | ||
28 | #include "jfs_metapage.h" | ||
29 | #include "jfs_txnmgr.h" | ||
30 | #include "jfs_debug.h" | ||
31 | |||
/*
 * Protects the metapage hash table and each metapage's reference count.
 * It is also held while the per-metapage META_locked bit is taken, and is
 * dropped around the sleep in __lock_metapage().
 */
static DEFINE_SPINLOCK(meta_lock);

#ifdef CONFIG_JFS_STATISTICS
/* Event counters reported by jfs_mpstat_read() */
static struct {
	uint	pagealloc;	/* # of page allocations */
	uint	pagefree;	/* # of page frees */
	uint	lockwait;	/* # of sleeping lock_metapage() calls */
} mpStat;
#endif


/*
 * 1 << HASH_BITS buckets.  With 4-byte pointers the table is one 4K page;
 * with 8-byte pointers it is twice that.  metapage_init() derives the
 * allocation order from sizeof(void *), so either case is handled.
 */
#define HASH_BITS 10
#define HASH_SIZE (1 << HASH_BITS)
static struct metapage **hash_table = NULL;
/* page order passed to __get_free_pages() for hash_table */
static unsigned long hash_order;
47 | |||
48 | |||
/* Return non-zero if the META_locked bit is currently set in mp->flag. */
static inline int metapage_locked(struct metapage *mp)
{
	return test_bit(META_locked, &mp->flag);
}
53 | |||
/*
 * Atomically set META_locked and return its PREVIOUS value.
 *
 * Note the inverted sense compared with most trylock helpers: 0 means the
 * lock was acquired, non-zero means it was already held (callers such as
 * lock_metapage() go to sleep on a non-zero return).
 */
static inline int trylock_metapage(struct metapage *mp)
{
	return test_and_set_bit(META_locked, &mp->flag);
}
58 | |||
/*
 * Drop the metapage lock and wake anyone sleeping on mp->wait
 * (waiters are queued there by __lock_metapage()).
 */
static inline void unlock_metapage(struct metapage *mp)
{
	clear_bit(META_locked, &mp->flag);
	wake_up(&mp->wait);
}
64 | |||
/*
 * Slow path of lock_metapage(): sleep until META_locked can be taken.
 *
 * Must be entered with meta_lock held.  The spinlock is released only
 * around schedule() and is held again on return.
 */
static void __lock_metapage(struct metapage *mp)
{
	DECLARE_WAITQUEUE(wait, current);

	INCREMENT(mpStat.lockwait);

	add_wait_queue_exclusive(&mp->wait, &wait);
	do {
		/*
		 * Set the task state before re-testing the bit so that a
		 * wake_up() from unlock_metapage() between the test and
		 * schedule() is not lost.
		 */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (metapage_locked(mp)) {
			spin_unlock(&meta_lock);
			schedule();
			spin_lock(&meta_lock);
		}
		/* trylock_metapage() returns 0 once we own the lock */
	} while (trylock_metapage(mp));
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&mp->wait, &wait);
}
83 | |||
/*
 * Take the per-metapage lock, sleeping if it is contended.
 *
 * needs meta_lock: the caller must hold it; it may be dropped and
 * re-taken inside __lock_metapage() when the lock is contended.
 */
static inline void lock_metapage(struct metapage *mp)
{
	/* non-zero from trylock_metapage() means "already locked" */
	if (trylock_metapage(mp))
		__lock_metapage(mp);
}
90 | |||
/* Number of struct metapage objects the mempool keeps in reserve */
#define METAPOOL_MIN_PAGES 32
/* Slab cache backing the mempool; objects are set up by init_once() */
static kmem_cache_t *metapage_cache;
/* Every struct metapage is allocated from and returned to this pool */
static mempool_t *metapage_mempool;
94 | |||
95 | static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) | ||
96 | { | ||
97 | struct metapage *mp = (struct metapage *)foo; | ||
98 | |||
99 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | ||
100 | SLAB_CTOR_CONSTRUCTOR) { | ||
101 | mp->lid = 0; | ||
102 | mp->lsn = 0; | ||
103 | mp->flag = 0; | ||
104 | mp->data = NULL; | ||
105 | mp->clsn = 0; | ||
106 | mp->log = NULL; | ||
107 | set_bit(META_free, &mp->flag); | ||
108 | init_waitqueue_head(&mp->wait); | ||
109 | } | ||
110 | } | ||
111 | |||
/*
 * Allocate a struct metapage from metapage_mempool.
 *
 * gfp_mask is passed straight to mempool_alloc().  __get_metapage() treats
 * a NULL return from a GFP_ATOMIC request as possible and retries with
 * GFP_NOFS.
 */
static inline struct metapage *alloc_metapage(int gfp_mask)
{
	return mempool_alloc(metapage_mempool, gfp_mask);
}
116 | |||
/*
 * Return a metapage to the mempool.  All flags are cleared except
 * META_free, matching the state init_once() leaves a new object in.
 */
static inline void free_metapage(struct metapage *mp)
{
	mp->flag = 0;
	set_bit(META_free, &mp->flag);

	mempool_free(mp, metapage_mempool);
}
124 | |||
125 | int __init metapage_init(void) | ||
126 | { | ||
127 | /* | ||
128 | * Allocate the metapage structures | ||
129 | */ | ||
130 | metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), | ||
131 | 0, 0, init_once, NULL); | ||
132 | if (metapage_cache == NULL) | ||
133 | return -ENOMEM; | ||
134 | |||
135 | metapage_mempool = mempool_create(METAPOOL_MIN_PAGES, mempool_alloc_slab, | ||
136 | mempool_free_slab, metapage_cache); | ||
137 | |||
138 | if (metapage_mempool == NULL) { | ||
139 | kmem_cache_destroy(metapage_cache); | ||
140 | return -ENOMEM; | ||
141 | } | ||
142 | /* | ||
143 | * Now the hash list | ||
144 | */ | ||
145 | for (hash_order = 0; | ||
146 | ((PAGE_SIZE << hash_order) / sizeof(void *)) < HASH_SIZE; | ||
147 | hash_order++); | ||
148 | hash_table = | ||
149 | (struct metapage **) __get_free_pages(GFP_KERNEL, hash_order); | ||
150 | assert(hash_table); | ||
151 | memset(hash_table, 0, PAGE_SIZE << hash_order); | ||
152 | |||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | void metapage_exit(void) | ||
157 | { | ||
158 | mempool_destroy(metapage_mempool); | ||
159 | kmem_cache_destroy(metapage_cache); | ||
160 | } | ||
161 | |||
162 | /* | ||
163 | * Basically same hash as in pagemap.h, but using our hash table | ||
164 | */ | ||
165 | static struct metapage **meta_hash(struct address_space *mapping, | ||
166 | unsigned long index) | ||
167 | { | ||
168 | #define i (((unsigned long)mapping)/ \ | ||
169 | (sizeof(struct inode) & ~(sizeof(struct inode) -1 ))) | ||
170 | #define s(x) ((x) + ((x) >> HASH_BITS)) | ||
171 | return hash_table + (s(i + index) & (HASH_SIZE - 1)); | ||
172 | #undef i | ||
173 | #undef s | ||
174 | } | ||
175 | |||
176 | static struct metapage *search_hash(struct metapage ** hash_ptr, | ||
177 | struct address_space *mapping, | ||
178 | unsigned long index) | ||
179 | { | ||
180 | struct metapage *ptr; | ||
181 | |||
182 | for (ptr = *hash_ptr; ptr; ptr = ptr->hash_next) { | ||
183 | if ((ptr->mapping == mapping) && (ptr->index == index)) | ||
184 | return ptr; | ||
185 | } | ||
186 | |||
187 | return NULL; | ||
188 | } | ||
189 | |||
190 | static void add_to_hash(struct metapage * mp, struct metapage ** hash_ptr) | ||
191 | { | ||
192 | if (*hash_ptr) | ||
193 | (*hash_ptr)->hash_prev = mp; | ||
194 | |||
195 | mp->hash_prev = NULL; | ||
196 | mp->hash_next = *hash_ptr; | ||
197 | *hash_ptr = mp; | ||
198 | } | ||
199 | |||
200 | static void remove_from_hash(struct metapage * mp, struct metapage ** hash_ptr) | ||
201 | { | ||
202 | if (mp->hash_prev) | ||
203 | mp->hash_prev->hash_next = mp->hash_next; | ||
204 | else { | ||
205 | assert(*hash_ptr == mp); | ||
206 | *hash_ptr = mp->hash_next; | ||
207 | } | ||
208 | |||
209 | if (mp->hash_next) | ||
210 | mp->hash_next->hash_prev = mp->hash_prev; | ||
211 | } | ||
212 | |||
213 | struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, | ||
214 | unsigned int size, int absolute, | ||
215 | unsigned long new) | ||
216 | { | ||
217 | struct metapage **hash_ptr; | ||
218 | int l2BlocksPerPage; | ||
219 | int l2bsize; | ||
220 | struct address_space *mapping; | ||
221 | struct metapage *mp; | ||
222 | unsigned long page_index; | ||
223 | unsigned long page_offset; | ||
224 | |||
225 | jfs_info("__get_metapage: inode = 0x%p, lblock = 0x%lx", inode, lblock); | ||
226 | |||
227 | if (absolute) | ||
228 | mapping = inode->i_sb->s_bdev->bd_inode->i_mapping; | ||
229 | else { | ||
230 | /* | ||
231 | * If an nfs client tries to read an inode that is larger | ||
232 | * than any existing inodes, we may try to read past the | ||
233 | * end of the inode map | ||
234 | */ | ||
235 | if ((lblock << inode->i_blkbits) >= inode->i_size) | ||
236 | return NULL; | ||
237 | mapping = inode->i_mapping; | ||
238 | } | ||
239 | |||
240 | hash_ptr = meta_hash(mapping, lblock); | ||
241 | again: | ||
242 | spin_lock(&meta_lock); | ||
243 | mp = search_hash(hash_ptr, mapping, lblock); | ||
244 | if (mp) { | ||
245 | page_found: | ||
246 | if (test_bit(META_stale, &mp->flag)) { | ||
247 | spin_unlock(&meta_lock); | ||
248 | msleep(1); | ||
249 | goto again; | ||
250 | } | ||
251 | mp->count++; | ||
252 | lock_metapage(mp); | ||
253 | spin_unlock(&meta_lock); | ||
254 | if (test_bit(META_discard, &mp->flag)) { | ||
255 | if (!new) { | ||
256 | jfs_error(inode->i_sb, | ||
257 | "__get_metapage: using a " | ||
258 | "discarded metapage"); | ||
259 | release_metapage(mp); | ||
260 | return NULL; | ||
261 | } | ||
262 | clear_bit(META_discard, &mp->flag); | ||
263 | } | ||
264 | jfs_info("__get_metapage: found 0x%p, in hash", mp); | ||
265 | if (mp->logical_size != size) { | ||
266 | jfs_error(inode->i_sb, | ||
267 | "__get_metapage: mp->logical_size != size"); | ||
268 | release_metapage(mp); | ||
269 | return NULL; | ||
270 | } | ||
271 | } else { | ||
272 | l2bsize = inode->i_blkbits; | ||
273 | l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; | ||
274 | page_index = lblock >> l2BlocksPerPage; | ||
275 | page_offset = (lblock - (page_index << l2BlocksPerPage)) << | ||
276 | l2bsize; | ||
277 | if ((page_offset + size) > PAGE_CACHE_SIZE) { | ||
278 | spin_unlock(&meta_lock); | ||
279 | jfs_err("MetaData crosses page boundary!!"); | ||
280 | return NULL; | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * Locks held on aggregate inode pages are usually | ||
285 | * not held long, and they are taken in critical code | ||
286 | * paths (committing dirty inodes, txCommit thread) | ||
287 | * | ||
288 | * Attempt to get metapage without blocking, tapping into | ||
289 | * reserves if necessary. | ||
290 | */ | ||
291 | mp = NULL; | ||
292 | if (JFS_IP(inode)->fileset == AGGREGATE_I) { | ||
293 | mp = alloc_metapage(GFP_ATOMIC); | ||
294 | if (!mp) { | ||
295 | /* | ||
296 | * mempool is supposed to protect us from | ||
297 | * failing here. We will try a blocking | ||
298 | * call, but a deadlock is possible here | ||
299 | */ | ||
300 | printk(KERN_WARNING | ||
301 | "__get_metapage: atomic call to mempool_alloc failed.\n"); | ||
302 | printk(KERN_WARNING | ||
303 | "Will attempt blocking call\n"); | ||
304 | } | ||
305 | } | ||
306 | if (!mp) { | ||
307 | struct metapage *mp2; | ||
308 | |||
309 | spin_unlock(&meta_lock); | ||
310 | mp = alloc_metapage(GFP_NOFS); | ||
311 | spin_lock(&meta_lock); | ||
312 | |||
313 | /* we dropped the meta_lock, we need to search the | ||
314 | * hash again. | ||
315 | */ | ||
316 | mp2 = search_hash(hash_ptr, mapping, lblock); | ||
317 | if (mp2) { | ||
318 | free_metapage(mp); | ||
319 | mp = mp2; | ||
320 | goto page_found; | ||
321 | } | ||
322 | } | ||
323 | mp->flag = 0; | ||
324 | lock_metapage(mp); | ||
325 | if (absolute) | ||
326 | set_bit(META_absolute, &mp->flag); | ||
327 | mp->xflag = COMMIT_PAGE; | ||
328 | mp->count = 1; | ||
329 | atomic_set(&mp->nohomeok,0); | ||
330 | mp->mapping = mapping; | ||
331 | mp->index = lblock; | ||
332 | mp->page = NULL; | ||
333 | mp->logical_size = size; | ||
334 | add_to_hash(mp, hash_ptr); | ||
335 | spin_unlock(&meta_lock); | ||
336 | |||
337 | if (new) { | ||
338 | jfs_info("__get_metapage: Calling grab_cache_page"); | ||
339 | mp->page = grab_cache_page(mapping, page_index); | ||
340 | if (!mp->page) { | ||
341 | jfs_err("grab_cache_page failed!"); | ||
342 | goto freeit; | ||
343 | } else { | ||
344 | INCREMENT(mpStat.pagealloc); | ||
345 | unlock_page(mp->page); | ||
346 | } | ||
347 | } else { | ||
348 | jfs_info("__get_metapage: Calling read_cache_page"); | ||
349 | mp->page = read_cache_page(mapping, lblock, | ||
350 | (filler_t *)mapping->a_ops->readpage, NULL); | ||
351 | if (IS_ERR(mp->page)) { | ||
352 | jfs_err("read_cache_page failed!"); | ||
353 | goto freeit; | ||
354 | } else | ||
355 | INCREMENT(mpStat.pagealloc); | ||
356 | } | ||
357 | mp->data = kmap(mp->page) + page_offset; | ||
358 | } | ||
359 | |||
360 | if (new) | ||
361 | memset(mp->data, 0, PSIZE); | ||
362 | |||
363 | jfs_info("__get_metapage: returning = 0x%p", mp); | ||
364 | return mp; | ||
365 | |||
366 | freeit: | ||
367 | spin_lock(&meta_lock); | ||
368 | remove_from_hash(mp, hash_ptr); | ||
369 | free_metapage(mp); | ||
370 | spin_unlock(&meta_lock); | ||
371 | return NULL; | ||
372 | } | ||
373 | |||
/*
 * hold_metapage - take an additional reference on a metapage
 *
 * @mp:    metapage to hold
 * @force: zero to take the metapage lock, sleeping if needed; non-zero to
 *         never sleep
 *
 * With @force set and the lock already held, META_forced is set instead
 * of waiting.  The matching release_metapage() then only drops the
 * reference and leaves the lock with its holder.
 */
void hold_metapage(struct metapage * mp, int force)
{
	spin_lock(&meta_lock);

	mp->count++;

	if (force) {
		ASSERT (!(test_bit(META_forced, &mp->flag)));
		/* non-zero return: the lock was already held */
		if (trylock_metapage(mp))
			set_bit(META_forced, &mp->flag);
	} else
		lock_metapage(mp);

	spin_unlock(&meta_lock);
}
389 | |||
390 | static void __write_metapage(struct metapage * mp) | ||
391 | { | ||
392 | int l2bsize = mp->mapping->host->i_blkbits; | ||
393 | int l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; | ||
394 | unsigned long page_index; | ||
395 | unsigned long page_offset; | ||
396 | int rc; | ||
397 | |||
398 | jfs_info("__write_metapage: mp = 0x%p", mp); | ||
399 | |||
400 | page_index = mp->page->index; | ||
401 | page_offset = | ||
402 | (mp->index - (page_index << l2BlocksPerPage)) << l2bsize; | ||
403 | |||
404 | lock_page(mp->page); | ||
405 | rc = mp->mapping->a_ops->prepare_write(NULL, mp->page, page_offset, | ||
406 | page_offset + | ||
407 | mp->logical_size); | ||
408 | if (rc) { | ||
409 | jfs_err("prepare_write return %d!", rc); | ||
410 | ClearPageUptodate(mp->page); | ||
411 | unlock_page(mp->page); | ||
412 | clear_bit(META_dirty, &mp->flag); | ||
413 | return; | ||
414 | } | ||
415 | rc = mp->mapping->a_ops->commit_write(NULL, mp->page, page_offset, | ||
416 | page_offset + | ||
417 | mp->logical_size); | ||
418 | if (rc) { | ||
419 | jfs_err("commit_write returned %d", rc); | ||
420 | } | ||
421 | |||
422 | unlock_page(mp->page); | ||
423 | clear_bit(META_dirty, &mp->flag); | ||
424 | |||
425 | jfs_info("__write_metapage done"); | ||
426 | } | ||
427 | |||
/*
 * Write the metapage's page out if it has buffers attached.  An extra
 * page reference is held for the duration of the call.
 */
static inline void sync_metapage(struct metapage *mp)
{
	struct page *page = mp->page;

	page_cache_get(page);
	lock_page(page);

	/* we're done with this page - no need to check for errors */
	/*
	 * NOTE(review): only the no-buffers branch unlocks the page, so
	 * write_one_page() presumably drops the page lock itself -- confirm.
	 */
	if (page_has_buffers(page))
		write_one_page(page, 1);
	else
		unlock_page(page);
	page_cache_release(page);
}
442 | |||
/*
 * release_metapage - drop a reference obtained from __get_metapage() or
 * hold_metapage()
 *
 * There are three outcomes:
 *  - META_forced is set (hold_metapage(mp, 1) found the lock taken): only
 *    the flag and the reference are dropped; the lock stays with its owner.
 *  - Other references remain, or nohomeok is non-zero: the metapage is
 *    unlocked and left in the hash.
 *  - Otherwise this is the final release: the page is written, synced or
 *    invalidated according to META_dirty / META_sync / META_discard, the
 *    metapage is taken off the logsynclist and the hash, and then freed.
 */
void release_metapage(struct metapage * mp)
{
	struct jfs_log *log;

	jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag);

	spin_lock(&meta_lock);
	if (test_bit(META_forced, &mp->flag)) {
		clear_bit(META_forced, &mp->flag);
		mp->count--;
		spin_unlock(&meta_lock);
		return;
	}

	assert(mp->count);
	if (--mp->count || atomic_read(&mp->nohomeok)) {
		unlock_metapage(mp);
		spin_unlock(&meta_lock);
		return;
	}

	if (mp->page) {
		/*
		 * META_stale makes concurrent lookups sleep and retry, so
		 * meta_lock can be dropped for the page I/O while the
		 * metapage is still in the hash.
		 */
		set_bit(META_stale, &mp->flag);
		spin_unlock(&meta_lock);
		kunmap(mp->page);
		mp->data = NULL;
		if (test_bit(META_dirty, &mp->flag))
			__write_metapage(mp);
		if (test_bit(META_sync, &mp->flag)) {
			sync_metapage(mp);
			clear_bit(META_sync, &mp->flag);
		}

		if (test_bit(META_discard, &mp->flag)) {
			lock_page(mp->page);
			block_invalidatepage(mp->page, 0);
			unlock_page(mp->page);
		}

		page_cache_release(mp->page);
		mp->page = NULL;
		INCREMENT(mpStat.pagefree);
		spin_lock(&meta_lock);
	}

	if (mp->lsn) {
		/*
		 * Remove metapage from logsynclist.
		 */
		log = mp->log;
		LOGSYNC_LOCK(log);
		mp->log = NULL;
		mp->lsn = 0;
		mp->clsn = 0;
		log->count--;
		list_del(&mp->synclist);
		LOGSYNC_UNLOCK(log);
	}
	remove_from_hash(mp, meta_hash(mp->mapping, mp->index));
	spin_unlock(&meta_lock);

	free_metapage(mp);
}
506 | |||
/*
 * __invalidate_metapages - invalidate cached metadata for a block extent
 *
 * @ip:   any inode on the file system; supplies the superblock and block
 *        size
 * @addr: first block of the extent
 * @len:  number of blocks in the extent
 *
 * Walks the extent one page worth of blocks at a time, always in the
 * block device's mapping.  A cached metapage is marked META_discard (and
 * loses META_dirty) so its eventual release invalidates the page instead
 * of writing it.  Where no metapage exists, the page-cache page itself,
 * if present, is invalidated directly.
 *
 * NOTE(review): lblock is unsigned long while addr is s64, so the block
 * number is narrowed where unsigned long is 32 bits -- confirm that
 * callers cannot pass addresses outside that range.
 */
void __invalidate_metapages(struct inode *ip, s64 addr, int len)
{
	struct metapage **hash_ptr;
	unsigned long lblock;
	int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits;
	/* All callers are interested in block device's mapping */
	struct address_space *mapping = ip->i_sb->s_bdev->bd_inode->i_mapping;
	struct metapage *mp;
	struct page *page;

	/*
	 * First, mark metapages to discard.  They will eventually be
	 * released, but should not be written.
	 */
	for (lblock = addr; lblock < addr + len;
	     lblock += 1 << l2BlocksPerPage) {
		hash_ptr = meta_hash(mapping, lblock);
again:
		spin_lock(&meta_lock);
		mp = search_hash(hash_ptr, mapping, lblock);
		if (mp) {
			/*
			 * A stale metapage is in the middle of its final
			 * release; wait for that to finish and look again.
			 */
			if (test_bit(META_stale, &mp->flag)) {
				spin_unlock(&meta_lock);
				msleep(1);
				goto again;
			}

			clear_bit(META_dirty, &mp->flag);
			set_bit(META_discard, &mp->flag);
			spin_unlock(&meta_lock);
		} else {
			spin_unlock(&meta_lock);
			page = find_lock_page(mapping, lblock>>l2BlocksPerPage);
			if (page) {
				block_invalidatepage(page, 0);
				unlock_page(page);
				page_cache_release(page);
			}
		}
	}
}
548 | |||
#ifdef CONFIG_JFS_STATISTICS
/*
 * jfs_mpstat_read - report the metapage statistics as text
 *
 * @buffer: destination for the formatted text
 * @start:  set to the position in @buffer that corresponds to @offset
 * @offset: byte offset into the report the reader has reached
 * @length: maximum number of bytes the reader accepts
 * @eof:    set to 1 when the remainder of the report fits in @length
 * @data:   unused
 *
 * Returns the number of bytes available at *@start, never negative.
 * The counters are unsigned, so they are printed with %u.
 */
int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
		    int *eof, void *data)
{
	int len;

	len = sprintf(buffer,
		      "JFS Metapage statistics\n"
		      "=======================\n"
		      "page allocations = %u\n"
		      "page frees = %u\n"
		      "lock waits = %u\n",
		      mpStat.pagealloc,
		      mpStat.pagefree,
		      mpStat.lockwait);

	*start = buffer + offset;
	len -= offset;

	if (len > length)
		len = length;
	else
		*eof = 1;

	/* an offset past the end of the report yields nothing */
	if (len < 0)
		len = 0;

	return len;
}
#endif
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h new file mode 100644 index 000000000000..0e58aba58c37 --- /dev/null +++ b/fs/jfs/jfs_metapage.h | |||
@@ -0,0 +1,115 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * Portions Copyright (c) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #ifndef _H_JFS_METAPAGE | ||
20 | #define _H_JFS_METAPAGE | ||
21 | |||
22 | #include <linux/pagemap.h> | ||
23 | |||
/*
 * In-memory descriptor for one piece of cached metadata.  It ties a
 * (mapping, index) key to the page-cache page holding the data and
 * carries the lock, reference count and journal state for it.
 */
struct metapage {
	/* Common logsyncblk prefix (see jfs_logmgr.h) */
	u16 xflag;
	u16 unused;
	lid_t lid;
	int lsn;
	struct list_head synclist;
	/* End of logsyncblk prefix */

	unsigned long flag;	/* See Below */
	unsigned long count;	/* Reference count */
	void *data;		/* Data pointer */

	/* list management stuff */
	struct metapage *hash_prev;
	struct metapage *hash_next;	/* Also used for free list */

	/*
	 * mapping & index become redundant, but we need these here to
	 * add the metapage to the hash before we have the real page
	 */
	struct address_space *mapping;
	unsigned long index;
	wait_queue_head_t wait;	/* tasks waiting for META_locked */

	/* implementation */
	struct page *page;	/* backing page; NULL when none is attached */
	unsigned long logical_size;	/* size in bytes requested by the user */

	/* Journal management */
	int clsn;
	atomic_t nohomeok;	/* non-zero keeps the final release from
				 * tearing the metapage down
				 */
	struct jfs_log *log;
};
58 | |||
/*
 * metapage flag
 *
 * These are bit numbers in metapage.flag, used with set_bit(),
 * clear_bit() and test_bit() -- they are not masks.
 */
#define META_locked	0	/* the per-metapage lock is held */
#define META_absolute	1	/* uses the block device's mapping */
#define META_free	2	/* object is in its freshly freed state */
#define META_dirty	3	/* written back on the final release */
#define META_sync	4	/* page is synced on the final release */
#define META_discard	5	/* page is invalidated, not written */
#define META_forced	6	/* held by hold_metapage(force) without the lock */
#define META_stale	7	/* final release in progress; lookups retry */

#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag)
70 | |||
/* function prototypes */
extern struct metapage *__get_metapage(struct inode *inode,
				  unsigned long lblock, unsigned int size,
				  int absolute, unsigned long new);

/* Get the metapage for an existing block; its page is read in if needed. */
#define read_metapage(inode, lblock, size, absolute)\
	 __get_metapage(inode, lblock, size, absolute, FALSE)

/* Get the metapage for a new block; the data is zero-filled, not read. */
#define get_metapage(inode, lblock, size, absolute)\
	 __get_metapage(inode, lblock, size, absolute, TRUE)

extern void release_metapage(struct metapage *);
extern void hold_metapage(struct metapage *, int);
84 | |||
/*
 * Mark the metapage dirty and drop the caller's reference.  The data is
 * written when the last reference is released.
 */
static inline void write_metapage(struct metapage *mp)
{
	set_bit(META_dirty, &mp->flag);
	release_metapage(mp);
}
90 | |||
/*
 * Like write_metapage(), but also sets META_sync so the page is synced
 * out when the last reference is released.
 */
static inline void flush_metapage(struct metapage *mp)
{
	set_bit(META_sync, &mp->flag);
	write_metapage(mp);
}
96 | |||
/*
 * Drop the caller's reference without writing the data back: META_dirty
 * is cleared and META_discard set, so the final release invalidates the
 * page instead of writing it.
 */
static inline void discard_metapage(struct metapage *mp)
{
	clear_bit(META_dirty, &mp->flag);
	set_bit(META_discard, &mp->flag);
	release_metapage(mp);
}
103 | |||
/*
 * These routines invalidate all pages for an extent.
 *
 * Each wrapper extracts the start address and length from a pxd, dxd or
 * xad extent descriptor and passes them to __invalidate_metapages().
 */
extern void __invalidate_metapages(struct inode *, s64, int);
#define invalidate_pxd_metapages(ip, pxd) \
	__invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd)))
#define invalidate_dxd_metapages(ip, dxd) \
	__invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd)))
#define invalidate_xad_metapages(ip, xad) \
	__invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad)))
114 | |||
115 | #endif /* _H_JFS_METAPAGE */ | ||
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c new file mode 100644 index 000000000000..c535ffd638e8 --- /dev/null +++ b/fs/jfs/jfs_mount.c | |||
@@ -0,0 +1,512 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * Module: jfs_mount.c | ||
21 | * | ||
22 | * note: file system in transition to aggregate/fileset: | ||
23 | * | ||
24 | * file system mount is interpreted as the mount of aggregate, | ||
25 | * if not already mounted, and mount of the single/only fileset in | ||
26 | * the aggregate; | ||
27 | * | ||
28 | * a file system/aggregate is represented by an internal inode | ||
29 | * (aka mount inode) initialized with aggregate superblock; | ||
30 | * each vfs represents a fileset, and points to its "fileset inode | ||
31 | * allocation map inode" (aka fileset inode): | ||
32 | * (an aggregate itself is structured recursively as a fileset: | ||
33 | * an internal vfs is constructed and points to its "fileset inode | ||
34 | * allocation map inode" (aka aggregate inode) where each inode | ||
35 | * represents a fileset inode) so that inode number is mapped to | ||
36 | * on-disk inode in uniform way at both aggregate and fileset level; | ||
37 | * | ||
38 | * each vnode/inode of a fileset is linked to its vfs (to facilitate | ||
39 | * per fileset inode operations, e.g., unmount of a fileset, etc.); | ||
40 | * each inode points to the mount inode (to facilitate access to | ||
41 | * per aggregate information, e.g., block size, etc.) as well as | ||
42 | * its file set inode. | ||
43 | * | ||
44 | * aggregate | ||
45 | * ipmnt | ||
46 | * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap; | ||
47 | * fileset vfs -> vp(1) <-> ... <-> vp(n) <->vproot; | ||
48 | */ | ||
49 | |||
50 | #include <linux/fs.h> | ||
51 | #include <linux/buffer_head.h> | ||
52 | |||
53 | #include "jfs_incore.h" | ||
54 | #include "jfs_filsys.h" | ||
55 | #include "jfs_superblock.h" | ||
56 | #include "jfs_dmap.h" | ||
57 | #include "jfs_imap.h" | ||
58 | #include "jfs_metapage.h" | ||
59 | #include "jfs_debug.h" | ||
60 | |||
61 | |||
62 | /* | ||
63 | * forward references | ||
64 | */ | ||
65 | static int chkSuper(struct super_block *); | ||
66 | static int logMOUNT(struct super_block *sb); | ||
67 | |||
68 | /* | ||
69 | * NAME: jfs_mount(sb) | ||
70 | * | ||
71 | * FUNCTION: vfs_mount() | ||
72 | * | ||
73 | * PARAMETER: sb - super block | ||
74 | * | ||
75 | * RETURN: -EBUSY - device already mounted or open for write | ||
76 | * -EBUSY - cvrdvp already mounted; | ||
77 | * -EBUSY - mount table full | ||
78 | * -ENOTDIR- cvrdvp not directory on a device mount | ||
79 | * -ENXIO - device open failure | ||
80 | */ | ||
81 | int jfs_mount(struct super_block *sb) | ||
82 | { | ||
83 | int rc = 0; /* Return code */ | ||
84 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
85 | struct inode *ipaimap = NULL; | ||
86 | struct inode *ipaimap2 = NULL; | ||
87 | struct inode *ipimap = NULL; | ||
88 | struct inode *ipbmap = NULL; | ||
89 | |||
90 | /* | ||
91 | * read/validate superblock | ||
92 | * (initialize mount inode from the superblock) | ||
93 | */ | ||
94 | if ((rc = chkSuper(sb))) { | ||
95 | goto errout20; | ||
96 | } | ||
97 | |||
98 | ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); | ||
99 | if (ipaimap == NULL) { | ||
100 | jfs_err("jfs_mount: Faild to read AGGREGATE_I"); | ||
101 | rc = -EIO; | ||
102 | goto errout20; | ||
103 | } | ||
104 | sbi->ipaimap = ipaimap; | ||
105 | |||
106 | jfs_info("jfs_mount: ipaimap:0x%p", ipaimap); | ||
107 | |||
108 | /* | ||
109 | * initialize aggregate inode allocation map | ||
110 | */ | ||
111 | if ((rc = diMount(ipaimap))) { | ||
112 | jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc); | ||
113 | goto errout21; | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * open aggregate block allocation map | ||
118 | */ | ||
119 | ipbmap = diReadSpecial(sb, BMAP_I, 0); | ||
120 | if (ipbmap == NULL) { | ||
121 | rc = -EIO; | ||
122 | goto errout22; | ||
123 | } | ||
124 | |||
125 | jfs_info("jfs_mount: ipbmap:0x%p", ipbmap); | ||
126 | |||
127 | sbi->ipbmap = ipbmap; | ||
128 | |||
129 | /* | ||
130 | * initialize aggregate block allocation map | ||
131 | */ | ||
132 | if ((rc = dbMount(ipbmap))) { | ||
133 | jfs_err("jfs_mount: dbMount failed w/rc = %d", rc); | ||
134 | goto errout22; | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * open the secondary aggregate inode allocation map | ||
139 | * | ||
140 | * This is a duplicate of the aggregate inode allocation map. | ||
141 | * | ||
142 | * hand craft a vfs in the same fashion as we did to read ipaimap. | ||
143 | * By adding INOSPEREXT (32) to the inode number, we are telling | ||
144 | * diReadSpecial that we are reading from the secondary aggregate | ||
145 | * inode table. This also creates a unique entry in the inode hash | ||
146 | * table. | ||
147 | */ | ||
148 | if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { | ||
149 | ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); | ||
150 | if (ipaimap2 == 0) { | ||
151 | jfs_err("jfs_mount: Faild to read AGGREGATE_I"); | ||
152 | rc = -EIO; | ||
153 | goto errout35; | ||
154 | } | ||
155 | sbi->ipaimap2 = ipaimap2; | ||
156 | |||
157 | jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2); | ||
158 | |||
159 | /* | ||
160 | * initialize secondary aggregate inode allocation map | ||
161 | */ | ||
162 | if ((rc = diMount(ipaimap2))) { | ||
163 | jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d", | ||
164 | rc); | ||
165 | goto errout35; | ||
166 | } | ||
167 | } else | ||
168 | /* Secondary aggregate inode table is not valid */ | ||
169 | sbi->ipaimap2 = NULL; | ||
170 | |||
171 | /* | ||
172 | * mount (the only/single) fileset | ||
173 | */ | ||
174 | /* | ||
175 | * open fileset inode allocation map (aka fileset inode) | ||
176 | */ | ||
177 | ipimap = diReadSpecial(sb, FILESYSTEM_I, 0); | ||
178 | if (ipimap == NULL) { | ||
179 | jfs_err("jfs_mount: Failed to read FILESYSTEM_I"); | ||
180 | /* open fileset secondary inode allocation map */ | ||
181 | rc = -EIO; | ||
182 | goto errout40; | ||
183 | } | ||
184 | jfs_info("jfs_mount: ipimap:0x%p", ipimap); | ||
185 | |||
186 | /* map further access of per fileset inodes by the fileset inode */ | ||
187 | sbi->ipimap = ipimap; | ||
188 | |||
189 | /* initialize fileset inode allocation map */ | ||
190 | if ((rc = diMount(ipimap))) { | ||
191 | jfs_err("jfs_mount: diMount failed w/rc = %d", rc); | ||
192 | goto errout41; | ||
193 | } | ||
194 | |||
195 | goto out; | ||
196 | |||
197 | /* | ||
198 | * unwind on error | ||
199 | */ | ||
200 | errout41: /* close fileset inode allocation map inode */ | ||
201 | diFreeSpecial(ipimap); | ||
202 | |||
203 | errout40: /* fileset closed */ | ||
204 | |||
205 | /* close secondary aggregate inode allocation map */ | ||
206 | if (ipaimap2) { | ||
207 | diUnmount(ipaimap2, 1); | ||
208 | diFreeSpecial(ipaimap2); | ||
209 | } | ||
210 | |||
211 | errout35: | ||
212 | |||
213 | /* close aggregate block allocation map */ | ||
214 | dbUnmount(ipbmap, 1); | ||
215 | diFreeSpecial(ipbmap); | ||
216 | |||
217 | errout22: /* close aggregate inode allocation map */ | ||
218 | |||
219 | diUnmount(ipaimap, 1); | ||
220 | |||
221 | errout21: /* close aggregate inodes */ | ||
222 | diFreeSpecial(ipaimap); | ||
223 | errout20: /* aggregate closed */ | ||
224 | |||
225 | out: | ||
226 | |||
227 | if (rc) | ||
228 | jfs_err("Mount JFS Failure: %d", rc); | ||
229 | |||
230 | return rc; | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * NAME: jfs_mount_rw(sb, remount) | ||
235 | * | ||
236 | * FUNCTION: Completes read-write mount, or remounts read-only volume | ||
237 | * as read-write | ||
238 | */ | ||
239 | int jfs_mount_rw(struct super_block *sb, int remount) | ||
240 | { | ||
241 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
242 | int rc; | ||
243 | |||
244 | /* | ||
245 | * If we are re-mounting a previously read-only volume, we want to | ||
246 | * re-read the inode and block maps, since fsck.jfs may have updated | ||
247 | * them. | ||
248 | */ | ||
249 | if (remount) { | ||
250 | if (chkSuper(sb) || (sbi->state != FM_CLEAN)) | ||
251 | return -EINVAL; | ||
252 | |||
253 | truncate_inode_pages(sbi->ipimap->i_mapping, 0); | ||
254 | truncate_inode_pages(sbi->ipbmap->i_mapping, 0); | ||
255 | diUnmount(sbi->ipimap, 1); | ||
256 | if ((rc = diMount(sbi->ipimap))) { | ||
257 | jfs_err("jfs_mount_rw: diMount failed!"); | ||
258 | return rc; | ||
259 | } | ||
260 | |||
261 | dbUnmount(sbi->ipbmap, 1); | ||
262 | if ((rc = dbMount(sbi->ipbmap))) { | ||
263 | jfs_err("jfs_mount_rw: dbMount failed!"); | ||
264 | return rc; | ||
265 | } | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * open/initialize log | ||
270 | */ | ||
271 | if ((rc = lmLogOpen(sb))) | ||
272 | return rc; | ||
273 | |||
274 | /* | ||
275 | * update file system superblock; | ||
276 | */ | ||
277 | if ((rc = updateSuper(sb, FM_MOUNT))) { | ||
278 | jfs_err("jfs_mount: updateSuper failed w/rc = %d", rc); | ||
279 | lmLogClose(sb); | ||
280 | return rc; | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * write MOUNT log record of the file system | ||
285 | */ | ||
286 | logMOUNT(sb); | ||
287 | |||
288 | /* | ||
289 | * Set page cache allocation policy | ||
290 | */ | ||
291 | mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS); | ||
292 | |||
293 | return rc; | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * chkSuper() | ||
298 | * | ||
299 | * validate the superblock of the file system to be mounted and | ||
300 | * get the file system parameters. | ||
301 | * | ||
302 | * returns | ||
303 | * 0 with fragsize set if check successful | ||
304 | * error code if not successful | ||
305 | */ | ||
306 | static int chkSuper(struct super_block *sb) | ||
307 | { | ||
308 | int rc = 0; | ||
309 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
310 | struct jfs_superblock *j_sb; | ||
311 | struct buffer_head *bh; | ||
312 | int AIM_bytesize, AIT_bytesize; | ||
313 | int expected_AIM_bytesize, expected_AIT_bytesize; | ||
314 | s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr; | ||
315 | s64 byte_addr_diff0, byte_addr_diff1; | ||
316 | s32 bsize; | ||
317 | |||
318 | if ((rc = readSuper(sb, &bh))) | ||
319 | return rc; | ||
320 | j_sb = (struct jfs_superblock *)bh->b_data; | ||
321 | |||
322 | /* | ||
323 | * validate superblock | ||
324 | */ | ||
325 | /* validate fs signature */ | ||
326 | if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) || | ||
327 | le32_to_cpu(j_sb->s_version) > JFS_VERSION) { | ||
328 | rc = -EINVAL; | ||
329 | goto out; | ||
330 | } | ||
331 | |||
332 | bsize = le32_to_cpu(j_sb->s_bsize); | ||
333 | #ifdef _JFS_4K | ||
334 | if (bsize != PSIZE) { | ||
335 | jfs_err("Currently only 4K block size supported!"); | ||
336 | rc = -EINVAL; | ||
337 | goto out; | ||
338 | } | ||
339 | #endif /* _JFS_4K */ | ||
340 | |||
341 | jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx", | ||
342 | le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), | ||
343 | (unsigned long long) le64_to_cpu(j_sb->s_size)); | ||
344 | |||
345 | /* validate the descriptors for Secondary AIM and AIT */ | ||
346 | if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != | ||
347 | cpu_to_le32(JFS_BAD_SAIT)) { | ||
348 | expected_AIM_bytesize = 2 * PSIZE; | ||
349 | AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; | ||
350 | expected_AIT_bytesize = 4 * PSIZE; | ||
351 | AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; | ||
352 | AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; | ||
353 | AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; | ||
354 | byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; | ||
355 | fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; | ||
356 | byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; | ||
357 | if ((AIM_bytesize != expected_AIM_bytesize) || | ||
358 | (AIT_bytesize != expected_AIT_bytesize) || | ||
359 | (byte_addr_diff0 != AIM_bytesize) || | ||
360 | (byte_addr_diff1 <= AIT_bytesize)) | ||
361 | j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); | ||
362 | } | ||
363 | |||
364 | if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) != | ||
365 | cpu_to_le32(JFS_GROUPCOMMIT)) | ||
366 | j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT); | ||
367 | |||
368 | /* validate fs state */ | ||
369 | if (j_sb->s_state != cpu_to_le32(FM_CLEAN) && | ||
370 | !(sb->s_flags & MS_RDONLY)) { | ||
371 | jfs_err("jfs_mount: Mount Failure: File System Dirty."); | ||
372 | rc = -EINVAL; | ||
373 | goto out; | ||
374 | } | ||
375 | |||
376 | sbi->state = le32_to_cpu(j_sb->s_state); | ||
377 | sbi->mntflag = le32_to_cpu(j_sb->s_flag); | ||
378 | |||
379 | /* | ||
380 | * JFS always does I/O by 4K pages. Don't tell the buffer cache | ||
381 | * that we use anything else (leave s_blocksize alone). | ||
382 | */ | ||
383 | sbi->bsize = bsize; | ||
384 | sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); | ||
385 | |||
386 | /* | ||
387 | * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer | ||
388 | * cache. | ||
389 | */ | ||
390 | sbi->nbperpage = PSIZE >> sbi->l2bsize; | ||
391 | sbi->l2nbperpage = L2PSIZE - sbi->l2bsize; | ||
392 | sbi->l2niperblk = sbi->l2bsize - L2DISIZE; | ||
393 | if (sbi->mntflag & JFS_INLINELOG) | ||
394 | sbi->logpxd = j_sb->s_logpxd; | ||
395 | else { | ||
396 | sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev)); | ||
397 | memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid)); | ||
398 | memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid)); | ||
399 | } | ||
400 | sbi->fsckpxd = j_sb->s_fsckpxd; | ||
401 | sbi->ait2 = j_sb->s_ait2; | ||
402 | |||
403 | out: | ||
404 | brelse(bh); | ||
405 | return rc; | ||
406 | } | ||
407 | |||
408 | |||
409 | /* | ||
410 | * updateSuper() | ||
411 | * | ||
412 | * update synchronously superblock if it is mounted read-write. | ||
413 | */ | ||
414 | int updateSuper(struct super_block *sb, uint state) | ||
415 | { | ||
416 | struct jfs_superblock *j_sb; | ||
417 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
418 | struct buffer_head *bh; | ||
419 | int rc; | ||
420 | |||
421 | if (sbi->flag & JFS_NOINTEGRITY) { | ||
422 | if (state == FM_DIRTY) { | ||
423 | sbi->p_state = state; | ||
424 | return 0; | ||
425 | } else if (state == FM_MOUNT) { | ||
426 | sbi->p_state = sbi->state; | ||
427 | state = FM_DIRTY; | ||
428 | } else if (state == FM_CLEAN) { | ||
429 | state = sbi->p_state; | ||
430 | } else | ||
431 | jfs_err("updateSuper: bad state"); | ||
432 | } else if (sbi->state == FM_DIRTY) | ||
433 | return 0; | ||
434 | |||
435 | if ((rc = readSuper(sb, &bh))) | ||
436 | return rc; | ||
437 | |||
438 | j_sb = (struct jfs_superblock *)bh->b_data; | ||
439 | |||
440 | j_sb->s_state = cpu_to_le32(state); | ||
441 | sbi->state = state; | ||
442 | |||
443 | if (state == FM_MOUNT) { | ||
444 | /* record log's dev_t and mount serial number */ | ||
445 | j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); | ||
446 | j_sb->s_logserial = cpu_to_le32(sbi->log->serial); | ||
447 | } else if (state == FM_CLEAN) { | ||
448 | /* | ||
449 | * If this volume is shared with OS/2, OS/2 will need to | ||
450 | * recalculate DASD usage, since we don't deal with it. | ||
451 | */ | ||
452 | if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED)) | ||
453 | j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME); | ||
454 | } | ||
455 | |||
456 | mark_buffer_dirty(bh); | ||
457 | sync_dirty_buffer(bh); | ||
458 | brelse(bh); | ||
459 | |||
460 | return 0; | ||
461 | } | ||
462 | |||
463 | |||
464 | /* | ||
465 | * readSuper() | ||
466 | * | ||
467 | * read superblock by raw sector address | ||
468 | */ | ||
469 | int readSuper(struct super_block *sb, struct buffer_head **bpp) | ||
470 | { | ||
471 | /* read in primary superblock */ | ||
472 | *bpp = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits); | ||
473 | if (*bpp) | ||
474 | return 0; | ||
475 | |||
476 | /* read in secondary/replicated superblock */ | ||
477 | *bpp = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); | ||
478 | if (*bpp) | ||
479 | return 0; | ||
480 | |||
481 | return -EIO; | ||
482 | } | ||
483 | |||
484 | |||
485 | /* | ||
486 | * logMOUNT() | ||
487 | * | ||
488 | * function: write a MOUNT log record for file system. | ||
489 | * | ||
490 | * MOUNT record keeps logredo() from processing log records | ||
491 | * for this file system past this point in log. | ||
492 | * it is harmless if mount fails. | ||
493 | * | ||
494 | * note: MOUNT record is at aggregate level, not at fileset level, | ||
495 | * since log records of previous mounts of a fileset | ||
496 | * (e.g., AFTER record of extent allocation) have to be processed | ||
497 | * to update block allocation map at aggregate level. | ||
498 | */ | ||
499 | static int logMOUNT(struct super_block *sb) | ||
500 | { | ||
501 | struct jfs_log *log = JFS_SBI(sb)->log; | ||
502 | struct lrd lrd; | ||
503 | |||
504 | lrd.logtid = 0; | ||
505 | lrd.backchain = 0; | ||
506 | lrd.type = cpu_to_le16(LOG_MOUNT); | ||
507 | lrd.length = 0; | ||
508 | lrd.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev)); | ||
509 | lmLog(log, NULL, &lrd, NULL); | ||
510 | |||
511 | return 0; | ||
512 | } | ||
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h new file mode 100644 index 000000000000..ab0566f70cfa --- /dev/null +++ b/fs/jfs/jfs_superblock.h | |||
@@ -0,0 +1,113 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2003 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_SUPERBLOCK | ||
19 | #define _H_JFS_SUPERBLOCK | ||
20 | |||
21 | /* | ||
22 | * make the magic number something a human could read | ||
23 | */ | ||
24 | #define JFS_MAGIC "JFS1" /* Magic word */ | ||
25 | |||
26 | #define JFS_VERSION 2 /* Version number: Version 2 */ | ||
27 | |||
28 | #define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */ | ||
29 | |||
30 | /* | ||
31 | * aggregate superblock | ||
32 | * | ||
33 | * The name superblock is too close to super_block, so the name has been | ||
34 | * changed to jfs_superblock. The utilities are still using the old name. | ||
35 | */ | ||
36 | struct jfs_superblock { | ||
37 | char s_magic[4]; /* 4: magic number */ | ||
38 | __le32 s_version; /* 4: version number */ | ||
39 | |||
40 | __le64 s_size; /* 8: aggregate size in hardware/LVM blocks; | ||
41 | * VFS: number of blocks | ||
42 | */ | ||
43 | __le32 s_bsize; /* 4: aggregate block size in bytes; | ||
44 | * VFS: fragment size | ||
45 | */ | ||
46 | __le16 s_l2bsize; /* 2: log2 of s_bsize */ | ||
47 | __le16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */ | ||
48 | __le32 s_pbsize; /* 4: hardware/LVM block size in bytes */ | ||
49 | __le16 s_l2pbsize; /* 2: log2 of s_pbsize */ | ||
50 | __le16 pad; /* 2: padding necessary for alignment */ | ||
51 | |||
52 | __le32 s_agsize; /* 4: allocation group size in aggr. blocks */ | ||
53 | |||
54 | __le32 s_flag; /* 4: aggregate attributes: | ||
55 | * see jfs_filsys.h | ||
56 | */ | ||
57 | __le32 s_state; /* 4: mount/unmount/recovery state: | ||
58 | * see jfs_filsys.h | ||
59 | */ | ||
60 | __le32 s_compress; /* 4: > 0 if data compression */ | ||
61 | |||
62 | pxd_t s_ait2; /* 8: first extent of secondary | ||
63 | * aggregate inode table | ||
64 | */ | ||
65 | |||
66 | pxd_t s_aim2; /* 8: first extent of secondary | ||
67 | * aggregate inode map | ||
68 | */ | ||
69 | __le32 s_logdev; /* 4: device address of log */ | ||
70 | __le32 s_logserial; /* 4: log serial number at aggregate mount */ | ||
71 | pxd_t s_logpxd; /* 8: inline log extent */ | ||
72 | |||
73 | pxd_t s_fsckpxd; /* 8: inline fsck work space extent */ | ||
74 | |||
75 | struct timestruc_t s_time; /* 8: time last updated */ | ||
76 | |||
77 | __le32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for | ||
78 | * the fsck service log. | ||
79 | * N.B. These blocks are divided among the | ||
80 | * versions kept. This is not a per | ||
81 | * version size. | ||
82 | * N.B. These blocks are included in the | ||
83 | * length field of s_fsckpxd. | ||
84 | */ | ||
85 | s8 s_fscklog; /* 1: which fsck service log is most recent | ||
86 | * 0 => no service log data yet | ||
87 | * 1 => the first one | ||
88 | * 2 => the 2nd one | ||
89 | */ | ||
90 | char s_fpack[11]; /* 11: file system volume name | ||
91 | * N.B. This must be 11 bytes to | ||
92 | * conform with the OS/2 BootSector | ||
93 | * requirements | ||
94 | * Only used when s_version is 1 | ||
95 | */ | ||
96 | |||
97 | /* extendfs() parameter under s_state & FM_EXTENDFS */ | ||
98 | __le64 s_xsize; /* 8: extendfs s_size */ | ||
99 | pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */ | ||
100 | pxd_t s_xlogpxd; /* 8: extendfs logpxd */ | ||
101 | /* - 128 byte boundary - */ | ||
102 | |||
103 | char s_uuid[16]; /* 16: 128-bit uuid for volume */ | ||
104 | char s_label[16]; /* 16: volume label */ | ||
105 | char s_loguuid[16]; /* 16: 128-bit uuid for log device */ | ||
106 | |||
107 | }; | ||
108 | |||
109 | extern int readSuper(struct super_block *, struct buffer_head **); | ||
110 | extern int updateSuper(struct super_block *, uint); | ||
111 | extern void jfs_error(struct super_block *, const char *, ...); | ||
112 | |||
113 | #endif /*_H_JFS_SUPERBLOCK */ | ||
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c new file mode 100644 index 000000000000..f40301d93f74 --- /dev/null +++ b/fs/jfs/jfs_txnmgr.c | |||
@@ -0,0 +1,3131 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2005 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * jfs_txnmgr.c: transaction manager | ||
22 | * | ||
23 | * notes: | ||
24 | * transaction starts with txBegin() and ends with txCommit() | ||
25 | * or txAbort(). | ||
26 | * | ||
27 | * tlock is acquired at the time of update; | ||
28 | * (obviate scan at commit time for xtree and dtree) | ||
29 | * tlock and mp points to each other; | ||
30 | * (no hashlist for mp -> tlock). | ||
31 | * | ||
32 | * special cases: | ||
33 | * tlock on in-memory inode: | ||
34 | * in-place tlock in the in-memory inode itself; | ||
35 | * converted to page lock by iWrite() at commit time. | ||
36 | * | ||
37 | * tlock during write()/mmap() under anonymous transaction (tid = 0): | ||
38 | * transferred (?) to transaction at commit time. | ||
39 | * | ||
40 | * use the page itself to update allocation maps | ||
41 | * (obviate intermediate replication of allocation/deallocation data) | ||
42 | * hold on to mp+lock thru update of maps | ||
43 | */ | ||
44 | |||
45 | |||
46 | #include <linux/fs.h> | ||
47 | #include <linux/vmalloc.h> | ||
48 | #include <linux/smp_lock.h> | ||
49 | #include <linux/completion.h> | ||
50 | #include <linux/suspend.h> | ||
51 | #include <linux/module.h> | ||
52 | #include <linux/moduleparam.h> | ||
53 | #include "jfs_incore.h" | ||
54 | #include "jfs_filsys.h" | ||
55 | #include "jfs_metapage.h" | ||
56 | #include "jfs_dinode.h" | ||
57 | #include "jfs_imap.h" | ||
58 | #include "jfs_dmap.h" | ||
59 | #include "jfs_superblock.h" | ||
60 | #include "jfs_debug.h" | ||
61 | |||
62 | /* | ||
63 | * transaction management structures | ||
64 | */ | ||
65 | static struct { | ||
66 | int freetid; /* index of a free tid structure */ | ||
67 | int freelock; /* index first free lock word */ | ||
68 | wait_queue_head_t freewait; /* eventlist of free tblock */ | ||
69 | wait_queue_head_t freelockwait; /* eventlist of free tlock */ | ||
70 | wait_queue_head_t lowlockwait; /* eventlist of ample tlocks */ | ||
71 | int tlocksInUse; /* Number of tlocks in use */ | ||
72 | spinlock_t LazyLock; /* synchronize sync_queue & unlock_queue */ | ||
73 | /* struct tblock *sync_queue; * Transactions waiting for data sync */ | ||
74 | struct list_head unlock_queue; /* Txns waiting to be released */ | ||
75 | struct list_head anon_list; /* inodes having anonymous txns */ | ||
76 | struct list_head anon_list2; /* inodes having anonymous txns | ||
77 | that couldn't be sync'ed */ | ||
78 | } TxAnchor; | ||
79 | |||
80 | int jfs_tlocks_low; /* Indicates low number of available tlocks */ | ||
81 | |||
82 | #ifdef CONFIG_JFS_STATISTICS | ||
83 | static struct { | ||
84 | uint txBegin; | ||
85 | uint txBegin_barrier; | ||
86 | uint txBegin_lockslow; | ||
87 | uint txBegin_freetid; | ||
88 | uint txBeginAnon; | ||
89 | uint txBeginAnon_barrier; | ||
90 | uint txBeginAnon_lockslow; | ||
91 | uint txLockAlloc; | ||
92 | uint txLockAlloc_freelock; | ||
93 | } TxStat; | ||
94 | #endif | ||
95 | |||
96 | static int nTxBlock = -1; /* number of transaction blocks */ | ||
97 | module_param(nTxBlock, int, 0); | ||
98 | MODULE_PARM_DESC(nTxBlock, | ||
99 | "Number of transaction blocks (max:65536)"); | ||
100 | |||
101 | static int nTxLock = -1; /* number of transaction locks */ | ||
102 | module_param(nTxLock, int, 0); | ||
103 | MODULE_PARM_DESC(nTxLock, | ||
104 | "Number of transaction locks (max:65536)"); | ||
105 | |||
106 | struct tblock *TxBlock; /* transaction block table */ | ||
107 | static int TxLockLWM; /* Low water mark for number of txLocks used */ | ||
108 | static int TxLockHWM; /* High water mark for number of txLocks used */ | ||
109 | static int TxLockVHWM; /* Very High water mark */ | ||
110 | struct tlock *TxLock; /* transaction lock table */ | ||
111 | |||
112 | |||
113 | /* | ||
114 | * transaction management lock | ||
115 | */ | ||
116 | static DEFINE_SPINLOCK(jfsTxnLock); | ||
117 | |||
118 | #define TXN_LOCK() spin_lock(&jfsTxnLock) | ||
119 | #define TXN_UNLOCK() spin_unlock(&jfsTxnLock) | ||
120 | |||
121 | #define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); | ||
122 | #define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) | ||
123 | #define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags) | ||
124 | |||
125 | DECLARE_WAIT_QUEUE_HEAD(jfs_sync_thread_wait); | ||
126 | DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait); | ||
127 | static int jfs_commit_thread_waking; | ||
128 | |||
129 | /* | ||
130 | * Retry logic exist outside these macros to protect from spurrious wakeups. | ||
131 | */ | ||
132 | static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) | ||
133 | { | ||
134 | DECLARE_WAITQUEUE(wait, current); | ||
135 | |||
136 | add_wait_queue(event, &wait); | ||
137 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
138 | TXN_UNLOCK(); | ||
139 | schedule(); | ||
140 | current->state = TASK_RUNNING; | ||
141 | remove_wait_queue(event, &wait); | ||
142 | } | ||
143 | |||
144 | #define TXN_SLEEP(event)\ | ||
145 | {\ | ||
146 | TXN_SLEEP_DROP_LOCK(event);\ | ||
147 | TXN_LOCK();\ | ||
148 | } | ||
149 | |||
150 | #define TXN_WAKEUP(event) wake_up_all(event) | ||
151 | |||
152 | |||
153 | /* | ||
154 | * statistics | ||
155 | */ | ||
156 | static struct { | ||
157 | tid_t maxtid; /* 4: biggest tid ever used */ | ||
158 | lid_t maxlid; /* 4: biggest lid ever used */ | ||
159 | int ntid; /* 4: # of transactions performed */ | ||
160 | int nlid; /* 4: # of tlocks acquired */ | ||
161 | int waitlock; /* 4: # of tlock wait */ | ||
162 | } stattx; | ||
163 | |||
164 | |||
165 | /* | ||
166 | * external references | ||
167 | */ | ||
168 | extern int lmGroupCommit(struct jfs_log *, struct tblock *); | ||
169 | extern int jfs_commit_inode(struct inode *, int); | ||
170 | extern int jfs_stop_threads; | ||
171 | |||
172 | extern struct completion jfsIOwait; | ||
173 | |||
174 | /* | ||
175 | * forward references | ||
176 | */ | ||
177 | static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
178 | struct tlock * tlck, struct commit * cd); | ||
179 | static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
180 | struct tlock * tlck); | ||
181 | static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
182 | struct tlock * tlck); | ||
183 | static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
184 | struct tlock * tlck); | ||
185 | static void txAllocPMap(struct inode *ip, struct maplock * maplock, | ||
186 | struct tblock * tblk); | ||
187 | static void txForce(struct tblock * tblk); | ||
188 | static int txLog(struct jfs_log * log, struct tblock * tblk, | ||
189 | struct commit * cd); | ||
190 | static void txUpdateMap(struct tblock * tblk); | ||
191 | static void txRelease(struct tblock * tblk); | ||
192 | static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
193 | struct tlock * tlck); | ||
194 | static void LogSyncRelease(struct metapage * mp); | ||
195 | |||
196 | /* | ||
197 | * transaction block/lock management | ||
198 | * --------------------------------- | ||
199 | */ | ||
200 | |||
201 | /* | ||
202 | * Get a transaction lock from the free list. If the number in use is | ||
203 | * greater than the high water mark, wake up the sync daemon. This should | ||
204 | * free some anonymous transaction locks. (TXN_LOCK must be held.) | ||
205 | */ | ||
206 | static lid_t txLockAlloc(void) | ||
207 | { | ||
208 | lid_t lid; | ||
209 | |||
210 | INCREMENT(TxStat.txLockAlloc); | ||
211 | if (!TxAnchor.freelock) { | ||
212 | INCREMENT(TxStat.txLockAlloc_freelock); | ||
213 | } | ||
214 | |||
215 | while (!(lid = TxAnchor.freelock)) | ||
216 | TXN_SLEEP(&TxAnchor.freelockwait); | ||
217 | TxAnchor.freelock = TxLock[lid].next; | ||
218 | HIGHWATERMARK(stattx.maxlid, lid); | ||
219 | if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) { | ||
220 | jfs_info("txLockAlloc tlocks low"); | ||
221 | jfs_tlocks_low = 1; | ||
222 | wake_up(&jfs_sync_thread_wait); | ||
223 | } | ||
224 | |||
225 | return lid; | ||
226 | } | ||
227 | |||
228 | static void txLockFree(lid_t lid) | ||
229 | { | ||
230 | TxLock[lid].next = TxAnchor.freelock; | ||
231 | TxAnchor.freelock = lid; | ||
232 | TxAnchor.tlocksInUse--; | ||
233 | if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) { | ||
234 | jfs_info("txLockFree jfs_tlocks_low no more"); | ||
235 | jfs_tlocks_low = 0; | ||
236 | TXN_WAKEUP(&TxAnchor.lowlockwait); | ||
237 | } | ||
238 | TXN_WAKEUP(&TxAnchor.freelockwait); | ||
239 | } | ||
240 | |||
241 | /* | ||
242 | * NAME: txInit() | ||
243 | * | ||
244 | * FUNCTION: initialize transaction management structures | ||
245 | * | ||
246 | * RETURN: | ||
247 | * | ||
248 | * serialization: single thread at jfs_init() | ||
249 | */ | ||
250 | int txInit(void) | ||
251 | { | ||
252 | int k, size; | ||
253 | struct sysinfo si; | ||
254 | |||
255 | /* Set defaults for nTxLock and nTxBlock if unset */ | ||
256 | |||
257 | if (nTxLock == -1) { | ||
258 | if (nTxBlock == -1) { | ||
259 | /* Base default on memory size */ | ||
260 | si_meminfo(&si); | ||
261 | if (si.totalram > (256 * 1024)) /* 1 GB */ | ||
262 | nTxLock = 64 * 1024; | ||
263 | else | ||
264 | nTxLock = si.totalram >> 2; | ||
265 | } else if (nTxBlock > (8 * 1024)) | ||
266 | nTxLock = 64 * 1024; | ||
267 | else | ||
268 | nTxLock = nTxBlock << 3; | ||
269 | } | ||
270 | if (nTxBlock == -1) | ||
271 | nTxBlock = nTxLock >> 3; | ||
272 | |||
273 | /* Verify tunable parameters */ | ||
274 | if (nTxBlock < 16) | ||
275 | nTxBlock = 16; /* No one should set it this low */ | ||
276 | if (nTxBlock > 65536) | ||
277 | nTxBlock = 65536; | ||
278 | if (nTxLock < 256) | ||
279 | nTxLock = 256; /* No one should set it this low */ | ||
280 | if (nTxLock > 65536) | ||
281 | nTxLock = 65536; | ||
282 | |||
283 | printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n", | ||
284 | nTxBlock, nTxLock); | ||
285 | /* | ||
286 | * initialize transaction block (tblock) table | ||
287 | * | ||
288 | * transaction id (tid) = tblock index | ||
289 | * tid = 0 is reserved. | ||
290 | */ | ||
291 | TxLockLWM = (nTxLock * 4) / 10; | ||
292 | TxLockHWM = (nTxLock * 7) / 10; | ||
293 | TxLockVHWM = (nTxLock * 8) / 10; | ||
294 | |||
295 | size = sizeof(struct tblock) * nTxBlock; | ||
296 | TxBlock = (struct tblock *) vmalloc(size); | ||
297 | if (TxBlock == NULL) | ||
298 | return -ENOMEM; | ||
299 | |||
300 | for (k = 1; k < nTxBlock - 1; k++) { | ||
301 | TxBlock[k].next = k + 1; | ||
302 | init_waitqueue_head(&TxBlock[k].gcwait); | ||
303 | init_waitqueue_head(&TxBlock[k].waitor); | ||
304 | } | ||
305 | TxBlock[k].next = 0; | ||
306 | init_waitqueue_head(&TxBlock[k].gcwait); | ||
307 | init_waitqueue_head(&TxBlock[k].waitor); | ||
308 | |||
309 | TxAnchor.freetid = 1; | ||
310 | init_waitqueue_head(&TxAnchor.freewait); | ||
311 | |||
312 | stattx.maxtid = 1; /* statistics */ | ||
313 | |||
314 | /* | ||
315 | * initialize transaction lock (tlock) table | ||
316 | * | ||
317 | * transaction lock id = tlock index | ||
318 | * tlock id = 0 is reserved. | ||
319 | */ | ||
320 | size = sizeof(struct tlock) * nTxLock; | ||
321 | TxLock = (struct tlock *) vmalloc(size); | ||
322 | if (TxLock == NULL) { | ||
323 | vfree(TxBlock); | ||
324 | return -ENOMEM; | ||
325 | } | ||
326 | |||
327 | /* initialize tlock table */ | ||
328 | for (k = 1; k < nTxLock - 1; k++) | ||
329 | TxLock[k].next = k + 1; | ||
330 | TxLock[k].next = 0; | ||
331 | init_waitqueue_head(&TxAnchor.freelockwait); | ||
332 | init_waitqueue_head(&TxAnchor.lowlockwait); | ||
333 | |||
334 | TxAnchor.freelock = 1; | ||
335 | TxAnchor.tlocksInUse = 0; | ||
336 | INIT_LIST_HEAD(&TxAnchor.anon_list); | ||
337 | INIT_LIST_HEAD(&TxAnchor.anon_list2); | ||
338 | |||
339 | LAZY_LOCK_INIT(); | ||
340 | INIT_LIST_HEAD(&TxAnchor.unlock_queue); | ||
341 | |||
342 | stattx.maxlid = 1; /* statistics */ | ||
343 | |||
344 | return 0; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * NAME: txExit() | ||
349 | * | ||
350 | * FUNCTION: clean up when module is unloaded | ||
351 | */ | ||
352 | void txExit(void) | ||
353 | { | ||
354 | vfree(TxLock); | ||
355 | TxLock = NULL; | ||
356 | vfree(TxBlock); | ||
357 | TxBlock = NULL; | ||
358 | } | ||
359 | |||
360 | |||
361 | /* | ||
362 | * NAME: txBegin() | ||
363 | * | ||
364 | * FUNCTION: start a transaction. | ||
365 | * | ||
366 | * PARAMETER: sb - superblock | ||
367 | * flag - force for nested tx; | ||
368 | * | ||
369 | * RETURN: tid - transaction id | ||
370 | * | ||
371 | * note: flag force allows to start tx for nested tx | ||
372 | * to prevent deadlock on logsync barrier; | ||
373 | */ | ||
374 | tid_t txBegin(struct super_block *sb, int flag) | ||
375 | { | ||
376 | tid_t t; | ||
377 | struct tblock *tblk; | ||
378 | struct jfs_log *log; | ||
379 | |||
380 | jfs_info("txBegin: flag = 0x%x", flag); | ||
381 | log = JFS_SBI(sb)->log; | ||
382 | |||
383 | TXN_LOCK(); | ||
384 | |||
385 | INCREMENT(TxStat.txBegin); | ||
386 | |||
387 | retry: | ||
388 | if (!(flag & COMMIT_FORCE)) { | ||
389 | /* | ||
390 | * synchronize with logsync barrier | ||
391 | */ | ||
392 | if (test_bit(log_SYNCBARRIER, &log->flag) || | ||
393 | test_bit(log_QUIESCE, &log->flag)) { | ||
394 | INCREMENT(TxStat.txBegin_barrier); | ||
395 | TXN_SLEEP(&log->syncwait); | ||
396 | goto retry; | ||
397 | } | ||
398 | } | ||
399 | if (flag == 0) { | ||
400 | /* | ||
401 | * Don't begin transaction if we're getting starved for tlocks | ||
402 | * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately | ||
403 | * free tlocks) | ||
404 | */ | ||
405 | if (TxAnchor.tlocksInUse > TxLockVHWM) { | ||
406 | INCREMENT(TxStat.txBegin_lockslow); | ||
407 | TXN_SLEEP(&TxAnchor.lowlockwait); | ||
408 | goto retry; | ||
409 | } | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * allocate transaction id/block | ||
414 | */ | ||
415 | if ((t = TxAnchor.freetid) == 0) { | ||
416 | jfs_info("txBegin: waiting for free tid"); | ||
417 | INCREMENT(TxStat.txBegin_freetid); | ||
418 | TXN_SLEEP(&TxAnchor.freewait); | ||
419 | goto retry; | ||
420 | } | ||
421 | |||
422 | tblk = tid_to_tblock(t); | ||
423 | |||
424 | if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) { | ||
425 | /* Don't let a non-forced transaction take the last tblk */ | ||
426 | jfs_info("txBegin: waiting for free tid"); | ||
427 | INCREMENT(TxStat.txBegin_freetid); | ||
428 | TXN_SLEEP(&TxAnchor.freewait); | ||
429 | goto retry; | ||
430 | } | ||
431 | |||
432 | TxAnchor.freetid = tblk->next; | ||
433 | |||
434 | /* | ||
435 | * initialize transaction | ||
436 | */ | ||
437 | |||
438 | /* | ||
439 | * We can't zero the whole thing or we screw up another thread being | ||
440 | * awakened after sleeping on tblk->waitor | ||
441 | * | ||
442 | * memset(tblk, 0, sizeof(struct tblock)); | ||
443 | */ | ||
444 | tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; | ||
445 | |||
446 | tblk->sb = sb; | ||
447 | ++log->logtid; | ||
448 | tblk->logtid = log->logtid; | ||
449 | |||
450 | ++log->active; | ||
451 | |||
452 | HIGHWATERMARK(stattx.maxtid, t); /* statistics */ | ||
453 | INCREMENT(stattx.ntid); /* statistics */ | ||
454 | |||
455 | TXN_UNLOCK(); | ||
456 | |||
457 | jfs_info("txBegin: returning tid = %d", t); | ||
458 | |||
459 | return t; | ||
460 | } | ||
461 | |||
462 | |||
463 | /* | ||
464 | * NAME: txBeginAnon() | ||
465 | * | ||
466 | * FUNCTION: start an anonymous transaction. | ||
467 | * Blocks if logsync or available tlocks are low to prevent | ||
468 | * anonymous tlocks from depleting supply. | ||
469 | * | ||
470 | * PARAMETER: sb - superblock | ||
471 | * | ||
472 | * RETURN: none | ||
473 | */ | ||
474 | void txBeginAnon(struct super_block *sb) | ||
475 | { | ||
476 | struct jfs_log *log; | ||
477 | |||
478 | log = JFS_SBI(sb)->log; | ||
479 | |||
480 | TXN_LOCK(); | ||
481 | INCREMENT(TxStat.txBeginAnon); | ||
482 | |||
483 | retry: | ||
484 | /* | ||
485 | * synchronize with logsync barrier | ||
486 | */ | ||
487 | if (test_bit(log_SYNCBARRIER, &log->flag) || | ||
488 | test_bit(log_QUIESCE, &log->flag)) { | ||
489 | INCREMENT(TxStat.txBeginAnon_barrier); | ||
490 | TXN_SLEEP(&log->syncwait); | ||
491 | goto retry; | ||
492 | } | ||
493 | |||
494 | /* | ||
495 | * Don't begin transaction if we're getting starved for tlocks | ||
496 | */ | ||
497 | if (TxAnchor.tlocksInUse > TxLockVHWM) { | ||
498 | INCREMENT(TxStat.txBeginAnon_lockslow); | ||
499 | TXN_SLEEP(&TxAnchor.lowlockwait); | ||
500 | goto retry; | ||
501 | } | ||
502 | TXN_UNLOCK(); | ||
503 | } | ||
504 | |||
505 | |||
506 | /* | ||
507 | * txEnd() | ||
508 | * | ||
509 | * function: free specified transaction block. | ||
510 | * | ||
511 | * logsync barrier processing: | ||
512 | * | ||
513 | * serialization: | ||
514 | */ | ||
515 | void txEnd(tid_t tid) | ||
516 | { | ||
517 | struct tblock *tblk = tid_to_tblock(tid); | ||
518 | struct jfs_log *log; | ||
519 | |||
520 | jfs_info("txEnd: tid = %d", tid); | ||
521 | TXN_LOCK(); | ||
522 | |||
523 | /* | ||
524 | * wakeup transactions waiting on the page locked | ||
525 | * by the current transaction | ||
526 | */ | ||
527 | TXN_WAKEUP(&tblk->waitor); | ||
528 | |||
529 | log = JFS_SBI(tblk->sb)->log; | ||
530 | |||
531 | /* | ||
532 | * Lazy commit thread can't free this guy until we mark it UNLOCKED, | ||
533 | * otherwise, we would be left with a transaction that may have been | ||
534 | * reused. | ||
535 | * | ||
536 | * Lazy commit thread will turn off tblkGC_LAZY before calling this | ||
537 | * routine. | ||
538 | */ | ||
539 | if (tblk->flag & tblkGC_LAZY) { | ||
540 | jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk); | ||
541 | TXN_UNLOCK(); | ||
542 | |||
543 | spin_lock_irq(&log->gclock); // LOGGC_LOCK | ||
544 | tblk->flag |= tblkGC_UNLOCKED; | ||
545 | spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK | ||
546 | return; | ||
547 | } | ||
548 | |||
549 | jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk); | ||
550 | |||
551 | assert(tblk->next == 0); | ||
552 | |||
553 | /* | ||
554 | * insert tblock back on freelist | ||
555 | */ | ||
556 | tblk->next = TxAnchor.freetid; | ||
557 | TxAnchor.freetid = tid; | ||
558 | |||
559 | /* | ||
560 | * mark the tblock not active | ||
561 | */ | ||
562 | if (--log->active == 0) { | ||
563 | clear_bit(log_FLUSH, &log->flag); | ||
564 | |||
565 | /* | ||
566 | * synchronize with logsync barrier | ||
567 | */ | ||
568 | if (test_bit(log_SYNCBARRIER, &log->flag)) { | ||
569 | /* forward log syncpt */ | ||
570 | /* lmSync(log); */ | ||
571 | |||
572 | jfs_info("log barrier off: 0x%x", log->lsn); | ||
573 | |||
574 | /* enable new transactions start */ | ||
575 | clear_bit(log_SYNCBARRIER, &log->flag); | ||
576 | |||
577 | /* wakeup all waitors for logsync barrier */ | ||
578 | TXN_WAKEUP(&log->syncwait); | ||
579 | } | ||
580 | } | ||
581 | |||
582 | /* | ||
583 | * wakeup all waitors for a free tblock | ||
584 | */ | ||
585 | TXN_WAKEUP(&TxAnchor.freewait); | ||
586 | |||
587 | TXN_UNLOCK(); | ||
588 | } | ||
589 | |||
590 | |||
591 | /* | ||
592 | * txLock() | ||
593 | * | ||
594 | * function: acquire a transaction lock on the specified <mp> | ||
595 | * | ||
596 | * parameter: | ||
597 | * | ||
598 | * return: transaction lock id | ||
599 | * | ||
600 | * serialization: | ||
601 | */ | ||
602 | struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, | ||
603 | int type) | ||
604 | { | ||
605 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
606 | int dir_xtree = 0; | ||
607 | lid_t lid; | ||
608 | tid_t xtid; | ||
609 | struct tlock *tlck; | ||
610 | struct xtlock *xtlck; | ||
611 | struct linelock *linelock; | ||
612 | xtpage_t *p; | ||
613 | struct tblock *tblk; | ||
614 | |||
615 | TXN_LOCK(); | ||
616 | |||
617 | if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && | ||
618 | !(mp->xflag & COMMIT_PAGE)) { | ||
619 | /* | ||
620 | * Directory inode is special. It can have both an xtree tlock | ||
621 | * and a dtree tlock associated with it. | ||
622 | */ | ||
623 | dir_xtree = 1; | ||
624 | lid = jfs_ip->xtlid; | ||
625 | } else | ||
626 | lid = mp->lid; | ||
627 | |||
628 | /* is page not locked by a transaction ? */ | ||
629 | if (lid == 0) | ||
630 | goto allocateLock; | ||
631 | |||
632 | jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid); | ||
633 | |||
634 | /* is page locked by the requester transaction ? */ | ||
635 | tlck = lid_to_tlock(lid); | ||
636 | if ((xtid = tlck->tid) == tid) | ||
637 | goto grantLock; | ||
638 | |||
639 | /* | ||
640 | * is page locked by anonymous transaction/lock ? | ||
641 | * | ||
642 | * (page update without transaction (i.e., file write) is | ||
643 | * locked under anonymous transaction tid = 0: | ||
644 | * anonymous tlocks maintained on anonymous tlock list of | ||
645 | * the inode of the page and available to all anonymous | ||
646 | * transactions until txCommit() time at which point | ||
647 | * they are transferred to the transaction tlock list of | ||
648 | * the commiting transaction of the inode) | ||
649 | */ | ||
650 | if (xtid == 0) { | ||
651 | tlck->tid = tid; | ||
652 | tblk = tid_to_tblock(tid); | ||
653 | /* | ||
654 | * The order of the tlocks in the transaction is important | ||
655 | * (during truncate, child xtree pages must be freed before | ||
656 | * parent's tlocks change the working map). | ||
657 | * Take tlock off anonymous list and add to tail of | ||
658 | * transaction list | ||
659 | * | ||
660 | * Note: We really need to get rid of the tid & lid and | ||
661 | * use list_head's. This code is getting UGLY! | ||
662 | */ | ||
663 | if (jfs_ip->atlhead == lid) { | ||
664 | if (jfs_ip->atltail == lid) { | ||
665 | /* only anonymous txn. | ||
666 | * Remove from anon_list | ||
667 | */ | ||
668 | list_del_init(&jfs_ip->anon_inode_list); | ||
669 | } | ||
670 | jfs_ip->atlhead = tlck->next; | ||
671 | } else { | ||
672 | lid_t last; | ||
673 | for (last = jfs_ip->atlhead; | ||
674 | lid_to_tlock(last)->next != lid; | ||
675 | last = lid_to_tlock(last)->next) { | ||
676 | assert(last); | ||
677 | } | ||
678 | lid_to_tlock(last)->next = tlck->next; | ||
679 | if (jfs_ip->atltail == lid) | ||
680 | jfs_ip->atltail = last; | ||
681 | } | ||
682 | |||
683 | /* insert the tlock at tail of transaction tlock list */ | ||
684 | |||
685 | if (tblk->next) | ||
686 | lid_to_tlock(tblk->last)->next = lid; | ||
687 | else | ||
688 | tblk->next = lid; | ||
689 | tlck->next = 0; | ||
690 | tblk->last = lid; | ||
691 | |||
692 | goto grantLock; | ||
693 | } | ||
694 | |||
695 | goto waitLock; | ||
696 | |||
697 | /* | ||
698 | * allocate a tlock | ||
699 | */ | ||
700 | allocateLock: | ||
701 | lid = txLockAlloc(); | ||
702 | tlck = lid_to_tlock(lid); | ||
703 | |||
704 | /* | ||
705 | * initialize tlock | ||
706 | */ | ||
707 | tlck->tid = tid; | ||
708 | |||
709 | /* mark tlock for meta-data page */ | ||
710 | if (mp->xflag & COMMIT_PAGE) { | ||
711 | |||
712 | tlck->flag = tlckPAGELOCK; | ||
713 | |||
714 | /* mark the page dirty and nohomeok */ | ||
715 | mark_metapage_dirty(mp); | ||
716 | atomic_inc(&mp->nohomeok); | ||
717 | |||
718 | jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p", | ||
719 | mp, atomic_read(&mp->nohomeok), tid, tlck); | ||
720 | |||
721 | /* if anonymous transaction, and buffer is on the group | ||
722 | * commit synclist, mark inode to show this. This will | ||
723 | * prevent the buffer from being marked nohomeok for too | ||
724 | * long a time. | ||
725 | */ | ||
726 | if ((tid == 0) && mp->lsn) | ||
727 | set_cflag(COMMIT_Synclist, ip); | ||
728 | } | ||
729 | /* mark tlock for in-memory inode */ | ||
730 | else | ||
731 | tlck->flag = tlckINODELOCK; | ||
732 | |||
733 | tlck->type = 0; | ||
734 | |||
735 | /* bind the tlock and the page */ | ||
736 | tlck->ip = ip; | ||
737 | tlck->mp = mp; | ||
738 | if (dir_xtree) | ||
739 | jfs_ip->xtlid = lid; | ||
740 | else | ||
741 | mp->lid = lid; | ||
742 | |||
743 | /* | ||
744 | * enqueue transaction lock to transaction/inode | ||
745 | */ | ||
746 | /* insert the tlock at tail of transaction tlock list */ | ||
747 | if (tid) { | ||
748 | tblk = tid_to_tblock(tid); | ||
749 | if (tblk->next) | ||
750 | lid_to_tlock(tblk->last)->next = lid; | ||
751 | else | ||
752 | tblk->next = lid; | ||
753 | tlck->next = 0; | ||
754 | tblk->last = lid; | ||
755 | } | ||
756 | /* anonymous transaction: | ||
757 | * insert the tlock at head of inode anonymous tlock list | ||
758 | */ | ||
759 | else { | ||
760 | tlck->next = jfs_ip->atlhead; | ||
761 | jfs_ip->atlhead = lid; | ||
762 | if (tlck->next == 0) { | ||
763 | /* This inode's first anonymous transaction */ | ||
764 | jfs_ip->atltail = lid; | ||
765 | list_add_tail(&jfs_ip->anon_inode_list, | ||
766 | &TxAnchor.anon_list); | ||
767 | } | ||
768 | } | ||
769 | |||
770 | /* initialize type dependent area for linelock */ | ||
771 | linelock = (struct linelock *) & tlck->lock; | ||
772 | linelock->next = 0; | ||
773 | linelock->flag = tlckLINELOCK; | ||
774 | linelock->maxcnt = TLOCKSHORT; | ||
775 | linelock->index = 0; | ||
776 | |||
777 | switch (type & tlckTYPE) { | ||
778 | case tlckDTREE: | ||
779 | linelock->l2linesize = L2DTSLOTSIZE; | ||
780 | break; | ||
781 | |||
782 | case tlckXTREE: | ||
783 | linelock->l2linesize = L2XTSLOTSIZE; | ||
784 | |||
785 | xtlck = (struct xtlock *) linelock; | ||
786 | xtlck->header.offset = 0; | ||
787 | xtlck->header.length = 2; | ||
788 | |||
789 | if (type & tlckNEW) { | ||
790 | xtlck->lwm.offset = XTENTRYSTART; | ||
791 | } else { | ||
792 | if (mp->xflag & COMMIT_PAGE) | ||
793 | p = (xtpage_t *) mp->data; | ||
794 | else | ||
795 | p = &jfs_ip->i_xtroot; | ||
796 | xtlck->lwm.offset = | ||
797 | le16_to_cpu(p->header.nextindex); | ||
798 | } | ||
799 | xtlck->lwm.length = 0; /* ! */ | ||
800 | xtlck->twm.offset = 0; | ||
801 | xtlck->hwm.offset = 0; | ||
802 | |||
803 | xtlck->index = 2; | ||
804 | break; | ||
805 | |||
806 | case tlckINODE: | ||
807 | linelock->l2linesize = L2INODESLOTSIZE; | ||
808 | break; | ||
809 | |||
810 | case tlckDATA: | ||
811 | linelock->l2linesize = L2DATASLOTSIZE; | ||
812 | break; | ||
813 | |||
814 | default: | ||
815 | jfs_err("UFO tlock:0x%p", tlck); | ||
816 | } | ||
817 | |||
818 | /* | ||
819 | * update tlock vector | ||
820 | */ | ||
821 | grantLock: | ||
822 | tlck->type |= type; | ||
823 | |||
824 | TXN_UNLOCK(); | ||
825 | |||
826 | return tlck; | ||
827 | |||
828 | /* | ||
829 | * page is being locked by another transaction: | ||
830 | */ | ||
831 | waitLock: | ||
832 | /* Only locks on ipimap or ipaimap should reach here */ | ||
833 | /* assert(jfs_ip->fileset == AGGREGATE_I); */ | ||
834 | if (jfs_ip->fileset != AGGREGATE_I) { | ||
835 | jfs_err("txLock: trying to lock locked page!"); | ||
836 | dump_mem("ip", ip, sizeof(struct inode)); | ||
837 | dump_mem("mp", mp, sizeof(struct metapage)); | ||
838 | dump_mem("Locker's tblk", tid_to_tblock(tid), | ||
839 | sizeof(struct tblock)); | ||
840 | dump_mem("Tlock", tlck, sizeof(struct tlock)); | ||
841 | BUG(); | ||
842 | } | ||
843 | INCREMENT(stattx.waitlock); /* statistics */ | ||
844 | release_metapage(mp); | ||
845 | |||
846 | jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d", | ||
847 | tid, xtid, lid); | ||
848 | TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); | ||
849 | jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid); | ||
850 | |||
851 | return NULL; | ||
852 | } | ||
853 | |||
854 | |||
855 | /* | ||
856 | * NAME: txRelease() | ||
857 | * | ||
858 | * FUNCTION: Release buffers associated with transaction locks, but don't | ||
859 | * mark homeok yet. The allows other transactions to modify | ||
860 | * buffers, but won't let them go to disk until commit record | ||
861 | * actually gets written. | ||
862 | * | ||
863 | * PARAMETER: | ||
864 | * tblk - | ||
865 | * | ||
866 | * RETURN: Errors from subroutines. | ||
867 | */ | ||
868 | static void txRelease(struct tblock * tblk) | ||
869 | { | ||
870 | struct metapage *mp; | ||
871 | lid_t lid; | ||
872 | struct tlock *tlck; | ||
873 | |||
874 | TXN_LOCK(); | ||
875 | |||
876 | for (lid = tblk->next; lid; lid = tlck->next) { | ||
877 | tlck = lid_to_tlock(lid); | ||
878 | if ((mp = tlck->mp) != NULL && | ||
879 | (tlck->type & tlckBTROOT) == 0) { | ||
880 | assert(mp->xflag & COMMIT_PAGE); | ||
881 | mp->lid = 0; | ||
882 | } | ||
883 | } | ||
884 | |||
885 | /* | ||
886 | * wakeup transactions waiting on a page locked | ||
887 | * by the current transaction | ||
888 | */ | ||
889 | TXN_WAKEUP(&tblk->waitor); | ||
890 | |||
891 | TXN_UNLOCK(); | ||
892 | } | ||
893 | |||
894 | |||
895 | /* | ||
896 | * NAME: txUnlock() | ||
897 | * | ||
898 | * FUNCTION: Initiates pageout of pages modified by tid in journalled | ||
899 | * objects and frees their lockwords. | ||
900 | */ | ||
901 | static void txUnlock(struct tblock * tblk) | ||
902 | { | ||
903 | struct tlock *tlck; | ||
904 | struct linelock *linelock; | ||
905 | lid_t lid, next, llid, k; | ||
906 | struct metapage *mp; | ||
907 | struct jfs_log *log; | ||
908 | int difft, diffp; | ||
909 | |||
910 | jfs_info("txUnlock: tblk = 0x%p", tblk); | ||
911 | log = JFS_SBI(tblk->sb)->log; | ||
912 | |||
913 | /* | ||
914 | * mark page under tlock homeok (its log has been written): | ||
915 | */ | ||
916 | for (lid = tblk->next; lid; lid = next) { | ||
917 | tlck = lid_to_tlock(lid); | ||
918 | next = tlck->next; | ||
919 | |||
920 | jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck); | ||
921 | |||
922 | /* unbind page from tlock */ | ||
923 | if ((mp = tlck->mp) != NULL && | ||
924 | (tlck->type & tlckBTROOT) == 0) { | ||
925 | assert(mp->xflag & COMMIT_PAGE); | ||
926 | |||
927 | /* hold buffer | ||
928 | * | ||
929 | * It's possible that someone else has the metapage. | ||
930 | * The only things were changing are nohomeok, which | ||
931 | * is handled atomically, and clsn which is protected | ||
932 | * by the LOGSYNC_LOCK. | ||
933 | */ | ||
934 | hold_metapage(mp, 1); | ||
935 | |||
936 | assert(atomic_read(&mp->nohomeok) > 0); | ||
937 | atomic_dec(&mp->nohomeok); | ||
938 | |||
939 | /* inherit younger/larger clsn */ | ||
940 | LOGSYNC_LOCK(log); | ||
941 | if (mp->clsn) { | ||
942 | logdiff(difft, tblk->clsn, log); | ||
943 | logdiff(diffp, mp->clsn, log); | ||
944 | if (difft > diffp) | ||
945 | mp->clsn = tblk->clsn; | ||
946 | } else | ||
947 | mp->clsn = tblk->clsn; | ||
948 | LOGSYNC_UNLOCK(log); | ||
949 | |||
950 | assert(!(tlck->flag & tlckFREEPAGE)); | ||
951 | |||
952 | if (tlck->flag & tlckWRITEPAGE) { | ||
953 | write_metapage(mp); | ||
954 | } else { | ||
955 | /* release page which has been forced */ | ||
956 | release_metapage(mp); | ||
957 | } | ||
958 | } | ||
959 | |||
960 | /* insert tlock, and linelock(s) of the tlock if any, | ||
961 | * at head of freelist | ||
962 | */ | ||
963 | TXN_LOCK(); | ||
964 | |||
965 | llid = ((struct linelock *) & tlck->lock)->next; | ||
966 | while (llid) { | ||
967 | linelock = (struct linelock *) lid_to_tlock(llid); | ||
968 | k = linelock->next; | ||
969 | txLockFree(llid); | ||
970 | llid = k; | ||
971 | } | ||
972 | txLockFree(lid); | ||
973 | |||
974 | TXN_UNLOCK(); | ||
975 | } | ||
976 | tblk->next = tblk->last = 0; | ||
977 | |||
978 | /* | ||
979 | * remove tblock from logsynclist | ||
980 | * (allocation map pages inherited lsn of tblk and | ||
981 | * has been inserted in logsync list at txUpdateMap()) | ||
982 | */ | ||
983 | if (tblk->lsn) { | ||
984 | LOGSYNC_LOCK(log); | ||
985 | log->count--; | ||
986 | list_del(&tblk->synclist); | ||
987 | LOGSYNC_UNLOCK(log); | ||
988 | } | ||
989 | } | ||
990 | |||
991 | |||
992 | /* | ||
993 | * txMaplock() | ||
994 | * | ||
995 | * function: allocate a transaction lock for freed page/entry; | ||
996 | * for freed page, maplock is used as xtlock/dtlock type; | ||
997 | */ | ||
998 | struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) | ||
999 | { | ||
1000 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
1001 | lid_t lid; | ||
1002 | struct tblock *tblk; | ||
1003 | struct tlock *tlck; | ||
1004 | struct maplock *maplock; | ||
1005 | |||
1006 | TXN_LOCK(); | ||
1007 | |||
1008 | /* | ||
1009 | * allocate a tlock | ||
1010 | */ | ||
1011 | lid = txLockAlloc(); | ||
1012 | tlck = lid_to_tlock(lid); | ||
1013 | |||
1014 | /* | ||
1015 | * initialize tlock | ||
1016 | */ | ||
1017 | tlck->tid = tid; | ||
1018 | |||
1019 | /* bind the tlock and the object */ | ||
1020 | tlck->flag = tlckINODELOCK; | ||
1021 | tlck->ip = ip; | ||
1022 | tlck->mp = NULL; | ||
1023 | |||
1024 | tlck->type = type; | ||
1025 | |||
1026 | /* | ||
1027 | * enqueue transaction lock to transaction/inode | ||
1028 | */ | ||
1029 | /* insert the tlock at tail of transaction tlock list */ | ||
1030 | if (tid) { | ||
1031 | tblk = tid_to_tblock(tid); | ||
1032 | if (tblk->next) | ||
1033 | lid_to_tlock(tblk->last)->next = lid; | ||
1034 | else | ||
1035 | tblk->next = lid; | ||
1036 | tlck->next = 0; | ||
1037 | tblk->last = lid; | ||
1038 | } | ||
1039 | /* anonymous transaction: | ||
1040 | * insert the tlock at head of inode anonymous tlock list | ||
1041 | */ | ||
1042 | else { | ||
1043 | tlck->next = jfs_ip->atlhead; | ||
1044 | jfs_ip->atlhead = lid; | ||
1045 | if (tlck->next == 0) { | ||
1046 | /* This inode's first anonymous transaction */ | ||
1047 | jfs_ip->atltail = lid; | ||
1048 | list_add_tail(&jfs_ip->anon_inode_list, | ||
1049 | &TxAnchor.anon_list); | ||
1050 | } | ||
1051 | } | ||
1052 | |||
1053 | TXN_UNLOCK(); | ||
1054 | |||
1055 | /* initialize type dependent area for maplock */ | ||
1056 | maplock = (struct maplock *) & tlck->lock; | ||
1057 | maplock->next = 0; | ||
1058 | maplock->maxcnt = 0; | ||
1059 | maplock->index = 0; | ||
1060 | |||
1061 | return tlck; | ||
1062 | } | ||
1063 | |||
1064 | |||
1065 | /* | ||
1066 | * txLinelock() | ||
1067 | * | ||
1068 | * function: allocate a transaction lock for log vector list | ||
1069 | */ | ||
1070 | struct linelock *txLinelock(struct linelock * tlock) | ||
1071 | { | ||
1072 | lid_t lid; | ||
1073 | struct tlock *tlck; | ||
1074 | struct linelock *linelock; | ||
1075 | |||
1076 | TXN_LOCK(); | ||
1077 | |||
1078 | /* allocate a TxLock structure */ | ||
1079 | lid = txLockAlloc(); | ||
1080 | tlck = lid_to_tlock(lid); | ||
1081 | |||
1082 | TXN_UNLOCK(); | ||
1083 | |||
1084 | /* initialize linelock */ | ||
1085 | linelock = (struct linelock *) tlck; | ||
1086 | linelock->next = 0; | ||
1087 | linelock->flag = tlckLINELOCK; | ||
1088 | linelock->maxcnt = TLOCKLONG; | ||
1089 | linelock->index = 0; | ||
1090 | |||
1091 | /* append linelock after tlock */ | ||
1092 | linelock->next = tlock->next; | ||
1093 | tlock->next = lid; | ||
1094 | |||
1095 | return linelock; | ||
1096 | } | ||
1097 | |||
1098 | |||
1099 | |||
1100 | /* | ||
1101 | * transaction commit management | ||
1102 | * ----------------------------- | ||
1103 | */ | ||
1104 | |||
1105 | /* | ||
1106 | * NAME: txCommit() | ||
1107 | * | ||
1108 | * FUNCTION: commit the changes to the objects specified in | ||
1109 | * clist. For journalled segments only the | ||
1110 | * changes of the caller are committed, ie by tid. | ||
1111 | * for non-journalled segments the data are flushed to | ||
1112 | * disk and then the change to the disk inode and indirect | ||
1113 | * blocks committed (so blocks newly allocated to the | ||
1114 | * segment will be made a part of the segment atomically). | ||
1115 | * | ||
1116 | * all of the segments specified in clist must be in | ||
1117 | * one file system. no more than 6 segments are needed | ||
1118 | * to handle all unix svcs. | ||
1119 | * | ||
1120 | * if the i_nlink field (i.e. disk inode link count) | ||
1121 | * is zero, and the type of inode is a regular file or | ||
1122 | * directory, or symbolic link , the inode is truncated | ||
1123 | * to zero length. the truncation is committed but the | ||
1124 | * VM resources are unaffected until it is closed (see | ||
1125 | * iput and iclose). | ||
1126 | * | ||
1127 | * PARAMETER: | ||
1128 | * | ||
1129 | * RETURN: | ||
1130 | * | ||
1131 | * serialization: | ||
1132 | * on entry the inode lock on each segment is assumed | ||
1133 | * to be held. | ||
1134 | * | ||
1135 | * i/o error: | ||
1136 | */ | ||
1137 | int txCommit(tid_t tid, /* transaction identifier */ | ||
1138 | int nip, /* number of inodes to commit */ | ||
1139 | struct inode **iplist, /* list of inode to commit */ | ||
1140 | int flag) | ||
1141 | { | ||
1142 | int rc = 0; | ||
1143 | struct commit cd; | ||
1144 | struct jfs_log *log; | ||
1145 | struct tblock *tblk; | ||
1146 | struct lrd *lrd; | ||
1147 | int lsn; | ||
1148 | struct inode *ip; | ||
1149 | struct jfs_inode_info *jfs_ip; | ||
1150 | int k, n; | ||
1151 | ino_t top; | ||
1152 | struct super_block *sb; | ||
1153 | |||
1154 | jfs_info("txCommit, tid = %d, flag = %d", tid, flag); | ||
1155 | /* is read-only file system ? */ | ||
1156 | if (isReadOnly(iplist[0])) { | ||
1157 | rc = -EROFS; | ||
1158 | goto TheEnd; | ||
1159 | } | ||
1160 | |||
1161 | sb = cd.sb = iplist[0]->i_sb; | ||
1162 | cd.tid = tid; | ||
1163 | |||
1164 | if (tid == 0) | ||
1165 | tid = txBegin(sb, 0); | ||
1166 | tblk = tid_to_tblock(tid); | ||
1167 | |||
1168 | /* | ||
1169 | * initialize commit structure | ||
1170 | */ | ||
1171 | log = JFS_SBI(sb)->log; | ||
1172 | cd.log = log; | ||
1173 | |||
1174 | /* initialize log record descriptor in commit */ | ||
1175 | lrd = &cd.lrd; | ||
1176 | lrd->logtid = cpu_to_le32(tblk->logtid); | ||
1177 | lrd->backchain = 0; | ||
1178 | |||
1179 | tblk->xflag |= flag; | ||
1180 | |||
1181 | if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) | ||
1182 | tblk->xflag |= COMMIT_LAZY; | ||
1183 | /* | ||
1184 | * prepare non-journaled objects for commit | ||
1185 | * | ||
1186 | * flush data pages of non-journaled file | ||
1187 | * to prevent the file getting non-initialized disk blocks | ||
1188 | * in case of crash. | ||
1189 | * (new blocks - ) | ||
1190 | */ | ||
1191 | cd.iplist = iplist; | ||
1192 | cd.nip = nip; | ||
1193 | |||
1194 | /* | ||
1195 | * acquire transaction lock on (on-disk) inodes | ||
1196 | * | ||
1197 | * update on-disk inode from in-memory inode | ||
1198 | * acquiring transaction locks for AFTER records | ||
1199 | * on the on-disk inode of file object | ||
1200 | * | ||
1201 | * sort the inodes array by inode number in descending order | ||
1202 | * to prevent deadlock when acquiring transaction lock | ||
1203 | * of on-disk inodes on multiple on-disk inode pages by | ||
1204 | * multiple concurrent transactions | ||
1205 | */ | ||
1206 | for (k = 0; k < cd.nip; k++) { | ||
1207 | top = (cd.iplist[k])->i_ino; | ||
1208 | for (n = k + 1; n < cd.nip; n++) { | ||
1209 | ip = cd.iplist[n]; | ||
1210 | if (ip->i_ino > top) { | ||
1211 | top = ip->i_ino; | ||
1212 | cd.iplist[n] = cd.iplist[k]; | ||
1213 | cd.iplist[k] = ip; | ||
1214 | } | ||
1215 | } | ||
1216 | |||
1217 | ip = cd.iplist[k]; | ||
1218 | jfs_ip = JFS_IP(ip); | ||
1219 | |||
1220 | /* | ||
1221 | * BUGBUG - This code has temporarily been removed. The | ||
1222 | * intent is to ensure that any file data is written before | ||
1223 | * the metadata is committed to the journal. This prevents | ||
1224 | * uninitialized data from appearing in a file after the | ||
1225 | * journal has been replayed. (The uninitialized data | ||
1226 | * could be sensitive data removed by another user.) | ||
1227 | * | ||
1228 | * The problem now is that we are holding the IWRITELOCK | ||
1229 | * on the inode, and calling filemap_fdatawrite on an | ||
1230 | * unmapped page will cause a deadlock in jfs_get_block. | ||
1231 | * | ||
1232 | * The long term solution is to pare down the use of | ||
1233 | * IWRITELOCK. We are currently holding it too long. | ||
1234 | * We could also be smarter about which data pages need | ||
1235 | * to be written before the transaction is committed and | ||
1236 | * when we don't need to worry about it at all. | ||
1237 | * | ||
1238 | * if ((!S_ISDIR(ip->i_mode)) | ||
1239 | * && (tblk->flag & COMMIT_DELETE) == 0) { | ||
1240 | * filemap_fdatawrite(ip->i_mapping); | ||
1241 | * filemap_fdatawait(ip->i_mapping); | ||
1242 | * } | ||
1243 | */ | ||
1244 | |||
1245 | /* | ||
1246 | * Mark inode as not dirty. It will still be on the dirty | ||
1247 | * inode list, but we'll know not to commit it again unless | ||
1248 | * it gets marked dirty again | ||
1249 | */ | ||
1250 | clear_cflag(COMMIT_Dirty, ip); | ||
1251 | |||
1252 | /* inherit anonymous tlock(s) of inode */ | ||
1253 | if (jfs_ip->atlhead) { | ||
1254 | lid_to_tlock(jfs_ip->atltail)->next = tblk->next; | ||
1255 | tblk->next = jfs_ip->atlhead; | ||
1256 | if (!tblk->last) | ||
1257 | tblk->last = jfs_ip->atltail; | ||
1258 | jfs_ip->atlhead = jfs_ip->atltail = 0; | ||
1259 | TXN_LOCK(); | ||
1260 | list_del_init(&jfs_ip->anon_inode_list); | ||
1261 | TXN_UNLOCK(); | ||
1262 | } | ||
1263 | |||
1264 | /* | ||
1265 | * acquire transaction lock on on-disk inode page | ||
1266 | * (become first tlock of the tblk's tlock list) | ||
1267 | */ | ||
1268 | if (((rc = diWrite(tid, ip)))) | ||
1269 | goto out; | ||
1270 | } | ||
1271 | |||
1272 | /* | ||
1273 | * write log records from transaction locks | ||
1274 | * | ||
1275 | * txUpdateMap() resets XAD_NEW in XAD. | ||
1276 | */ | ||
1277 | if ((rc = txLog(log, tblk, &cd))) | ||
1278 | goto TheEnd; | ||
1279 | |||
1280 | /* | ||
1281 | * Ensure that inode isn't reused before | ||
1282 | * lazy commit thread finishes processing | ||
1283 | */ | ||
1284 | if (tblk->xflag & COMMIT_DELETE) { | ||
1285 | atomic_inc(&tblk->u.ip->i_count); | ||
1286 | /* | ||
1287 | * Avoid a rare deadlock | ||
1288 | * | ||
1289 | * If the inode is locked, we may be blocked in | ||
1290 | * jfs_commit_inode. If so, we don't want the | ||
1291 | * lazy_commit thread doing the last iput() on the inode | ||
1292 | * since that may block on the locked inode. Instead, | ||
1293 | * commit the transaction synchronously, so the last iput | ||
1294 | * will be done by the calling thread (or later) | ||
1295 | */ | ||
1296 | if (tblk->u.ip->i_state & I_LOCK) | ||
1297 | tblk->xflag &= ~COMMIT_LAZY; | ||
1298 | } | ||
1299 | |||
1300 | ASSERT((!(tblk->xflag & COMMIT_DELETE)) || | ||
1301 | ((tblk->u.ip->i_nlink == 0) && | ||
1302 | !test_cflag(COMMIT_Nolink, tblk->u.ip))); | ||
1303 | |||
1304 | /* | ||
1305 | * write COMMIT log record | ||
1306 | */ | ||
1307 | lrd->type = cpu_to_le16(LOG_COMMIT); | ||
1308 | lrd->length = 0; | ||
1309 | lsn = lmLog(log, tblk, lrd, NULL); | ||
1310 | |||
1311 | lmGroupCommit(log, tblk); | ||
1312 | |||
1313 | /* | ||
1314 | * - transaction is now committed - | ||
1315 | */ | ||
1316 | |||
1317 | /* | ||
1318 | * force pages in careful update | ||
1319 | * (imap addressing structure update) | ||
1320 | */ | ||
1321 | if (flag & COMMIT_FORCE) | ||
1322 | txForce(tblk); | ||
1323 | |||
1324 | /* | ||
1325 | * update allocation map. | ||
1326 | * | ||
1327 | * update inode allocation map and inode: | ||
1328 | * free pager lock on memory object of inode if any. | ||
1329 | * update block allocation map. | ||
1330 | * | ||
1331 | * txUpdateMap() resets XAD_NEW in XAD. | ||
1332 | */ | ||
1333 | if (tblk->xflag & COMMIT_FORCE) | ||
1334 | txUpdateMap(tblk); | ||
1335 | |||
1336 | /* | ||
1337 | * free transaction locks and pageout/free pages | ||
1338 | */ | ||
1339 | txRelease(tblk); | ||
1340 | |||
1341 | if ((tblk->flag & tblkGC_LAZY) == 0) | ||
1342 | txUnlock(tblk); | ||
1343 | |||
1344 | |||
1345 | /* | ||
1346 | * reset in-memory object state | ||
1347 | */ | ||
1348 | for (k = 0; k < cd.nip; k++) { | ||
1349 | ip = cd.iplist[k]; | ||
1350 | jfs_ip = JFS_IP(ip); | ||
1351 | |||
1352 | /* | ||
1353 | * reset in-memory inode state | ||
1354 | */ | ||
1355 | jfs_ip->bxflag = 0; | ||
1356 | jfs_ip->blid = 0; | ||
1357 | } | ||
1358 | |||
1359 | out: | ||
1360 | if (rc != 0) | ||
1361 | txAbort(tid, 1); | ||
1362 | |||
1363 | TheEnd: | ||
1364 | jfs_info("txCommit: tid = %d, returning %d", tid, rc); | ||
1365 | return rc; | ||
1366 | } | ||
1367 | |||
1368 | |||
1369 | /* | ||
1370 | * NAME: txLog() | ||
1371 | * | ||
1372 | * FUNCTION: Writes AFTER log records for all lines modified | ||
1373 | * by tid for segments specified by inodes in comdata. | ||
1374 | * Code assumes only WRITELOCKS are recorded in lockwords. | ||
1375 | * | ||
1376 | * PARAMETERS: | ||
1377 | * | ||
1378 | * RETURN : | ||
1379 | */ | ||
1380 | static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) | ||
1381 | { | ||
1382 | int rc = 0; | ||
1383 | struct inode *ip; | ||
1384 | lid_t lid; | ||
1385 | struct tlock *tlck; | ||
1386 | struct lrd *lrd = &cd->lrd; | ||
1387 | |||
1388 | /* | ||
1389 | * write log record(s) for each tlock of transaction, | ||
1390 | */ | ||
1391 | for (lid = tblk->next; lid; lid = tlck->next) { | ||
1392 | tlck = lid_to_tlock(lid); | ||
1393 | |||
1394 | tlck->flag |= tlckLOG; | ||
1395 | |||
1396 | /* initialize lrd common */ | ||
1397 | ip = tlck->ip; | ||
1398 | lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate); | ||
1399 | lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); | ||
1400 | lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); | ||
1401 | |||
1402 | /* write log record of page from the tlock */ | ||
1403 | switch (tlck->type & tlckTYPE) { | ||
1404 | case tlckXTREE: | ||
1405 | xtLog(log, tblk, lrd, tlck); | ||
1406 | break; | ||
1407 | |||
1408 | case tlckDTREE: | ||
1409 | dtLog(log, tblk, lrd, tlck); | ||
1410 | break; | ||
1411 | |||
1412 | case tlckINODE: | ||
1413 | diLog(log, tblk, lrd, tlck, cd); | ||
1414 | break; | ||
1415 | |||
1416 | case tlckMAP: | ||
1417 | mapLog(log, tblk, lrd, tlck); | ||
1418 | break; | ||
1419 | |||
1420 | case tlckDATA: | ||
1421 | dataLog(log, tblk, lrd, tlck); | ||
1422 | break; | ||
1423 | |||
1424 | default: | ||
1425 | jfs_err("UFO tlock:0x%p", tlck); | ||
1426 | } | ||
1427 | } | ||
1428 | |||
1429 | return rc; | ||
1430 | } | ||
1431 | |||
1432 | |||
1433 | /* | ||
1434 | * diLog() | ||
1435 | * | ||
1436 | * function: log inode tlock and format maplock to update bmap; | ||
1437 | */ | ||
1438 | static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
1439 | struct tlock * tlck, struct commit * cd) | ||
1440 | { | ||
1441 | int rc = 0; | ||
1442 | struct metapage *mp; | ||
1443 | pxd_t *pxd; | ||
1444 | struct pxd_lock *pxdlock; | ||
1445 | |||
1446 | mp = tlck->mp; | ||
1447 | |||
1448 | /* initialize as REDOPAGE record format */ | ||
1449 | lrd->log.redopage.type = cpu_to_le16(LOG_INODE); | ||
1450 | lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); | ||
1451 | |||
1452 | pxd = &lrd->log.redopage.pxd; | ||
1453 | |||
1454 | /* | ||
1455 | * inode after image | ||
1456 | */ | ||
1457 | if (tlck->type & tlckENTRY) { | ||
1458 | /* log after-image for logredo(): */ | ||
1459 | lrd->type = cpu_to_le16(LOG_REDOPAGE); | ||
1460 | // *pxd = mp->cm_pxd; | ||
1461 | PXDaddress(pxd, mp->index); | ||
1462 | PXDlength(pxd, | ||
1463 | mp->logical_size >> tblk->sb->s_blocksize_bits); | ||
1464 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1465 | |||
1466 | /* mark page as homeward bound */ | ||
1467 | tlck->flag |= tlckWRITEPAGE; | ||
1468 | } else if (tlck->type & tlckFREE) { | ||
1469 | /* | ||
1470 | * free inode extent | ||
1471 | * | ||
1472 | * (pages of the freed inode extent have been invalidated and | ||
1473 | * a maplock for free of the extent has been formatted at | ||
1474 | * txLock() time); | ||
1475 | * | ||
1476 | * the tlock had been acquired on the inode allocation map page | ||
1477 | * (iag) that specifies the freed extent, even though the map | ||
1478 | * page is not itself logged, to prevent pageout of the map | ||
1479 | * page before the log; | ||
1480 | */ | ||
1481 | |||
1482 | /* log LOG_NOREDOINOEXT of the freed inode extent for | ||
1483 | * logredo() to start NoRedoPage filters, and to update | ||
1484 | * imap and bmap for free of the extent; | ||
1485 | */ | ||
1486 | lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); | ||
1487 | /* | ||
1488 | * For the LOG_NOREDOINOEXT record, we need | ||
1489 | * to pass the IAG number and inode extent | ||
1490 | * index (within that IAG) from which the | ||
1491 | * the extent being released. These have been | ||
1492 | * passed to us in the iplist[1] and iplist[2]. | ||
1493 | */ | ||
1494 | lrd->log.noredoinoext.iagnum = | ||
1495 | cpu_to_le32((u32) (size_t) cd->iplist[1]); | ||
1496 | lrd->log.noredoinoext.inoext_idx = | ||
1497 | cpu_to_le32((u32) (size_t) cd->iplist[2]); | ||
1498 | |||
1499 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
1500 | *pxd = pxdlock->pxd; | ||
1501 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
1502 | |||
1503 | /* update bmap */ | ||
1504 | tlck->flag |= tlckUPDATEMAP; | ||
1505 | |||
1506 | /* mark page as homeward bound */ | ||
1507 | tlck->flag |= tlckWRITEPAGE; | ||
1508 | } else | ||
1509 | jfs_err("diLog: UFO type tlck:0x%p", tlck); | ||
1510 | #ifdef _JFS_WIP | ||
1511 | /* | ||
1512 | * alloc/free external EA extent | ||
1513 | * | ||
1514 | * a maplock for txUpdateMap() to update bPWMAP for alloc/free | ||
1515 | * of the extent has been formatted at txLock() time; | ||
1516 | */ | ||
1517 | else { | ||
1518 | assert(tlck->type & tlckEA); | ||
1519 | |||
1520 | /* log LOG_UPDATEMAP for logredo() to update bmap for | ||
1521 | * alloc of new (and free of old) external EA extent; | ||
1522 | */ | ||
1523 | lrd->type = cpu_to_le16(LOG_UPDATEMAP); | ||
1524 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
1525 | nlock = pxdlock->index; | ||
1526 | for (i = 0; i < nlock; i++, pxdlock++) { | ||
1527 | if (pxdlock->flag & mlckALLOCPXD) | ||
1528 | lrd->log.updatemap.type = | ||
1529 | cpu_to_le16(LOG_ALLOCPXD); | ||
1530 | else | ||
1531 | lrd->log.updatemap.type = | ||
1532 | cpu_to_le16(LOG_FREEPXD); | ||
1533 | lrd->log.updatemap.nxd = cpu_to_le16(1); | ||
1534 | lrd->log.updatemap.pxd = pxdlock->pxd; | ||
1535 | lrd->backchain = | ||
1536 | cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
1537 | } | ||
1538 | |||
1539 | /* update bmap */ | ||
1540 | tlck->flag |= tlckUPDATEMAP; | ||
1541 | } | ||
1542 | #endif /* _JFS_WIP */ | ||
1543 | |||
1544 | return rc; | ||
1545 | } | ||
1546 | |||
1547 | |||
1548 | /* | ||
1549 | * dataLog() | ||
1550 | * | ||
1551 | * function: log data tlock | ||
1552 | */ | ||
1553 | static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
1554 | struct tlock * tlck) | ||
1555 | { | ||
1556 | struct metapage *mp; | ||
1557 | pxd_t *pxd; | ||
1558 | |||
1559 | mp = tlck->mp; | ||
1560 | |||
1561 | /* initialize as REDOPAGE record format */ | ||
1562 | lrd->log.redopage.type = cpu_to_le16(LOG_DATA); | ||
1563 | lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); | ||
1564 | |||
1565 | pxd = &lrd->log.redopage.pxd; | ||
1566 | |||
1567 | /* log after-image for logredo(): */ | ||
1568 | lrd->type = cpu_to_le16(LOG_REDOPAGE); | ||
1569 | |||
1570 | if (jfs_dirtable_inline(tlck->ip)) { | ||
1571 | /* | ||
1572 | * The table has been truncated, we've must have deleted | ||
1573 | * the last entry, so don't bother logging this | ||
1574 | */ | ||
1575 | mp->lid = 0; | ||
1576 | hold_metapage(mp, 0); | ||
1577 | atomic_dec(&mp->nohomeok); | ||
1578 | discard_metapage(mp); | ||
1579 | tlck->mp = NULL; | ||
1580 | return 0; | ||
1581 | } | ||
1582 | |||
1583 | PXDaddress(pxd, mp->index); | ||
1584 | PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); | ||
1585 | |||
1586 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1587 | |||
1588 | /* mark page as homeward bound */ | ||
1589 | tlck->flag |= tlckWRITEPAGE; | ||
1590 | |||
1591 | return 0; | ||
1592 | } | ||
1593 | |||
1594 | |||
1595 | /* | ||
1596 | * dtLog() | ||
1597 | * | ||
1598 | * function: log dtree tlock and format maplock to update bmap; | ||
1599 | */ | ||
1600 | static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
1601 | struct tlock * tlck) | ||
1602 | { | ||
1603 | struct metapage *mp; | ||
1604 | struct pxd_lock *pxdlock; | ||
1605 | pxd_t *pxd; | ||
1606 | |||
1607 | mp = tlck->mp; | ||
1608 | |||
1609 | /* initialize as REDOPAGE/NOREDOPAGE record format */ | ||
1610 | lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); | ||
1611 | lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); | ||
1612 | |||
1613 | pxd = &lrd->log.redopage.pxd; | ||
1614 | |||
1615 | if (tlck->type & tlckBTROOT) | ||
1616 | lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); | ||
1617 | |||
1618 | /* | ||
1619 | * page extension via relocation: entry insertion; | ||
1620 | * page extension in-place: entry insertion; | ||
1621 | * new right page from page split, reinitialized in-line | ||
1622 | * root from root page split: entry insertion; | ||
1623 | */ | ||
1624 | if (tlck->type & (tlckNEW | tlckEXTEND)) { | ||
1625 | /* log after-image of the new page for logredo(): | ||
1626 | * mark log (LOG_NEW) for logredo() to initialize | ||
1627 | * freelist and update bmap for alloc of the new page; | ||
1628 | */ | ||
1629 | lrd->type = cpu_to_le16(LOG_REDOPAGE); | ||
1630 | if (tlck->type & tlckEXTEND) | ||
1631 | lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); | ||
1632 | else | ||
1633 | lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); | ||
1634 | // *pxd = mp->cm_pxd; | ||
1635 | PXDaddress(pxd, mp->index); | ||
1636 | PXDlength(pxd, | ||
1637 | mp->logical_size >> tblk->sb->s_blocksize_bits); | ||
1638 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1639 | |||
1640 | /* format a maplock for txUpdateMap() to update bPMAP for | ||
1641 | * alloc of the new page; | ||
1642 | */ | ||
1643 | if (tlck->type & tlckBTROOT) | ||
1644 | return; | ||
1645 | tlck->flag |= tlckUPDATEMAP; | ||
1646 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
1647 | pxdlock->flag = mlckALLOCPXD; | ||
1648 | pxdlock->pxd = *pxd; | ||
1649 | |||
1650 | pxdlock->index = 1; | ||
1651 | |||
1652 | /* mark page as homeward bound */ | ||
1653 | tlck->flag |= tlckWRITEPAGE; | ||
1654 | return; | ||
1655 | } | ||
1656 | |||
1657 | /* | ||
1658 | * entry insertion/deletion, | ||
1659 | * sibling page link update (old right page before split); | ||
1660 | */ | ||
1661 | if (tlck->type & (tlckENTRY | tlckRELINK)) { | ||
1662 | /* log after-image for logredo(): */ | ||
1663 | lrd->type = cpu_to_le16(LOG_REDOPAGE); | ||
1664 | PXDaddress(pxd, mp->index); | ||
1665 | PXDlength(pxd, | ||
1666 | mp->logical_size >> tblk->sb->s_blocksize_bits); | ||
1667 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1668 | |||
1669 | /* mark page as homeward bound */ | ||
1670 | tlck->flag |= tlckWRITEPAGE; | ||
1671 | return; | ||
1672 | } | ||
1673 | |||
1674 | /* | ||
1675 | * page deletion: page has been invalidated | ||
1676 | * page relocation: source extent | ||
1677 | * | ||
1678 | * a maplock for free of the page has been formatted | ||
1679 | * at txLock() time); | ||
1680 | */ | ||
1681 | if (tlck->type & (tlckFREE | tlckRELOCATE)) { | ||
1682 | /* log LOG_NOREDOPAGE of the deleted page for logredo() | ||
1683 | * to start NoRedoPage filter and to update bmap for free | ||
1684 | * of the deletd page | ||
1685 | */ | ||
1686 | lrd->type = cpu_to_le16(LOG_NOREDOPAGE); | ||
1687 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
1688 | *pxd = pxdlock->pxd; | ||
1689 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
1690 | |||
1691 | /* a maplock for txUpdateMap() for free of the page | ||
1692 | * has been formatted at txLock() time; | ||
1693 | */ | ||
1694 | tlck->flag |= tlckUPDATEMAP; | ||
1695 | } | ||
1696 | return; | ||
1697 | } | ||
1698 | |||
1699 | |||
1700 | /* | ||
1701 | * xtLog() | ||
1702 | * | ||
1703 | * function: log xtree tlock and format maplock to update bmap; | ||
1704 | */ | ||
1705 | static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
1706 | struct tlock * tlck) | ||
1707 | { | ||
1708 | struct inode *ip; | ||
1709 | struct metapage *mp; | ||
1710 | xtpage_t *p; | ||
1711 | struct xtlock *xtlck; | ||
1712 | struct maplock *maplock; | ||
1713 | struct xdlistlock *xadlock; | ||
1714 | struct pxd_lock *pxdlock; | ||
1715 | pxd_t *pxd; | ||
1716 | int next, lwm, hwm; | ||
1717 | |||
1718 | ip = tlck->ip; | ||
1719 | mp = tlck->mp; | ||
1720 | |||
1721 | /* initialize as REDOPAGE/NOREDOPAGE record format */ | ||
1722 | lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); | ||
1723 | lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); | ||
1724 | |||
1725 | pxd = &lrd->log.redopage.pxd; | ||
1726 | |||
1727 | if (tlck->type & tlckBTROOT) { | ||
1728 | lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); | ||
1729 | p = &JFS_IP(ip)->i_xtroot; | ||
1730 | if (S_ISDIR(ip->i_mode)) | ||
1731 | lrd->log.redopage.type |= | ||
1732 | cpu_to_le16(LOG_DIR_XTREE); | ||
1733 | } else | ||
1734 | p = (xtpage_t *) mp->data; | ||
1735 | next = le16_to_cpu(p->header.nextindex); | ||
1736 | |||
1737 | xtlck = (struct xtlock *) & tlck->lock; | ||
1738 | |||
1739 | maplock = (struct maplock *) & tlck->lock; | ||
1740 | xadlock = (struct xdlistlock *) maplock; | ||
1741 | |||
1742 | /* | ||
1743 | * entry insertion/extension; | ||
1744 | * sibling page link update (old right page before split); | ||
1745 | */ | ||
1746 | if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { | ||
1747 | /* log after-image for logredo(): | ||
1748 | * logredo() will update bmap for alloc of new/extended | ||
1749 | * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from | ||
1750 | * after-image of XADlist; | ||
1751 | * logredo() resets (XAD_NEW|XAD_EXTEND) flag when | ||
1752 | * applying the after-image to the meta-data page. | ||
1753 | */ | ||
1754 | lrd->type = cpu_to_le16(LOG_REDOPAGE); | ||
1755 | // *pxd = mp->cm_pxd; | ||
1756 | PXDaddress(pxd, mp->index); | ||
1757 | PXDlength(pxd, | ||
1758 | mp->logical_size >> tblk->sb->s_blocksize_bits); | ||
1759 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1760 | |||
1761 | /* format a maplock for txUpdateMap() to update bPMAP | ||
1762 | * for alloc of new/extended extents of XAD[lwm:next) | ||
1763 | * from the page itself; | ||
1764 | * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. | ||
1765 | */ | ||
1766 | lwm = xtlck->lwm.offset; | ||
1767 | if (lwm == 0) | ||
1768 | lwm = XTPAGEMAXSLOT; | ||
1769 | |||
1770 | if (lwm == next) | ||
1771 | goto out; | ||
1772 | if (lwm > next) { | ||
1773 | jfs_err("xtLog: lwm > next\n"); | ||
1774 | goto out; | ||
1775 | } | ||
1776 | tlck->flag |= tlckUPDATEMAP; | ||
1777 | xadlock->flag = mlckALLOCXADLIST; | ||
1778 | xadlock->count = next - lwm; | ||
1779 | if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) { | ||
1780 | int i; | ||
1781 | /* | ||
1782 | * Lazy commit may allow xtree to be modified before | ||
1783 | * txUpdateMap runs. Copy xad into linelock to | ||
1784 | * preserve correct data. | ||
1785 | */ | ||
1786 | xadlock->xdlist = &xtlck->pxdlock; | ||
1787 | memcpy(xadlock->xdlist, &p->xad[lwm], | ||
1788 | sizeof(xad_t) * xadlock->count); | ||
1789 | |||
1790 | for (i = 0; i < xadlock->count; i++) | ||
1791 | p->xad[lwm + i].flag &= | ||
1792 | ~(XAD_NEW | XAD_EXTENDED); | ||
1793 | } else { | ||
1794 | /* | ||
1795 | * xdlist will point to into inode's xtree, ensure | ||
1796 | * that transaction is not committed lazily. | ||
1797 | */ | ||
1798 | xadlock->xdlist = &p->xad[lwm]; | ||
1799 | tblk->xflag &= ~COMMIT_LAZY; | ||
1800 | } | ||
1801 | jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d " | ||
1802 | "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count); | ||
1803 | |||
1804 | maplock->index = 1; | ||
1805 | |||
1806 | out: | ||
1807 | /* mark page as homeward bound */ | ||
1808 | tlck->flag |= tlckWRITEPAGE; | ||
1809 | |||
1810 | return; | ||
1811 | } | ||
1812 | |||
1813 | /* | ||
1814 | * page deletion: file deletion/truncation (ref. xtTruncate()) | ||
1815 | * | ||
1816 | * (page will be invalidated after log is written and bmap | ||
1817 | * is updated from the page); | ||
1818 | */ | ||
1819 | if (tlck->type & tlckFREE) { | ||
1820 | /* LOG_NOREDOPAGE log for NoRedoPage filter: | ||
1821 | * if page free from file delete, NoRedoFile filter from | ||
1822 | * inode image of zero link count will subsume NoRedoPage | ||
1823 | * filters for each page; | ||
1824 | * if page free from file truncattion, write NoRedoPage | ||
1825 | * filter; | ||
1826 | * | ||
1827 | * upadte of block allocation map for the page itself: | ||
1828 | * if page free from deletion and truncation, LOG_UPDATEMAP | ||
1829 | * log for the page itself is generated from processing | ||
1830 | * its parent page xad entries; | ||
1831 | */ | ||
1832 | /* if page free from file truncation, log LOG_NOREDOPAGE | ||
1833 | * of the deleted page for logredo() to start NoRedoPage | ||
1834 | * filter for the page; | ||
1835 | */ | ||
1836 | if (tblk->xflag & COMMIT_TRUNCATE) { | ||
1837 | /* write NOREDOPAGE for the page */ | ||
1838 | lrd->type = cpu_to_le16(LOG_NOREDOPAGE); | ||
1839 | PXDaddress(pxd, mp->index); | ||
1840 | PXDlength(pxd, | ||
1841 | mp->logical_size >> tblk->sb-> | ||
1842 | s_blocksize_bits); | ||
1843 | lrd->backchain = | ||
1844 | cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
1845 | |||
1846 | if (tlck->type & tlckBTROOT) { | ||
1847 | /* Empty xtree must be logged */ | ||
1848 | lrd->type = cpu_to_le16(LOG_REDOPAGE); | ||
1849 | lrd->backchain = | ||
1850 | cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1851 | } | ||
1852 | } | ||
1853 | |||
1854 | /* init LOG_UPDATEMAP of the freed extents | ||
1855 | * XAD[XTENTRYSTART:hwm) from the deleted page itself | ||
1856 | * for logredo() to update bmap; | ||
1857 | */ | ||
1858 | lrd->type = cpu_to_le16(LOG_UPDATEMAP); | ||
1859 | lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); | ||
1860 | xtlck = (struct xtlock *) & tlck->lock; | ||
1861 | hwm = xtlck->hwm.offset; | ||
1862 | lrd->log.updatemap.nxd = | ||
1863 | cpu_to_le16(hwm - XTENTRYSTART + 1); | ||
1864 | /* reformat linelock for lmLog() */ | ||
1865 | xtlck->header.offset = XTENTRYSTART; | ||
1866 | xtlck->header.length = hwm - XTENTRYSTART + 1; | ||
1867 | xtlck->index = 1; | ||
1868 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1869 | |||
1870 | /* format a maplock for txUpdateMap() to update bmap | ||
1871 | * to free extents of XAD[XTENTRYSTART:hwm) from the | ||
1872 | * deleted page itself; | ||
1873 | */ | ||
1874 | tlck->flag |= tlckUPDATEMAP; | ||
1875 | xadlock->flag = mlckFREEXADLIST; | ||
1876 | xadlock->count = hwm - XTENTRYSTART + 1; | ||
1877 | if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) { | ||
1878 | /* | ||
1879 | * Lazy commit may allow xtree to be modified before | ||
1880 | * txUpdateMap runs. Copy xad into linelock to | ||
1881 | * preserve correct data. | ||
1882 | */ | ||
1883 | xadlock->xdlist = &xtlck->pxdlock; | ||
1884 | memcpy(xadlock->xdlist, &p->xad[XTENTRYSTART], | ||
1885 | sizeof(xad_t) * xadlock->count); | ||
1886 | } else { | ||
1887 | /* | ||
1888 | * xdlist will point to into inode's xtree, ensure | ||
1889 | * that transaction is not committed lazily. | ||
1890 | */ | ||
1891 | xadlock->xdlist = &p->xad[XTENTRYSTART]; | ||
1892 | tblk->xflag &= ~COMMIT_LAZY; | ||
1893 | } | ||
1894 | jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2", | ||
1895 | tlck->ip, mp, xadlock->count); | ||
1896 | |||
1897 | maplock->index = 1; | ||
1898 | |||
1899 | /* mark page as invalid */ | ||
1900 | if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) | ||
1901 | && !(tlck->type & tlckBTROOT)) | ||
1902 | tlck->flag |= tlckFREEPAGE; | ||
1903 | /* | ||
1904 | else (tblk->xflag & COMMIT_PMAP) | ||
1905 | ? release the page; | ||
1906 | */ | ||
1907 | return; | ||
1908 | } | ||
1909 | |||
1910 | /* | ||
1911 | * page/entry truncation: file truncation (ref. xtTruncate()) | ||
1912 | * | ||
1913 | * |----------+------+------+---------------| | ||
1914 | * | | | | ||
1915 | * | | hwm - hwm before truncation | ||
1916 | * | next - truncation point | ||
1917 | * lwm - lwm before truncation | ||
1918 | * header ? | ||
1919 | */ | ||
1920 | if (tlck->type & tlckTRUNCATE) { | ||
1921 | pxd_t tpxd; /* truncated extent of xad */ | ||
1922 | int twm; | ||
1923 | |||
1924 | /* | ||
1925 | * For truncation the entire linelock may be used, so it would | ||
1926 | * be difficult to store xad list in linelock itself. | ||
1927 | * Therefore, we'll just force transaction to be committed | ||
1928 | * synchronously, so that xtree pages won't be changed before | ||
1929 | * txUpdateMap runs. | ||
1930 | */ | ||
1931 | tblk->xflag &= ~COMMIT_LAZY; | ||
1932 | lwm = xtlck->lwm.offset; | ||
1933 | if (lwm == 0) | ||
1934 | lwm = XTPAGEMAXSLOT; | ||
1935 | hwm = xtlck->hwm.offset; | ||
1936 | twm = xtlck->twm.offset; | ||
1937 | |||
1938 | /* | ||
1939 | * write log records | ||
1940 | */ | ||
1941 | /* log after-image for logredo(): | ||
1942 | * | ||
1943 | * logredo() will update bmap for alloc of new/extended | ||
1944 | * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from | ||
1945 | * after-image of XADlist; | ||
1946 | * logredo() resets (XAD_NEW|XAD_EXTEND) flag when | ||
1947 | * applying the after-image to the meta-data page. | ||
1948 | */ | ||
1949 | lrd->type = cpu_to_le16(LOG_REDOPAGE); | ||
1950 | PXDaddress(pxd, mp->index); | ||
1951 | PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); | ||
1952 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1953 | |||
1954 | /* | ||
1955 | * truncate entry XAD[twm == next - 1]: | ||
1956 | */ | ||
1957 | if (twm == next - 1) { | ||
1958 | /* init LOG_UPDATEMAP for logredo() to update bmap for | ||
1959 | * free of truncated delta extent of the truncated | ||
1960 | * entry XAD[next - 1]: | ||
1961 | * (xtlck->pxdlock = truncated delta extent); | ||
1962 | */ | ||
1963 | pxdlock = (struct pxd_lock *) & xtlck->pxdlock; | ||
1964 | /* assert(pxdlock->type & tlckTRUNCATE); */ | ||
1965 | lrd->type = cpu_to_le16(LOG_UPDATEMAP); | ||
1966 | lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); | ||
1967 | lrd->log.updatemap.nxd = cpu_to_le16(1); | ||
1968 | lrd->log.updatemap.pxd = pxdlock->pxd; | ||
1969 | tpxd = pxdlock->pxd; /* save to format maplock */ | ||
1970 | lrd->backchain = | ||
1971 | cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
1972 | } | ||
1973 | |||
1974 | /* | ||
1975 | * free entries XAD[next:hwm]: | ||
1976 | */ | ||
1977 | if (hwm >= next) { | ||
1978 | /* init LOG_UPDATEMAP of the freed extents | ||
1979 | * XAD[next:hwm] from the deleted page itself | ||
1980 | * for logredo() to update bmap; | ||
1981 | */ | ||
1982 | lrd->type = cpu_to_le16(LOG_UPDATEMAP); | ||
1983 | lrd->log.updatemap.type = | ||
1984 | cpu_to_le16(LOG_FREEXADLIST); | ||
1985 | xtlck = (struct xtlock *) & tlck->lock; | ||
1986 | hwm = xtlck->hwm.offset; | ||
1987 | lrd->log.updatemap.nxd = | ||
1988 | cpu_to_le16(hwm - next + 1); | ||
1989 | /* reformat linelock for lmLog() */ | ||
1990 | xtlck->header.offset = next; | ||
1991 | xtlck->header.length = hwm - next + 1; | ||
1992 | xtlck->index = 1; | ||
1993 | lrd->backchain = | ||
1994 | cpu_to_le32(lmLog(log, tblk, lrd, tlck)); | ||
1995 | } | ||
1996 | |||
1997 | /* | ||
1998 | * format maplock(s) for txUpdateMap() to update bmap | ||
1999 | */ | ||
2000 | maplock->index = 0; | ||
2001 | |||
2002 | /* | ||
2003 | * allocate entries XAD[lwm:next): | ||
2004 | */ | ||
2005 | if (lwm < next) { | ||
2006 | /* format a maplock for txUpdateMap() to update bPMAP | ||
2007 | * for alloc of new/extended extents of XAD[lwm:next) | ||
2008 | * from the page itself; | ||
2009 | * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. | ||
2010 | */ | ||
2011 | tlck->flag |= tlckUPDATEMAP; | ||
2012 | xadlock->flag = mlckALLOCXADLIST; | ||
2013 | xadlock->count = next - lwm; | ||
2014 | xadlock->xdlist = &p->xad[lwm]; | ||
2015 | |||
2016 | jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d " | ||
2017 | "lwm:%d next:%d", | ||
2018 | tlck->ip, mp, xadlock->count, lwm, next); | ||
2019 | maplock->index++; | ||
2020 | xadlock++; | ||
2021 | } | ||
2022 | |||
2023 | /* | ||
2024 | * truncate entry XAD[twm == next - 1]: | ||
2025 | */ | ||
2026 | if (twm == next - 1) { | ||
2027 | struct pxd_lock *pxdlock; | ||
2028 | |||
2029 | /* format a maplock for txUpdateMap() to update bmap | ||
2030 | * to free truncated delta extent of the truncated | ||
2031 | * entry XAD[next - 1]; | ||
2032 | * (xtlck->pxdlock = truncated delta extent); | ||
2033 | */ | ||
2034 | tlck->flag |= tlckUPDATEMAP; | ||
2035 | pxdlock = (struct pxd_lock *) xadlock; | ||
2036 | pxdlock->flag = mlckFREEPXD; | ||
2037 | pxdlock->count = 1; | ||
2038 | pxdlock->pxd = tpxd; | ||
2039 | |||
2040 | jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d " | ||
2041 | "hwm:%d", ip, mp, pxdlock->count, hwm); | ||
2042 | maplock->index++; | ||
2043 | xadlock++; | ||
2044 | } | ||
2045 | |||
2046 | /* | ||
2047 | * free entries XAD[next:hwm]: | ||
2048 | */ | ||
2049 | if (hwm >= next) { | ||
2050 | /* format a maplock for txUpdateMap() to update bmap | ||
2051 | * to free extents of XAD[next:hwm] from thedeleted | ||
2052 | * page itself; | ||
2053 | */ | ||
2054 | tlck->flag |= tlckUPDATEMAP; | ||
2055 | xadlock->flag = mlckFREEXADLIST; | ||
2056 | xadlock->count = hwm - next + 1; | ||
2057 | xadlock->xdlist = &p->xad[next]; | ||
2058 | |||
2059 | jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d " | ||
2060 | "next:%d hwm:%d", | ||
2061 | tlck->ip, mp, xadlock->count, next, hwm); | ||
2062 | maplock->index++; | ||
2063 | } | ||
2064 | |||
2065 | /* mark page as homeward bound */ | ||
2066 | tlck->flag |= tlckWRITEPAGE; | ||
2067 | } | ||
2068 | return; | ||
2069 | } | ||
2070 | |||
2071 | |||
2072 | /* | ||
2073 | * mapLog() | ||
2074 | * | ||
2075 | * function: log from maplock of freed data extents; | ||
2076 | */ | ||
2077 | void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
2078 | struct tlock * tlck) | ||
2079 | { | ||
2080 | struct pxd_lock *pxdlock; | ||
2081 | int i, nlock; | ||
2082 | pxd_t *pxd; | ||
2083 | |||
2084 | /* | ||
2085 | * page relocation: free the source page extent | ||
2086 | * | ||
2087 | * a maplock for txUpdateMap() for free of the page | ||
2088 | * has been formatted at txLock() time saving the src | ||
2089 | * relocated page address; | ||
2090 | */ | ||
2091 | if (tlck->type & tlckRELOCATE) { | ||
2092 | /* log LOG_NOREDOPAGE of the old relocated page | ||
2093 | * for logredo() to start NoRedoPage filter; | ||
2094 | */ | ||
2095 | lrd->type = cpu_to_le16(LOG_NOREDOPAGE); | ||
2096 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
2097 | pxd = &lrd->log.redopage.pxd; | ||
2098 | *pxd = pxdlock->pxd; | ||
2099 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
2100 | |||
2101 | /* (N.B. currently, logredo() does NOT update bmap | ||
2102 | * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); | ||
2103 | * if page free from relocation, LOG_UPDATEMAP log is | ||
2104 | * specifically generated now for logredo() | ||
2105 | * to update bmap for free of src relocated page; | ||
2106 | * (new flag LOG_RELOCATE may be introduced which will | ||
2107 | * inform logredo() to start NORedoPage filter and also | ||
2108 | * update block allocation map at the same time, thus | ||
2109 | * avoiding an extra log write); | ||
2110 | */ | ||
2111 | lrd->type = cpu_to_le16(LOG_UPDATEMAP); | ||
2112 | lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); | ||
2113 | lrd->log.updatemap.nxd = cpu_to_le16(1); | ||
2114 | lrd->log.updatemap.pxd = pxdlock->pxd; | ||
2115 | lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
2116 | |||
2117 | /* a maplock for txUpdateMap() for free of the page | ||
2118 | * has been formatted at txLock() time; | ||
2119 | */ | ||
2120 | tlck->flag |= tlckUPDATEMAP; | ||
2121 | return; | ||
2122 | } | ||
2123 | /* | ||
2124 | |||
2125 | * Otherwise it's not a relocate request | ||
2126 | * | ||
2127 | */ | ||
2128 | else { | ||
2129 | /* log LOG_UPDATEMAP for logredo() to update bmap for | ||
2130 | * free of truncated/relocated delta extent of the data; | ||
2131 | * e.g.: external EA extent, relocated/truncated extent | ||
2132 | * from xtTailgate(); | ||
2133 | */ | ||
2134 | lrd->type = cpu_to_le16(LOG_UPDATEMAP); | ||
2135 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
2136 | nlock = pxdlock->index; | ||
2137 | for (i = 0; i < nlock; i++, pxdlock++) { | ||
2138 | if (pxdlock->flag & mlckALLOCPXD) | ||
2139 | lrd->log.updatemap.type = | ||
2140 | cpu_to_le16(LOG_ALLOCPXD); | ||
2141 | else | ||
2142 | lrd->log.updatemap.type = | ||
2143 | cpu_to_le16(LOG_FREEPXD); | ||
2144 | lrd->log.updatemap.nxd = cpu_to_le16(1); | ||
2145 | lrd->log.updatemap.pxd = pxdlock->pxd; | ||
2146 | lrd->backchain = | ||
2147 | cpu_to_le32(lmLog(log, tblk, lrd, NULL)); | ||
2148 | jfs_info("mapLog: xaddr:0x%lx xlen:0x%x", | ||
2149 | (ulong) addressPXD(&pxdlock->pxd), | ||
2150 | lengthPXD(&pxdlock->pxd)); | ||
2151 | } | ||
2152 | |||
2153 | /* update bmap */ | ||
2154 | tlck->flag |= tlckUPDATEMAP; | ||
2155 | } | ||
2156 | } | ||
2157 | |||
2158 | |||
2159 | /* | |
2160 | * txEA() | |
2161 | * | |
2162 | * function: format maplock entries under transaction tid for alloc of the new EA/ACL extent (newea) and/or free of the old EA extent (oldea), or | |
2163 | * set the COMMIT_Inlineea flag on ip when newea is flagged DXD_INLINE; newea may be NULL, oldea is dereferenced without a NULL check; | |
2164 | */ | |
2165 | void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) | |
2166 | { | |
2167 | struct tlock *tlck = NULL; | |
2168 | struct pxd_lock *maplock = NULL, *pxdlock = NULL; | |
2169 | |||
2170 | /* | |
2171 | * format maplock for alloc of new EA extent | |
2172 | */ | |
2173 | if (newea) { | |
2174 | /* Since the newea could be a completely zeroed entry we need to | |
2175 | * check for the two flags which indicate we should actually | |
2176 | * commit new EA data: DXD_EXTENT (out-of-line extent) or DXD_INLINE | |
2177 | */ | |
2178 | if (newea->flag & DXD_EXTENT) { | |
2179 | tlck = txMaplock(tid, ip, tlckMAP); | |
2180 | maplock = (struct pxd_lock *) & tlck->lock; | |
2181 | pxdlock = (struct pxd_lock *) maplock; | |
2182 | pxdlock->flag = mlckALLOCPXD; | |
2183 | PXDaddress(&pxdlock->pxd, addressDXD(newea)); | |
2184 | PXDlength(&pxdlock->pxd, lengthDXD(newea)); | |
2185 | pxdlock++; /* next slot; used below if the old extent is also freed */ | |
2186 | maplock->index = 1; /* one entry formatted so far */ | |
2187 | } else if (newea->flag & DXD_INLINE) { | |
2188 | tlck = NULL; | |
2189 | |||
2190 | set_cflag(COMMIT_Inlineea, ip); | |
2191 | } | |
2192 | } | |
2193 | |||
2194 | /* | |
2195 | * format maplock for free of old EA extent (skipped when COMMIT_Nolink is set on ip) | |
2196 | */ | |
2197 | if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { | |
2198 | if (tlck == NULL) { /* no maplock was taken for newea above: take one now and start at entry 0 */ | |
2199 | tlck = txMaplock(tid, ip, tlckMAP); | |
2200 | maplock = (struct pxd_lock *) & tlck->lock; | |
2201 | pxdlock = (struct pxd_lock *) maplock; | |
2202 | maplock->index = 0; | |
2203 | } | |
2204 | pxdlock->flag = mlckFREEPXD; | |
2205 | PXDaddress(&pxdlock->pxd, addressDXD(oldea)); | |
2206 | PXDlength(&pxdlock->pxd, lengthDXD(oldea)); | |
2207 | maplock->index++; /* index ends as the number of entries formatted (1 or 2) */ | |
2208 | } | |
2209 | } | |
2210 | |||
2211 | |||
2212 | /* | |
2213 | * txForce() | |
2214 | * | |
2215 | * function: reverse the tlock list of tblk, then mark each locked non-root metapage that has tlckWRITEPAGE set as META_dirty and META_sync; | |
2216 | * meant to run after txLog() but before txUpdateMap(); the write itself is not issued here (see the note in the second loop); | |
2217 | */ | |
2218 | void txForce(struct tblock * tblk) | |
2219 | { | |
2220 | struct tlock *tlck; | |
2221 | lid_t lid, next; | |
2222 | struct metapage *mp; | |
2223 | |||
2224 | /* | |
2225 | * reverse the order of transaction tlocks in | |
2226 | * careful update order of address index pages | |
2227 | * (right to left, bottom up): the current first tlock becomes the tail, | |
2228 | * and every following tlock is pushed onto the head of tblk->next in turn | |
2229 | */ | |
2229 | tlck = lid_to_tlock(tblk->next); | |
2230 | lid = tlck->next; | |
2231 | tlck->next = 0; | |
2232 | while (lid) { | |
2233 | tlck = lid_to_tlock(lid); | |
2234 | next = tlck->next; | |
2235 | tlck->next = tblk->next; | |
2236 | tblk->next = lid; | |
2237 | lid = next; | |
2238 | } | |
2239 | |||
2240 | /* | |
2241 | * flag the page for synchronous write, and | |
2242 | * hold the page for txUpdateMap(); | |
2243 | */ | |
2244 | for (lid = tblk->next; lid; lid = next) { | |
2245 | tlck = lid_to_tlock(lid); | |
2246 | next = tlck->next; | |
2247 | |||
2248 | if ((mp = tlck->mp) != NULL && | |
2249 | (tlck->type & tlckBTROOT) == 0) { | |
2250 | assert(mp->xflag & COMMIT_PAGE); | |
2251 | |||
2252 | if (tlck->flag & tlckWRITEPAGE) { | |
2253 | tlck->flag &= ~tlckWRITEPAGE; | |
2254 | |||
2255 | /* do not release page to freelist */ | |
2256 | |||
2257 | /* | |
2258 | * The "right" thing to do here is to | |
2259 | * synchronously write the metadata. | |
2260 | * With the current implementation this | |
2261 | * is hard since write_metapage requires | |
2262 | * us to kunmap & remap the page. If we | |
2263 | * have tlocks pointing into the metadata | |
2264 | * pages, we don't want to do this. I think | |
2265 | * we can get by with synchronously writing | |
2266 | * the pages when they are released. | |
2267 | */ | |
2268 | assert(atomic_read(&mp->nohomeok)); | |
2269 | set_bit(META_dirty, &mp->flag); | |
2270 | set_bit(META_sync, &mp->flag); | |
2271 | } | |
2272 | } | |
2273 | } | |
2274 | } | |
2275 | |||
2276 | |||
2277 | /* | |
2278 | * txUpdateMap() | |
2279 | * | |
2280 | * function: update persistent allocation map (and working map | |
2281 | * if appropriate) for every tlock of the transaction flagged tlckUPDATEMAP, then update the inode allocation map for COMMIT_CREATE / COMMIT_DELETE transactions; | |
2282 | * | |
2283 | * parameter: tblk - transaction block whose tlock list (tblk->next) is scanned; | |
2284 | */ | |
2285 | static void txUpdateMap(struct tblock * tblk) | |
2286 | { | |
2287 | struct inode *ip; | |
2288 | struct inode *ipimap; | |
2289 | lid_t lid; | |
2290 | struct tlock *tlck; | |
2291 | struct maplock *maplock; | |
2292 | struct pxd_lock pxdlock; | |
2293 | int maptype; | |
2294 | int k, nlock; | |
2295 | struct metapage *mp = NULL; | |
2296 | |||
2297 | ipimap = JFS_SBI(tblk->sb)->ipimap; | |
2298 | |||
2299 | maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; | |
2300 | |||
2301 | |||
2302 | /* | |
2303 | * update block allocation map | |
2304 | * | |
2305 | * update allocation state in pmap (and wmap) and | |
2306 | * update lsn of the pmap page; | |
2307 | */ | |
2308 | /* | |
2309 | * scan each tlock/page of transaction for block allocation/free: | |
2310 | * | |
2311 | * for each tlock/page of transaction, update map. | |
2312 | * ? are there tlock for pmap and pwmap at the same time ? | |
2313 | */ | |
2314 | for (lid = tblk->next; lid; lid = tlck->next) { | |
2315 | tlck = lid_to_tlock(lid); | |
2316 | |||
2317 | if ((tlck->flag & tlckUPDATEMAP) == 0) | |
2318 | continue; | |
2319 | |||
2320 | if (tlck->flag & tlckFREEPAGE) { | |
2321 | /* | |
2322 | * Another thread may attempt to reuse freed space | |
2323 | * immediately, so we want to get rid of the metapage | |
2324 | * before anyone else has a chance to get it. | |
2325 | * Lock metapage, update maps, then invalidate | |
2326 | * the metapage (the discard is done after the maplock loop below). | |
2327 | */ | |
2328 | mp = tlck->mp; | |
2329 | ASSERT(mp->xflag & COMMIT_PAGE); | |
2330 | hold_metapage(mp, 0); | |
2331 | } | |
2332 | |||
2333 | /* | |
2334 | * extent list: | |
2335 | * . in-line PXD list: | |
2336 | * . out-of-line XAD list: | |
2337 | */ | |
2338 | maplock = (struct maplock *) & tlck->lock; | |
2339 | nlock = maplock->index; | |
2340 | |||
2341 | for (k = 0; k < nlock; k++, maplock++) { | |
2342 | /* | |
2343 | * allocate blocks in persistent map: | |
2344 | * | |
2345 | * blocks have been allocated from wmap at alloc time; | |
2346 | */ | |
2347 | if (maplock->flag & mlckALLOC) { | |
2348 | txAllocPMap(ipimap, maplock, tblk); | |
2349 | } | |
2350 | /* | |
2351 | * free blocks in persistent and working map: | |
2352 | * blocks will be freed in pmap and then in wmap; | |
2353 | * | |
2354 | * ? tblock specifies the PMAP/PWMAP based upon | |
2355 | * transaction | |
2356 | * | |
2357 | * free blocks in persistent map: | |
2358 | * blocks will be freed from wmap at last reference | |
2359 | * release of the object for regular files; | |
2360 | * | |
2361 | * Always free blocks from both persistent & working | |
2362 | * maps for directories | |
2363 | */ | |
2364 | else { /* (maplock->flag & mlckFREE) */ | |
2365 | |||
2366 | if (S_ISDIR(tlck->ip->i_mode)) | |
2367 | txFreeMap(ipimap, maplock, | |
2368 | tblk, COMMIT_PWMAP); | |
2369 | else | |
2370 | txFreeMap(ipimap, maplock, | |
2371 | tblk, maptype); | |
2372 | } | |
2373 | } | |
2374 | if (tlck->flag & tlckFREEPAGE) { | |
2375 | if (!(tblk->flag & tblkGC_LAZY)) { | |
2376 | /* This is equivalent to txRelease */ | |
2377 | ASSERT(mp->lid == lid); | |
2378 | tlck->mp->lid = 0; | |
2379 | } | |
2380 | assert(atomic_read(&mp->nohomeok) == 1); | |
2381 | atomic_dec(&mp->nohomeok); | |
2382 | discard_metapage(mp); | |
2383 | tlck->mp = NULL; /* the tlock no longer references the discarded page */ | |
2384 | } | |
2385 | } | |
2386 | /* | |
2387 | * update inode allocation map | |
2388 | * | |
2389 | * update allocation state in pmap and | |
2390 | * update lsn of the pmap page; | |
2391 | * update in-memory inode flag/state | |
2392 | * | |
2393 | * unlock mapper/write lock | |
2394 | */ | |
2395 | if (tblk->xflag & COMMIT_CREATE) { | |
2396 | diUpdatePMap(ipimap, tblk->ino, FALSE, tblk); | |
2397 | ipimap->i_state |= I_DIRTY; | |
2398 | /* update persistent block allocation map | |
2399 | * for the allocation of inode extent, using a maplock built on the stack from tblk->u.ixpxd; | |
2400 | */ | |
2401 | pxdlock.flag = mlckALLOCPXD; | |
2402 | pxdlock.pxd = tblk->u.ixpxd; | |
2403 | pxdlock.index = 1; | |
2404 | txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk); | |
2405 | } else if (tblk->xflag & COMMIT_DELETE) { | |
2406 | ip = tblk->u.ip; | |
2407 | diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk); | |
2408 | ipimap->i_state |= I_DIRTY; | |
2409 | iput(ip); /* drops an inode reference; NOTE(review): presumably taken when the delete was committed - confirm against the caller */ | |
2410 | } | |
2411 | } | |
2412 | |||
2413 | |||
2414 | /* | |
2415 | * txAllocPMap() | |
2416 | * | |
2417 | * function: allocate from persistent map; | |
2418 | * | |
2419 | * parameter: | |
2420 | * ip - inode whose superblock info supplies the block map inode (JFS_SBI(ip->i_sb)->ipbmap); | |
2421 | * maplock - lock entry describing the extent(s); maplock->flag selects the layout: | |
2422 | * xad list: mlckALLOCXADLIST - only XADs flagged XAD_NEW or XAD_EXTENDED are processed, and those two flags are cleared afterwards; | |
2423 | * pxd: mlckALLOCPXD - a single pxd; any other flag value is handled as a pxd list (mlckALLOCPXDLIST); | |
2424 | * | |
2425 | * tblk - transaction block, passed through to dbUpdatePMap(); | |
2426 | * | |
2427 | * note: this routine takes no maptype or lsn argument; every extent is | |
2428 | * handed to dbUpdatePMap(ipbmap, FALSE, ...). Frees from the | |
2429 | * persistent and/or working map (e.g., tmp file - free from working | |
2430 | * map at release of last reference) are handled by txFreeMap(); | |
2431 | * | |
2432 | * returns: nothing; | |
2433 | */ | |
2434 | static void txAllocPMap(struct inode *ip, struct maplock * maplock, | |
2435 | struct tblock * tblk) | |
2436 | { | |
2437 | struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; | |
2438 | struct xdlistlock *xadlistlock; | |
2439 | xad_t *xad; | |
2440 | s64 xaddr; | |
2441 | int xlen; | |
2442 | struct pxd_lock *pxdlock; | |
2443 | struct xdlistlock *pxdlistlock; | |
2444 | pxd_t *pxd; | |
2445 | int n; | |
2446 | |||
2447 | /* | |
2448 | * allocate from persistent map; | |
2449 | */ | |
2450 | if (maplock->flag & mlckALLOCXADLIST) { | |
2451 | xadlistlock = (struct xdlistlock *) maplock; | |
2452 | xad = xadlistlock->xdlist; | |
2453 | for (n = 0; n < xadlistlock->count; n++, xad++) { | |
2454 | if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { | |
2455 | xaddr = addressXAD(xad); | |
2456 | xlen = lengthXAD(xad); | |
2457 | dbUpdatePMap(ipbmap, FALSE, xaddr, | |
2458 | (s64) xlen, tblk); | |
2459 | xad->flag &= ~(XAD_NEW | XAD_EXTENDED); | |
2460 | jfs_info("allocPMap: xaddr:0x%lx xlen:%d", | |
2461 | (ulong) xaddr, xlen); | |
2462 | } | |
2463 | } | |
2464 | } else if (maplock->flag & mlckALLOCPXD) { | |
2465 | pxdlock = (struct pxd_lock *) maplock; | |
2466 | xaddr = addressPXD(&pxdlock->pxd); | |
2467 | xlen = lengthPXD(&pxdlock->pxd); | |
2468 | dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, tblk); | |
2469 | jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); | |
2470 | } else { /* (maplock->flag & mlckALLOCPXDLIST) */ | |
2471 | |||
2472 | pxdlistlock = (struct xdlistlock *) maplock; | |
2473 | pxd = pxdlistlock->xdlist; | |
2474 | for (n = 0; n < pxdlistlock->count; n++, pxd++) { | |
2475 | xaddr = addressPXD(pxd); | |
2476 | xlen = lengthPXD(pxd); | |
2477 | dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, | |
2478 | tblk); | |
2479 | jfs_info("allocPMap: xaddr:0x%lx xlen:%d", | |
2480 | (ulong) xaddr, xlen); | |
2481 | } | |
2482 | } | |
2483 | } | |
2484 | |||
2485 | |||
2486 | /* | |
2487 | * txFreeMap() | |
2488 | * | |
2489 | * function: free from persistent and/or working map; | |
2490 | * parameter: ip - inode used to find ipbmap and passed to dbFree(); maplock - extent(s) to free, layout chosen by maplock->flag; tblk - transaction block passed to dbUpdatePMap(); maptype - COMMIT_PMAP (persistent map only), COMMIT_WMAP (working map only) or COMMIT_PWMAP (both); | |
2491 | * todo: optimization | |
2492 | */ | |
2493 | void txFreeMap(struct inode *ip, | |
2494 | struct maplock * maplock, struct tblock * tblk, int maptype) | |
2495 | { | |
2496 | struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; | |
2497 | struct xdlistlock *xadlistlock; | |
2498 | xad_t *xad; | |
2499 | s64 xaddr; | |
2500 | int xlen; | |
2501 | struct pxd_lock *pxdlock; | |
2502 | struct xdlistlock *pxdlistlock; | |
2503 | pxd_t *pxd; | |
2504 | int n; | |
2505 | |||
2506 | jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x", | |
2507 | tblk, maplock, maptype); | |
2508 | |||
2509 | /* | |
2510 | * free from persistent map; | |
2511 | */ | |
2512 | if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { | |
2513 | if (maplock->flag & mlckFREEXADLIST) { | |
2514 | xadlistlock = (struct xdlistlock *) maplock; | |
2515 | xad = xadlistlock->xdlist; | |
2516 | for (n = 0; n < xadlistlock->count; n++, xad++) { | |
2517 | if (!(xad->flag & XAD_NEW)) { /* XADs still flagged XAD_NEW are skipped for the persistent map */ | |
2518 | xaddr = addressXAD(xad); | |
2519 | xlen = lengthXAD(xad); | |
2520 | dbUpdatePMap(ipbmap, TRUE, xaddr, | |
2521 | (s64) xlen, tblk); | |
2522 | jfs_info("freePMap: xaddr:0x%lx " | |
2523 | "xlen:%d", | |
2524 | (ulong) xaddr, xlen); | |
2525 | } | |
2526 | } | |
2527 | } else if (maplock->flag & mlckFREEPXD) { | |
2528 | pxdlock = (struct pxd_lock *) maplock; | |
2529 | xaddr = addressPXD(&pxdlock->pxd); | |
2530 | xlen = lengthPXD(&pxdlock->pxd); | |
2531 | dbUpdatePMap(ipbmap, TRUE, xaddr, (s64) xlen, | |
2532 | tblk); | |
2533 | jfs_info("freePMap: xaddr:0x%lx xlen:%d", | |
2534 | (ulong) xaddr, xlen); | |
2535 | } else { /* (maplock->flag & mlckFREEPXDLIST) */ | |
2536 | |||
2537 | pxdlistlock = (struct xdlistlock *) maplock; | |
2538 | pxd = pxdlistlock->xdlist; | |
2539 | for (n = 0; n < pxdlistlock->count; n++, pxd++) { | |
2540 | xaddr = addressPXD(pxd); | |
2541 | xlen = lengthPXD(pxd); | |
2542 | dbUpdatePMap(ipbmap, TRUE, xaddr, | |
2543 | (s64) xlen, tblk); | |
2544 | jfs_info("freePMap: xaddr:0x%lx xlen:%d", | |
2545 | (ulong) xaddr, xlen); | |
2546 | } | |
2547 | } | |
2548 | } | |
2549 | |||
2550 | /* | |
2551 | * free from working map (every XAD in a list is freed here, with no XAD_NEW filter, and its flag is cleared); | |
2552 | */ | |
2553 | if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { | |
2554 | if (maplock->flag & mlckFREEXADLIST) { | |
2555 | xadlistlock = (struct xdlistlock *) maplock; | |
2556 | xad = xadlistlock->xdlist; | |
2557 | for (n = 0; n < xadlistlock->count; n++, xad++) { | |
2558 | xaddr = addressXAD(xad); | |
2559 | xlen = lengthXAD(xad); | |
2560 | dbFree(ip, xaddr, (s64) xlen); | |
2561 | xad->flag = 0; | |
2562 | jfs_info("freeWMap: xaddr:0x%lx xlen:%d", | |
2563 | (ulong) xaddr, xlen); | |
2564 | } | |
2565 | } else if (maplock->flag & mlckFREEPXD) { | |
2566 | pxdlock = (struct pxd_lock *) maplock; | |
2567 | xaddr = addressPXD(&pxdlock->pxd); | |
2568 | xlen = lengthPXD(&pxdlock->pxd); | |
2569 | dbFree(ip, xaddr, (s64) xlen); | |
2570 | jfs_info("freeWMap: xaddr:0x%lx xlen:%d", | |
2571 | (ulong) xaddr, xlen); | |
2572 | } else { /* (maplock->flag & mlckFREEPXDLIST) */ | |
2573 | |||
2574 | pxdlistlock = (struct xdlistlock *) maplock; | |
2575 | pxd = pxdlistlock->xdlist; | |
2576 | for (n = 0; n < pxdlistlock->count; n++, pxd++) { | |
2577 | xaddr = addressPXD(pxd); | |
2578 | xlen = lengthPXD(pxd); | |
2579 | dbFree(ip, xaddr, (s64) xlen); | |
2580 | jfs_info("freeWMap: xaddr:0x%lx xlen:%d", | |
2581 | (ulong) xaddr, xlen); | |
2582 | } | |
2583 | } | |
2584 | } | |
2585 | } | |
2586 | |||
2587 | |||
2588 | /* | |
2589 | * txFreelock() | |
2590 | * | |
2591 | * function: remove every tlock flagged tlckFREELOCK from the inode's anonymous locklist (atlhead/atltail), under TXN_LOCK; if the list becomes empty, also take the inode off its anon_inode_list | |
2592 | */ | |
2593 | void txFreelock(struct inode *ip) | |
2594 | { | |
2595 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | |
2596 | struct tlock *xtlck, *tlck; | |
2597 | lid_t xlid = 0, lid; | |
2598 | |||
2599 | if (!jfs_ip->atlhead) /* empty list: nothing to do (checked before taking TXN_LOCK) */ | |
2600 | return; | |
2601 | |||
2602 | TXN_LOCK(); | |
2603 | xtlck = (struct tlock *) &jfs_ip->atlhead; /* treat the list head as a pseudo-tlock so xtlck->next aliases atlhead; NOTE(review): presumably relies on next being the first member of struct tlock - verify */ | |
2604 | |||
2605 | while ((lid = xtlck->next) != 0) { | |
2606 | tlck = lid_to_tlock(lid); | |
2607 | if (tlck->flag & tlckFREELOCK) { | |
2608 | xtlck->next = tlck->next; /* unlink from the list, then free */ | |
2609 | txLockFree(lid); | |
2610 | } else { | |
2611 | xtlck = tlck; /* keep this one; remember it as the last surviving tlock */ | |
2612 | xlid = lid; | |
2613 | } | |
2614 | } | |
2615 | |||
2616 | if (jfs_ip->atlhead) | |
2617 | jfs_ip->atltail = xlid; | |
2618 | else { | |
2619 | jfs_ip->atltail = 0; | |
2620 | /* | |
2621 | * If inode was on anon_list, remove it | |
2622 | */ | |
2623 | list_del_init(&jfs_ip->anon_inode_list); | |
2624 | } | |
2625 | TXN_UNLOCK(); | |
2626 | } | |
2627 | |||
2628 | |||
2629 | /* | |
2630 | * txAbort() | |
2631 | * | |
2632 | * function: abort tx before commit; | |
2633 | * | |
2634 | * frees every tlock of transaction tid and clears xtlid of each | |
2635 | * tlock's inode; for a tlock with a metapage, clears mp->lid and calls LogSyncRelease() when the page is COMMIT_PAGE with a non-zero lsn (to avoid logwrap). | |
2636 | * If dirty is non-zero, reports the abort through jfs_error() to mark the filesystem dirty. | |
2637 | * The tblock itself is not freed here: its tlock list is emptied | |
2638 | * (next = last = 0) and the caller frees the transaction block. | |
2639 | */ | |
2640 | void txAbort(tid_t tid, int dirty) | |
2641 | { | |
2642 | lid_t lid, next; | |
2643 | struct metapage *mp; | |
2644 | struct tblock *tblk = tid_to_tblock(tid); | |
2645 | struct tlock *tlck; | |
2646 | |||
2647 | /* | |
2648 | * free tlocks of the transaction | |
2649 | */ | |
2650 | for (lid = tblk->next; lid; lid = next) { | |
2651 | tlck = lid_to_tlock(lid); | |
2652 | next = tlck->next; /* saved before the tlock is freed below */ | |
2653 | mp = tlck->mp; | |
2654 | JFS_IP(tlck->ip)->xtlid = 0; | |
2655 | |||
2656 | if (mp) { | |
2657 | mp->lid = 0; | |
2658 | |||
2659 | /* | |
2660 | * reset lsn of page to avoid logwrap: | |
2661 | * | |
2662 | * (page may have been previously committed by another | |
2663 | * transaction(s) but has not been paged, i.e., | |
2664 | * it may be on logsync list even though it has not | |
2665 | * been logged for the current tx.) | |
2666 | */ | |
2667 | if (mp->xflag & COMMIT_PAGE && mp->lsn) | |
2668 | LogSyncRelease(mp); | |
2669 | } | |
2670 | /* insert tlock at head of freelist */ | |
2671 | TXN_LOCK(); | |
2672 | txLockFree(lid); | |
2673 | TXN_UNLOCK(); | |
2674 | } | |
2675 | |||
2676 | /* caller will free the transaction block */ | |
2677 | |||
2678 | tblk->next = tblk->last = 0; | |
2679 | |||
2680 | /* | |
2681 | * mark filesystem dirty | |
2682 | */ | |
2683 | if (dirty) | |
2684 | jfs_error(tblk->sb, "txAbort"); | |
2685 | |||
2686 | return; | |
2687 | } | |
2688 | |||
2689 | /* | |
2690 | * txLazyCommit(tblk) | |
2691 | * | |
2692 | * All transactions except those changing ipimap (COMMIT_FORCE) are | |
2693 | * processed by this routine. This ensures that the inode and block | |
2694 | * allocation maps are updated in order. For synchronous transactions, | |
2695 | * let the user thread finish processing after txUpdateMap() is called; for tblkGC_LAZY transactions this routine also calls txUnlock() and txEnd(). | |
2696 | */ | |
2697 | static void txLazyCommit(struct tblock * tblk) | |
2698 | { | |
2699 | struct jfs_log *log; | |
2700 | |||
2701 | while (((tblk->flag & tblkGC_READY) == 0) && | |
2702 | ((tblk->flag & tblkGC_UNLOCKED) == 0)) { | |
2703 | /* We must have gotten ahead of the user thread: yield until either flag is set | |
2704 | */ | |
2705 | jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk); | |
2706 | yield(); | |
2707 | } | |
2708 | |||
2709 | jfs_info("txLazyCommit: processing tblk 0x%p", tblk); | |
2710 | |||
2711 | txUpdateMap(tblk); | |
2712 | |||
2713 | log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; | |
2714 | |||
2715 | spin_lock_irq(&log->gclock); // LOGGC_LOCK | |
2716 | |||
2717 | tblk->flag |= tblkGC_COMMITTED; | |
2718 | |||
2719 | if (tblk->flag & tblkGC_READY) | |
2720 | log->gcrtc--; | |
2721 | |||
2722 | wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP | |
2723 | |||
2724 | /* | |
2725 | * Can't release log->gclock until we've tested tblk->flag | |
2726 | */ | |
2727 | if (tblk->flag & tblkGC_LAZY) { | |
2728 | spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK | |
2729 | txUnlock(tblk); | |
2730 | tblk->flag &= ~tblkGC_LAZY; | |
2731 | txEnd(tblk - TxBlock); /* Convert back to tid: index of tblk within the TxBlock array */ | |
2732 | } else | |
2733 | spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK | |
2734 | |||
2735 | jfs_info("txLazyCommit: done: tblk = 0x%p", tblk); | |
2736 | } | |
2737 | |||
2738 | /* | |
2739 | * jfs_lazycommit(arg) - arg is not used | |
2740 | * | |
2741 | * To be run as a kernel daemon. If lbmIODone is called in an interrupt | |
2742 | * context, or where blocking is not wanted, this routine will process | |
2743 | * committed transactions from the unlock queue (TxAnchor.unlock_queue), then sleep on jfs_commit_thread_wait until jfs_stop_threads is set. | |
2744 | */ | |
2745 | int jfs_lazycommit(void *arg) | |
2746 | { | |
2747 | int WorkDone; | |
2748 | struct tblock *tblk; | |
2749 | unsigned long flags; | |
2750 | struct jfs_sb_info *sbi; | |
2751 | |||
2752 | daemonize("jfsCommit"); | |
2753 | |||
2754 | complete(&jfsIOwait); | |
2755 | |||
2756 | do { | |
2757 | LAZY_LOCK(flags); | |
2758 | jfs_commit_thread_waking = 0; /* OK to wake another thread */ | |
2759 | while (!list_empty(&TxAnchor.unlock_queue)) { | |
2760 | WorkDone = 0; | |
2761 | list_for_each_entry(tblk, &TxAnchor.unlock_queue, | |
2762 | cqueue) { | |
2763 | |||
2764 | sbi = JFS_SBI(tblk->sb); | |
2765 | /* | |
2766 | * For each volume, the transactions must be | |
2767 | * handled in order. If another commit thread | |
2768 | * is handling a tblk for this superblock, | |
2769 | * skip it | |
2770 | */ | |
2771 | if (sbi->commit_state & IN_LAZYCOMMIT) | |
2772 | continue; | |
2773 | |||
2774 | sbi->commit_state |= IN_LAZYCOMMIT; | |
2775 | WorkDone = 1; | |
2776 | |||
2777 | /* | |
2778 | * Remove transaction from queue | |
2779 | */ | |
2780 | list_del(&tblk->cqueue); | |
2781 | |||
2782 | LAZY_UNLOCK(flags); /* the lazy lock is dropped while the transaction is processed */ | |
2783 | txLazyCommit(tblk); | |
2784 | LAZY_LOCK(flags); | |
2785 | |||
2786 | sbi->commit_state &= ~IN_LAZYCOMMIT; | |
2787 | /* | |
2788 | * Don't continue in the for loop. (We can't | |
2789 | * anyway, it's unsafe!) We want to go back to | |
2790 | * the beginning of the list. | |
2791 | */ | |
2792 | break; | |
2793 | } | |
2794 | |||
2795 | /* If there was nothing to do, don't continue */ | |
2796 | if (!WorkDone) | |
2797 | break; | |
2798 | } | |
2799 | /* In case a wakeup came while all threads were active */ | |
2800 | jfs_commit_thread_waking = 0; | |
2801 | |||
2802 | if (current->flags & PF_FREEZE) { | |
2803 | LAZY_UNLOCK(flags); | |
2804 | refrigerator(PF_FREEZE); | |
2805 | } else { | |
2806 | DECLARE_WAITQUEUE(wq, current); | |
2807 | |||
2808 | add_wait_queue(&jfs_commit_thread_wait, &wq); /* queued and marked interruptible before the lock is dropped, then sleep */ | |
2809 | set_current_state(TASK_INTERRUPTIBLE); | |
2810 | LAZY_UNLOCK(flags); | |
2811 | schedule(); | |
2812 | current->state = TASK_RUNNING; | |
2813 | remove_wait_queue(&jfs_commit_thread_wait, &wq); | |
2814 | } | |
2815 | } while (!jfs_stop_threads); | |
2816 | |||
2817 | if (!list_empty(&TxAnchor.unlock_queue)) | |
2818 | jfs_err("jfs_lazycommit being killed w/pending transactions!"); | |
2819 | else | |
2820 | jfs_info("jfs_lazycommit being killed\n"); | |
2821 | complete_and_exit(&jfsIOwait, 0); | |
2822 | } | |
2823 | /* txLazyUnlock(): under LAZY_LOCK, append tblk to the tail of TxAnchor.unlock_queue and, when the conditions below hold, wake a waiter on jfs_commit_thread_wait */ |||
2824 | void txLazyUnlock(struct tblock * tblk) | |
2825 | { | |
2826 | unsigned long flags; | |
2827 | |||
2828 | LAZY_LOCK(flags); | |
2829 | |||
2830 | list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue); | |
2831 | /* | |
2832 | * Don't wake up a commit thread if there is already one servicing | |
2833 | * this superblock, or if the last one we woke up hasn't started yet | |
2834 | * (jfs_commit_thread_waking is still set; jfs_lazycommit() clears it). | |
2834 | */ | |
2835 | if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) && | |
2836 | !jfs_commit_thread_waking) { | |
2837 | jfs_commit_thread_waking = 1; | |
2838 | wake_up(&jfs_commit_thread_wait); | |
2839 | } | |
2840 | LAZY_UNLOCK(flags); | |
2841 | } | |
2842 | /* LogSyncRelease(): drop one nohomeok count on mp; when the count reaches zero, detach mp from its log's sync list (clearing log, lsn and clsn) and release the metapage */ |||
2843 | static void LogSyncRelease(struct metapage * mp) | |
2844 | { | |
2845 | struct jfs_log *log = mp->log; | |
2846 | |||
2847 | assert(atomic_read(&mp->nohomeok)); | |
2848 | assert(log); | |
2849 | atomic_dec(&mp->nohomeok); | |
2850 | |||
2851 | if (atomic_read(&mp->nohomeok)) /* other nohomeok counts remain: leave the page on the sync list */ | |
2852 | return; | |
2853 | |||
2854 | hold_metapage(mp, 0); | |
2855 | |||
2856 | LOGSYNC_LOCK(log); | |
2857 | mp->log = NULL; | |
2858 | mp->lsn = 0; | |
2859 | mp->clsn = 0; | |
2860 | log->count--; | |
2861 | list_del_init(&mp->synclist); | |
2862 | LOGSYNC_UNLOCK(log); | |
2863 | |||
2864 | release_metapage(mp); | |
2865 | } | |
2866 | |||
2867 | /* | |
2868 | * txQuiesce | |
2869 | * | |
2870 | * Block all new transactions (by setting log_QUIESCE in log->flag) and push anonymous transactions to | |
2871 | * completion by committing every inode on TxAnchor.anon_list, then flush the journal | |
2872 | * | |
2873 | * This does almost the same thing as jfs_sync below. We don't | |
2874 | * worry about deadlocking when jfs_tlocks_low is set, since we would | |
2875 | * expect jfs_sync to get us out of that jam. | |
2876 | */ | |
2877 | void txQuiesce(struct super_block *sb) | |
2878 | { | |
2879 | struct inode *ip; | |
2880 | struct jfs_inode_info *jfs_ip; | |
2881 | struct jfs_log *log = JFS_SBI(sb)->log; | |
2882 | tid_t tid; | |
2883 | |||
2884 | set_bit(log_QUIESCE, &log->flag); | |
2885 | |||
2886 | TXN_LOCK(); | |
2887 | restart: | |
2888 | while (!list_empty(&TxAnchor.anon_list)) { | |
2889 | jfs_ip = list_entry(TxAnchor.anon_list.next, | |
2890 | struct jfs_inode_info, | |
2891 | anon_inode_list); | |
2892 | ip = &jfs_ip->vfs_inode; | |
2893 | |||
2894 | /* | |
2895 | * inode will be removed from anonymous list | |
2896 | * when it is committed | |
2897 | */ | |
2898 | TXN_UNLOCK(); | |
2899 | tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE); | |
2900 | down(&jfs_ip->commit_sem); | |
2901 | txCommit(tid, 1, &ip, 0); /* return value is not checked */ | |
2902 | txEnd(tid); | |
2903 | up(&jfs_ip->commit_sem); | |
2904 | /* | |
2905 | * Just to be safe. I don't know how | |
2906 | * long we can run without blocking | |
2907 | */ | |
2908 | cond_resched(); | |
2909 | TXN_LOCK(); | |
2910 | } | |
2911 | |||
2912 | /* | |
2913 | * If jfs_sync is running in parallel, there could be some inodes | |
2914 | * on anon_list2. Let's check, and if so move them to anon_list and run the loop again. | |
2915 | */ | |
2916 | if (!list_empty(&TxAnchor.anon_list2)) { | |
2917 | list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); | |
2918 | INIT_LIST_HEAD(&TxAnchor.anon_list2); | |
2919 | goto restart; | |
2920 | } | |
2921 | TXN_UNLOCK(); | |
2922 | |||
2923 | /* | |
2924 | * We may need to kick off the group commit | |
2925 | */ | |
2926 | jfs_flush_journal(log, 0); | |
2927 | } | |
2928 | |||
2929 | /* | |
2930 | * txResume() | |
2931 | * | |
2932 | * Allows transactions to start again following txQuiesce: clears log_QUIESCE in the log flags of sb and wakes waiters on log->syncwait | |
2933 | */ | |
2934 | void txResume(struct super_block *sb) | |
2935 | { | |
2936 | struct jfs_log *log = JFS_SBI(sb)->log; | |
2937 | |||
2938 | clear_bit(log_QUIESCE, &log->flag); | |
2939 | TXN_WAKEUP(&log->syncwait); | |
2940 | } | |
2941 | |||
2942 | /* | |
2943 | * jfs_sync(arg) - arg is not used | |
2944 | * | |
2945 | * To be run as a kernel daemon. This is awakened when tlocks run low. | |
2946 | * We write any inodes that have anonymous tlocks so they will become | |
2947 | * available. Loops, sleeping on jfs_sync_thread_wait, until jfs_stop_threads is set. | |
2948 | */ | |
2949 | int jfs_sync(void *arg) | |
2950 | { | |
2951 | struct inode *ip; | |
2952 | struct jfs_inode_info *jfs_ip; | |
2953 | int rc; | |
2954 | tid_t tid; | |
2955 | |||
2956 | daemonize("jfsSync"); | |
2957 | |||
2958 | complete(&jfsIOwait); | |
2959 | |||
2960 | do { | |
2961 | /* | |
2962 | * write each inode on the anonymous inode list (only while jfs_tlocks_low is set) | |
2963 | */ | |
2964 | TXN_LOCK(); | |
2965 | while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) { | |
2966 | jfs_ip = list_entry(TxAnchor.anon_list.next, | |
2967 | struct jfs_inode_info, | |
2968 | anon_inode_list); | |
2969 | ip = &jfs_ip->vfs_inode; | |
2970 | |||
2971 | if (! igrab(ip)) { | |
2972 | /* | |
2973 | * Inode is being freed | |
2974 | */ | |
2975 | list_del_init(&jfs_ip->anon_inode_list); | |
2976 | } else if (! down_trylock(&jfs_ip->commit_sem)) { | |
2977 | /* | |
2978 | * inode will be removed from anonymous list | |
2979 | * when it is committed | |
2980 | */ | |
2981 | TXN_UNLOCK(); | |
2982 | tid = txBegin(ip->i_sb, COMMIT_INODE); | |
2983 | rc = txCommit(tid, 1, &ip, 0); /* rc is stored but never read in this function */ | |
2984 | txEnd(tid); | |
2985 | up(&jfs_ip->commit_sem); | |
2986 | |||
2987 | iput(ip); /* drop the reference taken by igrab() above */ | |
2988 | /* | |
2989 | * Just to be safe. I don't know how | |
2990 | * long we can run without blocking | |
2991 | */ | |
2992 | cond_resched(); | |
2993 | TXN_LOCK(); | |
2994 | } else { | |
2995 | /* We can't get the commit semaphore. It may | |
2996 | * be held by a thread waiting for tlock's | |
2997 | * so let's not block here. Save it to | |
2998 | * put back on the anon_list. | |
2999 | */ | |
3000 | |||
3001 | /* Take off anon_list */ | |
3002 | list_del(&jfs_ip->anon_inode_list); | |
3003 | |||
3004 | /* Put on anon_list2 */ | |
3005 | list_add(&jfs_ip->anon_inode_list, | |
3006 | &TxAnchor.anon_list2); | |
3007 | |||
3008 | TXN_UNLOCK(); | |
3009 | iput(ip); /* drop the igrab() reference with TXN_LOCK released */ | |
3010 | TXN_LOCK(); | |
3011 | } | |
3012 | } | |
3013 | /* Add anon_list2 back to anon_list */ | |
3014 | list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); | |
3015 | |||
3016 | if (current->flags & PF_FREEZE) { | |
3017 | TXN_UNLOCK(); | |
3018 | refrigerator(PF_FREEZE); | |
3019 | } else { | |
3020 | DECLARE_WAITQUEUE(wq, current); | |
3021 | |||
3022 | add_wait_queue(&jfs_sync_thread_wait, &wq); /* queued and marked interruptible before TXN_LOCK is dropped, then sleep */ | |
3023 | set_current_state(TASK_INTERRUPTIBLE); | |
3024 | TXN_UNLOCK(); | |
3025 | schedule(); | |
3026 | current->state = TASK_RUNNING; | |
3027 | remove_wait_queue(&jfs_sync_thread_wait, &wq); | |
3028 | } | |
3029 | } while (!jfs_stop_threads); | |
3030 | |||
3031 | jfs_info("jfs_sync being killed"); | |
3032 | complete_and_exit(&jfsIOwait, 0); | |
3033 | } | |
3034 | /* jfs_txanchor_read(): format a text snapshot of TxAnchor into buffer; returns the number of bytes available from offset, clamped to the range 0..length, and sets *eof when the remainder fits in length. The data argument is not used. Built only when CONFIG_PROC_FS and CONFIG_JFS_DEBUG are both defined. */ |||
3035 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) | |
3036 | int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, | |
3037 | int *eof, void *data) | |
3038 | { | |
3039 | int len = 0; | |
3040 | off_t begin; | |
3041 | char *freewait; | |
3042 | char *freelockwait; | |
3043 | char *lowlockwait; | |
3044 | |||
3045 | freewait = | |
3046 | waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; | |
3047 | freelockwait = | |
3048 | waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; | |
3049 | lowlockwait = | |
3050 | waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; | |
3051 | |||
3052 | len += sprintf(buffer, | |
3053 | "JFS TxAnchor\n" | |
3054 | "============\n" | |
3055 | "freetid = %d\n" | |
3056 | "freewait = %s\n" | |
3057 | "freelock = %d\n" | |
3058 | "freelockwait = %s\n" | |
3059 | "lowlockwait = %s\n" | |
3060 | "tlocksInUse = %d\n" | |
3061 | "jfs_tlocks_low = %d\n" | |
3062 | "unlock_queue is %sempty\n", | |
3063 | TxAnchor.freetid, | |
3064 | freewait, | |
3065 | TxAnchor.freelock, | |
3066 | freelockwait, | |
3067 | lowlockwait, | |
3068 | TxAnchor.tlocksInUse, | |
3069 | jfs_tlocks_low, | |
3070 | list_empty(&TxAnchor.unlock_queue) ? "" : "not "); | |
3071 | |||
3072 | begin = offset; | |
3073 | *start = buffer + begin; /* caller is pointed at the requested offset inside the formatted text */ | |
3074 | len -= begin; /* bytes remaining after offset; may go negative, fixed up below */ | |
3075 | |||
3076 | if (len > length) | |
3077 | len = length; | |
3078 | else | |
3079 | *eof = 1; | |
3080 | |||
3081 | if (len < 0) | |
3082 | len = 0; | |
3083 | |||
3084 | return len; | |
3085 | } | |
3086 | #endif | |
3087 | /* jfs_txstats_read(): format the TxStat counters into buffer; returns the number of bytes available from offset, clamped to the range 0..length, and sets *eof when the remainder fits in length. The data argument is not used. Built only when CONFIG_PROC_FS and CONFIG_JFS_STATISTICS are both defined. */ |||
3088 | #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) | |
3089 | int jfs_txstats_read(char *buffer, char **start, off_t offset, int length, | |
3090 | int *eof, void *data) | |
3091 | { | |
3092 | int len = 0; | |
3093 | off_t begin; | |
3094 | |||
3095 | len += sprintf(buffer, | |
3096 | "JFS TxStats\n" | |
3097 | "===========\n" | |
3098 | "calls to txBegin = %d\n" | |
3099 | "txBegin blocked by sync barrier = %d\n" | |
3100 | "txBegin blocked by tlocks low = %d\n" | |
3101 | "txBegin blocked by no free tid = %d\n" | |
3102 | "calls to txBeginAnon = %d\n" | |
3103 | "txBeginAnon blocked by sync barrier = %d\n" | |
3104 | "txBeginAnon blocked by tlocks low = %d\n" | |
3105 | "calls to txLockAlloc = %d\n" | |
3106 | "tLockAlloc blocked by no free lock = %d\n", | |
3107 | TxStat.txBegin, | |
3108 | TxStat.txBegin_barrier, | |
3109 | TxStat.txBegin_lockslow, | |
3110 | TxStat.txBegin_freetid, | |
3111 | TxStat.txBeginAnon, | |
3112 | TxStat.txBeginAnon_barrier, | |
3113 | TxStat.txBeginAnon_lockslow, | |
3114 | TxStat.txLockAlloc, | |
3115 | TxStat.txLockAlloc_freelock); | |
3116 | |||
3117 | begin = offset; | |
3118 | *start = buffer + begin; /* caller is pointed at the requested offset inside the formatted text */ | |
3119 | len -= begin; /* bytes remaining after offset; may go negative, fixed up below */ | |
3120 | |||
3121 | if (len > length) | |
3122 | len = length; | |
3123 | else | |
3124 | *eof = 1; | |
3125 | |||
3126 | if (len < 0) | |
3127 | len = 0; | |
3128 | |||
3129 | return len; | |
3130 | } | |
3131 | #endif | |
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h new file mode 100644 index 000000000000..b71b82c2df04 --- /dev/null +++ b/fs/jfs/jfs_txnmgr.h | |||
@@ -0,0 +1,318 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_TXNMGR | ||
19 | #define _H_JFS_TXNMGR | ||
20 | |||
21 | #include "jfs_logmgr.h" | ||
22 | |||
23 | /* | ||
24 | * Hide implementation of TxBlock and TxLock | ||
25 | */ | ||
26 | #define tid_to_tblock(tid) (&TxBlock[tid]) | ||
27 | |||
28 | #define lid_to_tlock(lid) (&TxLock[lid]) | ||
29 | |||
30 | /* | ||
31 | * transaction block | ||
32 | */ | ||
33 | struct tblock { | ||
34 | /* | ||
35 | * tblock and jbuf_t common area: struct logsyncblk | ||
36 | * | ||
37 | * the following 5 fields are the same as struct logsyncblk | ||
38 | * which is common to tblock and jbuf to form logsynclist | ||
39 | */ | ||
40 | u16 xflag; /* tx commit type */ | ||
41 | u16 flag; /* tx commit state */ | ||
42 | lid_t dummy; /* Must keep structures common */ | ||
43 | s32 lsn; /* recovery lsn */ | ||
44 | struct list_head synclist; /* logsynclist link */ | ||
45 | |||
46 | /* lock management */ | ||
47 | struct super_block *sb; /* super block */ | ||
48 | lid_t next; /* index of first tlock of tid */ | ||
49 | lid_t last; /* index of last tlock of tid */ | ||
50 | wait_queue_head_t waitor; /* tids waiting on this tid */ | ||
51 | |||
52 | /* log management */ | ||
53 | u32 logtid; /* log transaction id */ | ||
54 | |||
55 | /* commit management */ | ||
56 | struct list_head cqueue; /* commit queue list */ | ||
57 | s32 clsn; /* commit lsn */ | ||
58 | struct lbuf *bp; | ||
59 | s32 pn; /* commit record log page number */ | ||
60 | s32 eor; /* commit record eor */ | ||
61 | wait_queue_head_t gcwait; /* group commit event list: | ||
62 | * ready transactions wait on this | ||
63 | * event for group commit completion. | ||
64 | */ | ||
65 | union { | ||
66 | struct inode *ip; /* inode being deleted */ | ||
67 | pxd_t ixpxd; /* pxd of inode extent for created inode */ | ||
68 | } u; | ||
69 | u32 ino; /* inode number being created */ | ||
70 | }; | ||
71 | |||
72 | extern struct tblock *TxBlock; /* transaction block table */ | ||
73 | |||
74 | /* commit flags: tblk->xflag */ | ||
75 | #define COMMIT_SYNC 0x0001 /* synchronous commit */ | ||
76 | #define COMMIT_FORCE 0x0002 /* force pageout at end of commit */ | ||
77 | #define COMMIT_FLUSH 0x0004 /* init flush at end of commit */ | ||
78 | #define COMMIT_MAP 0x00f0 | ||
79 | #define COMMIT_PMAP 0x0010 /* update pmap */ | ||
80 | #define COMMIT_WMAP 0x0020 /* update wmap */ | ||
81 | #define COMMIT_PWMAP 0x0040 /* update pwmap */ | ||
82 | #define COMMIT_FREE 0x0f00 | ||
83 | #define COMMIT_DELETE 0x0100 /* inode delete */ | ||
84 | #define COMMIT_TRUNCATE 0x0200 /* file truncation */ | ||
85 | #define COMMIT_CREATE 0x0400 /* inode create */ | ||
86 | #define COMMIT_LAZY 0x0800 /* lazy commit */ | ||
87 | #define COMMIT_PAGE 0x1000 /* Identifies element as metapage */ | ||
88 | #define COMMIT_INODE 0x2000 /* Identifies element as inode */ | ||
89 | |||
90 | /* group commit flags tblk->flag: see jfs_logmgr.h */ | ||
91 | |||
92 | /* | ||
93 | * transaction lock | ||
94 | */ | ||
95 | struct tlock { | ||
96 | lid_t next; /* 2: index next lockword on tid locklist | ||
97 | * next lockword on freelist | ||
98 | */ | ||
99 | tid_t tid; /* 2: transaction id holding lock */ | ||
100 | |||
101 | u16 flag; /* 2: lock control */ | ||
102 | u16 type; /* 2: log type */ | ||
103 | |||
104 | struct metapage *mp; /* 4/8: object page buffer locked */ | ||
105 | struct inode *ip; /* 4/8: object */ | ||
106 | /* (16) when pointers are 4 bytes */ | ||
107 | |||
108 | s16 lock[24]; /* 48: overlay area; the linelock and maplock variants are laid over it */ | ||
109 | }; /* (64) when pointers are 4 bytes */ | ||
110 | |||
111 | extern struct tlock *TxLock; /* transaction lock table */ | ||
112 | |||
113 | /* | ||
114 | * tlock flag | ||
115 | */ | ||
116 | /* txLock state */ | ||
117 | #define tlckPAGELOCK 0x8000 | ||
118 | #define tlckINODELOCK 0x4000 | ||
119 | #define tlckLINELOCK 0x2000 | ||
120 | #define tlckINLINELOCK 0x1000 | ||
121 | /* lmLog state */ | ||
122 | #define tlckLOG 0x0800 | ||
123 | /* updateMap state */ | ||
124 | #define tlckUPDATEMAP 0x0080 | ||
125 | /* freeLock state */ | ||
126 | #define tlckFREELOCK 0x0008 | ||
127 | #define tlckWRITEPAGE 0x0004 | ||
128 | #define tlckFREEPAGE 0x0002 | ||
129 | |||
130 | /* | ||
131 | * tlock type | ||
132 | */ | ||
133 | #define tlckTYPE 0xfe00 | ||
134 | #define tlckINODE 0x8000 | ||
135 | #define tlckXTREE 0x4000 | ||
136 | #define tlckDTREE 0x2000 | ||
137 | #define tlckMAP 0x1000 | ||
138 | #define tlckEA 0x0800 | ||
139 | #define tlckACL 0x0400 | ||
140 | #define tlckDATA 0x0200 | ||
141 | #define tlckBTROOT 0x0100 | ||
142 | |||
143 | #define tlckOPERATION 0x00ff | ||
144 | #define tlckGROW 0x0001 /* file grow */ | ||
145 | #define tlckREMOVE 0x0002 /* file delete */ | ||
146 | #define tlckTRUNCATE 0x0004 /* file truncate */ | ||
147 | #define tlckRELOCATE 0x0008 /* file/directory relocate */ | ||
148 | #define tlckENTRY 0x0001 /* directory insert/delete */ | ||
149 | #define tlckEXTEND 0x0002 /* directory extend in-line */ | ||
150 | #define tlckSPLIT 0x0010 /* split page */ | ||
151 | #define tlckNEW 0x0020 /* new page from split */ | ||
152 | #define tlckFREE 0x0040 /* free page */ | ||
153 | #define tlckRELINK 0x0080 /* update sibling pointer */ | ||
154 | |||
155 | /* | ||
156 | * linelock for lmLog() | ||
157 | * | ||
158 | * note: linelock and its variations are overlaid | ||
159 | * at tlock.lock: watch for alignment; | ||
160 | */ | ||
161 | struct lv { | ||
162 | u8 offset; /* 1: */ | ||
163 | u8 length; /* 1: */ | ||
164 | }; /* (2) */ | ||
165 | |||
166 | #define TLOCKSHORT 20 | ||
167 | #define TLOCKLONG 28 | ||
168 | |||
169 | struct linelock { | ||
170 | lid_t next; /* 2: next linelock */ | ||
171 | |||
172 | s8 maxcnt; /* 1: */ | ||
173 | s8 index; /* 1: */ | ||
174 | |||
175 | u16 flag; /* 2: */ | ||
176 | u8 type; /* 1: */ | ||
177 | u8 l2linesize; /* 1: log2 of linesize */ | ||
178 | /* (8) */ | ||
179 | |||
180 | struct lv lv[20]; /* 40: */ | ||
181 | }; /* (48) */ | ||
182 | |||
183 | #define dt_lock linelock | ||
184 | |||
185 | struct xtlock { | ||
186 | lid_t next; /* 2: */ | ||
187 | |||
188 | s8 maxcnt; /* 1: */ | ||
189 | s8 index; /* 1: */ | ||
190 | |||
191 | u16 flag; /* 2: */ | ||
192 | u8 type; /* 1: */ | ||
193 | u8 l2linesize; /* 1: log2 of linesize */ | ||
194 | /* (8) */ | ||
195 | |||
196 | struct lv header; /* 2: */ | ||
197 | struct lv lwm; /* 2: low water mark */ | ||
198 | struct lv hwm; /* 2: high water mark */ | ||
199 | struct lv twm; /* 2: */ | ||
200 | /* (16) */ | ||
201 | |||
202 | s32 pxdlock[8]; /* 32: */ | ||
203 | }; /* (48) */ | ||
204 | |||
205 | |||
206 | /* | ||
207 | * maplock for txUpdateMap() | ||
208 | * | ||
209 | * note: maplock and its variations are overlaid | ||
210 | * at tlock.lock/linelock: watch for alignment; | ||
211 | * N.B. next field may be set by linelock, and should not | ||
212 | * be modified by maplock; | ||
213 | * N.B. index of the first pxdlock specifies index of next | ||
214 | * free maplock (i.e., number of maplock) in the tlock; | ||
215 | */ | ||
216 | struct maplock { | ||
217 | lid_t next; /* 2: */ | ||
218 | |||
219 | u8 maxcnt; /* 1: */ | ||
220 | u8 index; /* 1: next free maplock index */ | ||
221 | |||
222 | u16 flag; /* 2: */ | ||
223 | u8 type; /* 1: */ | ||
224 | u8 count; /* 1: number of pxd/xad */ | ||
225 | /* (8) */ | ||
226 | |||
227 | pxd_t pxd; /* 8: */ | ||
228 | }; /* (16): */ | ||
229 | |||
230 | /* maplock flag */ | ||
231 | #define mlckALLOC 0x00f0 | ||
232 | #define mlckALLOCXADLIST 0x0080 | ||
233 | #define mlckALLOCPXDLIST 0x0040 | ||
234 | #define mlckALLOCXAD 0x0020 | ||
235 | #define mlckALLOCPXD 0x0010 | ||
236 | #define mlckFREE 0x000f | ||
237 | #define mlckFREEXADLIST 0x0008 | ||
238 | #define mlckFREEPXDLIST 0x0004 | ||
239 | #define mlckFREEXAD 0x0002 | ||
240 | #define mlckFREEPXD 0x0001 | ||
241 | |||
242 | #define pxd_lock maplock | ||
243 | |||
244 | struct xdlistlock { | ||
245 | lid_t next; /* 2: */ | ||
246 | |||
247 | u8 maxcnt; /* 1: */ | ||
248 | u8 index; /* 1: */ | ||
249 | |||
250 | u16 flag; /* 2: */ | ||
251 | u8 type; /* 1: */ | ||
252 | u8 count; /* 1: number of pxd/xad */ | ||
253 | /* (8) */ | ||
254 | |||
255 | /* | ||
256 | * We need xdlist to be 64 bits (8 bytes), regardless of | ||
257 | * whether void * is 32 or 64 bits | ||
258 | */ | ||
259 | union { | ||
260 | void *_xdlist; /* pxd/xad list */ | ||
261 | s64 pad; /* 8: Force 64-bit xdlist size */ | ||
262 | } union64; | ||
263 | }; /* (16): */ | ||
264 | |||
265 | #define xdlist union64._xdlist | ||
266 | |||
267 | /* | ||
268 | * commit | ||
269 | * | ||
270 | * parameter to the commit manager routines | ||
271 | */ | ||
272 | struct commit { | ||
273 | tid_t tid; /* tid = index of tblock */ | ||
274 | int flag; /* flags */ | ||
275 | struct jfs_log *log; /* log */ | ||
276 | struct super_block *sb; /* superblock */ | ||
277 | |||
278 | int nip; /* number of entries in iplist */ | ||
279 | struct inode **iplist; /* list of pointers to inodes */ | ||
280 | |||
281 | /* log record descriptor on 64-bit boundary */ | ||
282 | struct lrd lrd; /* : log record descriptor */ | ||
283 | }; | ||
284 | |||
285 | /* | ||
286 | * external declarations | ||
287 | */ | ||
288 | extern struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage *mp, | ||
289 | int flag); | ||
290 | |||
291 | extern struct tlock *txMaplock(tid_t tid, struct inode *ip, int flag); | ||
292 | |||
293 | extern int txCommit(tid_t tid, int nip, struct inode **iplist, int flag); | ||
294 | |||
295 | extern tid_t txBegin(struct super_block *sb, int flag); | ||
296 | |||
297 | extern void txBeginAnon(struct super_block *sb); | ||
298 | |||
299 | extern void txEnd(tid_t tid); | ||
300 | |||
301 | extern void txAbort(tid_t tid, int dirty); | ||
302 | |||
303 | extern struct linelock *txLinelock(struct linelock * tlock); | ||
304 | |||
305 | extern void txFreeMap(struct inode *ip, struct maplock * maplock, | ||
306 | struct tblock * tblk, int maptype); | ||
307 | |||
308 | extern void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea); | ||
309 | |||
310 | extern void txFreelock(struct inode *ip); | ||
311 | |||
312 | extern int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, | ||
313 | struct tlock * tlck); | ||
314 | |||
315 | extern void txQuiesce(struct super_block *sb); | ||
316 | |||
317 | extern void txResume(struct super_block *sb); | ||
318 | #endif /* _H_JFS_TXNMGR */ | ||
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h new file mode 100644 index 000000000000..5bfad39a2078 --- /dev/null +++ b/fs/jfs/jfs_types.h | |||
@@ -0,0 +1,192 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_TYPES | ||
19 | #define _H_JFS_TYPES | ||
20 | |||
21 | /* | ||
22 | * jfs_types.h: | ||
23 | * | ||
24 | * basic type/utility definitions | ||
25 | * | ||
26 | * note: this header file must be the first file in the | ||
27 | * JFS include list of every JFS .c file. | ||
28 | */ | ||
29 | |||
30 | #include <linux/types.h> | ||
31 | #include <linux/nls.h> | ||
32 | |||
33 | #include "endian24.h" | ||
34 | |||
35 | /* | ||
36 | * transaction and lock id's | ||
37 | * | ||
38 | * Don't change these without carefully considering the impact on the | ||
39 | * size and alignment of all of the linelock variants | ||
40 | */ | ||
41 | typedef u16 tid_t; | ||
42 | typedef u16 lid_t; | ||
43 | |||
44 | /* | ||
45 | * Almost identical to Linux's timespec, but not quite | ||
46 | */ | ||
47 | struct timestruc_t { | ||
48 | __le32 tv_sec; | ||
49 | __le32 tv_nsec; | ||
50 | }; | ||
51 | |||
52 | /* | ||
53 | * handy | ||
54 | */ | ||
55 | |||
56 | #define LEFTMOSTONE 0x80000000 | ||
57 | #define HIGHORDER 0x80000000u /* high order bit on */ | ||
58 | #define ONES 0xffffffffu /* all bit on */ | ||
59 | |||
60 | typedef int boolean_t; | ||
61 | #define TRUE 1 | ||
62 | #define FALSE 0 | ||
63 | |||
64 | /* | ||
65 | * logical xd (lxd) | ||
66 | */ | ||
67 | typedef struct { | ||
68 | unsigned len:24; | ||
69 | unsigned off1:8; | ||
70 | u32 off2; | ||
71 | } lxd_t; | ||
72 | |||
/*
 * lxd_t field construction
 *
 * LXDlength stores the 24-bit extent length.
 * LXDoffset splits a 64-bit offset into the high byte (off1) and the
 * low 32 bits (off2).  It is wrapped in do { } while (0) so that it
 * behaves as a single statement, e.g. as the body of an if/else.
 */
#define LXDlength(lxd, length32)	( (lxd)->len = (length32) )
#define LXDoffset(lxd, offset64)\
do {\
	(lxd)->off1 = ((s64)(offset64)) >> 32;\
	(lxd)->off2 = (offset64) & 0xffffffff;\
} while (0)

/*
 * lxd_t field extraction
 *
 * offsetLXD reassembles the 64-bit offset from off1 and off2.
 */
#define lengthLXD(lxd)	( (lxd)->len )
#define offsetLXD(lxd)\
	( ((s64)((lxd)->off1)) << 32 | (lxd)->off2 )
85 | |||
86 | /* lxd list */ | ||
87 | struct lxdlist { | ||
88 | s16 maxnlxd; | ||
89 | s16 nlxd; | ||
90 | lxd_t *lxd; | ||
91 | }; | ||
92 | |||
93 | /* | ||
94 | * physical xd (pxd) | ||
95 | */ | ||
96 | typedef struct { | ||
97 | unsigned len:24; | ||
98 | unsigned addr1:8; | ||
99 | __le32 addr2; | ||
100 | } pxd_t; | ||
101 | |||
/*
 * pxd_t field construction
 *
 * PXDlength stores the 24-bit extent length in little-endian form.
 * PXDaddress splits a 64-bit block address into the high byte (addr1)
 * and the little-endian low 32 bits (addr2).  It is wrapped in
 * do { } while (0) so that it behaves as a single statement, e.g. as
 * the body of an if/else.
 */

#define PXDlength(pxd, length32)	((pxd)->len = __cpu_to_le24(length32))
#define PXDaddress(pxd, address64)\
do {\
	(pxd)->addr1 = ((s64)(address64)) >> 32;\
	(pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
} while (0)

/*
 * pxd_t field extraction
 *
 * addressPXD reassembles the 64-bit address from addr1 and addr2.
 */
#define lengthPXD(pxd)	__le24_to_cpu((pxd)->len)
#define addressPXD(pxd)\
	( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2))
115 | |||
116 | #define MAXTREEHEIGHT 8 | ||
117 | /* pxd list */ | ||
118 | struct pxdlist { | ||
119 | s16 maxnpxd; | ||
120 | s16 npxd; | ||
121 | pxd_t pxd[MAXTREEHEIGHT]; | ||
122 | }; | ||
123 | |||
124 | |||
125 | /* | ||
126 | * data extent descriptor (dxd) | ||
127 | */ | ||
128 | typedef struct { | ||
129 | unsigned flag:8; /* 1: flags */ | ||
130 | unsigned rsrvd:24; | ||
131 | __le32 size; /* 4: size in byte */ | ||
132 | unsigned len:24; /* 3: length in unit of fsblksize */ | ||
133 | unsigned addr1:8; /* 1: address in unit of fsblksize */ | ||
134 | __le32 addr2; /* 4: address in unit of fsblksize */ | ||
135 | } dxd_t; /* - 16 - */ | ||
136 | |||
137 | /* dxd_t flags */ | ||
138 | #define DXD_INDEX 0x80 /* B+-tree index */ | ||
139 | #define DXD_INLINE 0x40 /* in-line data extent */ | ||
140 | #define DXD_EXTENT 0x20 /* out-of-line single extent */ | ||
141 | #define DXD_FILE 0x10 /* out-of-line file (inode) */ | ||
142 | #define DXD_CORRUPT 0x08 /* Inconsistency detected */ | ||
143 | |||
144 | /* dxd_t field construction | ||
145 | * Conveniently, the PXD macros work for DXD | ||
146 | */ | ||
147 | #define DXDlength PXDlength | ||
148 | #define DXDaddress PXDaddress | ||
149 | #define lengthDXD lengthPXD | ||
150 | #define addressDXD addressPXD | ||
151 | #define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) | ||
152 | #define sizeDXD(dxd) le32_to_cpu((dxd)->size) | ||
153 | |||
154 | /* | ||
155 | * directory entry argument | ||
156 | */ | ||
157 | struct component_name { | ||
158 | int namlen; | ||
159 | wchar_t *name; | ||
160 | }; | ||
161 | |||
162 | |||
163 | /* | ||
164 | * DASD limit information - stored in directory inode | ||
165 | */ | ||
166 | struct dasd { | ||
167 | u8 thresh; /* Alert Threshold (in percent) */ | ||
168 | u8 delta; /* Alert Threshold delta (in percent) */ | ||
169 | u8 rsrvd1; | ||
170 | u8 limit_hi; /* DASD limit (in logical blocks) */ | ||
171 | __le32 limit_lo; /* DASD limit (in logical blocks) */ | ||
172 | u8 rsrvd2[3]; | ||
173 | u8 used_hi; /* DASD usage (in logical blocks) */ | ||
174 | __le32 used_lo; /* DASD usage (in logical blocks) */ | ||
175 | }; | ||
176 | |||
/*
 * DASD limit/usage accessors
 *
 * Each quantity is kept as a high byte (*_hi) plus a little-endian
 * low 32 bits (*_lo).  DASDLIMIT/DASDUSED reassemble the value;
 * setDASDLIMIT/setDASDUSED split and store it.  The setters are wrapped
 * in do { } while (0) so that they behave as a single statement, e.g.
 * as the body of an if/else.
 */
#define DASDLIMIT(dasdp) \
	(((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo))
#define setDASDLIMIT(dasdp, limit)\
do {\
	(dasdp)->limit_hi = ((u64)(limit)) >> 32;\
	(dasdp)->limit_lo = __cpu_to_le32(limit);\
} while (0)
#define DASDUSED(dasdp) \
	(((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo))
#define setDASDUSED(dasdp, used)\
do {\
	(dasdp)->used_hi = ((u64)(used)) >> 32;\
	(dasdp)->used_lo = __cpu_to_le32(used);\
} while (0)
191 | |||
192 | #endif /* !_H_JFS_TYPES */ | ||
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c new file mode 100644 index 000000000000..f31a9e3f3fec --- /dev/null +++ b/fs/jfs/jfs_umount.c | |||
@@ -0,0 +1,178 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * jfs_umount.c | ||
21 | * | ||
22 | * note: file system in transition to aggregate/fileset: | ||
23 | * (ref. jfs_mount.c) | ||
24 | * | ||
25 | * file system unmount is interpreted as unmount of the single/only | ||
26 | * fileset in the aggregate and, if unmount of the last fileset, | ||
27 | * as unmount of the aggregate; | ||
28 | */ | ||
29 | |||
30 | #include <linux/fs.h> | ||
31 | #include "jfs_incore.h" | ||
32 | #include "jfs_filsys.h" | ||
33 | #include "jfs_superblock.h" | ||
34 | #include "jfs_dmap.h" | ||
35 | #include "jfs_imap.h" | ||
36 | #include "jfs_metapage.h" | ||
37 | #include "jfs_debug.h" | ||
38 | |||
39 | /* | ||
40 | * NAME: jfs_umount(vfsp, flags, crp) | ||
41 | * | ||
42 | * FUNCTION: vfs_umount() | ||
43 | * | ||
44 | * PARAMETERS: vfsp - virtual file system pointer | ||
45 | * flags - unmount for shutdown | ||
46 | * crp - credential | ||
47 | * | ||
48 | * RETURN : EBUSY - device has open files | ||
49 | */ | ||
50 | int jfs_umount(struct super_block *sb) | ||
51 | { | ||
52 | struct address_space *bdev_mapping = sb->s_bdev->bd_inode->i_mapping; | ||
53 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
54 | struct inode *ipbmap = sbi->ipbmap; | ||
55 | struct inode *ipimap = sbi->ipimap; | ||
56 | struct inode *ipaimap = sbi->ipaimap; | ||
57 | struct inode *ipaimap2 = sbi->ipaimap2; | ||
58 | struct jfs_log *log; | ||
59 | int rc = 0; | ||
60 | |||
61 | jfs_info("UnMount JFS: sb:0x%p", sb); | ||
62 | |||
63 | /* | ||
64 | * update superblock and close log | ||
65 | * | ||
66 | * if mounted read-write and log based recovery was enabled | ||
67 | */ | ||
68 | if ((log = sbi->log)) | ||
69 | /* | ||
70 | * Wait for outstanding transactions to be written to log: | ||
71 | */ | ||
72 | jfs_flush_journal(log, 2); | ||
73 | |||
74 | /* | ||
75 | * close fileset inode allocation map (aka fileset inode) | ||
76 | */ | ||
77 | diUnmount(ipimap, 0); | ||
78 | |||
79 | diFreeSpecial(ipimap); | ||
80 | sbi->ipimap = NULL; | ||
81 | |||
82 | /* | ||
83 | * close secondary aggregate inode allocation map | ||
84 | */ | ||
85 | ipaimap2 = sbi->ipaimap2; | ||
86 | if (ipaimap2) { | ||
87 | diUnmount(ipaimap2, 0); | ||
88 | diFreeSpecial(ipaimap2); | ||
89 | sbi->ipaimap2 = NULL; | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * close aggregate inode allocation map | ||
94 | */ | ||
95 | ipaimap = sbi->ipaimap; | ||
96 | diUnmount(ipaimap, 0); | ||
97 | diFreeSpecial(ipaimap); | ||
98 | sbi->ipaimap = NULL; | ||
99 | |||
100 | /* | ||
101 | * close aggregate block allocation map | ||
102 | */ | ||
103 | dbUnmount(ipbmap, 0); | ||
104 | |||
105 | diFreeSpecial(ipbmap); | ||
106 | sbi->ipimap = NULL; | ||
107 | |||
108 | /* | ||
109 | * Make sure all metadata makes it to disk before we mark | ||
110 | * the superblock as clean | ||
111 | */ | ||
112 | filemap_fdatawrite(bdev_mapping); | ||
113 | filemap_fdatawait(bdev_mapping); | ||
114 | |||
115 | /* | ||
116 | * ensure all file system file pages are propagated to their | ||
117 | * home blocks on disk (and their in-memory buffer pages are | ||
118 | * invalidated) BEFORE updating file system superblock state | ||
119 | * (to signify file system is unmounted cleanly, and thus in | ||
120 | * consistent state) and log superblock active file system | ||
121 | * list (to signify skip logredo()). | ||
122 | */ | ||
123 | if (log) { /* log = NULL if read-only mount */ | ||
124 | updateSuper(sb, FM_CLEAN); | ||
125 | |||
126 | /* Restore default gfp_mask for bdev */ | ||
127 | mapping_set_gfp_mask(bdev_mapping, GFP_USER); | ||
128 | |||
129 | /* | ||
130 | * close log: | ||
131 | * | ||
132 | * remove file system from log active file system list. | ||
133 | */ | ||
134 | rc = lmLogClose(sb); | ||
135 | } | ||
136 | jfs_info("UnMount JFS Complete: rc = %d", rc); | ||
137 | return rc; | ||
138 | } | ||
139 | |||
140 | |||
141 | int jfs_umount_rw(struct super_block *sb) | ||
142 | { | ||
143 | struct address_space *bdev_mapping = sb->s_bdev->bd_inode->i_mapping; | ||
144 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
145 | struct jfs_log *log = sbi->log; | ||
146 | |||
147 | if (!log) | ||
148 | return 0; | ||
149 | |||
150 | /* | ||
151 | * close log: | ||
152 | * | ||
153 | * remove file system from log active file system list. | ||
154 | */ | ||
155 | jfs_flush_journal(log, 2); | ||
156 | |||
157 | /* | ||
158 | * Make sure all metadata makes it to disk | ||
159 | */ | ||
160 | dbSync(sbi->ipbmap); | ||
161 | diSync(sbi->ipimap); | ||
162 | |||
163 | /* | ||
164 | * Note that we have to do this even if sync_blockdev() will | ||
165 | * do exactly the same a few instructions later: We can't | ||
166 | * mark the superblock clean before everything is flushed to | ||
167 | * disk. | ||
168 | */ | ||
169 | filemap_fdatawrite(bdev_mapping); | ||
170 | filemap_fdatawait(bdev_mapping); | ||
171 | |||
172 | updateSuper(sb, FM_CLEAN); | ||
173 | |||
174 | /* Restore default gfp_mask for bdev */ | ||
175 | mapping_set_gfp_mask(bdev_mapping, GFP_USER); | ||
176 | |||
177 | return lmLogClose(sb); | ||
178 | } | ||
diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c new file mode 100644 index 000000000000..b32208aad550 --- /dev/null +++ b/fs/jfs/jfs_unicode.c | |||
@@ -0,0 +1,137 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include "jfs_incore.h" | ||
22 | #include "jfs_filsys.h" | ||
23 | #include "jfs_unicode.h" | ||
24 | #include "jfs_debug.h" | ||
25 | |||
26 | /* | ||
27 | * NAME: jfs_strfromUCS() | ||
28 | * | ||
29 | * FUNCTION: Convert little-endian unicode string to character string | ||
30 | * | ||
31 | */ | ||
32 | int jfs_strfromUCS_le(char *to, const __le16 * from, | ||
33 | int len, struct nls_table *codepage) | ||
34 | { | ||
35 | int i; | ||
36 | int outlen = 0; | ||
37 | static int warn_again = 5; /* Only warn up to 5 times total */ | ||
38 | int warn = !!warn_again; /* once per string */ | ||
39 | |||
40 | if (codepage) { | ||
41 | for (i = 0; (i < len) && from[i]; i++) { | ||
42 | int charlen; | ||
43 | charlen = | ||
44 | codepage->uni2char(le16_to_cpu(from[i]), | ||
45 | &to[outlen], | ||
46 | NLS_MAX_CHARSET_SIZE); | ||
47 | if (charlen > 0) | ||
48 | outlen += charlen; | ||
49 | else | ||
50 | to[outlen++] = '?'; | ||
51 | } | ||
52 | } else { | ||
53 | for (i = 0; (i < len) && from[i]; i++) { | ||
54 | if (le16_to_cpu(from[i]) & 0xff00) { | ||
55 | if (warn) { | ||
56 | warn--; | ||
57 | warn_again--; | ||
58 | printk(KERN_ERR | ||
59 | "non-latin1 character 0x%x found in JFS file name\n", | ||
60 | le16_to_cpu(from[i])); | ||
61 | printk(KERN_ERR | ||
62 | "mount with iocharset=utf8 to access\n"); | ||
63 | } | ||
64 | to[i] = '?'; | ||
65 | } | ||
66 | else | ||
67 | to[i] = (char) (le16_to_cpu(from[i])); | ||
68 | } | ||
69 | outlen = i; | ||
70 | } | ||
71 | to[outlen] = 0; | ||
72 | return outlen; | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * NAME: jfs_strtoUCS() | ||
77 | * | ||
78 | * FUNCTION: Convert character string to unicode string | ||
79 | * | ||
80 | */ | ||
81 | static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len, | ||
82 | struct nls_table *codepage) | ||
83 | { | ||
84 | int charlen; | ||
85 | int i; | ||
86 | |||
87 | if (codepage) { | ||
88 | for (i = 0; len && *from; i++, from += charlen, len -= charlen) | ||
89 | { | ||
90 | charlen = codepage->char2uni(from, len, &to[i]); | ||
91 | if (charlen < 1) { | ||
92 | jfs_err("jfs_strtoUCS: char2uni returned %d.", | ||
93 | charlen); | ||
94 | jfs_err("charset = %s, char = 0x%x", | ||
95 | codepage->charset, *from); | ||
96 | return charlen; | ||
97 | } | ||
98 | } | ||
99 | } else { | ||
100 | for (i = 0; (i < len) && from[i]; i++) | ||
101 | to[i] = (wchar_t) from[i]; | ||
102 | } | ||
103 | |||
104 | to[i] = 0; | ||
105 | return i; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * NAME: get_UCSname() | ||
110 | * | ||
111 | * FUNCTION: Allocate and translate to unicode string | ||
112 | * | ||
113 | */ | ||
114 | int get_UCSname(struct component_name * uniName, struct dentry *dentry) | ||
115 | { | ||
116 | struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab; | ||
117 | int length = dentry->d_name.len; | ||
118 | |||
119 | if (length > JFS_NAME_MAX) | ||
120 | return -ENAMETOOLONG; | ||
121 | |||
122 | uniName->name = | ||
123 | kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS); | ||
124 | |||
125 | if (uniName->name == NULL) | ||
126 | return -ENOSPC; | ||
127 | |||
128 | uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name, | ||
129 | length, nls_tab); | ||
130 | |||
131 | if (uniName->namlen < 0) { | ||
132 | kfree(uniName->name); | ||
133 | return uniName->namlen; | ||
134 | } | ||
135 | |||
136 | return 0; | ||
137 | } | ||
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h new file mode 100644 index 000000000000..69e25ebe87ac --- /dev/null +++ b/fs/jfs/jfs_unicode.h | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * Portions Copyright (c) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | #ifndef _H_JFS_UNICODE | ||
20 | #define _H_JFS_UNICODE | ||
21 | |||
22 | #include <asm/byteorder.h> | ||
23 | #include "jfs_types.h" | ||
24 | |||
25 | typedef struct { | ||
26 | wchar_t start; | ||
27 | wchar_t end; | ||
28 | signed char *table; | ||
29 | } UNICASERANGE; | ||
30 | |||
31 | extern signed char UniUpperTable[512]; | ||
32 | extern UNICASERANGE UniUpperRange[]; | ||
33 | extern int get_UCSname(struct component_name *, struct dentry *); | ||
34 | extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *); | ||
35 | |||
36 | #define free_UCSname(COMP) kfree((COMP)->name) | ||
37 | |||
/*
 * UniStrcpy: Copy a zero-terminated string, terminator included,
 * from ucs2 to ucs1.  Returns ucs1.
 */
static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2)
{
	wchar_t *dst = ucs1;

	/* the terminator is copied before the loop condition sees it */
	do {
		*dst = *ucs2++;
	} while (*dst++);

	return ucs1;
}
48 | |||
49 | |||
50 | |||
51 | /* | ||
52 | * UniStrncpy: Copy length limited string with pad | ||
53 | */ | ||
54 | static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2, | ||
55 | size_t n) | ||
56 | { | ||
57 | __le16 *anchor = ucs1; | ||
58 | |||
59 | while (n-- && *ucs2) /* Copy the strings */ | ||
60 | *ucs1++ = *ucs2++; | ||
61 | |||
62 | n++; | ||
63 | while (n--) /* Pad with nulls */ | ||
64 | *ucs1++ = 0; | ||
65 | return anchor; | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * UniStrncmp_le: Compare length limited string - native to little-endian | ||
70 | */ | ||
71 | static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2, | ||
72 | size_t n) | ||
73 | { | ||
74 | if (!n) | ||
75 | return 0; /* Null strings are equal */ | ||
76 | while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { | ||
77 | ucs1++; | ||
78 | ucs2++; | ||
79 | } | ||
80 | return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * UniStrncpy_to_le: Copy length limited string with pad to little-endian | ||
85 | */ | ||
86 | static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2, | ||
87 | size_t n) | ||
88 | { | ||
89 | __le16 *anchor = ucs1; | ||
90 | |||
91 | while (n-- && *ucs2) /* Copy the strings */ | ||
92 | *ucs1++ = cpu_to_le16(*ucs2++); | ||
93 | |||
94 | n++; | ||
95 | while (n--) /* Pad with nulls */ | ||
96 | *ucs1++ = 0; | ||
97 | return anchor; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * UniStrncpy_from_le: Copy length limited string with pad from little-endian | ||
102 | */ | ||
103 | static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2, | ||
104 | size_t n) | ||
105 | { | ||
106 | wchar_t *anchor = ucs1; | ||
107 | |||
108 | while (n-- && *ucs2) /* Copy the strings */ | ||
109 | *ucs1++ = __le16_to_cpu(*ucs2++); | ||
110 | |||
111 | n++; | ||
112 | while (n--) /* Pad with nulls */ | ||
113 | *ucs1++ = 0; | ||
114 | return anchor; | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * UniToupper: Convert a unicode character to upper case | ||
119 | */ | ||
120 | static inline wchar_t UniToupper(wchar_t uc) | ||
121 | { | ||
122 | UNICASERANGE *rp; | ||
123 | |||
124 | if (uc < sizeof(UniUpperTable)) { /* Latin characters */ | ||
125 | return uc + UniUpperTable[uc]; /* Use base tables */ | ||
126 | } else { | ||
127 | rp = UniUpperRange; /* Use range tables */ | ||
128 | while (rp->start) { | ||
129 | if (uc < rp->start) /* Before start of range */ | ||
130 | return uc; /* Uppercase = input */ | ||
131 | if (uc <= rp->end) /* In range */ | ||
132 | return uc + rp->table[uc - rp->start]; | ||
133 | rp++; /* Try next range */ | ||
134 | } | ||
135 | } | ||
136 | return uc; /* Past last range */ | ||
137 | } | ||
138 | |||
139 | |||
/*
 * UniStrupr: Upper case a unicode string
 *
 * Every character up to the terminator is replaced in place by its
 * UniToupper() value.  Returns upin.
 */
static inline wchar_t *UniStrupr(wchar_t * upin)
{
	wchar_t *cursor;

	for (cursor = upin; *cursor; cursor++)
		*cursor = UniToupper(*cursor);

	return upin;		/* Return input pointer */
}
154 | |||
155 | #endif /* !_H_JFS_UNICODE */ | ||
diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c new file mode 100644 index 000000000000..4ab185d26308 --- /dev/null +++ b/fs/jfs/jfs_uniupr.c | |||
@@ -0,0 +1,134 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include "jfs_unicode.h" | ||
21 | |||
22 | /* | ||
23 | * Latin upper case: one signed offset per code point 0x000-0x1ff; UniToupper() adds the offset to the character, so 0 means "no upper case form" | ||
24 | */ | ||
25 | signed char UniUpperTable[512] = { | ||
26 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ | ||
27 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ | ||
28 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ | ||
29 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ | ||
30 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ | ||
31 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ | ||
32 | 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ | ||
33 | -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ | ||
34 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ | ||
35 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ | ||
36 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ | ||
37 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ | ||
38 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ | ||
39 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ | ||
40 | -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ | ||
41 | -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff (0ff + 121 = 178) */ | ||
42 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ | ||
43 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ | ||
44 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ | ||
45 | 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ | ||
46 | -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ | ||
47 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ | ||
48 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ | ||
49 | 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ | ||
50 | 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ | ||
51 | 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ | ||
52 | 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ | ||
53 | -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ | ||
54 | 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ | ||
55 | -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df (1dd - 79 = 18e) */ | ||
56 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ | ||
57 | 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ | ||
58 | }; | ||
59 | |||
60 | /* Upper case range - Greek: offsets for 0x03a0-0x03ce, indexed by (code point - 0x03a0) */ | ||
61 | static signed char UniCaseRangeU03a0[47] = { | ||
62 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ | ||
63 | 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ | ||
64 | -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, /* 3c0-3ce */ | ||
65 | }; | ||
66 | |||
67 | /* Upper case range - Cyrillic: offsets for 0x0430-0x045f, indexed by (code point - 0x0430) */ | ||
68 | static signed char UniCaseRangeU0430[48] = { | ||
69 | -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ | ||
70 | -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ | ||
71 | 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ | ||
72 | }; | ||
73 | |||
74 | /* Upper case range - Extended cyrillic: offsets for 0x0490-0x04cc, indexed by (code point - 0x0490) */ | ||
75 | static signed char UniCaseRangeU0490[61] = { | ||
76 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ | ||
77 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ | ||
78 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ | ||
79 | 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, /* 4c0-4cc */ | ||
80 | }; | ||
81 | |||
82 | /* Upper case range - Extended latin and greek: offsets for 0x1e00-0x1ffc, indexed by (code point - 0x1e00) */ | ||
83 | static signed char UniCaseRangeU1e00[509] = { | ||
84 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ | ||
85 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ | ||
86 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ | ||
87 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ | ||
88 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ | ||
89 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ | ||
90 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ | ||
91 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ | ||
92 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ | ||
93 | 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ | ||
94 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ | ||
95 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ | ||
96 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ | ||
97 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ | ||
98 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ | ||
99 | 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ | ||
100 | 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ | ||
101 | 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ | ||
102 | 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ | ||
103 | 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ | ||
104 | 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ | ||
105 | 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ | ||
106 | 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ | ||
107 | 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ | ||
108 | 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ | ||
109 | 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ | ||
110 | 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ | ||
111 | 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ | ||
112 | 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ | ||
113 | 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ | ||
114 | 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ | ||
115 | 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1ff0-1ffc */ | ||
116 | }; | ||
117 | |||
118 | /* Upper case range - Wide latin: offsets for 0xff40-0xff5a, indexed by (code point - 0xff40) */ | ||
119 | static signed char UniCaseRangeUff40[27] = { | ||
120 | 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ | ||
121 | -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff50-ff5a */ | ||
122 | }; | ||
123 | |||
124 | /* | ||
125 | * Upper Case Range: each entry gives the first and last code point of a range and its offset table; entries are in ascending order and the list is closed by a zero entry | ||
126 | */ | ||
127 | UNICASERANGE UniUpperRange[] = { | ||
128 | { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, | ||
129 | { 0x0430, 0x045f, UniCaseRangeU0430 }, | ||
130 | { 0x0490, 0x04cc, UniCaseRangeU0490 }, | ||
131 | { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, | ||
132 | { 0xff40, 0xff5a, UniCaseRangeUff40 }, | ||
133 | { 0 } /* terminator: a zero start ends the lookup loop */ | ||
134 | }; | ||
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h new file mode 100644 index 000000000000..a1052f3f0bee --- /dev/null +++ b/fs/jfs/jfs_xattr.h | |||
@@ -0,0 +1,64 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #ifndef H_JFS_XATTR | ||
20 | #define H_JFS_XATTR | ||
21 | |||
22 | /* | ||
23 | * jfs_ea and jfs_ea_list describe the on-disk format of the extended attributes. | ||
24 | * I know the null-terminator is redundant since namelen is stored, but | ||
25 | * I am maintaining compatibility with OS/2 where possible. | ||
26 | */ | ||
27 | struct jfs_ea { | ||
28 | u8 flag; /* Unused? */ | ||
29 | u8 namelen; /* Length of name; EA_SIZE adds 1 on top of it for the null-terminator */ | ||
30 | __le16 valuelen; /* Length of value (stored little-endian) */ | ||
31 | char name[0]; /* Attribute name (includes null-terminator) */ | ||
32 | }; /* Value immediately follows name */ | ||
33 | |||
34 | struct jfs_ea_list { | ||
35 | __le32 size; /* overall size in bytes, counted from the start of this structure (see END_EALIST) */ | ||
36 | struct jfs_ea ea[0]; /* Variable length list; walked with FIRST_EA/NEXT_EA */ | ||
37 | }; | ||
38 | |||
39 | /* Macros for defining maximum number of bytes supported for EAs */ | ||
40 | #define MAXEASIZE 65535 | ||
41 | #define MAXEALISTSIZE MAXEASIZE | ||
42 | |||
43 | /* | ||
44 | * Macros for walking a variable length EA list: EA_SIZE is the byte size of one entry (header + name + null-terminator + value), NEXT_EA steps past an entry, and END_EALIST is the address EALIST_SIZE bytes past the start of the list. | ||
45 | */ | ||
46 | #define EA_SIZE(ea) \ | ||
47 | (sizeof (struct jfs_ea) + (ea)->namelen + 1 + \ | ||
48 | le16_to_cpu((ea)->valuelen)) | ||
49 | #define NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea)))) | ||
50 | #define FIRST_EA(ealist) ((ealist)->ea) | ||
51 | #define EALIST_SIZE(ealist) le32_to_cpu((ealist)->size) | ||
52 | #define END_EALIST(ealist) \ | ||
53 | ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) | ||
54 | |||
55 | extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t, | ||
56 | int); | ||
57 | extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, | ||
58 | int); | ||
59 | extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); | ||
60 | extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); | ||
61 | extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); | ||
62 | extern int jfs_removexattr(struct dentry *, const char *); | ||
63 | |||
64 | #endif /* H_JFS_XATTR */ | ||
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c new file mode 100644 index 000000000000..11c58c54b818 --- /dev/null +++ b/fs/jfs/jfs_xtree.c | |||
@@ -0,0 +1,4485 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | /* | ||
19 | * jfs_xtree.c: extent allocation descriptor B+-tree manager | ||
20 | */ | ||
21 | |||
22 | #include <linux/fs.h> | ||
23 | #include <linux/quotaops.h> | ||
24 | #include "jfs_incore.h" | ||
25 | #include "jfs_filsys.h" | ||
26 | #include "jfs_metapage.h" | ||
27 | #include "jfs_dmap.h" | ||
28 | #include "jfs_dinode.h" | ||
29 | #include "jfs_superblock.h" | ||
30 | #include "jfs_debug.h" | ||
31 | |||
32 | /* | ||
33 | * xtree local flag: when passed to xtSearch() the number of pages to split is computed into btstack->nsplit | ||
34 | */ | ||
35 | #define XT_INSERT 0x00000001 | ||
36 | |||
37 | /* | ||
38 | * xtree key/entry comparison: compare key K with the extent range of xad X (OFFSET64 is a scratch variable that receives offsetXAD(X)) | ||
39 | * | ||
40 | * return: | ||
41 | * -1: k < start of extent | ||
42 | * 0: start_of_extent <= k <= end_of_extent | ||
43 | * 1: k > end_of_extent (i.e. k >= start_of_extent + length) | ||
44 | */ | ||
45 | #define XT_CMP(CMP, K, X, OFFSET64)\ | ||
46 | {\ | ||
47 | OFFSET64 = offsetXAD(X);\ | ||
48 | (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ | ||
49 | ((K) < OFFSET64) ? -1 : 0;\ | ||
50 | } | ||
51 | |||
52 | /* write an xad entry: set its flag, offset, length and address */ | ||
53 | #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ | ||
54 | {\ | ||
55 | (XAD)->flag = (FLAG);\ | ||
56 | XADoffset((XAD), (OFF));\ | ||
57 | XADlength((XAD), (LEN));\ | ||
58 | XADaddress((XAD), (ADDR));\ | ||
59 | } | ||
60 | |||
61 | #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) /* xtree instantiation of BT_PAGE */ | ||
62 | |||
63 | /* get page buffer for specified block address and sanity-check its header: nextindex must lie in [XTENTRYSTART, maxentry] and maxentry must not exceed the slot count of a root (BN == 0) or regular page; on a corrupt header the page is released, MP is set to NULL and RC to -EIO */ | ||
64 | /* ToDo: Replace this ugly macro with a function */ | ||
65 | #define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ | ||
66 | {\ | ||
67 | BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ | ||
68 | if (!(RC))\ | ||
69 | {\ | ||
70 | if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ | ||
71 | (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ | ||
72 | (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ | ||
73 | {\ | ||
74 | jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ | ||
75 | BT_PUTPAGE(MP);\ | ||
76 | MP = NULL;\ | ||
77 | RC = -EIO;\ | ||
78 | }\ | ||
79 | }\ | ||
80 | } | ||
81 | |||
82 | /* for consistency: xtree-named alias of BT_PUTPAGE */ | ||
83 | #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) | ||
84 | |||
85 | #define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ | ||
86 | BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) /* xtree instantiation of BT_GETSEARCH (page type xtpage_t, root field i_xtroot) */ | ||
87 | /* xtree entry parameter descriptor, handed to xtSplitUp()/xtSplitPage()/xtSplitRoot() */ | ||
88 | struct xtsplit { | ||
89 | struct metapage *mp; | ||
90 | s16 index; | ||
91 | u8 flag; /* NOTE(review): flag/off/addr/len look like the fields of the xad being inserted - confirm against the callers */ | ||
92 | s64 off; | ||
93 | s64 addr; | ||
94 | int len; | ||
95 | struct pxdlist *pxdlist; | ||
96 | }; | ||
97 | |||
98 | |||
99 | /* | ||
100 | * statistics (only built with CONFIG_JFS_STATISTICS) | ||
101 | */ | ||
102 | #ifdef CONFIG_JFS_STATISTICS | ||
103 | static struct { | ||
104 | uint search; /* counted via INCREMENT() at the top of xtSearch() */ | ||
105 | uint fastSearch; /* counted when xtSearch() is resolved by the sequential access heuristic */ | ||
106 | uint split; | ||
107 | } xtStat; | ||
108 | #endif | ||
109 | |||
110 | |||
111 | /* | ||
112 | * forward references | ||
113 | */ | ||
114 | static int xtSearch(struct inode *ip, | ||
115 | s64 xoff, int *cmpp, struct btstack * btstack, int flag); | ||
116 | |||
117 | static int xtSplitUp(tid_t tid, | ||
118 | struct inode *ip, | ||
119 | struct xtsplit * split, struct btstack * btstack); | ||
120 | |||
121 | static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, | ||
122 | struct metapage ** rmpp, s64 * rbnp); | ||
123 | |||
124 | static int xtSplitRoot(tid_t tid, struct inode *ip, | ||
125 | struct xtsplit * split, struct metapage ** rmpp); | ||
126 | |||
127 | #ifdef _STILL_TO_PORT | ||
128 | static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, | ||
129 | xtpage_t * fp, struct btstack * btstack); | ||
130 | |||
131 | static int xtSearchNode(struct inode *ip, | ||
132 | xad_t * xad, | ||
133 | int *cmpp, struct btstack * btstack, int flag); | ||
134 | |||
135 | static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); | ||
136 | #endif /* _STILL_TO_PORT */ | ||
137 | |||
138 | /* External references */ | ||
139 | |||
140 | /* | ||
141 | * debug control | ||
142 | */ | ||
143 | /* #define _JFS_DEBUG_XTREE 1 */ | ||
144 | |||
145 | |||
146 | /* | ||
147 | * xtLookup() | ||
148 | * | ||
149 | * function: map a single page into a physical extent; on a hit *pflag, *paddr and *plen describe the mapping, otherwise (lstart beyond EOF or not covered by any xad) 0 is returned with *plen left at 0; | ||
150 | */ | ||
151 | int xtLookup(struct inode *ip, s64 lstart, | ||
152 | s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) | ||
153 | { | ||
154 | int rc = 0; | ||
155 | struct btstack btstack; | ||
156 | int cmp; | ||
157 | s64 bn; | ||
158 | struct metapage *mp; | ||
159 | xtpage_t *p; | ||
160 | int index; | ||
161 | xad_t *xad; | ||
162 | s64 size, xoff, xend; | ||
163 | int xlen; | ||
164 | s64 xaddr; | ||
165 | |||
166 | *plen = 0; /* nothing mapped until a covering xad is found */ | ||
167 | |||
168 | if (!no_check) { | ||
169 | /* is lookup offset beyond eof ? (size = i_size rounded up to whole blocks) */ | ||
170 | size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> | ||
171 | JFS_SBI(ip->i_sb)->l2bsize; | ||
172 | if (lstart >= size) { | ||
173 | jfs_err("xtLookup: lstart (0x%lx) >= size (0x%lx)", | ||
174 | (ulong) lstart, (ulong) size); | ||
175 | return 0; | ||
176 | } | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * search for the xad entry covering the logical extent | ||
181 | */ | ||
182 | //search: | ||
183 | if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) { | ||
184 | jfs_err("xtLookup: xtSearch returned %d", rc); | ||
185 | return rc; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * compute the physical extent covering logical extent | ||
190 | * | ||
191 | * N.B. search may have failed (e.g., hole in sparse file), | ||
192 | * and returned the index of the next entry. | ||
193 | */ | ||
194 | /* retrieve search result */ | ||
195 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
196 | |||
197 | /* is xad found covering start of logical extent ? | ||
198 | * lstart is a page start address, | ||
199 | * i.e., lstart cannot start in a hole; | ||
200 | */ | ||
201 | if (cmp) /* no xad covers lstart: release the page and return with *plen == 0 */ | ||
202 | goto out; | ||
203 | |||
204 | /* | ||
205 | * lxd covered by xad | ||
206 | */ | ||
207 | xad = &p->xad[index]; | ||
208 | xoff = offsetXAD(xad); | ||
209 | xlen = lengthXAD(xad); | ||
210 | xend = xoff + xlen; | ||
211 | xaddr = addressXAD(xad); | ||
212 | |||
213 | /* initialize new pxd */ | ||
214 | *pflag = xad->flag; | ||
215 | *paddr = xaddr + (lstart - xoff); /* same distance into the physical extent as lstart is into the xad */ | ||
216 | /* a page must be fully covered by an xad */ | ||
217 | *plen = min(xend - lstart, llen); /* clip to the end of the xad or the requested length */ | ||
218 | |||
219 | out: | ||
220 | XT_PUTPAGE(mp); | ||
221 | |||
222 | return rc; | ||
223 | } | ||
224 | |||
225 | |||
226 | /* | ||
227 | * xtLookupList() | ||
228 | * | ||
229 | * function: map the logical extents of lxdlist into a list of physical extents; | ||
230 | * | ||
231 | * parameter: | ||
232 | * struct inode *ip, | ||
233 | * struct lxdlist *lxdlist, lxd list (in) | ||
234 | * struct xadlist *xadlist, xad list (in/out) | ||
235 | * int flag) | ||
236 | * | ||
237 | * coverage of lxd by xad under assumption of | ||
238 | * . lxd's are ordered and disjoint. | ||
239 | * . xad's are ordered and disjoint. | ||
240 | * | ||
241 | * return: | ||
242 | * 0: success (xadlist->nxad = number of xads filled in); otherwise the error from xtSearch() or XT_GETPAGE(), in which case xadlist->nxad stays 0 | ||
243 | * | ||
244 | * note: a page being written (even a single byte) is backed fully, | ||
245 | * except the last page which is only backed with blocks | ||
246 | * required to cover the last byte; | ||
247 | * the extent backing a page is fully contained within an xad; | ||
248 | */ | ||
249 | int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, | ||
250 | struct xadlist * xadlist, int flag) | ||
251 | { | ||
252 | int rc = 0; | ||
253 | struct btstack btstack; | ||
254 | int cmp; | ||
255 | s64 bn; | ||
256 | struct metapage *mp; | ||
257 | xtpage_t *p; | ||
258 | int index; | ||
259 | lxd_t *lxd; | ||
260 | xad_t *xad, *pxd; | ||
261 | s64 size, lstart, lend, xstart, xend, pstart; | ||
262 | s64 llen, xlen, plen; | ||
263 | s64 xaddr, paddr; | ||
264 | int nlxd, npxd, maxnpxd; | ||
265 | |||
266 | npxd = xadlist->nxad = 0; /* output count starts at 0; it is only updated again at mapend */ | ||
267 | maxnpxd = xadlist->maxnxad; | ||
268 | pxd = xadlist->xad; | ||
269 | |||
270 | nlxd = lxdlist->nlxd; | ||
271 | lxd = lxdlist->lxd; | ||
272 | |||
273 | lstart = offsetLXD(lxd); | ||
274 | llen = lengthLXD(lxd); | ||
275 | lend = lstart + llen; | ||
276 | |||
277 | size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> | ||
278 | JFS_SBI(ip->i_sb)->l2bsize; | ||
279 | |||
280 | /* | ||
281 | * search for the xad entry covering the logical extent | ||
282 | */ | ||
283 | search: | ||
284 | if (lstart >= size) | ||
285 | return 0; | ||
286 | |||
287 | if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) | ||
288 | return rc; | ||
289 | |||
290 | /* | ||
291 | * compute the physical extent covering logical extent | ||
292 | * | ||
293 | * N.B. search may have failed (e.g., hole in sparse file), | ||
294 | * and returned the index of the next entry. | ||
295 | */ | ||
296 | //map: | ||
297 | /* retrieve search result */ | ||
298 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
299 | |||
300 | /* is xad on the next sibling page ? */ | ||
301 | if (index == le16_to_cpu(p->header.nextindex)) { | ||
302 | if (p->header.flag & BT_ROOT) | ||
303 | goto mapend; | ||
304 | |||
305 | if ((bn = le64_to_cpu(p->header.next)) == 0) | ||
306 | goto mapend; | ||
307 | |||
308 | XT_PUTPAGE(mp); | ||
309 | |||
310 | /* get next sibling page */ | ||
311 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
312 | if (rc) | ||
313 | return rc; | ||
314 | |||
315 | index = XTENTRYSTART; | ||
316 | } | ||
317 | |||
318 | xad = &p->xad[index]; | ||
319 | |||
320 | /* | ||
321 | * is lxd covered by xad ? | ||
322 | */ | ||
323 | compare: | ||
324 | xstart = offsetXAD(xad); | ||
325 | xlen = lengthXAD(xad); | ||
326 | xend = xstart + xlen; | ||
327 | xaddr = addressXAD(xad); | ||
328 | |||
329 | compare1: | ||
330 | if (xstart < lstart) | ||
331 | goto compare2; | ||
332 | |||
333 | /* (lstart <= xstart) */ | ||
334 | |||
335 | /* lxd is NOT covered by xad */ | ||
336 | if (lend <= xstart) { | ||
337 | /* | ||
338 | * get next lxd | ||
339 | */ | ||
340 | if (--nlxd == 0) | ||
341 | goto mapend; | ||
342 | lxd++; | ||
343 | |||
344 | lstart = offsetLXD(lxd); | ||
345 | llen = lengthLXD(lxd); | ||
346 | lend = lstart + llen; | ||
347 | if (lstart >= size) | ||
348 | goto mapend; | ||
349 | |||
350 | /* compare with the current xad */ | ||
351 | goto compare1; | ||
352 | } | ||
353 | /* lxd is covered by xad */ | ||
354 | else { /* (xstart < lend) */ | ||
355 | |||
356 | /* initialize new pxd */ | ||
357 | pstart = xstart; | ||
358 | plen = min(lend - xstart, xlen); | ||
359 | paddr = xaddr; | ||
360 | |||
361 | goto cover; | ||
362 | } | ||
363 | |||
364 | /* (xstart < lstart) */ | ||
365 | compare2: | ||
366 | /* lxd is covered by xad */ | ||
367 | if (lstart < xend) { | ||
368 | /* initialize new pxd */ | ||
369 | pstart = lstart; | ||
370 | plen = min(xend - lstart, llen); | ||
371 | paddr = xaddr + (lstart - xstart); | ||
372 | |||
373 | goto cover; | ||
374 | } | ||
375 | /* lxd is NOT covered by xad */ | ||
376 | else { /* (xend <= lstart) */ | ||
377 | |||
378 | /* | ||
379 | * get next xad | ||
380 | * | ||
381 | * linear search next xad covering lxd on | ||
382 | * the current xad page, and then tree search | ||
383 | */ | ||
384 | if (index == le16_to_cpu(p->header.nextindex) - 1) { | ||
385 | if (p->header.flag & BT_ROOT) | ||
386 | goto mapend; | ||
387 | |||
388 | XT_PUTPAGE(mp); | ||
389 | goto search; | ||
390 | } else { | ||
391 | index++; | ||
392 | xad++; | ||
393 | |||
394 | /* compare with new xad */ | ||
395 | goto compare; | ||
396 | } | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * lxd is covered by xad and a new pxd has been initialized | ||
401 | * (lstart <= xstart < lend) or (xstart < lstart < xend) | ||
402 | */ | ||
403 | cover: | ||
404 | /* finalize pxd corresponding to current xad */ | ||
405 | XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr); | ||
406 | |||
407 | if (++npxd >= maxnpxd) /* output list is full */ | ||
408 | goto mapend; | ||
409 | pxd++; | ||
410 | |||
411 | /* | ||
412 | * lxd is fully covered by xad | ||
413 | */ | ||
414 | if (lend <= xend) { | ||
415 | /* | ||
416 | * get next lxd | ||
417 | */ | ||
418 | if (--nlxd == 0) | ||
419 | goto mapend; | ||
420 | lxd++; | ||
421 | |||
422 | lstart = offsetLXD(lxd); | ||
423 | llen = lengthLXD(lxd); | ||
424 | lend = lstart + llen; | ||
425 | if (lstart >= size) | ||
426 | goto mapend; | ||
427 | |||
428 | /* | ||
429 | * test for old xad covering new lxd | ||
430 | * (old xstart < new lstart) | ||
431 | */ | ||
432 | goto compare2; | ||
433 | } | ||
434 | /* | ||
435 | * lxd is partially covered by xad | ||
436 | */ | ||
437 | else { /* (xend < lend) */ | ||
438 | |||
439 | /* | ||
440 | * get next xad | ||
441 | * | ||
442 | * linear search next xad covering lxd on | ||
443 | * the current xad page, and then next xad page search | ||
444 | */ | ||
445 | if (index == le16_to_cpu(p->header.nextindex) - 1) { | ||
446 | if (p->header.flag & BT_ROOT) | ||
447 | goto mapend; | ||
448 | |||
449 | if ((bn = le64_to_cpu(p->header.next)) == 0) | ||
450 | goto mapend; | ||
451 | |||
452 | XT_PUTPAGE(mp); | ||
453 | |||
454 | /* get next sibling page */ | ||
455 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
456 | if (rc) | ||
457 | return rc; | ||
458 | |||
459 | index = XTENTRYSTART; | ||
460 | xad = &p->xad[index]; | ||
461 | } else { | ||
462 | index++; | ||
463 | xad++; | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * test for new xad covering old lxd | ||
468 | * (old lstart < new xstart) | ||
469 | */ | ||
470 | goto compare; | ||
471 | } | ||
472 | |||
473 | mapend: | ||
474 | xadlist->nxad = npxd; /* publish how many xads were written to the output list */ | ||
475 | |||
476 | //out: | ||
477 | XT_PUTPAGE(mp); | ||
478 | |||
479 | return rc; | ||
480 | } | ||
481 | |||
482 | |||
483 | /* | ||
484 | * xtSearch() | ||
485 | * | ||
486 | * function: search for the xad entry covering specified offset. | ||
487 | * | ||
488 | * parameters: | ||
489 | * ip - file object; | ||
490 | * xoff - extent offset; | ||
491 | * cmpp - comparison result: | ||
492 | * btstack - traverse stack; | ||
493 | * flag - search process flag (XT_INSERT); | ||
494 | * | ||
495 | * returns: | ||
496 | * btstack contains (bn, index) of search path traversed to the entry. | ||
497 | * *cmpp is set to result of comparison with the entry returned. | ||
498 | * the page containing the entry is pinned at exit. | ||
499 | */ | ||
500 | static int xtSearch(struct inode *ip, s64 xoff, /* offset of extent */ | ||
501 | int *cmpp, struct btstack * btstack, int flag) | ||
502 | { | ||
503 | struct jfs_inode_info *jfs_ip = JFS_IP(ip); | ||
504 | int rc = 0; | ||
505 | int cmp = 1; /* init for empty page */ | ||
506 | s64 bn; /* block number */ | ||
507 | struct metapage *mp; /* page buffer */ | ||
508 | xtpage_t *p; /* page */ | ||
509 | xad_t *xad; | ||
510 | int base, index, lim, btindex; | ||
511 | struct btframe *btsp; | ||
512 | int nsplit = 0; /* number of pages to split */ | ||
513 | s64 t64; | ||
514 | |||
515 | INCREMENT(xtStat.search); | ||
516 | |||
517 | BT_CLR(btstack); | ||
518 | |||
519 | btstack->nsplit = 0; | ||
520 | |||
521 | /* | ||
522 | * search down tree from root: | ||
523 | * | ||
524 | * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of | ||
525 | * internal page, child page Pi contains entry with k, Ki <= K < Kj. | ||
526 | * | ||
527 | * if entry with search key K is not found | ||
528 | * internal page search find the entry with largest key Ki | ||
529 | * less than K which point to the child page to search; | ||
530 | * leaf page search find the entry with smallest key Kj | ||
531 | * greater than K so that the returned index is the position of | ||
532 | * the entry to be shifted right for insertion of new entry. | ||
533 | * for empty tree, search key is greater than any key of the tree. | ||
534 | * | ||
535 | * by convention, root bn = 0. | ||
536 | */ | ||
537 | for (bn = 0;;) { | ||
538 | /* get/pin the page to search */ | ||
539 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
540 | if (rc) | ||
541 | return rc; | ||
542 | |||
543 | /* try sequential access heuristics with the previous | ||
544 | * access entry in target leaf page: | ||
545 | * once search narrowed down into the target leaf, | ||
546 | * key must either match an entry in the leaf or | ||
547 | * key entry does not exist in the tree; | ||
548 | */ | ||
549 | //fastSearch: | ||
550 | if ((jfs_ip->btorder & BT_SEQUENTIAL) && | ||
551 | (p->header.flag & BT_LEAF) && | ||
552 | (index = jfs_ip->btindex) < | ||
553 | le16_to_cpu(p->header.nextindex)) { | ||
554 | xad = &p->xad[index]; | ||
555 | t64 = offsetXAD(xad); | ||
556 | if (xoff < t64 + lengthXAD(xad)) { | ||
557 | if (xoff >= t64) { | ||
558 | *cmpp = 0; | ||
559 | goto out; | ||
560 | } | ||
561 | |||
562 | /* stop sequential access heuristics */ | ||
563 | goto binarySearch; | ||
564 | } else { /* (t64 + lengthXAD(xad)) <= xoff */ | ||
565 | |||
566 | /* try next sequential entry */ | ||
567 | index++; | ||
568 | if (index < | ||
569 | le16_to_cpu(p->header.nextindex)) { | ||
570 | xad++; | ||
571 | t64 = offsetXAD(xad); | ||
572 | if (xoff < t64 + lengthXAD(xad)) { | ||
573 | if (xoff >= t64) { | ||
574 | *cmpp = 0; | ||
575 | goto out; | ||
576 | } | ||
577 | |||
578 | /* miss: key falls between | ||
579 | * previous and this entry | ||
580 | */ | ||
581 | *cmpp = 1; | ||
582 | goto out; | ||
583 | } | ||
584 | |||
585 | /* (xoff >= t64 + lengthXAD(xad)); | ||
586 | * matching entry may be further out: | ||
587 | * stop heuristic search | ||
588 | */ | ||
589 | /* stop sequential access heuristics */ | ||
590 | goto binarySearch; | ||
591 | } | ||
592 | |||
593 | /* (index == p->header.nextindex); | ||
594 | * miss: key entry does not exist in | ||
595 | * the target leaf/tree | ||
596 | */ | ||
597 | *cmpp = 1; | ||
598 | goto out; | ||
599 | } | ||
600 | |||
601 | /* | ||
602 | * if hit, return index of the entry found, and | ||
603 | * if miss, where new entry with search key is | ||
604 | * to be inserted; | ||
605 | */ | ||
606 | out: | ||
607 | /* compute number of pages to split */ | ||
608 | if (flag & XT_INSERT) { | ||
609 | if (p->header.nextindex == /* little-endian */ | ||
610 | p->header.maxentry) | ||
611 | nsplit++; | ||
612 | else | ||
613 | nsplit = 0; | ||
614 | btstack->nsplit = nsplit; | ||
615 | } | ||
616 | |||
617 | /* save search result */ | ||
618 | btsp = btstack->top; | ||
619 | btsp->bn = bn; | ||
620 | btsp->index = index; | ||
621 | btsp->mp = mp; | ||
622 | |||
623 | /* update sequential access heuristics */ | ||
624 | jfs_ip->btindex = index; | ||
625 | |||
626 | INCREMENT(xtStat.fastSearch); | ||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | /* well, ... full search now */ | ||
631 | binarySearch: | ||
632 | lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; | ||
633 | |||
634 | /* | ||
635 | * binary search with search key K on the current page | ||
636 | */ | ||
637 | for (base = XTENTRYSTART; lim; lim >>= 1) { | ||
638 | index = base + (lim >> 1); | ||
639 | |||
640 | XT_CMP(cmp, xoff, &p->xad[index], t64); | ||
641 | if (cmp == 0) { | ||
642 | /* | ||
643 | * search hit | ||
644 | */ | ||
645 | /* search hit - leaf page: | ||
646 | * return the entry found | ||
647 | */ | ||
648 | if (p->header.flag & BT_LEAF) { | ||
649 | *cmpp = cmp; | ||
650 | |||
651 | /* compute number of pages to split */ | ||
652 | if (flag & XT_INSERT) { | ||
653 | if (p->header.nextindex == | ||
654 | p->header.maxentry) | ||
655 | nsplit++; | ||
656 | else | ||
657 | nsplit = 0; | ||
658 | btstack->nsplit = nsplit; | ||
659 | } | ||
660 | |||
661 | /* save search result */ | ||
662 | btsp = btstack->top; | ||
663 | btsp->bn = bn; | ||
664 | btsp->index = index; | ||
665 | btsp->mp = mp; | ||
666 | |||
667 | /* init sequential access heuristics */ | ||
668 | btindex = jfs_ip->btindex; | ||
669 | if (index == btindex || | ||
670 | index == btindex + 1) | ||
671 | jfs_ip->btorder = BT_SEQUENTIAL; | ||
672 | else | ||
673 | jfs_ip->btorder = BT_RANDOM; | ||
674 | jfs_ip->btindex = index; | ||
675 | |||
676 | return 0; | ||
677 | } | ||
678 | |||
679 | /* search hit - internal page: | ||
680 | * descend/search its child page | ||
681 | */ | ||
682 | goto next; | ||
683 | } | ||
684 | |||
685 | if (cmp > 0) { | ||
686 | base = index + 1; | ||
687 | --lim; | ||
688 | } | ||
689 | } | ||
690 | |||
691 | /* | ||
692 | * search miss | ||
693 | * | ||
694 | * base is the smallest index with key (Kj) greater than | ||
695 | * search key (K) and may be zero or maxentry index. | ||
696 | */ | ||
697 | /* | ||
698 | * search miss - leaf page: | ||
699 | * | ||
700 | * return location of entry (base) where new entry with | ||
701 | * search key K is to be inserted. | ||
702 | */ | ||
703 | if (p->header.flag & BT_LEAF) { | ||
704 | *cmpp = cmp; | ||
705 | |||
706 | /* compute number of pages to split */ | ||
707 | if (flag & XT_INSERT) { | ||
708 | if (p->header.nextindex == | ||
709 | p->header.maxentry) | ||
710 | nsplit++; | ||
711 | else | ||
712 | nsplit = 0; | ||
713 | btstack->nsplit = nsplit; | ||
714 | } | ||
715 | |||
716 | /* save search result */ | ||
717 | btsp = btstack->top; | ||
718 | btsp->bn = bn; | ||
719 | btsp->index = base; | ||
720 | btsp->mp = mp; | ||
721 | |||
722 | /* init sequential access heuristics */ | ||
723 | btindex = jfs_ip->btindex; | ||
724 | if (base == btindex || base == btindex + 1) | ||
725 | jfs_ip->btorder = BT_SEQUENTIAL; | ||
726 | else | ||
727 | jfs_ip->btorder = BT_RANDOM; | ||
728 | jfs_ip->btindex = base; | ||
729 | |||
730 | return 0; | ||
731 | } | ||
732 | |||
733 | /* | ||
734 | * search miss - non-leaf page: | ||
735 | * | ||
736 | * if base is non-zero, decrement base by one to get the parent | ||
737 | * entry of the child page to search. | ||
738 | */ | ||
739 | index = base ? base - 1 : base; | ||
740 | |||
741 | /* | ||
742 | * go down to child page | ||
743 | */ | ||
744 | next: | ||
745 | /* update number of pages to split */ | ||
746 | if (p->header.nextindex == p->header.maxentry) | ||
747 | nsplit++; | ||
748 | else | ||
749 | nsplit = 0; | ||
750 | |||
751 | /* push (bn, index) of the parent page/entry */ | ||
752 | BT_PUSH(btstack, bn, index); | ||
753 | |||
754 | /* get the child page block number */ | ||
755 | bn = addressXAD(&p->xad[index]); | ||
756 | |||
757 | /* unpin the parent page */ | ||
758 | XT_PUTPAGE(mp); | ||
759 | } | ||
760 | } | ||
761 | |||
762 | /* | ||
763 | * xtInsert() | ||
764 | * | ||
765 | * function: | ||
766 | * | ||
767 | * parameter: | ||
768 | * tid - transaction id; | ||
769 | * ip - file object; | ||
770 | * xflag - extent flag (XAD_NOTRECORDED): | ||
771 | * xoff - extent offset; | ||
772 | * xlen - extent length; | ||
773 | * xaddrp - extent address pointer (in/out): | ||
774 | * if (*xaddrp) | ||
775 | * caller allocated data extent at *xaddrp; | ||
776 | * else | ||
777 | * allocate data extent and return its xaddr; | ||
778 | * flag - | ||
779 | * | ||
780 | * return: | ||
781 | */ | ||
782 | int xtInsert(tid_t tid, /* transaction id */ | ||
783 | struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, | ||
784 | int flag) | ||
785 | { | ||
786 | int rc = 0; | ||
787 | s64 xaddr, hint; | ||
788 | struct metapage *mp; /* meta-page buffer */ | ||
789 | xtpage_t *p; /* base B+-tree index page */ | ||
790 | s64 bn; | ||
791 | int index, nextindex; | ||
792 | struct btstack btstack; /* traverse stack */ | ||
793 | struct xtsplit split; /* split information */ | ||
794 | xad_t *xad; | ||
795 | int cmp; | ||
796 | struct tlock *tlck; | ||
797 | struct xtlock *xtlck; | ||
798 | |||
799 | jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); | ||
800 | |||
801 | /* | ||
802 | * search for the entry location at which to insert: | ||
803 | * | ||
804 | * xtFastSearch() and xtSearch() both returns (leaf page | ||
805 | * pinned, index at which to insert). | ||
806 | * n.b. xtSearch() may return index of maxentry of | ||
807 | * the full page. | ||
808 | */ | ||
809 | if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) | ||
810 | return rc; | ||
811 | |||
812 | /* retrieve search result */ | ||
813 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
814 | |||
815 | /* This test must follow XT_GETSEARCH since mp must be valid if | ||
816 | * we branch to out: */ | ||
817 | if (cmp == 0) { | ||
818 | rc = -EEXIST; | ||
819 | goto out; | ||
820 | } | ||
821 | |||
822 | /* | ||
823 | * allocate data extent requested | ||
824 | * | ||
825 | * allocation hint: last xad | ||
826 | */ | ||
827 | if ((xaddr = *xaddrp) == 0) { | ||
828 | if (index > XTENTRYSTART) { | ||
829 | xad = &p->xad[index - 1]; | ||
830 | hint = addressXAD(xad) + lengthXAD(xad) - 1; | ||
831 | } else | ||
832 | hint = 0; | ||
833 | if ((rc = DQUOT_ALLOC_BLOCK(ip, xlen))) | ||
834 | goto out; | ||
835 | if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { | ||
836 | DQUOT_FREE_BLOCK(ip, xlen); | ||
837 | goto out; | ||
838 | } | ||
839 | } | ||
840 | |||
841 | /* | ||
842 | * insert entry for new extent | ||
843 | */ | ||
844 | xflag |= XAD_NEW; | ||
845 | |||
846 | /* | ||
847 | * if the leaf page is full, split the page and | ||
848 | * propagate up the router entry for the new page from split | ||
849 | * | ||
850 | * The xtSplitUp() will insert the entry and unpin the leaf page. | ||
851 | */ | ||
852 | nextindex = le16_to_cpu(p->header.nextindex); | ||
853 | if (nextindex == le16_to_cpu(p->header.maxentry)) { | ||
854 | split.mp = mp; | ||
855 | split.index = index; | ||
856 | split.flag = xflag; | ||
857 | split.off = xoff; | ||
858 | split.len = xlen; | ||
859 | split.addr = xaddr; | ||
860 | split.pxdlist = NULL; | ||
861 | if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { | ||
862 | /* undo data extent allocation */ | ||
863 | if (*xaddrp == 0) { | ||
864 | dbFree(ip, xaddr, (s64) xlen); | ||
865 | DQUOT_FREE_BLOCK(ip, xlen); | ||
866 | } | ||
867 | return rc; | ||
868 | } | ||
869 | |||
870 | *xaddrp = xaddr; | ||
871 | return 0; | ||
872 | } | ||
873 | |||
874 | /* | ||
875 | * insert the new entry into the leaf page | ||
876 | */ | ||
877 | /* | ||
878 | * acquire a transaction lock on the leaf page; | ||
879 | * | ||
880 | * action: xad insertion/extension; | ||
881 | */ | ||
882 | BT_MARK_DIRTY(mp, ip); | ||
883 | |||
884 | /* if insert into middle, shift right remaining entries. */ | ||
885 | if (index < nextindex) | ||
886 | memmove(&p->xad[index + 1], &p->xad[index], | ||
887 | (nextindex - index) * sizeof(xad_t)); | ||
888 | |||
889 | /* insert the new entry: mark the entry NEW */ | ||
890 | xad = &p->xad[index]; | ||
891 | XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); | ||
892 | |||
893 | /* advance next available entry index */ | ||
894 | p->header.nextindex = | ||
895 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
896 | |||
897 | /* Don't log it if there are no links to the file */ | ||
898 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
899 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); | ||
900 | xtlck = (struct xtlock *) & tlck->lock; | ||
901 | xtlck->lwm.offset = | ||
902 | (xtlck->lwm.offset) ? min(index, | ||
903 | (int)xtlck->lwm.offset) : index; | ||
904 | xtlck->lwm.length = | ||
905 | le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; | ||
906 | } | ||
907 | |||
908 | *xaddrp = xaddr; | ||
909 | |||
910 | out: | ||
911 | /* unpin the leaf page */ | ||
912 | XT_PUTPAGE(mp); | ||
913 | |||
914 | return rc; | ||
915 | } | ||
916 | |||
917 | |||
918 | /* | ||
919 | * xtSplitUp() | ||
920 | * | ||
921 | * function: | ||
922 | * split full pages as propagating insertion up the tree | ||
923 | * | ||
924 | * parameter: | ||
925 | * tid - transaction id; | ||
926 | * ip - file object; | ||
927 | * split - entry parameter descriptor; | ||
928 | * btstack - traverse stack from xtSearch() | ||
929 | * | ||
930 | * return: | ||
931 | */ | ||
932 | static int | ||
933 | xtSplitUp(tid_t tid, | ||
934 | struct inode *ip, struct xtsplit * split, struct btstack * btstack) | ||
935 | { | ||
936 | int rc = 0; | ||
937 | struct metapage *smp; | ||
938 | xtpage_t *sp; /* split page */ | ||
939 | struct metapage *rmp; | ||
940 | s64 rbn; /* new right page block number */ | ||
941 | struct metapage *rcmp; | ||
942 | xtpage_t *rcp; /* right child page */ | ||
943 | s64 rcbn; /* right child page block number */ | ||
944 | int skip; /* index of entry of insertion */ | ||
945 | int nextindex; /* next available entry index of p */ | ||
946 | struct btframe *parent; /* parent page entry on traverse stack */ | ||
947 | xad_t *xad; | ||
948 | s64 xaddr; | ||
949 | int xlen; | ||
950 | int nsplit; /* number of pages split */ | ||
951 | struct pxdlist pxdlist; | ||
952 | pxd_t *pxd; | ||
953 | struct tlock *tlck; | ||
954 | struct xtlock *xtlck; | ||
955 | |||
956 | smp = split->mp; | ||
957 | sp = XT_PAGE(ip, smp); | ||
958 | |||
959 | /* is inode xtree root extension/inline EA area free ? */ | ||
960 | if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && | ||
961 | (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) && | ||
962 | (JFS_IP(ip)->mode2 & INLINEEA)) { | ||
963 | sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); | ||
964 | JFS_IP(ip)->mode2 &= ~INLINEEA; | ||
965 | |||
966 | BT_MARK_DIRTY(smp, ip); | ||
967 | /* | ||
968 | * acquire a transaction lock on the leaf page; | ||
969 | * | ||
970 | * action: xad insertion/extension; | ||
971 | */ | ||
972 | |||
973 | /* if insert into middle, shift right remaining entries. */ | ||
974 | skip = split->index; | ||
975 | nextindex = le16_to_cpu(sp->header.nextindex); | ||
976 | if (skip < nextindex) | ||
977 | memmove(&sp->xad[skip + 1], &sp->xad[skip], | ||
978 | (nextindex - skip) * sizeof(xad_t)); | ||
979 | |||
980 | /* insert the new entry: mark the entry NEW */ | ||
981 | xad = &sp->xad[skip]; | ||
982 | XT_PUTENTRY(xad, split->flag, split->off, split->len, | ||
983 | split->addr); | ||
984 | |||
985 | /* advance next available entry index */ | ||
986 | sp->header.nextindex = | ||
987 | cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1); | ||
988 | |||
989 | /* Don't log it if there are no links to the file */ | ||
990 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
991 | tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); | ||
992 | xtlck = (struct xtlock *) & tlck->lock; | ||
993 | xtlck->lwm.offset = (xtlck->lwm.offset) ? | ||
994 | min(skip, (int)xtlck->lwm.offset) : skip; | ||
995 | xtlck->lwm.length = | ||
996 | le16_to_cpu(sp->header.nextindex) - | ||
997 | xtlck->lwm.offset; | ||
998 | } | ||
999 | |||
1000 | return 0; | ||
1001 | } | ||
1002 | |||
1003 | /* | ||
1004 | * allocate new index blocks to cover index page split(s) | ||
1005 | * | ||
1006 | * allocation hint: ? | ||
1007 | */ | ||
1008 | if (split->pxdlist == NULL) { | ||
1009 | nsplit = btstack->nsplit; | ||
1010 | split->pxdlist = &pxdlist; | ||
1011 | pxdlist.maxnpxd = pxdlist.npxd = 0; | ||
1012 | pxd = &pxdlist.pxd[0]; | ||
1013 | xlen = JFS_SBI(ip->i_sb)->nbperpage; | ||
1014 | for (; nsplit > 0; nsplit--, pxd++) { | ||
1015 | if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) | ||
1016 | == 0) { | ||
1017 | PXDaddress(pxd, xaddr); | ||
1018 | PXDlength(pxd, xlen); | ||
1019 | |||
1020 | pxdlist.maxnpxd++; | ||
1021 | |||
1022 | continue; | ||
1023 | } | ||
1024 | |||
1025 | /* undo allocation */ | ||
1026 | |||
1027 | XT_PUTPAGE(smp); | ||
1028 | return rc; | ||
1029 | } | ||
1030 | } | ||
1031 | |||
1032 | /* | ||
1033 | * Split leaf page <sp> into <sp> and a new right page <rp>. | ||
1034 | * | ||
1035 | * The split routines insert the new entry into the leaf page, | ||
1036 | * and acquire txLock as appropriate. | ||
1037 | * return <rp> pinned and its block number <rpbn>. | ||
1038 | */ | ||
1039 | rc = (sp->header.flag & BT_ROOT) ? | ||
1040 | xtSplitRoot(tid, ip, split, &rmp) : | ||
1041 | xtSplitPage(tid, ip, split, &rmp, &rbn); | ||
1042 | |||
1043 | XT_PUTPAGE(smp); | ||
1044 | |||
1045 | if (rc) | ||
1046 | return -EIO; | ||
1047 | /* | ||
1048 | * propagate up the router entry for the leaf page just split | ||
1049 | * | ||
1050 | * insert a router entry for the new page into the parent page, | ||
1051 | * propagate the insert/split up the tree by walking back the stack | ||
1052 | * of (bn of parent page, index of child page entry in parent page) | ||
1053 | * that were traversed during the search for the page that split. | ||
1054 | * | ||
1055 | * the propagation of insert/split up the tree stops if the root | ||
1056 | * splits or the page inserted into doesn't have to split to hold | ||
1057 | * the new entry. | ||
1058 | * | ||
1059 | * the parent entry for the split page remains the same, and | ||
1060 | * a new entry is inserted at its right with the first key and | ||
1061 | * block number of the new right page. | ||
1062 | * | ||
1063 | * There are a maximum of 3 pages pinned at any time: | ||
1064 | * right child, left parent and right parent (when the parent splits) | ||
1065 | * to keep the child page pinned while working on the parent. | ||
1066 | * make sure that all pins are released at exit. | ||
1067 | */ | ||
1068 | while ((parent = BT_POP(btstack)) != NULL) { | ||
1069 | /* parent page specified by stack frame <parent> */ | ||
1070 | |||
1071 | /* keep current child pages <rcp> pinned */ | ||
1072 | rcmp = rmp; | ||
1073 | rcbn = rbn; | ||
1074 | rcp = XT_PAGE(ip, rcmp); | ||
1075 | |||
1076 | /* | ||
1077 | * insert router entry in parent for new right child page <rp> | ||
1078 | */ | ||
1079 | /* get/pin the parent page <sp> */ | ||
1080 | XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); | ||
1081 | if (rc) { | ||
1082 | XT_PUTPAGE(rcmp); | ||
1083 | return rc; | ||
1084 | } | ||
1085 | |||
1086 | /* | ||
1087 | * The new key entry goes ONE AFTER the index of parent entry, | ||
1088 | * because the split was to the right. | ||
1089 | */ | ||
1090 | skip = parent->index + 1; | ||
1091 | |||
1092 | /* | ||
1093 | * split or shift right remaining entries of the parent page | ||
1094 | */ | ||
1095 | nextindex = le16_to_cpu(sp->header.nextindex); | ||
1096 | /* | ||
1097 | * parent page is full - split the parent page | ||
1098 | */ | ||
1099 | if (nextindex == le16_to_cpu(sp->header.maxentry)) { | ||
1100 | /* init for parent page split */ | ||
1101 | split->mp = smp; | ||
1102 | split->index = skip; /* index at insert */ | ||
1103 | split->flag = XAD_NEW; | ||
1104 | split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); | ||
1105 | split->len = JFS_SBI(ip->i_sb)->nbperpage; | ||
1106 | split->addr = rcbn; | ||
1107 | |||
1108 | /* unpin previous right child page */ | ||
1109 | XT_PUTPAGE(rcmp); | ||
1110 | |||
1111 | /* The split routines insert the new entry, | ||
1112 | * and acquire txLock as appropriate. | ||
1113 | * return <rp> pinned and its block number <rpbn>. | ||
1114 | */ | ||
1115 | rc = (sp->header.flag & BT_ROOT) ? | ||
1116 | xtSplitRoot(tid, ip, split, &rmp) : | ||
1117 | xtSplitPage(tid, ip, split, &rmp, &rbn); | ||
1118 | if (rc) { | ||
1119 | XT_PUTPAGE(smp); | ||
1120 | return rc; | ||
1121 | } | ||
1122 | |||
1123 | XT_PUTPAGE(smp); | ||
1124 | /* keep new child page <rp> pinned */ | ||
1125 | } | ||
1126 | /* | ||
1127 | * parent page is not full - insert in parent page | ||
1128 | */ | ||
1129 | else { | ||
1130 | /* | ||
1131 | * insert router entry in parent for the right child | ||
1132 | * page from the first entry of the right child page: | ||
1133 | */ | ||
1134 | /* | ||
1135 | * acquire a transaction lock on the parent page; | ||
1136 | * | ||
1137 | * action: router xad insertion; | ||
1138 | */ | ||
1139 | BT_MARK_DIRTY(smp, ip); | ||
1140 | |||
1141 | /* | ||
1142 | * if insert into middle, shift right remaining entries | ||
1143 | */ | ||
1144 | if (skip < nextindex) | ||
1145 | memmove(&sp->xad[skip + 1], &sp->xad[skip], | ||
1146 | (nextindex - | ||
1147 | skip) << L2XTSLOTSIZE); | ||
1148 | |||
1149 | /* insert the router entry */ | ||
1150 | xad = &sp->xad[skip]; | ||
1151 | XT_PUTENTRY(xad, XAD_NEW, | ||
1152 | offsetXAD(&rcp->xad[XTENTRYSTART]), | ||
1153 | JFS_SBI(ip->i_sb)->nbperpage, rcbn); | ||
1154 | |||
1155 | /* advance next available entry index. */ | ||
1156 | sp->header.nextindex = | ||
1157 | cpu_to_le16(le16_to_cpu(sp->header.nextindex) + | ||
1158 | 1); | ||
1159 | |||
1160 | /* Don't log it if there are no links to the file */ | ||
1161 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1162 | tlck = txLock(tid, ip, smp, | ||
1163 | tlckXTREE | tlckGROW); | ||
1164 | xtlck = (struct xtlock *) & tlck->lock; | ||
1165 | xtlck->lwm.offset = (xtlck->lwm.offset) ? | ||
1166 | min(skip, (int)xtlck->lwm.offset) : skip; | ||
1167 | xtlck->lwm.length = | ||
1168 | le16_to_cpu(sp->header.nextindex) - | ||
1169 | xtlck->lwm.offset; | ||
1170 | } | ||
1171 | |||
1172 | /* unpin parent page */ | ||
1173 | XT_PUTPAGE(smp); | ||
1174 | |||
1175 | /* exit propagate up */ | ||
1176 | break; | ||
1177 | } | ||
1178 | } | ||
1179 | |||
1180 | /* unpin current right page */ | ||
1181 | XT_PUTPAGE(rmp); | ||
1182 | |||
1183 | return 0; | ||
1184 | } | ||
1185 | |||
1186 | |||
1187 | /* | ||
1188 | * xtSplitPage() | ||
1189 | * | ||
1190 | * function: | ||
1191 | * split a full non-root page into | ||
1192 | * original/split/left page and new right page | ||
1193 | * i.e., the original/split page remains as left page. | ||
1194 | * | ||
1195 | * parameter: | ||
1196 | * int tid, | ||
1197 | * struct inode *ip, | ||
1198 | * struct xtsplit *split, | ||
1199 | * struct metapage **rmpp, | ||
1200 | * u64 *rbnp, | ||
1201 | * | ||
1202 | * return: | ||
1203 | * Pointer to page in which to insert or NULL on error. | ||
1204 | */ | ||
1205 | static int | ||
1206 | xtSplitPage(tid_t tid, struct inode *ip, | ||
1207 | struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp) | ||
1208 | { | ||
1209 | int rc = 0; | ||
1210 | struct metapage *smp; | ||
1211 | xtpage_t *sp; | ||
1212 | struct metapage *rmp; | ||
1213 | xtpage_t *rp; /* new right page allocated */ | ||
1214 | s64 rbn; /* new right page block number */ | ||
1215 | struct metapage *mp; | ||
1216 | xtpage_t *p; | ||
1217 | s64 nextbn; | ||
1218 | int skip, maxentry, middle, righthalf, n; | ||
1219 | xad_t *xad; | ||
1220 | struct pxdlist *pxdlist; | ||
1221 | pxd_t *pxd; | ||
1222 | struct tlock *tlck; | ||
1223 | struct xtlock *sxtlck = NULL, *rxtlck = NULL; | ||
1224 | int quota_allocation = 0; | ||
1225 | |||
1226 | smp = split->mp; | ||
1227 | sp = XT_PAGE(ip, smp); | ||
1228 | |||
1229 | INCREMENT(xtStat.split); | ||
1230 | |||
1231 | pxdlist = split->pxdlist; | ||
1232 | pxd = &pxdlist->pxd[pxdlist->npxd]; | ||
1233 | pxdlist->npxd++; | ||
1234 | rbn = addressPXD(pxd); | ||
1235 | |||
1236 | /* Allocate blocks to quota. */ | ||
1237 | if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { | ||
1238 | rc = -EDQUOT; | ||
1239 | goto clean_up; | ||
1240 | } | ||
1241 | |||
1242 | quota_allocation += lengthPXD(pxd); | ||
1243 | |||
1244 | /* | ||
1245 | * allocate the new right page for the split | ||
1246 | */ | ||
1247 | rmp = get_metapage(ip, rbn, PSIZE, 1); | ||
1248 | if (rmp == NULL) { | ||
1249 | rc = -EIO; | ||
1250 | goto clean_up; | ||
1251 | } | ||
1252 | |||
1253 | jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); | ||
1254 | |||
1255 | BT_MARK_DIRTY(rmp, ip); | ||
1256 | /* | ||
1257 | * action: new page; | ||
1258 | */ | ||
1259 | |||
1260 | rp = (xtpage_t *) rmp->data; | ||
1261 | rp->header.self = *pxd; | ||
1262 | rp->header.flag = sp->header.flag & BT_TYPE; | ||
1263 | rp->header.maxentry = sp->header.maxentry; /* little-endian */ | ||
1264 | rp->header.nextindex = cpu_to_le16(XTENTRYSTART); | ||
1265 | |||
1266 | BT_MARK_DIRTY(smp, ip); | ||
1267 | /* Don't log it if there are no links to the file */ | ||
1268 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1269 | /* | ||
1270 | * acquire a transaction lock on the new right page; | ||
1271 | */ | ||
1272 | tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); | ||
1273 | rxtlck = (struct xtlock *) & tlck->lock; | ||
1274 | rxtlck->lwm.offset = XTENTRYSTART; | ||
1275 | /* | ||
1276 | * acquire a transaction lock on the split page | ||
1277 | */ | ||
1278 | tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); | ||
1279 | sxtlck = (struct xtlock *) & tlck->lock; | ||
1280 | } | ||
1281 | |||
1282 | /* | ||
1283 | * initialize/update sibling pointers of <sp> and <rp> | ||
1284 | */ | ||
1285 | nextbn = le64_to_cpu(sp->header.next); | ||
1286 | rp->header.next = cpu_to_le64(nextbn); | ||
1287 | rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); | ||
1288 | sp->header.next = cpu_to_le64(rbn); | ||
1289 | |||
1290 | skip = split->index; | ||
1291 | |||
1292 | /* | ||
1293 | * sequential append at tail (after last entry of last page) | ||
1294 | * | ||
1295 | * if splitting the last page on a level because of appending | ||
1296 | * a entry to it (skip is maxentry), it's likely that the access is | ||
1297 | * sequential. adding an empty page on the side of the level is less | ||
1298 | * work and can push the fill factor much higher than normal. | ||
1299 | * if we're wrong it's no big deal - we will do the split the right | ||
1300 | * way next time. | ||
1301 | * (it may look like it's equally easy to do a similar hack for | ||
1302 | * reverse sorted data, that is, split the tree left, but it's not. | ||
1303 | * Be my guest.) | ||
1304 | */ | ||
1305 | if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { | ||
1306 | /* | ||
1307 | * acquire a transaction lock on the new/right page; | ||
1308 | * | ||
1309 | * action: xad insertion; | ||
1310 | */ | ||
1311 | /* insert entry at the first entry of the new right page */ | ||
1312 | xad = &rp->xad[XTENTRYSTART]; | ||
1313 | XT_PUTENTRY(xad, split->flag, split->off, split->len, | ||
1314 | split->addr); | ||
1315 | |||
1316 | rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); | ||
1317 | |||
1318 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1319 | /* rxtlck->lwm.offset = XTENTRYSTART; */ | ||
1320 | rxtlck->lwm.length = 1; | ||
1321 | } | ||
1322 | |||
1323 | *rmpp = rmp; | ||
1324 | *rbnp = rbn; | ||
1325 | |||
1326 | jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); | ||
1327 | return 0; | ||
1328 | } | ||
1329 | |||
1330 | /* | ||
1331 | * non-sequential insert (at possibly middle page) | ||
1332 | */ | ||
1333 | |||
1334 | /* | ||
1335 | * update previous pointer of old next/right page of <sp> | ||
1336 | */ | ||
1337 | if (nextbn != 0) { | ||
1338 | XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); | ||
1339 | if (rc) { | ||
1340 | XT_PUTPAGE(rmp); | ||
1341 | goto clean_up; | ||
1342 | } | ||
1343 | |||
1344 | BT_MARK_DIRTY(mp, ip); | ||
1345 | /* | ||
1346 | * acquire a transaction lock on the next page; | ||
1347 | * | ||
1348 | * action:sibling pointer update; | ||
1349 | */ | ||
1350 | if (!test_cflag(COMMIT_Nolink, ip)) | ||
1351 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); | ||
1352 | |||
1353 | p->header.prev = cpu_to_le64(rbn); | ||
1354 | |||
1355 | /* sibling page may have been updated previously, or | ||
1356 | * it may be updated later; | ||
1357 | */ | ||
1358 | |||
1359 | XT_PUTPAGE(mp); | ||
1360 | } | ||
1361 | |||
1362 | /* | ||
1363 | * split the data between the split and new/right pages | ||
1364 | */ | ||
1365 | maxentry = le16_to_cpu(sp->header.maxentry); | ||
1366 | middle = maxentry >> 1; | ||
1367 | righthalf = maxentry - middle; | ||
1368 | |||
1369 | /* | ||
1370 | * skip index in old split/left page - insert into left page: | ||
1371 | */ | ||
1372 | if (skip <= middle) { | ||
1373 | /* move right half of split page to the new right page */ | ||
1374 | memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], | ||
1375 | righthalf << L2XTSLOTSIZE); | ||
1376 | |||
1377 | /* shift right tail of left half to make room for new entry */ | ||
1378 | if (skip < middle) | ||
1379 | memmove(&sp->xad[skip + 1], &sp->xad[skip], | ||
1380 | (middle - skip) << L2XTSLOTSIZE); | ||
1381 | |||
1382 | /* insert new entry */ | ||
1383 | xad = &sp->xad[skip]; | ||
1384 | XT_PUTENTRY(xad, split->flag, split->off, split->len, | ||
1385 | split->addr); | ||
1386 | |||
1387 | /* update page header */ | ||
1388 | sp->header.nextindex = cpu_to_le16(middle + 1); | ||
1389 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1390 | sxtlck->lwm.offset = (sxtlck->lwm.offset) ? | ||
1391 | min(skip, (int)sxtlck->lwm.offset) : skip; | ||
1392 | } | ||
1393 | |||
1394 | rp->header.nextindex = | ||
1395 | cpu_to_le16(XTENTRYSTART + righthalf); | ||
1396 | } | ||
1397 | /* | ||
1398 | * skip index in new right page - insert into right page: | ||
1399 | */ | ||
1400 | else { | ||
1401 | /* move left head of right half to right page */ | ||
1402 | n = skip - middle; | ||
1403 | memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], | ||
1404 | n << L2XTSLOTSIZE); | ||
1405 | |||
1406 | /* insert new entry */ | ||
1407 | n += XTENTRYSTART; | ||
1408 | xad = &rp->xad[n]; | ||
1409 | XT_PUTENTRY(xad, split->flag, split->off, split->len, | ||
1410 | split->addr); | ||
1411 | |||
1412 | /* move right tail of right half to right page */ | ||
1413 | if (skip < maxentry) | ||
1414 | memmove(&rp->xad[n + 1], &sp->xad[skip], | ||
1415 | (maxentry - skip) << L2XTSLOTSIZE); | ||
1416 | |||
1417 | /* update page header */ | ||
1418 | sp->header.nextindex = cpu_to_le16(middle); | ||
1419 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1420 | sxtlck->lwm.offset = (sxtlck->lwm.offset) ? | ||
1421 | min(middle, (int)sxtlck->lwm.offset) : middle; | ||
1422 | } | ||
1423 | |||
1424 | rp->header.nextindex = cpu_to_le16(XTENTRYSTART + | ||
1425 | righthalf + 1); | ||
1426 | } | ||
1427 | |||
1428 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1429 | sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - | ||
1430 | sxtlck->lwm.offset; | ||
1431 | |||
1432 | /* rxtlck->lwm.offset = XTENTRYSTART; */ | ||
1433 | rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - | ||
1434 | XTENTRYSTART; | ||
1435 | } | ||
1436 | |||
1437 | *rmpp = rmp; | ||
1438 | *rbnp = rbn; | ||
1439 | |||
1440 | jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); | ||
1441 | return rc; | ||
1442 | |||
1443 | clean_up: | ||
1444 | |||
1445 | /* Rollback quota allocation. */ | ||
1446 | if (quota_allocation) | ||
1447 | DQUOT_FREE_BLOCK(ip, quota_allocation); | ||
1448 | |||
1449 | return (rc); | ||
1450 | } | ||
1451 | |||
1452 | |||
1453 | /* | ||
1454 | * xtSplitRoot() | ||
1455 | * | ||
1456 | * function: | ||
1457 | * split the full root page into | ||
1458 | * original/root/split page and new right page | ||
1459 | * i.e., root remains fixed in tree anchor (inode) and | ||
1460 | * the root is copied to a single new right child page | ||
1461 | * since root page << non-root page, and | ||
1462 | * the split root page contains a single entry for the | ||
1463 | * new right child page. | ||
1464 | * | ||
1465 | * parameter: | ||
1466 | * int tid, | ||
1467 | * struct inode *ip, | ||
1468 | * struct xtsplit *split, | ||
1469 | * struct metapage **rmpp) | ||
1470 | * | ||
1471 | * return: | ||
1472 | * Pointer to page in which to insert or NULL on error. | ||
1473 | */ | ||
1474 | static int | ||
1475 | xtSplitRoot(tid_t tid, | ||
1476 | struct inode *ip, struct xtsplit * split, struct metapage ** rmpp) | ||
1477 | { | ||
1478 | xtpage_t *sp; | ||
1479 | struct metapage *rmp; | ||
1480 | xtpage_t *rp; | ||
1481 | s64 rbn; | ||
1482 | int skip, nextindex; | ||
1483 | xad_t *xad; | ||
1484 | pxd_t *pxd; | ||
1485 | struct pxdlist *pxdlist; | ||
1486 | struct tlock *tlck; | ||
1487 | struct xtlock *xtlck; | ||
1488 | |||
1489 | sp = &JFS_IP(ip)->i_xtroot; | ||
1490 | |||
1491 | INCREMENT(xtStat.split); | ||
1492 | |||
1493 | /* | ||
1494 | * allocate a single (right) child page | ||
1495 | */ | ||
1496 | pxdlist = split->pxdlist; | ||
1497 | pxd = &pxdlist->pxd[pxdlist->npxd]; | ||
1498 | pxdlist->npxd++; | ||
1499 | rbn = addressPXD(pxd); | ||
1500 | rmp = get_metapage(ip, rbn, PSIZE, 1); | ||
1501 | if (rmp == NULL) | ||
1502 | return -EIO; | ||
1503 | |||
1504 | /* Allocate blocks to quota. */ | ||
1505 | if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) { | ||
1506 | release_metapage(rmp); | ||
1507 | return -EDQUOT; | ||
1508 | } | ||
1509 | |||
1510 | jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); | ||
1511 | |||
1512 | /* | ||
1513 | * acquire a transaction lock on the new right page; | ||
1514 | * | ||
1515 | * action: new page; | ||
1516 | */ | ||
1517 | BT_MARK_DIRTY(rmp, ip); | ||
1518 | |||
1519 | rp = (xtpage_t *) rmp->data; | ||
1520 | rp->header.flag = | ||
1521 | (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; | ||
1522 | rp->header.self = *pxd; | ||
1523 | rp->header.nextindex = cpu_to_le16(XTENTRYSTART); | ||
1524 | rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); | ||
1525 | |||
1526 | /* initialize sibling pointers */ | ||
1527 | rp->header.next = 0; | ||
1528 | rp->header.prev = 0; | ||
1529 | |||
1530 | /* | ||
1531 | * copy the in-line root page into new right page extent | ||
1532 | */ | ||
1533 | nextindex = le16_to_cpu(sp->header.maxentry); | ||
1534 | memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], | ||
1535 | (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); | ||
1536 | |||
1537 | /* | ||
1538 | * insert the new entry into the new right/child page | ||
1539 | * (skip index in the new right page will not change) | ||
1540 | */ | ||
1541 | skip = split->index; | ||
1542 | /* if insert into middle, shift right remaining entries */ | ||
1543 | if (skip != nextindex) | ||
1544 | memmove(&rp->xad[skip + 1], &rp->xad[skip], | ||
1545 | (nextindex - skip) * sizeof(xad_t)); | ||
1546 | |||
1547 | xad = &rp->xad[skip]; | ||
1548 | XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); | ||
1549 | |||
1550 | /* update page header */ | ||
1551 | rp->header.nextindex = cpu_to_le16(nextindex + 1); | ||
1552 | |||
1553 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1554 | tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); | ||
1555 | xtlck = (struct xtlock *) & tlck->lock; | ||
1556 | xtlck->lwm.offset = XTENTRYSTART; | ||
1557 | xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - | ||
1558 | XTENTRYSTART; | ||
1559 | } | ||
1560 | |||
1561 | /* | ||
1562 | * reset the root | ||
1563 | * | ||
1564 | * init root with the single entry for the new right page | ||
1565 | * set the 1st entry offset to 0, which force the left-most key | ||
1566 | * at any level of the tree to be less than any search key. | ||
1567 | */ | ||
1568 | /* | ||
1569 | * acquire a transaction lock on the root page (in-memory inode); | ||
1570 | * | ||
1571 | * action: root split; | ||
1572 | */ | ||
1573 | BT_MARK_DIRTY(split->mp, ip); | ||
1574 | |||
1575 | xad = &sp->xad[XTENTRYSTART]; | ||
1576 | XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); | ||
1577 | |||
1578 | /* update page header of root */ | ||
1579 | sp->header.flag &= ~BT_LEAF; | ||
1580 | sp->header.flag |= BT_INTERNAL; | ||
1581 | |||
1582 | sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); | ||
1583 | |||
1584 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1585 | tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); | ||
1586 | xtlck = (struct xtlock *) & tlck->lock; | ||
1587 | xtlck->lwm.offset = XTENTRYSTART; | ||
1588 | xtlck->lwm.length = 1; | ||
1589 | } | ||
1590 | |||
1591 | *rmpp = rmp; | ||
1592 | |||
1593 | jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp); | ||
1594 | return 0; | ||
1595 | } | ||
1596 | |||
1597 | |||
1598 | /* | ||
1599 | * xtExtend() | ||
1600 | * | ||
1601 | * function: extend in-place the existing extent that ends at xoff by xlen; any length beyond MAXXLEN is inserted as a separate XAD_NEW entry right after it; | ||
1602 | * return: 0 on success; -EIO if xtSearch() finds no extent at xoff - 1 or the extension is not contiguous; otherwise the error from xtSearch(), xtSplitUp() or the page read; | ||
1603 | * note: existing extent may or may not have been committed. | ||
1604 | * caller is responsible for pager buffer cache update, and | ||
1605 | * working block allocation map update; | ||
1606 | * update pmap: alloc whole extended extent; | ||
1607 | */ | ||
1608 | int xtExtend(tid_t tid, /* transaction id */ | ||
1609 | struct inode *ip, s64 xoff, /* delta extent offset */ | ||
1610 | s32 xlen, /* delta extent length */ | ||
1611 | int flag) /* not referenced in this function */ | ||
1612 | { | ||
1613 | int rc = 0; | ||
1614 | int cmp; | ||
1615 | struct metapage *mp; /* meta-page buffer */ | ||
1616 | xtpage_t *p; /* base B+-tree index page */ | ||
1617 | s64 bn; | ||
1618 | int index, nextindex, len; | ||
1619 | struct btstack btstack; /* traverse stack */ | ||
1620 | struct xtsplit split; /* split information */ | ||
1621 | xad_t *xad; | ||
1622 | s64 xaddr; | ||
1623 | struct tlock *tlck; | ||
1624 | struct xtlock *xtlck = NULL; /* stays NULL when COMMIT_Nolink is set; only dereferenced under the same test */ | ||
1625 | |||
1626 | jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); | ||
1627 | |||
1628 | /* there must exist an extent to be extended: search for the offset just before xoff */ | ||
1629 | if ((rc = xtSearch(ip, xoff - 1, &cmp, &btstack, XT_INSERT))) | ||
1630 | return rc; | ||
1631 | |||
1632 | /* retrieve search result */ | ||
1633 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
1634 | |||
1635 | if (cmp != 0) { | ||
1636 | XT_PUTPAGE(mp); | ||
1637 | jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); | ||
1638 | return -EIO; | ||
1639 | } | ||
1640 | |||
1641 | /* extension must be contiguous: the found extent has to end exactly at xoff */ | ||
1642 | xad = &p->xad[index]; | ||
1643 | if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { | ||
1644 | XT_PUTPAGE(mp); | ||
1645 | jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); | ||
1646 | return -EIO; | ||
1647 | } | ||
1648 | |||
1649 | /* | ||
1650 | * acquire a transaction lock on the leaf page; | ||
1651 | * | ||
1652 | * action: xad insertion/extension; | ||
1653 | */ | ||
1654 | BT_MARK_DIRTY(mp, ip); | ||
1655 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1656 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); | ||
1657 | xtlck = (struct xtlock *) & tlck->lock; | ||
1658 | } | ||
1659 | |||
1660 | /* will the extended length overflow the maximum extent length (MAXXLEN) ? len is the overflow amount */ | ||
1661 | xlen = lengthXAD(xad) + xlen; | ||
1662 | if ((len = xlen - MAXXLEN) <= 0) | ||
1663 | goto extendOld; | ||
1664 | |||
1665 | /* | ||
1666 | * extent overflow: insert entry for new extent | ||
1667 | */ | ||
1668 | //insertNew: | ||
1669 | xoff = offsetXAD(xad) + MAXXLEN; | ||
1670 | xaddr = addressXAD(xad) + MAXXLEN; | ||
1671 | nextindex = le16_to_cpu(p->header.nextindex); | ||
1672 | |||
1673 | /* | ||
1674 | * if the leaf page is full, insert the new entry and | ||
1675 | * propagate up the router entry for the new page from split | ||
1676 | * | ||
1677 | * The xtSplitUp() will insert the entry and unpin the leaf page. | ||
1678 | */ | ||
1679 | if (nextindex == le16_to_cpu(p->header.maxentry)) { | ||
1680 | /* xtSplitUp() unpins leaf pages */ | ||
1681 | split.mp = mp; | ||
1682 | split.index = index + 1; | ||
1683 | split.flag = XAD_NEW; | ||
1684 | split.off = xoff; /* split offset */ | ||
1685 | split.len = len; | ||
1686 | split.addr = xaddr; | ||
1687 | split.pxdlist = NULL; | ||
1688 | if ((rc = xtSplitUp(tid, ip, &split, &btstack))) | ||
1689 | return rc; | ||
1690 | |||
1691 | /* get back old page */ | ||
1692 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
1693 | if (rc) | ||
1694 | return rc; | ||
1695 | /* | ||
1696 | * if leaf root has been split, original root has been | ||
1697 | * copied to new child page, i.e., original entry now | ||
1698 | * resides on the new child page; | ||
1699 | */ | ||
1700 | if (p->header.flag & BT_INTERNAL) { | ||
1701 | ASSERT(p->header.nextindex == | ||
1702 | cpu_to_le16(XTENTRYSTART + 1)); | ||
1703 | xad = &p->xad[XTENTRYSTART]; | ||
1704 | bn = addressXAD(xad); | ||
1705 | XT_PUTPAGE(mp); | ||
1706 | |||
1707 | /* get new child page */ | ||
1708 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
1709 | if (rc) | ||
1710 | return rc; | ||
1711 | |||
1712 | BT_MARK_DIRTY(mp, ip); | ||
1713 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1714 | tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); | ||
1715 | xtlck = (struct xtlock *) & tlck->lock; | ||
1716 | } | ||
1717 | } | ||
1718 | } | ||
1719 | /* | ||
1720 | * insert the new entry into the leaf page | ||
1721 | */ | ||
1722 | else { | ||
1723 | /* insert the new entry: mark the entry NEW */ | ||
1724 | xad = &p->xad[index + 1]; | ||
1725 | XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); | ||
1726 | |||
1727 | /* advance next available entry index */ | ||
1728 | p->header.nextindex = | ||
1729 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
1730 | } | ||
1731 | |||
1732 | /* get back old entry */ | ||
1733 | xad = &p->xad[index]; | ||
1734 | xlen = MAXXLEN; /* old extent is filled to the maximum; the overflow went to the new entry */ | ||
1735 | |||
1736 | /* | ||
1737 | * extend old extent | ||
1738 | */ | ||
1739 | extendOld: | ||
1740 | XADlength(xad, xlen); | ||
1741 | if (!(xad->flag & XAD_NEW)) | ||
1742 | xad->flag |= XAD_EXTENDED; | ||
1743 | |||
1744 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1745 | xtlck->lwm.offset = | ||
1746 | (xtlck->lwm.offset) ? min(index, | ||
1747 | (int)xtlck->lwm.offset) : index; | ||
1748 | xtlck->lwm.length = | ||
1749 | le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; | ||
1750 | } | ||
1751 | |||
1752 | /* unpin the leaf page */ | ||
1753 | XT_PUTPAGE(mp); | ||
1754 | |||
1755 | return rc; | ||
1756 | } | ||
1757 | |||
1758 | #ifdef _NOTYET | ||
1759 | /* | ||
1760 | * xtTailgate() | ||
1761 | * | ||
1762 | * function: split existing 'tail' extent | ||
1763 | * (split offset >= start offset of tail extent), and | ||
1764 | * relocate and extend the split tail half; | ||
1765 | * return: 0 on success; -EIO if the extent is not found or is not the last entry of its page; otherwise the error from xtSearch(), xtSplitUp() or the page read; | ||
1766 | * note: existing extent may or may not have been committed. | ||
1767 | * caller is responsible for pager buffer cache update, and | ||
1768 | * working block allocation map update; | ||
1769 | * update pmap: free old split tail extent, alloc new extent; | ||
1770 | */ | ||
1771 | int xtTailgate(tid_t tid, /* transaction id */ | ||
1772 | struct inode *ip, s64 xoff, /* split/new extent offset */ | ||
1773 | s32 xlen, /* new extent length */ | ||
1774 | s64 xaddr, /* new extent address */ | ||
1775 | int flag) /* not referenced in this function */ | ||
1776 | { | ||
1777 | int rc = 0; | ||
1778 | int cmp; | ||
1779 | struct metapage *mp; /* meta-page buffer */ | ||
1780 | xtpage_t *p; /* base B+-tree index page */ | ||
1781 | s64 bn; | ||
1782 | int index, nextindex, llen, rlen; | ||
1783 | struct btstack btstack; /* traverse stack */ | ||
1784 | struct xtsplit split; /* split information */ | ||
1785 | xad_t *xad; | ||
1786 | struct tlock *tlck; | ||
1787 | struct xtlock *xtlck = 0; | ||
1788 | struct tlock *mtlck; | ||
1789 | struct maplock *pxdlock; | ||
1790 | |||
1791 | /* | ||
1792 | printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", | ||
1793 | (ulong)xoff, xlen, (ulong)xaddr); | ||
1794 | */ | ||
1795 | |||
1796 | /* there must exist an extent to be tailgated */ | ||
1797 | if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) | ||
1798 | return rc; | ||
1799 | |||
1800 | /* retrieve search result */ | ||
1801 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
1802 | |||
1803 | if (cmp != 0) { | ||
1804 | XT_PUTPAGE(mp); | ||
1805 | jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); | ||
1806 | return -EIO; | ||
1807 | } | ||
1808 | |||
1809 | /* entry found must be last entry */ | ||
1810 | nextindex = le16_to_cpu(p->header.nextindex); | ||
1811 | if (index != nextindex - 1) { | ||
1812 | XT_PUTPAGE(mp); | ||
1813 | jfs_error(ip->i_sb, | ||
1814 | "xtTailgate: the entry found is not the last entry"); | ||
1815 | return -EIO; | ||
1816 | } | ||
1817 | |||
1818 | BT_MARK_DIRTY(mp, ip); | ||
1819 | /* | ||
1820 | * acquire tlock of the leaf page containing original entry | ||
1821 | */ | ||
1822 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1823 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); | ||
1824 | xtlck = (struct xtlock *) & tlck->lock; | ||
1825 | } | ||
1826 | |||
1827 | /* completely replace extent ? (llen == 0 means the new extent starts at the old extent's offset) */ | ||
1828 | xad = &p->xad[index]; | ||
1829 | /* | ||
1830 | printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", | ||
1831 | (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); | ||
1832 | */ | ||
1833 | if ((llen = xoff - offsetXAD(xad)) == 0) | ||
1834 | goto updateOld; | ||
1835 | |||
1836 | /* | ||
1837 | * partially replace extent: insert entry for new extent | ||
1838 | */ | ||
1839 | //insertNew: | ||
1840 | /* | ||
1841 | * if the leaf page is full, insert the new entry and | ||
1842 | * propagate up the router entry for the new page from split | ||
1843 | * | ||
1844 | * The xtSplitUp() will insert the entry and unpin the leaf page. | ||
1845 | */ | ||
1846 | if (nextindex == le16_to_cpu(p->header.maxentry)) { | ||
1847 | /* xtSplitUp() unpins leaf pages */ | ||
1848 | split.mp = mp; | ||
1849 | split.index = index + 1; | ||
1850 | split.flag = XAD_NEW; | ||
1851 | split.off = xoff; /* split offset */ | ||
1852 | split.len = xlen; | ||
1853 | split.addr = xaddr; | ||
1854 | split.pxdlist = NULL; | ||
1855 | if ((rc = xtSplitUp(tid, ip, &split, &btstack))) | ||
1856 | return rc; | ||
1857 | |||
1858 | /* get back old page */ | ||
1859 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
1860 | if (rc) | ||
1861 | return rc; | ||
1862 | /* | ||
1863 | * if leaf root has been split, original root has been | ||
1864 | * copied to new child page, i.e., original entry now | ||
1865 | * resides on the new child page; | ||
1866 | */ | ||
1867 | if (p->header.flag & BT_INTERNAL) { | ||
1868 | ASSERT(p->header.nextindex == | ||
1869 | cpu_to_le16(XTENTRYSTART + 1)); | ||
1870 | xad = &p->xad[XTENTRYSTART]; | ||
1871 | bn = addressXAD(xad); | ||
1872 | XT_PUTPAGE(mp); | ||
1873 | |||
1874 | /* get new child page */ | ||
1875 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
1876 | if (rc) | ||
1877 | return rc; | ||
1878 | |||
1879 | BT_MARK_DIRTY(mp, ip); | ||
1880 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1881 | tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); | ||
1882 | xtlck = (struct xtlock *) & tlck->lock; | ||
1883 | } | ||
1884 | } | ||
1885 | } | ||
1886 | /* | ||
1887 | * insert the new entry into the leaf page | ||
1888 | */ | ||
1889 | else { | ||
1890 | /* insert the new entry: mark the entry NEW */ | ||
1891 | xad = &p->xad[index + 1]; | ||
1892 | XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); | ||
1893 | |||
1894 | /* advance next available entry index */ | ||
1895 | p->header.nextindex = | ||
1896 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
1897 | } | ||
1898 | |||
1899 | /* get back old XAD */ | ||
1900 | xad = &p->xad[index]; | ||
1901 | |||
1902 | /* | ||
1903 | * truncate/relocate old extent at split offset | ||
1904 | */ | ||
1905 | updateOld: | ||
1906 | /* update dmap for old/committed/truncated extent; rlen is the part of the old extent at and past the split offset */ | ||
1907 | rlen = lengthXAD(xad) - llen; | ||
1908 | if (!(xad->flag & XAD_NEW)) { | ||
1909 | /* free from PWMAP at commit */ | ||
1910 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1911 | mtlck = txMaplock(tid, ip, tlckMAP); | ||
1912 | pxdlock = (struct maplock *) & mtlck->lock; | ||
1913 | pxdlock->flag = mlckFREEPXD; | ||
1914 | PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); | ||
1915 | PXDlength(&pxdlock->pxd, rlen); | ||
1916 | pxdlock->index = 1; | ||
1917 | } | ||
1918 | } else | ||
1919 | /* free from WMAP */ | ||
1920 | dbFree(ip, addressXAD(xad) + llen, (s64) rlen); | ||
1921 | |||
1922 | if (llen) | ||
1923 | /* truncate */ | ||
1924 | XADlength(xad, llen); | ||
1925 | else | ||
1926 | /* replace */ | ||
1927 | XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); | ||
1928 | |||
1929 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1930 | xtlck->lwm.offset = (xtlck->lwm.offset) ? | ||
1931 | min(index, (int)xtlck->lwm.offset) : index; | ||
1932 | xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - | ||
1933 | xtlck->lwm.offset; | ||
1934 | } | ||
1935 | |||
1936 | /* unpin the leaf page */ | ||
1937 | XT_PUTPAGE(mp); | ||
1938 | |||
1939 | return rc; | ||
1940 | } | ||
1941 | #endif /* _NOTYET */ | ||
1942 | |||
1943 | /* | ||
1944 | * xtUpdate() | ||
1945 | * | ||
1946 | * function: update XAD; | ||
1947 | * | ||
1948 | * update extent for allocated_but_not_recorded or | ||
1949 | * compressed extent; | ||
1950 | * return: 0 on success; -EIO on a failed lookup or consistency check; otherwise the error from xtSearch(), xtSplitUp() or the page read; | ||
1951 | * parameter: | ||
1952 | * nxad - new XAD; | ||
1953 | * logical extent of the specified XAD must be completely | ||
1954 | * contained by an existing XAD; | ||
1955 | */ | ||
1956 | int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) | ||
1957 | { /* new XAD */ | ||
1958 | int rc = 0; | ||
1959 | int cmp; | ||
1960 | struct metapage *mp; /* meta-page buffer */ | ||
1961 | xtpage_t *p; /* base B+-tree index page */ | ||
1962 | s64 bn; | ||
1963 | int index0, index, newindex, nextindex; | ||
1964 | struct btstack btstack; /* traverse stack */ | ||
1965 | struct xtsplit split; /* split information */ | ||
1966 | xad_t *xad, *lxad, *rxad; | ||
1967 | int xflag; | ||
1968 | s64 nxoff, xoff; | ||
1969 | int nxlen, xlen, lxlen, rxlen; | ||
1970 | s64 nxaddr, xaddr; | ||
1971 | struct tlock *tlck; | ||
1972 | struct xtlock *xtlck = NULL; /* stays NULL when COMMIT_Nolink is set; only dereferenced under the same test */ | ||
1973 | int newpage = 0; /* set to 1 when, after a split, nXAD is found to be on the new page */ | ||
1974 | |||
1975 | /* there must exist an extent containing nXAD to be updated */ | ||
1976 | nxoff = offsetXAD(nxad); | ||
1977 | nxlen = lengthXAD(nxad); | ||
1978 | nxaddr = addressXAD(nxad); | ||
1979 | |||
1980 | if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) | ||
1981 | return rc; | ||
1982 | |||
1983 | /* retrieve search result */ | ||
1984 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); | ||
1985 | |||
1986 | if (cmp != 0) { | ||
1987 | XT_PUTPAGE(mp); | ||
1988 | jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); | ||
1989 | return -EIO; | ||
1990 | } | ||
1991 | |||
1992 | BT_MARK_DIRTY(mp, ip); | ||
1993 | /* | ||
1994 | * acquire tlock of the leaf page containing original entry | ||
1995 | */ | ||
1996 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
1997 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); | ||
1998 | xtlck = (struct xtlock *) & tlck->lock; | ||
1999 | } | ||
2000 | |||
2001 | xad = &p->xad[index0]; | ||
2002 | xflag = xad->flag; | ||
2003 | xoff = offsetXAD(xad); | ||
2004 | xlen = lengthXAD(xad); | ||
2005 | xaddr = addressXAD(xad); | ||
2006 | |||
2007 | /* nXAD must be completely contained within XAD */ | ||
2008 | if ((xoff > nxoff) || | ||
2009 | (nxoff + nxlen > xoff + xlen)) { | ||
2010 | XT_PUTPAGE(mp); | ||
2011 | jfs_error(ip->i_sb, | ||
2012 | "xtUpdate: nXAD in not completely contained within XAD"); | ||
2013 | return -EIO; | ||
2014 | } | ||
2015 | |||
2016 | index = index0; | ||
2017 | newindex = index + 1; | ||
2018 | nextindex = le16_to_cpu(p->header.nextindex); | ||
2019 | |||
2020 | #ifdef _JFS_WIP_NOCOALESCE | ||
2021 | if (xoff < nxoff) | ||
2022 | goto updateRight; | ||
2023 | |||
2024 | /* | ||
2025 | * replace XAD with nXAD | ||
2026 | */ | ||
2027 | replace: /* (nxoff == xoff) */ | ||
2028 | if (nxlen == xlen) { | ||
2029 | /* replace XAD with nXAD:recorded */ | ||
2030 | *xad = *nxad; | ||
2031 | xad->flag = xflag & ~XAD_NOTRECORDED; | ||
2032 | |||
2033 | goto out; | ||
2034 | } else /* (nxlen < xlen) */ | ||
2035 | goto updateLeft; | ||
2036 | #endif /* _JFS_WIP_NOCOALESCE */ | ||
2037 | |||
2038 | /* #ifdef _JFS_WIP_COALESCE */ | ||
2039 | if (xoff < nxoff) | ||
2040 | goto coalesceRight; | ||
2041 | |||
2042 | /* | ||
2043 | * coalesce with left XAD | ||
2044 | */ | ||
2045 | //coalesceLeft: /* (xoff == nxoff) */ | ||
2046 | /* is XAD first entry of page ? */ | ||
2047 | if (index == XTENTRYSTART) | ||
2048 | goto replace; | ||
2049 | |||
2050 | /* is nXAD logically and physically contiguous with lXAD ? */ | ||
2051 | lxad = &p->xad[index - 1]; | ||
2052 | lxlen = lengthXAD(lxad); | ||
2053 | if (!(lxad->flag & XAD_NOTRECORDED) && | ||
2054 | (nxoff == offsetXAD(lxad) + lxlen) && | ||
2055 | (nxaddr == addressXAD(lxad) + lxlen) && | ||
2056 | (lxlen + nxlen < MAXXLEN)) { | ||
2057 | /* extend right lXAD */ | ||
2058 | index0 = index - 1; | ||
2059 | XADlength(lxad, lxlen + nxlen); | ||
2060 | |||
2061 | /* If we just merged two extents together, need to make sure the | ||
2062 | * right extent gets logged. If the left one is marked XAD_NEW, | ||
2063 | * then we know it will be logged. Otherwise, mark as | ||
2064 | * XAD_EXTENDED | ||
2065 | */ | ||
2066 | if (!(lxad->flag & XAD_NEW)) | ||
2067 | lxad->flag |= XAD_EXTENDED; | ||
2068 | |||
2069 | if (xlen > nxlen) { | ||
2070 | /* truncate XAD */ | ||
2071 | XADoffset(xad, xoff + nxlen); | ||
2072 | XADlength(xad, xlen - nxlen); | ||
2073 | XADaddress(xad, xaddr + nxlen); | ||
2074 | goto out; | ||
2075 | } else { /* (xlen == nxlen) */ | ||
2076 | |||
2077 | /* remove XAD */ | ||
2078 | if (index < nextindex - 1) | ||
2079 | memmove(&p->xad[index], &p->xad[index + 1], | ||
2080 | (nextindex - index - | ||
2081 | 1) << L2XTSLOTSIZE); | ||
2082 | |||
2083 | p->header.nextindex = | ||
2084 | cpu_to_le16(le16_to_cpu(p->header.nextindex) - | ||
2085 | 1); | ||
2086 | |||
2087 | index = index0; | ||
2088 | newindex = index + 1; | ||
2089 | nextindex = le16_to_cpu(p->header.nextindex); | ||
2090 | xoff = nxoff = offsetXAD(lxad); | ||
2091 | xlen = nxlen = lxlen + nxlen; | ||
2092 | xaddr = nxaddr = addressXAD(lxad); | ||
2093 | goto coalesceRight; | ||
2094 | } | ||
2095 | } | ||
2096 | |||
2097 | /* | ||
2098 | * replace XAD with nXAD | ||
2099 | */ | ||
2100 | replace: /* (nxoff == xoff) */ | ||
2101 | if (nxlen == xlen) { | ||
2102 | /* replace XAD with nXAD:recorded */ | ||
2103 | *xad = *nxad; | ||
2104 | xad->flag = xflag & ~XAD_NOTRECORDED; | ||
2105 | |||
2106 | goto coalesceRight; | ||
2107 | } else /* (nxlen < xlen) */ | ||
2108 | goto updateLeft; | ||
2109 | |||
2110 | /* | ||
2111 | * coalesce with right XAD | ||
2112 | */ | ||
2113 | coalesceRight: /* (xoff <= nxoff) */ | ||
2114 | /* is XAD last entry of page ? */ | ||
2115 | if (newindex == nextindex) { | ||
2116 | if (xoff == nxoff) | ||
2117 | goto out; | ||
2118 | goto updateRight; | ||
2119 | } | ||
2120 | |||
2121 | /* is nXAD logically and physically contiguous with rXAD ? */ | ||
2122 | rxad = &p->xad[index + 1]; | ||
2123 | rxlen = lengthXAD(rxad); | ||
2124 | if (!(rxad->flag & XAD_NOTRECORDED) && | ||
2125 | (nxoff + nxlen == offsetXAD(rxad)) && | ||
2126 | (nxaddr + nxlen == addressXAD(rxad)) && | ||
2127 | (rxlen + nxlen < MAXXLEN)) { | ||
2128 | /* extend left rXAD */ | ||
2129 | XADoffset(rxad, nxoff); | ||
2130 | XADlength(rxad, rxlen + nxlen); | ||
2131 | XADaddress(rxad, nxaddr); | ||
2132 | |||
2133 | /* If we just merged two extents together, need to make sure | ||
2134 | * the left extent gets logged. If the right one is marked | ||
2135 | * XAD_NEW, then we know it will be logged. Otherwise, mark as | ||
2136 | * XAD_EXTENDED | ||
2137 | */ | ||
2138 | if (!(rxad->flag & XAD_NEW)) | ||
2139 | rxad->flag |= XAD_EXTENDED; | ||
2140 | |||
2141 | if (xlen > nxlen) | ||
2142 | /* truncate XAD */ | ||
2143 | XADlength(xad, xlen - nxlen); | ||
2144 | else { /* (xlen == nxlen) */ | ||
2145 | |||
2146 | /* remove XAD */ | ||
2147 | memmove(&p->xad[index], &p->xad[index + 1], | ||
2148 | (nextindex - index - 1) << L2XTSLOTSIZE); | ||
2149 | |||
2150 | p->header.nextindex = | ||
2151 | cpu_to_le16(le16_to_cpu(p->header.nextindex) - | ||
2152 | 1); | ||
2153 | } | ||
2154 | |||
2155 | goto out; | ||
2156 | } else if (xoff == nxoff) | ||
2157 | goto out; | ||
2158 | |||
2159 | if (xoff >= nxoff) { | ||
2160 | XT_PUTPAGE(mp); | ||
2161 | jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); | ||
2162 | return -EIO; | ||
2163 | } | ||
2164 | /* #endif _JFS_WIP_COALESCE */ | ||
2165 | |||
2166 | /* | ||
2167 | * split XAD into (lXAD, nXAD): | ||
2168 | * | ||
2169 | * |---nXAD---> | ||
2170 | * --|----------XAD----------|-- | ||
2171 | * |-lXAD-| | ||
2172 | */ | ||
2173 | updateRight: /* (xoff < nxoff) */ | ||
2174 | /* truncate old XAD as lXAD:not_recorded */ | ||
2175 | xad = &p->xad[index]; | ||
2176 | XADlength(xad, nxoff - xoff); | ||
2177 | |||
2178 | /* insert nXAD:recorded */ | ||
2179 | if (nextindex == le16_to_cpu(p->header.maxentry)) { | ||
2180 | |||
2181 | /* xtSplitUp() unpins leaf pages */ | ||
2182 | split.mp = mp; | ||
2183 | split.index = newindex; | ||
2184 | split.flag = xflag & ~XAD_NOTRECORDED; | ||
2185 | split.off = nxoff; | ||
2186 | split.len = nxlen; | ||
2187 | split.addr = nxaddr; | ||
2188 | split.pxdlist = NULL; | ||
2189 | if ((rc = xtSplitUp(tid, ip, &split, &btstack))) | ||
2190 | return rc; | ||
2191 | |||
2192 | /* get back old page */ | ||
2193 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
2194 | if (rc) | ||
2195 | return rc; | ||
2196 | /* | ||
2197 | * if leaf root has been split, original root has been | ||
2198 | * copied to new child page, i.e., original entry now | ||
2199 | * resides on the new child page; | ||
2200 | */ | ||
2201 | if (p->header.flag & BT_INTERNAL) { | ||
2202 | ASSERT(p->header.nextindex == | ||
2203 | cpu_to_le16(XTENTRYSTART + 1)); | ||
2204 | xad = &p->xad[XTENTRYSTART]; | ||
2205 | bn = addressXAD(xad); | ||
2206 | XT_PUTPAGE(mp); | ||
2207 | |||
2208 | /* get new child page */ | ||
2209 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
2210 | if (rc) | ||
2211 | return rc; | ||
2212 | |||
2213 | BT_MARK_DIRTY(mp, ip); | ||
2214 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
2215 | tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); | ||
2216 | xtlck = (struct xtlock *) & tlck->lock; | ||
2217 | } | ||
2218 | } else { | ||
2219 | /* is nXAD on new page ? */ | ||
2220 | if (newindex > | ||
2221 | (le16_to_cpu(p->header.maxentry) >> 1)) { | ||
2222 | newindex = | ||
2223 | newindex - | ||
2224 | le16_to_cpu(p->header.nextindex) + | ||
2225 | XTENTRYSTART; | ||
2226 | newpage = 1; | ||
2227 | } | ||
2228 | } | ||
2229 | } else { | ||
2230 | /* if insert into middle, shift right remaining entries */ | ||
2231 | if (newindex < nextindex) | ||
2232 | memmove(&p->xad[newindex + 1], &p->xad[newindex], | ||
2233 | (nextindex - newindex) << L2XTSLOTSIZE); | ||
2234 | |||
2235 | /* insert the entry */ | ||
2236 | xad = &p->xad[newindex]; | ||
2237 | *xad = *nxad; | ||
2238 | xad->flag = xflag & ~XAD_NOTRECORDED; | ||
2239 | |||
2240 | /* advance next available entry index. */ | ||
2241 | p->header.nextindex = | ||
2242 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
2243 | } | ||
2244 | |||
2245 | /* | ||
2246 | * does nXAD force 3-way split ? | ||
2247 | * | ||
2248 | * |---nXAD--->| | ||
2249 | * --|----------XAD-------------|-- | ||
2250 | * |-lXAD-| |-rXAD -| | ||
2251 | */ | ||
2252 | if (nxoff + nxlen == xoff + xlen) | ||
2253 | goto out; | ||
2254 | |||
2255 | /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ | ||
2256 | if (newpage) { | ||
2257 | /* close out old page */ | ||
2258 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
2259 | xtlck->lwm.offset = (xtlck->lwm.offset) ? | ||
2260 | min(index0, (int)xtlck->lwm.offset) : index0; | ||
2261 | xtlck->lwm.length = | ||
2262 | le16_to_cpu(p->header.nextindex) - | ||
2263 | xtlck->lwm.offset; | ||
2264 | } | ||
2265 | |||
2266 | bn = le64_to_cpu(p->header.next); | ||
2267 | XT_PUTPAGE(mp); | ||
2268 | |||
2269 | /* get new right page */ | ||
2270 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
2271 | if (rc) | ||
2272 | return rc; | ||
2273 | |||
2274 | BT_MARK_DIRTY(mp, ip); | ||
2275 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
2276 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); | ||
2277 | xtlck = (struct xtlock *) & tlck->lock; | ||
2278 | } | ||
2279 | |||
2280 | index0 = index = newindex; | ||
2281 | } else | ||
2282 | index++; | ||
2283 | |||
2284 | newindex = index + 1; | ||
2285 | nextindex = le16_to_cpu(p->header.nextindex); | ||
2286 | xlen = xlen - (nxoff - xoff); | ||
2287 | xoff = nxoff; | ||
2288 | xaddr = nxaddr; | ||
2289 | |||
2290 | /* recompute split pages: the page is full again, so redo the search to rebuild btstack before the next split */ | ||
2291 | if (nextindex == le16_to_cpu(p->header.maxentry)) { | ||
2292 | XT_PUTPAGE(mp); | ||
2293 | |||
2294 | if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) | ||
2295 | return rc; | ||
2296 | |||
2297 | /* retrieve search result */ | ||
2298 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); | ||
2299 | |||
2300 | if (cmp != 0) { | ||
2301 | XT_PUTPAGE(mp); | ||
2302 | jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); | ||
2303 | return -EIO; | ||
2304 | } | ||
2305 | |||
2306 | if (index0 != index) { | ||
2307 | XT_PUTPAGE(mp); | ||
2308 | jfs_error(ip->i_sb, | ||
2309 | "xtUpdate: unexpected value of index"); | ||
2310 | return -EIO; | ||
2311 | } | ||
2312 | } | ||
2313 | |||
2314 | /* | ||
2315 | * split XAD into (nXAD, rXAD) | ||
2316 | * | ||
2317 | * ---nXAD---| | ||
2318 | * --|----------XAD----------|-- | ||
2319 | * |-rXAD-| | ||
2320 | */ | ||
2321 | updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ | ||
2322 | /* update old XAD with nXAD:recorded */ | ||
2323 | xad = &p->xad[index]; | ||
2324 | *xad = *nxad; | ||
2325 | xad->flag = xflag & ~XAD_NOTRECORDED; | ||
2326 | |||
2327 | /* insert rXAD:not_recorded */ | ||
2328 | xoff = xoff + nxlen; | ||
2329 | xlen = xlen - nxlen; | ||
2330 | xaddr = xaddr + nxlen; | ||
2331 | if (nextindex == le16_to_cpu(p->header.maxentry)) { | ||
2332 | /* | ||
2333 | printf("xtUpdate.updateLeft.split p:0x%p\n", p); | ||
2334 | */ | ||
2335 | /* xtSplitUp() unpins leaf pages */ | ||
2336 | split.mp = mp; | ||
2337 | split.index = newindex; | ||
2338 | split.flag = xflag; | ||
2339 | split.off = xoff; | ||
2340 | split.len = xlen; | ||
2341 | split.addr = xaddr; | ||
2342 | split.pxdlist = NULL; | ||
2343 | if ((rc = xtSplitUp(tid, ip, &split, &btstack))) | ||
2344 | return rc; | ||
2345 | |||
2346 | /* get back old page */ | ||
2347 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
2348 | if (rc) | ||
2349 | return rc; | ||
2350 | |||
2351 | /* | ||
2352 | * if leaf root has been split, original root has been | ||
2353 | * copied to new child page, i.e., original entry now | ||
2354 | * resides on the new child page; | ||
2355 | */ | ||
2356 | if (p->header.flag & BT_INTERNAL) { | ||
2357 | ASSERT(p->header.nextindex == | ||
2358 | cpu_to_le16(XTENTRYSTART + 1)); | ||
2359 | xad = &p->xad[XTENTRYSTART]; | ||
2360 | bn = addressXAD(xad); | ||
2361 | XT_PUTPAGE(mp); | ||
2362 | |||
2363 | /* get new child page */ | ||
2364 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
2365 | if (rc) | ||
2366 | return rc; | ||
2367 | |||
2368 | BT_MARK_DIRTY(mp, ip); | ||
2369 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
2370 | tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); | ||
2371 | xtlck = (struct xtlock *) & tlck->lock; | ||
2372 | } | ||
2373 | } | ||
2374 | } else { | ||
2375 | /* if insert into middle, shift right remaining entries */ | ||
2376 | if (newindex < nextindex) | ||
2377 | memmove(&p->xad[newindex + 1], &p->xad[newindex], | ||
2378 | (nextindex - newindex) << L2XTSLOTSIZE); | ||
2379 | |||
2380 | /* insert the entry */ | ||
2381 | xad = &p->xad[newindex]; | ||
2382 | XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); | ||
2383 | |||
2384 | /* advance next available entry index. */ | ||
2385 | p->header.nextindex = | ||
2386 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
2387 | } | ||
2388 | |||
2389 | out: | ||
2390 | if (!test_cflag(COMMIT_Nolink, ip)) { | ||
2391 | xtlck->lwm.offset = (xtlck->lwm.offset) ? | ||
2392 | min(index0, (int)xtlck->lwm.offset) : index0; | ||
2393 | xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - | ||
2394 | xtlck->lwm.offset; | ||
2395 | } | ||
2396 | |||
2397 | /* unpin the leaf page */ | ||
2398 | XT_PUTPAGE(mp); | ||
2399 | |||
2400 | return rc; | ||
2401 | } | ||
2402 | |||
2403 | |||
2404 | /* | ||
2405 | * xtAppend() | ||
2406 | * | ||
2407 | * function: grow in append mode from contiguous region specified ; | ||
2408 | * | ||
2409 | * parameter: | ||
2410 | * tid - transaction id; | ||
2411 | * ip - file object; | ||
2412 | * xflag - extent flag (XAD_NEW is always or-ed in before insertion); | ||
2413 | * xoff - extent offset; | ||
2414 | * maxblocks - max extent length; | ||
2415 | * xlenp - extent length pointer (in/out); | ||
2416 | * xaddrp - extent address pointer (in/out): | ||
2417 | * flag - not referenced in this function | ||
2418 | * | ||
2419 | * return: 0 on success; -EEXIST if xtSearch() finds an existing entry at xoff; otherwise the error from xtSearch(), dbAllocBottomUp() or xtSplitUp(); | ||
2420 | */ | ||
2421 | int xtAppend(tid_t tid, /* transaction id */ | ||
2422 | struct inode *ip, int xflag, s64 xoff, s32 maxblocks, | ||
2423 | s32 * xlenp, /* (in/out) */ | ||
2424 | s64 * xaddrp, /* (in/out) */ | ||
2425 | int flag) | ||
2426 | { | ||
2427 | int rc = 0; | ||
2428 | struct metapage *mp; /* meta-page buffer */ | ||
2429 | xtpage_t *p; /* base B+-tree index page */ | ||
2430 | s64 bn, xaddr; | ||
2431 | int index, nextindex; | ||
2432 | struct btstack btstack; /* traverse stack */ | ||
2433 | struct xtsplit split; /* split information */ | ||
2434 | xad_t *xad; | ||
2435 | int cmp; | ||
2436 | struct tlock *tlck; | ||
2437 | struct xtlock *xtlck; | ||
2438 | int nsplit, nblocks, xlen; | ||
2439 | struct pxdlist pxdlist; | ||
2440 | pxd_t *pxd; | ||
2441 | |||
2442 | xaddr = *xaddrp; | ||
2443 | xlen = *xlenp; | ||
2444 | jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", | ||
2445 | (ulong) xoff, maxblocks, xlen, (ulong) xaddr); | ||
2446 | |||
2447 | /* | ||
2448 | * search for the entry location at which to insert: | ||
2449 | * | ||
2450 | * xtFastSearch() and xtSearch() both return (leaf page | ||
2451 | * pinned, index at which to insert). | ||
2452 | * n.b. xtSearch() may return index of maxentry of | ||
2453 | * the full page. | ||
2454 | */ | ||
2455 | if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) | ||
2456 | return rc; | ||
2457 | |||
2458 | /* retrieve search result */ | ||
2459 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
2460 | |||
2461 | if (cmp == 0) { | ||
2462 | rc = -EEXIST; | ||
2463 | goto out; | ||
2464 | } | ||
2465 | //insert: | ||
2466 | /* | ||
2467 | * insert entry for new extent | ||
2468 | */ | ||
2469 | xflag |= XAD_NEW; | ||
2470 | |||
2471 | /* | ||
2472 | * if the leaf page is full, split the page and | ||
2473 | * propagate up the router entry for the new page from split | ||
2474 | * | ||
2475 | * The xtSplitUp() will insert the entry and unpin the leaf page. | ||
2476 | */ | ||
2477 | nextindex = le16_to_cpu(p->header.nextindex); | ||
2478 | if (nextindex < le16_to_cpu(p->header.maxentry)) | ||
2479 | goto insertLeaf; | ||
2480 | |||
2481 | /* | ||
2482 | * allocate new index blocks to cover index page split(s); each one is taken from the front of the region, advancing xaddr and shrinking maxblocks | ||
2483 | */ | ||
2484 | nsplit = btstack.nsplit; | ||
2485 | split.pxdlist = &pxdlist; | ||
2486 | pxdlist.maxnpxd = pxdlist.npxd = 0; | ||
2487 | pxd = &pxdlist.pxd[0]; | ||
2488 | nblocks = JFS_SBI(ip->i_sb)->nbperpage; | ||
2489 | for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { | ||
2490 | if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { | ||
2491 | PXDaddress(pxd, xaddr); | ||
2492 | PXDlength(pxd, nblocks); | ||
2493 | |||
2494 | pxdlist.maxnpxd++; | ||
2495 | |||
2496 | continue; | ||
2497 | } | ||
2498 | |||
2499 | /* undo allocation -- NOTE(review): nothing is freed on this path; index blocks allocated by earlier iterations appear to stay allocated -- confirm */ | ||
2500 | |||
2501 | goto out; | ||
2502 | } | ||
2503 | |||
2504 | xlen = min(xlen, maxblocks); /* maxblocks was reduced by the index blocks taken above */ | ||
2505 | |||
2506 | /* | ||
2507 | * allocate data extent requested | ||
2508 | */ | ||
2509 | if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) | ||
2510 | goto out; | ||
2511 | |||
2512 | split.mp = mp; | ||
2513 | split.index = index; | ||
2514 | split.flag = xflag; | ||
2515 | split.off = xoff; | ||
2516 | split.len = xlen; | ||
2517 | split.addr = xaddr; | ||
2518 | if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { | ||
2519 | /* undo data extent allocation -- NOTE(review): this frees the caller's original *xaddrp / *xlenp, not the local xaddr / xlen that were just allocated -- confirm this is intended */ | ||
2520 | dbFree(ip, *xaddrp, (s64) * xlenp); | ||
2521 | |||
2522 | return rc; | ||
2523 | } | ||
2524 | |||
2525 | *xaddrp = xaddr; | ||
2526 | *xlenp = xlen; | ||
2527 | return 0; | ||
2528 | |||
2529 | /* | ||
2530 | * insert the new entry into the leaf page | ||
2531 | */ | ||
2532 | insertLeaf: | ||
2533 | /* | ||
2534 | * allocate data extent requested | ||
2535 | */ | ||
2536 | if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) | ||
2537 | goto out; | ||
2538 | |||
2539 | BT_MARK_DIRTY(mp, ip); | ||
2540 | /* | ||
2541 | * acquire a transaction lock on the leaf page; | ||
2542 | * | ||
2543 | * action: xad insertion/extension; NOTE(review): unlike xtExtend()/xtUpdate(), there is no COMMIT_Nolink test around txLock() here -- confirm | ||
2544 | */ | ||
2545 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); | ||
2546 | xtlck = (struct xtlock *) & tlck->lock; | ||
2547 | |||
2548 | /* insert the new entry: mark the entry NEW */ | ||
2549 | xad = &p->xad[index]; | ||
2550 | XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); | ||
2551 | |||
2552 | /* advance next available entry index */ | ||
2553 | p->header.nextindex = | ||
2554 | cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); | ||
2555 | |||
2556 | xtlck->lwm.offset = | ||
2557 | (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; | ||
2558 | xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - | ||
2559 | xtlck->lwm.offset; | ||
2560 | |||
2561 | *xaddrp = xaddr; | ||
2562 | *xlenp = xlen; | ||
2563 | |||
2564 | out: | ||
2565 | /* unpin the leaf page */ | ||
2566 | XT_PUTPAGE(mp); | ||
2567 | |||
2568 | return rc; | ||
2569 | } | ||
2570 | #ifdef _STILL_TO_PORT | ||
2571 | |||
2572 | /* - TBD for defragmentaion/reorganization - | ||
2573 | * | ||
2574 | * xtDelete() | ||
2575 | * | ||
2576 | * function: | ||
2577 | * delete the entry with the specified key. | ||
2578 | * | ||
2579 | * N.B.: whole extent of the entry is assumed to be deleted. | ||
2580 | * | ||
2581 | * parameter: | ||
2582 | * | ||
2583 | * return: | ||
2584 | * ENOENT: if the entry is not found. | ||
2585 | * | ||
2586 | * exception: | ||
2587 | */ | ||
2588 | int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) | ||
2589 | { | ||
2590 | int rc = 0; | ||
2591 | struct btstack btstack; | ||
2592 | int cmp; | ||
2593 | s64 bn; | ||
2594 | struct metapage *mp; | ||
2595 | xtpage_t *p; | ||
2596 | int index, nextindex; | ||
2597 | struct tlock *tlck; | ||
2598 | struct xtlock *xtlck; | ||
2599 | |||
2600 | /* | ||
2601 | * find the matching entry; xtSearch() pins the page | ||
2602 | */ | ||
2603 | if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0))) | ||
2604 | return rc; | ||
2605 | |||
2606 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
2607 | if (cmp) { | ||
2608 | /* unpin the leaf page */ | ||
2609 | XT_PUTPAGE(mp); | ||
2610 | return -ENOENT; | ||
2611 | } | ||
2612 | |||
2613 | /* | ||
2614 | * delete the entry from the leaf page | ||
2615 | */ | ||
2616 | nextindex = le16_to_cpu(p->header.nextindex); | ||
2617 | p->header.nextindex = | ||
2618 | cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1); | ||
2619 | |||
2620 | /* | ||
2621 | * if the leaf page bocome empty, free the page | ||
2622 | */ | ||
2623 | if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) | ||
2624 | return (xtDeleteUp(tid, ip, mp, p, &btstack)); | ||
2625 | |||
2626 | BT_MARK_DIRTY(mp, ip); | ||
2627 | /* | ||
2628 | * acquire a transaction lock on the leaf page; | ||
2629 | * | ||
2630 | * action:xad deletion; | ||
2631 | */ | ||
2632 | tlck = txLock(tid, ip, mp, tlckXTREE); | ||
2633 | xtlck = (struct xtlock *) & tlck->lock; | ||
2634 | xtlck->lwm.offset = | ||
2635 | (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; | ||
2636 | |||
2637 | /* if delete from middle, shift left/compact the remaining entries */ | ||
2638 | if (index < nextindex - 1) | ||
2639 | memmove(&p->xad[index], &p->xad[index + 1], | ||
2640 | (nextindex - index - 1) * sizeof(xad_t)); | ||
2641 | |||
2642 | XT_PUTPAGE(mp); | ||
2643 | |||
2644 | return 0; | ||
2645 | } | ||
2646 | |||
2647 | |||
2648 | /* - TBD for defragmentaion/reorganization - | ||
2649 | * | ||
2650 | * xtDeleteUp() | ||
2651 | * | ||
2652 | * function: | ||
2653 | * free empty pages as propagating deletion up the tree | ||
2654 | * | ||
2655 | * parameter: | ||
2656 | * | ||
2657 | * return: | ||
2658 | */ | ||
2659 | static int | ||
2660 | xtDeleteUp(tid_t tid, struct inode *ip, | ||
2661 | struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) | ||
2662 | { | ||
2663 | int rc = 0; | ||
2664 | struct metapage *mp; | ||
2665 | xtpage_t *p; | ||
2666 | int index, nextindex; | ||
2667 | s64 xaddr; | ||
2668 | int xlen; | ||
2669 | struct btframe *parent; | ||
2670 | struct tlock *tlck; | ||
2671 | struct xtlock *xtlck; | ||
2672 | |||
2673 | /* | ||
2674 | * keep root leaf page which has become empty | ||
2675 | */ | ||
2676 | if (fp->header.flag & BT_ROOT) { | ||
2677 | /* keep the root page */ | ||
2678 | fp->header.flag &= ~BT_INTERNAL; | ||
2679 | fp->header.flag |= BT_LEAF; | ||
2680 | fp->header.nextindex = cpu_to_le16(XTENTRYSTART); | ||
2681 | |||
2682 | /* XT_PUTPAGE(fmp); */ | ||
2683 | |||
2684 | return 0; | ||
2685 | } | ||
2686 | |||
2687 | /* | ||
2688 | * free non-root leaf page | ||
2689 | */ | ||
2690 | if ((rc = xtRelink(tid, ip, fp))) { | ||
2691 | XT_PUTPAGE(fmp); | ||
2692 | return rc; | ||
2693 | } | ||
2694 | |||
2695 | xaddr = addressPXD(&fp->header.self); | ||
2696 | xlen = lengthPXD(&fp->header.self); | ||
2697 | /* free the page extent */ | ||
2698 | dbFree(ip, xaddr, (s64) xlen); | ||
2699 | |||
2700 | /* free the buffer page */ | ||
2701 | discard_metapage(fmp); | ||
2702 | |||
2703 | /* | ||
2704 | * propagate page deletion up the index tree | ||
2705 | * | ||
2706 | * If the delete from the parent page makes it empty, | ||
2707 | * continue all the way up the tree. | ||
2708 | * stop if the root page is reached (which is never deleted) or | ||
2709 | * if the entry deletion does not empty the page. | ||
2710 | */ | ||
2711 | while ((parent = BT_POP(btstack)) != NULL) { | ||
2712 | /* get/pin the parent page <sp> */ | ||
2713 | XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); | ||
2714 | if (rc) | ||
2715 | return rc; | ||
2716 | |||
2717 | index = parent->index; | ||
2718 | |||
2719 | /* delete the entry for the freed child page from parent. | ||
2720 | */ | ||
2721 | nextindex = le16_to_cpu(p->header.nextindex); | ||
2722 | |||
2723 | /* | ||
2724 | * the parent has the single entry being deleted: | ||
2725 | * free the parent page which has become empty. | ||
2726 | */ | ||
2727 | if (nextindex == 1) { | ||
2728 | if (p->header.flag & BT_ROOT) { | ||
2729 | /* keep the root page */ | ||
2730 | p->header.flag &= ~BT_INTERNAL; | ||
2731 | p->header.flag |= BT_LEAF; | ||
2732 | p->header.nextindex = | ||
2733 | cpu_to_le16(XTENTRYSTART); | ||
2734 | |||
2735 | /* XT_PUTPAGE(mp); */ | ||
2736 | |||
2737 | break; | ||
2738 | } else { | ||
2739 | /* free the parent page */ | ||
2740 | if ((rc = xtRelink(tid, ip, p))) | ||
2741 | return rc; | ||
2742 | |||
2743 | xaddr = addressPXD(&p->header.self); | ||
2744 | /* free the page extent */ | ||
2745 | dbFree(ip, xaddr, | ||
2746 | (s64) JFS_SBI(ip->i_sb)->nbperpage); | ||
2747 | |||
2748 | /* unpin/free the buffer page */ | ||
2749 | discard_metapage(mp); | ||
2750 | |||
2751 | /* propagate up */ | ||
2752 | continue; | ||
2753 | } | ||
2754 | } | ||
2755 | /* | ||
2756 | * the parent has other entries remaining: | ||
2757 | * delete the router entry from the parent page. | ||
2758 | */ | ||
2759 | else { | ||
2760 | BT_MARK_DIRTY(mp, ip); | ||
2761 | /* | ||
2762 | * acquire a transaction lock on the leaf page; | ||
2763 | * | ||
2764 | * action:xad deletion; | ||
2765 | */ | ||
2766 | tlck = txLock(tid, ip, mp, tlckXTREE); | ||
2767 | xtlck = (struct xtlock *) & tlck->lock; | ||
2768 | xtlck->lwm.offset = | ||
2769 | (xtlck->lwm.offset) ? min(index, | ||
2770 | xtlck->lwm. | ||
2771 | offset) : index; | ||
2772 | |||
2773 | /* if delete from middle, | ||
2774 | * shift left/compact the remaining entries in the page | ||
2775 | */ | ||
2776 | if (index < nextindex - 1) | ||
2777 | memmove(&p->xad[index], &p->xad[index + 1], | ||
2778 | (nextindex - index - | ||
2779 | 1) << L2XTSLOTSIZE); | ||
2780 | |||
2781 | p->header.nextindex = | ||
2782 | cpu_to_le16(le16_to_cpu(p->header.nextindex) - | ||
2783 | 1); | ||
2784 | jfs_info("xtDeleteUp(entry): 0x%lx[%d]", | ||
2785 | (ulong) parent->bn, index); | ||
2786 | } | ||
2787 | |||
2788 | /* unpin the parent page */ | ||
2789 | XT_PUTPAGE(mp); | ||
2790 | |||
2791 | /* exit propagation up */ | ||
2792 | break; | ||
2793 | } | ||
2794 | |||
2795 | return 0; | ||
2796 | } | ||
2797 | |||
2798 | |||
2799 | /* | ||
2800 | * NAME: xtRelocate() | ||
2801 | * | ||
2802 | * FUNCTION: relocate xtpage or data extent of regular file; | ||
2803 | * This function is mainly used by defragfs utility. | ||
2804 | * | ||
2805 | * NOTE: This routine does not have the logic to handle | ||
2806 | * uncommitted allocated extent. The caller should call | ||
2807 | * txCommit() to commit all the allocation before call | ||
2808 | * this routine. | ||
2809 | */ | ||
2810 | int | ||
2811 | xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ | ||
2812 | s64 nxaddr, /* new xaddr */ | ||
2813 | int xtype) | ||
2814 | { /* extent type: XTPAGE or DATAEXT */ | ||
2815 | int rc = 0; | ||
2816 | struct tblock *tblk; | ||
2817 | struct tlock *tlck; | ||
2818 | struct xtlock *xtlck; | ||
2819 | struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ | ||
2820 | xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ | ||
2821 | xad_t *xad; | ||
2822 | pxd_t *pxd; | ||
2823 | s64 xoff, xsize; | ||
2824 | int xlen; | ||
2825 | s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; | ||
2826 | cbuf_t *cp; | ||
2827 | s64 offset, nbytes, nbrd, pno; | ||
2828 | int nb, npages, nblks; | ||
2829 | s64 bn; | ||
2830 | int cmp; | ||
2831 | int index; | ||
2832 | struct pxd_lock *pxdlock; | ||
2833 | struct btstack btstack; /* traverse stack */ | ||
2834 | |||
2835 | xtype = xtype & EXTENT_TYPE; | ||
2836 | |||
2837 | xoff = offsetXAD(oxad); | ||
2838 | oxaddr = addressXAD(oxad); | ||
2839 | xlen = lengthXAD(oxad); | ||
2840 | |||
2841 | /* validate extent offset */ | ||
2842 | offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; | ||
2843 | if (offset >= ip->i_size) | ||
2844 | return -ESTALE; /* stale extent */ | ||
2845 | |||
2846 | jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", | ||
2847 | xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); | ||
2848 | |||
2849 | /* | ||
2850 | * 1. get and validate the parent xtpage/xad entry | ||
2851 | * covering the source extent to be relocated; | ||
2852 | */ | ||
2853 | if (xtype == DATAEXT) { | ||
2854 | /* search in leaf entry */ | ||
2855 | rc = xtSearch(ip, xoff, &cmp, &btstack, 0); | ||
2856 | if (rc) | ||
2857 | return rc; | ||
2858 | |||
2859 | /* retrieve search result */ | ||
2860 | XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); | ||
2861 | |||
2862 | if (cmp) { | ||
2863 | XT_PUTPAGE(pmp); | ||
2864 | return -ESTALE; | ||
2865 | } | ||
2866 | |||
2867 | /* validate for exact match with a single entry */ | ||
2868 | xad = &pp->xad[index]; | ||
2869 | if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { | ||
2870 | XT_PUTPAGE(pmp); | ||
2871 | return -ESTALE; | ||
2872 | } | ||
2873 | } else { /* (xtype == XTPAGE) */ | ||
2874 | |||
2875 | /* search in internal entry */ | ||
2876 | rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); | ||
2877 | if (rc) | ||
2878 | return rc; | ||
2879 | |||
2880 | /* retrieve search result */ | ||
2881 | XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); | ||
2882 | |||
2883 | if (cmp) { | ||
2884 | XT_PUTPAGE(pmp); | ||
2885 | return -ESTALE; | ||
2886 | } | ||
2887 | |||
2888 | /* xtSearchNode() validated for exact match with a single entry | ||
2889 | */ | ||
2890 | xad = &pp->xad[index]; | ||
2891 | } | ||
2892 | jfs_info("xtRelocate: parent xad entry validated."); | ||
2893 | |||
2894 | /* | ||
2895 | * 2. relocate the extent | ||
2896 | */ | ||
2897 | if (xtype == DATAEXT) { | ||
2898 | /* if the extent is allocated-but-not-recorded | ||
2899 | * there is no real data to be moved in this extent, | ||
2900 | */ | ||
2901 | if (xad->flag & XAD_NOTRECORDED) | ||
2902 | goto out; | ||
2903 | else | ||
2904 | /* release xtpage for cmRead()/xtLookup() */ | ||
2905 | XT_PUTPAGE(pmp); | ||
2906 | |||
2907 | /* | ||
2908 | * cmRelocate() | ||
2909 | * | ||
2910 | * copy target data pages to be relocated; | ||
2911 | * | ||
2912 | * data extent must start at page boundary and | ||
2913 | * multiple of page size (except the last data extent); | ||
2914 | * read in each page of the source data extent into cbuf, | ||
2915 | * update the cbuf extent descriptor of the page to be | ||
2916 | * homeward bound to new dst data extent | ||
2917 | * copy the data from the old extent to new extent. | ||
2918 | * copy is essential for compressed files to avoid problems | ||
2919 | * that can arise if there was a change in compression | ||
2920 | * algorithms. | ||
2921 | * it is a good strategy because it may disrupt cache | ||
2922 | * policy to keep the pages in memory afterwards. | ||
2923 | */ | ||
2924 | offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; | ||
2925 | assert((offset & CM_OFFSET) == 0); | ||
2926 | nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; | ||
2927 | pno = offset >> CM_L2BSIZE; | ||
2928 | npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; | ||
2929 | /* | ||
2930 | npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - | ||
2931 | (offset >> CM_L2BSIZE) + 1; | ||
2932 | */ | ||
2933 | sxaddr = oxaddr; | ||
2934 | dxaddr = nxaddr; | ||
2935 | |||
2936 | /* process the request one cache buffer at a time */ | ||
2937 | for (nbrd = 0; nbrd < nbytes; nbrd += nb, | ||
2938 | offset += nb, pno++, npages--) { | ||
2939 | /* compute page size */ | ||
2940 | nb = min(nbytes - nbrd, CM_BSIZE); | ||
2941 | |||
2942 | /* get the cache buffer of the page */ | ||
2943 | if (rc = cmRead(ip, offset, npages, &cp)) | ||
2944 | break; | ||
2945 | |||
2946 | assert(addressPXD(&cp->cm_pxd) == sxaddr); | ||
2947 | assert(!cp->cm_modified); | ||
2948 | |||
2949 | /* bind buffer with the new extent address */ | ||
2950 | nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; | ||
2951 | cmSetXD(ip, cp, pno, dxaddr, nblks); | ||
2952 | |||
2953 | /* release the cbuf, mark it as modified */ | ||
2954 | cmPut(cp, TRUE); | ||
2955 | |||
2956 | dxaddr += nblks; | ||
2957 | sxaddr += nblks; | ||
2958 | } | ||
2959 | |||
2960 | /* get back parent page */ | ||
2961 | if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0))) | ||
2962 | return rc; | ||
2963 | |||
2964 | XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); | ||
2965 | jfs_info("xtRelocate: target data extent relocated."); | ||
2966 | } else { /* (xtype == XTPAGE) */ | ||
2967 | |||
2968 | /* | ||
2969 | * read in the target xtpage from the source extent; | ||
2970 | */ | ||
2971 | XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); | ||
2972 | if (rc) { | ||
2973 | XT_PUTPAGE(pmp); | ||
2974 | return rc; | ||
2975 | } | ||
2976 | |||
2977 | /* | ||
2978 | * read in sibling pages if any to update sibling pointers; | ||
2979 | */ | ||
2980 | rmp = NULL; | ||
2981 | if (p->header.next) { | ||
2982 | nextbn = le64_to_cpu(p->header.next); | ||
2983 | XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); | ||
2984 | if (rc) { | ||
2985 | XT_PUTPAGE(pmp); | ||
2986 | XT_PUTPAGE(mp); | ||
2987 | return (rc); | ||
2988 | } | ||
2989 | } | ||
2990 | |||
2991 | lmp = NULL; | ||
2992 | if (p->header.prev) { | ||
2993 | prevbn = le64_to_cpu(p->header.prev); | ||
2994 | XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); | ||
2995 | if (rc) { | ||
2996 | XT_PUTPAGE(pmp); | ||
2997 | XT_PUTPAGE(mp); | ||
2998 | if (rmp) | ||
2999 | XT_PUTPAGE(rmp); | ||
3000 | return (rc); | ||
3001 | } | ||
3002 | } | ||
3003 | |||
3004 | /* at this point, all xtpages to be updated are in memory */ | ||
3005 | |||
3006 | /* | ||
3007 | * update sibling pointers of sibling xtpages if any; | ||
3008 | */ | ||
3009 | if (lmp) { | ||
3010 | BT_MARK_DIRTY(lmp, ip); | ||
3011 | tlck = | ||
3012 | txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); | ||
3013 | lp->header.next = cpu_to_le64(nxaddr); | ||
3014 | XT_PUTPAGE(lmp); | ||
3015 | } | ||
3016 | |||
3017 | if (rmp) { | ||
3018 | BT_MARK_DIRTY(rmp, ip); | ||
3019 | tlck = | ||
3020 | txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); | ||
3021 | rp->header.prev = cpu_to_le64(nxaddr); | ||
3022 | XT_PUTPAGE(rmp); | ||
3023 | } | ||
3024 | |||
3025 | /* | ||
3026 | * update the target xtpage to be relocated | ||
3027 | * | ||
3028 | * update the self address of the target page | ||
3029 | * and write to destination extent; | ||
3030 | * redo image covers the whole xtpage since it is new page | ||
3031 | * to the destination extent; | ||
3032 | * update of bmap for the free of source extent | ||
3033 | * of the target xtpage itself: | ||
3034 | * update of bmap for the allocation of destination extent | ||
3035 | * of the target xtpage itself: | ||
3036 | * update of bmap for the extents covered by xad entries in | ||
3037 | * the target xtpage is not necessary since they are not | ||
3038 | * updated; | ||
3039 | * if not committed before this relocation, | ||
3040 | * target page may contain XAD_NEW entries which must | ||
3041 | * be scanned for bmap update (logredo() always | ||
3042 | * scan xtpage REDOPAGE image for bmap update); | ||
3043 | * if committed before this relocation (tlckRELOCATE), | ||
3044 | * scan may be skipped by commit() and logredo(); | ||
3045 | */ | ||
3046 | BT_MARK_DIRTY(mp, ip); | ||
3047 | /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ | ||
3048 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); | ||
3049 | xtlck = (struct xtlock *) & tlck->lock; | ||
3050 | |||
3051 | /* update the self address in the xtpage header */ | ||
3052 | pxd = &p->header.self; | ||
3053 | PXDaddress(pxd, nxaddr); | ||
3054 | |||
3055 | /* linelock for the after image of the whole page */ | ||
3056 | xtlck->lwm.length = | ||
3057 | le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; | ||
3058 | |||
3059 | /* update the buffer extent descriptor of target xtpage */ | ||
3060 | xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; | ||
3061 | bmSetXD(mp, nxaddr, xsize); | ||
3062 | |||
3063 | /* unpin the target page to new homeward bound */ | ||
3064 | XT_PUTPAGE(mp); | ||
3065 | jfs_info("xtRelocate: target xtpage relocated."); | ||
3066 | } | ||
3067 | |||
3068 | /* | ||
3069 | * 3. acquire maplock for the source extent to be freed; | ||
3070 | * | ||
3071 | * acquire a maplock saving the src relocated extent address; | ||
3072 | * to free of the extent at commit time; | ||
3073 | */ | ||
3074 | out: | ||
3075 | /* if DATAEXT relocation, write a LOG_UPDATEMAP record for | ||
3076 | * free PXD of the source data extent (logredo() will update | ||
3077 | * bmap for free of source data extent), and update bmap for | ||
3078 | * free of the source data extent; | ||
3079 | */ | ||
3080 | if (xtype == DATAEXT) | ||
3081 | tlck = txMaplock(tid, ip, tlckMAP); | ||
3082 | /* if XTPAGE relocation, write a LOG_NOREDOPAGE record | ||
3083 | * for the source xtpage (logredo() will init NoRedoPage | ||
3084 | * filter and will also update bmap for free of the source | ||
3085 | * xtpage), and update bmap for free of the source xtpage; | ||
3086 | * N.B. We use tlckMAP instead of tlkcXTREE because there | ||
3087 | * is no buffer associated with this lock since the buffer | ||
3088 | * has been redirected to the target location. | ||
3089 | */ | ||
3090 | else /* (xtype == XTPAGE) */ | ||
3091 | tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); | ||
3092 | |||
3093 | pxdlock = (struct pxd_lock *) & tlck->lock; | ||
3094 | pxdlock->flag = mlckFREEPXD; | ||
3095 | PXDaddress(&pxdlock->pxd, oxaddr); | ||
3096 | PXDlength(&pxdlock->pxd, xlen); | ||
3097 | pxdlock->index = 1; | ||
3098 | |||
3099 | /* | ||
3100 | * 4. update the parent xad entry for relocation; | ||
3101 | * | ||
3102 | * acquire tlck for the parent entry with XAD_NEW as entry | ||
3103 | * update which will write LOG_REDOPAGE and update bmap for | ||
3104 | * allocation of XAD_NEW destination extent; | ||
3105 | */ | ||
3106 | jfs_info("xtRelocate: update parent xad entry."); | ||
3107 | BT_MARK_DIRTY(pmp, ip); | ||
3108 | tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); | ||
3109 | xtlck = (struct xtlock *) & tlck->lock; | ||
3110 | |||
3111 | /* update the XAD with the new destination extent; */ | ||
3112 | xad = &pp->xad[index]; | ||
3113 | xad->flag |= XAD_NEW; | ||
3114 | XADaddress(xad, nxaddr); | ||
3115 | |||
3116 | xtlck->lwm.offset = min(index, xtlck->lwm.offset); | ||
3117 | xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - | ||
3118 | xtlck->lwm.offset; | ||
3119 | |||
3120 | /* unpin the parent xtpage */ | ||
3121 | XT_PUTPAGE(pmp); | ||
3122 | |||
3123 | return rc; | ||
3124 | } | ||
3125 | |||
3126 | |||
3127 | /* | ||
3128 | * xtSearchNode() | ||
3129 | * | ||
3130 | * function: search for the internal xad entry covering specified extent. | ||
3131 | * This function is mainly used by defragfs utility. | ||
3132 | * | ||
3133 | * parameters: | ||
3134 | * ip - file object; | ||
3135 | * xad - extent to find; | ||
3136 | * cmpp - comparison result: | ||
3137 | * btstack - traverse stack; | ||
3138 | * flag - search process flag; | ||
3139 | * | ||
3140 | * returns: | ||
3141 | * btstack contains (bn, index) of search path traversed to the entry. | ||
3142 | * *cmpp is set to result of comparison with the entry returned. | ||
3143 | * the page containing the entry is pinned at exit. | ||
3144 | */ | ||
3145 | static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ | ||
3146 | int *cmpp, struct btstack * btstack, int flag) | ||
3147 | { | ||
3148 | int rc = 0; | ||
3149 | s64 xoff, xaddr; | ||
3150 | int xlen; | ||
3151 | int cmp = 1; /* init for empty page */ | ||
3152 | s64 bn; /* block number */ | ||
3153 | struct metapage *mp; /* meta-page buffer */ | ||
3154 | xtpage_t *p; /* page */ | ||
3155 | int base, index, lim; | ||
3156 | struct btframe *btsp; | ||
3157 | s64 t64; | ||
3158 | |||
3159 | BT_CLR(btstack); | ||
3160 | |||
3161 | xoff = offsetXAD(xad); | ||
3162 | xlen = lengthXAD(xad); | ||
3163 | xaddr = addressXAD(xad); | ||
3164 | |||
3165 | /* | ||
3166 | * search down tree from root: | ||
3167 | * | ||
3168 | * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of | ||
3169 | * internal page, child page Pi contains entry with k, Ki <= K < Kj. | ||
3170 | * | ||
3171 | * if entry with search key K is not found | ||
3172 | * internal page search find the entry with largest key Ki | ||
3173 | * less than K which point to the child page to search; | ||
3174 | * leaf page search find the entry with smallest key Kj | ||
3175 | * greater than K so that the returned index is the position of | ||
3176 | * the entry to be shifted right for insertion of new entry. | ||
3177 | * for empty tree, search key is greater than any key of the tree. | ||
3178 | * | ||
3179 | * by convention, root bn = 0. | ||
3180 | */ | ||
3181 | for (bn = 0;;) { | ||
3182 | /* get/pin the page to search */ | ||
3183 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3184 | if (rc) | ||
3185 | return rc; | ||
3186 | if (p->header.flag & BT_LEAF) { | ||
3187 | XT_PUTPAGE(mp); | ||
3188 | return -ESTALE; | ||
3189 | } | ||
3190 | |||
3191 | lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; | ||
3192 | |||
3193 | /* | ||
3194 | * binary search with search key K on the current page | ||
3195 | */ | ||
3196 | for (base = XTENTRYSTART; lim; lim >>= 1) { | ||
3197 | index = base + (lim >> 1); | ||
3198 | |||
3199 | XT_CMP(cmp, xoff, &p->xad[index], t64); | ||
3200 | if (cmp == 0) { | ||
3201 | /* | ||
3202 | * search hit | ||
3203 | * | ||
3204 | * verify for exact match; | ||
3205 | */ | ||
3206 | if (xaddr == addressXAD(&p->xad[index]) && | ||
3207 | xoff == offsetXAD(&p->xad[index])) { | ||
3208 | *cmpp = cmp; | ||
3209 | |||
3210 | /* save search result */ | ||
3211 | btsp = btstack->top; | ||
3212 | btsp->bn = bn; | ||
3213 | btsp->index = index; | ||
3214 | btsp->mp = mp; | ||
3215 | |||
3216 | return 0; | ||
3217 | } | ||
3218 | |||
3219 | /* descend/search its child page */ | ||
3220 | goto next; | ||
3221 | } | ||
3222 | |||
3223 | if (cmp > 0) { | ||
3224 | base = index + 1; | ||
3225 | --lim; | ||
3226 | } | ||
3227 | } | ||
3228 | |||
3229 | /* | ||
3230 | * search miss - non-leaf page: | ||
3231 | * | ||
3232 | * base is the smallest index with key (Kj) greater than | ||
3233 | * search key (K) and may be zero or maxentry index. | ||
3234 | * if base is non-zero, decrement base by one to get the parent | ||
3235 | * entry of the child page to search. | ||
3236 | */ | ||
3237 | index = base ? base - 1 : base; | ||
3238 | |||
3239 | /* | ||
3240 | * go down to child page | ||
3241 | */ | ||
3242 | next: | ||
3243 | /* get the child page block number */ | ||
3244 | bn = addressXAD(&p->xad[index]); | ||
3245 | |||
3246 | /* unpin the parent page */ | ||
3247 | XT_PUTPAGE(mp); | ||
3248 | } | ||
3249 | } | ||
3250 | |||
3251 | |||
3252 | /* | ||
3253 | * xtRelink() | ||
3254 | * | ||
3255 | * function: | ||
3256 | * link around a freed page. | ||
3257 | * | ||
3258 | * Parameter: | ||
3259 | * int tid, | ||
3260 | * struct inode *ip, | ||
3261 | * xtpage_t *p) | ||
3262 | * | ||
3263 | * returns: | ||
3264 | */ | ||
3265 | static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) | ||
3266 | { | ||
3267 | int rc = 0; | ||
3268 | struct metapage *mp; | ||
3269 | s64 nextbn, prevbn; | ||
3270 | struct tlock *tlck; | ||
3271 | |||
3272 | nextbn = le64_to_cpu(p->header.next); | ||
3273 | prevbn = le64_to_cpu(p->header.prev); | ||
3274 | |||
3275 | /* update prev pointer of the next page */ | ||
3276 | if (nextbn != 0) { | ||
3277 | XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); | ||
3278 | if (rc) | ||
3279 | return rc; | ||
3280 | |||
3281 | /* | ||
3282 | * acquire a transaction lock on the page; | ||
3283 | * | ||
3284 | * action: update prev pointer; | ||
3285 | */ | ||
3286 | BT_MARK_DIRTY(mp, ip); | ||
3287 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); | ||
3288 | |||
3289 | /* the page may already have been tlock'd */ | ||
3290 | |||
3291 | p->header.prev = cpu_to_le64(prevbn); | ||
3292 | |||
3293 | XT_PUTPAGE(mp); | ||
3294 | } | ||
3295 | |||
3296 | /* update next pointer of the previous page */ | ||
3297 | if (prevbn != 0) { | ||
3298 | XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); | ||
3299 | if (rc) | ||
3300 | return rc; | ||
3301 | |||
3302 | /* | ||
3303 | * acquire a transaction lock on the page; | ||
3304 | * | ||
3305 | * action: update next pointer; | ||
3306 | */ | ||
3307 | BT_MARK_DIRTY(mp, ip); | ||
3308 | tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); | ||
3309 | |||
3310 | /* the page may already have been tlock'd */ | ||
3311 | |||
3312 | p->header.next = le64_to_cpu(nextbn); | ||
3313 | |||
3314 | XT_PUTPAGE(mp); | ||
3315 | } | ||
3316 | |||
3317 | return 0; | ||
3318 | } | ||
3319 | #endif /* _STILL_TO_PORT */ | ||
3320 | |||
3321 | |||
3322 | /* | ||
3323 | * xtInitRoot() | ||
3324 | * | ||
3325 | * initialize file root (inline in inode) | ||
3326 | */ | ||
3327 | void xtInitRoot(tid_t tid, struct inode *ip) | ||
3328 | { | ||
3329 | xtpage_t *p; | ||
3330 | |||
3331 | /* | ||
3332 | * acquire a transaction lock on the root | ||
3333 | * | ||
3334 | * action: | ||
3335 | */ | ||
3336 | txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, | ||
3337 | tlckXTREE | tlckNEW); | ||
3338 | p = &JFS_IP(ip)->i_xtroot; | ||
3339 | |||
3340 | p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; | ||
3341 | p->header.nextindex = cpu_to_le16(XTENTRYSTART); | ||
3342 | |||
3343 | if (S_ISDIR(ip->i_mode)) | ||
3344 | p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); | ||
3345 | else { | ||
3346 | p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); | ||
3347 | ip->i_size = 0; | ||
3348 | } | ||
3349 | |||
3350 | |||
3351 | return; | ||
3352 | } | ||
3353 | |||
3354 | |||
3355 | /* | ||
3356 | * We can run into a deadlock truncating a file with a large number of | ||
3357 | * xtree pages (large fragmented file). A robust fix would entail a | ||
3358 | * reservation system where we would reserve a number of metadata pages | ||
3359 | * and tlocks which we would be guaranteed without a deadlock. Without | ||
3360 | * this, a partial fix is to limit number of metadata pages we will lock | ||
3361 | * in a single transaction. Currently we will truncate the file so that | ||
3362 | * no more than 50 leaf pages will be locked. The caller of xtTruncate | ||
3363 | * will be responsible for ensuring that the current transaction gets | ||
3364 | * committed, and that subsequent transactions are created to truncate | ||
3365 | * the file further if needed. | ||
3366 | */ | ||
3367 | #define MAX_TRUNCATE_LEAVES 50 | ||
3368 | |||
3369 | /* | ||
3370 | * xtTruncate() | ||
3371 | * | ||
3372 | * function: | ||
3373 | * traverse for truncation logging backward bottom up; | ||
3374 | * terminate at the last extent entry at the current subtree | ||
3375 | * root page covering new down size. | ||
3376 | * truncation may occur within the last extent entry. | ||
3377 | * | ||
3378 | * parameter: | ||
3379 | * int tid, | ||
3380 | * struct inode *ip, | ||
3381 | * s64 newsize, | ||
3382 | * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} | ||
3383 | * | ||
3384 | * return: | ||
3385 | * | ||
3386 | * note: | ||
3387 | * PWMAP: | ||
3388 | * 1. truncate (non-COMMIT_NOLINK file) | ||
3389 | * by jfs_truncate() or jfs_open(O_TRUNC): | ||
3390 | * xtree is updated; | ||
3391 | * 2. truncate index table of directory when last entry removed | ||
3392 | * map update via tlock at commit time; | ||
3393 | * PMAP: | ||
3394 | * Call xtTruncate_pmap instead | ||
3395 | * WMAP: | ||
3396 | * 1. remove (free zero link count) on last reference release | ||
3397 | * (pmap has been freed at commit zero link count); | ||
3398 | * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): | ||
3399 | * xtree is updated; | ||
3400 | * map update directly at truncation time; | ||
3401 | * | ||
3402 | * if (DELETE) | ||
3403 | * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); | ||
3404 | * else if (TRUNCATE) | ||
3405 | * must write LOG_NOREDOPAGE for deleted index page; | ||
3406 | * | ||
3407 | * pages may already have been tlocked by anonymous transactions | ||
3408 | * during file growth (i.e., write) before truncation; | ||
3409 | * | ||
3410 | * except last truncated entry, deleted entries remains as is | ||
3411 | * in the page (nextindex is updated) for other use | ||
3412 | * (e.g., log/update allocation map): this avoid copying the page | ||
3413 | * info but delay free of pages; | ||
3414 | * | ||
3415 | */ | ||
3416 | s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) | ||
3417 | { | ||
3418 | int rc = 0; | ||
3419 | s64 teof; | ||
3420 | struct metapage *mp; | ||
3421 | xtpage_t *p; | ||
3422 | s64 bn; | ||
3423 | int index, nextindex; | ||
3424 | xad_t *xad; | ||
3425 | s64 xoff, xaddr; | ||
3426 | int xlen, len, freexlen; | ||
3427 | struct btstack btstack; | ||
3428 | struct btframe *parent; | ||
3429 | struct tblock *tblk = NULL; | ||
3430 | struct tlock *tlck = NULL; | ||
3431 | struct xtlock *xtlck = NULL; | ||
3432 | struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ | ||
3433 | struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ | ||
3434 | s64 nfreed; | ||
3435 | int freed, log; | ||
3436 | int locked_leaves = 0; | ||
3437 | |||
3438 | /* save object truncation type in the transaction block (skipped when tid is 0) */ | ||
3439 | if (tid) { | ||
3440 | tblk = tid_to_tblock(tid); | ||
3441 | tblk->xflag |= flag; | ||
3442 | } | ||
3443 | |||
3444 | nfreed = 0; | ||
3445 | |||
3446 | flag &= COMMIT_MAP; | ||
3447 | assert(flag != COMMIT_PMAP); | ||
3448 | |||
3449 | if (flag == COMMIT_PWMAP) | ||
3450 | log = 1; | ||
3451 | else { | ||
3452 | log = 0; | ||
3453 | xadlock.flag = mlckFREEXADLIST; | ||
3454 | xadlock.index = 1; | ||
3455 | } | ||
3456 | |||
3457 | /* | ||
3458 | * if the newsize is not an integral number of pages, | ||
3459 | * the file between newsize and next page boundary will | ||
3460 | * be cleared. | ||
3461 | * if truncating into a file hole, it will cause | ||
3462 | * a full block to be allocated for the logical block. | ||
3463 | */ | ||
3464 | |||
3465 | /* | ||
3466 | * release page blocks of truncated region <teof, eof> | ||
3467 | * | ||
3468 | * free the data blocks from the leaf index blocks. | ||
3469 | * delete the parent index entries corresponding to | ||
3470 | * the freed child data/index blocks. | ||
3471 | * free the index blocks themselves which aren't needed | ||
3472 | * in new sized file. | ||
3473 | * | ||
3474 | * index blocks are updated only if the blocks are to be | ||
3475 | * retained in the new sized file. | ||
3476 | * if type is PMAP, the data and index pages are NOT | ||
3477 | * freed, and the data and index blocks are NOT freed | ||
3478 | * from working map. | ||
3479 | * (this will allow continued access of data/index of | ||
3480 | * temporary file (zerolink count file truncated to zero-length)). | ||
3481 | */ | ||
3482 | teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> | ||
3483 | JFS_SBI(ip->i_sb)->l2bsize; | ||
3484 | |||
3485 | /* clear stack */ | ||
3486 | BT_CLR(&btstack); | ||
3487 | |||
3488 | /* | ||
3489 | * start with root | ||
3490 | * | ||
3491 | * root resides in the inode | ||
3492 | */ | ||
3493 | bn = 0; | ||
3494 | |||
3495 | /* | ||
3496 | * first access of each page: | ||
3497 | */ | ||
3498 | getPage: | ||
3499 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3500 | if (rc) | ||
3501 | return rc; | ||
3502 | |||
3503 | /* process entries backward from last index */ | ||
3504 | index = le16_to_cpu(p->header.nextindex) - 1; | ||
3505 | |||
3506 | if (p->header.flag & BT_INTERNAL) | ||
3507 | goto getChild; | ||
3508 | |||
3509 | /* | ||
3510 | * leaf page | ||
3511 | */ | ||
3512 | |||
3513 | /* Since this is the rightmost leaf, and we may have already freed | ||
3514 | * a page that was formerly to the right, let's make sure that the | ||
3515 | * next pointer is zero. | ||
3516 | */ | ||
3517 | if (p->header.next) { | ||
3518 | if (log) | ||
3519 | /* | ||
3520 | * Make sure this change to the header is logged. | ||
3521 | * If we really truncate this leaf, the flag | ||
3522 | * will be changed to tlckTRUNCATE | ||
3523 | */ | ||
3524 | tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); | ||
3525 | BT_MARK_DIRTY(mp, ip); | ||
3526 | p->header.next = 0; | ||
3527 | } | ||
3528 | |||
3529 | freed = 0; | ||
3530 | |||
3531 | /* does the region covered by this leaf page end at or before teof ? */ | ||
3532 | xad = &p->xad[index]; | ||
3533 | xoff = offsetXAD(xad); | ||
3534 | xlen = lengthXAD(xad); | ||
3535 | if (teof >= xoff + xlen) { | ||
3536 | XT_PUTPAGE(mp); | ||
3537 | goto getParent; | ||
3538 | } | ||
3539 | |||
3540 | /* (re)acquire tlock of the leaf page */ | ||
3541 | if (log) { | ||
3542 | if (++locked_leaves > MAX_TRUNCATE_LEAVES) { | ||
3543 | /* | ||
3544 | * We need to limit the size of the transaction | ||
3545 | * to avoid exhausting pagecache & tlocks: stop at this leaf and report the end of its last extent as the size reached in this pass | ||
3546 | */ | ||
3547 | XT_PUTPAGE(mp); | ||
3548 | newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; | ||
3549 | goto getParent; | ||
3550 | } | ||
3551 | tlck = txLock(tid, ip, mp, tlckXTREE); | ||
3552 | tlck->type = tlckXTREE | tlckTRUNCATE; | ||
3553 | xtlck = (struct xtlock *) & tlck->lock; | ||
3554 | xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; | ||
3555 | } | ||
3556 | BT_MARK_DIRTY(mp, ip); | ||
3557 | |||
3558 | /* | ||
3559 | * scan backward leaf page entries | ||
3560 | */ | ||
3561 | for (; index >= XTENTRYSTART; index--) { | ||
3562 | xad = &p->xad[index]; | ||
3563 | xoff = offsetXAD(xad); | ||
3564 | xlen = lengthXAD(xad); | ||
3565 | xaddr = addressXAD(xad); | ||
3566 | |||
3567 | /* | ||
3568 | * The "data" for a directory is indexed by the block | ||
3569 | * device's address space. This metadata must be invalidated | ||
3570 | * here | ||
3571 | */ | ||
3572 | if (S_ISDIR(ip->i_mode) && (teof == 0)) | ||
3573 | invalidate_xad_metapages(ip, *xad); | ||
3574 | /* | ||
3575 | * entry beyond eof: continue scan of current page | ||
3576 | * xad | ||
3577 | * ---|---=======-------> | ||
3578 | * eof | ||
3579 | */ | ||
3580 | if (teof < xoff) { | ||
3581 | nfreed += xlen; | ||
3582 | continue; | ||
3583 | } | ||
3584 | |||
3585 | /* | ||
3586 | * (xoff <= teof): last entry to be deleted from page; | ||
3587 | * If other entries remain in page: keep and update the page. | ||
3588 | */ | ||
3589 | |||
3590 | /* | ||
3591 | * eof == entry_start: delete the entry | ||
3592 | * xad | ||
3593 | * -------|=======-------> | ||
3594 | * eof | ||
3595 | * | ||
3596 | */ | ||
3597 | if (teof == xoff) { | ||
3598 | nfreed += xlen; | ||
3599 | |||
3600 | if (index == XTENTRYSTART) | ||
3601 | break; | ||
3602 | |||
3603 | nextindex = index; | ||
3604 | } | ||
3605 | /* | ||
3606 | * eof within the entry: truncate the entry. | ||
3607 | * xad | ||
3608 | * -------===|===-------> | ||
3609 | * eof | ||
3610 | */ | ||
3611 | else if (teof < xoff + xlen) { | ||
3612 | /* update truncated entry */ | ||
3613 | len = teof - xoff; | ||
3614 | freexlen = xlen - len; | ||
3615 | XADlength(xad, len); | ||
3616 | |||
3617 | /* save pxd of truncated extent in tlck */ | ||
3618 | xaddr += len; | ||
3619 | if (log) { /* COMMIT_PWMAP */ | ||
3620 | xtlck->lwm.offset = (xtlck->lwm.offset) ? | ||
3621 | min(index, (int)xtlck->lwm.offset) : index; | ||
3622 | xtlck->lwm.length = index + 1 - | ||
3623 | xtlck->lwm.offset; | ||
3624 | xtlck->twm.offset = index; | ||
3625 | pxdlock = (struct pxd_lock *) & xtlck->pxdlock; | ||
3626 | pxdlock->flag = mlckFREEPXD; | ||
3627 | PXDaddress(&pxdlock->pxd, xaddr); | ||
3628 | PXDlength(&pxdlock->pxd, freexlen); | ||
3629 | } | ||
3630 | /* free truncated extent */ | ||
3631 | else { /* COMMIT_WMAP */ | ||
3632 | |||
3633 | pxdlock = (struct pxd_lock *) & xadlock; | ||
3634 | pxdlock->flag = mlckFREEPXD; | ||
3635 | PXDaddress(&pxdlock->pxd, xaddr); | ||
3636 | PXDlength(&pxdlock->pxd, freexlen); | ||
3637 | txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); | ||
3638 | |||
3639 | /* reset map lock */ | ||
3640 | xadlock.flag = mlckFREEXADLIST; | ||
3641 | } | ||
3642 | |||
3643 | /* current entry is new last entry; */ | ||
3644 | nextindex = index + 1; | ||
3645 | |||
3646 | nfreed += freexlen; | ||
3647 | } | ||
3648 | /* | ||
3649 | * eof beyond the entry: | ||
3650 | * xad | ||
3651 | * -------=======---|---> | ||
3652 | * eof | ||
3653 | */ | ||
3654 | else { /* (xoff + xlen < teof) */ | ||
3655 | |||
3656 | nextindex = index + 1; | ||
3657 | } | ||
3658 | |||
3659 | if (nextindex < le16_to_cpu(p->header.nextindex)) { | ||
3660 | if (!log) { /* COMMIT_WMAP */ | ||
3661 | xadlock.xdlist = &p->xad[nextindex]; | ||
3662 | xadlock.count = | ||
3663 | le16_to_cpu(p->header.nextindex) - | ||
3664 | nextindex; | ||
3665 | txFreeMap(ip, (struct maplock *) & xadlock, | ||
3666 | NULL, COMMIT_WMAP); | ||
3667 | } | ||
3668 | p->header.nextindex = cpu_to_le16(nextindex); | ||
3669 | } | ||
3670 | |||
3671 | XT_PUTPAGE(mp); | ||
3672 | |||
3673 | /* assert(freed == 0); */ | ||
3674 | goto getParent; | ||
3675 | } /* end scan of leaf page entries */ | ||
3676 | |||
3677 | freed = 1; | ||
3678 | |||
3679 | /* | ||
3680 | * leaf page become empty: free the page if type != PMAP | ||
3681 | */ | ||
3682 | if (log) { /* COMMIT_PWMAP */ | ||
3683 | /* txCommit() with tlckFREE: | ||
3684 | * free data extents covered by leaf [XTENTRYSTART:hwm); | ||
3685 | * invalidate leaf if COMMIT_PWMAP; | ||
3686 | * if (TRUNCATE), will write LOG_NOREDOPAGE; | ||
3687 | */ | ||
3688 | tlck->type = tlckXTREE | tlckFREE; | ||
3689 | } else { /* COMMIT_WMAP */ | ||
3690 | |||
3691 | /* free data extents covered by leaf */ | ||
3692 | xadlock.xdlist = &p->xad[XTENTRYSTART]; | ||
3693 | xadlock.count = | ||
3694 | le16_to_cpu(p->header.nextindex) - XTENTRYSTART; | ||
3695 | txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); | ||
3696 | } | ||
3697 | |||
3698 | if (p->header.flag & BT_ROOT) { | ||
3699 | p->header.flag &= ~BT_INTERNAL; | ||
3700 | p->header.flag |= BT_LEAF; | ||
3701 | p->header.nextindex = cpu_to_le16(XTENTRYSTART); | ||
3702 | |||
3703 | XT_PUTPAGE(mp); /* debug */ | ||
3704 | goto out; | ||
3705 | } else { | ||
3706 | if (log) { /* COMMIT_PWMAP */ | ||
3707 | /* page will be invalidated at tx completion | ||
3708 | */ | ||
3709 | XT_PUTPAGE(mp); | ||
3710 | } else { /* COMMIT_WMAP */ | ||
3711 | |||
3712 | if (mp->lid) | ||
3713 | lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; | ||
3714 | |||
3715 | /* invalidate empty leaf page */ | ||
3716 | discard_metapage(mp); | ||
3717 | } | ||
3718 | } | ||
3719 | |||
3720 | /* | ||
3721 | * the leaf page become empty: delete the parent entry | ||
3722 | * for the leaf page if the parent page is to be kept | ||
3723 | * in the new sized file. | ||
3724 | */ | ||
3725 | |||
3726 | /* | ||
3727 | * go back up to the parent page | ||
3728 | */ | ||
3729 | getParent: | ||
3730 | /* pop/restore parent entry for the current child page */ | ||
3731 | if ((parent = BT_POP(&btstack)) == NULL) | ||
3732 | /* current page must have been root */ | ||
3733 | goto out; | ||
3734 | |||
3735 | /* get back the parent page */ | ||
3736 | bn = parent->bn; | ||
3737 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
3738 | if (rc) | ||
3739 | return rc; | ||
3740 | |||
3741 | index = parent->index; | ||
3742 | |||
3743 | /* | ||
3744 | * child page was not empty: | ||
3745 | */ | ||
3746 | if (freed == 0) { | ||
3747 | /* has any entry deleted from parent ? */ | ||
3748 | if (index < le16_to_cpu(p->header.nextindex) - 1) { | ||
3749 | /* (re)acquire tlock on the parent page */ | ||
3750 | if (log) { /* COMMIT_PWMAP */ | ||
3751 | /* txCommit() with tlckTRUNCATE: | ||
3752 | * free child extents covered by parent [); | ||
3753 | */ | ||
3754 | tlck = txLock(tid, ip, mp, tlckXTREE); | ||
3755 | xtlck = (struct xtlock *) & tlck->lock; | ||
3756 | if (!(tlck->type & tlckTRUNCATE)) { | ||
3757 | xtlck->hwm.offset = | ||
3758 | le16_to_cpu(p->header. | ||
3759 | nextindex) - 1; | ||
3760 | tlck->type = | ||
3761 | tlckXTREE | tlckTRUNCATE; | ||
3762 | } | ||
3763 | } else { /* COMMIT_WMAP */ | ||
3764 | |||
3765 | /* free child extents covered by parent */ | ||
3766 | xadlock.xdlist = &p->xad[index + 1]; | ||
3767 | xadlock.count = | ||
3768 | le16_to_cpu(p->header.nextindex) - | ||
3769 | index - 1; | ||
3770 | txFreeMap(ip, (struct maplock *) & xadlock, | ||
3771 | NULL, COMMIT_WMAP); | ||
3772 | } | ||
3773 | BT_MARK_DIRTY(mp, ip); | ||
3774 | |||
3775 | p->header.nextindex = cpu_to_le16(index + 1); | ||
3776 | } | ||
3777 | XT_PUTPAGE(mp); | ||
3778 | goto getParent; | ||
3779 | } | ||
3780 | |||
3781 | /* | ||
3782 | * child page was empty: | ||
3783 | */ | ||
3784 | nfreed += lengthXAD(&p->xad[index]); | ||
3785 | |||
3786 | /* | ||
3787 | * During working map update, child page's tlock must be handled | ||
3788 | * before parent's. This is because the parent's tlock will cause | ||
3789 | * the child's disk space to be marked available in the wmap, so | ||
3790 | * it's important that the child page be released by that time. | ||
3791 | * | ||
3792 | * ToDo: tlocks should be on doubly-linked list, so we can | ||
3793 | * quickly remove it and add it to the end. | ||
3794 | */ | ||
3795 | |||
3796 | /* | ||
3797 | * Move parent page's tlock to the end of the tid's tlock list | ||
3798 | */ | ||
3799 | if (log && mp->lid && (tblk->last != mp->lid) && | ||
3800 | lid_to_tlock(mp->lid)->tid) { | ||
3801 | lid_t lid = mp->lid; | ||
3802 | struct tlock *prev; | ||
3803 | |||
3804 | tlck = lid_to_tlock(lid); | ||
3805 | |||
3806 | if (tblk->next == lid) | ||
3807 | tblk->next = tlck->next; | ||
3808 | else { | ||
3809 | for (prev = lid_to_tlock(tblk->next); | ||
3810 | prev->next != lid; | ||
3811 | prev = lid_to_tlock(prev->next)) { | ||
3812 | assert(prev->next); | ||
3813 | } | ||
3814 | prev->next = tlck->next; | ||
3815 | } | ||
3816 | lid_to_tlock(tblk->last)->next = lid; | ||
3817 | tlck->next = 0; | ||
3818 | tblk->last = lid; | ||
3819 | } | ||
3820 | |||
3821 | /* | ||
3822 | * parent page become empty: free the page | ||
3823 | */ | ||
3824 | if (index == XTENTRYSTART) { | ||
3825 | if (log) { /* COMMIT_PWMAP */ | ||
3826 | /* txCommit() with tlckFREE: | ||
3827 | * free child extents covered by parent; | ||
3828 | * invalidate parent if COMMIT_PWMAP; | ||
3829 | */ | ||
3830 | tlck = txLock(tid, ip, mp, tlckXTREE); | ||
3831 | xtlck = (struct xtlock *) & tlck->lock; | ||
3832 | xtlck->hwm.offset = | ||
3833 | le16_to_cpu(p->header.nextindex) - 1; | ||
3834 | tlck->type = tlckXTREE | tlckFREE; | ||
3835 | } else { /* COMMIT_WMAP */ | ||
3836 | |||
3837 | /* free child extents covered by parent */ | ||
3838 | xadlock.xdlist = &p->xad[XTENTRYSTART]; | ||
3839 | xadlock.count = | ||
3840 | le16_to_cpu(p->header.nextindex) - | ||
3841 | XTENTRYSTART; | ||
3842 | txFreeMap(ip, (struct maplock *) & xadlock, NULL, | ||
3843 | COMMIT_WMAP); | ||
3844 | } | ||
3845 | BT_MARK_DIRTY(mp, ip); | ||
3846 | |||
3847 | if (p->header.flag & BT_ROOT) { | ||
3848 | p->header.flag &= ~BT_INTERNAL; | ||
3849 | p->header.flag |= BT_LEAF; | ||
3850 | p->header.nextindex = cpu_to_le16(XTENTRYSTART); | ||
3851 | if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { | ||
3852 | /* | ||
3853 | * Shrink root down to allow inline | ||
3854 | * EA (otherwise fsck complains) | ||
3855 | */ | ||
3856 | p->header.maxentry = | ||
3857 | cpu_to_le16(XTROOTINITSLOT); | ||
3858 | JFS_IP(ip)->mode2 |= INLINEEA; | ||
3859 | } | ||
3860 | |||
3861 | XT_PUTPAGE(mp); /* debug */ | ||
3862 | goto out; | ||
3863 | } else { | ||
3864 | if (log) { /* COMMIT_PWMAP */ | ||
3865 | /* page will be invalidated at tx completion | ||
3866 | */ | ||
3867 | XT_PUTPAGE(mp); | ||
3868 | } else { /* COMMIT_WMAP */ | ||
3869 | |||
3870 | if (mp->lid) | ||
3871 | lid_to_tlock(mp->lid)->flag |= | ||
3872 | tlckFREELOCK; | ||
3873 | |||
3874 | /* invalidate parent page */ | ||
3875 | discard_metapage(mp); | ||
3876 | } | ||
3877 | |||
3878 | /* parent has become empty and freed: | ||
3879 | * go back up to its parent page | ||
3880 | */ | ||
3881 | /* freed = 1; */ | ||
3882 | goto getParent; | ||
3883 | } | ||
3884 | } | ||
3885 | /* | ||
3886 | * parent page still has entries for front region; | ||
3887 | */ | ||
3888 | else { | ||
3889 | /* try truncate region covered by preceding entry | ||
3890 | * (process backward) | ||
3891 | */ | ||
3892 | index--; | ||
3893 | |||
3894 | /* go back down to the child page corresponding | ||
3895 | * to the entry | ||
3896 | */ | ||
3897 | goto getChild; | ||
3898 | } | ||
3899 | |||
3900 | /* | ||
3901 | * internal page: go down to child page of current entry | ||
3902 | */ | ||
3903 | getChild: | ||
3904 | /* save current parent entry for the child page */ | ||
3905 | BT_PUSH(&btstack, bn, index); | ||
3906 | |||
3907 | /* get child page */ | ||
3908 | xad = &p->xad[index]; | ||
3909 | bn = addressXAD(xad); | ||
3910 | |||
3911 | /* | ||
3912 | * first access of each internal entry: | ||
3913 | */ | ||
3914 | /* release parent page */ | ||
3915 | XT_PUTPAGE(mp); | ||
3916 | |||
3917 | /* process the child page */ | ||
3918 | goto getPage; | ||
3919 | |||
3920 | out: | ||
3921 | /* | ||
3922 | * update file resource stat; newsize is larger than the caller asked for when the MAX_TRUNCATE_LEAVES limit ended the pass early | ||
3923 | */ | ||
3924 | /* set size | ||
3925 | */ | ||
3926 | if (S_ISDIR(ip->i_mode) && !newsize) | ||
3927 | ip->i_size = 1; /* fsck hates zero-length directories */ | ||
3928 | else | ||
3929 | ip->i_size = newsize; | ||
3930 | |||
3931 | /* update quota allocation to reflect freed blocks */ | ||
3932 | DQUOT_FREE_BLOCK(ip, nfreed); | ||
3933 | |||
3934 | /* | ||
3935 | * free tlock of invalidated pages | ||
3936 | */ | ||
3937 | if (flag == COMMIT_WMAP) | ||
3938 | txFreelock(ip); | ||
3939 | |||
3940 | return newsize; | ||
3941 | } | ||
3942 | |||
3943 | |||
3944 | /* | ||
3945 | * xtTruncate_pmap() | ||
3946 | * | ||
3947 | * function: | ||
3948 | * Perform truncate to zero length for deleted file, leaving the | ||
3949 | * xtree and working map untouched. This allows the file to | ||
3950 | * be accessed via open file handles, while the delete of the file | ||
3951 | * is committed to disk. | ||
3952 | * | ||
3953 | * parameter: | ||
3954 | * tid_t tid, - transaction under which the page tlocks are taken | ||
3955 | * struct inode *ip, - inode whose xtree is walked | ||
3956 | * s64 committed_size) - value returned by the previous pass; 0 starts from the root | ||
3957 | * | ||
3958 | * return: new committed size: 0 once the walk has reached the root, the byte offset to pass in next time when the MAX_TRUNCATE_LEAVES limit is hit, or an error code (rc from xtSearch()/XT_GETPAGE(), -EIO if the extent is not found) | ||
3959 | * | ||
3960 | * note: | ||
3961 | * | ||
3962 | * To avoid deadlock by holding too many transaction locks, the | ||
3963 | * truncation may be broken up into multiple transactions. | ||
3964 | * committed_size keeps track of how much of the file has already | ||
3965 | * been freed from the pmaps. | ||
3966 | */ | ||
3967 | s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) | ||
3968 | { | ||
3969 | s64 bn; | ||
3970 | struct btstack btstack; | ||
3971 | int cmp; | ||
3972 | int index; | ||
3973 | int locked_leaves = 0; | ||
3974 | struct metapage *mp; | ||
3975 | xtpage_t *p; | ||
3976 | struct btframe *parent; | ||
3977 | int rc; | ||
3978 | struct tblock *tblk; | ||
3979 | struct tlock *tlck = NULL; | ||
3980 | xad_t *xad; | ||
3981 | int xlen; | ||
3982 | s64 xoff; | ||
3983 | struct xtlock *xtlck = NULL; | ||
3984 | |||
3985 | /* record COMMIT_PMAP as the truncation type in the transaction block */ | ||
3986 | tblk = tid_to_tblock(tid); | ||
3987 | tblk->xflag |= COMMIT_PMAP; | ||
3988 | |||
3989 | /* clear stack */ | ||
3990 | BT_CLR(&btstack); | ||
3991 | |||
3992 | if (committed_size) { /* resuming: search for the block just below committed_size */ | ||
3993 | xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; | ||
3994 | rc = xtSearch(ip, xoff, &cmp, &btstack, 0); | ||
3995 | if (rc) | ||
3996 | return rc; | ||
3997 | |||
3998 | XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); | ||
3999 | |||
4000 | if (cmp != 0) { | ||
4001 | XT_PUTPAGE(mp); | ||
4002 | jfs_error(ip->i_sb, | ||
4003 | "xtTruncate_pmap: did not find extent"); | ||
4004 | return -EIO; | ||
4005 | } | ||
4006 | } else { | ||
4007 | /* | ||
4008 | * start with root | ||
4009 | * | ||
4010 | * root resides in the inode | ||
4011 | */ | ||
4012 | bn = 0; | ||
4013 | |||
4014 | /* | ||
4015 | * first access of each page: | ||
4016 | */ | ||
4017 | getPage: | ||
4018 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4019 | if (rc) | ||
4020 | return rc; | ||
4021 | |||
4022 | /* process entries backward from last index */ | ||
4023 | index = le16_to_cpu(p->header.nextindex) - 1; | ||
4024 | |||
4025 | if (p->header.flag & BT_INTERNAL) | ||
4026 | goto getChild; | ||
4027 | } | ||
4028 | |||
4029 | /* | ||
4030 | * leaf page | ||
4031 | */ | ||
4032 | |||
4033 | if (++locked_leaves > MAX_TRUNCATE_LEAVES) { | ||
4034 | /* | ||
4035 | * We need to limit the size of the transaction | ||
4036 | * to avoid exhausting pagecache & tlocks | ||
4037 | */ | ||
4038 | xad = &p->xad[index]; | ||
4039 | xoff = offsetXAD(xad); | ||
4040 | xlen = lengthXAD(xad); | ||
4041 | XT_PUTPAGE(mp); | ||
4042 | return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; | ||
4043 | } | ||
4044 | tlck = txLock(tid, ip, mp, tlckXTREE); | ||
4045 | tlck->type = tlckXTREE | tlckFREE; | ||
4046 | xtlck = (struct xtlock *) & tlck->lock; | ||
4047 | xtlck->hwm.offset = index; | ||
4048 | |||
4049 | |||
4050 | XT_PUTPAGE(mp); | ||
4051 | |||
4052 | /* | ||
4053 | * go back up to the parent page | ||
4054 | */ | ||
4055 | getParent: | ||
4056 | /* pop/restore parent entry for the current child page */ | ||
4057 | if ((parent = BT_POP(&btstack)) == NULL) | ||
4058 | /* current page must have been root */ | ||
4059 | goto out; | ||
4060 | |||
4061 | /* get back the parent page */ | ||
4062 | bn = parent->bn; | ||
4063 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4064 | if (rc) | ||
4065 | return rc; | ||
4066 | |||
4067 | index = parent->index; | ||
4068 | |||
4069 | /* | ||
4070 | * parent page become empty: free the page | ||
4071 | */ | ||
4072 | if (index == XTENTRYSTART) { | ||
4073 | /* txCommit() with tlckFREE: | ||
4074 | * free child extents covered by parent; | ||
4075 | * invalidate parent if COMMIT_PWMAP; | ||
4076 | */ | ||
4077 | tlck = txLock(tid, ip, mp, tlckXTREE); | ||
4078 | xtlck = (struct xtlock *) & tlck->lock; | ||
4079 | xtlck->hwm.offset = | ||
4080 | le16_to_cpu(p->header.nextindex) - 1; | ||
4081 | tlck->type = tlckXTREE | tlckFREE; | ||
4082 | |||
4083 | /* test the root flag while the page is still held, as xtTruncate() does, instead of reading p after XT_PUTPAGE() */ | ||
4084 | if (p->header.flag & BT_ROOT) { | ||
4085 | XT_PUTPAGE(mp); | ||
4086 | goto out; | ||
4087 | } | ||
4088 | |||
4089 | XT_PUTPAGE(mp); | ||
4090 | goto getParent; | ||
4091 | } | ||
4092 | /* | ||
4093 | * parent page still has entries for front region; | ||
4094 | */ | ||
4095 | else | ||
4096 | index--; | ||
4097 | /* | ||
4098 | * internal page: go down to child page of current entry | ||
4099 | */ | ||
4100 | getChild: | ||
4101 | /* save current parent entry for the child page */ | ||
4102 | BT_PUSH(&btstack, bn, index); | ||
4103 | |||
4104 | /* get child page */ | ||
4105 | xad = &p->xad[index]; | ||
4106 | bn = addressXAD(xad); | ||
4107 | |||
4108 | /* | ||
4109 | * first access of each internal entry: | ||
4110 | */ | ||
4111 | /* release parent page */ | ||
4112 | XT_PUTPAGE(mp); | ||
4113 | |||
4114 | /* process the child page */ | ||
4115 | goto getPage; | ||
4116 | |||
4117 | out: | ||
4118 | |||
4119 | return 0; | ||
4120 | } | ||
4121 | |||
4122 | |||
4123 | #ifdef _JFS_DEBUG_XTREE | ||
4124 | /* | ||
4125 | * xtDisplayTree() | ||
4126 | * | ||
4127 | * function: traverse forward from the root, printing every leaf page and every descent into a child page; returns 0 when the walk is complete, or the XT_GETPAGE() error code | ||
4128 | */ | ||
4129 | int xtDisplayTree(struct inode *ip) | ||
4130 | { | ||
4131 | int rc = 0; | ||
4132 | struct metapage *mp; | ||
4133 | xtpage_t *p; | ||
4134 | s64 bn, pbn; | ||
4135 | int index, lastindex, v, h; | ||
4136 | xad_t *xad; | ||
4137 | struct btstack btstack; | ||
4138 | struct btframe *btsp; | ||
4139 | struct btframe *parent; | ||
4140 | |||
4141 | printk("display B+-tree.\n"); | ||
4142 | |||
4143 | /* clear stack */ | ||
4144 | btsp = btstack.stack; | ||
4145 | |||
4146 | /* | ||
4147 | * start with root | ||
4148 | * | ||
4149 | * root resides in the inode | ||
4150 | */ | ||
4151 | bn = 0; | ||
4152 | v = h = 0; | ||
4153 | |||
4154 | /* | ||
4155 | * first access of each page: | ||
4156 | */ | ||
4157 | getPage: | ||
4158 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4159 | if (rc) | ||
4160 | return rc; | ||
4161 | |||
4162 | /* process entries forward from first index */ | ||
4163 | index = XTENTRYSTART; | ||
4164 | lastindex = le16_to_cpu(p->header.nextindex) - 1; | ||
4165 | |||
4166 | if (p->header.flag & BT_INTERNAL) { | ||
4167 | /* | ||
4168 | * first access of each internal page | ||
4169 | */ | ||
4170 | goto getChild; | ||
4171 | } else { /* (p->header.flag & BT_LEAF) */ | ||
4172 | |||
4173 | /* | ||
4174 | * first access of each leaf page | ||
4175 | */ | ||
4176 | printk("leaf page "); | ||
4177 | xtDisplayPage(ip, bn, p); | ||
4178 | |||
4179 | /* unpin the leaf page */ | ||
4180 | XT_PUTPAGE(mp); | ||
4181 | } | ||
4182 | |||
4183 | /* | ||
4184 | * go back up to the parent page | ||
4185 | */ | ||
4186 | getParent: | ||
4187 | /* pop/restore parent entry for the current child page */ | ||
4188 | if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL) | ||
4189 | /* current page must have been root: the walk is done (function returns int, so give it a value) */ | ||
4190 | return 0; | ||
4191 | |||
4192 | /* | ||
4193 | * parent page scan completed | ||
4194 | */ | ||
4195 | if ((index = parent->index) == (lastindex = parent->lastindex)) { | ||
4196 | /* go back up to the parent page */ | ||
4197 | goto getParent; | ||
4198 | } | ||
4199 | |||
4200 | /* | ||
4201 | * parent page has entries remaining | ||
4202 | */ | ||
4203 | /* get back the parent page */ | ||
4204 | bn = parent->bn; | ||
4205 | /* v = parent->level; */ | ||
4206 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4207 | if (rc) | ||
4208 | return rc; | ||
4209 | |||
4210 | /* get next parent entry */ | ||
4211 | index++; | ||
4212 | |||
4213 | /* | ||
4214 | * internal page: go down to child page of current entry | ||
4215 | */ | ||
4216 | getChild: | ||
4217 | /* push/save current parent entry for the child page */ | ||
4218 | btsp->bn = pbn = bn; | ||
4219 | btsp->index = index; | ||
4220 | btsp->lastindex = lastindex; | ||
4221 | /* btsp->level = v; */ | ||
4222 | /* btsp->node = h; */ | ||
4223 | ++btsp; | ||
4224 | |||
4225 | /* get child page */ | ||
4226 | xad = &p->xad[index]; | ||
4227 | bn = addressXAD(xad); | ||
4228 | |||
4229 | /* | ||
4230 | * first access of each internal entry: | ||
4231 | */ | ||
4232 | /* release parent page */ | ||
4233 | XT_PUTPAGE(mp); | ||
4234 | |||
4235 | printk("traverse down 0x%lx[%d]->0x%lx\n", (ulong) pbn, index, | ||
4236 | (ulong) bn); | ||
4237 | v++; | ||
4238 | h = index; | ||
4239 | |||
4240 | /* process the child page */ | ||
4241 | goto getPage; | ||
4242 | } | ||
4243 | |||
4244 | |||
4245 | /* | ||
4246 | * xtDisplayPage() | ||
4247 | * | ||
4248 | * function: display page: print the page header (bn, flag, nextindex) followed by every xad entry as [index] offset:address(length), four entries per output line; ip is not used; always returns 0 | ||
4249 | */ | ||
4250 | int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p) | ||
4251 | { | ||
4252 | xad_t *xad; | ||
4253 | s64 xaddr, xoff; | ||
4254 | int xlen, i, j; | ||
4255 | |||
4256 | /* display page control */ | ||
4257 | printk("bn:0x%lx flag:0x%x nextindex:%d\n", | ||
4258 | (ulong) bn, p->header.flag, | ||
4259 | le16_to_cpu(p->header.nextindex)); | ||
4260 | |||
4261 | /* display entries; j counts entries on the current output line */ | ||
4262 | xad = &p->xad[XTENTRYSTART]; | ||
4263 | for (i = XTENTRYSTART, j = 1; i < le16_to_cpu(p->header.nextindex); | ||
4264 | i++, xad++, j++) { | ||
4265 | xoff = offsetXAD(xad); | ||
4266 | xaddr = addressXAD(xad); | ||
4267 | xlen = lengthXAD(xad); | ||
4268 | printk("\t[%d] 0x%lx:0x%lx(0x%x)", i, (ulong) xoff, | ||
4269 | (ulong) xaddr, xlen); | ||
4270 | |||
4271 | /* break the line after every fourth entry */ | ||
4272 | if (j == 4) { | ||
4273 | printk("\n"); | ||
4274 | j = 0; | ||
4275 | } | ||
4276 | } | ||
4277 | printk("\n"); | ||
4278 | return 0; | ||
4279 | } | ||
4280 | #endif /* _JFS_DEBUG_XTREE */ | ||
4281 | |||
4282 | |||
4283 | #ifdef _JFS_WIP | ||
4284 | /* | ||
4285 | * xtGather() | ||
4286 | * | ||
4287 | * function: | ||
4288 | * traverse for allocation acquiring tlock at commit time | ||
4289 | * (vs at the time of update) logging backward top down | ||
4290 | * | ||
4291 | * note: | ||
4292 | * problem - establishing that all new allocation have been | ||
4293 | * processed both for append and random write in sparse file | ||
4294 | * at the current entry at the current subtree root page | ||
4295 | * NOTE(review): built only under _JFS_WIP; the body uses ip and mp, which are neither parameters nor locals of this function, the parameter t is never read, and the label out: has no matching goto | ||
4296 | */ | ||
4297 | int xtGather(btree_t *t) | ||
4298 | { | ||
4299 | int rc = 0; | ||
4300 | xtpage_t *p; | ||
4301 | u64 bn; | ||
4302 | int index; | ||
4303 | btentry_t *e; | ||
4304 | struct btstack btstack; | ||
4305 | struct btsf *parent; | ||
4306 | |||
4307 | /* clear stack */ | ||
4308 | BT_CLR(&btstack); | ||
4309 | |||
4310 | /* | ||
4311 | * start with root | ||
4312 | * | ||
4313 | * root resides in the inode | ||
4314 | */ | ||
4315 | bn = 0; | ||
4316 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4317 | if (rc) | ||
4318 | return rc; | ||
4319 | |||
4320 | /* new root is NOT pointed by a new entry | ||
4321 | if (p->header.flag & NEW) | ||
4322 | allocate new page lock; | ||
4323 | write a NEWPAGE log; | ||
4324 | */ | ||
4325 | |||
4326 | dopage: | ||
4327 | /* | ||
4328 | * first access of each page: | ||
4329 | */ | ||
4330 | /* process entries backward from last index */ | ||
4331 | index = le16_to_cpu(p->header.nextindex) - 1; | ||
4332 | |||
4333 | if (p->header.flag & BT_LEAF) { | ||
4334 | /* | ||
4335 | * first access of each leaf page | ||
4336 | */ | ||
4337 | /* process leaf page entries backward (the loop body is still only pseudo-code in the comment below) */ | ||
4338 | for (; index >= XTENTRYSTART; index--) { | ||
4339 | e = &p->xad[index]; | ||
4340 | /* | ||
4341 | * if newpage, log NEWPAGE. | ||
4342 | * | ||
4343 | if (e->flag & XAD_NEW) { | ||
4344 | nfound += entry->length; | ||
4345 | update current page lock for the entry; | ||
4346 | newpage(entry); | ||
4347 | * | ||
4348 | * if moved, log move. | ||
4349 | * | ||
4350 | } else if (e->flag & XAD_MOVED) { | ||
4351 | reset flag; | ||
4352 | update current page lock for the entry; | ||
4353 | } | ||
4354 | */ | ||
4355 | } | ||
4356 | |||
4357 | /* unpin the leaf page */ | ||
4358 | XT_PUTPAGE(mp); | ||
4359 | |||
4360 | /* | ||
4361 | * go back up to the parent page | ||
4362 | */ | ||
4363 | getParent: | ||
4364 | /* restore parent entry for the current child page */ | ||
4365 | if ((parent = BT_POP(&btstack)) == NULL) | ||
4366 | /* current page must have been root */ | ||
4367 | return 0; | ||
4368 | |||
4369 | if ((index = parent->index) == XTENTRYSTART) { | ||
4370 | /* | ||
4371 | * parent page scan completed | ||
4372 | */ | ||
4373 | /* go back up to the parent page */ | ||
4374 | goto getParent; | ||
4375 | } else { | ||
4376 | /* | ||
4377 | * parent page has entries remaining | ||
4378 | */ | ||
4379 | /* get back the parent page; NOTE(review): this failure path returns -EIO while the other XT_GETPAGE failures in this function return rc */ | ||
4380 | bn = parent->bn; | ||
4381 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4382 | if (rc) | ||
4383 | return -EIO; | ||
4384 | |||
4385 | /* first subroot page which | ||
4386 | * covers all new allocated blocks | ||
4387 | * itself not new/modified. | ||
4388 | * (if modified from split of descendent, | ||
4389 | * go down path of split page) | ||
4390 | |||
4391 | if (nfound == nnew && | ||
4392 | !(p->header.flag & (NEW | MOD))) | ||
4393 | exit scan; | ||
4394 | */ | ||
4395 | |||
4396 | /* process parent page entries backward */ | ||
4397 | index--; | ||
4398 | } | ||
4399 | } else { | ||
4400 | /* | ||
4401 | * first access of each internal page | ||
4402 | */ | ||
4403 | } | ||
4404 | |||
4405 | /* | ||
4406 | * internal page: go down to child page of current entry | ||
4407 | */ | ||
4408 | |||
4409 | /* save current parent entry for the child page */ | ||
4410 | BT_PUSH(&btstack, bn, index); | ||
4411 | |||
4412 | /* get current entry for the child page */ | ||
4413 | e = &p->xad[index]; | ||
4414 | |||
4415 | /* | ||
4416 | * first access of each internal entry: | ||
4417 | */ | ||
4418 | /* | ||
4419 | * if new entry, log btree_tnewentry. | ||
4420 | * | ||
4421 | if (e->flag & XAD_NEW) | ||
4422 | update parent page lock for the entry; | ||
4423 | */ | ||
4424 | |||
4425 | /* release parent page */ | ||
4426 | XT_PUTPAGE(mp); | ||
4427 | |||
4428 | /* get child page */ | ||
4429 | bn = e->bn; | ||
4430 | XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); | ||
4431 | if (rc) | ||
4432 | return rc; | ||
4433 | |||
4434 | /* | ||
4435 | * first access of each non-root page: | ||
4436 | */ | ||
4437 | /* | ||
4438 | * if new, log btree_newpage. | ||
4439 | * | ||
4440 | if (p->header.flag & NEW) | ||
4441 | allocate new page lock; | ||
4442 | write a NEWPAGE log (next, prev); | ||
4443 | */ | ||
4444 | |||
4445 | /* process the child page */ | ||
4446 | goto dopage; | ||
4447 | |||
4448 | out: | ||
4449 | return 0; | ||
4450 | } | ||
4451 | #endif /* _JFS_WIP */ | ||
4452 | |||
4453 | |||
4454 | #ifdef CONFIG_JFS_STATISTICS | ||
4455 | int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, | ||
4456 | int *eof, void *data) | ||
4457 | { | ||
4458 | /* bytes of the report that lie at or beyond the caller's offset; data is not used */ | ||
4459 | int remaining; | ||
4460 | |||
4461 | /* format the complete xtree statistics report at the start of buffer */ | ||
4462 | remaining = sprintf(buffer, | ||
4463 | "JFS Xtree statistics\n" | ||
4464 | "====================\n" | ||
4465 | "searches = %d\n" | ||
4466 | "fast searches = %d\n" | ||
4467 | "splits = %d\n", | ||
4468 | xtStat.search, xtStat.fastSearch, xtStat.split); | ||
4469 | |||
4470 | /* point the caller at the requested offset and discount what precedes it */ | ||
4471 | *start = buffer + offset; | ||
4472 | remaining -= offset; | ||
4473 | |||
4474 | /* everything left fits in the request: flag eof; otherwise clamp to length */ | ||
4475 | if (remaining <= length) | ||
4476 | *eof = 1; | ||
4477 | else | ||
4478 | remaining = length; | ||
4479 | |||
4480 | /* a negative count (e.g. offset past the end of the report) is returned as 0 */ | ||
4481 | if (remaining < 0) | ||
4482 | return 0; | ||
4483 | return remaining; | ||
4484 | } | ||
4485 | #endif | ||
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h new file mode 100644 index 000000000000..a69784254fe7 --- /dev/null +++ b/fs/jfs/jfs_xtree.h | |||
@@ -0,0 +1,140 @@ | |||
1 | /* | ||
2 | * Copyright (c) International Business Machines Corp., 2000-2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | #ifndef _H_JFS_XTREE | ||
19 | #define _H_JFS_XTREE | ||
20 | |||
21 | /* | ||
22 | * jfs_xtree.h: extent allocation descriptor B+-tree manager | ||
23 | */ | ||
24 | |||
25 | #include "jfs_btree.h" | ||
26 | |||
27 | |||
28 | /* | ||
29 | * extent allocation descriptor (xad) | ||
30 | */ | ||
31 | typedef struct xad { | ||
32 | unsigned flag:8; /* 1: flag */ | ||
33 | unsigned rsvrd:16; /* 2: reserved */ | ||
34 | unsigned off1:8; /* 1: offset in unit of fsblksize */ | ||
35 | __le32 off2; /* 4: offset in unit of fsblksize */ | ||
36 | unsigned len:24; /* 3: length in unit of fsblksize */ | ||
37 | unsigned addr1:8; /* 1: address in unit of fsblksize */ | ||
38 | __le32 addr2; /* 4: address in unit of fsblksize */ | ||
39 | } xad_t; /* (16) */ | ||
40 | |||
41 | #define MAXXLEN ((1 << 24) - 1) | ||
42 | |||
43 | #define XTSLOTSIZE 16 | ||
44 | #define L2XTSLOTSIZE 4 | ||
45 | |||
46 | /* xad_t field construction */ | ||
47 | #define XADoffset(xad, offset64)\ | ||
48 | {\ | ||
49 | (xad)->off1 = ((u64)offset64) >> 32;\ | ||
50 | (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ | ||
51 | } | ||
52 | #define XADaddress(xad, address64)\ | ||
53 | {\ | ||
54 | (xad)->addr1 = ((u64)address64) >> 32;\ | ||
55 | (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ | ||
56 | } | ||
57 | #define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) | ||
58 | |||
59 | /* xad_t field extraction */ | ||
60 | #define offsetXAD(xad)\ | ||
61 | ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) | ||
62 | #define addressXAD(xad)\ | ||
63 | ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) | ||
64 | #define lengthXAD(xad) __le24_to_cpu((xad)->len) | ||
65 | |||
66 | /* xad list */ | ||
67 | struct xadlist { | ||
68 | s16 maxnxad; | ||
69 | s16 nxad; | ||
70 | xad_t *xad; | ||
71 | }; | ||
72 | |||
73 | /* xad_t flags */ | ||
74 | #define XAD_NEW 0x01 /* new */ | ||
75 | #define XAD_EXTENDED 0x02 /* extended */ | ||
76 | #define XAD_COMPRESSED 0x04 /* compressed with recorded length */ | ||
77 | #define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ | ||
78 | #define XAD_COW 0x10 /* copy-on-write */ | ||
79 | |||
80 | |||
81 | /* possible values for maxentry */ | ||
82 | #define XTROOTINITSLOT_DIR 6 | ||
83 | #define XTROOTINITSLOT 10 | ||
84 | #define XTROOTMAXSLOT 18 | ||
85 | #define XTPAGEMAXSLOT 256 | ||
86 | #define XTENTRYSTART 2 | ||
87 | |||
88 | /* | ||
89 | * xtree page: | ||
90 | */ | ||
91 | typedef union { | ||
92 | struct xtheader { | ||
93 | __le64 next; /* 8: */ | ||
94 | __le64 prev; /* 8: */ | ||
95 | |||
96 | u8 flag; /* 1: */ | ||
97 | u8 rsrvd1; /* 1: */ | ||
98 | __le16 nextindex; /* 2: next index = number of entries */ | ||
99 | __le16 maxentry; /* 2: max number of entries */ | ||
100 | __le16 rsrvd2; /* 2: */ | ||
101 | |||
102 | pxd_t self; /* 8: self */ | ||
103 | } header; /* (32) */ | ||
104 | |||
105 | xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */ | ||
106 | } xtpage_t; | ||
107 | |||
108 | /* | ||
109 | * external declaration | ||
110 | */ | ||
111 | extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, | ||
112 | int *pflag, s64 * paddr, int *plen, int flag); | ||
113 | extern int xtLookupList(struct inode *ip, struct lxdlist * lxdlist, | ||
114 | struct xadlist * xadlist, int flag); | ||
115 | extern void xtInitRoot(tid_t tid, struct inode *ip); | ||
116 | extern int xtInsert(tid_t tid, struct inode *ip, | ||
117 | int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); | ||
118 | extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, | ||
119 | int flag); | ||
120 | #ifdef _NOTYET | ||
121 | extern int xtTailgate(tid_t tid, struct inode *ip, | ||
122 | s64 xoff, int xlen, s64 xaddr, int flag); | ||
123 | #endif | ||
124 | extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); | ||
125 | extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, | ||
126 | int flag); | ||
127 | extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); | ||
128 | extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); | ||
129 | extern int xtRelocate(tid_t tid, struct inode *ip, | ||
130 | xad_t * oxad, s64 nxaddr, int xtype); | ||
131 | extern int xtAppend(tid_t tid, | ||
132 | struct inode *ip, int xflag, s64 xoff, int maxblocks, | ||
133 | int *xlenp, s64 * xaddrp, int flag); | ||
134 | |||
135 | #ifdef _JFS_DEBUG_XTREE | ||
136 | extern int xtDisplayTree(struct inode *ip); | ||
137 | extern int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p); | ||
138 | #endif /* _JFS_DEBUG_XTREE */ | ||
139 | |||
140 | #endif /* !_H_JFS_XTREE */ | ||
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c new file mode 100644 index 000000000000..8413a368f449 --- /dev/null +++ b/fs/jfs/namei.c | |||
@@ -0,0 +1,1540 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include <linux/ctype.h> | ||
22 | #include <linux/quotaops.h> | ||
23 | #include "jfs_incore.h" | ||
24 | #include "jfs_superblock.h" | ||
25 | #include "jfs_inode.h" | ||
26 | #include "jfs_dinode.h" | ||
27 | #include "jfs_dmap.h" | ||
28 | #include "jfs_unicode.h" | ||
29 | #include "jfs_metapage.h" | ||
30 | #include "jfs_xattr.h" | ||
31 | #include "jfs_acl.h" | ||
32 | #include "jfs_debug.h" | ||
33 | |||
34 | extern struct inode_operations jfs_file_inode_operations; | ||
35 | extern struct inode_operations jfs_symlink_inode_operations; | ||
36 | extern struct file_operations jfs_file_operations; | ||
37 | extern struct address_space_operations jfs_aops; | ||
38 | |||
39 | extern int jfs_fsync(struct file *, struct dentry *, int); | ||
40 | extern void jfs_truncate_nolock(struct inode *, loff_t); | ||
41 | extern int jfs_init_acl(struct inode *, struct inode *); | ||
42 | |||
43 | /* | ||
44 | * forward references | ||
45 | */ | ||
46 | struct inode_operations jfs_dir_inode_operations; | ||
47 | struct file_operations jfs_dir_operations; | ||
48 | struct dentry_operations jfs_ci_dentry_operations; | ||
49 | |||
50 | static s64 commitZeroLink(tid_t, struct inode *); | ||
51 | |||
52 | /* | ||
53 | * NAME: jfs_create(dip, dentry, mode) | ||
54 | * | ||
55 | * FUNCTION: create a regular file in the parent directory <dip> | ||
56 | * with name = <from dentry> and mode = <mode> | ||
57 | * | ||
58 | * PARAMETER: dip - parent directory vnode | ||
59 | * dentry - dentry of new file | ||
60 | * mode - create mode (rwxrwxrwx). | ||
61 | * nd- nd struct | ||
62 | * | ||
63 | * RETURN: Errors from subroutines | ||
64 | * | ||
65 | */ | ||
66 | static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, | ||
67 | struct nameidata *nd) | ||
68 | { | ||
69 | int rc = 0; | ||
70 | tid_t tid; /* transaction id */ | ||
71 | struct inode *ip = NULL; /* child directory inode */ | ||
72 | ino_t ino; | ||
73 | struct component_name dname; /* child directory name */ | ||
74 | struct btstack btstack; | ||
75 | struct inode *iplist[2]; | ||
76 | struct tblock *tblk; | ||
77 | |||
78 | jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); | ||
79 | |||
80 | /* | ||
81 | * search parent directory for entry/freespace | ||
82 | * (dtSearch() returns parent directory page pinned) | ||
83 | */ | ||
84 | if ((rc = get_UCSname(&dname, dentry))) | ||
85 | goto out1; | ||
86 | |||
87 | /* | ||
88 | * Either iAlloc() or txBegin() may block. Deadlock can occur if we | ||
89 | * block there while holding dtree page, so we allocate the inode & | ||
90 | * begin the transaction before we search the directory. | ||
91 | */ | ||
92 | ip = ialloc(dip, mode); | ||
93 | if (ip == NULL) { | ||
94 | rc = -ENOSPC; | ||
95 | goto out2; | ||
96 | } | ||
97 | |||
98 | tid = txBegin(dip->i_sb, 0); | ||
99 | |||
100 | down(&JFS_IP(dip)->commit_sem); | ||
101 | down(&JFS_IP(ip)->commit_sem); | ||
102 | |||
103 | if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { | ||
104 | jfs_err("jfs_create: dtSearch returned %d", rc); | ||
105 | goto out3; | ||
106 | } | ||
107 | |||
108 | tblk = tid_to_tblock(tid); | ||
109 | tblk->xflag |= COMMIT_CREATE; | ||
110 | tblk->ino = ip->i_ino; | ||
111 | tblk->u.ixpxd = JFS_IP(ip)->ixpxd; | ||
112 | |||
113 | iplist[0] = dip; | ||
114 | iplist[1] = ip; | ||
115 | |||
116 | /* | ||
117 | * initialize the child XAD tree root in-line in inode | ||
118 | */ | ||
119 | xtInitRoot(tid, ip); | ||
120 | |||
121 | /* | ||
122 | * create entry in parent directory for child directory | ||
123 | * (dtInsert() releases parent directory page) | ||
124 | */ | ||
125 | ino = ip->i_ino; | ||
126 | if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { | ||
127 | if (rc == -EIO) { | ||
128 | jfs_err("jfs_create: dtInsert returned -EIO"); | ||
129 | txAbort(tid, 1); /* Marks Filesystem dirty */ | ||
130 | } else | ||
131 | txAbort(tid, 0); /* Filesystem full */ | ||
132 | goto out3; | ||
133 | } | ||
134 | |||
135 | ip->i_op = &jfs_file_inode_operations; | ||
136 | ip->i_fop = &jfs_file_operations; | ||
137 | ip->i_mapping->a_ops = &jfs_aops; | ||
138 | |||
139 | insert_inode_hash(ip); | ||
140 | mark_inode_dirty(ip); | ||
141 | |||
142 | dip->i_ctime = dip->i_mtime = CURRENT_TIME; | ||
143 | |||
144 | mark_inode_dirty(dip); | ||
145 | |||
146 | rc = txCommit(tid, 2, &iplist[0], 0); | ||
147 | |||
148 | out3: | ||
149 | txEnd(tid); | ||
150 | up(&JFS_IP(dip)->commit_sem); | ||
151 | up(&JFS_IP(ip)->commit_sem); | ||
152 | if (rc) { | ||
153 | ip->i_nlink = 0; | ||
154 | iput(ip); | ||
155 | } else | ||
156 | d_instantiate(dentry, ip); | ||
157 | |||
158 | out2: | ||
159 | free_UCSname(&dname); | ||
160 | |||
161 | #ifdef CONFIG_JFS_POSIX_ACL | ||
162 | if (rc == 0) | ||
163 | jfs_init_acl(ip, dip); | ||
164 | #endif | ||
165 | |||
166 | out1: | ||
167 | |||
168 | jfs_info("jfs_create: rc:%d", rc); | ||
169 | return rc; | ||
170 | } | ||
171 | |||
172 | |||
173 | /* | ||
174 | * NAME: jfs_mkdir(dip, dentry, mode) | ||
175 | * | ||
176 | * FUNCTION: create a child directory in the parent directory <dip> | ||
177 | * with name = <from dentry> and mode = <mode> | ||
178 | * | ||
179 | * PARAMETER: dip - parent directory vnode | ||
180 | * dentry - dentry of child directory | ||
181 | * mode - create mode (rwxrwxrwx). | ||
182 | * | ||
183 | * RETURN: Errors from subroutines | ||
184 | * | ||
185 | * note: | ||
186 | * EACCESS: user needs search+write permission on the parent directory | ||
187 | */ | ||
188 | static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) | ||
189 | { | ||
190 | int rc = 0; | ||
191 | tid_t tid; /* transaction id */ | ||
192 | struct inode *ip = NULL; /* child directory inode */ | ||
193 | ino_t ino; | ||
194 | struct component_name dname; /* child directory name */ | ||
195 | struct btstack btstack; | ||
196 | struct inode *iplist[2]; | ||
197 | struct tblock *tblk; | ||
198 | |||
199 | jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); | ||
200 | |||
201 | /* link count overflow on parent directory ? */ | ||
202 | if (dip->i_nlink == JFS_LINK_MAX) { | ||
203 | rc = -EMLINK; | ||
204 | goto out1; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * search parent directory for entry/freespace | ||
209 | * (dtSearch() returns parent directory page pinned) | ||
210 | */ | ||
211 | if ((rc = get_UCSname(&dname, dentry))) | ||
212 | goto out1; | ||
213 | |||
214 | /* | ||
215 | * Either iAlloc() or txBegin() may block. Deadlock can occur if we | ||
216 | * block there while holding dtree page, so we allocate the inode & | ||
217 | * begin the transaction before we search the directory. | ||
218 | */ | ||
219 | ip = ialloc(dip, S_IFDIR | mode); | ||
220 | if (ip == NULL) { | ||
221 | rc = -ENOSPC; | ||
222 | goto out2; | ||
223 | } | ||
224 | |||
225 | tid = txBegin(dip->i_sb, 0); | ||
226 | |||
227 | down(&JFS_IP(dip)->commit_sem); | ||
228 | down(&JFS_IP(ip)->commit_sem); | ||
229 | |||
230 | if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { | ||
231 | jfs_err("jfs_mkdir: dtSearch returned %d", rc); | ||
232 | goto out3; | ||
233 | } | ||
234 | |||
235 | tblk = tid_to_tblock(tid); | ||
236 | tblk->xflag |= COMMIT_CREATE; | ||
237 | tblk->ino = ip->i_ino; | ||
238 | tblk->u.ixpxd = JFS_IP(ip)->ixpxd; | ||
239 | |||
240 | iplist[0] = dip; | ||
241 | iplist[1] = ip; | ||
242 | |||
243 | /* | ||
244 | * initialize the child directory in-line in inode | ||
245 | */ | ||
246 | dtInitRoot(tid, ip, dip->i_ino); | ||
247 | |||
248 | /* | ||
249 | * create entry in parent directory for child directory | ||
250 | * (dtInsert() releases parent directory page) | ||
251 | */ | ||
252 | ino = ip->i_ino; | ||
253 | if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { | ||
254 | if (rc == -EIO) { | ||
255 | jfs_err("jfs_mkdir: dtInsert returned -EIO"); | ||
256 | txAbort(tid, 1); /* Marks Filesystem dirty */ | ||
257 | } else | ||
258 | txAbort(tid, 0); /* Filesystem full */ | ||
259 | goto out3; | ||
260 | } | ||
261 | |||
262 | ip->i_nlink = 2; /* for '.' */ | ||
263 | ip->i_op = &jfs_dir_inode_operations; | ||
264 | ip->i_fop = &jfs_dir_operations; | ||
265 | |||
266 | insert_inode_hash(ip); | ||
267 | mark_inode_dirty(ip); | ||
268 | |||
269 | /* update parent directory inode */ | ||
270 | dip->i_nlink++; /* for '..' from child directory */ | ||
271 | dip->i_ctime = dip->i_mtime = CURRENT_TIME; | ||
272 | mark_inode_dirty(dip); | ||
273 | |||
274 | rc = txCommit(tid, 2, &iplist[0], 0); | ||
275 | |||
276 | out3: | ||
277 | txEnd(tid); | ||
278 | up(&JFS_IP(dip)->commit_sem); | ||
279 | up(&JFS_IP(ip)->commit_sem); | ||
280 | if (rc) { | ||
281 | ip->i_nlink = 0; | ||
282 | iput(ip); | ||
283 | } else | ||
284 | d_instantiate(dentry, ip); | ||
285 | |||
286 | out2: | ||
287 | free_UCSname(&dname); | ||
288 | |||
289 | #ifdef CONFIG_JFS_POSIX_ACL | ||
290 | if (rc == 0) | ||
291 | jfs_init_acl(ip, dip); | ||
292 | #endif | ||
293 | |||
294 | out1: | ||
295 | |||
296 | jfs_info("jfs_mkdir: rc:%d", rc); | ||
297 | return rc; | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * NAME: jfs_rmdir(dip, dentry) | ||
302 | * | ||
303 | * FUNCTION: remove a link to child directory | ||
304 | * | ||
305 | * PARAMETER: dip - parent inode | ||
306 | * dentry - child directory dentry | ||
307 | * | ||
308 | * RETURN: -EINVAL - if name is . or .. | ||
309 | * -EINVAL - if . or .. exist but are invalid. | ||
310 | * errors from subroutines | ||
311 | * | ||
312 | * note: | ||
313 | * if other threads have the directory open when the last link | ||
314 | * is removed, the "." and ".." entries, if present, are removed before | ||
315 | * rmdir() returns and no new entries may be created in the directory, | ||
316 | * but the directory is not removed until the last reference to | ||
317 | * the directory is released (cf.unlink() of regular file). | ||
318 | */ | ||
319 | static int jfs_rmdir(struct inode *dip, struct dentry *dentry) | ||
320 | { | ||
321 | int rc; | ||
322 | tid_t tid; /* transaction id */ | ||
323 | struct inode *ip = dentry->d_inode; | ||
324 | ino_t ino; | ||
325 | struct component_name dname; | ||
326 | struct inode *iplist[2]; | ||
327 | struct tblock *tblk; | ||
328 | |||
329 | jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); | ||
330 | |||
331 | /* Init inode for quota operations. */ | ||
332 | DQUOT_INIT(ip); | ||
333 | |||
334 | /* directory must be empty to be removed */ | ||
335 | if (!dtEmpty(ip)) { | ||
336 | rc = -ENOTEMPTY; | ||
337 | goto out; | ||
338 | } | ||
339 | |||
340 | if ((rc = get_UCSname(&dname, dentry))) { | ||
341 | goto out; | ||
342 | } | ||
343 | |||
344 | tid = txBegin(dip->i_sb, 0); | ||
345 | |||
346 | down(&JFS_IP(dip)->commit_sem); | ||
347 | down(&JFS_IP(ip)->commit_sem); | ||
348 | |||
349 | iplist[0] = dip; | ||
350 | iplist[1] = ip; | ||
351 | |||
352 | tblk = tid_to_tblock(tid); | ||
353 | tblk->xflag |= COMMIT_DELETE; | ||
354 | tblk->u.ip = ip; | ||
355 | |||
356 | /* | ||
357 | * delete the entry of target directory from parent directory | ||
358 | */ | ||
359 | ino = ip->i_ino; | ||
360 | if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { | ||
361 | jfs_err("jfs_rmdir: dtDelete returned %d", rc); | ||
362 | if (rc == -EIO) | ||
363 | txAbort(tid, 1); | ||
364 | txEnd(tid); | ||
365 | up(&JFS_IP(dip)->commit_sem); | ||
366 | up(&JFS_IP(ip)->commit_sem); | ||
367 | |||
368 | goto out2; | ||
369 | } | ||
370 | |||
371 | /* update parent directory's link count corresponding | ||
372 | * to ".." entry of the target directory deleted | ||
373 | */ | ||
374 | dip->i_nlink--; | ||
375 | dip->i_ctime = dip->i_mtime = CURRENT_TIME; | ||
376 | mark_inode_dirty(dip); | ||
377 | |||
378 | /* | ||
379 | * OS/2 could have created EA and/or ACL | ||
380 | */ | ||
381 | /* free EA from both persistent and working map */ | ||
382 | if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { | ||
383 | /* free EA pages */ | ||
384 | txEA(tid, ip, &JFS_IP(ip)->ea, NULL); | ||
385 | } | ||
386 | JFS_IP(ip)->ea.flag = 0; | ||
387 | |||
388 | /* free ACL from both persistent and working map */ | ||
389 | if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { | ||
390 | /* free ACL pages */ | ||
391 | txEA(tid, ip, &JFS_IP(ip)->acl, NULL); | ||
392 | } | ||
393 | JFS_IP(ip)->acl.flag = 0; | ||
394 | |||
395 | /* mark the target directory as deleted */ | ||
396 | ip->i_nlink = 0; | ||
397 | mark_inode_dirty(ip); | ||
398 | |||
399 | rc = txCommit(tid, 2, &iplist[0], 0); | ||
400 | |||
401 | txEnd(tid); | ||
402 | |||
403 | up(&JFS_IP(dip)->commit_sem); | ||
404 | up(&JFS_IP(ip)->commit_sem); | ||
405 | |||
406 | /* | ||
407 | * Truncating the directory index table is not guaranteed. It | ||
408 | * may need to be done iteratively | ||
409 | */ | ||
410 | if (test_cflag(COMMIT_Stale, dip)) { | ||
411 | if (dip->i_size > 1) | ||
412 | jfs_truncate_nolock(dip, 0); | ||
413 | |||
414 | clear_cflag(COMMIT_Stale, dip); | ||
415 | } | ||
416 | |||
417 | out2: | ||
418 | free_UCSname(&dname); | ||
419 | |||
420 | out: | ||
421 | jfs_info("jfs_rmdir: rc:%d", rc); | ||
422 | return rc; | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * NAME: jfs_unlink(dip, dentry) | ||
427 | * | ||
428 | * FUNCTION: remove a link to object <vp> named by <name> | ||
429 | * from parent directory <dvp> | ||
430 | * | ||
431 | * PARAMETER: dip - inode of parent directory | ||
432 | * dentry - dentry of object to be removed | ||
433 | * | ||
434 | * RETURN: errors from subroutines | ||
435 | * | ||
436 | * note: | ||
437 | * temporary file: if one or more processes have the file open | ||
438 | * when the last link is removed, the link will be removed before | ||
439 | * unlink() returns, but the removal of the file contents will be | ||
440 | * postponed until all references to the files are closed. | ||
441 | * | ||
442 | * JFS does NOT support unlink() on directories. | ||
443 | * | ||
444 | */ | ||
445 | static int jfs_unlink(struct inode *dip, struct dentry *dentry) | ||
446 | { | ||
447 | int rc; | ||
448 | tid_t tid; /* transaction id */ | ||
449 | struct inode *ip = dentry->d_inode; | ||
450 | ino_t ino; | ||
451 | struct component_name dname; /* object name */ | ||
452 | struct inode *iplist[2]; | ||
453 | struct tblock *tblk; | ||
454 | s64 new_size = 0; | ||
455 | int commit_flag; | ||
456 | |||
457 | jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); | ||
458 | |||
459 | /* Init inode for quota operations. */ | ||
460 | DQUOT_INIT(ip); | ||
461 | |||
462 | if ((rc = get_UCSname(&dname, dentry))) | ||
463 | goto out; | ||
464 | |||
465 | IWRITE_LOCK(ip); | ||
466 | |||
467 | tid = txBegin(dip->i_sb, 0); | ||
468 | |||
469 | down(&JFS_IP(dip)->commit_sem); | ||
470 | down(&JFS_IP(ip)->commit_sem); | ||
471 | |||
472 | iplist[0] = dip; | ||
473 | iplist[1] = ip; | ||
474 | |||
475 | /* | ||
476 | * delete the entry of target file from parent directory | ||
477 | */ | ||
478 | ino = ip->i_ino; | ||
479 | if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { | ||
480 | jfs_err("jfs_unlink: dtDelete returned %d", rc); | ||
481 | if (rc == -EIO) | ||
482 | txAbort(tid, 1); /* Marks FS Dirty */ | ||
483 | txEnd(tid); | ||
484 | up(&JFS_IP(dip)->commit_sem); | ||
485 | up(&JFS_IP(ip)->commit_sem); | ||
486 | IWRITE_UNLOCK(ip); | ||
487 | goto out1; | ||
488 | } | ||
489 | |||
490 | ASSERT(ip->i_nlink); | ||
491 | |||
492 | ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME; | ||
493 | mark_inode_dirty(dip); | ||
494 | |||
495 | /* update target's inode */ | ||
496 | ip->i_nlink--; | ||
497 | mark_inode_dirty(ip); | ||
498 | |||
499 | /* | ||
500 | * commit zero link count object | ||
501 | */ | ||
502 | if (ip->i_nlink == 0) { | ||
503 | assert(!test_cflag(COMMIT_Nolink, ip)); | ||
504 | /* free block resources */ | ||
505 | if ((new_size = commitZeroLink(tid, ip)) < 0) { | ||
506 | txAbort(tid, 1); /* Marks FS Dirty */ | ||
507 | txEnd(tid); | ||
508 | up(&JFS_IP(dip)->commit_sem); | ||
509 | up(&JFS_IP(ip)->commit_sem); | ||
510 | IWRITE_UNLOCK(ip); | ||
511 | rc = new_size; | ||
512 | goto out1; | ||
513 | } | ||
514 | tblk = tid_to_tblock(tid); | ||
515 | tblk->xflag |= COMMIT_DELETE; | ||
516 | tblk->u.ip = ip; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * Incomplete truncate of file data can | ||
521 | * result in timing problems unless we synchronously commit the | ||
522 | * transaction. | ||
523 | */ | ||
524 | if (new_size) | ||
525 | commit_flag = COMMIT_SYNC; | ||
526 | else | ||
527 | commit_flag = 0; | ||
528 | |||
529 | /* | ||
530 | * If xtTruncate was incomplete, commit synchronously to avoid | ||
531 | * timing complications | ||
532 | */ | ||
533 | rc = txCommit(tid, 2, &iplist[0], commit_flag); | ||
534 | |||
535 | txEnd(tid); | ||
536 | |||
537 | up(&JFS_IP(dip)->commit_sem); | ||
538 | up(&JFS_IP(ip)->commit_sem); | ||
539 | |||
540 | |||
541 | while (new_size && (rc == 0)) { | ||
542 | tid = txBegin(dip->i_sb, 0); | ||
543 | down(&JFS_IP(ip)->commit_sem); | ||
544 | new_size = xtTruncate_pmap(tid, ip, new_size); | ||
545 | if (new_size < 0) { | ||
546 | txAbort(tid, 1); /* Marks FS Dirty */ | ||
547 | rc = new_size; | ||
548 | } else | ||
549 | rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC); | ||
550 | txEnd(tid); | ||
551 | up(&JFS_IP(ip)->commit_sem); | ||
552 | } | ||
553 | |||
554 | if (ip->i_nlink == 0) | ||
555 | set_cflag(COMMIT_Nolink, ip); | ||
556 | |||
557 | IWRITE_UNLOCK(ip); | ||
558 | |||
559 | /* | ||
560 | * Truncating the directory index table is not guaranteed. It | ||
561 | * may need to be done iteratively | ||
562 | */ | ||
563 | if (test_cflag(COMMIT_Stale, dip)) { | ||
564 | if (dip->i_size > 1) | ||
565 | jfs_truncate_nolock(dip, 0); | ||
566 | |||
567 | clear_cflag(COMMIT_Stale, dip); | ||
568 | } | ||
569 | |||
570 | out1: | ||
571 | free_UCSname(&dname); | ||
572 | out: | ||
573 | jfs_info("jfs_unlink: rc:%d", rc); | ||
574 | return rc; | ||
575 | } | ||
576 | |||
577 | /* | ||
578 | * NAME: commitZeroLink() | ||
579 | * | ||
580 | * FUNCTION: for non-directory, called by jfs_remove(), | ||
581 | * truncate a regular file, directory or symbolic | ||
582 | * link to zero length. return 0 if type is not | ||
583 | * one of these. | ||
584 | * | ||
585 | * if the file is currently associated with a VM segment | ||
586 | * only permanent disk and inode map resources are freed, | ||
587 | * and neither the inode nor indirect blocks are modified | ||
588 | * so that the resources can be later freed in the work | ||
589 | * map by ctrunc1. | ||
590 | * if there is no VM segment on entry, the resources are | ||
591 | * freed in both work and permanent map. | ||
592 | * (? for temporary file - memory object is cached even | ||
593 | * after no reference: | ||
594 | * reference count > 0 - ) | ||
595 | * | ||
596 | * PARAMETERS: cd - pointer to commit data structure. | ||
597 | * current inode is the one to truncate. | ||
598 | * | ||
599 | * RETURN: Errors from subroutines | ||
600 | */ | ||
601 | static s64 commitZeroLink(tid_t tid, struct inode *ip) | ||
602 | { | ||
603 | int filetype; | ||
604 | struct tblock *tblk; | ||
605 | |||
606 | jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip); | ||
607 | |||
608 | filetype = ip->i_mode & S_IFMT; | ||
609 | switch (filetype) { | ||
610 | case S_IFREG: | ||
611 | break; | ||
612 | case S_IFLNK: | ||
613 | /* fast symbolic link */ | ||
614 | if (ip->i_size < IDATASIZE) { | ||
615 | ip->i_size = 0; | ||
616 | return 0; | ||
617 | } | ||
618 | break; | ||
619 | default: | ||
620 | assert(filetype != S_IFDIR); | ||
621 | return 0; | ||
622 | } | ||
623 | |||
624 | set_cflag(COMMIT_Freewmap, ip); | ||
625 | |||
626 | /* mark transaction of block map update type */ | ||
627 | tblk = tid_to_tblock(tid); | ||
628 | tblk->xflag |= COMMIT_PMAP; | ||
629 | |||
630 | /* | ||
631 | * free EA | ||
632 | */ | ||
633 | if (JFS_IP(ip)->ea.flag & DXD_EXTENT) | ||
634 | /* acquire maplock on EA to be freed from block map */ | ||
635 | txEA(tid, ip, &JFS_IP(ip)->ea, NULL); | ||
636 | |||
637 | /* | ||
638 | * free ACL | ||
639 | */ | ||
640 | if (JFS_IP(ip)->acl.flag & DXD_EXTENT) | ||
641 | /* acquire maplock on EA to be freed from block map */ | ||
642 | txEA(tid, ip, &JFS_IP(ip)->acl, NULL); | ||
643 | |||
644 | /* | ||
645 | * free xtree/data (truncate to zero length): | ||
646 | * free xtree/data pages from cache if COMMIT_PWMAP, | ||
647 | * free xtree/data blocks from persistent block map, and | ||
648 | * free xtree/data blocks from working block map if COMMIT_PWMAP; | ||
649 | */ | ||
650 | if (ip->i_size) | ||
651 | return xtTruncate_pmap(tid, ip, 0); | ||
652 | |||
653 | return 0; | ||
654 | } | ||
655 | |||
656 | |||
657 | /* | ||
658 | * NAME: freeZeroLink() | ||
659 | * | ||
660 | * FUNCTION: for non-directory, called by iClose(), | ||
661 | * free resources of a file from cache and WORKING map | ||
662 | * for a file previously committed with zero link count | ||
663 | * while associated with a pager object, | ||
664 | * | ||
665 | * PARAMETER: ip - pointer to inode of file. | ||
666 | * | ||
667 | * RETURN: 0 -ok | ||
668 | */ | ||
669 | int freeZeroLink(struct inode *ip) | ||
670 | { | ||
671 | int rc = 0; | ||
672 | int type; | ||
673 | |||
674 | jfs_info("freeZeroLink: ip = 0x%p", ip); | ||
675 | |||
676 | /* return if not reg or symbolic link or if size is | ||
677 | * already ok. | ||
678 | */ | ||
679 | type = ip->i_mode & S_IFMT; | ||
680 | |||
681 | switch (type) { | ||
682 | case S_IFREG: | ||
683 | break; | ||
684 | case S_IFLNK: | ||
685 | /* if its contained in inode nothing to do */ | ||
686 | if (ip->i_size < IDATASIZE) | ||
687 | return 0; | ||
688 | break; | ||
689 | default: | ||
690 | return 0; | ||
691 | } | ||
692 | |||
693 | /* | ||
694 | * free EA | ||
695 | */ | ||
696 | if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { | ||
697 | s64 xaddr = addressDXD(&JFS_IP(ip)->ea); | ||
698 | int xlen = lengthDXD(&JFS_IP(ip)->ea); | ||
699 | struct maplock maplock; /* maplock for COMMIT_WMAP */ | ||
700 | struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ | ||
701 | |||
702 | /* free EA pages from cache */ | ||
703 | invalidate_dxd_metapages(ip, JFS_IP(ip)->ea); | ||
704 | |||
705 | /* free EA extent from working block map */ | ||
706 | maplock.index = 1; | ||
707 | pxdlock = (struct pxd_lock *) & maplock; | ||
708 | pxdlock->flag = mlckFREEPXD; | ||
709 | PXDaddress(&pxdlock->pxd, xaddr); | ||
710 | PXDlength(&pxdlock->pxd, xlen); | ||
711 | txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); | ||
712 | } | ||
713 | |||
714 | /* | ||
715 | * free ACL | ||
716 | */ | ||
717 | if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { | ||
718 | s64 xaddr = addressDXD(&JFS_IP(ip)->acl); | ||
719 | int xlen = lengthDXD(&JFS_IP(ip)->acl); | ||
720 | struct maplock maplock; /* maplock for COMMIT_WMAP */ | ||
721 | struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ | ||
722 | |||
723 | invalidate_dxd_metapages(ip, JFS_IP(ip)->acl); | ||
724 | |||
725 | /* free ACL extent from working block map */ | ||
726 | maplock.index = 1; | ||
727 | pxdlock = (struct pxd_lock *) & maplock; | ||
728 | pxdlock->flag = mlckFREEPXD; | ||
729 | PXDaddress(&pxdlock->pxd, xaddr); | ||
730 | PXDlength(&pxdlock->pxd, xlen); | ||
731 | txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); | ||
732 | } | ||
733 | |||
734 | /* | ||
735 | * free xtree/data (truncate to zero length): | ||
736 | * free xtree/data pages from cache, and | ||
737 | * free xtree/data blocks from working block map; | ||
738 | */ | ||
739 | if (ip->i_size) | ||
740 | rc = xtTruncate(0, ip, 0, COMMIT_WMAP); | ||
741 | |||
742 | return rc; | ||
743 | } | ||
744 | |||
745 | /* | ||
746 | * NAME: jfs_link(vp, dvp, name, crp) | ||
747 | * | ||
748 | * FUNCTION: create a link to <vp> by the name = <name> | ||
749 | * in the parent directory <dvp> | ||
750 | * | ||
751 | * PARAMETER: vp - target object | ||
752 | * dvp - parent directory of new link | ||
753 | * name - name of new link to target object | ||
754 | * crp - credential | ||
755 | * | ||
756 | * RETURN: Errors from subroutines | ||
757 | * | ||
758 | * note: | ||
759 | * JFS does NOT support link() on directories (to prevent circular | ||
760 | * path in the directory hierarchy); | ||
761 | * EPERM: the target object is a directory, and either the caller | ||
762 | * does not have appropriate privileges or the implementation prohibits | ||
763 | * using link() on directories [XPG4.2]. | ||
764 | * | ||
765 | * JFS does NOT support links between file systems: | ||
766 | * EXDEV: target object and new link are on different file systems and | ||
767 | * implementation does not support links between file systems [XPG4.2]. | ||
768 | */ | ||
769 | static int jfs_link(struct dentry *old_dentry, | ||
770 | struct inode *dir, struct dentry *dentry) | ||
771 | { | ||
772 | int rc; | ||
773 | tid_t tid; | ||
774 | struct inode *ip = old_dentry->d_inode; | ||
775 | ino_t ino; | ||
776 | struct component_name dname; | ||
777 | struct btstack btstack; | ||
778 | struct inode *iplist[2]; | ||
779 | |||
780 | jfs_info("jfs_link: %s %s", old_dentry->d_name.name, | ||
781 | dentry->d_name.name); | ||
782 | |||
783 | if (ip->i_nlink == JFS_LINK_MAX) | ||
784 | return -EMLINK; | ||
785 | |||
786 | if (ip->i_nlink == 0) | ||
787 | return -ENOENT; | ||
788 | |||
789 | tid = txBegin(ip->i_sb, 0); | ||
790 | |||
791 | down(&JFS_IP(dir)->commit_sem); | ||
792 | down(&JFS_IP(ip)->commit_sem); | ||
793 | |||
794 | /* | ||
795 | * scan parent directory for entry/freespace | ||
796 | */ | ||
797 | if ((rc = get_UCSname(&dname, dentry))) | ||
798 | goto out; | ||
799 | |||
800 | if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) | ||
801 | goto free_dname; | ||
802 | |||
803 | /* | ||
804 | * create entry for new link in parent directory | ||
805 | */ | ||
806 | ino = ip->i_ino; | ||
807 | if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) | ||
808 | goto free_dname; | ||
809 | |||
810 | /* update object inode */ | ||
811 | ip->i_nlink++; /* for new link */ | ||
812 | ip->i_ctime = CURRENT_TIME; | ||
813 | mark_inode_dirty(dir); | ||
814 | atomic_inc(&ip->i_count); | ||
815 | |||
816 | iplist[0] = ip; | ||
817 | iplist[1] = dir; | ||
818 | rc = txCommit(tid, 2, &iplist[0], 0); | ||
819 | |||
820 | if (rc) { | ||
821 | ip->i_nlink--; | ||
822 | iput(ip); | ||
823 | } else | ||
824 | d_instantiate(dentry, ip); | ||
825 | |||
826 | free_dname: | ||
827 | free_UCSname(&dname); | ||
828 | |||
829 | out: | ||
830 | txEnd(tid); | ||
831 | |||
832 | up(&JFS_IP(dir)->commit_sem); | ||
833 | up(&JFS_IP(ip)->commit_sem); | ||
834 | |||
835 | jfs_info("jfs_link: rc:%d", rc); | ||
836 | return rc; | ||
837 | } | ||
838 | |||
839 | /* | ||
840 | * NAME: jfs_symlink(dip, dentry, name) | ||
841 | * | ||
842 | * FUNCTION: creates a symbolic link to <symlink> by name <name> | ||
843 | * in directory <dip> | ||
844 | * | ||
845 | * PARAMETER: dip - parent directory vnode | ||
846 | * dentry - dentry of symbolic link | ||
847 | * name - the path name of the existing object | ||
848 | * that will be the source of the link | ||
849 | * | ||
850 | * RETURN: errors from subroutines | ||
851 | * | ||
852 | * note: | ||
853 | * ENAMETOOLONG: pathname resolution of a symbolic link produced | ||
854 | * an intermediate result whose length exceeds PATH_MAX [XPG4.2] | ||
855 | */ | ||
856 | |||
857 | static int jfs_symlink(struct inode *dip, struct dentry *dentry, | ||
858 | const char *name) | ||
859 | { | ||
860 | int rc; | ||
861 | tid_t tid; | ||
862 | ino_t ino = 0; | ||
863 | struct component_name dname; | ||
864 | int ssize; /* source pathname size */ | ||
865 | struct btstack btstack; | ||
866 | struct inode *ip = dentry->d_inode; | ||
867 | unchar *i_fastsymlink; | ||
868 | s64 xlen = 0; | ||
869 | int bmask = 0, xsize; | ||
870 | s64 extent = 0, xaddr; | ||
871 | struct metapage *mp; | ||
872 | struct super_block *sb; | ||
873 | struct tblock *tblk; | ||
874 | |||
875 | struct inode *iplist[2]; | ||
876 | |||
877 | jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); | ||
878 | |||
879 | ssize = strlen(name) + 1; | ||
880 | |||
881 | /* | ||
882 | * search parent directory for entry/freespace | ||
883 | * (dtSearch() returns parent directory page pinned) | ||
884 | */ | ||
885 | |||
886 | if ((rc = get_UCSname(&dname, dentry))) | ||
887 | goto out1; | ||
888 | |||
889 | /* | ||
890 | * allocate on-disk/in-memory inode for symbolic link: | ||
891 | * (iAlloc() returns new, locked inode) | ||
892 | */ | ||
893 | ip = ialloc(dip, S_IFLNK | 0777); | ||
894 | if (ip == NULL) { | ||
895 | rc = -ENOSPC; | ||
896 | goto out2; | ||
897 | } | ||
898 | |||
899 | tid = txBegin(dip->i_sb, 0); | ||
900 | |||
901 | down(&JFS_IP(dip)->commit_sem); | ||
902 | down(&JFS_IP(ip)->commit_sem); | ||
903 | |||
904 | tblk = tid_to_tblock(tid); | ||
905 | tblk->xflag |= COMMIT_CREATE; | ||
906 | tblk->ino = ip->i_ino; | ||
907 | tblk->u.ixpxd = JFS_IP(ip)->ixpxd; | ||
908 | |||
909 | /* fix symlink access permission | ||
910 | * (dir_create() ANDs in the u.u_cmask, | ||
911 | * but symlinks really need to be 777 access) | ||
912 | */ | ||
913 | ip->i_mode |= 0777; | ||
914 | |||
915 | /* | ||
916 | * write symbolic link target path name | ||
917 | */ | ||
918 | xtInitRoot(tid, ip); | ||
919 | |||
920 | /* | ||
921 | * write source path name inline in on-disk inode (fast symbolic link) | ||
922 | */ | ||
923 | |||
924 | if (ssize <= IDATASIZE) { | ||
925 | ip->i_op = &jfs_symlink_inode_operations; | ||
926 | |||
927 | i_fastsymlink = JFS_IP(ip)->i_inline; | ||
928 | memcpy(i_fastsymlink, name, ssize); | ||
929 | ip->i_size = ssize - 1; | ||
930 | |||
931 | /* | ||
932 | * if symlink is > 128 bytes, we don't have the space to | ||
933 | * store inline extended attributes | ||
934 | */ | ||
935 | if (ssize > sizeof (JFS_IP(ip)->i_inline)) | ||
936 | JFS_IP(ip)->mode2 &= ~INLINEEA; | ||
937 | |||
938 | jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", | ||
939 | ssize, name); | ||
940 | } | ||
941 | /* | ||
942 | * write source path name in a single extent | ||
943 | */ | ||
944 | else { | ||
945 | jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); | ||
946 | |||
947 | ip->i_op = &page_symlink_inode_operations; | ||
948 | ip->i_mapping->a_ops = &jfs_aops; | ||
949 | |||
950 | /* | ||
951 | * even though the data of symlink object (source | ||
952 | * path name) is treated as non-journaled user data, | ||
953 | * it is read/written thru buffer cache for performance. | ||
954 | */ | ||
955 | sb = ip->i_sb; | ||
956 | bmask = JFS_SBI(sb)->bsize - 1; | ||
957 | xsize = (ssize + bmask) & ~bmask; | ||
958 | xaddr = 0; | ||
959 | xlen = xsize >> JFS_SBI(sb)->l2bsize; | ||
960 | if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) { | ||
961 | txAbort(tid, 0); | ||
962 | rc = -ENOSPC; | ||
963 | goto out3; | ||
964 | } | ||
965 | extent = xaddr; | ||
966 | ip->i_size = ssize - 1; | ||
967 | while (ssize) { | ||
968 | /* This is kind of silly since PATH_MAX == 4K */ | ||
969 | int copy_size = min(ssize, PSIZE); | ||
970 | |||
971 | mp = get_metapage(ip, xaddr, PSIZE, 1); | ||
972 | |||
973 | if (mp == NULL) { | ||
974 | xtTruncate(tid, ip, 0, COMMIT_PWMAP); | ||
975 | rc = -EIO; | ||
976 | txAbort(tid, 0); | ||
977 | goto out3; | ||
978 | } | ||
979 | memcpy(mp->data, name, copy_size); | ||
980 | flush_metapage(mp); | ||
981 | ssize -= copy_size; | ||
982 | name += copy_size; | ||
983 | xaddr += JFS_SBI(sb)->nbperpage; | ||
984 | } | ||
985 | } | ||
986 | |||
987 | /* | ||
988 | * create entry for symbolic link in parent directory | ||
989 | */ | ||
990 | rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE); | ||
991 | if (rc == 0) { | ||
992 | ino = ip->i_ino; | ||
993 | rc = dtInsert(tid, dip, &dname, &ino, &btstack); | ||
994 | } | ||
995 | if (rc) { | ||
996 | if (xlen) | ||
997 | xtTruncate(tid, ip, 0, COMMIT_PWMAP); | ||
998 | txAbort(tid, 0); | ||
999 | /* discard new inode */ | ||
1000 | goto out3; | ||
1001 | } | ||
1002 | |||
1003 | insert_inode_hash(ip); | ||
1004 | mark_inode_dirty(ip); | ||
1005 | |||
1006 | /* | ||
1007 | * commit update of parent directory and link object | ||
1008 | */ | ||
1009 | |||
1010 | iplist[0] = dip; | ||
1011 | iplist[1] = ip; | ||
1012 | rc = txCommit(tid, 2, &iplist[0], 0); | ||
1013 | |||
1014 | out3: | ||
1015 | txEnd(tid); | ||
1016 | up(&JFS_IP(dip)->commit_sem); | ||
1017 | up(&JFS_IP(ip)->commit_sem); | ||
1018 | if (rc) { | ||
1019 | ip->i_nlink = 0; | ||
1020 | iput(ip); | ||
1021 | } else | ||
1022 | d_instantiate(dentry, ip); | ||
1023 | |||
1024 | out2: | ||
1025 | free_UCSname(&dname); | ||
1026 | |||
1027 | #ifdef CONFIG_JFS_POSIX_ACL | ||
1028 | if (rc == 0) | ||
1029 | jfs_init_acl(ip, dip); | ||
1030 | #endif | ||
1031 | |||
1032 | out1: | ||
1033 | jfs_info("jfs_symlink: rc:%d", rc); | ||
1034 | return rc; | ||
1035 | } | ||
1036 | |||
1037 | |||
1038 | /* | ||
1039 | * NAME: jfs_rename | ||
1040 | * | ||
1041 | * FUNCTION: rename a file or directory | ||
1042 | */ | ||
1043 | static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
1044 | struct inode *new_dir, struct dentry *new_dentry) | ||
1045 | { | ||
1046 | struct btstack btstack; | ||
1047 | ino_t ino; | ||
1048 | struct component_name new_dname; | ||
1049 | struct inode *new_ip; | ||
1050 | struct component_name old_dname; | ||
1051 | struct inode *old_ip; | ||
1052 | int rc; | ||
1053 | tid_t tid; | ||
1054 | struct tlock *tlck; | ||
1055 | struct dt_lock *dtlck; | ||
1056 | struct lv *lv; | ||
1057 | int ipcount; | ||
1058 | struct inode *iplist[4]; | ||
1059 | struct tblock *tblk; | ||
1060 | s64 new_size = 0; | ||
1061 | int commit_flag; | ||
1062 | |||
1063 | |||
1064 | jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, | ||
1065 | new_dentry->d_name.name); | ||
1066 | |||
1067 | old_ip = old_dentry->d_inode; | ||
1068 | new_ip = new_dentry->d_inode; | ||
1069 | |||
1070 | if ((rc = get_UCSname(&old_dname, old_dentry))) | ||
1071 | goto out1; | ||
1072 | |||
1073 | if ((rc = get_UCSname(&new_dname, new_dentry))) | ||
1074 | goto out2; | ||
1075 | |||
1076 | /* | ||
1077 | * Make sure source inode number is what we think it is | ||
1078 | */ | ||
1079 | rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP); | ||
1080 | if (rc || (ino != old_ip->i_ino)) { | ||
1081 | rc = -ENOENT; | ||
1082 | goto out3; | ||
1083 | } | ||
1084 | |||
1085 | /* | ||
1086 | * Make sure dest inode number (if any) is what we think it is | ||
1087 | */ | ||
1088 | rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); | ||
1089 | if (rc == 0) { | ||
1090 | if ((new_ip == 0) || (ino != new_ip->i_ino)) { | ||
1091 | rc = -ESTALE; | ||
1092 | goto out3; | ||
1093 | } | ||
1094 | } else if (rc != -ENOENT) | ||
1095 | goto out3; | ||
1096 | else if (new_ip) { | ||
1097 | /* no entry exists, but one was expected */ | ||
1098 | rc = -ESTALE; | ||
1099 | goto out3; | ||
1100 | } | ||
1101 | |||
1102 | if (S_ISDIR(old_ip->i_mode)) { | ||
1103 | if (new_ip) { | ||
1104 | if (!dtEmpty(new_ip)) { | ||
1105 | rc = -ENOTEMPTY; | ||
1106 | goto out3; | ||
1107 | } | ||
1108 | } else if ((new_dir != old_dir) && | ||
1109 | (new_dir->i_nlink == JFS_LINK_MAX)) { | ||
1110 | rc = -EMLINK; | ||
1111 | goto out3; | ||
1112 | } | ||
1113 | } else if (new_ip) { | ||
1114 | IWRITE_LOCK(new_ip); | ||
1115 | /* Init inode for quota operations. */ | ||
1116 | DQUOT_INIT(new_ip); | ||
1117 | } | ||
1118 | |||
1119 | /* | ||
1120 | * The real work starts here | ||
1121 | */ | ||
1122 | tid = txBegin(new_dir->i_sb, 0); | ||
1123 | |||
1124 | down(&JFS_IP(new_dir)->commit_sem); | ||
1125 | down(&JFS_IP(old_ip)->commit_sem); | ||
1126 | if (old_dir != new_dir) | ||
1127 | down(&JFS_IP(old_dir)->commit_sem); | ||
1128 | |||
1129 | if (new_ip) { | ||
1130 | down(&JFS_IP(new_ip)->commit_sem); | ||
1131 | /* | ||
1132 | * Change existing directory entry to new inode number | ||
1133 | */ | ||
1134 | ino = new_ip->i_ino; | ||
1135 | rc = dtModify(tid, new_dir, &new_dname, &ino, | ||
1136 | old_ip->i_ino, JFS_RENAME); | ||
1137 | if (rc) | ||
1138 | goto out4; | ||
1139 | new_ip->i_nlink--; | ||
1140 | if (S_ISDIR(new_ip->i_mode)) { | ||
1141 | new_ip->i_nlink--; | ||
1142 | if (new_ip->i_nlink) { | ||
1143 | up(&JFS_IP(new_dir)->commit_sem); | ||
1144 | up(&JFS_IP(old_ip)->commit_sem); | ||
1145 | if (old_dir != new_dir) | ||
1146 | up(&JFS_IP(old_dir)->commit_sem); | ||
1147 | if (!S_ISDIR(old_ip->i_mode) && new_ip) | ||
1148 | IWRITE_UNLOCK(new_ip); | ||
1149 | jfs_error(new_ip->i_sb, | ||
1150 | "jfs_rename: new_ip->i_nlink != 0"); | ||
1151 | return -EIO; | ||
1152 | } | ||
1153 | tblk = tid_to_tblock(tid); | ||
1154 | tblk->xflag |= COMMIT_DELETE; | ||
1155 | tblk->u.ip = new_ip; | ||
1156 | } else if (new_ip->i_nlink == 0) { | ||
1157 | assert(!test_cflag(COMMIT_Nolink, new_ip)); | ||
1158 | /* free block resources */ | ||
1159 | if ((new_size = commitZeroLink(tid, new_ip)) < 0) { | ||
1160 | txAbort(tid, 1); /* Marks FS Dirty */ | ||
1161 | rc = new_size; | ||
1162 | goto out4; | ||
1163 | } | ||
1164 | tblk = tid_to_tblock(tid); | ||
1165 | tblk->xflag |= COMMIT_DELETE; | ||
1166 | tblk->u.ip = new_ip; | ||
1167 | } else { | ||
1168 | new_ip->i_ctime = CURRENT_TIME; | ||
1169 | mark_inode_dirty(new_ip); | ||
1170 | } | ||
1171 | } else { | ||
1172 | /* | ||
1173 | * Add new directory entry | ||
1174 | */ | ||
1175 | rc = dtSearch(new_dir, &new_dname, &ino, &btstack, | ||
1176 | JFS_CREATE); | ||
1177 | if (rc) { | ||
1178 | jfs_err("jfs_rename didn't expect dtSearch to fail " | ||
1179 | "w/rc = %d", rc); | ||
1180 | goto out4; | ||
1181 | } | ||
1182 | |||
1183 | ino = old_ip->i_ino; | ||
1184 | rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack); | ||
1185 | if (rc) { | ||
1186 | if (rc == -EIO) | ||
1187 | jfs_err("jfs_rename: dtInsert returned -EIO"); | ||
1188 | goto out4; | ||
1189 | } | ||
1190 | if (S_ISDIR(old_ip->i_mode)) | ||
1191 | new_dir->i_nlink++; | ||
1192 | } | ||
1193 | /* | ||
1194 | * Remove old directory entry | ||
1195 | */ | ||
1196 | |||
1197 | ino = old_ip->i_ino; | ||
1198 | rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE); | ||
1199 | if (rc) { | ||
1200 | jfs_err("jfs_rename did not expect dtDelete to return rc = %d", | ||
1201 | rc); | ||
1202 | txAbort(tid, 1); /* Marks Filesystem dirty */ | ||
1203 | goto out4; | ||
1204 | } | ||
1205 | if (S_ISDIR(old_ip->i_mode)) { | ||
1206 | old_dir->i_nlink--; | ||
1207 | if (old_dir != new_dir) { | ||
1208 | /* | ||
1209 | * Change inode number of parent for moved directory | ||
1210 | */ | ||
1211 | |||
1212 | JFS_IP(old_ip)->i_dtroot.header.idotdot = | ||
1213 | cpu_to_le32(new_dir->i_ino); | ||
1214 | |||
1215 | /* Linelock header of dtree */ | ||
1216 | tlck = txLock(tid, old_ip, | ||
1217 | (struct metapage *) &JFS_IP(old_ip)->bxflag, | ||
1218 | tlckDTREE | tlckBTROOT | tlckRELINK); | ||
1219 | dtlck = (struct dt_lock *) & tlck->lock; | ||
1220 | ASSERT(dtlck->index == 0); | ||
1221 | lv = & dtlck->lv[0]; | ||
1222 | lv->offset = 0; | ||
1223 | lv->length = 1; | ||
1224 | dtlck->index++; | ||
1225 | } | ||
1226 | } | ||
1227 | |||
1228 | /* | ||
1229 | * Update ctime on changed/moved inodes & mark dirty | ||
1230 | */ | ||
1231 | old_ip->i_ctime = CURRENT_TIME; | ||
1232 | mark_inode_dirty(old_ip); | ||
1233 | |||
1234 | new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); | ||
1235 | mark_inode_dirty(new_dir); | ||
1236 | |||
1237 | /* Build list of inodes modified by this transaction */ | ||
1238 | ipcount = 0; | ||
1239 | iplist[ipcount++] = old_ip; | ||
1240 | if (new_ip) | ||
1241 | iplist[ipcount++] = new_ip; | ||
1242 | iplist[ipcount++] = old_dir; | ||
1243 | |||
1244 | if (old_dir != new_dir) { | ||
1245 | iplist[ipcount++] = new_dir; | ||
1246 | old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; | ||
1247 | mark_inode_dirty(old_dir); | ||
1248 | } | ||
1249 | |||
1250 | /* | ||
1251 | * Incomplete truncate of file data can | ||
1252 | * result in timing problems unless we synchronously commit the | ||
1253 | * transaction. | ||
1254 | */ | ||
1255 | if (new_size) | ||
1256 | commit_flag = COMMIT_SYNC; | ||
1257 | else | ||
1258 | commit_flag = 0; | ||
1259 | |||
1260 | rc = txCommit(tid, ipcount, iplist, commit_flag); | ||
1261 | |||
1262 | out4: | ||
1263 | txEnd(tid); | ||
1264 | |||
1265 | up(&JFS_IP(new_dir)->commit_sem); | ||
1266 | up(&JFS_IP(old_ip)->commit_sem); | ||
1267 | if (old_dir != new_dir) | ||
1268 | up(&JFS_IP(old_dir)->commit_sem); | ||
1269 | if (new_ip) | ||
1270 | up(&JFS_IP(new_ip)->commit_sem); | ||
1271 | |||
1272 | while (new_size && (rc == 0)) { | ||
1273 | tid = txBegin(new_ip->i_sb, 0); | ||
1274 | down(&JFS_IP(new_ip)->commit_sem); | ||
1275 | new_size = xtTruncate_pmap(tid, new_ip, new_size); | ||
1276 | if (new_size < 0) { | ||
1277 | txAbort(tid, 1); | ||
1278 | rc = new_size; | ||
1279 | } else | ||
1280 | rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); | ||
1281 | txEnd(tid); | ||
1282 | up(&JFS_IP(new_ip)->commit_sem); | ||
1283 | } | ||
1284 | if (new_ip && (new_ip->i_nlink == 0)) | ||
1285 | set_cflag(COMMIT_Nolink, new_ip); | ||
1286 | out3: | ||
1287 | free_UCSname(&new_dname); | ||
1288 | out2: | ||
1289 | free_UCSname(&old_dname); | ||
1290 | out1: | ||
1291 | if (new_ip && !S_ISDIR(new_ip->i_mode)) | ||
1292 | IWRITE_UNLOCK(new_ip); | ||
1293 | /* | ||
1294 | * Truncating the directory index table is not guaranteed. It | ||
1295 | * may need to be done iteratively | ||
1296 | */ | ||
1297 | if (test_cflag(COMMIT_Stale, old_dir)) { | ||
1298 | if (old_dir->i_size > 1) | ||
1299 | jfs_truncate_nolock(old_dir, 0); | ||
1300 | |||
1301 | clear_cflag(COMMIT_Stale, old_dir); | ||
1302 | } | ||
1303 | |||
1304 | jfs_info("jfs_rename: returning %d", rc); | ||
1305 | return rc; | ||
1306 | } | ||
1307 | |||
1308 | |||
1309 | /* | ||
1310 | * NAME: jfs_mknod | ||
1311 | * | ||
1312 | * FUNCTION: Create a special file (device) | ||
1313 | */ | ||
1314 | static int jfs_mknod(struct inode *dir, struct dentry *dentry, | ||
1315 | int mode, dev_t rdev) | ||
1316 | { | ||
1317 | struct jfs_inode_info *jfs_ip; | ||
1318 | struct btstack btstack; | ||
1319 | struct component_name dname; | ||
1320 | ino_t ino; | ||
1321 | struct inode *ip; | ||
1322 | struct inode *iplist[2]; | ||
1323 | int rc; | ||
1324 | tid_t tid; | ||
1325 | struct tblock *tblk; | ||
1326 | |||
1327 | if (!new_valid_dev(rdev)) | ||
1328 | return -EINVAL; | ||
1329 | |||
1330 | jfs_info("jfs_mknod: %s", dentry->d_name.name); | ||
1331 | |||
1332 | if ((rc = get_UCSname(&dname, dentry))) | ||
1333 | goto out; | ||
1334 | |||
1335 | ip = ialloc(dir, mode); | ||
1336 | if (ip == NULL) { | ||
1337 | rc = -ENOSPC; | ||
1338 | goto out1; | ||
1339 | } | ||
1340 | jfs_ip = JFS_IP(ip); | ||
1341 | |||
1342 | tid = txBegin(dir->i_sb, 0); | ||
1343 | |||
1344 | down(&JFS_IP(dir)->commit_sem); | ||
1345 | down(&JFS_IP(ip)->commit_sem); | ||
1346 | |||
1347 | if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) | ||
1348 | goto out3; | ||
1349 | |||
1350 | tblk = tid_to_tblock(tid); | ||
1351 | tblk->xflag |= COMMIT_CREATE; | ||
1352 | tblk->ino = ip->i_ino; | ||
1353 | tblk->u.ixpxd = JFS_IP(ip)->ixpxd; | ||
1354 | |||
1355 | ino = ip->i_ino; | ||
1356 | if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) | ||
1357 | goto out3; | ||
1358 | |||
1359 | ip->i_op = &jfs_file_inode_operations; | ||
1360 | jfs_ip->dev = new_encode_dev(rdev); | ||
1361 | init_special_inode(ip, ip->i_mode, rdev); | ||
1362 | |||
1363 | insert_inode_hash(ip); | ||
1364 | mark_inode_dirty(ip); | ||
1365 | |||
1366 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | ||
1367 | |||
1368 | mark_inode_dirty(dir); | ||
1369 | |||
1370 | iplist[0] = dir; | ||
1371 | iplist[1] = ip; | ||
1372 | rc = txCommit(tid, 2, iplist, 0); | ||
1373 | |||
1374 | out3: | ||
1375 | txEnd(tid); | ||
1376 | up(&JFS_IP(ip)->commit_sem); | ||
1377 | up(&JFS_IP(dir)->commit_sem); | ||
1378 | if (rc) { | ||
1379 | ip->i_nlink = 0; | ||
1380 | iput(ip); | ||
1381 | } else | ||
1382 | d_instantiate(dentry, ip); | ||
1383 | |||
1384 | out1: | ||
1385 | free_UCSname(&dname); | ||
1386 | |||
1387 | #ifdef CONFIG_JFS_POSIX_ACL | ||
1388 | if (rc == 0) | ||
1389 | jfs_init_acl(ip, dir); | ||
1390 | #endif | ||
1391 | |||
1392 | out: | ||
1393 | jfs_info("jfs_mknod: returning %d", rc); | ||
1394 | return rc; | ||
1395 | } | ||
1396 | |||
1397 | static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) | ||
1398 | { | ||
1399 | struct btstack btstack; | ||
1400 | ino_t inum; | ||
1401 | struct inode *ip; | ||
1402 | struct component_name key; | ||
1403 | const char *name = dentry->d_name.name; | ||
1404 | int len = dentry->d_name.len; | ||
1405 | int rc; | ||
1406 | |||
1407 | jfs_info("jfs_lookup: name = %s", name); | ||
1408 | |||
1409 | |||
1410 | if ((name[0] == '.') && (len == 1)) | ||
1411 | inum = dip->i_ino; | ||
1412 | else if (strcmp(name, "..") == 0) | ||
1413 | inum = PARENT(dip); | ||
1414 | else { | ||
1415 | if ((rc = get_UCSname(&key, dentry))) | ||
1416 | return ERR_PTR(rc); | ||
1417 | rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); | ||
1418 | free_UCSname(&key); | ||
1419 | if (rc == -ENOENT) { | ||
1420 | d_add(dentry, NULL); | ||
1421 | return ERR_PTR(0); | ||
1422 | } else if (rc) { | ||
1423 | jfs_err("jfs_lookup: dtSearch returned %d", rc); | ||
1424 | return ERR_PTR(rc); | ||
1425 | } | ||
1426 | } | ||
1427 | |||
1428 | ip = iget(dip->i_sb, inum); | ||
1429 | if (ip == NULL || is_bad_inode(ip)) { | ||
1430 | jfs_err("jfs_lookup: iget failed on inum %d", (uint) inum); | ||
1431 | if (ip) | ||
1432 | iput(ip); | ||
1433 | return ERR_PTR(-EACCES); | ||
1434 | } | ||
1435 | |||
1436 | if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2) | ||
1437 | dentry->d_op = &jfs_ci_dentry_operations; | ||
1438 | |||
1439 | dentry = d_splice_alias(ip, dentry); | ||
1440 | |||
1441 | if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)) | ||
1442 | dentry->d_op = &jfs_ci_dentry_operations; | ||
1443 | |||
1444 | return dentry; | ||
1445 | } | ||
1446 | |||
1447 | struct dentry *jfs_get_parent(struct dentry *dentry) | ||
1448 | { | ||
1449 | struct super_block *sb = dentry->d_inode->i_sb; | ||
1450 | struct dentry *parent = ERR_PTR(-ENOENT); | ||
1451 | struct inode *inode; | ||
1452 | unsigned long parent_ino; | ||
1453 | |||
1454 | parent_ino = | ||
1455 | le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot); | ||
1456 | inode = iget(sb, parent_ino); | ||
1457 | if (inode) { | ||
1458 | if (is_bad_inode(inode)) { | ||
1459 | iput(inode); | ||
1460 | parent = ERR_PTR(-EACCES); | ||
1461 | } else { | ||
1462 | parent = d_alloc_anon(inode); | ||
1463 | if (!parent) { | ||
1464 | parent = ERR_PTR(-ENOMEM); | ||
1465 | iput(inode); | ||
1466 | } | ||
1467 | } | ||
1468 | } | ||
1469 | |||
1470 | return parent; | ||
1471 | } | ||
1472 | |||
/*
 * Inode operations for JFS directories.
 *
 * The setattr and permission hooks are only supplied when the kernel is
 * built with CONFIG_JFS_POSIX_ACL.
 */
struct inode_operations jfs_dir_inode_operations = {
	.create		= jfs_create,
	.lookup		= jfs_lookup,
	.link		= jfs_link,
	.unlink		= jfs_unlink,
	.symlink	= jfs_symlink,
	.mkdir		= jfs_mkdir,
	.rmdir		= jfs_rmdir,
	.mknod		= jfs_mknod,
	.rename		= jfs_rename,
	/* extended attribute handlers */
	.setxattr	= jfs_setxattr,
	.getxattr	= jfs_getxattr,
	.listxattr	= jfs_listxattr,
	.removexattr	= jfs_removexattr,
#ifdef CONFIG_JFS_POSIX_ACL
	.setattr	= jfs_setattr,
	.permission	= jfs_permission,
#endif
};
1492 | |||
/*
 * File operations for JFS directories: read() goes to the generic
 * directory handler, readdir() and fsync() to the JFS routines.
 */
struct file_operations jfs_dir_operations = {
	.read		= generic_read_dir,
	.readdir	= jfs_readdir,
	.fsync		= jfs_fsync,
};
1498 | |||
1499 | static int jfs_ci_hash(struct dentry *dir, struct qstr *this) | ||
1500 | { | ||
1501 | unsigned long hash; | ||
1502 | int i; | ||
1503 | |||
1504 | hash = init_name_hash(); | ||
1505 | for (i=0; i < this->len; i++) | ||
1506 | hash = partial_name_hash(tolower(this->name[i]), hash); | ||
1507 | this->hash = end_name_hash(hash); | ||
1508 | |||
1509 | return 0; | ||
1510 | } | ||
1511 | |||
1512 | static int jfs_ci_compare(struct dentry *dir, struct qstr *a, struct qstr *b) | ||
1513 | { | ||
1514 | int i, result = 1; | ||
1515 | |||
1516 | if (a->len != b->len) | ||
1517 | goto out; | ||
1518 | for (i=0; i < a->len; i++) { | ||
1519 | if (tolower(a->name[i]) != tolower(b->name[i])) | ||
1520 | goto out; | ||
1521 | } | ||
1522 | result = 0; | ||
1523 | |||
1524 | /* | ||
1525 | * We want creates to preserve case. A negative dentry, a, that | ||
1526 | * has a different case than b may cause a new entry to be created | ||
1527 | * with the wrong case. Since we can't tell if a comes from a negative | ||
1528 | * dentry, we blindly replace it with b. This should be harmless if | ||
1529 | * a is not a negative dentry. | ||
1530 | */ | ||
1531 | memcpy((unsigned char *)a->name, b->name, a->len); | ||
1532 | out: | ||
1533 | return result; | ||
1534 | } | ||
1535 | |||
/*
 * Case-insensitive dentry operations; jfs_lookup() attaches them to
 * dentries on mounts that have the JFS_OS2 flag set.
 */
struct dentry_operations jfs_ci_dentry_operations =
{
	.d_hash = jfs_ci_hash,
	.d_compare = jfs_ci_compare,
};
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c new file mode 100644 index 000000000000..2eb6869b6e72 --- /dev/null +++ b/fs/jfs/resize.c | |||
@@ -0,0 +1,537 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/buffer_head.h> | ||
21 | #include <linux/quotaops.h> | ||
22 | #include "jfs_incore.h" | ||
23 | #include "jfs_filsys.h" | ||
24 | #include "jfs_metapage.h" | ||
25 | #include "jfs_dinode.h" | ||
26 | #include "jfs_imap.h" | ||
27 | #include "jfs_dmap.h" | ||
28 | #include "jfs_superblock.h" | ||
29 | #include "jfs_txnmgr.h" | ||
30 | #include "jfs_debug.h" | ||
31 | |||
/* number of bits in one page of PSIZE bytes */
#define BITSPERPAGE     (PSIZE << 3)
/* log2 of one megabyte, and one megabyte in bytes */
#define L2MEGABYTE      20
#define MEGABYTE        (1 << L2MEGABYTE)
/* 32 megabytes in bytes */
#define MEGABYTE32     (MEGABYTE << 5)

/* convert block number to bmap file page number */
#define BLKTODMAPN(b)\
        (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1)
40 | |||
41 | /* | ||
42 | * jfs_extendfs() | ||
43 | * | ||
44 | * function: extend file system; | ||
45 | * | ||
46 | * |-------------------------------|----------|----------| | ||
47 | * file system space fsck inline log | ||
48 | * workspace space | ||
49 | * | ||
50 | * input: | ||
51 | * new LVSize: in LV blocks (required) | ||
52 | * new LogSize: in LV blocks (optional) | ||
53 | * new FSSize: in LV blocks (optional) | ||
54 | * | ||
55 | * new configuration: | ||
56 | * 1. set new LogSize as specified or default from new LVSize; | ||
57 | * 2. compute new FSCKSize from new LVSize; | ||
58 | * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where | ||
59 | * assert(new FSSize >= old FSSize), | ||
60 | * i.e., file system must not be shrunk; | ||
61 | */ | ||
62 | int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) | ||
63 | { | ||
64 | int rc = 0; | ||
65 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
66 | struct inode *ipbmap = sbi->ipbmap; | ||
67 | struct inode *ipbmap2; | ||
68 | struct inode *ipimap = sbi->ipimap; | ||
69 | struct jfs_log *log = sbi->log; | ||
70 | struct bmap *bmp = sbi->bmap; | ||
71 | s64 newLogAddress, newFSCKAddress; | ||
72 | int newFSCKSize; | ||
73 | s64 newMapSize = 0, mapSize; | ||
74 | s64 XAddress, XSize, nblocks, xoff, xaddr, t64; | ||
75 | s64 oldLVSize; | ||
76 | s64 newFSSize; | ||
77 | s64 VolumeSize; | ||
78 | int newNpages = 0, nPages, newPage, xlen, t32; | ||
79 | int tid; | ||
80 | int log_formatted = 0; | ||
81 | struct inode *iplist[1]; | ||
82 | struct jfs_superblock *j_sb, *j_sb2; | ||
83 | uint old_agsize; | ||
84 | struct buffer_head *bh, *bh2; | ||
85 | |||
86 | /* If the volume hasn't grown, get out now */ | ||
87 | |||
88 | if (sbi->mntflag & JFS_INLINELOG) | ||
89 | oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd); | ||
90 | else | ||
91 | oldLVSize = addressPXD(&sbi->fsckpxd) + | ||
92 | lengthPXD(&sbi->fsckpxd); | ||
93 | |||
94 | if (oldLVSize >= newLVSize) { | ||
95 | printk(KERN_WARNING | ||
96 | "jfs_extendfs: volume hasn't grown, returning\n"); | ||
97 | goto out; | ||
98 | } | ||
99 | |||
100 | VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; | ||
101 | |||
102 | if (VolumeSize) { | ||
103 | if (newLVSize > VolumeSize) { | ||
104 | printk(KERN_WARNING "jfs_extendfs: invalid size\n"); | ||
105 | rc = -EINVAL; | ||
106 | goto out; | ||
107 | } | ||
108 | } else { | ||
109 | /* check the device */ | ||
110 | bh = sb_bread(sb, newLVSize - 1); | ||
111 | if (!bh) { | ||
112 | printk(KERN_WARNING "jfs_extendfs: invalid size\n"); | ||
113 | rc = -EINVAL; | ||
114 | goto out; | ||
115 | } | ||
116 | bforget(bh); | ||
117 | } | ||
118 | |||
119 | /* Can't extend write-protected drive */ | ||
120 | |||
121 | if (isReadOnly(ipbmap)) { | ||
122 | printk(KERN_WARNING "jfs_extendfs: read-only file system\n"); | ||
123 | rc = -EROFS; | ||
124 | goto out; | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * reconfigure LV spaces | ||
129 | * --------------------- | ||
130 | * | ||
131 | * validate new size, or, if not specified, determine new size | ||
132 | */ | ||
133 | |||
134 | /* | ||
135 | * reconfigure inline log space: | ||
136 | */ | ||
137 | if ((sbi->mntflag & JFS_INLINELOG)) { | ||
138 | if (newLogSize == 0) { | ||
139 | /* | ||
140 | * no size specified: default to 1/256 of aggregate | ||
141 | * size; rounded up to a megabyte boundary; | ||
142 | */ | ||
143 | newLogSize = newLVSize >> 8; | ||
144 | t32 = (1 << (20 - sbi->l2bsize)) - 1; | ||
145 | newLogSize = (newLogSize + t32) & ~t32; | ||
146 | newLogSize = | ||
147 | min(newLogSize, MEGABYTE32 >> sbi->l2bsize); | ||
148 | } else { | ||
149 | /* | ||
150 | * convert the newLogSize to fs blocks. | ||
151 | * | ||
152 | * Since this is given in megabytes, it will always be | ||
153 | * an even number of pages. | ||
154 | */ | ||
155 | newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize; | ||
156 | } | ||
157 | |||
158 | } else | ||
159 | newLogSize = 0; | ||
160 | |||
161 | newLogAddress = newLVSize - newLogSize; | ||
162 | |||
163 | /* | ||
164 | * reconfigure fsck work space: | ||
165 | * | ||
166 | * configure it to the end of the logical volume regardless of | ||
167 | * whether file system extends to the end of the aggregate; | ||
168 | * Need enough 4k pages to cover: | ||
169 | * - 1 bit per block in aggregate rounded up to BPERDMAP boundary | ||
170 | * - 1 extra page to handle control page and intermediate level pages | ||
171 | * - 50 extra pages for the chkdsk service log | ||
172 | */ | ||
173 | t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) | ||
174 | << L2BPERDMAP; | ||
175 | t32 = ((t64 + (BITSPERPAGE - 1)) / BITSPERPAGE) + 1 + 50; | ||
176 | newFSCKSize = t32 << sbi->l2nbperpage; | ||
177 | newFSCKAddress = newLogAddress - newFSCKSize; | ||
178 | |||
179 | /* | ||
180 | * compute new file system space; | ||
181 | */ | ||
182 | newFSSize = newLVSize - newLogSize - newFSCKSize; | ||
183 | |||
184 | /* file system cannot be shrinked */ | ||
185 | if (newFSSize < bmp->db_mapsize) { | ||
186 | rc = -EINVAL; | ||
187 | goto out; | ||
188 | } | ||
189 | |||
190 | /* | ||
191 | * If we're expanding enough that the inline log does not overlap | ||
192 | * the old one, we can format the new log before we quiesce the | ||
193 | * filesystem. | ||
194 | */ | ||
195 | if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) { | ||
196 | if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) | ||
197 | goto out; | ||
198 | log_formatted = 1; | ||
199 | } | ||
200 | /* | ||
201 | * quiesce file system | ||
202 | * | ||
203 | * (prepare to move the inline log and to prevent map update) | ||
204 | * | ||
205 | * block any new transactions and wait for completion of | ||
206 | * all wip transactions and flush modified pages s.t. | ||
207 | * on-disk file system is in consistent state and | ||
208 | * log is not required for recovery. | ||
209 | */ | ||
210 | txQuiesce(sb); | ||
211 | |||
212 | if (sbi->mntflag & JFS_INLINELOG) { | ||
213 | /* | ||
214 | * deactivate old inline log | ||
215 | */ | ||
216 | lmLogShutdown(log); | ||
217 | |||
218 | /* | ||
219 | * mark on-disk super block for fs in transition; | ||
220 | * | ||
221 | * update on-disk superblock for the new space configuration | ||
222 | * of inline log space and fsck work space descriptors: | ||
223 | * N.B. FS descriptor is NOT updated; | ||
224 | * | ||
225 | * crash recovery: | ||
226 | * logredo(): if FM_EXTENDFS, return to fsck() for cleanup; | ||
227 | * fsck(): if FM_EXTENDFS, reformat inline log and fsck | ||
228 | * workspace from superblock inline log descriptor and fsck | ||
229 | * workspace descriptor; | ||
230 | */ | ||
231 | |||
232 | /* read in superblock */ | ||
233 | if ((rc = readSuper(sb, &bh))) | ||
234 | goto error_out; | ||
235 | j_sb = (struct jfs_superblock *)bh->b_data; | ||
236 | |||
237 | /* mark extendfs() in progress */ | ||
238 | j_sb->s_state |= cpu_to_le32(FM_EXTENDFS); | ||
239 | j_sb->s_xsize = cpu_to_le64(newFSSize); | ||
240 | PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress); | ||
241 | PXDlength(&j_sb->s_xfsckpxd, newFSCKSize); | ||
242 | PXDaddress(&j_sb->s_xlogpxd, newLogAddress); | ||
243 | PXDlength(&j_sb->s_xlogpxd, newLogSize); | ||
244 | |||
245 | /* synchronously update superblock */ | ||
246 | mark_buffer_dirty(bh); | ||
247 | sync_dirty_buffer(bh); | ||
248 | brelse(bh); | ||
249 | |||
250 | /* | ||
251 | * format new inline log synchronously; | ||
252 | * | ||
253 | * crash recovery: if log move in progress, | ||
254 | * reformat log and exit success; | ||
255 | */ | ||
256 | if (!log_formatted) | ||
257 | if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) | ||
258 | goto error_out; | ||
259 | |||
260 | /* | ||
261 | * activate new log | ||
262 | */ | ||
263 | log->base = newLogAddress; | ||
264 | log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits); | ||
265 | if ((rc = lmLogInit(log))) | ||
266 | goto error_out; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * extend block allocation map | ||
271 | * --------------------------- | ||
272 | * | ||
273 | * extendfs() for new extension, retry after crash recovery; | ||
274 | * | ||
275 | * note: both logredo() and fsck() rebuild map from | ||
276 | * the bitmap and configuration parameter from superblock | ||
277 | * (disregarding all other control information in the map); | ||
278 | * | ||
279 | * superblock: | ||
280 | * s_size: aggregate size in physical blocks; | ||
281 | */ | ||
282 | /* | ||
283 | * compute the new block allocation map configuration | ||
284 | * | ||
285 | * map dinode: | ||
286 | * di_size: map file size in byte; | ||
287 | * di_nblocks: number of blocks allocated for map file; | ||
288 | * di_mapsize: number of blocks in aggregate (covered by map); | ||
289 | * map control page: | ||
290 | * db_mapsize: number of blocks in aggregate (covered by map); | ||
291 | */ | ||
292 | newMapSize = newFSSize; | ||
293 | /* number of data pages of new bmap file: | ||
294 | * roundup new size to full dmap page boundary and | ||
295 | * add 1 extra dmap page for next extendfs() | ||
296 | */ | ||
297 | t64 = (newMapSize - 1) + BPERDMAP; | ||
298 | newNpages = BLKTODMAPN(t64) + 1; | ||
299 | |||
300 | /* | ||
301 | * extend map from current map (WITHOUT growing mapfile) | ||
302 | * | ||
303 | * map new extension with unmapped part of the last partial | ||
304 | * dmap page, if applicable, and extra page(s) allocated | ||
305 | * at end of bmap by mkfs() or previous extendfs(); | ||
306 | */ | ||
307 | extendBmap: | ||
308 | /* compute number of blocks requested to extend */ | ||
309 | mapSize = bmp->db_mapsize; | ||
310 | XAddress = mapSize; /* eXtension Address */ | ||
311 | XSize = newMapSize - mapSize; /* eXtension Size */ | ||
312 | old_agsize = bmp->db_agsize; /* We need to know if this changes */ | ||
313 | |||
314 | /* compute number of blocks that can be extended by current mapfile */ | ||
315 | t64 = dbMapFileSizeToMapSize(ipbmap); | ||
316 | if (mapSize > t64) { | ||
317 | printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n", | ||
318 | (long long) mapSize, (long long) t64); | ||
319 | rc = -EIO; | ||
320 | goto error_out; | ||
321 | } | ||
322 | nblocks = min(t64 - mapSize, XSize); | ||
323 | |||
324 | /* | ||
325 | * update map pages for new extension: | ||
326 | * | ||
327 | * update/init dmap and bubble up the control hierarchy | ||
328 | * incrementally fold up dmaps into upper levels; | ||
329 | * update bmap control page; | ||
330 | */ | ||
331 | if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) | ||
332 | goto error_out; | ||
333 | /* | ||
334 | * the map now has extended to cover additional nblocks: | ||
335 | * dn_mapsize = oldMapsize + nblocks; | ||
336 | */ | ||
337 | /* ipbmap->i_mapsize += nblocks; */ | ||
338 | XSize -= nblocks; | ||
339 | |||
340 | /* | ||
341 | * grow map file to cover remaining extension | ||
342 | * and/or one extra dmap page for next extendfs(); | ||
343 | * | ||
344 | * allocate new map pages and its backing blocks, and | ||
345 | * update map file xtree | ||
346 | */ | ||
347 | /* compute number of data pages of current bmap file */ | ||
348 | nPages = ipbmap->i_size >> L2PSIZE; | ||
349 | |||
350 | /* need to grow map file ? */ | ||
351 | if (nPages == newNpages) | ||
352 | goto finalizeBmap; | ||
353 | |||
354 | /* | ||
355 | * grow bmap file for the new map pages required: | ||
356 | * | ||
357 | * allocate growth at the start of newly extended region; | ||
358 | * bmap file only grows sequentially, i.e., both data pages | ||
359 | * and possibly xtree index pages may grow in append mode, | ||
360 | * s.t. logredo() can reconstruct pre-extension state | ||
361 | * by washing away bmap file of pages outside s_size boundary; | ||
362 | */ | ||
363 | /* | ||
364 | * journal map file growth as if a regular file growth: | ||
365 | * (note: bmap is created with di_mode = IFJOURNAL|IFREG); | ||
366 | * | ||
367 | * journaling of bmap file growth is not required since | ||
368 | * logredo() do/can not use log records of bmap file growth | ||
369 | * but it provides careful write semantics, pmap update, etc.; | ||
370 | */ | ||
371 | /* synchronous write of data pages: bmap data pages are | ||
372 | * cached in meta-data cache, and not written out | ||
373 | * by txCommit(); | ||
374 | */ | ||
375 | filemap_fdatawait(ipbmap->i_mapping); | ||
376 | filemap_fdatawrite(ipbmap->i_mapping); | ||
377 | filemap_fdatawait(ipbmap->i_mapping); | ||
378 | diWriteSpecial(ipbmap, 0); | ||
379 | |||
380 | newPage = nPages; /* first new page number */ | ||
381 | xoff = newPage << sbi->l2nbperpage; | ||
382 | xlen = (newNpages - nPages) << sbi->l2nbperpage; | ||
383 | xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1); | ||
384 | xaddr = XAddress; | ||
385 | |||
386 | tid = txBegin(sb, COMMIT_FORCE); | ||
387 | |||
388 | if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) { | ||
389 | txEnd(tid); | ||
390 | goto error_out; | ||
391 | } | ||
392 | /* update bmap file size */ | ||
393 | ipbmap->i_size += xlen << sbi->l2bsize; | ||
394 | inode_add_bytes(ipbmap, xlen << sbi->l2bsize); | ||
395 | |||
396 | iplist[0] = ipbmap; | ||
397 | rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); | ||
398 | |||
399 | txEnd(tid); | ||
400 | |||
401 | if (rc) | ||
402 | goto error_out; | ||
403 | |||
404 | /* | ||
405 | * map file has been grown now to cover extension to further out; | ||
406 | * di_size = new map file size; | ||
407 | * | ||
408 | * if huge extension, the previous extension based on previous | ||
409 | * map file size may not have been sufficient to cover whole extension | ||
410 | * (it could have been used up for new map pages), | ||
411 | * but the newly grown map file now covers lot bigger new free space | ||
412 | * available for further extension of map; | ||
413 | */ | ||
414 | /* any more blocks to extend ? */ | ||
415 | if (XSize) | ||
416 | goto extendBmap; | ||
417 | |||
418 | finalizeBmap: | ||
419 | /* finalize bmap */ | ||
420 | dbFinalizeBmap(ipbmap); | ||
421 | |||
422 | /* | ||
423 | * update inode allocation map | ||
424 | * --------------------------- | ||
425 | * | ||
426 | * move iag lists from old to new iag; | ||
427 | * agstart field is not updated for logredo() to reconstruct | ||
428 | * iag lists if system crash occurs. | ||
429 | * (computation of ag number from agstart based on agsize | ||
430 | * will correctly identify the new ag); | ||
431 | */ | ||
432 | /* if new AG size the same as old AG size, done! */ | ||
433 | if (bmp->db_agsize != old_agsize) { | ||
434 | if ((rc = diExtendFS(ipimap, ipbmap))) | ||
435 | goto error_out; | ||
436 | |||
437 | /* finalize imap */ | ||
438 | if ((rc = diSync(ipimap))) | ||
439 | goto error_out; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * finalize | ||
444 | * -------- | ||
445 | * | ||
446 | * extension is committed when on-disk super block is | ||
447 | * updated with new descriptors: logredo will recover | ||
448 | * crash before it to pre-extension state; | ||
449 | */ | ||
450 | |||
451 | /* sync log to skip log replay of bmap file growth transaction; */ | ||
452 | /* lmLogSync(log, 1); */ | ||
453 | |||
454 | /* | ||
455 | * synchronous write bmap global control page; | ||
456 | * for crash before completion of write | ||
457 | * logredo() will recover to pre-extendfs state; | ||
458 | * for crash after completion of write, | ||
459 | * logredo() will recover post-extendfs state; | ||
460 | */ | ||
461 | if ((rc = dbSync(ipbmap))) | ||
462 | goto error_out; | ||
463 | |||
464 | /* | ||
465 | * copy primary bmap inode to secondary bmap inode | ||
466 | */ | ||
467 | |||
468 | ipbmap2 = diReadSpecial(sb, BMAP_I, 1); | ||
469 | if (ipbmap2 == NULL) { | ||
470 | printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n"); | ||
471 | goto error_out; | ||
472 | } | ||
473 | memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288); | ||
474 | ipbmap2->i_size = ipbmap->i_size; | ||
475 | ipbmap2->i_blocks = ipbmap->i_blocks; | ||
476 | |||
477 | diWriteSpecial(ipbmap2, 1); | ||
478 | diFreeSpecial(ipbmap2); | ||
479 | |||
480 | /* | ||
481 | * update superblock | ||
482 | */ | ||
483 | if ((rc = readSuper(sb, &bh))) | ||
484 | goto error_out; | ||
485 | j_sb = (struct jfs_superblock *)bh->b_data; | ||
486 | |||
487 | /* mark extendfs() completion */ | ||
488 | j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS); | ||
489 | j_sb->s_size = cpu_to_le64(bmp->db_mapsize << | ||
490 | le16_to_cpu(j_sb->s_l2bfactor)); | ||
491 | j_sb->s_agsize = cpu_to_le32(bmp->db_agsize); | ||
492 | |||
493 | /* update inline log space descriptor */ | ||
494 | if (sbi->mntflag & JFS_INLINELOG) { | ||
495 | PXDaddress(&(j_sb->s_logpxd), newLogAddress); | ||
496 | PXDlength(&(j_sb->s_logpxd), newLogSize); | ||
497 | } | ||
498 | |||
499 | /* record log's mount serial number */ | ||
500 | j_sb->s_logserial = cpu_to_le32(log->serial); | ||
501 | |||
502 | /* update fsck work space descriptor */ | ||
503 | PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress); | ||
504 | PXDlength(&(j_sb->s_fsckpxd), newFSCKSize); | ||
505 | j_sb->s_fscklog = 1; | ||
506 | /* sb->s_fsckloglen remains the same */ | ||
507 | |||
508 | /* Update secondary superblock */ | ||
509 | bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); | ||
510 | if (bh2) { | ||
511 | j_sb2 = (struct jfs_superblock *)bh2->b_data; | ||
512 | memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock)); | ||
513 | |||
514 | mark_buffer_dirty(bh); | ||
515 | sync_dirty_buffer(bh2); | ||
516 | brelse(bh2); | ||
517 | } | ||
518 | |||
519 | /* write primary superblock */ | ||
520 | mark_buffer_dirty(bh); | ||
521 | sync_dirty_buffer(bh); | ||
522 | brelse(bh); | ||
523 | |||
524 | goto resume; | ||
525 | |||
526 | error_out: | ||
527 | jfs_error(sb, "jfs_extendfs"); | ||
528 | |||
529 | resume: | ||
530 | /* | ||
531 | * resume file system transactions | ||
532 | */ | ||
533 | txResume(sb); | ||
534 | |||
535 | out: | ||
536 | return rc; | ||
537 | } | ||
diff --git a/fs/jfs/super.c b/fs/jfs/super.c new file mode 100644 index 000000000000..5856866e24fc --- /dev/null +++ b/fs/jfs/super.c | |||
@@ -0,0 +1,700 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Portions Copyright (C) Christoph Hellwig, 2001-2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include <linux/config.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/parser.h> | ||
24 | #include <linux/completion.h> | ||
25 | #include <linux/vfs.h> | ||
26 | #include <linux/moduleparam.h> | ||
27 | #include <asm/uaccess.h> | ||
28 | |||
29 | #include "jfs_incore.h" | ||
30 | #include "jfs_filsys.h" | ||
31 | #include "jfs_metapage.h" | ||
32 | #include "jfs_superblock.h" | ||
33 | #include "jfs_dmap.h" | ||
34 | #include "jfs_imap.h" | ||
35 | #include "jfs_acl.h" | ||
36 | #include "jfs_debug.h" | ||
37 | |||
38 | MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); | ||
39 | MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); | ||
40 | MODULE_LICENSE("GPL"); | ||
41 | |||
42 | static kmem_cache_t * jfs_inode_cachep; | ||
43 | |||
44 | static struct super_operations jfs_super_operations; | ||
45 | static struct export_operations jfs_export_operations; | ||
46 | static struct file_system_type jfs_fs_type; | ||
47 | |||
48 | #define MAX_COMMIT_THREADS 64 | ||
49 | static int commit_threads = 0; | ||
50 | module_param(commit_threads, int, 0); | ||
51 | MODULE_PARM_DESC(commit_threads, "Number of commit threads"); | ||
52 | |||
53 | int jfs_stop_threads; | ||
54 | static pid_t jfsIOthread; | ||
55 | static pid_t jfsCommitThread[MAX_COMMIT_THREADS]; | ||
56 | static pid_t jfsSyncThread; | ||
57 | DECLARE_COMPLETION(jfsIOwait); | ||
58 | |||
59 | #ifdef CONFIG_JFS_DEBUG | ||
60 | int jfsloglevel = JFS_LOGLEVEL_WARN; | ||
61 | module_param(jfsloglevel, int, 0644); | ||
62 | MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); | ||
63 | #endif | ||
64 | |||
65 | /* | ||
66 | * External declarations | ||
67 | */ | ||
68 | extern int jfs_mount(struct super_block *); | ||
69 | extern int jfs_mount_rw(struct super_block *, int); | ||
70 | extern int jfs_umount(struct super_block *); | ||
71 | extern int jfs_umount_rw(struct super_block *); | ||
72 | |||
73 | extern int jfsIOWait(void *); | ||
74 | extern int jfs_lazycommit(void *); | ||
75 | extern int jfs_sync(void *); | ||
76 | |||
77 | extern void jfs_read_inode(struct inode *inode); | ||
78 | extern void jfs_dirty_inode(struct inode *inode); | ||
79 | extern void jfs_delete_inode(struct inode *inode); | ||
80 | extern int jfs_write_inode(struct inode *inode, int wait); | ||
81 | |||
82 | extern struct dentry *jfs_get_parent(struct dentry *dentry); | ||
83 | extern int jfs_extendfs(struct super_block *, s64, int); | ||
84 | |||
85 | extern struct dentry_operations jfs_ci_dentry_operations; | ||
86 | |||
87 | #ifdef PROC_FS_JFS /* see jfs_debug.h */ | ||
88 | extern void jfs_proc_init(void); | ||
89 | extern void jfs_proc_clean(void); | ||
90 | #endif | ||
91 | |||
92 | extern wait_queue_head_t jfs_IO_thread_wait; | ||
93 | extern wait_queue_head_t jfs_commit_thread_wait; | ||
94 | extern wait_queue_head_t jfs_sync_thread_wait; | ||
95 | |||
96 | static void jfs_handle_error(struct super_block *sb) | ||
97 | { | ||
98 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
99 | |||
100 | if (sb->s_flags & MS_RDONLY) | ||
101 | return; | ||
102 | |||
103 | updateSuper(sb, FM_DIRTY); | ||
104 | |||
105 | if (sbi->flag & JFS_ERR_PANIC) | ||
106 | panic("JFS (device %s): panic forced after error\n", | ||
107 | sb->s_id); | ||
108 | else if (sbi->flag & JFS_ERR_REMOUNT_RO) { | ||
109 | jfs_err("ERROR: (device %s): remounting filesystem " | ||
110 | "as read-only\n", | ||
111 | sb->s_id); | ||
112 | sb->s_flags |= MS_RDONLY; | ||
113 | } | ||
114 | |||
115 | /* nothing is done for continue beyond marking the superblock dirty */ | ||
116 | } | ||
117 | |||
118 | void jfs_error(struct super_block *sb, const char * function, ...) | ||
119 | { | ||
120 | static char error_buf[256]; | ||
121 | va_list args; | ||
122 | |||
123 | va_start(args, function); | ||
124 | vsprintf(error_buf, function, args); | ||
125 | va_end(args); | ||
126 | |||
127 | printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf); | ||
128 | |||
129 | jfs_handle_error(sb); | ||
130 | } | ||
131 | |||
132 | static struct inode *jfs_alloc_inode(struct super_block *sb) | ||
133 | { | ||
134 | struct jfs_inode_info *jfs_inode; | ||
135 | |||
136 | jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); | ||
137 | if (!jfs_inode) | ||
138 | return NULL; | ||
139 | return &jfs_inode->vfs_inode; | ||
140 | } | ||
141 | |||
142 | static void jfs_destroy_inode(struct inode *inode) | ||
143 | { | ||
144 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
145 | |||
146 | spin_lock_irq(&ji->ag_lock); | ||
147 | if (ji->active_ag != -1) { | ||
148 | struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; | ||
149 | atomic_dec(&bmap->db_active[ji->active_ag]); | ||
150 | ji->active_ag = -1; | ||
151 | } | ||
152 | spin_unlock_irq(&ji->ag_lock); | ||
153 | |||
154 | #ifdef CONFIG_JFS_POSIX_ACL | ||
155 | if (ji->i_acl != JFS_ACL_NOT_CACHED) { | ||
156 | posix_acl_release(ji->i_acl); | ||
157 | ji->i_acl = JFS_ACL_NOT_CACHED; | ||
158 | } | ||
159 | if (ji->i_default_acl != JFS_ACL_NOT_CACHED) { | ||
160 | posix_acl_release(ji->i_default_acl); | ||
161 | ji->i_default_acl = JFS_ACL_NOT_CACHED; | ||
162 | } | ||
163 | #endif | ||
164 | |||
165 | kmem_cache_free(jfs_inode_cachep, ji); | ||
166 | } | ||
167 | |||
168 | static int jfs_statfs(struct super_block *sb, struct kstatfs *buf) | ||
169 | { | ||
170 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
171 | s64 maxinodes; | ||
172 | struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; | ||
173 | |||
174 | jfs_info("In jfs_statfs"); | ||
175 | buf->f_type = JFS_SUPER_MAGIC; | ||
176 | buf->f_bsize = sbi->bsize; | ||
177 | buf->f_blocks = sbi->bmap->db_mapsize; | ||
178 | buf->f_bfree = sbi->bmap->db_nfree; | ||
179 | buf->f_bavail = sbi->bmap->db_nfree; | ||
180 | /* | ||
181 | * If we really return the number of allocated & free inodes, some | ||
182 | * applications will fail because they won't see enough free inodes. | ||
183 | * We'll try to calculate some guess as to how may inodes we can | ||
184 | * really allocate | ||
185 | * | ||
186 | * buf->f_files = atomic_read(&imap->im_numinos); | ||
187 | * buf->f_ffree = atomic_read(&imap->im_numfree); | ||
188 | */ | ||
189 | maxinodes = min((s64) atomic_read(&imap->im_numinos) + | ||
190 | ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) | ||
191 | << L2INOSPEREXT), (s64) 0xffffffffLL); | ||
192 | buf->f_files = maxinodes; | ||
193 | buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - | ||
194 | atomic_read(&imap->im_numfree)); | ||
195 | |||
196 | buf->f_namelen = JFS_NAME_MAX; | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | static void jfs_put_super(struct super_block *sb) | ||
201 | { | ||
202 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
203 | int rc; | ||
204 | |||
205 | jfs_info("In jfs_put_super"); | ||
206 | rc = jfs_umount(sb); | ||
207 | if (rc) | ||
208 | jfs_err("jfs_umount failed with return code %d", rc); | ||
209 | if (sbi->nls_tab) | ||
210 | unload_nls(sbi->nls_tab); | ||
211 | sbi->nls_tab = NULL; | ||
212 | |||
213 | kfree(sbi); | ||
214 | } | ||
215 | |||
/* Identifiers returned by match_token() for the entries of tokens[]. */
enum {
	Opt_integrity,
	Opt_nointegrity,
	Opt_iocharset,
	Opt_resize,
	Opt_resize_nosize,
	Opt_errors,
	Opt_ignore,
	Opt_err,
};
220 | |||
221 | static match_table_t tokens = { | ||
222 | {Opt_integrity, "integrity"}, | ||
223 | {Opt_nointegrity, "nointegrity"}, | ||
224 | {Opt_iocharset, "iocharset=%s"}, | ||
225 | {Opt_resize, "resize=%u"}, | ||
226 | {Opt_resize_nosize, "resize"}, | ||
227 | {Opt_errors, "errors=%s"}, | ||
228 | {Opt_ignore, "noquota"}, | ||
229 | {Opt_ignore, "quota"}, | ||
230 | {Opt_ignore, "usrquota"}, | ||
231 | {Opt_ignore, "grpquota"}, | ||
232 | {Opt_err, NULL} | ||
233 | }; | ||
234 | |||
235 | static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, | ||
236 | int *flag) | ||
237 | { | ||
238 | void *nls_map = (void *)-1; /* -1: no change; NULL: none */ | ||
239 | char *p; | ||
240 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
241 | |||
242 | *newLVSize = 0; | ||
243 | |||
244 | if (!options) | ||
245 | return 1; | ||
246 | |||
247 | while ((p = strsep(&options, ",")) != NULL) { | ||
248 | substring_t args[MAX_OPT_ARGS]; | ||
249 | int token; | ||
250 | if (!*p) | ||
251 | continue; | ||
252 | |||
253 | token = match_token(p, tokens, args); | ||
254 | switch (token) { | ||
255 | case Opt_integrity: | ||
256 | *flag &= ~JFS_NOINTEGRITY; | ||
257 | break; | ||
258 | case Opt_nointegrity: | ||
259 | *flag |= JFS_NOINTEGRITY; | ||
260 | break; | ||
261 | case Opt_ignore: | ||
262 | /* Silently ignore the quota options */ | ||
263 | /* Don't do anything ;-) */ | ||
264 | break; | ||
265 | case Opt_iocharset: | ||
266 | if (nls_map && nls_map != (void *) -1) | ||
267 | unload_nls(nls_map); | ||
268 | if (!strcmp(args[0].from, "none")) | ||
269 | nls_map = NULL; | ||
270 | else { | ||
271 | nls_map = load_nls(args[0].from); | ||
272 | if (!nls_map) { | ||
273 | printk(KERN_ERR | ||
274 | "JFS: charset not found\n"); | ||
275 | goto cleanup; | ||
276 | } | ||
277 | } | ||
278 | break; | ||
279 | case Opt_resize: | ||
280 | { | ||
281 | char *resize = args[0].from; | ||
282 | *newLVSize = simple_strtoull(resize, &resize, 0); | ||
283 | break; | ||
284 | } | ||
285 | case Opt_resize_nosize: | ||
286 | { | ||
287 | *newLVSize = sb->s_bdev->bd_inode->i_size >> | ||
288 | sb->s_blocksize_bits; | ||
289 | if (*newLVSize == 0) | ||
290 | printk(KERN_ERR | ||
291 | "JFS: Cannot determine volume size\n"); | ||
292 | break; | ||
293 | } | ||
294 | case Opt_errors: | ||
295 | { | ||
296 | char *errors = args[0].from; | ||
297 | if (!errors || !*errors) | ||
298 | goto cleanup; | ||
299 | if (!strcmp(errors, "continue")) { | ||
300 | *flag &= ~JFS_ERR_REMOUNT_RO; | ||
301 | *flag &= ~JFS_ERR_PANIC; | ||
302 | *flag |= JFS_ERR_CONTINUE; | ||
303 | } else if (!strcmp(errors, "remount-ro")) { | ||
304 | *flag &= ~JFS_ERR_CONTINUE; | ||
305 | *flag &= ~JFS_ERR_PANIC; | ||
306 | *flag |= JFS_ERR_REMOUNT_RO; | ||
307 | } else if (!strcmp(errors, "panic")) { | ||
308 | *flag &= ~JFS_ERR_CONTINUE; | ||
309 | *flag &= ~JFS_ERR_REMOUNT_RO; | ||
310 | *flag |= JFS_ERR_PANIC; | ||
311 | } else { | ||
312 | printk(KERN_ERR | ||
313 | "JFS: %s is an invalid error handler\n", | ||
314 | errors); | ||
315 | goto cleanup; | ||
316 | } | ||
317 | break; | ||
318 | } | ||
319 | default: | ||
320 | printk("jfs: Unrecognized mount option \"%s\" " | ||
321 | " or missing value\n", p); | ||
322 | goto cleanup; | ||
323 | } | ||
324 | } | ||
325 | |||
326 | if (nls_map != (void *) -1) { | ||
327 | /* Discard old (if remount) */ | ||
328 | if (sbi->nls_tab) | ||
329 | unload_nls(sbi->nls_tab); | ||
330 | sbi->nls_tab = nls_map; | ||
331 | } | ||
332 | return 1; | ||
333 | |||
334 | cleanup: | ||
335 | if (nls_map && nls_map != (void *) -1) | ||
336 | unload_nls(nls_map); | ||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | static int jfs_remount(struct super_block *sb, int *flags, char *data) | ||
341 | { | ||
342 | s64 newLVSize = 0; | ||
343 | int rc = 0; | ||
344 | int flag = JFS_SBI(sb)->flag; | ||
345 | |||
346 | if (!parse_options(data, sb, &newLVSize, &flag)) { | ||
347 | return -EINVAL; | ||
348 | } | ||
349 | if (newLVSize) { | ||
350 | if (sb->s_flags & MS_RDONLY) { | ||
351 | printk(KERN_ERR | ||
352 | "JFS: resize requires volume to be mounted read-write\n"); | ||
353 | return -EROFS; | ||
354 | } | ||
355 | rc = jfs_extendfs(sb, newLVSize, 0); | ||
356 | if (rc) | ||
357 | return rc; | ||
358 | } | ||
359 | |||
360 | if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { | ||
361 | JFS_SBI(sb)->flag = flag; | ||
362 | return jfs_mount_rw(sb, 1); | ||
363 | } | ||
364 | if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { | ||
365 | rc = jfs_umount_rw(sb); | ||
366 | JFS_SBI(sb)->flag = flag; | ||
367 | return rc; | ||
368 | } | ||
369 | if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) | ||
370 | if (!(sb->s_flags & MS_RDONLY)) { | ||
371 | rc = jfs_umount_rw(sb); | ||
372 | if (rc) | ||
373 | return rc; | ||
374 | JFS_SBI(sb)->flag = flag; | ||
375 | return jfs_mount_rw(sb, 1); | ||
376 | } | ||
377 | JFS_SBI(sb)->flag = flag; | ||
378 | |||
379 | return 0; | ||
380 | } | ||
381 | |||
382 | static int jfs_fill_super(struct super_block *sb, void *data, int silent) | ||
383 | { | ||
384 | struct jfs_sb_info *sbi; | ||
385 | struct inode *inode; | ||
386 | int rc; | ||
387 | s64 newLVSize = 0; | ||
388 | int flag; | ||
389 | |||
390 | jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); | ||
391 | |||
392 | if (!new_valid_dev(sb->s_bdev->bd_dev)) | ||
393 | return -EOVERFLOW; | ||
394 | |||
395 | sbi = kmalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); | ||
396 | if (!sbi) | ||
397 | return -ENOSPC; | ||
398 | memset(sbi, 0, sizeof (struct jfs_sb_info)); | ||
399 | sb->s_fs_info = sbi; | ||
400 | sbi->sb = sb; | ||
401 | |||
402 | /* initialize the mount flag and determine the default error handler */ | ||
403 | flag = JFS_ERR_REMOUNT_RO; | ||
404 | |||
405 | if (!parse_options((char *) data, sb, &newLVSize, &flag)) { | ||
406 | kfree(sbi); | ||
407 | return -EINVAL; | ||
408 | } | ||
409 | sbi->flag = flag; | ||
410 | |||
411 | #ifdef CONFIG_JFS_POSIX_ACL | ||
412 | sb->s_flags |= MS_POSIXACL; | ||
413 | #endif | ||
414 | |||
415 | if (newLVSize) { | ||
416 | printk(KERN_ERR "resize option for remount only\n"); | ||
417 | return -EINVAL; | ||
418 | } | ||
419 | |||
420 | /* | ||
421 | * Initialize blocksize to 4K. | ||
422 | */ | ||
423 | sb_set_blocksize(sb, PSIZE); | ||
424 | |||
425 | /* | ||
426 | * Set method vectors. | ||
427 | */ | ||
428 | sb->s_op = &jfs_super_operations; | ||
429 | sb->s_export_op = &jfs_export_operations; | ||
430 | |||
431 | rc = jfs_mount(sb); | ||
432 | if (rc) { | ||
433 | if (!silent) { | ||
434 | jfs_err("jfs_mount failed w/return code = %d", rc); | ||
435 | } | ||
436 | goto out_kfree; | ||
437 | } | ||
438 | if (sb->s_flags & MS_RDONLY) | ||
439 | sbi->log = NULL; | ||
440 | else { | ||
441 | rc = jfs_mount_rw(sb, 0); | ||
442 | if (rc) { | ||
443 | if (!silent) { | ||
444 | jfs_err("jfs_mount_rw failed, return code = %d", | ||
445 | rc); | ||
446 | } | ||
447 | goto out_no_rw; | ||
448 | } | ||
449 | } | ||
450 | |||
451 | sb->s_magic = JFS_SUPER_MAGIC; | ||
452 | |||
453 | inode = iget(sb, ROOT_I); | ||
454 | if (!inode || is_bad_inode(inode)) | ||
455 | goto out_no_root; | ||
456 | sb->s_root = d_alloc_root(inode); | ||
457 | if (!sb->s_root) | ||
458 | goto out_no_root; | ||
459 | |||
460 | if (sbi->mntflag & JFS_OS2) | ||
461 | sb->s_root->d_op = &jfs_ci_dentry_operations; | ||
462 | |||
463 | /* logical blocks are represented by 40 bits in pxd_t, etc. */ | ||
464 | sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; | ||
465 | #if BITS_PER_LONG == 32 | ||
466 | /* | ||
467 | * Page cache is indexed by long. | ||
468 | * I would use MAX_LFS_FILESIZE, but it's only half as big | ||
469 | */ | ||
470 | sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes); | ||
471 | #endif | ||
472 | sb->s_time_gran = 1; | ||
473 | return 0; | ||
474 | |||
475 | out_no_root: | ||
476 | jfs_err("jfs_read_super: get root inode failed"); | ||
477 | if (inode) | ||
478 | iput(inode); | ||
479 | |||
480 | out_no_rw: | ||
481 | rc = jfs_umount(sb); | ||
482 | if (rc) { | ||
483 | jfs_err("jfs_umount failed with return code %d", rc); | ||
484 | } | ||
485 | out_kfree: | ||
486 | if (sbi->nls_tab) | ||
487 | unload_nls(sbi->nls_tab); | ||
488 | kfree(sbi); | ||
489 | return -EINVAL; | ||
490 | } | ||
491 | |||
492 | static void jfs_write_super_lockfs(struct super_block *sb) | ||
493 | { | ||
494 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
495 | struct jfs_log *log = sbi->log; | ||
496 | |||
497 | if (!(sb->s_flags & MS_RDONLY)) { | ||
498 | txQuiesce(sb); | ||
499 | lmLogShutdown(log); | ||
500 | updateSuper(sb, FM_CLEAN); | ||
501 | } | ||
502 | } | ||
503 | |||
504 | static void jfs_unlockfs(struct super_block *sb) | ||
505 | { | ||
506 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
507 | struct jfs_log *log = sbi->log; | ||
508 | int rc = 0; | ||
509 | |||
510 | if (!(sb->s_flags & MS_RDONLY)) { | ||
511 | updateSuper(sb, FM_MOUNT); | ||
512 | if ((rc = lmLogInit(log))) | ||
513 | jfs_err("jfs_unlock failed with return code %d", rc); | ||
514 | else | ||
515 | txResume(sb); | ||
516 | } | ||
517 | } | ||
518 | |||
519 | static struct super_block *jfs_get_sb(struct file_system_type *fs_type, | ||
520 | int flags, const char *dev_name, void *data) | ||
521 | { | ||
522 | return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super); | ||
523 | } | ||
524 | |||
525 | static int jfs_sync_fs(struct super_block *sb, int wait) | ||
526 | { | ||
527 | struct jfs_log *log = JFS_SBI(sb)->log; | ||
528 | |||
529 | /* log == NULL indicates read-only mount */ | ||
530 | if (log) | ||
531 | jfs_flush_journal(log, wait); | ||
532 | |||
533 | return 0; | ||
534 | } | ||
535 | |||
536 | static struct super_operations jfs_super_operations = { | ||
537 | .alloc_inode = jfs_alloc_inode, | ||
538 | .destroy_inode = jfs_destroy_inode, | ||
539 | .read_inode = jfs_read_inode, | ||
540 | .dirty_inode = jfs_dirty_inode, | ||
541 | .write_inode = jfs_write_inode, | ||
542 | .delete_inode = jfs_delete_inode, | ||
543 | .put_super = jfs_put_super, | ||
544 | .sync_fs = jfs_sync_fs, | ||
545 | .write_super_lockfs = jfs_write_super_lockfs, | ||
546 | .unlockfs = jfs_unlockfs, | ||
547 | .statfs = jfs_statfs, | ||
548 | .remount_fs = jfs_remount, | ||
549 | }; | ||
550 | |||
551 | static struct export_operations jfs_export_operations = { | ||
552 | .get_parent = jfs_get_parent, | ||
553 | }; | ||
554 | |||
555 | static struct file_system_type jfs_fs_type = { | ||
556 | .owner = THIS_MODULE, | ||
557 | .name = "jfs", | ||
558 | .get_sb = jfs_get_sb, | ||
559 | .kill_sb = kill_block_super, | ||
560 | .fs_flags = FS_REQUIRES_DEV, | ||
561 | }; | ||
562 | |||
/* Metapage layer setup and teardown, used by module init/exit. */
extern int metapage_init(void);
extern void metapage_exit(void);

/* Transaction manager setup and teardown, used by module init/exit. */
extern int txInit(void);
extern void txExit(void);
567 | |||
568 | static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) | ||
569 | { | ||
570 | struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; | ||
571 | |||
572 | if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == | ||
573 | SLAB_CTOR_CONSTRUCTOR) { | ||
574 | memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); | ||
575 | INIT_LIST_HEAD(&jfs_ip->anon_inode_list); | ||
576 | init_rwsem(&jfs_ip->rdwrlock); | ||
577 | init_MUTEX(&jfs_ip->commit_sem); | ||
578 | init_rwsem(&jfs_ip->xattr_sem); | ||
579 | spin_lock_init(&jfs_ip->ag_lock); | ||
580 | jfs_ip->active_ag = -1; | ||
581 | #ifdef CONFIG_JFS_POSIX_ACL | ||
582 | jfs_ip->i_acl = JFS_ACL_NOT_CACHED; | ||
583 | jfs_ip->i_default_acl = JFS_ACL_NOT_CACHED; | ||
584 | #endif | ||
585 | inode_init_once(&jfs_ip->vfs_inode); | ||
586 | } | ||
587 | } | ||
588 | |||
589 | static int __init init_jfs_fs(void) | ||
590 | { | ||
591 | int i; | ||
592 | int rc; | ||
593 | |||
594 | jfs_inode_cachep = | ||
595 | kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, | ||
596 | SLAB_RECLAIM_ACCOUNT, init_once, NULL); | ||
597 | if (jfs_inode_cachep == NULL) | ||
598 | return -ENOMEM; | ||
599 | |||
600 | /* | ||
601 | * Metapage initialization | ||
602 | */ | ||
603 | rc = metapage_init(); | ||
604 | if (rc) { | ||
605 | jfs_err("metapage_init failed w/rc = %d", rc); | ||
606 | goto free_slab; | ||
607 | } | ||
608 | |||
609 | /* | ||
610 | * Transaction Manager initialization | ||
611 | */ | ||
612 | rc = txInit(); | ||
613 | if (rc) { | ||
614 | jfs_err("txInit failed w/rc = %d", rc); | ||
615 | goto free_metapage; | ||
616 | } | ||
617 | |||
618 | /* | ||
619 | * I/O completion thread (endio) | ||
620 | */ | ||
621 | jfsIOthread = kernel_thread(jfsIOWait, NULL, CLONE_KERNEL); | ||
622 | if (jfsIOthread < 0) { | ||
623 | jfs_err("init_jfs_fs: fork failed w/rc = %d", jfsIOthread); | ||
624 | goto end_txmngr; | ||
625 | } | ||
626 | wait_for_completion(&jfsIOwait); /* Wait until thread starts */ | ||
627 | |||
628 | if (commit_threads < 1) | ||
629 | commit_threads = num_online_cpus(); | ||
630 | if (commit_threads > MAX_COMMIT_THREADS) | ||
631 | commit_threads = MAX_COMMIT_THREADS; | ||
632 | |||
633 | for (i = 0; i < commit_threads; i++) { | ||
634 | jfsCommitThread[i] = kernel_thread(jfs_lazycommit, NULL, | ||
635 | CLONE_KERNEL); | ||
636 | if (jfsCommitThread[i] < 0) { | ||
637 | jfs_err("init_jfs_fs: fork failed w/rc = %d", | ||
638 | jfsCommitThread[i]); | ||
639 | commit_threads = i; | ||
640 | goto kill_committask; | ||
641 | } | ||
642 | /* Wait until thread starts */ | ||
643 | wait_for_completion(&jfsIOwait); | ||
644 | } | ||
645 | |||
646 | jfsSyncThread = kernel_thread(jfs_sync, NULL, CLONE_KERNEL); | ||
647 | if (jfsSyncThread < 0) { | ||
648 | jfs_err("init_jfs_fs: fork failed w/rc = %d", jfsSyncThread); | ||
649 | goto kill_committask; | ||
650 | } | ||
651 | wait_for_completion(&jfsIOwait); /* Wait until thread starts */ | ||
652 | |||
653 | #ifdef PROC_FS_JFS | ||
654 | jfs_proc_init(); | ||
655 | #endif | ||
656 | |||
657 | return register_filesystem(&jfs_fs_type); | ||
658 | |||
659 | kill_committask: | ||
660 | jfs_stop_threads = 1; | ||
661 | wake_up_all(&jfs_commit_thread_wait); | ||
662 | for (i = 0; i < commit_threads; i++) | ||
663 | wait_for_completion(&jfsIOwait); | ||
664 | |||
665 | wake_up(&jfs_IO_thread_wait); | ||
666 | wait_for_completion(&jfsIOwait); /* Wait for thread exit */ | ||
667 | end_txmngr: | ||
668 | txExit(); | ||
669 | free_metapage: | ||
670 | metapage_exit(); | ||
671 | free_slab: | ||
672 | kmem_cache_destroy(jfs_inode_cachep); | ||
673 | return rc; | ||
674 | } | ||
675 | |||
676 | static void __exit exit_jfs_fs(void) | ||
677 | { | ||
678 | int i; | ||
679 | |||
680 | jfs_info("exit_jfs_fs called"); | ||
681 | |||
682 | jfs_stop_threads = 1; | ||
683 | txExit(); | ||
684 | metapage_exit(); | ||
685 | wake_up(&jfs_IO_thread_wait); | ||
686 | wait_for_completion(&jfsIOwait); /* Wait until IO thread exits */ | ||
687 | wake_up_all(&jfs_commit_thread_wait); | ||
688 | for (i = 0; i < commit_threads; i++) | ||
689 | wait_for_completion(&jfsIOwait); | ||
690 | wake_up(&jfs_sync_thread_wait); | ||
691 | wait_for_completion(&jfsIOwait); /* Wait until Sync thread exits */ | ||
692 | #ifdef PROC_FS_JFS | ||
693 | jfs_proc_clean(); | ||
694 | #endif | ||
695 | unregister_filesystem(&jfs_fs_type); | ||
696 | kmem_cache_destroy(jfs_inode_cachep); | ||
697 | } | ||
698 | |||
699 | module_init(init_jfs_fs) | ||
700 | module_exit(exit_jfs_fs) | ||
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c new file mode 100644 index 000000000000..ef4c07ee92b2 --- /dev/null +++ b/fs/jfs/symlink.c | |||
@@ -0,0 +1,39 @@ | |||
1 | /* | ||
2 | * Copyright (c) Christoph Hellwig, 2001-2002 | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
12 | * the GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/fs.h> | ||
20 | #include <linux/namei.h> | ||
21 | #include "jfs_incore.h" | ||
22 | #include "jfs_xattr.h" | ||
23 | |||
24 | static int jfs_follow_link(struct dentry *dentry, struct nameidata *nd) | ||
25 | { | ||
26 | char *s = JFS_IP(dentry->d_inode)->i_inline; | ||
27 | nd_set_link(nd, s); | ||
28 | return 0; | ||
29 | } | ||
30 | |||
31 | struct inode_operations jfs_symlink_inode_operations = { | ||
32 | .readlink = generic_readlink, | ||
33 | .follow_link = jfs_follow_link, | ||
34 | .setxattr = jfs_setxattr, | ||
35 | .getxattr = jfs_getxattr, | ||
36 | .listxattr = jfs_listxattr, | ||
37 | .removexattr = jfs_removexattr, | ||
38 | }; | ||
39 | |||
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c new file mode 100644 index 000000000000..7a9ffd5d03dc --- /dev/null +++ b/fs/jfs/xattr.c | |||
@@ -0,0 +1,1127 @@ | |||
1 | /* | ||
2 | * Copyright (C) International Business Machines Corp., 2000-2004 | ||
3 | * Copyright (C) Christoph Hellwig, 2002 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include <linux/xattr.h> | ||
22 | #include <linux/quotaops.h> | ||
23 | #include "jfs_incore.h" | ||
24 | #include "jfs_superblock.h" | ||
25 | #include "jfs_dmap.h" | ||
26 | #include "jfs_debug.h" | ||
27 | #include "jfs_dinode.h" | ||
28 | #include "jfs_extent.h" | ||
29 | #include "jfs_metapage.h" | ||
30 | #include "jfs_xattr.h" | ||
31 | #include "jfs_acl.h" | ||
32 | |||
33 | /* | ||
34 | * jfs_xattr.c: extended attribute service | ||
35 | * | ||
36 | * Overall design -- | ||
37 | * | ||
38 | * Format: | ||
39 | * | ||
40 | * Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit | ||
41 | * value) and a variable (0 or more) number of extended attribute | ||
42 | * entries. Each extended attribute entry (jfs_ea) is a <name,value> pair | ||
43 | * where <name> is constructed from a null-terminated ascii string | ||
44 | * (1 ... 255 bytes in the name) and <value> is arbitrary 8 bit data | ||
45 | * (1 ... 65535 bytes). The in-memory format is | ||
46 | * | ||
47 | * 0 1 2 4 4 + namelen + 1 | ||
48 | * +-------+--------+--------+----------------+-------------------+ | ||
49 | * | Flags | Name | Value | Name String \0 | Data . . . . | | ||
50 | * | | Length | Length | | | | ||
51 | * +-------+--------+--------+----------------+-------------------+ | ||
52 | * | ||
53 | * A jfs_ea_list then is structured as | ||
54 | * | ||
55 | * 0 4 4 + EA_SIZE(ea1) | ||
56 | * +------------+-------------------+--------------------+----- | ||
57 | * | Overall EA | First FEA Element | Second FEA Element | ..... | ||
58 | * | List Size | | | | ||
59 | * +------------+-------------------+--------------------+----- | ||
60 | * | ||
61 | * On-disk: | ||
62 | * | ||
63 | * FEALISTs are stored on disk using blocks allocated by dbAlloc() and | ||
64 | * written directly. An EA list may be in-lined in the inode if there is | ||
65 | * sufficient room available. | ||
66 | */ | ||
67 | |||
68 | struct ea_buffer { | ||
69 | int flag; /* Indicates what storage xattr points to */ | ||
70 | int max_size; /* largest xattr that fits in current buffer */ | ||
71 | dxd_t new_ea; /* dxd to replace ea when modifying xattr */ | ||
72 | struct metapage *mp; /* metapage containing ea list */ | ||
73 | struct jfs_ea_list *xattr; /* buffer containing ea list */ | ||
74 | }; | ||
75 | |||
76 | /* | ||
77 | * ea_buffer.flag values | ||
78 | */ | ||
79 | #define EA_INLINE 0x0001 | ||
80 | #define EA_EXTENT 0x0002 | ||
81 | #define EA_NEW 0x0004 | ||
82 | #define EA_MALLOC 0x0008 | ||
83 | |||
/*
 * Attribute namespaces.
 *
 * Each *_PREFIX_LEN is the length of the prefix string without its
 * terminating NUL (sizeof of the literal, minus one).
 */
#define XATTR_SYSTEM_PREFIX		"system."
#define XATTR_SYSTEM_PREFIX_LEN		(sizeof (XATTR_SYSTEM_PREFIX) - 1)

#define XATTR_USER_PREFIX		"user."
#define XATTR_USER_PREFIX_LEN		(sizeof (XATTR_USER_PREFIX) - 1)

#define XATTR_OS2_PREFIX		"os2."
#define XATTR_OS2_PREFIX_LEN		(sizeof (XATTR_OS2_PREFIX) - 1)

/* XATTR_SECURITY_PREFIX itself comes from include/linux/xattr.h */
#define XATTR_SECURITY_PREFIX_LEN	(sizeof (XATTR_SECURITY_PREFIX) - 1)

#define XATTR_TRUSTED_PREFIX		"trusted."
#define XATTR_TRUSTED_PREFIX_LEN	(sizeof (XATTR_TRUSTED_PREFIX) - 1)
99 | |||
100 | /* | ||
101 | * These three routines are used to recognize on-disk extended attributes | ||
102 | * that are in a recognized namespace. If the attribute is not recognized, | ||
103 | * "os2." is prepended to the name | ||
104 | */ | ||
105 | static inline int is_os2_xattr(struct jfs_ea *ea) | ||
106 | { | ||
107 | /* | ||
108 | * Check for "system." | ||
109 | */ | ||
110 | if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) && | ||
111 | !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) | ||
112 | return FALSE; | ||
113 | /* | ||
114 | * Check for "user." | ||
115 | */ | ||
116 | if ((ea->namelen >= XATTR_USER_PREFIX_LEN) && | ||
117 | !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) | ||
118 | return FALSE; | ||
119 | /* | ||
120 | * Check for "security." | ||
121 | */ | ||
122 | if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) && | ||
123 | !strncmp(ea->name, XATTR_SECURITY_PREFIX, | ||
124 | XATTR_SECURITY_PREFIX_LEN)) | ||
125 | return FALSE; | ||
126 | /* | ||
127 | * Check for "trusted." | ||
128 | */ | ||
129 | if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) && | ||
130 | !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) | ||
131 | return FALSE; | ||
132 | /* | ||
133 | * Add any other valid namespace prefixes here | ||
134 | */ | ||
135 | |||
136 | /* | ||
137 | * We assume it's OS/2's flat namespace | ||
138 | */ | ||
139 | return TRUE; | ||
140 | } | ||
141 | |||
142 | static inline int name_size(struct jfs_ea *ea) | ||
143 | { | ||
144 | if (is_os2_xattr(ea)) | ||
145 | return ea->namelen + XATTR_OS2_PREFIX_LEN; | ||
146 | else | ||
147 | return ea->namelen; | ||
148 | } | ||
149 | |||
150 | static inline int copy_name(char *buffer, struct jfs_ea *ea) | ||
151 | { | ||
152 | int len = ea->namelen; | ||
153 | |||
154 | if (is_os2_xattr(ea)) { | ||
155 | memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN); | ||
156 | buffer += XATTR_OS2_PREFIX_LEN; | ||
157 | len += XATTR_OS2_PREFIX_LEN; | ||
158 | } | ||
159 | memcpy(buffer, ea->name, ea->namelen); | ||
160 | buffer[ea->namelen] = 0; | ||
161 | |||
162 | return len; | ||
163 | } | ||
164 | |||
165 | /* Forward references */ | ||
166 | static void ea_release(struct inode *inode, struct ea_buffer *ea_buf); | ||
167 | |||
168 | /* | ||
169 | * NAME: ea_write_inline | ||
170 | * | ||
171 | * FUNCTION: Attempt to write an EA inline if area is available | ||
172 | * | ||
173 | * PRE CONDITIONS: | ||
174 | * Already verified that the specified EA is small enough to fit inline | ||
175 | * | ||
176 | * PARAMETERS: | ||
177 | * ip - Inode pointer | ||
178 | * ealist - EA list pointer | ||
179 | * size - size of ealist in bytes | ||
180 | * ea - dxd_t structure to be filled in with necessary EA information | ||
181 | * if we successfully copy the EA inline | ||
182 | * | ||
183 | * NOTES: | ||
184 | * Checks if the inode's inline area is available. If so, copies EA inline | ||
185 | * and sets <ea> fields appropriately. Otherwise, returns failure, EA will | ||
186 | * have to be put into an extent. | ||
187 | * | ||
188 | * RETURNS: 0 for successful copy to inline area; -1 if area not available | ||
189 | */ | ||
190 | static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist, | ||
191 | int size, dxd_t * ea) | ||
192 | { | ||
193 | struct jfs_inode_info *ji = JFS_IP(ip); | ||
194 | |||
195 | /* | ||
196 | * Make sure we have an EA -- the NULL EA list is valid, but you | ||
197 | * can't copy it! | ||
198 | */ | ||
199 | if (ealist && size > sizeof (struct jfs_ea_list)) { | ||
200 | assert(size <= sizeof (ji->i_inline_ea)); | ||
201 | |||
202 | /* | ||
203 | * See if the space is available or if it is already being | ||
204 | * used for an inline EA. | ||
205 | */ | ||
206 | if (!(ji->mode2 & INLINEEA) && !(ji->ea.flag & DXD_INLINE)) | ||
207 | return -EPERM; | ||
208 | |||
209 | DXDsize(ea, size); | ||
210 | DXDlength(ea, 0); | ||
211 | DXDaddress(ea, 0); | ||
212 | memcpy(ji->i_inline_ea, ealist, size); | ||
213 | ea->flag = DXD_INLINE; | ||
214 | ji->mode2 &= ~INLINEEA; | ||
215 | } else { | ||
216 | ea->flag = 0; | ||
217 | DXDsize(ea, 0); | ||
218 | DXDlength(ea, 0); | ||
219 | DXDaddress(ea, 0); | ||
220 | |||
221 | /* Free up INLINE area */ | ||
222 | if (ji->ea.flag & DXD_INLINE) | ||
223 | ji->mode2 |= INLINEEA; | ||
224 | } | ||
225 | |||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * NAME: ea_write | ||
231 | * | ||
232 | * FUNCTION: Write an EA for an inode | ||
233 | * | ||
234 | * PRE CONDITIONS: EA has been verified | ||
235 | * | ||
236 | * PARAMETERS: | ||
237 | * ip - Inode pointer | ||
238 | * ealist - EA list pointer | ||
239 | * size - size of ealist in bytes | ||
240 | * ea - dxd_t structure to be filled in appropriately with where the | ||
241 | * EA was copied | ||
242 | * | ||
243 | * NOTES: Will write EA inline if able to, otherwise allocates blocks for an | ||
244 | * extent and synchronously writes it to those blocks. | ||
245 | * | ||
246 | * RETURNS: 0 for success; Anything else indicates failure | ||
247 | */ | ||
248 | static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, | ||
249 | dxd_t * ea) | ||
250 | { | ||
251 | struct super_block *sb = ip->i_sb; | ||
252 | struct jfs_inode_info *ji = JFS_IP(ip); | ||
253 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
254 | int nblocks; | ||
255 | s64 blkno; | ||
256 | int rc = 0, i; | ||
257 | char *cp; | ||
258 | s32 nbytes, nb; | ||
259 | s32 bytes_to_write; | ||
260 | struct metapage *mp; | ||
261 | |||
262 | /* | ||
263 | * Quick check to see if this is an in-linable EA. Short EAs | ||
264 | * and empty EAs are all in-linable, provided the space exists. | ||
265 | */ | ||
266 | if (!ealist || size <= sizeof (ji->i_inline_ea)) { | ||
267 | if (!ea_write_inline(ip, ealist, size, ea)) | ||
268 | return 0; | ||
269 | } | ||
270 | |||
271 | /* figure out how many blocks we need */ | ||
272 | nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; | ||
273 | |||
274 | /* Allocate new blocks to quota. */ | ||
275 | if (DQUOT_ALLOC_BLOCK(ip, nblocks)) { | ||
276 | return -EDQUOT; | ||
277 | } | ||
278 | |||
279 | rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); | ||
280 | if (rc) { | ||
281 | /*Rollback quota allocation. */ | ||
282 | DQUOT_FREE_BLOCK(ip, nblocks); | ||
283 | return rc; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Now have nblocks worth of storage to stuff into the FEALIST. | ||
288 | * loop over the FEALIST copying data into the buffer one page at | ||
289 | * a time. | ||
290 | */ | ||
291 | cp = (char *) ealist; | ||
292 | nbytes = size; | ||
293 | for (i = 0; i < nblocks; i += sbi->nbperpage) { | ||
294 | /* | ||
295 | * Determine how many bytes for this request, and round up to | ||
296 | * the nearest aggregate block size | ||
297 | */ | ||
298 | nb = min(PSIZE, nbytes); | ||
299 | bytes_to_write = | ||
300 | ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) | ||
301 | << sb->s_blocksize_bits; | ||
302 | |||
303 | if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) { | ||
304 | rc = -EIO; | ||
305 | goto failed; | ||
306 | } | ||
307 | |||
308 | memcpy(mp->data, cp, nb); | ||
309 | |||
310 | /* | ||
311 | * We really need a way to propagate errors for | ||
312 | * forced writes like this one. --hch | ||
313 | * | ||
314 | * (__write_metapage => release_metapage => flush_metapage) | ||
315 | */ | ||
316 | #ifdef _JFS_FIXME | ||
317 | if ((rc = flush_metapage(mp))) { | ||
318 | /* | ||
319 | * the write failed -- this means that the buffer | ||
320 | * is still assigned and the blocks are not being | ||
321 | * used. this seems like the best error recovery | ||
322 | * we can get ... | ||
323 | */ | ||
324 | goto failed; | ||
325 | } | ||
326 | #else | ||
327 | flush_metapage(mp); | ||
328 | #endif | ||
329 | |||
330 | cp += PSIZE; | ||
331 | nbytes -= nb; | ||
332 | } | ||
333 | |||
334 | ea->flag = DXD_EXTENT; | ||
335 | DXDsize(ea, le32_to_cpu(ealist->size)); | ||
336 | DXDlength(ea, nblocks); | ||
337 | DXDaddress(ea, blkno); | ||
338 | |||
339 | /* Free up INLINE area */ | ||
340 | if (ji->ea.flag & DXD_INLINE) | ||
341 | ji->mode2 |= INLINEEA; | ||
342 | |||
343 | return 0; | ||
344 | |||
345 | failed: | ||
346 | /* Rollback quota allocation. */ | ||
347 | DQUOT_FREE_BLOCK(ip, nblocks); | ||
348 | |||
349 | dbFree(ip, blkno, nblocks); | ||
350 | return rc; | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * NAME: ea_read_inline | ||
355 | * | ||
356 | * FUNCTION: Read an inlined EA into user's buffer | ||
357 | * | ||
358 | * PARAMETERS: | ||
359 | * ip - Inode pointer | ||
360 | * ealist - Pointer to buffer to fill in with EA | ||
361 | * | ||
362 | * RETURNS: 0 | ||
363 | */ | ||
364 | static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist) | ||
365 | { | ||
366 | struct jfs_inode_info *ji = JFS_IP(ip); | ||
367 | int ea_size = sizeDXD(&ji->ea); | ||
368 | |||
369 | if (ea_size == 0) { | ||
370 | ealist->size = 0; | ||
371 | return 0; | ||
372 | } | ||
373 | |||
374 | /* Sanity Check */ | ||
375 | if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea))) | ||
376 | return -EIO; | ||
377 | if (le32_to_cpu(((struct jfs_ea_list *) &ji->i_inline_ea)->size) | ||
378 | != ea_size) | ||
379 | return -EIO; | ||
380 | |||
381 | memcpy(ealist, ji->i_inline_ea, ea_size); | ||
382 | return 0; | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | * NAME: ea_read | ||
387 | * | ||
388 | * FUNCTION: copy EA data into user's buffer | ||
389 | * | ||
390 | * PARAMETERS: | ||
391 | * ip - Inode pointer | ||
392 | * ealist - Pointer to buffer to fill in with EA | ||
393 | * | ||
394 | * NOTES: If EA is inline calls ea_read_inline() to copy EA. | ||
395 | * | ||
396 | * RETURNS: 0 for success; other indicates failure | ||
397 | */ | ||
398 | static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) | ||
399 | { | ||
400 | struct super_block *sb = ip->i_sb; | ||
401 | struct jfs_inode_info *ji = JFS_IP(ip); | ||
402 | struct jfs_sb_info *sbi = JFS_SBI(sb); | ||
403 | int nblocks; | ||
404 | s64 blkno; | ||
405 | char *cp = (char *) ealist; | ||
406 | int i; | ||
407 | int nbytes, nb; | ||
408 | s32 bytes_to_read; | ||
409 | struct metapage *mp; | ||
410 | |||
411 | /* quick check for in-line EA */ | ||
412 | if (ji->ea.flag & DXD_INLINE) | ||
413 | return ea_read_inline(ip, ealist); | ||
414 | |||
415 | nbytes = sizeDXD(&ji->ea); | ||
416 | if (!nbytes) { | ||
417 | jfs_error(sb, "ea_read: nbytes is 0"); | ||
418 | return -EIO; | ||
419 | } | ||
420 | |||
421 | /* | ||
422 | * Figure out how many blocks were allocated when this EA list was | ||
423 | * originally written to disk. | ||
424 | */ | ||
425 | nblocks = lengthDXD(&ji->ea) << sbi->l2nbperpage; | ||
426 | blkno = addressDXD(&ji->ea) << sbi->l2nbperpage; | ||
427 | |||
428 | /* | ||
429 | * I have found the disk blocks which were originally used to store | ||
430 | * the FEALIST. now i loop over each contiguous block copying the | ||
431 | * data into the buffer. | ||
432 | */ | ||
433 | for (i = 0; i < nblocks; i += sbi->nbperpage) { | ||
434 | /* | ||
435 | * Determine how many bytes for this request, and round up to | ||
436 | * the nearest aggregate block size | ||
437 | */ | ||
438 | nb = min(PSIZE, nbytes); | ||
439 | bytes_to_read = | ||
440 | ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) | ||
441 | << sb->s_blocksize_bits; | ||
442 | |||
443 | if (!(mp = read_metapage(ip, blkno + i, bytes_to_read, 1))) | ||
444 | return -EIO; | ||
445 | |||
446 | memcpy(cp, mp->data, nb); | ||
447 | release_metapage(mp); | ||
448 | |||
449 | cp += PSIZE; | ||
450 | nbytes -= nb; | ||
451 | } | ||
452 | |||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * NAME: ea_get | ||
458 | * | ||
459 | * FUNCTION: Returns buffer containing existing extended attributes. | ||
460 | * The size of the buffer will be the larger of the existing | ||
461 | * attributes size, or min_size. | ||
462 | * | ||
463 | * The buffer, which may be inlined in the inode or in the | ||
464 | * page cache must be release by calling ea_release or ea_put | ||
465 | * | ||
466 | * PARAMETERS: | ||
467 | * inode - Inode pointer | ||
468 | * ea_buf - Structure to be populated with ealist and its metadata | ||
469 | * min_size- minimum size of buffer to be returned | ||
470 | * | ||
471 | * RETURNS: 0 for success; Other indicates failure | ||
472 | */ | ||
473 | static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) | ||
474 | { | ||
475 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
476 | struct super_block *sb = inode->i_sb; | ||
477 | int size; | ||
478 | int ea_size = sizeDXD(&ji->ea); | ||
479 | int blocks_needed, current_blocks; | ||
480 | s64 blkno; | ||
481 | int rc; | ||
482 | int quota_allocation = 0; | ||
483 | |||
484 | /* When fsck.jfs clears a bad ea, it doesn't clear the size */ | ||
485 | if (ji->ea.flag == 0) | ||
486 | ea_size = 0; | ||
487 | |||
488 | if (ea_size == 0) { | ||
489 | if (min_size == 0) { | ||
490 | ea_buf->flag = 0; | ||
491 | ea_buf->max_size = 0; | ||
492 | ea_buf->xattr = NULL; | ||
493 | return 0; | ||
494 | } | ||
495 | if ((min_size <= sizeof (ji->i_inline_ea)) && | ||
496 | (ji->mode2 & INLINEEA)) { | ||
497 | ea_buf->flag = EA_INLINE | EA_NEW; | ||
498 | ea_buf->max_size = sizeof (ji->i_inline_ea); | ||
499 | ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; | ||
500 | DXDlength(&ea_buf->new_ea, 0); | ||
501 | DXDaddress(&ea_buf->new_ea, 0); | ||
502 | ea_buf->new_ea.flag = DXD_INLINE; | ||
503 | DXDsize(&ea_buf->new_ea, min_size); | ||
504 | return 0; | ||
505 | } | ||
506 | current_blocks = 0; | ||
507 | } else if (ji->ea.flag & DXD_INLINE) { | ||
508 | if (min_size <= sizeof (ji->i_inline_ea)) { | ||
509 | ea_buf->flag = EA_INLINE; | ||
510 | ea_buf->max_size = sizeof (ji->i_inline_ea); | ||
511 | ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; | ||
512 | goto size_check; | ||
513 | } | ||
514 | current_blocks = 0; | ||
515 | } else { | ||
516 | if (!(ji->ea.flag & DXD_EXTENT)) { | ||
517 | jfs_error(sb, "ea_get: invalid ea.flag)"); | ||
518 | return -EIO; | ||
519 | } | ||
520 | current_blocks = (ea_size + sb->s_blocksize - 1) >> | ||
521 | sb->s_blocksize_bits; | ||
522 | } | ||
523 | size = max(min_size, ea_size); | ||
524 | |||
525 | if (size > PSIZE) { | ||
526 | /* | ||
527 | * To keep the rest of the code simple. Allocate a | ||
528 | * contiguous buffer to work with | ||
529 | */ | ||
530 | ea_buf->xattr = kmalloc(size, GFP_KERNEL); | ||
531 | if (ea_buf->xattr == NULL) | ||
532 | return -ENOMEM; | ||
533 | |||
534 | ea_buf->flag = EA_MALLOC; | ||
535 | ea_buf->max_size = (size + sb->s_blocksize - 1) & | ||
536 | ~(sb->s_blocksize - 1); | ||
537 | |||
538 | if (ea_size == 0) | ||
539 | return 0; | ||
540 | |||
541 | if ((rc = ea_read(inode, ea_buf->xattr))) { | ||
542 | kfree(ea_buf->xattr); | ||
543 | ea_buf->xattr = NULL; | ||
544 | return rc; | ||
545 | } | ||
546 | goto size_check; | ||
547 | } | ||
548 | blocks_needed = (min_size + sb->s_blocksize - 1) >> | ||
549 | sb->s_blocksize_bits; | ||
550 | |||
551 | if (blocks_needed > current_blocks) { | ||
552 | /* Allocate new blocks to quota. */ | ||
553 | if (DQUOT_ALLOC_BLOCK(inode, blocks_needed)) | ||
554 | return -EDQUOT; | ||
555 | |||
556 | quota_allocation = blocks_needed; | ||
557 | |||
558 | rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed, | ||
559 | &blkno); | ||
560 | if (rc) | ||
561 | goto clean_up; | ||
562 | |||
563 | DXDlength(&ea_buf->new_ea, blocks_needed); | ||
564 | DXDaddress(&ea_buf->new_ea, blkno); | ||
565 | ea_buf->new_ea.flag = DXD_EXTENT; | ||
566 | DXDsize(&ea_buf->new_ea, min_size); | ||
567 | |||
568 | ea_buf->flag = EA_EXTENT | EA_NEW; | ||
569 | |||
570 | ea_buf->mp = get_metapage(inode, blkno, | ||
571 | blocks_needed << sb->s_blocksize_bits, | ||
572 | 1); | ||
573 | if (ea_buf->mp == NULL) { | ||
574 | dbFree(inode, blkno, (s64) blocks_needed); | ||
575 | rc = -EIO; | ||
576 | goto clean_up; | ||
577 | } | ||
578 | ea_buf->xattr = ea_buf->mp->data; | ||
579 | ea_buf->max_size = (min_size + sb->s_blocksize - 1) & | ||
580 | ~(sb->s_blocksize - 1); | ||
581 | if (ea_size == 0) | ||
582 | return 0; | ||
583 | if ((rc = ea_read(inode, ea_buf->xattr))) { | ||
584 | discard_metapage(ea_buf->mp); | ||
585 | dbFree(inode, blkno, (s64) blocks_needed); | ||
586 | goto clean_up; | ||
587 | } | ||
588 | goto size_check; | ||
589 | } | ||
590 | ea_buf->flag = EA_EXTENT; | ||
591 | ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea), | ||
592 | lengthDXD(&ji->ea) << sb->s_blocksize_bits, | ||
593 | 1); | ||
594 | if (ea_buf->mp == NULL) { | ||
595 | rc = -EIO; | ||
596 | goto clean_up; | ||
597 | } | ||
598 | ea_buf->xattr = ea_buf->mp->data; | ||
599 | ea_buf->max_size = (ea_size + sb->s_blocksize - 1) & | ||
600 | ~(sb->s_blocksize - 1); | ||
601 | |||
602 | size_check: | ||
603 | if (EALIST_SIZE(ea_buf->xattr) != ea_size) { | ||
604 | printk(KERN_ERR "ea_get: invalid extended attribute\n"); | ||
605 | dump_mem("xattr", ea_buf->xattr, ea_size); | ||
606 | ea_release(inode, ea_buf); | ||
607 | rc = -EIO; | ||
608 | goto clean_up; | ||
609 | } | ||
610 | |||
611 | return ea_size; | ||
612 | |||
613 | clean_up: | ||
614 | /* Rollback quota allocation */ | ||
615 | if (quota_allocation) | ||
616 | DQUOT_FREE_BLOCK(inode, quota_allocation); | ||
617 | |||
618 | return (rc); | ||
619 | } | ||
620 | |||
621 | static void ea_release(struct inode *inode, struct ea_buffer *ea_buf) | ||
622 | { | ||
623 | if (ea_buf->flag & EA_MALLOC) | ||
624 | kfree(ea_buf->xattr); | ||
625 | else if (ea_buf->flag & EA_EXTENT) { | ||
626 | assert(ea_buf->mp); | ||
627 | release_metapage(ea_buf->mp); | ||
628 | |||
629 | if (ea_buf->flag & EA_NEW) | ||
630 | dbFree(inode, addressDXD(&ea_buf->new_ea), | ||
631 | lengthDXD(&ea_buf->new_ea)); | ||
632 | } | ||
633 | } | ||
634 | |||
635 | static int ea_put(struct inode *inode, struct ea_buffer *ea_buf, int new_size) | ||
636 | { | ||
637 | struct jfs_inode_info *ji = JFS_IP(inode); | ||
638 | unsigned long old_blocks, new_blocks; | ||
639 | int rc = 0; | ||
640 | tid_t tid; | ||
641 | |||
642 | if (new_size == 0) { | ||
643 | ea_release(inode, ea_buf); | ||
644 | ea_buf = NULL; | ||
645 | } else if (ea_buf->flag & EA_INLINE) { | ||
646 | assert(new_size <= sizeof (ji->i_inline_ea)); | ||
647 | ji->mode2 &= ~INLINEEA; | ||
648 | ea_buf->new_ea.flag = DXD_INLINE; | ||
649 | DXDsize(&ea_buf->new_ea, new_size); | ||
650 | DXDaddress(&ea_buf->new_ea, 0); | ||
651 | DXDlength(&ea_buf->new_ea, 0); | ||
652 | } else if (ea_buf->flag & EA_MALLOC) { | ||
653 | rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); | ||
654 | kfree(ea_buf->xattr); | ||
655 | } else if (ea_buf->flag & EA_NEW) { | ||
656 | /* We have already allocated a new dxd */ | ||
657 | flush_metapage(ea_buf->mp); | ||
658 | } else { | ||
659 | /* ->xattr must point to original ea's metapage */ | ||
660 | rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); | ||
661 | discard_metapage(ea_buf->mp); | ||
662 | } | ||
663 | if (rc) | ||
664 | return rc; | ||
665 | |||
666 | tid = txBegin(inode->i_sb, 0); | ||
667 | down(&ji->commit_sem); | ||
668 | |||
669 | old_blocks = new_blocks = 0; | ||
670 | |||
671 | if (ji->ea.flag & DXD_EXTENT) { | ||
672 | invalidate_dxd_metapages(inode, ji->ea); | ||
673 | old_blocks = lengthDXD(&ji->ea); | ||
674 | } | ||
675 | |||
676 | if (ea_buf) { | ||
677 | txEA(tid, inode, &ji->ea, &ea_buf->new_ea); | ||
678 | if (ea_buf->new_ea.flag & DXD_EXTENT) { | ||
679 | new_blocks = lengthDXD(&ea_buf->new_ea); | ||
680 | if (ji->ea.flag & DXD_INLINE) | ||
681 | ji->mode2 |= INLINEEA; | ||
682 | } | ||
683 | ji->ea = ea_buf->new_ea; | ||
684 | } else { | ||
685 | txEA(tid, inode, &ji->ea, NULL); | ||
686 | if (ji->ea.flag & DXD_INLINE) | ||
687 | ji->mode2 |= INLINEEA; | ||
688 | ji->ea.flag = 0; | ||
689 | ji->ea.size = 0; | ||
690 | } | ||
691 | |||
692 | /* If old blocks exist, they must be removed from quota allocation. */ | ||
693 | if (old_blocks) | ||
694 | DQUOT_FREE_BLOCK(inode, old_blocks); | ||
695 | |||
696 | inode->i_ctime = CURRENT_TIME; | ||
697 | rc = txCommit(tid, 1, &inode, 0); | ||
698 | txEnd(tid); | ||
699 | up(&ji->commit_sem); | ||
700 | |||
701 | return rc; | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * can_set_system_xattr | ||
706 | * | ||
707 | * This code is specific to the system.* namespace. It contains policy | ||
708 | * which doesn't belong in the main xattr codepath. | ||
709 | */ | ||
710 | static int can_set_system_xattr(struct inode *inode, const char *name, | ||
711 | const void *value, size_t value_len) | ||
712 | { | ||
713 | #ifdef CONFIG_JFS_POSIX_ACL | ||
714 | struct posix_acl *acl; | ||
715 | int rc; | ||
716 | |||
717 | if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) | ||
718 | return -EPERM; | ||
719 | |||
720 | /* | ||
721 | * XATTR_NAME_ACL_ACCESS is tied to i_mode | ||
722 | */ | ||
723 | if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) { | ||
724 | acl = posix_acl_from_xattr(value, value_len); | ||
725 | if (IS_ERR(acl)) { | ||
726 | rc = PTR_ERR(acl); | ||
727 | printk(KERN_ERR "posix_acl_from_xattr returned %d\n", | ||
728 | rc); | ||
729 | return rc; | ||
730 | } | ||
731 | if (acl) { | ||
732 | mode_t mode = inode->i_mode; | ||
733 | rc = posix_acl_equiv_mode(acl, &mode); | ||
734 | posix_acl_release(acl); | ||
735 | if (rc < 0) { | ||
736 | printk(KERN_ERR | ||
737 | "posix_acl_equiv_mode returned %d\n", | ||
738 | rc); | ||
739 | return rc; | ||
740 | } | ||
741 | inode->i_mode = mode; | ||
742 | mark_inode_dirty(inode); | ||
743 | } | ||
744 | /* | ||
745 | * We're changing the ACL. Get rid of the cached one | ||
746 | */ | ||
747 | acl =JFS_IP(inode)->i_acl; | ||
748 | if (acl != JFS_ACL_NOT_CACHED) | ||
749 | posix_acl_release(acl); | ||
750 | JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED; | ||
751 | |||
752 | return 0; | ||
753 | } else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0) { | ||
754 | acl = posix_acl_from_xattr(value, value_len); | ||
755 | if (IS_ERR(acl)) { | ||
756 | rc = PTR_ERR(acl); | ||
757 | printk(KERN_ERR "posix_acl_from_xattr returned %d\n", | ||
758 | rc); | ||
759 | return rc; | ||
760 | } | ||
761 | posix_acl_release(acl); | ||
762 | |||
763 | /* | ||
764 | * We're changing the default ACL. Get rid of the cached one | ||
765 | */ | ||
766 | acl =JFS_IP(inode)->i_default_acl; | ||
767 | if (acl && (acl != JFS_ACL_NOT_CACHED)) | ||
768 | posix_acl_release(acl); | ||
769 | JFS_IP(inode)->i_default_acl = JFS_ACL_NOT_CACHED; | ||
770 | |||
771 | return 0; | ||
772 | } | ||
773 | #endif /* CONFIG_JFS_POSIX_ACL */ | ||
774 | return -EOPNOTSUPP; | ||
775 | } | ||
776 | |||
777 | static int can_set_xattr(struct inode *inode, const char *name, | ||
778 | const void *value, size_t value_len) | ||
779 | { | ||
780 | if (IS_RDONLY(inode)) | ||
781 | return -EROFS; | ||
782 | |||
783 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode) || S_ISLNK(inode->i_mode)) | ||
784 | return -EPERM; | ||
785 | |||
786 | if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0) | ||
787 | /* | ||
788 | * "system.*" | ||
789 | */ | ||
790 | return can_set_system_xattr(inode, name, value, value_len); | ||
791 | |||
792 | if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) | ||
793 | return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); | ||
794 | |||
795 | #ifdef CONFIG_JFS_SECURITY | ||
796 | if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) | ||
797 | != 0) | ||
798 | return 0; /* Leave it to the security module */ | ||
799 | #endif | ||
800 | |||
801 | if((strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0) && | ||
802 | (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) != 0)) | ||
803 | return -EOPNOTSUPP; | ||
804 | |||
805 | if (!S_ISREG(inode->i_mode) && | ||
806 | (!S_ISDIR(inode->i_mode) || inode->i_mode &S_ISVTX)) | ||
807 | return -EPERM; | ||
808 | |||
809 | return permission(inode, MAY_WRITE, NULL); | ||
810 | } | ||
811 | |||
812 | int __jfs_setxattr(struct inode *inode, const char *name, const void *value, | ||
813 | size_t value_len, int flags) | ||
814 | { | ||
815 | struct jfs_ea_list *ealist; | ||
816 | struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL; | ||
817 | struct ea_buffer ea_buf; | ||
818 | int old_ea_size = 0; | ||
819 | int xattr_size; | ||
820 | int new_size; | ||
821 | int namelen = strlen(name); | ||
822 | char *os2name = NULL; | ||
823 | int found = 0; | ||
824 | int rc; | ||
825 | int length; | ||
826 | |||
827 | if ((rc = can_set_xattr(inode, name, value, value_len))) | ||
828 | return rc; | ||
829 | |||
830 | if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { | ||
831 | os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, | ||
832 | GFP_KERNEL); | ||
833 | if (!os2name) | ||
834 | return -ENOMEM; | ||
835 | strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); | ||
836 | name = os2name; | ||
837 | namelen -= XATTR_OS2_PREFIX_LEN; | ||
838 | } | ||
839 | |||
840 | down_write(&JFS_IP(inode)->xattr_sem); | ||
841 | |||
842 | xattr_size = ea_get(inode, &ea_buf, 0); | ||
843 | if (xattr_size < 0) { | ||
844 | rc = xattr_size; | ||
845 | goto out; | ||
846 | } | ||
847 | |||
848 | again: | ||
849 | ealist = (struct jfs_ea_list *) ea_buf.xattr; | ||
850 | new_size = sizeof (struct jfs_ea_list); | ||
851 | |||
852 | if (xattr_size) { | ||
853 | for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); | ||
854 | ea = NEXT_EA(ea)) { | ||
855 | if ((namelen == ea->namelen) && | ||
856 | (memcmp(name, ea->name, namelen) == 0)) { | ||
857 | found = 1; | ||
858 | if (flags & XATTR_CREATE) { | ||
859 | rc = -EEXIST; | ||
860 | goto release; | ||
861 | } | ||
862 | old_ea = ea; | ||
863 | old_ea_size = EA_SIZE(ea); | ||
864 | next_ea = NEXT_EA(ea); | ||
865 | } else | ||
866 | new_size += EA_SIZE(ea); | ||
867 | } | ||
868 | } | ||
869 | |||
870 | if (!found) { | ||
871 | if (flags & XATTR_REPLACE) { | ||
872 | rc = -ENODATA; | ||
873 | goto release; | ||
874 | } | ||
875 | if (value == NULL) { | ||
876 | rc = 0; | ||
877 | goto release; | ||
878 | } | ||
879 | } | ||
880 | if (value) | ||
881 | new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len; | ||
882 | |||
883 | if (new_size > ea_buf.max_size) { | ||
884 | /* | ||
885 | * We need to allocate more space for merged ea list. | ||
886 | * We should only have loop to again: once. | ||
887 | */ | ||
888 | ea_release(inode, &ea_buf); | ||
889 | xattr_size = ea_get(inode, &ea_buf, new_size); | ||
890 | if (xattr_size < 0) { | ||
891 | rc = xattr_size; | ||
892 | goto out; | ||
893 | } | ||
894 | goto again; | ||
895 | } | ||
896 | |||
897 | /* Remove old ea of the same name */ | ||
898 | if (found) { | ||
899 | /* number of bytes following target EA */ | ||
900 | length = (char *) END_EALIST(ealist) - (char *) next_ea; | ||
901 | if (length > 0) | ||
902 | memmove(old_ea, next_ea, length); | ||
903 | xattr_size -= old_ea_size; | ||
904 | } | ||
905 | |||
906 | /* Add new entry to the end */ | ||
907 | if (value) { | ||
908 | if (xattr_size == 0) | ||
909 | /* Completely new ea list */ | ||
910 | xattr_size = sizeof (struct jfs_ea_list); | ||
911 | |||
912 | ea = (struct jfs_ea *) ((char *) ealist + xattr_size); | ||
913 | ea->flag = 0; | ||
914 | ea->namelen = namelen; | ||
915 | ea->valuelen = (cpu_to_le16(value_len)); | ||
916 | memcpy(ea->name, name, namelen); | ||
917 | ea->name[namelen] = 0; | ||
918 | if (value_len) | ||
919 | memcpy(&ea->name[namelen + 1], value, value_len); | ||
920 | xattr_size += EA_SIZE(ea); | ||
921 | } | ||
922 | |||
923 | /* DEBUG - If we did this right, these number match */ | ||
924 | if (xattr_size != new_size) { | ||
925 | printk(KERN_ERR | ||
926 | "jfs_xsetattr: xattr_size = %d, new_size = %d\n", | ||
927 | xattr_size, new_size); | ||
928 | |||
929 | rc = -EINVAL; | ||
930 | goto release; | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * If we're left with an empty list, there's no ea | ||
935 | */ | ||
936 | if (new_size == sizeof (struct jfs_ea_list)) | ||
937 | new_size = 0; | ||
938 | |||
939 | ealist->size = cpu_to_le32(new_size); | ||
940 | |||
941 | rc = ea_put(inode, &ea_buf, new_size); | ||
942 | |||
943 | goto out; | ||
944 | release: | ||
945 | ea_release(inode, &ea_buf); | ||
946 | out: | ||
947 | up_write(&JFS_IP(inode)->xattr_sem); | ||
948 | |||
949 | if (os2name) | ||
950 | kfree(os2name); | ||
951 | |||
952 | return rc; | ||
953 | } | ||
954 | |||
955 | int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, | ||
956 | size_t value_len, int flags) | ||
957 | { | ||
958 | if (value == NULL) { /* empty EA, do not remove */ | ||
959 | value = ""; | ||
960 | value_len = 0; | ||
961 | } | ||
962 | |||
963 | return __jfs_setxattr(dentry->d_inode, name, value, value_len, flags); | ||
964 | } | ||
965 | |||
966 | static int can_get_xattr(struct inode *inode, const char *name) | ||
967 | { | ||
968 | #ifdef CONFIG_JFS_SECURITY | ||
969 | if(strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) | ||
970 | return 0; | ||
971 | #endif | ||
972 | |||
973 | if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) | ||
974 | return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); | ||
975 | |||
976 | if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0) | ||
977 | return 0; | ||
978 | |||
979 | return permission(inode, MAY_READ, NULL); | ||
980 | } | ||
981 | |||
982 | ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, | ||
983 | size_t buf_size) | ||
984 | { | ||
985 | struct jfs_ea_list *ealist; | ||
986 | struct jfs_ea *ea; | ||
987 | struct ea_buffer ea_buf; | ||
988 | int xattr_size; | ||
989 | ssize_t size; | ||
990 | int namelen = strlen(name); | ||
991 | char *os2name = NULL; | ||
992 | int rc; | ||
993 | char *value; | ||
994 | |||
995 | if ((rc = can_get_xattr(inode, name))) | ||
996 | return rc; | ||
997 | |||
998 | if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { | ||
999 | os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, | ||
1000 | GFP_KERNEL); | ||
1001 | if (!os2name) | ||
1002 | return -ENOMEM; | ||
1003 | strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); | ||
1004 | name = os2name; | ||
1005 | namelen -= XATTR_OS2_PREFIX_LEN; | ||
1006 | } | ||
1007 | |||
1008 | down_read(&JFS_IP(inode)->xattr_sem); | ||
1009 | |||
1010 | xattr_size = ea_get(inode, &ea_buf, 0); | ||
1011 | |||
1012 | if (xattr_size < 0) { | ||
1013 | size = xattr_size; | ||
1014 | goto out; | ||
1015 | } | ||
1016 | |||
1017 | if (xattr_size == 0) | ||
1018 | goto not_found; | ||
1019 | |||
1020 | ealist = (struct jfs_ea_list *) ea_buf.xattr; | ||
1021 | |||
1022 | /* Find the named attribute */ | ||
1023 | for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) | ||
1024 | if ((namelen == ea->namelen) && | ||
1025 | memcmp(name, ea->name, namelen) == 0) { | ||
1026 | /* Found it */ | ||
1027 | size = le16_to_cpu(ea->valuelen); | ||
1028 | if (!data) | ||
1029 | goto release; | ||
1030 | else if (size > buf_size) { | ||
1031 | size = -ERANGE; | ||
1032 | goto release; | ||
1033 | } | ||
1034 | value = ((char *) &ea->name) + ea->namelen + 1; | ||
1035 | memcpy(data, value, size); | ||
1036 | goto release; | ||
1037 | } | ||
1038 | not_found: | ||
1039 | size = -ENODATA; | ||
1040 | release: | ||
1041 | ea_release(inode, &ea_buf); | ||
1042 | out: | ||
1043 | up_read(&JFS_IP(inode)->xattr_sem); | ||
1044 | |||
1045 | if (os2name) | ||
1046 | kfree(os2name); | ||
1047 | |||
1048 | return size; | ||
1049 | } | ||
1050 | |||
1051 | ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, | ||
1052 | size_t buf_size) | ||
1053 | { | ||
1054 | int err; | ||
1055 | |||
1056 | err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); | ||
1057 | |||
1058 | return err; | ||
1059 | } | ||
1060 | |||
1061 | /* | ||
1062 | * No special permissions are needed to list attributes except for trusted.* | ||
1063 | */ | ||
1064 | static inline int can_list(struct jfs_ea *ea) | ||
1065 | { | ||
1066 | return (strncmp(ea->name, XATTR_TRUSTED_PREFIX, | ||
1067 | XATTR_TRUSTED_PREFIX_LEN) || | ||
1068 | capable(CAP_SYS_ADMIN)); | ||
1069 | } | ||
1070 | |||
1071 | ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) | ||
1072 | { | ||
1073 | struct inode *inode = dentry->d_inode; | ||
1074 | char *buffer; | ||
1075 | ssize_t size = 0; | ||
1076 | int xattr_size; | ||
1077 | struct jfs_ea_list *ealist; | ||
1078 | struct jfs_ea *ea; | ||
1079 | struct ea_buffer ea_buf; | ||
1080 | |||
1081 | down_read(&JFS_IP(inode)->xattr_sem); | ||
1082 | |||
1083 | xattr_size = ea_get(inode, &ea_buf, 0); | ||
1084 | if (xattr_size < 0) { | ||
1085 | size = xattr_size; | ||
1086 | goto out; | ||
1087 | } | ||
1088 | |||
1089 | if (xattr_size == 0) | ||
1090 | goto release; | ||
1091 | |||
1092 | ealist = (struct jfs_ea_list *) ea_buf.xattr; | ||
1093 | |||
1094 | /* compute required size of list */ | ||
1095 | for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { | ||
1096 | if (can_list(ea)) | ||
1097 | size += name_size(ea) + 1; | ||
1098 | } | ||
1099 | |||
1100 | if (!data) | ||
1101 | goto release; | ||
1102 | |||
1103 | if (size > buf_size) { | ||
1104 | size = -ERANGE; | ||
1105 | goto release; | ||
1106 | } | ||
1107 | |||
1108 | /* Copy attribute names to buffer */ | ||
1109 | buffer = data; | ||
1110 | for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { | ||
1111 | if (can_list(ea)) { | ||
1112 | int namelen = copy_name(buffer, ea); | ||
1113 | buffer += namelen + 1; | ||
1114 | } | ||
1115 | } | ||
1116 | |||
1117 | release: | ||
1118 | ea_release(inode, &ea_buf); | ||
1119 | out: | ||
1120 | up_read(&JFS_IP(inode)->xattr_sem); | ||
1121 | return size; | ||
1122 | } | ||
1123 | |||
1124 | int jfs_removexattr(struct dentry *dentry, const char *name) | ||
1125 | { | ||
1126 | return __jfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); | ||
1127 | } | ||