author	Jan Kara <jack@suse.cz>	2015-06-18 10:52:29 -0400
committer	Jan Kara <jack@suse.com>	2015-07-23 14:59:40 -0400
commit	c290ea01abb7907fde602f3ba55905ef10a37477 (patch)
tree	67b3f47105259178034ef42d096bb5accd9407a3 /fs/ext3
parent	82ff50b222d8ac645cdeba974c612c9eef01c3dd (diff)
fs: Remove ext3 filesystem driver
The functionality of ext3 is fully supported by the ext4 driver, and major distributions (SUSE, Red Hat) have already been using the ext4 driver to handle ext3 filesystems for quite some time. There is some ugliness in mm resulting from jbd cleaning buffers in a dirty page without clearing the page's dirty bit, and support for buffer bouncing in the block layer when stable pages are required exists only because of jbd. So let's remove the ext3 driver. This saves us some 28k lines of duplicated code.

Acked-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
Diffstat (limited to 'fs/ext3')
-rw-r--r--  fs/ext3/Kconfig              89
-rw-r--r--  fs/ext3/Makefile             12
-rw-r--r--  fs/ext3/acl.c               281
-rw-r--r--  fs/ext3/acl.h                72
-rw-r--r--  fs/ext3/balloc.c           2158
-rw-r--r--  fs/ext3/bitmap.c             20
-rw-r--r--  fs/ext3/dir.c               537
-rw-r--r--  fs/ext3/ext3.h             1332
-rw-r--r--  fs/ext3/ext3_jbd.c           59
-rw-r--r--  fs/ext3/file.c               79
-rw-r--r--  fs/ext3/fsync.c             109
-rw-r--r--  fs/ext3/hash.c              206
-rw-r--r--  fs/ext3/ialloc.c            706
-rw-r--r--  fs/ext3/inode.c            3574
-rw-r--r--  fs/ext3/ioctl.c             327
-rw-r--r--  fs/ext3/namei.c            2586
-rw-r--r--  fs/ext3/namei.h              27
-rw-r--r--  fs/ext3/resize.c           1117
-rw-r--r--  fs/ext3/super.c            3165
-rw-r--r--  fs/ext3/symlink.c            46
-rw-r--r--  fs/ext3/xattr.c            1330
-rw-r--r--  fs/ext3/xattr.h             136
-rw-r--r--  fs/ext3/xattr_security.c     78
-rw-r--r--  fs/ext3/xattr_trusted.c      54
-rw-r--r--  fs/ext3/xattr_user.c         58
25 files changed, 0 insertions, 18158 deletions
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
deleted file mode 100644
index e8c6ba0e4a3e..000000000000
--- a/fs/ext3/Kconfig
+++ /dev/null
@@ -1,89 +0,0 @@
1config EXT3_FS
2 tristate "Ext3 journalling file system support"
3 select JBD
4 help
5 This is the journalling version of the Second extended file system
6 (often called ext3), the de facto standard Linux file system
7 (method to organize files on a storage device) for hard disks.
8
9 The journalling code included in this driver means you do not have
10 to run e2fsck (file system checker) on your file systems after a
11 crash. The journal keeps track of any changes that were being made
12 at the time the system crashed, and can ensure that your file system
13 is consistent without the need for a lengthy check.
14
15 Other than adding the journal to the file system, the on-disk format
16 of ext3 is identical to ext2. It is possible to freely switch
17 between using the ext3 driver and the ext2 driver, as long as the
18 file system has been cleanly unmounted, or e2fsck is run on the file
19 system.
20
21 To add a journal on an existing ext2 file system or change the
22 behavior of ext3 file systems, you can use the tune2fs utility ("man
23 tune2fs"). To modify attributes of files and directories on ext3
24 file systems, use chattr ("man chattr"). You need to be using
25 e2fsprogs version 1.20 or later in order to create ext3 journals
26 (available at <http://sourceforge.net/projects/e2fsprogs/>).
27
28 To compile this file system support as a module, choose M here: the
29 module will be called ext3.
30
31config EXT3_DEFAULTS_TO_ORDERED
32 bool "Default to 'data=ordered' in ext3"
33 depends on EXT3_FS
34 default y
35 help
36 The journal mode options for ext3 have different tradeoffs
37 between when data is guaranteed to be on disk and
38 performance. The use of "data=writeback" can cause
39	  unwritten data to appear in files after a system crash or
40 power failure, which can be a security issue. However,
41 "data=ordered" mode can also result in major performance
42 problems, including seconds-long delays before an fsync()
43 call returns. For details, see:
44
45 http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
46
47 If you have been historically happy with ext3's performance,
48 data=ordered mode will be a safe choice and you should
49 answer 'y' here. If you understand the reliability and data
50 privacy issues of data=writeback and are willing to make
51 that trade off, answer 'n'.
52
53config EXT3_FS_XATTR
54 bool "Ext3 extended attributes"
55 depends on EXT3_FS
56 default y
57 help
58 Extended attributes are name:value pairs associated with inodes by
59 the kernel or by users (see the attr(5) manual page, or visit
60 <http://acl.bestbits.at/> for details).
61
62 If unsure, say N.
63
64 You need this for POSIX ACL support on ext3.
65
66config EXT3_FS_POSIX_ACL
67 bool "Ext3 POSIX Access Control Lists"
68 depends on EXT3_FS_XATTR
69 select FS_POSIX_ACL
70 help
71 Posix Access Control Lists (ACLs) support permissions for users and
72 groups beyond the owner/group/world scheme.
73
74 To learn more about Access Control Lists, visit the Posix ACLs for
75 Linux website <http://acl.bestbits.at/>.
76
77	  If you don't know what Access Control Lists are, say N.
78
79config EXT3_FS_SECURITY
80 bool "Ext3 Security Labels"
81 depends on EXT3_FS_XATTR
82 help
83 Security labels support alternative access control models
84 implemented by security modules like SELinux. This option
85 enables an extended attribute handler for file security
86 labels in the ext3 filesystem.
87
88 If you are not using a security module that requires using
89 extended attributes for file security labels, say N.
diff --git a/fs/ext3/Makefile b/fs/ext3/Makefile
deleted file mode 100644
index e77766a8b3f0..000000000000
--- a/fs/ext3/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
1#
2# Makefile for the linux ext3-filesystem routines.
3#
4
5obj-$(CONFIG_EXT3_FS) += ext3.o
6
7ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o
9
10ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
12ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
deleted file mode 100644
index 8bbaf5bcf982..000000000000
--- a/fs/ext3/acl.c
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * linux/fs/ext3/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include "ext3.h"
8#include "xattr.h"
9#include "acl.h"
10
11/*
12 * Convert from filesystem to in-memory representation.
13 */
14static struct posix_acl *
15ext3_acl_from_disk(const void *value, size_t size)
16{
17 const char *end = (char *)value + size;
18 int n, count;
19 struct posix_acl *acl;
20
21 if (!value)
22 return NULL;
23 if (size < sizeof(ext3_acl_header))
24 return ERR_PTR(-EINVAL);
25 if (((ext3_acl_header *)value)->a_version !=
26 cpu_to_le32(EXT3_ACL_VERSION))
27 return ERR_PTR(-EINVAL);
28 value = (char *)value + sizeof(ext3_acl_header);
29 count = ext3_acl_count(size);
30 if (count < 0)
31 return ERR_PTR(-EINVAL);
32 if (count == 0)
33 return NULL;
34 acl = posix_acl_alloc(count, GFP_NOFS);
35 if (!acl)
36 return ERR_PTR(-ENOMEM);
37 for (n=0; n < count; n++) {
38 ext3_acl_entry *entry =
39 (ext3_acl_entry *)value;
40 if ((char *)value + sizeof(ext3_acl_entry_short) > end)
41 goto fail;
42 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
43 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
44 switch(acl->a_entries[n].e_tag) {
45 case ACL_USER_OBJ:
46 case ACL_GROUP_OBJ:
47 case ACL_MASK:
48 case ACL_OTHER:
49 value = (char *)value +
50 sizeof(ext3_acl_entry_short);
51 break;
52
53 case ACL_USER:
54 value = (char *)value + sizeof(ext3_acl_entry);
55 if ((char *)value > end)
56 goto fail;
57 acl->a_entries[n].e_uid =
58 make_kuid(&init_user_ns,
59 le32_to_cpu(entry->e_id));
60 break;
61 case ACL_GROUP:
62 value = (char *)value + sizeof(ext3_acl_entry);
63 if ((char *)value > end)
64 goto fail;
65 acl->a_entries[n].e_gid =
66 make_kgid(&init_user_ns,
67 le32_to_cpu(entry->e_id));
68 break;
69
70 default:
71 goto fail;
72 }
73 }
74 if (value != end)
75 goto fail;
76 return acl;
77
78fail:
79 posix_acl_release(acl);
80 return ERR_PTR(-EINVAL);
81}
82
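The walk above is worth seeing outside kernel context. The sketch below is a hypothetical user-space illustration (plain uintN_t types stand in for __le16/__le32, and the tag values are the kernel's POSIX ACL tag constants, defined locally): it builds a raw ACL blob in the on-disk layout and counts its entries the same way ext3_acl_from_disk() does, with short 4-byte entries for the *_OBJ/MASK/OTHER tags and full 8-byte entries carrying an id for ACL_USER/ACL_GROUP.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* On-disk layout mirrored from fs/ext3/acl.h (little-endian fields). */
#define ACL_USER_OBJ  0x01
#define ACL_USER      0x02
#define ACL_GROUP_OBJ 0x04
#define ACL_GROUP     0x08
#define ACL_MASK      0x10
#define ACL_OTHER     0x20

struct acl_header      { uint32_t a_version; };
struct acl_entry       { uint16_t e_tag; uint16_t e_perm; uint32_t e_id; };
struct acl_entry_short { uint16_t e_tag; uint16_t e_perm; };

/* Walk a raw ACL blob roughly the way ext3_acl_from_disk() does:
 * short entries for *_OBJ/MASK/OTHER tags, full entries for USER/GROUP. */
static int count_entries(const unsigned char *p, size_t size)
{
	const unsigned char *end = p + size;
	int n = 0;

	if (size < sizeof(struct acl_header))
		return -1;
	p += sizeof(struct acl_header);
	while (p < end) {
		struct acl_entry_short e;

		if (p + sizeof(e) > end)
			return -1;
		memcpy(&e, p, sizeof(e));
		switch (e.e_tag) {
		case ACL_USER: case ACL_GROUP:
			p += sizeof(struct acl_entry);
			break;
		case ACL_USER_OBJ: case ACL_GROUP_OBJ:
		case ACL_MASK: case ACL_OTHER:
			p += sizeof(struct acl_entry_short);
			break;
		default:
			return -1;
		}
		n++;
	}
	return p == end ? n : -1;
}

int main(void)
{
	/* Build a 3-entry ACL: USER_OBJ, USER (id 1000), OTHER. */
	unsigned char buf[64];
	size_t off = 0;
	struct acl_header h = { 1 };	/* EXT3_ACL_VERSION */
	struct acl_entry_short uo = { ACL_USER_OBJ, 6 }, ot = { ACL_OTHER, 4 };
	struct acl_entry u = { ACL_USER, 6, 1000 };

	memcpy(buf + off, &h, sizeof(h));   off += sizeof(h);
	memcpy(buf + off, &uo, sizeof(uo)); off += sizeof(uo);
	memcpy(buf + off, &u, sizeof(u));   off += sizeof(u);
	memcpy(buf + off, &ot, sizeof(ot)); off += sizeof(ot);

	printf("entries: %d (blob %zu bytes)\n", count_entries(buf, off), off);
	return 0;
}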
83/*
84 * Convert from in-memory to filesystem representation.
85 */
86static void *
87ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
88{
89 ext3_acl_header *ext_acl;
90 char *e;
91 size_t n;
92
93 *size = ext3_acl_size(acl->a_count);
94 ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
95 sizeof(ext3_acl_entry), GFP_NOFS);
96 if (!ext_acl)
97 return ERR_PTR(-ENOMEM);
98 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
99 e = (char *)ext_acl + sizeof(ext3_acl_header);
100 for (n=0; n < acl->a_count; n++) {
101 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
102 ext3_acl_entry *entry = (ext3_acl_entry *)e;
103 entry->e_tag = cpu_to_le16(acl_e->e_tag);
104 entry->e_perm = cpu_to_le16(acl_e->e_perm);
105 switch(acl_e->e_tag) {
106 case ACL_USER:
107 entry->e_id = cpu_to_le32(
108 from_kuid(&init_user_ns, acl_e->e_uid));
109 e += sizeof(ext3_acl_entry);
110 break;
111 case ACL_GROUP:
112 entry->e_id = cpu_to_le32(
113 from_kgid(&init_user_ns, acl_e->e_gid));
114 e += sizeof(ext3_acl_entry);
115 break;
116
117 case ACL_USER_OBJ:
118 case ACL_GROUP_OBJ:
119 case ACL_MASK:
120 case ACL_OTHER:
121 e += sizeof(ext3_acl_entry_short);
122 break;
123
124 default:
125 goto fail;
126 }
127 }
128 return (char *)ext_acl;
129
130fail:
131 kfree(ext_acl);
132 return ERR_PTR(-EINVAL);
133}
134
135/*
136 * Inode operation get_posix_acl().
137 *
138 * inode->i_mutex: don't care
139 */
140struct posix_acl *
141ext3_get_acl(struct inode *inode, int type)
142{
143 int name_index;
144 char *value = NULL;
145 struct posix_acl *acl;
146 int retval;
147
148 switch (type) {
149 case ACL_TYPE_ACCESS:
150 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
151 break;
152 case ACL_TYPE_DEFAULT:
153 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
154 break;
155 default:
156 BUG();
157 }
158
159 retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
160 if (retval > 0) {
161 value = kmalloc(retval, GFP_NOFS);
162 if (!value)
163 return ERR_PTR(-ENOMEM);
164 retval = ext3_xattr_get(inode, name_index, "", value, retval);
165 }
166 if (retval > 0)
167 acl = ext3_acl_from_disk(value, retval);
168 else if (retval == -ENODATA || retval == -ENOSYS)
169 acl = NULL;
170 else
171 acl = ERR_PTR(retval);
172 kfree(value);
173
174 if (!IS_ERR(acl))
175 set_cached_acl(inode, type, acl);
176
177 return acl;
178}
179
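The size-then-fetch pattern above (first call ext3_xattr_get() with a NULL buffer to learn the length, then allocate and call again) is the usual xattr idiom, and user space mirrors it with getxattr(2). A minimal sketch, assuming an illustrative path whose file actually carries a POSIX ACL xattr:

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/tmp/somefile";		/* illustrative path */
	const char *name = "system.posix_acl_access";
	ssize_t size = getxattr(path, name, NULL, 0);	/* sizing call */
	char *buf;

	if (size < 0) { perror("getxattr(size)"); return 1; }
	buf = malloc(size);
	if (!buf) return 1;
	if (getxattr(path, name, buf, size) < 0) {	/* real fetch */
		perror("getxattr");
		free(buf);
		return 1;
	}
	printf("ACL xattr is %zd bytes\n", size);
	free(buf);
	return 0;
}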
180/*
181 * Set the access or default ACL of an inode.
182 *
183 * inode->i_mutex: down unless called from ext3_new_inode
184 */
185static int
186__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
187 struct posix_acl *acl)
188{
189 int name_index;
190 void *value = NULL;
191 size_t size = 0;
192 int error;
193
194 switch(type) {
195 case ACL_TYPE_ACCESS:
196 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
197 if (acl) {
198 error = posix_acl_equiv_mode(acl, &inode->i_mode);
199 if (error < 0)
200 return error;
201 else {
202 inode->i_ctime = CURRENT_TIME_SEC;
203 ext3_mark_inode_dirty(handle, inode);
204 if (error == 0)
205 acl = NULL;
206 }
207 }
208 break;
209
210 case ACL_TYPE_DEFAULT:
211 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
212 if (!S_ISDIR(inode->i_mode))
213 return acl ? -EACCES : 0;
214 break;
215
216 default:
217 return -EINVAL;
218 }
219 if (acl) {
220 value = ext3_acl_to_disk(acl, &size);
221 if (IS_ERR(value))
222 return (int)PTR_ERR(value);
223 }
224
225 error = ext3_xattr_set_handle(handle, inode, name_index, "",
226 value, size, 0);
227
228 kfree(value);
229
230 if (!error)
231 set_cached_acl(inode, type, acl);
232
233 return error;
234}
235
236int
237ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
238{
239 handle_t *handle;
240 int error, retries = 0;
241
242retry:
243 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
244 if (IS_ERR(handle))
245 return PTR_ERR(handle);
246 error = __ext3_set_acl(handle, inode, type, acl);
247 ext3_journal_stop(handle);
248 if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
249 goto retry;
250 return error;
251}
252
253/*
254 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
255 *
256 * dir->i_mutex: down
257 * inode->i_mutex: up (access to inode is still exclusive)
258 */
259int
260ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
261{
262 struct posix_acl *default_acl, *acl;
263 int error;
264
265 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
266 if (error)
267 return error;
268
269 if (default_acl) {
270 error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
271 default_acl);
272 posix_acl_release(default_acl);
273 }
274 if (acl) {
275 if (!error)
276 error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
277 acl);
278 posix_acl_release(acl);
279 }
280 return error;
281}
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
deleted file mode 100644
index ea1c69edab9e..000000000000
--- a/fs/ext3/acl.h
+++ /dev/null
@@ -1,72 +0,0 @@
1/*
2 File: fs/ext3/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/posix_acl_xattr.h>
8
9#define EXT3_ACL_VERSION 0x0001
10
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext3_acl_entry;
16
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext3_acl_entry_short;
21
22typedef struct {
23 __le32 a_version;
24} ext3_acl_header;
25
26static inline size_t ext3_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext3_acl_header) +
30 count * sizeof(ext3_acl_entry_short);
31 } else {
32 return sizeof(ext3_acl_header) +
33 4 * sizeof(ext3_acl_entry_short) +
34 (count - 4) * sizeof(ext3_acl_entry);
35 }
36}
37
38static inline int ext3_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext3_acl_header);
42 s = size - 4 * sizeof(ext3_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext3_acl_entry_short))
45 return -1;
46 return size / sizeof(ext3_acl_entry_short);
47 } else {
48 if (s % sizeof(ext3_acl_entry))
49 return -1;
50 return s / sizeof(ext3_acl_entry) + 4;
51 }
52}
53
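ext3_acl_size() and ext3_acl_count() are designed as inverses: the first four entries are assumed to be the short, id-less tags (USER_OBJ, GROUP_OBJ, MASK, OTHER), and every entry past the fourth is a full one. A quick user-space sketch of that round trip, with the struct sizes (4/4/8 bytes) hard-coded as assumptions:

#include <assert.h>
#include <stdio.h>
#include <stddef.h>

#define HDR   4u	/* sizeof(ext3_acl_header) */
#define SHORT 4u	/* sizeof(ext3_acl_entry_short) */
#define FULL  8u	/* sizeof(ext3_acl_entry) */

static size_t acl_size(int count)
{
	if (count <= 4)
		return HDR + count * SHORT;
	return HDR + 4 * SHORT + (count - 4) * FULL;
}

static int acl_count(size_t size)
{
	long s;

	size -= HDR;
	s = (long)size - 4 * SHORT;
	if (s < 0)
		return size % SHORT ? -1 : (int)(size / SHORT);
	return s % FULL ? -1 : (int)(s / FULL + 4);
}

int main(void)
{
	for (int n = 0; n <= 32; n++)
		assert(acl_count(acl_size(n)) == n);
	printf("round trip holds for 0..32 entries\n");
	return 0;
}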
54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55
56/* acl.c */
57extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
58extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60
61#else /* CONFIG_EXT3_FS_POSIX_ACL */
62#include <linux/sched.h>
63#define ext3_get_acl NULL
64#define ext3_set_acl NULL
65
66static inline int
67ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
68{
69 return 0;
70}
71#endif /* CONFIG_EXT3_FS_POSIX_ACL */
72
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
deleted file mode 100644
index 158b5d4ce067..000000000000
--- a/fs/ext3/balloc.c
+++ /dev/null
@@ -1,2158 +0,0 @@
1/*
2 * linux/fs/ext3/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/quotaops.h>
15#include <linux/blkdev.h>
16#include "ext3.h"
17
18/*
19 * balloc.c contains the block allocation and deallocation routines
20 */
21
22/*
23 * The free blocks are managed by bitmaps. A file system contains several
24 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
25 * block for inodes, N blocks for the inode table and data blocks.
26 *
27 * The file system contains group descriptors which are located after the
28 * super block. Each descriptor contains the number of the bitmap block and
29 * the free blocks count in the block. The descriptors are loaded in memory
30 * when a file system is mounted (see ext3_fill_super).
31 */
32
33
34#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
35
36/*
37 * Calculate the block group number and offset, given a block number
38 */
39static void ext3_get_group_no_and_offset(struct super_block *sb,
40 ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
41{
42 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
43
44 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
45 if (offsetp)
46 *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
47 if (blockgrpp)
48 *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
49}
50
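The decomposition above is a plain div/mod once the first data block is subtracted out. A standalone sketch with made-up geometry (8192 blocks per group and first data block 1, as on a 1 KiB-block ext3 filesystem; the constant names are only stand-ins for the superblock fields):

#include <stdio.h>

int main(void)
{
	const unsigned long first_data_block = 1;	/* s_first_data_block */
	const unsigned long blocks_per_group = 8192;	/* EXT3_BLOCKS_PER_GROUP */
	unsigned long blocknr = 20000;

	unsigned long rel    = blocknr - first_data_block;
	unsigned long group  = rel / blocks_per_group;
	unsigned long offset = rel % blocks_per_group;

	/* 20000 -> group 2, offset 3615 */
	printf("block %lu: group %lu, offset %lu\n", blocknr, group, offset);
	return 0;
}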
51/**
52 * ext3_get_group_desc() -- load group descriptor from disk
53 * @sb: super block
54 * @block_group: given block group
55 * @bh: pointer to the buffer head to store the block
56 * group descriptor
57 */
58struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
59 unsigned int block_group,
60 struct buffer_head ** bh)
61{
62 unsigned long group_desc;
63 unsigned long offset;
64 struct ext3_group_desc * desc;
65 struct ext3_sb_info *sbi = EXT3_SB(sb);
66
67 if (block_group >= sbi->s_groups_count) {
68 ext3_error (sb, "ext3_get_group_desc",
69 "block_group >= groups_count - "
70 "block_group = %d, groups_count = %lu",
71 block_group, sbi->s_groups_count);
72
73 return NULL;
74 }
75 smp_rmb();
76
77 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
78 offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
79 if (!sbi->s_group_desc[group_desc]) {
80 ext3_error (sb, "ext3_get_group_desc",
81 "Group descriptor not loaded - "
82 "block_group = %d, group_desc = %lu, desc = %lu",
83 block_group, group_desc, offset);
84 return NULL;
85 }
86
87 desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
88 if (bh)
89 *bh = sbi->s_group_desc[group_desc];
90 return desc + offset;
91}
92
93static int ext3_valid_block_bitmap(struct super_block *sb,
94 struct ext3_group_desc *desc,
95 unsigned int block_group,
96 struct buffer_head *bh)
97{
98 ext3_grpblk_t offset;
99 ext3_grpblk_t next_zero_bit;
100 ext3_fsblk_t bitmap_blk;
101 ext3_fsblk_t group_first_block;
102
103 group_first_block = ext3_group_first_block_no(sb, block_group);
104
105 /* check whether block bitmap block number is set */
106 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
107 offset = bitmap_blk - group_first_block;
108 if (!ext3_test_bit(offset, bh->b_data))
109 /* bad block bitmap */
110 goto err_out;
111
112 /* check whether the inode bitmap block number is set */
113 bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
114 offset = bitmap_blk - group_first_block;
115 if (!ext3_test_bit(offset, bh->b_data))
116 /* bad block bitmap */
117 goto err_out;
118
119 /* check whether the inode table block number is set */
120 bitmap_blk = le32_to_cpu(desc->bg_inode_table);
121 offset = bitmap_blk - group_first_block;
122 next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
123 offset + EXT3_SB(sb)->s_itb_per_group,
124 offset);
125 if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
126 /* good bitmap for inode tables */
127 return 1;
128
129err_out:
130 ext3_error(sb, __func__,
131 "Invalid block bitmap - "
132 "block_group = %d, block = %lu",
133 block_group, bitmap_blk);
134 return 0;
135}
136
137/**
138 * read_block_bitmap()
139 * @sb: super block
140 * @block_group: given block group
141 *
142 * Read the bitmap for a given block_group, and validate that the
143 * bits for the block bitmap, inode bitmap and inode table are set in it
144 *
145 * Return buffer_head on success or NULL in case of failure.
146 */
147static struct buffer_head *
148read_block_bitmap(struct super_block *sb, unsigned int block_group)
149{
150 struct ext3_group_desc * desc;
151 struct buffer_head * bh = NULL;
152 ext3_fsblk_t bitmap_blk;
153
154 desc = ext3_get_group_desc(sb, block_group, NULL);
155 if (!desc)
156 return NULL;
157 trace_ext3_read_block_bitmap(sb, block_group);
158 bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
159 bh = sb_getblk(sb, bitmap_blk);
160 if (unlikely(!bh)) {
161 ext3_error(sb, __func__,
162 "Cannot read block bitmap - "
163 "block_group = %d, block_bitmap = %u",
164 block_group, le32_to_cpu(desc->bg_block_bitmap));
165 return NULL;
166 }
167 if (likely(bh_uptodate_or_lock(bh)))
168 return bh;
169
170 if (bh_submit_read(bh) < 0) {
171 brelse(bh);
172 ext3_error(sb, __func__,
173 "Cannot read block bitmap - "
174 "block_group = %d, block_bitmap = %u",
175 block_group, le32_to_cpu(desc->bg_block_bitmap));
176 return NULL;
177 }
178 ext3_valid_block_bitmap(sb, desc, block_group, bh);
179 /*
180 * The file system was mounted not to panic on error; continue with the
181 * corrupt bitmap
182 */
183 return bh;
184}
185/*
186 * The reservation window structure operations
187 * --------------------------------------------
188 * Operations include:
189 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
190 *
191 * We use a red-black tree to represent per-filesystem reservation
192 * windows.
193 *
194 */
195
196/**
197 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
198 * @rb_root: root of per-filesystem reservation rb tree
199 * @verbose: verbose mode
200 * @fn: function which wishes to dump the reservation map
201 *
202 * If verbose is turned on, it will print the whole block reservation
203 * windows (start, end). Otherwise, it will only print out the "bad" windows,
204 * those windows that overlap with their immediate neighbors.
205 */
206#if 1
207static void __rsv_window_dump(struct rb_root *root, int verbose,
208 const char *fn)
209{
210 struct rb_node *n;
211 struct ext3_reserve_window_node *rsv, *prev;
212 int bad;
213
214restart:
215 n = rb_first(root);
216 bad = 0;
217 prev = NULL;
218
219 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
220 while (n) {
221 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
222 if (verbose)
223 printk("reservation window 0x%p "
224 "start: %lu, end: %lu\n",
225 rsv, rsv->rsv_start, rsv->rsv_end);
226 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
227 printk("Bad reservation %p (start >= end)\n",
228 rsv);
229 bad = 1;
230 }
231 if (prev && prev->rsv_end >= rsv->rsv_start) {
232 printk("Bad reservation %p (prev->end >= start)\n",
233 rsv);
234 bad = 1;
235 }
236 if (bad) {
237 if (!verbose) {
238 printk("Restarting reservation walk in verbose mode\n");
239 verbose = 1;
240 goto restart;
241 }
242 }
243 n = rb_next(n);
244 prev = rsv;
245 }
246 printk("Window map complete.\n");
247 BUG_ON(bad);
248}
249#define rsv_window_dump(root, verbose) \
250 __rsv_window_dump((root), (verbose), __func__)
251#else
252#define rsv_window_dump(root, verbose) do {} while (0)
253#endif
254
255/**
256 * goal_in_my_reservation()
257 * @rsv: inode's reservation window
258 * @grp_goal: given goal block relative to the allocation block group
259 * @group: the current allocation block group
260 * @sb: filesystem super block
261 *
262 * Test if the given goal block (group relative) is within the file's
263 * own block reservation window range.
264 *
265 * If the reservation window is outside the goal allocation group, return 0;
266 * grp_goal (given goal block) could be -1, which means no specific
267 * goal block. In this case, always return 1.
268 * If the goal block is within the reservation window, return 1;
269 * otherwise, return 0;
270 */
271static int
272goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
273 unsigned int group, struct super_block * sb)
274{
275 ext3_fsblk_t group_first_block, group_last_block;
276
277 group_first_block = ext3_group_first_block_no(sb, group);
278 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
279
280 if ((rsv->_rsv_start > group_last_block) ||
281 (rsv->_rsv_end < group_first_block))
282 return 0;
283 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
284 || (grp_goal + group_first_block > rsv->_rsv_end)))
285 return 0;
286 return 1;
287}
288
289/**
290 * search_reserve_window()
291 * @rb_root: root of reservation tree
292 * @goal: target allocation block
293 *
294 * Find the reserved window which includes the goal, or the previous one
295 * if the goal is not in any window.
296 * Returns NULL if there are no windows or if all windows start after the goal.
297 */
298static struct ext3_reserve_window_node *
299search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
300{
301 struct rb_node *n = root->rb_node;
302 struct ext3_reserve_window_node *rsv;
303
304 if (!n)
305 return NULL;
306
307 do {
308 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
309
310 if (goal < rsv->rsv_start)
311 n = n->rb_left;
312 else if (goal > rsv->rsv_end)
313 n = n->rb_right;
314 else
315 return rsv;
316 } while (n);
317 /*
318 * We've fallen off the end of the tree: the goal wasn't inside
319 * any particular node. OK, the previous node must be to one
320 * side of the interval containing the goal. If it's the RHS,
321 * we need to back up one.
322 */
323 if (rsv->rsv_start > goal) {
324 n = rb_prev(&rsv->rsv_node);
325 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
326 }
327 return rsv;
328}
329
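The tail of search_reserve_window() is the classic "containing interval, else predecessor" lookup: if the descent falls off the tree on the right-hand side of the goal, step back one node. A sketch of the same logic over a sorted array (a binary search stands in for the rb-tree descent):

#include <stdio.h>

struct win { unsigned long start, end; };

/* Return the index of the window containing goal, or of the nearest
 * window starting before it; -1 if every window starts after the goal. */
static int search_window(const struct win *w, int n, unsigned long goal)
{
	int lo = 0, hi = n - 1, ans = -1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (goal < w[mid].start)
			hi = mid - 1;			/* go left */
		else if (goal > w[mid].end) {
			ans = mid;			/* candidate predecessor */
			lo = mid + 1;
		} else
			return mid;			/* goal inside this window */
	}
	return ans;
}

int main(void)
{
	struct win w[] = { {10, 19}, {40, 49}, {80, 99} };

	printf("%d %d %d\n",
	       search_window(w, 3, 45),	/* 1: inside [40,49] */
	       search_window(w, 3, 60),	/* 1: predecessor [40,49] */
	       search_window(w, 3, 5));	/* -1: all windows after goal */
	return 0;
}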
330/**
331 * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
332 * @sb: super block
333 * @rsv: reservation window to add
334 *
335 * Must be called with rsv_lock held.
336 */
337void ext3_rsv_window_add(struct super_block *sb,
338 struct ext3_reserve_window_node *rsv)
339{
340 struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
341 struct rb_node *node = &rsv->rsv_node;
342 ext3_fsblk_t start = rsv->rsv_start;
343
344 struct rb_node ** p = &root->rb_node;
345 struct rb_node * parent = NULL;
346 struct ext3_reserve_window_node *this;
347
348 trace_ext3_rsv_window_add(sb, rsv);
349 while (*p)
350 {
351 parent = *p;
352 this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
353
354 if (start < this->rsv_start)
355 p = &(*p)->rb_left;
356 else if (start > this->rsv_end)
357 p = &(*p)->rb_right;
358 else {
359 rsv_window_dump(root, 1);
360 BUG();
361 }
362 }
363
364 rb_link_node(node, parent, p);
365 rb_insert_color(node, root);
366}
367
368/**
369 * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
370 * @sb: super block
371 * @rsv: reservation window to remove
372 *
373 * Mark the block reservation window as not allocated, and unlink it
374 * from the filesystem reservation window rb tree. Must be called with
375 * rsv_lock held.
376 */
377static void rsv_window_remove(struct super_block *sb,
378 struct ext3_reserve_window_node *rsv)
379{
380 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
381 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
382 rsv->rsv_alloc_hit = 0;
383 rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
384}
385
386/*
387 * rsv_is_empty() -- Check if the reservation window is unallocated.
388 * @rsv: given reservation window to check
389 *
390 * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
391 */
392static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
393{
394 /* a valid reservation end block could not be 0 */
395 return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
396}
397
398/**
399 * ext3_init_block_alloc_info()
400 * @inode: file inode structure
401 *
402 * Allocate and initialize the reservation window structure, and
403 * link the window to the ext3 inode structure
404 *
405 * The reservation window structure is only dynamically allocated
406 * and linked to the ext3 inode the first time the open file
407 * needs a new block. So, before every ext3_new_block(s) call, for
408 * regular files, we should check whether the reservation window
409 * structure exists or not. If it does not, this function is called.
410 * Failing to do so will result in block reservation being turned off for that
411 * open file.
412 *
413 * This function is called from ext3_get_blocks_handle(), also called
414 * when setting the reservation window size through ioctl before the file
415 * is open for write (needs block allocation).
416 *
417 * Needs truncate_mutex protection prior to calling this function.
418 */
419void ext3_init_block_alloc_info(struct inode *inode)
420{
421 struct ext3_inode_info *ei = EXT3_I(inode);
422 struct ext3_block_alloc_info *block_i;
423 struct super_block *sb = inode->i_sb;
424
425 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
426 if (block_i) {
427 struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
428
429 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
430 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
431
432 /*
433 * if filesystem is mounted with NORESERVATION, the goal
434 * reservation window size is set to zero to indicate
435 * block reservation is off
436 */
437 if (!test_opt(sb, RESERVATION))
438 rsv->rsv_goal_size = 0;
439 else
440 rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
441 rsv->rsv_alloc_hit = 0;
442 block_i->last_alloc_logical_block = 0;
443 block_i->last_alloc_physical_block = 0;
444 }
445 ei->i_block_alloc_info = block_i;
446}
447
448/**
449 * ext3_discard_reservation()
450 * @inode: inode
451 *
452 * Discard (free) the block reservation window on last file close, on
453 * truncate, or at last iput().
454 *
455 * It is being called in three cases:
456 * ext3_release_file(): last writer closes the file
457 * ext3_clear_inode(): last iput(), when nobody links to this file.
458 * ext3_truncate(): when the block indirect map is about to change.
459 *
460 */
461void ext3_discard_reservation(struct inode *inode)
462{
463 struct ext3_inode_info *ei = EXT3_I(inode);
464 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
465 struct ext3_reserve_window_node *rsv;
466 spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
467
468 if (!block_i)
469 return;
470
471 rsv = &block_i->rsv_window_node;
472 if (!rsv_is_empty(&rsv->rsv_window)) {
473 spin_lock(rsv_lock);
474 if (!rsv_is_empty(&rsv->rsv_window)) {
475 trace_ext3_discard_reservation(inode, rsv);
476 rsv_window_remove(inode->i_sb, rsv);
477 }
478 spin_unlock(rsv_lock);
479 }
480}
481
482/**
483 * ext3_free_blocks_sb() -- Free given blocks and update quota
484 * @handle: handle to this transaction
485 * @sb: super block
486 * @block: start physical block to free
487 * @count: number of blocks to free
488 * @pdquot_freed_blocks: pointer to quota
489 */
490void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
491 ext3_fsblk_t block, unsigned long count,
492 unsigned long *pdquot_freed_blocks)
493{
494 struct buffer_head *bitmap_bh = NULL;
495 struct buffer_head *gd_bh;
496 unsigned long block_group;
497 ext3_grpblk_t bit;
498 unsigned long i;
499 unsigned long overflow;
500 struct ext3_group_desc * desc;
501 struct ext3_super_block * es;
502 struct ext3_sb_info *sbi;
503 int err = 0, ret;
504 ext3_grpblk_t group_freed;
505
506 *pdquot_freed_blocks = 0;
507 sbi = EXT3_SB(sb);
508 es = sbi->s_es;
509 if (block < le32_to_cpu(es->s_first_data_block) ||
510 block + count < block ||
511 block + count > le32_to_cpu(es->s_blocks_count)) {
512 ext3_error (sb, "ext3_free_blocks",
513 "Freeing blocks not in datazone - "
514 "block = "E3FSBLK", count = %lu", block, count);
515 goto error_return;
516 }
517
518 ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
519
520do_more:
521 overflow = 0;
522 block_group = (block - le32_to_cpu(es->s_first_data_block)) /
523 EXT3_BLOCKS_PER_GROUP(sb);
524 bit = (block - le32_to_cpu(es->s_first_data_block)) %
525 EXT3_BLOCKS_PER_GROUP(sb);
526 /*
527 * Check to see if we are freeing blocks across a group
528 * boundary.
529 */
530 if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
531 overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
532 count -= overflow;
533 }
534 brelse(bitmap_bh);
535 bitmap_bh = read_block_bitmap(sb, block_group);
536 if (!bitmap_bh)
537 goto error_return;
538 desc = ext3_get_group_desc (sb, block_group, &gd_bh);
539 if (!desc)
540 goto error_return;
541
542 if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
543 in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
544 in_range (block, le32_to_cpu(desc->bg_inode_table),
545 sbi->s_itb_per_group) ||
546 in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
547 sbi->s_itb_per_group)) {
548 ext3_error (sb, "ext3_free_blocks",
549 "Freeing blocks in system zones - "
550 "Block = "E3FSBLK", count = %lu",
551 block, count);
552 goto error_return;
553 }
554
555 /*
556 * We are about to start releasing blocks in the bitmap,
557 * so we need undo access.
558 */
559 /* @@@ check errors */
560 BUFFER_TRACE(bitmap_bh, "getting undo access");
561 err = ext3_journal_get_undo_access(handle, bitmap_bh);
562 if (err)
563 goto error_return;
564
565 /*
566 * We are about to modify some metadata. Call the journal APIs
567 * to unshare ->b_data if a currently-committing transaction is
568 * using it
569 */
570 BUFFER_TRACE(gd_bh, "get_write_access");
571 err = ext3_journal_get_write_access(handle, gd_bh);
572 if (err)
573 goto error_return;
574
575 jbd_lock_bh_state(bitmap_bh);
576
577 for (i = 0, group_freed = 0; i < count; i++) {
578 /*
579 * An HJ special. This is expensive...
580 */
581#ifdef CONFIG_JBD_DEBUG
582 jbd_unlock_bh_state(bitmap_bh);
583 {
584 struct buffer_head *debug_bh;
585 debug_bh = sb_find_get_block(sb, block + i);
586 if (debug_bh) {
587 BUFFER_TRACE(debug_bh, "Deleted!");
588 if (!bh2jh(bitmap_bh)->b_committed_data)
589 BUFFER_TRACE(debug_bh,
590 "No committed data in bitmap");
591 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
592 __brelse(debug_bh);
593 }
594 }
595 jbd_lock_bh_state(bitmap_bh);
596#endif
597 if (need_resched()) {
598 jbd_unlock_bh_state(bitmap_bh);
599 cond_resched();
600 jbd_lock_bh_state(bitmap_bh);
601 }
602 /* @@@ This prevents newly-allocated data from being
603 * freed and then reallocated within the same
604 * transaction.
605 *
606 * Ideally we would want to allow that to happen, but to
607 * do so requires making journal_forget() capable of
608 * revoking the queued write of a data block, which
609 * implies blocking on the journal lock. *forget()
610 * cannot block due to truncate races.
611 *
612 * Eventually we can fix this by making journal_forget()
613 * return a status indicating whether or not it was able
614 * to revoke the buffer. On successful revoke, it is
615 * safe not to set the allocation bit in the committed
616 * bitmap, because we know that there is no outstanding
617 * activity on the buffer any more and so it is safe to
618 * reallocate it.
619 */
620 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
621 J_ASSERT_BH(bitmap_bh,
622 bh2jh(bitmap_bh)->b_committed_data != NULL);
623 ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
624 bh2jh(bitmap_bh)->b_committed_data);
625
626 /*
627 * We clear the bit in the bitmap after setting the committed
628 * data bit, because this is the reverse order to that which
629 * the allocator uses.
630 */
631 BUFFER_TRACE(bitmap_bh, "clear bit");
632 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
633 bit + i, bitmap_bh->b_data)) {
634 jbd_unlock_bh_state(bitmap_bh);
635 ext3_error(sb, __func__,
636 "bit already cleared for block "E3FSBLK,
637 block + i);
638 jbd_lock_bh_state(bitmap_bh);
639 BUFFER_TRACE(bitmap_bh, "bit already cleared");
640 } else {
641 group_freed++;
642 }
643 }
644 jbd_unlock_bh_state(bitmap_bh);
645
646 spin_lock(sb_bgl_lock(sbi, block_group));
647 le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
648 spin_unlock(sb_bgl_lock(sbi, block_group));
649 percpu_counter_add(&sbi->s_freeblocks_counter, count);
650
651 /* We dirtied the bitmap block */
652 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
653 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
654
655 /* And the group descriptor block */
656 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
657 ret = ext3_journal_dirty_metadata(handle, gd_bh);
658 if (!err) err = ret;
659 *pdquot_freed_blocks += group_freed;
660
661 if (overflow && !err) {
662 block += count;
663 count = overflow;
664 goto do_more;
665 }
666
667error_return:
668 brelse(bitmap_bh);
669 ext3_std_error(sb, err);
670 return;
671}
672
673/**
674 * ext3_free_blocks() -- Free given blocks and update quota
675 * @handle: handle for this transaction
676 * @inode: inode
677 * @block: start physical block to free
678 * @count: number of blocks to free
679 */
680void ext3_free_blocks(handle_t *handle, struct inode *inode,
681 ext3_fsblk_t block, unsigned long count)
682{
683 struct super_block *sb = inode->i_sb;
684 unsigned long dquot_freed_blocks;
685
686 trace_ext3_free_blocks(inode, block, count);
687 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
688 if (dquot_freed_blocks)
689 dquot_free_block(inode, dquot_freed_blocks);
690 return;
691}
692
693/**
694 * ext3_test_allocatable()
695 * @nr: given block number (group relative) to test
696 * @bh: bufferhead contains the bitmap of the given block group
697 *
698 * For ext3 allocations, we must not reuse any blocks which are
699 * allocated in the bitmap buffer's "last committed data" copy. This
700 * prevents deletes from freeing up the page for reuse until we have
701 * committed the delete transaction.
702 *
703 * If we didn't do this, then deleting something and reallocating it as
704 * data would allow the old block to be overwritten before the
705 * transaction committed (because we force data to disk before commit).
706 * This would lead to corruption if we crashed between overwriting the
707 * data and committing the delete.
708 *
709 * @@@ We may want to make this allocation behaviour conditional on
710 * data-writes at some point, and disable it for metadata allocations or
711 * sync-data inodes.
712 */
713static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
714{
715 int ret;
716 struct journal_head *jh = bh2jh(bh);
717
718 if (ext3_test_bit(nr, bh->b_data))
719 return 0;
720
721 jbd_lock_bh_state(bh);
722 if (!jh->b_committed_data)
723 ret = 1;
724 else
725 ret = !ext3_test_bit(nr, jh->b_committed_data);
726 jbd_unlock_bh_state(bh);
727 return ret;
728}
729
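Stated as a predicate: a block is allocatable only if its bit is clear in the working bitmap and in the journal's last-committed copy, when the latter exists. A toy model of that rule (single machine words stand in for bh->b_data and jh->b_committed_data; no locking is modeled):

#include <stdio.h>

/* A bit is allocatable only if it is clear in the working bitmap AND
 * clear in the last-committed copy (when such a copy exists). */
static int test_allocatable(int nr, unsigned long bitmap,
			    const unsigned long *committed)
{
	if (bitmap & (1UL << nr))
		return 0;			/* in use right now */
	if (!committed)
		return 1;			/* no committing transaction */
	return !(*committed & (1UL << nr));	/* also free at last commit? */
}

int main(void)
{
	unsigned long bitmap    = 0x3;	/* bits 0,1 in use now */
	unsigned long committed = 0x5;	/* bits 0,2 in use at last commit */

	/* Bit 2 is free now but was allocated at commit time, so it must
	 * not be reused until the deleting transaction commits. */
	printf("bit 2: %d, bit 3: %d\n",
	       test_allocatable(2, bitmap, &committed),	/* prints 0 */
	       test_allocatable(3, bitmap, &committed));/* prints 1 */
	return 0;
}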
730/**
731 * bitmap_search_next_usable_block()
732 * @start: the starting block (group relative) of the search
733 * @bh: bufferhead contains the block group bitmap
734 * @maxblocks: the ending block (group relative) of the reservation
735 *
736 * The bitmap search --- search forward alternately through the actual
737 * bitmap on disk and the last-committed copy in journal, until we find a
738 * bit free in both bitmaps.
739 */
740static ext3_grpblk_t
741bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
742 ext3_grpblk_t maxblocks)
743{
744 ext3_grpblk_t next;
745 struct journal_head *jh = bh2jh(bh);
746
747 while (start < maxblocks) {
748 next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
749 if (next >= maxblocks)
750 return -1;
751 if (ext3_test_allocatable(next, bh))
752 return next;
753 jbd_lock_bh_state(bh);
754 if (jh->b_committed_data)
755 start = ext3_find_next_zero_bit(jh->b_committed_data,
756 maxblocks, next);
757 jbd_unlock_bh_state(bh);
758 }
759 return -1;
760}
761
762/**
763 * find_next_usable_block()
764 * @start: the starting block (group relative) to find next
765 * allocatable block in bitmap.
766 * @bh: bufferhead contains the block group bitmap
767 * @maxblocks: the ending block (group relative) for the search
768 *
769 * Find an allocatable block in a bitmap. We honor both the bitmap and
770 * its last-committed copy (if that exists), and perform the "most
771 * appropriate allocation" algorithm of looking for a free block near
772 * the initial goal; then for a free byte somewhere in the bitmap; then
773 * for any free bit in the bitmap.
774 */
775static ext3_grpblk_t
776find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
777 ext3_grpblk_t maxblocks)
778{
779 ext3_grpblk_t here, next;
780 char *p, *r;
781
782 if (start > 0) {
783 /*
784 * The goal was occupied; search forward for a free
785 * block within the next XX blocks.
786 *
787 * end_goal is more or less random, but it has to be
788 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
789 * next 64-bit boundary is simple..
790 */
791 ext3_grpblk_t end_goal = (start + 63) & ~63;
792 if (end_goal > maxblocks)
793 end_goal = maxblocks;
794 here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
795 if (here < end_goal && ext3_test_allocatable(here, bh))
796 return here;
797 ext3_debug("Bit not found near goal\n");
798 }
799
800 here = start;
801 if (here < 0)
802 here = 0;
803
804 p = bh->b_data + (here >> 3);
805 r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
806 next = (r - bh->b_data) << 3;
807
808 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
809 return next;
810
811 /*
812 * The bitmap search --- search forward alternately through the actual
813 * bitmap and the last-committed copy until we find a bit free in
814 * both
815 */
816 here = bitmap_search_next_usable_block(here, bh, maxblocks);
817 return here;
818}
819
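Two details in find_next_usable_block() deserve unpacking: "(start + 63) & ~63" rounds the search limit up to the next multiple of 64 bits, and scanning for a 0x00 byte (the kernel's memscan()) locates eight consecutive free bits at once. A user-space sketch of both, with memchr() standing in for memscan() (they differ only in their not-found return value):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Round a bit index up to the next multiple of 64 (a no-op when
	 * it is already aligned). */
	printf("%d %d %d\n",
	       (5 + 63) & ~63,		/* 64 */
	       (64 + 63) & ~63,		/* 64 */
	       (65 + 63) & ~63);	/* 128 */

	/* A zero byte in an allocation bitmap means eight consecutive free
	 * bits; a byte-wise scan is far cheaper than testing bit by bit. */
	unsigned char bitmap[] = { 0xff, 0xff, 0xe7, 0x00, 0xff };
	unsigned char *z = memchr(bitmap, 0, sizeof(bitmap));

	if (z)
		printf("first fully-free byte at index %ld -> bit %ld\n",
		       (long)(z - bitmap), (long)(z - bitmap) * 8);
	return 0;
}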
820/**
821 * claim_block()
822 * @lock: the spin lock for this block group
823 * @block: the free block (group relative) to allocate
824 * @bh: the buffer_head contains the block group bitmap
825 *
826 * We think we can allocate this block in this bitmap. Try to set the bit.
827 * If that succeeds then check that nobody has allocated and then freed the
828 * block since we saw that it was not marked in b_committed_data. If it _was_
829 * allocated and freed then clear the bit in the bitmap again and return
830 * zero (failure).
831 */
832static inline int
833claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
834{
835 struct journal_head *jh = bh2jh(bh);
836 int ret;
837
838 if (ext3_set_bit_atomic(lock, block, bh->b_data))
839 return 0;
840 jbd_lock_bh_state(bh);
841 if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
842 ext3_clear_bit_atomic(lock, block, bh->b_data);
843 ret = 0;
844 } else {
845 ret = 1;
846 }
847 jbd_unlock_bh_state(bh);
848 return ret;
849}
850
851/**
852 * ext3_try_to_allocate()
853 * @sb: superblock
854 * @handle: handle to this transaction
855 * @group: given allocation block group
856 * @bitmap_bh: bufferhead holds the block bitmap
857 * @grp_goal: given target block within the group
858 * @count: target number of blocks to allocate
859 * @my_rsv: reservation window
860 *
861 * Attempt to allocate blocks within a given range. Set the range of allocation
862 * first, then find the first free bit(s) in the bitmap (within the range),
863 * and finally allocate the blocks by claiming the found free bits as allocated.
864 *
865 * To set the range of this allocation:
866 * if there is a reservation window, only try to allocate block(s) from the
867 * file's own reservation window;
868 * Otherwise, the allocation range starts from the given goal block and ends at
869 * the block group's last block.
870 *
871 * If we failed to allocate the desired block then we may end up crossing to a
872 * new bitmap. In that case we must release write access to the old one via
873 * ext3_journal_release_buffer(), else we'll run out of credits.
874 */
875static ext3_grpblk_t
876ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
877 struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
878 unsigned long *count, struct ext3_reserve_window *my_rsv)
879{
880 ext3_fsblk_t group_first_block;
881 ext3_grpblk_t start, end;
882 unsigned long num = 0;
883
884 /* we do allocation within the reservation window if we have a window */
885 if (my_rsv) {
886 group_first_block = ext3_group_first_block_no(sb, group);
887 if (my_rsv->_rsv_start >= group_first_block)
888 start = my_rsv->_rsv_start - group_first_block;
889 else
890			/* reservation window crosses group boundary */
891 start = 0;
892 end = my_rsv->_rsv_end - group_first_block + 1;
893 if (end > EXT3_BLOCKS_PER_GROUP(sb))
894 /* reservation window crosses group boundary */
895 end = EXT3_BLOCKS_PER_GROUP(sb);
896 if ((start <= grp_goal) && (grp_goal < end))
897 start = grp_goal;
898 else
899 grp_goal = -1;
900 } else {
901 if (grp_goal > 0)
902 start = grp_goal;
903 else
904 start = 0;
905 end = EXT3_BLOCKS_PER_GROUP(sb);
906 }
907
908 BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
909
910repeat:
911 if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
912 grp_goal = find_next_usable_block(start, bitmap_bh, end);
913 if (grp_goal < 0)
914 goto fail_access;
915 if (!my_rsv) {
916 int i;
917
918 for (i = 0; i < 7 && grp_goal > start &&
919 ext3_test_allocatable(grp_goal - 1,
920 bitmap_bh);
921 i++, grp_goal--)
922 ;
923 }
924 }
925 start = grp_goal;
926
927 if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
928 grp_goal, bitmap_bh)) {
929 /*
930 * The block was allocated by another thread, or it was
931 * allocated and then freed by another thread
932 */
933 start++;
934 grp_goal++;
935 if (start >= end)
936 goto fail_access;
937 goto repeat;
938 }
939 num++;
940 grp_goal++;
941 while (num < *count && grp_goal < end
942 && ext3_test_allocatable(grp_goal, bitmap_bh)
943 && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
944 grp_goal, bitmap_bh)) {
945 num++;
946 grp_goal++;
947 }
948 *count = num;
949 return grp_goal - num;
950fail_access:
951 *count = num;
952 return -1;
953}
954
955/**
956 * find_next_reservable_window():
957 * find a reservable space within the given range.
958 * It does not allocate the reservation window for now:
959 * alloc_new_reservation() will do the work later.
960 *
961 * @search_head: the head of the searching list;
962 * This is not necessarily the list head of the whole filesystem
963 *
964 * We have both head and start_block to assist the search
965 * for the reservable space. The list starts from head,
966 * but we will shift to the place where start_block is,
967 * then start from there, when looking for a reservable space.
968 *
969 * @my_rsv: the reservation window
970 *
971 * @sb: the super block
972 *
973 * @start_block: the first block we consider to start
974 * the real search from
975 *
976 * @last_block:
977 * the maximum block number that our goal reservable space
978 * could start from. This is normally the last block in this
979 *	group. The search ends when we find that the start of the next
980 *	possible reservable space is beyond this boundary.
981 * This could handle the cross boundary reservation window
982 * request.
983 *
984 *	Basically we search the given range (start_block, last_block),
985 *	rather than the whole reservation double linked list,
986 *	to find a free region that is of the requested size and has not
987 *	been reserved.
988 *
989 */
990static int find_next_reservable_window(
991 struct ext3_reserve_window_node *search_head,
992 struct ext3_reserve_window_node *my_rsv,
993 struct super_block * sb,
994 ext3_fsblk_t start_block,
995 ext3_fsblk_t last_block)
996{
997 struct rb_node *next;
998 struct ext3_reserve_window_node *rsv, *prev;
999 ext3_fsblk_t cur;
1000 int size = my_rsv->rsv_goal_size;
1001
1002 /* TODO: make the start of the reservation window byte-aligned */
1003 /* cur = *start_block & ~7;*/
1004 cur = start_block;
1005 rsv = search_head;
1006 if (!rsv)
1007 return -1;
1008
1009 while (1) {
1010 if (cur <= rsv->rsv_end)
1011 cur = rsv->rsv_end + 1;
1012
1013 /* TODO?
1014		 * in the case we could not find a reservable space
1015		 * of the expected size, during the re-search we could
1016		 * remember the largest reservable space we found
1017		 * and return that one.
1018 *
1019 * For now it will fail if we could not find the reservable
1020 * space with expected-size (or more)...
1021 */
1022 if (cur > last_block)
1023 return -1; /* fail */
1024
1025 prev = rsv;
1026 next = rb_next(&rsv->rsv_node);
1027		rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
1028
1029 /*
1030 * Reached the last reservation, we can just append to the
1031 * previous one.
1032 */
1033 if (!next)
1034 break;
1035
1036 if (cur + size <= rsv->rsv_start) {
1037 /*
1038			 * Found a reservable space big enough. We could
1039 * have a reservation across the group boundary here
1040 */
1041 break;
1042 }
1043 }
1044 /*
1045	 * We come here either:
1046	 * when we reach the end of the whole list,
1047	 * and there is empty reservable space after the last entry in the list;
1048	 * append it to the end of the list.
1049 *
1050 * or we found one reservable space in the middle of the list,
1051 * return the reservation window that we could append to.
1052 * succeed.
1053 */
1054
1055 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
1056 rsv_window_remove(sb, my_rsv);
1057
1058 /*
1059 * Let's book the whole available window for now. We will check the
1060 * disk bitmap later and then, if there are free blocks then we adjust
1061 * the window size if it's larger than requested.
1062	 * Otherwise, we will remove this node from the tree the next time
1063	 * we call find_next_reservable_window.
1064 */
1065 my_rsv->rsv_start = cur;
1066 my_rsv->rsv_end = cur + size - 1;
1067 my_rsv->rsv_alloc_hit = 0;
1068
1069 if (prev != my_rsv)
1070 ext3_rsv_window_add(sb, my_rsv);
1071
1072 return 0;
1073}
1074
1075/**
1076 * alloc_new_reservation()--allocate a new reservation window
1077 *
1078 * To make a new reservation, we search part of the filesystem
1079 *	reservation list (the part of the list inside the group). We try to
1080 * allocate a new reservation window near the allocation goal,
1081 * or the beginning of the group, if there is no goal.
1082 *
1083 * We first find a reservable space after the goal, then from
1084 * there, we check the bitmap for the first free block after
1085 * it. If there is no free block until the end of group, then the
1086 * whole group is full, we failed. Otherwise, check if the free
1087 * block is inside the expected reservable space, if so, we
1088 * succeed.
1089 * If the first free block is outside the reservable space, then
1090 * start from the first free block, we search for next available
1091 * space, and go on.
1092 *
1093 *	On success, a new reservation will be found and inserted into the list.
1094 * It contains at least one free block, and it does not overlap with other
1095 * reservation windows.
1096 *
1097 * failed: we failed to find a reservation window in this group
1098 *
1099 * @my_rsv: the reservation window
1100 *
1101 * @grp_goal: The goal (group-relative). It is where the search for a
1102 * free reservable space should start from.
1103 *	if we have a grp_goal (grp_goal > 0), then start from there;
1104 *	with no grp_goal (grp_goal = -1), we start from the first block
1105 * of the group.
1106 *
1107 * @sb: the super block
1108 * @group: the group we are trying to allocate in
1109 * @bitmap_bh: the block group block bitmap
1110 *
1111 */
1112static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
1113 ext3_grpblk_t grp_goal, struct super_block *sb,
1114 unsigned int group, struct buffer_head *bitmap_bh)
1115{
1116 struct ext3_reserve_window_node *search_head;
1117 ext3_fsblk_t group_first_block, group_end_block, start_block;
1118 ext3_grpblk_t first_free_block;
1119 struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
1120 unsigned long size;
1121 int ret;
1122 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1123
1124 group_first_block = ext3_group_first_block_no(sb, group);
1125 group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1126
1127 if (grp_goal < 0)
1128 start_block = group_first_block;
1129 else
1130 start_block = grp_goal + group_first_block;
1131
1132 trace_ext3_alloc_new_reservation(sb, start_block);
1133 size = my_rsv->rsv_goal_size;
1134
1135 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1136 /*
1137		 * if the old reservation crosses the group boundary
1138 * and if the goal is inside the old reservation window,
1139 * we will come here when we just failed to allocate from
1140 * the first part of the window. We still have another part
1141 * that belongs to the next group. In this case, there is no
1142		 * point in discarding our window and trying to allocate a new one
1143		 * in this group (which will fail); we should
1144		 * keep the reservation window and simply move on.
1145 *
1146 * Maybe we could shift the start block of the reservation
1147 * window to the first block of next group.
1148 */
1149
1150 if ((my_rsv->rsv_start <= group_end_block) &&
1151 (my_rsv->rsv_end > group_end_block) &&
1152 (start_block >= my_rsv->rsv_start))
1153 return -1;
1154
1155 if ((my_rsv->rsv_alloc_hit >
1156 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1157 /*
1158			 * if the previous allocation hit ratio is
1159 * greater than 1/2, then we double the size of
1160 * the reservation window the next time,
1161 * otherwise we keep the same size window
1162 */
1163 size = size * 2;
1164 if (size > EXT3_MAX_RESERVE_BLOCKS)
1165 size = EXT3_MAX_RESERVE_BLOCKS;
1166			my_rsv->rsv_goal_size = size;
1167 }
1168 }
1169
1170 spin_lock(rsv_lock);
1171 /*
1172 * shift the search start to the window near the goal block
1173 */
1174 search_head = search_reserve_window(fs_rsv_root, start_block);
1175
1176 /*
1177 * find_next_reservable_window() simply finds a reservable window
1178	 * inside the given range (start_block, group_end_block).
1179 *
1180 * To make sure the reservation window has a free bit inside it, we
1181 * need to check the bitmap after we found a reservable window.
1182 */
1183retry:
1184 ret = find_next_reservable_window(search_head, my_rsv, sb,
1185 start_block, group_end_block);
1186
1187 if (ret == -1) {
1188 if (!rsv_is_empty(&my_rsv->rsv_window))
1189 rsv_window_remove(sb, my_rsv);
1190 spin_unlock(rsv_lock);
1191 return -1;
1192 }
1193
1194 /*
1195 * On success, find_next_reservable_window() returns the
1196 * reservation window where there is a reservable space after it.
1197 * Before we reserve this reservable space, we need
1198 * to make sure there is at least a free block inside this region.
1199 *
1200	 * Search the first free bit on the block bitmap and the copy of the
1201	 * last committed bitmap alternately, until we find an allocatable
1202	 * block. The search starts from the start block of the reservable space
1203	 * we just found.
1204 */
1205 spin_unlock(rsv_lock);
1206 first_free_block = bitmap_search_next_usable_block(
1207 my_rsv->rsv_start - group_first_block,
1208 bitmap_bh, group_end_block - group_first_block + 1);
1209
1210 if (first_free_block < 0) {
1211 /*
1212 * no free block left on the bitmap, no point
1213		 * in reserving the space; return failure.
1214 */
1215 spin_lock(rsv_lock);
1216 if (!rsv_is_empty(&my_rsv->rsv_window))
1217 rsv_window_remove(sb, my_rsv);
1218 spin_unlock(rsv_lock);
1219 return -1; /* failed */
1220 }
1221
1222 start_block = first_free_block + group_first_block;
1223 /*
1224 * check if the first free block is within the
1225 * free space we just reserved
1226 */
1227 if (start_block >= my_rsv->rsv_start &&
1228 start_block <= my_rsv->rsv_end) {
1229 trace_ext3_reserved(sb, start_block, my_rsv);
1230 return 0; /* success */
1231 }
1232 /*
1233	 * if the first free bit we found is outside the reservable space,
1234	 * continue searching for the next reservable space,
1235	 * starting from where the free block is;
1236	 * we also shift the list head to where we stopped last time
1237 */
1238 search_head = my_rsv;
1239 spin_lock(rsv_lock);
1240 goto retry;
1241}
1242
1243/**
1244 * try_to_extend_reservation()
1245 * @my_rsv: given reservation window
1246 * @sb: super block
1247 * @size: the delta to extend
1248 *
1249 * Attempt to expand the reservation window to be large enough to hold the
1250 * required number of free blocks
1251 *
1252 * Since ext3_try_to_allocate() will always allocate blocks within
1253 * the reservation window range, if the window size is too small,
1254 * a multiple-block allocation has to stop at the end of the reservation
1255 * window. To make this more efficient, given the total number of
1256 * blocks needed and the current size of the window, we try to
1257 * expand the reservation window size if necessary on a best-effort
1258 * basis before ext3_new_blocks() tries to allocate blocks.
1259 */
1260static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1261 struct super_block *sb, int size)
1262{
1263 struct ext3_reserve_window_node *next_rsv;
1264 struct rb_node *next;
1265 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1266
1267 if (!spin_trylock(rsv_lock))
1268 return;
1269
1270 next = rb_next(&my_rsv->rsv_node);
1271
1272 if (!next)
1273 my_rsv->rsv_end += size;
1274 else {
1275 next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
1276
1277 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1278 my_rsv->rsv_end += size;
1279 else
1280 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1281 }
1282 spin_unlock(rsv_lock);
1283}
1284
1285/**
1286 * ext3_try_to_allocate_with_rsv()
1287 * @sb: superblock
1288 * @handle: handle to this transaction
1289 * @group: given allocation block group
1290 * @bitmap_bh: bufferhead holds the block bitmap
1291 * @grp_goal: given target block within the group
1292 * @my_rsv: reservation window
1293 * @count: target number of blocks to allocate
1294 * @errp: pointer to store the error code
1295 *
1296 * This is the main function used to allocate a new block and its reservation
1297 * window.
1298 *
1299 * Each time a new block allocation is needed, we first try to allocate
1300 * from the inode's own reservation. If the inode does not have a
1301 * reservation window, then rather than searching the bitmap for a free
1302 * bit and checking whether it falls inside somebody else's reservation
1303 * window, we try to allocate a reservation window for the inode,
1304 * starting from the goal, and then do the block allocation within it.
1305 *
1306 * This avoids repeatedly searching the reservation list
1307 * when somebody is looking for a free block (without a
1308 * reservation), and there are lots of free blocks, but they
1309 * are all being reserved.
1310 *
1311 * We use a red-black tree for the per-filesystem reservation list.
1312 *
1313 */
1314static ext3_grpblk_t
1315ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1316 unsigned int group, struct buffer_head *bitmap_bh,
1317 ext3_grpblk_t grp_goal,
1318 struct ext3_reserve_window_node * my_rsv,
1319 unsigned long *count, int *errp)
1320{
1321 ext3_fsblk_t group_first_block, group_last_block;
1322 ext3_grpblk_t ret = 0;
1323 int fatal;
1324 unsigned long num = *count;
1325
1326 *errp = 0;
1327
1328 /*
1329 * Make sure we use undo access for the bitmap, because it is critical
1330 * that we do the frozen_data COW on bitmap buffers in all cases even
1331 * if the buffer is in BJ_Forget state in the committing transaction.
1332 */
1333 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1334 fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
1335 if (fatal) {
1336 *errp = fatal;
1337 return -1;
1338 }
1339
1340 /*
1341	 * We don't deal with reservations when the
1342	 * filesystem is mounted without reservations,
1343	 * or the file is not a regular file,
1344	 * or the last attempt to allocate a block with reservations turned on failed
1345 */
1346 if (my_rsv == NULL ) {
1347 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1348 grp_goal, count, NULL);
1349 goto out;
1350 }
1351 /*
1352 * grp_goal is a group relative block number (if there is a goal)
1353 * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
1354	 * group_first_block is a filesystem-wide block number;
1355	 * it is the block number of the first block in this group
1356 */
1357 group_first_block = ext3_group_first_block_no(sb, group);
1358 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1359
1360 /*
1361 * Basically we will allocate a new block from inode's reservation
1362 * window.
1363 *
1364 * We need to allocate a new reservation window, if:
1365 * a) inode does not have a reservation window; or
1366 * b) last attempt to allocate a block from existing reservation
1367 * failed; or
1368	 * c) we come here with a goal that is outside the existing reservation window
1369 *
1370 * We do not need to allocate a new reservation window if we come here
1371 * at the beginning with a goal and the goal is inside the window, or
1372	 * we don't have a goal but already have a reservation window;
1373	 * in either case we can allocate from the reservation window directly.
1374 */
1375 while (1) {
1376 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1377 !goal_in_my_reservation(&my_rsv->rsv_window,
1378 grp_goal, group, sb)) {
1379 if (my_rsv->rsv_goal_size < *count)
1380 my_rsv->rsv_goal_size = *count;
1381 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1382 group, bitmap_bh);
1383 if (ret < 0)
1384 break; /* failed */
1385
1386 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1387 grp_goal, group, sb))
1388 grp_goal = -1;
1389 } else if (grp_goal >= 0) {
1390 int curr = my_rsv->rsv_end -
1391 (grp_goal + group_first_block) + 1;
1392
1393 if (curr < *count)
1394 try_to_extend_reservation(my_rsv, sb,
1395 *count - curr);
1396 }
1397
1398 if ((my_rsv->rsv_start > group_last_block) ||
1399 (my_rsv->rsv_end < group_first_block)) {
1400 rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
1401 BUG();
1402 }
1403 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1404 grp_goal, &num, &my_rsv->rsv_window);
1405 if (ret >= 0) {
1406 my_rsv->rsv_alloc_hit += num;
1407 *count = num;
1408 break; /* succeed */
1409 }
1410 num = *count;
1411 }
1412out:
1413 if (ret >= 0) {
1414 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1415 "bitmap block");
1416 fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
1417 if (fatal) {
1418 *errp = fatal;
1419 return -1;
1420 }
1421 return ret;
1422 }
1423
1424 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1425 ext3_journal_release_buffer(handle, bitmap_bh);
1426 return ret;
1427}
1428
1429/**
1430 * ext3_has_free_blocks()
1431 * @sbi: in-core super block structure.
1432 * @use_reservation: whether the caller may dip into the root-reserved blocks
1433 * Check if filesystem has at least 1 free block available for allocation.
1434 */
1435static int ext3_has_free_blocks(struct ext3_sb_info *sbi, int use_reservation)
1436{
1437 ext3_fsblk_t free_blocks, root_blocks;
1438
1439 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1440 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1441 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1442 !use_reservation && !uid_eq(sbi->s_resuid, current_fsuid()) &&
1443 (gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) ||
1444 !in_group_p (sbi->s_resgid))) {
1445 return 0;
1446 }
1447 return 1;
1448}
1449
1450/**
1451 * ext3_should_retry_alloc()
1452 * @sb: super block
1453 * @retries: number of attempts that have been made
1454 *
1455 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1456 * it is profitable to retry the operation, this function will wait
1457 * for the current or committing transaction to complete, and then
1458 * return TRUE.
1459 *
1460 * If the total number of retries exceeds three, return FALSE.
1461 */
1462int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1463{
1464 if (!ext3_has_free_blocks(EXT3_SB(sb), 0) || (*retries)++ > 3)
1465 return 0;
1466
1467 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1468
1469 return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
1470}
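/*
 * A sketch of the typical caller pattern for ext3_should_retry_alloc():
 * on ENOSPC, force a commit and retry, since completing the commit may
 * release blocks held by the committing transaction. The wrapper below
 * is hypothetical and only illustrates the retry idiom.
 */
static ext3_fsblk_t example_alloc_with_retry(handle_t *handle,
					     struct inode *inode,
					     ext3_fsblk_t goal)
{
	int err, retries = 0;
	ext3_fsblk_t block;

retry:
	block = ext3_new_block(handle, inode, goal, &err);
	if (!block && err == -ENOSPC &&
	    ext3_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	return block;	/* 0 on failure, with the error code in err */
}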
1471
1472/**
1473 * ext3_new_blocks() -- core block(s) allocation function
1474 * @handle: handle to this transaction
1475 * @inode: file inode
1476 * @goal: given target block(filesystem wide)
1477 * @count: target number of blocks to allocate
1478 * @errp: error code
1479 *
1480 * ext3_new_blocks uses a goal block to assist allocation. It tries to
1481 * allocate block(s) from the block group that contains the goal block first. If that
1482 * fails, it will try to allocate block(s) from other block groups without
1483 * any specific goal block.
1484 *
1485 */
1486ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1487 ext3_fsblk_t goal, unsigned long *count, int *errp)
1488{
1489 struct buffer_head *bitmap_bh = NULL;
1490 struct buffer_head *gdp_bh;
1491 int group_no;
1492 int goal_group;
1493 ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1494 ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1495	ext3_fsblk_t ret_block;	/* filesystem-wide allocated block */
1496 int bgi; /* blockgroup iteration index */
1497 int fatal = 0, err;
1498 int performed_allocation = 0;
1499 ext3_grpblk_t free_blocks; /* number of free blocks in a group */
1500 struct super_block *sb;
1501 struct ext3_group_desc *gdp;
1502 struct ext3_super_block *es;
1503 struct ext3_sb_info *sbi;
1504 struct ext3_reserve_window_node *my_rsv = NULL;
1505 struct ext3_block_alloc_info *block_i;
1506 unsigned short windowsz = 0;
1507#ifdef EXT3FS_DEBUG
1508 static int goal_hits, goal_attempts;
1509#endif
1510 unsigned long ngroups;
1511 unsigned long num = *count;
1512
1513 *errp = -ENOSPC;
1514 sb = inode->i_sb;
1515
1516 /*
1517 * Check quota for allocation of this block.
1518 */
1519 err = dquot_alloc_block(inode, num);
1520 if (err) {
1521 *errp = err;
1522 return 0;
1523 }
1524
1525 trace_ext3_request_blocks(inode, goal, num);
1526
1527 sbi = EXT3_SB(sb);
1528 es = sbi->s_es;
1529 ext3_debug("goal=%lu.\n", goal);
1530 /*
1531	 * Allocate a block from a reservation only when the
1532	 * filesystem is mounted with reservations (the default, -o reservation),
1533	 * it's a regular file, and
1534	 * the desired window size is greater than 0 (one could use the ioctl
1535	 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
1536	 * reservations on that particular file)
1537 */
1538 block_i = EXT3_I(inode)->i_block_alloc_info;
1539 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1540 my_rsv = &block_i->rsv_window_node;
1541
1542 if (!ext3_has_free_blocks(sbi, IS_NOQUOTA(inode))) {
1543 *errp = -ENOSPC;
1544 goto out;
1545 }
1546
1547 /*
1548 * First, test whether the goal block is free.
1549 */
1550 if (goal < le32_to_cpu(es->s_first_data_block) ||
1551 goal >= le32_to_cpu(es->s_blocks_count))
1552 goal = le32_to_cpu(es->s_first_data_block);
1553 group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
1554 EXT3_BLOCKS_PER_GROUP(sb);
1555 goal_group = group_no;
1556retry_alloc:
1557 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1558 if (!gdp)
1559 goto io_error;
1560
1561 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1562 /*
1563	 * if there are not enough free blocks to make a new reservation,
1564	 * turn off reservations for this allocation
1565 */
1566 if (my_rsv && (free_blocks < windowsz)
1567 && (free_blocks > 0)
1568 && (rsv_is_empty(&my_rsv->rsv_window)))
1569 my_rsv = NULL;
1570
1571 if (free_blocks > 0) {
1572 grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
1573 EXT3_BLOCKS_PER_GROUP(sb));
1574 bitmap_bh = read_block_bitmap(sb, group_no);
1575 if (!bitmap_bh)
1576 goto io_error;
1577 grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1578 group_no, bitmap_bh, grp_target_blk,
1579 my_rsv, &num, &fatal);
1580 if (fatal)
1581 goto out;
1582 if (grp_alloc_blk >= 0)
1583 goto allocated;
1584 }
1585
1586 ngroups = EXT3_SB(sb)->s_groups_count;
1587 smp_rmb();
1588
1589 /*
1590 * Now search the rest of the groups. We assume that
1591 * group_no and gdp correctly point to the last group visited.
1592 */
1593 for (bgi = 0; bgi < ngroups; bgi++) {
1594 group_no++;
1595 if (group_no >= ngroups)
1596 group_no = 0;
1597 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1598 if (!gdp)
1599 goto io_error;
1600 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1601 /*
1602 * skip this group (and avoid loading bitmap) if there
1603 * are no free blocks
1604 */
1605 if (!free_blocks)
1606 continue;
1607 /*
1608 * skip this group if the number of
1609 * free blocks is less than half of the reservation
1610 * window size.
1611 */
1612 if (my_rsv && (free_blocks <= (windowsz/2)))
1613 continue;
1614
1615 brelse(bitmap_bh);
1616 bitmap_bh = read_block_bitmap(sb, group_no);
1617 if (!bitmap_bh)
1618 goto io_error;
1619 /*
1620		 * try to allocate block(s) from this group, without a goal (-1).
1621 */
1622 grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1623 group_no, bitmap_bh, -1, my_rsv,
1624 &num, &fatal);
1625 if (fatal)
1626 goto out;
1627 if (grp_alloc_blk >= 0)
1628 goto allocated;
1629 }
1630 /*
1631	 * We may end up with a bogus ENOSPC error earlier because the
1632	 * filesystem is "full" of reservations, while
1633	 * there are in fact free blocks available on disk.
1634	 * In this case, we just forget about the reservations and
1635	 * do the block allocation as if there were no reservations.
1636 */
1637 if (my_rsv) {
1638 my_rsv = NULL;
1639 windowsz = 0;
1640 group_no = goal_group;
1641 goto retry_alloc;
1642 }
1643 /* No space left on the device */
1644 *errp = -ENOSPC;
1645 goto out;
1646
1647allocated:
1648
1649 ext3_debug("using block group %d(%d)\n",
1650 group_no, gdp->bg_free_blocks_count);
1651
1652 BUFFER_TRACE(gdp_bh, "get_write_access");
1653 fatal = ext3_journal_get_write_access(handle, gdp_bh);
1654 if (fatal)
1655 goto out;
1656
1657 ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
1658
1659 if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
1660 in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
1661 in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
1662 EXT3_SB(sb)->s_itb_per_group) ||
1663 in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
1664 EXT3_SB(sb)->s_itb_per_group)) {
1665 ext3_error(sb, "ext3_new_block",
1666 "Allocating block in system zone - "
1667 "blocks from "E3FSBLK", length %lu",
1668 ret_block, num);
1669 /*
1670 * claim_block() marked the blocks we allocated as in use. So we
1671 * may want to selectively mark some of the blocks as free.
1672 */
1673 goto retry_alloc;
1674 }
1675
1676 performed_allocation = 1;
1677
1678#ifdef CONFIG_JBD_DEBUG
1679 {
1680 struct buffer_head *debug_bh;
1681
1682 /* Record bitmap buffer state in the newly allocated block */
1683 debug_bh = sb_find_get_block(sb, ret_block);
1684 if (debug_bh) {
1685 BUFFER_TRACE(debug_bh, "state when allocated");
1686 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1687 brelse(debug_bh);
1688 }
1689 }
1690 jbd_lock_bh_state(bitmap_bh);
1691 spin_lock(sb_bgl_lock(sbi, group_no));
1692 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1693 int i;
1694
1695 for (i = 0; i < num; i++) {
1696 if (ext3_test_bit(grp_alloc_blk+i,
1697 bh2jh(bitmap_bh)->b_committed_data)) {
1698 printk("%s: block was unexpectedly set in "
1699 "b_committed_data\n", __func__);
1700 }
1701 }
1702 }
1703 ext3_debug("found bit %d\n", grp_alloc_blk);
1704 spin_unlock(sb_bgl_lock(sbi, group_no));
1705 jbd_unlock_bh_state(bitmap_bh);
1706#endif
1707
1708 if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
1709 ext3_error(sb, "ext3_new_block",
1710 "block("E3FSBLK") >= blocks count(%d) - "
1711 "block_group = %d, es == %p ", ret_block,
1712 le32_to_cpu(es->s_blocks_count), group_no, es);
1713 goto out;
1714 }
1715
1716 /*
1717 * It is up to the caller to add the new buffer to a journal
1718 * list of some description. We don't know in advance whether
1719 * the caller wants to use it as metadata or data.
1720 */
1721 ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
1722 ret_block, goal_hits, goal_attempts);
1723
1724 spin_lock(sb_bgl_lock(sbi, group_no));
1725 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1726 spin_unlock(sb_bgl_lock(sbi, group_no));
1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1728
1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1730 fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
1731 if (fatal)
1732 goto out;
1733
1734 *errp = 0;
1735 brelse(bitmap_bh);
1736
1737 if (num < *count) {
1738 dquot_free_block(inode, *count-num);
1739 *count = num;
1740 }
1741
1742 trace_ext3_allocate_blocks(inode, goal, num,
1743 (unsigned long long)ret_block);
1744
1745 return ret_block;
1746
1747io_error:
1748 *errp = -EIO;
1749out:
1750 if (fatal) {
1751 *errp = fatal;
1752 ext3_std_error(sb, fatal);
1753 }
1754 /*
1755 * Undo the block allocation
1756 */
1757 if (!performed_allocation)
1758 dquot_free_block(inode, *count);
1759 brelse(bitmap_bh);
1760 return 0;
1761}
1762
1763ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
1764 ext3_fsblk_t goal, int *errp)
1765{
1766 unsigned long count = 1;
1767
1768 return ext3_new_blocks(handle, inode, goal, &count, errp);
1769}
1770
1771/**
1772 * ext3_count_free_blocks() -- count filesystem free blocks
1773 * @sb: superblock
1774 *
1775 * Adds up the number of free blocks from each block group.
1776 */
1777ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
1778{
1779 ext3_fsblk_t desc_count;
1780 struct ext3_group_desc *gdp;
1781 int i;
1782 unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
1783#ifdef EXT3FS_DEBUG
1784 struct ext3_super_block *es;
1785 ext3_fsblk_t bitmap_count;
1786 unsigned long x;
1787 struct buffer_head *bitmap_bh = NULL;
1788
1789 es = EXT3_SB(sb)->s_es;
1790 desc_count = 0;
1791 bitmap_count = 0;
1792 gdp = NULL;
1793
1794 smp_rmb();
1795 for (i = 0; i < ngroups; i++) {
1796 gdp = ext3_get_group_desc(sb, i, NULL);
1797 if (!gdp)
1798 continue;
1799 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1800 brelse(bitmap_bh);
1801 bitmap_bh = read_block_bitmap(sb, i);
1802 if (bitmap_bh == NULL)
1803 continue;
1804
1805 x = ext3_count_free(bitmap_bh, sb->s_blocksize);
1806 printk("group %d: stored = %d, counted = %lu\n",
1807 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1808 bitmap_count += x;
1809 }
1810 brelse(bitmap_bh);
1811 printk("ext3_count_free_blocks: stored = "E3FSBLK
1812 ", computed = "E3FSBLK", "E3FSBLK"\n",
1813 (ext3_fsblk_t)le32_to_cpu(es->s_free_blocks_count),
1814 desc_count, bitmap_count);
1815 return bitmap_count;
1816#else
1817 desc_count = 0;
1818 smp_rmb();
1819 for (i = 0; i < ngroups; i++) {
1820 gdp = ext3_get_group_desc(sb, i, NULL);
1821 if (!gdp)
1822 continue;
1823 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1824 }
1825
1826 return desc_count;
1827#endif
1828}
1829
1830static inline int test_root(int a, int b)
1831{
1832 int num = b;
1833
1834 while (a > num)
1835 num *= b;
1836 return num == a;
1837}
1838
1839static int ext3_group_sparse(int group)
1840{
1841 if (group <= 1)
1842 return 1;
1843 if (!(group & 1))
1844 return 0;
1845 return (test_root(group, 7) || test_root(group, 5) ||
1846 test_root(group, 3));
1847}
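/*
 * Worked examples of the two helpers above: test_root(243, 3) walks
 * num = 3, 9, 27, 81, 243 and returns true, while test_root(15, 3)
 * stops at 27 and returns false. Consequently, with the sparse-super
 * feature enabled, backup superblocks live only in groups 0 and 1 and
 * in odd groups that are powers of 3, 5 or 7:
 * 0, 1, 3, 5, 7, 9, 25, 27, 49, 81, 125, 243, ...
 */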
1848
1849/**
1850 * ext3_bg_has_super - number of blocks used by the superblock in group
1851 * @sb: superblock for filesystem
1852 * @group: group number to check
1853 *
1854 * Return the number of blocks used by the superblock (primary or backup)
1855 * in this group. Currently this will be only 0 or 1.
1856 */
1857int ext3_bg_has_super(struct super_block *sb, int group)
1858{
1859 if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
1860 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1861 !ext3_group_sparse(group))
1862 return 0;
1863 return 1;
1864}
1865
1866static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
1867{
1868 unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1869 unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
1870 unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;
1871
1872 if (group == first || group == first + 1 || group == last)
1873 return 1;
1874 return 0;
1875}
1876
1877static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
1878{
1879 return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
1880}
1881
1882/**
1883 * ext3_bg_num_gdb - number of blocks used by the group table in group
1884 * @sb: superblock for filesystem
1885 * @group: group number to check
1886 *
1887 * Return the number of blocks used by the group descriptor table
1888 * (primary or backup) in this group. In the future there may be a
1889 * different number of descriptor blocks in each group.
1890 */
1891unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1892{
1893 unsigned long first_meta_bg =
1894 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
1895 unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1896
1897 if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
1898 metagroup < first_meta_bg)
1899 return ext3_bg_num_gdb_nometa(sb,group);
1900
1901 return ext3_bg_num_gdb_meta(sb,group);
1902
1903}
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through the group's block bitmap searching for
1915 * free blocks. When a free block is found, it tries to claim this block and
1916 * the following free blocks to get the biggest free extent possible, until
1917 * it reaches a used block. It then issues a TRIM command on this extent and
1918 * frees the extent in the block bitmap. This is done until the whole group is scanned.
1919 */
1920static ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
1921 unsigned int group,
1922 ext3_grpblk_t start, ext3_grpblk_t max,
1923 ext3_grpblk_t minblocks)
1924{
1925 handle_t *handle;
1926 ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1927 ext3_fsblk_t discard_block;
1928 struct ext3_sb_info *sbi;
1929 struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1930 struct ext3_group_desc *gdp;
1931 int err = 0, ret = 0;
1932
1933 /*
1934 * We will update one block bitmap, and one group descriptor
1935 */
1936 handle = ext3_journal_start_sb(sb, 2);
1937 if (IS_ERR(handle))
1938 return PTR_ERR(handle);
1939
1940 bitmap_bh = read_block_bitmap(sb, group);
1941 if (!bitmap_bh) {
1942 err = -EIO;
1943 goto err_out;
1944 }
1945
1946 BUFFER_TRACE(bitmap_bh, "getting undo access");
1947 err = ext3_journal_get_undo_access(handle, bitmap_bh);
1948 if (err)
1949 goto err_out;
1950
1951 gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1952 if (!gdp) {
1953 err = -EIO;
1954 goto err_out;
1955 }
1956
1957 BUFFER_TRACE(gdp_bh, "get_write_access");
1958 err = ext3_journal_get_write_access(handle, gdp_bh);
1959 if (err)
1960 goto err_out;
1961
1962 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1963 sbi = EXT3_SB(sb);
1964
1965 /* Walk through the whole group */
1966 while (start <= max) {
1967 start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1968 if (start < 0)
1969 break;
1970 next = start;
1971
1972 /*
1973 * Allocate contiguous free extents by setting bits in the
1974 * block bitmap
1975 */
1976 while (next <= max
1977 && claim_block(sb_bgl_lock(sbi, group),
1978 next, bitmap_bh)) {
1979 next++;
1980 }
1981
1982 /* We did not claim any blocks */
1983 if (next == start)
1984 continue;
1985
1986 discard_block = (ext3_fsblk_t)start +
1987 ext3_group_first_block_no(sb, group);
1988
1989 /* Update counters */
1990 spin_lock(sb_bgl_lock(sbi, group));
1991 le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1992 spin_unlock(sb_bgl_lock(sbi, group));
1993 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1994
1995 free_blocks -= next - start;
1996 /* Do not issue a TRIM on extents smaller than minblocks */
1997 if ((next - start) < minblocks)
1998 goto free_extent;
1999
2000 trace_ext3_discard_blocks(sb, discard_block, next - start);
2001 /* Send the TRIM command down to the device */
2002 err = sb_issue_discard(sb, discard_block, next - start,
2003 GFP_NOFS, 0);
2004 count += (next - start);
2005free_extent:
2006 freed = 0;
2007
2008 /*
2009 * Clear bits in the bitmap
2010 */
2011 for (bit = start; bit < next; bit++) {
2012 BUFFER_TRACE(bitmap_bh, "clear bit");
2013 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2014 bit, bitmap_bh->b_data)) {
2015 ext3_error(sb, __func__,
2016 "bit already cleared for block "E3FSBLK,
2017 (unsigned long)bit);
2018 BUFFER_TRACE(bitmap_bh, "bit already cleared");
2019 } else {
2020 freed++;
2021 }
2022 }
2023
2024		/* Update counters */
2025 spin_lock(sb_bgl_lock(sbi, group));
2026 le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2027 spin_unlock(sb_bgl_lock(sbi, group));
2028 percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2029
2030 start = next;
2031 if (err < 0) {
2032 if (err != -EOPNOTSUPP)
2033 ext3_warning(sb, __func__, "Discard command "
2034 "returned error %d\n", err);
2035 break;
2036 }
2037
2038 if (fatal_signal_pending(current)) {
2039 err = -ERESTARTSYS;
2040 break;
2041 }
2042
2043 cond_resched();
2044
2045 /* No more suitable extents */
2046 if (free_blocks < minblocks)
2047 break;
2048 }
2049
2050 /* We dirtied the bitmap block */
2051 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2052 ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2053 if (!err)
2054 err = ret;
2055
2056 /* And the group descriptor block */
2057 BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2058 ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2059 if (!err)
2060 err = ret;
2061
2062 ext3_debug("trimmed %d blocks in the group %d\n",
2063 count, group);
2064
2065err_out:
2066 if (err)
2067 count = err;
2068 ext3_journal_stop(handle);
2069 brelse(bitmap_bh);
2070
2071 return count;
2072}
2073
2074/**
2075 * ext3_trim_fs() -- trim ioctl handler function
2076 * @sb: superblock for filesystem
2077 * @range: fstrim_range structure describing the byte range to trim
2078 * (start, len) and the minimum extent length in bytes
2079 * (minlen)
2080 *
2081 * ext3_trim_fs goes through all allocation groups containing bytes from
2082 * range->start to range->start + range->len. For each such group the
2083 * ext3_trim_all_free function is invoked to trim all free space.
2084 */
2085int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2086{
2087 ext3_grpblk_t last_block, first_block;
2088 unsigned long group, first_group, last_group;
2089 struct ext3_group_desc *gdp;
2090 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2091 uint64_t start, minlen, end, trimmed = 0;
2092 ext3_fsblk_t first_data_blk =
2093 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
2094 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2095 int ret = 0;
2096
2097 start = range->start >> sb->s_blocksize_bits;
2098 end = start + (range->len >> sb->s_blocksize_bits) - 1;
2099 minlen = range->minlen >> sb->s_blocksize_bits;
2100
2101 if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
2102 start >= max_blks ||
2103 range->len < sb->s_blocksize)
2104 return -EINVAL;
2105 if (end >= max_blks)
2106 end = max_blks - 1;
2107 if (end <= first_data_blk)
2108 goto out;
2109 if (start < first_data_blk)
2110 start = first_data_blk;
2111
2112 smp_rmb();
2113
2114 /* Determine first and last group to examine based on start and len */
2115 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2116 &first_group, &first_block);
2117 ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) end,
2118 &last_group, &last_block);
2119
2120 /* end now represents the last block to discard in this group */
2121 end = EXT3_BLOCKS_PER_GROUP(sb) - 1;
2122
2123 for (group = first_group; group <= last_group; group++) {
2124 gdp = ext3_get_group_desc(sb, group, NULL);
2125 if (!gdp)
2126 break;
2127
2128 /*
2129 * For all the groups except the last one, last block will
2130 * always be EXT3_BLOCKS_PER_GROUP(sb)-1, so we only need to
2131 * change it for the last group, note that last_block is
2132 * already computed earlier by ext3_get_group_no_and_offset()
2133 */
2134 if (group == last_group)
2135 end = last_block;
2136
2137 if (le16_to_cpu(gdp->bg_free_blocks_count) >= minlen) {
2138 ret = ext3_trim_all_free(sb, group, first_block,
2139 end, minlen);
2140 if (ret < 0)
2141 break;
2142 trimmed += ret;
2143 }
2144
2145 /*
2146 * For every group except the first one, we are sure
2147 * that the first block to discard will be block #0.
2148 */
2149 first_block = 0;
2150 }
2151
2152 if (ret > 0)
2153 ret = 0;
2154
2155out:
2156 range->len = trimmed * sb->s_blocksize;
2157 return ret;
2158}
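/*
 * A sketch of how an FITRIM ioctl handler might drive ext3_trim_fs().
 * This mirrors the usual kernel pattern but is illustrative, not a copy
 * of the real ext3_ioctl() code (which also checks CAP_SYS_ADMIN and
 * that the device supports discard): copy the range in from userspace,
 * trim, then copy it back so userspace sees how many bytes were trimmed.
 */
static long example_ioctl_fitrim(struct super_block *sb, void __user *argp)
{
	struct fstrim_range range;
	int ret;

	if (copy_from_user(&range, argp, sizeof(range)))
		return -EFAULT;

	ret = ext3_trim_fs(sb, &range);
	if (ret < 0)
		return ret;

	if (copy_to_user(argp, &range, sizeof(range)))
		return -EFAULT;

	return 0;
}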
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
deleted file mode 100644
index ef9c643e8e9d..000000000000
--- a/fs/ext3/bitmap.c
+++ /dev/null
@@ -1,20 +0,0 @@
1/*
2 * linux/fs/ext3/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include "ext3.h"
11
12#ifdef EXT3FS_DEBUG
13
14unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
15{
16 return numchars * BITS_PER_BYTE - memweight(map->b_data, numchars);
17}
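/*
 * A worked example with assumed numbers: for a 4096-byte bitmap buffer
 * (numchars = 4096) in which memweight() counts 100 set bits, this
 * returns 4096 * 8 - 100 = 32668 free blocks.
 */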
18
19#endif /* EXT3FS_DEBUG */
20
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
deleted file mode 100644
index 17742eed2c16..000000000000
--- a/fs/ext3/dir.c
+++ /dev/null
@@ -1,537 +0,0 @@
1/*
2 * linux/fs/ext3/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/compat.h>
25#include "ext3.h"
26
27static unsigned char ext3_filetype_table[] = {
28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
29};
30
31static int ext3_dx_readdir(struct file *, struct dir_context *);
32
33static unsigned char get_dtype(struct super_block *sb, int filetype)
34{
35 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
36 (filetype >= EXT3_FT_MAX))
37 return DT_UNKNOWN;
38
39 return (ext3_filetype_table[filetype]);
40}
41
42/**
43 * Check if the given dir-inode refers to an htree-indexed directory
44 * (or a directory which could potentially get converted to use htree
45 * indexing).
46 *
47 * Return 1 if it is a dx dir, 0 if not
48 */
49static int is_dx_dir(struct inode *inode)
50{
51 struct super_block *sb = inode->i_sb;
52
53 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
54 EXT3_FEATURE_COMPAT_DIR_INDEX) &&
55 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
56 ((inode->i_size >> sb->s_blocksize_bits) == 1)))
57 return 1;
58
59 return 0;
60}
61
62int ext3_check_dir_entry (const char * function, struct inode * dir,
63 struct ext3_dir_entry_2 * de,
64 struct buffer_head * bh,
65 unsigned long offset)
66{
67 const char * error_msg = NULL;
68 const int rlen = ext3_rec_len_from_disk(de->rec_len);
69
70 if (unlikely(rlen < EXT3_DIR_REC_LEN(1)))
71 error_msg = "rec_len is smaller than minimal";
72 else if (unlikely(rlen % 4 != 0))
73 error_msg = "rec_len % 4 != 0";
74 else if (unlikely(rlen < EXT3_DIR_REC_LEN(de->name_len)))
75 error_msg = "rec_len is too small for name_len";
76 else if (unlikely((((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)))
77 error_msg = "directory entry across blocks";
78 else if (unlikely(le32_to_cpu(de->inode) >
79 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count)))
80 error_msg = "inode out of bounds";
81
82 if (unlikely(error_msg != NULL))
83 ext3_error (dir->i_sb, function,
84 "bad entry in directory #%lu: %s - "
85 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
86 dir->i_ino, error_msg, offset,
87 (unsigned long) le32_to_cpu(de->inode),
88 rlen, de->name_len);
89
90 return error_msg == NULL ? 1 : 0;
91}
92
93static int ext3_readdir(struct file *file, struct dir_context *ctx)
94{
95 unsigned long offset;
96 int i;
97 struct ext3_dir_entry_2 *de;
98 int err;
99 struct inode *inode = file_inode(file);
100 struct super_block *sb = inode->i_sb;
101 int dir_has_error = 0;
102
103 if (is_dx_dir(inode)) {
104 err = ext3_dx_readdir(file, ctx);
105 if (err != ERR_BAD_DX_DIR)
106 return err;
107 /*
108 * We don't set the inode dirty flag since it's not
109 * critical that it get flushed back to the disk.
110 */
111 EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
112 }
113 offset = ctx->pos & (sb->s_blocksize - 1);
114
115 while (ctx->pos < inode->i_size) {
116 unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
117 struct buffer_head map_bh;
118 struct buffer_head *bh = NULL;
119
120 map_bh.b_state = 0;
121 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
122 if (err > 0) {
123 pgoff_t index = map_bh.b_blocknr >>
124 (PAGE_CACHE_SHIFT - inode->i_blkbits);
125 if (!ra_has_index(&file->f_ra, index))
126 page_cache_sync_readahead(
127 sb->s_bdev->bd_inode->i_mapping,
128 &file->f_ra, file,
129 index, 1);
130 file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
131 bh = ext3_bread(NULL, inode, blk, 0, &err);
132 }
133
134 /*
135 * We ignore I/O errors on directories so users have a chance
136 * of recovering data when there's a bad sector
137 */
138 if (!bh) {
139 if (!dir_has_error) {
140 ext3_error(sb, __func__, "directory #%lu "
141 "contains a hole at offset %lld",
142 inode->i_ino, ctx->pos);
143 dir_has_error = 1;
144 }
145 /* corrupt size? Maybe no more blocks to read */
146 if (ctx->pos > inode->i_blocks << 9)
147 break;
148 ctx->pos += sb->s_blocksize - offset;
149 continue;
150 }
151
152 /* If the dir block has changed since the last call to
153 * readdir(2), then we might be pointing to an invalid
154 * dirent right now. Scan from the start of the block
155 * to make sure. */
156 if (offset && file->f_version != inode->i_version) {
157 for (i = 0; i < sb->s_blocksize && i < offset; ) {
158 de = (struct ext3_dir_entry_2 *)
159 (bh->b_data + i);
160 /* It's too expensive to do a full
161 * dirent test each time round this
162 * loop, but we do have to test at
163 * least that it is non-zero. A
164 * failure will be detected in the
165 * dirent test below. */
166 if (ext3_rec_len_from_disk(de->rec_len) <
167 EXT3_DIR_REC_LEN(1))
168 break;
169 i += ext3_rec_len_from_disk(de->rec_len);
170 }
171 offset = i;
172 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
173 | offset;
174 file->f_version = inode->i_version;
175 }
176
177 while (ctx->pos < inode->i_size
178 && offset < sb->s_blocksize) {
179 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
180 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
181 bh, offset)) {
182				/* On error, skip to the
183				   next block. */
184 ctx->pos = (ctx->pos |
185 (sb->s_blocksize - 1)) + 1;
186 break;
187 }
188 offset += ext3_rec_len_from_disk(de->rec_len);
189 if (le32_to_cpu(de->inode)) {
190 if (!dir_emit(ctx, de->name, de->name_len,
191 le32_to_cpu(de->inode),
192 get_dtype(sb, de->file_type))) {
193 brelse(bh);
194 return 0;
195 }
196 }
197 ctx->pos += ext3_rec_len_from_disk(de->rec_len);
198 }
199 offset = 0;
200 brelse (bh);
201 if (ctx->pos < inode->i_size)
202 if (!dir_relax(inode))
203 return 0;
204 }
205 return 0;
206}
207
208static inline int is_32bit_api(void)
209{
210#ifdef CONFIG_COMPAT
211 return is_compat_task();
212#else
213 return (BITS_PER_LONG == 32);
214#endif
215}
216
217/*
218 * These functions convert from the major/minor hash to an f_pos
219 * value for dx directories
220 *
221 * Upper layer (for example NFS) should specify FMODE_32BITHASH or
222 * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted
223 * directly on both 32-bit and 64-bit nodes; in that case, neither
224 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
225 */
226static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
227{
228 if ((filp->f_mode & FMODE_32BITHASH) ||
229 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
230 return major >> 1;
231 else
232 return ((__u64)(major >> 1) << 32) | (__u64)minor;
233}
234
235static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
236{
237 if ((filp->f_mode & FMODE_32BITHASH) ||
238 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
239 return (pos << 1) & 0xffffffff;
240 else
241 return ((pos >> 32) << 1) & 0xffffffff;
242}
243
244static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
245{
246 if ((filp->f_mode & FMODE_32BITHASH) ||
247 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
248 return 0;
249 else
250 return pos & 0xffffffff;
251}
252
253/*
254 * Return 32- or 64-bit end-of-file for dx directories
255 */
256static inline loff_t ext3_get_htree_eof(struct file *filp)
257{
258 if ((filp->f_mode & FMODE_32BITHASH) ||
259 (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
260 return EXT3_HTREE_EOF_32BIT;
261 else
262 return EXT3_HTREE_EOF_64BIT;
263}
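/*
 * A worked example of the packing above (values assumed): in the 64-bit
 * case, major = 0x89abcdee and minor = 0x12345678 map to
 * pos = ((0x89abcdee >> 1) << 32) | 0x12345678 = 0x44d5e6f712345678;
 * pos2maj_hash() recovers ((pos >> 32) << 1) = 0x89abcdee and
 * pos2min_hash() recovers 0x12345678. The low bit of the major hash is
 * shifted out and thus never representable, and in the 32-bit case the
 * minor hash is dropped entirely.
 */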
264
265
266/*
267 * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both
268 * non-htree and htree directories, where the "offset" is in terms
269 * of the filename hash value instead of the byte offset.
270 *
271 * Because we may return a 64-bit hash that is well beyond s_maxbytes,
272 * we need to pass the max hash as the maximum allowable offset in
273 * the htree directory case.
274 *
275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
276 * will be invalid once the directory has been converted into a dx directory
277 */
278static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
279{
280 struct inode *inode = file->f_mapping->host;
281 int dx_dir = is_dx_dir(inode);
282 loff_t htree_max = ext3_get_htree_eof(file);
283
284 if (likely(dx_dir))
285 return generic_file_llseek_size(file, offset, whence,
286 htree_max, htree_max);
287 else
288 return generic_file_llseek(file, offset, whence);
289}
290
291/*
292 * This structure holds the nodes of the red-black tree used to store
293 * the directory entries in hash order.
294 */
295struct fname {
296 __u32 hash;
297 __u32 minor_hash;
298 struct rb_node rb_hash;
299 struct fname *next;
300 __u32 inode;
301 __u8 name_len;
302 __u8 file_type;
303 char name[0];
304};
305
306/*
307 * This function implements a non-recursive way of freeing all of the
308 * nodes in the red-black tree.
309 */
310static void free_rb_tree_fname(struct rb_root *root)
311{
312 struct fname *fname, *next;
313
314 rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
315 do {
316 struct fname *old = fname;
317 fname = fname->next;
318 kfree(old);
319 } while (fname);
320
321 *root = RB_ROOT;
322}
323
324static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
325 loff_t pos)
326{
327 struct dir_private_info *p;
328
329 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
330 if (!p)
331 return NULL;
332 p->curr_hash = pos2maj_hash(filp, pos);
333 p->curr_minor_hash = pos2min_hash(filp, pos);
334 return p;
335}
336
337void ext3_htree_free_dir_info(struct dir_private_info *p)
338{
339 free_rb_tree_fname(&p->root);
340 kfree(p);
341}
342
343/*
344 * Given a directory entry, enter it into the fname rb tree.
345 */
346int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
347 __u32 minor_hash,
348 struct ext3_dir_entry_2 *dirent)
349{
350 struct rb_node **p, *parent = NULL;
351 struct fname * fname, *new_fn;
352 struct dir_private_info *info;
353 int len;
354
355 info = (struct dir_private_info *) dir_file->private_data;
356 p = &info->root.rb_node;
357
358 /* Create and allocate the fname structure */
359 len = sizeof(struct fname) + dirent->name_len + 1;
360 new_fn = kzalloc(len, GFP_KERNEL);
361 if (!new_fn)
362 return -ENOMEM;
363 new_fn->hash = hash;
364 new_fn->minor_hash = minor_hash;
365 new_fn->inode = le32_to_cpu(dirent->inode);
366 new_fn->name_len = dirent->name_len;
367 new_fn->file_type = dirent->file_type;
368 memcpy(new_fn->name, dirent->name, dirent->name_len);
369 new_fn->name[dirent->name_len] = 0;
370
371 while (*p) {
372 parent = *p;
373 fname = rb_entry(parent, struct fname, rb_hash);
374
375 /*
376 * If the hash and minor hash match up, then we put
377 * them on a linked list. This rarely happens...
378 */
379 if ((new_fn->hash == fname->hash) &&
380 (new_fn->minor_hash == fname->minor_hash)) {
381 new_fn->next = fname->next;
382 fname->next = new_fn;
383 return 0;
384 }
385
386 if (new_fn->hash < fname->hash)
387 p = &(*p)->rb_left;
388 else if (new_fn->hash > fname->hash)
389 p = &(*p)->rb_right;
390 else if (new_fn->minor_hash < fname->minor_hash)
391 p = &(*p)->rb_left;
392 else /* if (new_fn->minor_hash > fname->minor_hash) */
393 p = &(*p)->rb_right;
394 }
395
396 rb_link_node(&new_fn->rb_hash, parent, p);
397 rb_insert_color(&new_fn->rb_hash, &info->root);
398 return 0;
399}
400
401
402
403/*
404 * This is a helper function for ext3_dx_readdir. It calls filldir
405 * for all entries on the fname linked list. (Normally there is only
406 * one entry on the linked list, unless there are 62 bit hash collisions.)
407 */
408static bool call_filldir(struct file *file, struct dir_context *ctx,
409 struct fname *fname)
410{
411 struct dir_private_info *info = file->private_data;
412 struct inode *inode = file_inode(file);
413 struct super_block *sb = inode->i_sb;
414
415 if (!fname) {
416 printk("call_filldir: called with null fname?!?\n");
417 return true;
418 }
419 ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
420 while (fname) {
421 if (!dir_emit(ctx, fname->name, fname->name_len,
422 fname->inode,
423 get_dtype(sb, fname->file_type))) {
424 info->extra_fname = fname;
425 return false;
426 }
427 fname = fname->next;
428 }
429 return true;
430}
431
432static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
433{
434 struct dir_private_info *info = file->private_data;
435 struct inode *inode = file_inode(file);
436 struct fname *fname;
437 int ret;
438
439 if (!info) {
440 info = ext3_htree_create_dir_info(file, ctx->pos);
441 if (!info)
442 return -ENOMEM;
443 file->private_data = info;
444 }
445
446 if (ctx->pos == ext3_get_htree_eof(file))
447 return 0; /* EOF */
448
449	/* Someone has messed with f_pos; reset the world */
450 if (info->last_pos != ctx->pos) {
451 free_rb_tree_fname(&info->root);
452 info->curr_node = NULL;
453 info->extra_fname = NULL;
454 info->curr_hash = pos2maj_hash(file, ctx->pos);
455 info->curr_minor_hash = pos2min_hash(file, ctx->pos);
456 }
457
458 /*
459 * If there are any leftover names on the hash collision
460 * chain, return them first.
461 */
462 if (info->extra_fname) {
463 if (!call_filldir(file, ctx, info->extra_fname))
464 goto finished;
465 info->extra_fname = NULL;
466 goto next_node;
467 } else if (!info->curr_node)
468 info->curr_node = rb_first(&info->root);
469
470 while (1) {
471 /*
472 * Fill the rbtree if we have no more entries,
473 * or the inode has changed since we last read in the
474 * cached entries.
475 */
476 if ((!info->curr_node) ||
477 (file->f_version != inode->i_version)) {
478 info->curr_node = NULL;
479 free_rb_tree_fname(&info->root);
480 file->f_version = inode->i_version;
481 ret = ext3_htree_fill_tree(file, info->curr_hash,
482 info->curr_minor_hash,
483 &info->next_hash);
484 if (ret < 0)
485 return ret;
486 if (ret == 0) {
487 ctx->pos = ext3_get_htree_eof(file);
488 break;
489 }
490 info->curr_node = rb_first(&info->root);
491 }
492
493 fname = rb_entry(info->curr_node, struct fname, rb_hash);
494 info->curr_hash = fname->hash;
495 info->curr_minor_hash = fname->minor_hash;
496 if (!call_filldir(file, ctx, fname))
497 break;
498 next_node:
499 info->curr_node = rb_next(info->curr_node);
500 if (info->curr_node) {
501 fname = rb_entry(info->curr_node, struct fname,
502 rb_hash);
503 info->curr_hash = fname->hash;
504 info->curr_minor_hash = fname->minor_hash;
505 } else {
506 if (info->next_hash == ~0) {
507 ctx->pos = ext3_get_htree_eof(file);
508 break;
509 }
510 info->curr_hash = info->next_hash;
511 info->curr_minor_hash = 0;
512 }
513 }
514finished:
515 info->last_pos = ctx->pos;
516 return 0;
517}
518
519static int ext3_release_dir (struct inode * inode, struct file * filp)
520{
521 if (filp->private_data)
522 ext3_htree_free_dir_info(filp->private_data);
523
524 return 0;
525}
526
527const struct file_operations ext3_dir_operations = {
528 .llseek = ext3_dir_llseek,
529 .read = generic_read_dir,
530 .iterate = ext3_readdir,
531 .unlocked_ioctl = ext3_ioctl,
532#ifdef CONFIG_COMPAT
533 .compat_ioctl = ext3_compat_ioctl,
534#endif
535 .fsync = ext3_sync_file,
536 .release = ext3_release_dir,
537};
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
deleted file mode 100644
index f483a80b3fe7..000000000000
--- a/fs/ext3/ext3.h
+++ /dev/null
@@ -1,1332 +0,0 @@
1/*
2 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
3 *
4 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
5 *
6 * This file is part of the Linux kernel and is made available under
7 * the terms of the GNU General Public License, version 2, or at your
8 * option, any later version, incorporated herein by reference.
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * from
16 *
17 * linux/include/linux/minix_fs.h
18 *
19 * Copyright (C) 1991, 1992 Linus Torvalds
20 */
21
22#include <linux/fs.h>
23#include <linux/jbd.h>
24#include <linux/magic.h>
25#include <linux/bug.h>
26#include <linux/blockgroup_lock.h>
27
28/*
29 * The second extended filesystem constants/structures
30 */
31
32/*
33 * Define EXT3FS_DEBUG to produce debug messages
34 */
35#undef EXT3FS_DEBUG
36
37/*
38 * Define EXT3_RESERVATION to reserve data blocks for expanding files
39 */
40#define EXT3_DEFAULT_RESERVE_BLOCKS 8
41/* max window size: 1024 (direct blocks) + 3 ([t,d]indirect blocks) */
42#define EXT3_MAX_RESERVE_BLOCKS 1027
43#define EXT3_RESERVE_WINDOW_NOT_ALLOCATED 0
44
45/*
46 * Debug code
47 */
48#ifdef EXT3FS_DEBUG
49#define ext3_debug(f, a...) \
50 do { \
51 printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \
52 __FILE__, __LINE__, __func__); \
53 printk (KERN_DEBUG f, ## a); \
54 } while (0)
55#else
56#define ext3_debug(f, a...) do {} while (0)
57#endif
58
59/*
60 * Special inode numbers
61 */
62#define EXT3_BAD_INO 1 /* Bad blocks inode */
63#define EXT3_ROOT_INO 2 /* Root inode */
64#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
65#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
66#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
67#define EXT3_JOURNAL_INO 8 /* Journal inode */
68
69/* First non-reserved inode for old ext3 filesystems */
70#define EXT3_GOOD_OLD_FIRST_INO 11
71
72/*
73 * Maximal count of links to a file
74 */
75#define EXT3_LINK_MAX 32000
76
77/*
78 * Macro-instructions used to manage several block sizes
79 */
80#define EXT3_MIN_BLOCK_SIZE 1024
81#define EXT3_MAX_BLOCK_SIZE 65536
82#define EXT3_MIN_BLOCK_LOG_SIZE 10
83#define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize)
84#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
85#define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
86#define EXT3_ADDR_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_addr_per_block_bits)
87#define EXT3_INODE_SIZE(s) (EXT3_SB(s)->s_inode_size)
88#define EXT3_FIRST_INO(s) (EXT3_SB(s)->s_first_ino)
89
90/*
91 * Macro-instructions used to manage fragments
92 */
93#define EXT3_MIN_FRAG_SIZE 1024
94#define EXT3_MAX_FRAG_SIZE 4096
95#define EXT3_MIN_FRAG_LOG_SIZE 10
96#define EXT3_FRAG_SIZE(s) (EXT3_SB(s)->s_frag_size)
97#define EXT3_FRAGS_PER_BLOCK(s) (EXT3_SB(s)->s_frags_per_block)
98
99/*
100 * Structure of a blocks group descriptor
101 */
102struct ext3_group_desc
103{
104 __le32 bg_block_bitmap; /* Blocks bitmap block */
105 __le32 bg_inode_bitmap; /* Inodes bitmap block */
106 __le32 bg_inode_table; /* Inodes table block */
107 __le16 bg_free_blocks_count; /* Free blocks count */
108 __le16 bg_free_inodes_count; /* Free inodes count */
109 __le16 bg_used_dirs_count; /* Directories count */
110 __u16 bg_pad;
111 __le32 bg_reserved[3];
112};
113
114/*
115 * Macro-instructions used to manage group descriptors
116 */
117#define EXT3_BLOCKS_PER_GROUP(s) (EXT3_SB(s)->s_blocks_per_group)
118#define EXT3_DESC_PER_BLOCK(s) (EXT3_SB(s)->s_desc_per_block)
119#define EXT3_INODES_PER_GROUP(s) (EXT3_SB(s)->s_inodes_per_group)
120#define EXT3_DESC_PER_BLOCK_BITS(s) (EXT3_SB(s)->s_desc_per_block_bits)
121
122/*
123 * Constants relative to the data blocks
124 */
125#define EXT3_NDIR_BLOCKS 12
126#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS
127#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1)
128#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1)
129#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1)
130
131/*
132 * Inode flags
133 */
134#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */
135#define EXT3_UNRM_FL 0x00000002 /* Undelete */
136#define EXT3_COMPR_FL 0x00000004 /* Compress file */
137#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */
138#define EXT3_IMMUTABLE_FL 0x00000010 /* Immutable file */
139#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */
140#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */
141#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */
142/* Reserved for compression usage... */
143#define EXT3_DIRTY_FL 0x00000100
144#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
145#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */
146#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */
147/* End compression flags --- maybe not all used */
148#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */
149#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */
150#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
151#define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */
152#define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
153#define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
154#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
155
156#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
157#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
158
159/* Flags that should be inherited by new inodes from their parent. */
160#define EXT3_FL_INHERITED (EXT3_SECRM_FL | EXT3_UNRM_FL | EXT3_COMPR_FL |\
161 EXT3_SYNC_FL | EXT3_NODUMP_FL |\
162 EXT3_NOATIME_FL | EXT3_COMPRBLK_FL |\
163 EXT3_NOCOMPR_FL | EXT3_JOURNAL_DATA_FL |\
164 EXT3_NOTAIL_FL | EXT3_DIRSYNC_FL)
165
166/* Flags that are appropriate for regular files (all but dir-specific ones). */
167#define EXT3_REG_FLMASK (~(EXT3_DIRSYNC_FL | EXT3_TOPDIR_FL))
168
169/* Flags that are appropriate for non-directories/regular files. */
170#define EXT3_OTHER_FLMASK (EXT3_NODUMP_FL | EXT3_NOATIME_FL)
171
172/* Mask out flags that are inappropriate for the given type of inode. */
173static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
174{
175 if (S_ISDIR(mode))
176 return flags;
177 else if (S_ISREG(mode))
178 return flags & EXT3_REG_FLMASK;
179 else
180 return flags & EXT3_OTHER_FLMASK;
181}
182
183/* Used to pass group descriptor data when online resize is done */
184struct ext3_new_group_input {
185 __u32 group; /* Group number for this data */
186 __u32 block_bitmap; /* Absolute block number of block bitmap */
187 __u32 inode_bitmap; /* Absolute block number of inode bitmap */
188 __u32 inode_table; /* Absolute block number of inode table start */
189 __u32 blocks_count; /* Total number of blocks in this group */
190 __u16 reserved_blocks; /* Number of reserved blocks in this group */
191 __u16 unused;
192};
193
194/* The struct ext3_new_group_input in kernel space, with free_blocks_count */
195struct ext3_new_group_data {
196 __u32 group;
197 __u32 block_bitmap;
198 __u32 inode_bitmap;
199 __u32 inode_table;
200 __u32 blocks_count;
201 __u16 reserved_blocks;
202 __u16 unused;
203 __u32 free_blocks_count;
204};
205
206
207/*
208 * ioctl commands
209 */
210#define EXT3_IOC_GETFLAGS FS_IOC_GETFLAGS
211#define EXT3_IOC_SETFLAGS FS_IOC_SETFLAGS
212#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
213#define EXT3_IOC_SETVERSION _IOW('f', 4, long)
214#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
215#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input)
216#define EXT3_IOC_GETVERSION_OLD FS_IOC_GETVERSION
217#define EXT3_IOC_SETVERSION_OLD FS_IOC_SETVERSION
218#ifdef CONFIG_JBD_DEBUG
219#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long)
220#endif
221#define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
222#define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
223
224/*
225 * ioctl commands in 32 bit emulation
226 */
227#define EXT3_IOC32_GETFLAGS FS_IOC32_GETFLAGS
228#define EXT3_IOC32_SETFLAGS FS_IOC32_SETFLAGS
229#define EXT3_IOC32_GETVERSION _IOR('f', 3, int)
230#define EXT3_IOC32_SETVERSION _IOW('f', 4, int)
231#define EXT3_IOC32_GETRSVSZ _IOR('f', 5, int)
232#define EXT3_IOC32_SETRSVSZ _IOW('f', 6, int)
233#define EXT3_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
234#ifdef CONFIG_JBD_DEBUG
235#define EXT3_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
236#endif
237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
239
240/* Number of supported quota types */
241#define EXT3_MAXQUOTAS 2
242
243/*
244 * Mount options
245 */
246struct ext3_mount_options {
247 unsigned long s_mount_opt;
248 kuid_t s_resuid;
249 kgid_t s_resgid;
250 unsigned long s_commit_interval;
251#ifdef CONFIG_QUOTA
252 int s_jquota_fmt;
253 char *s_qf_names[EXT3_MAXQUOTAS];
254#endif
255};
256
257/*
258 * Structure of an inode on the disk
259 */
260struct ext3_inode {
261 __le16 i_mode; /* File mode */
262 __le16 i_uid; /* Low 16 bits of Owner Uid */
263 __le32 i_size; /* Size in bytes */
264 __le32 i_atime; /* Access time */
265 __le32 i_ctime; /* Creation time */
266 __le32 i_mtime; /* Modification time */
267 __le32 i_dtime; /* Deletion Time */
268 __le16 i_gid; /* Low 16 bits of Group Id */
269 __le16 i_links_count; /* Links count */
270 __le32 i_blocks; /* Blocks count */
271 __le32 i_flags; /* File flags */
272 union {
273 struct {
274 __u32 l_i_reserved1;
275 } linux1;
276 struct {
277 __u32 h_i_translator;
278 } hurd1;
279 struct {
280 __u32 m_i_reserved1;
281 } masix1;
282 } osd1; /* OS dependent 1 */
283 __le32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
284 __le32 i_generation; /* File version (for NFS) */
285 __le32 i_file_acl; /* File ACL */
286 __le32 i_dir_acl; /* Directory ACL */
287 __le32 i_faddr; /* Fragment address */
288 union {
289 struct {
290 __u8 l_i_frag; /* Fragment number */
291 __u8 l_i_fsize; /* Fragment size */
292 __u16 i_pad1;
293 __le16 l_i_uid_high; /* these 2 fields */
294 __le16 l_i_gid_high; /* were reserved2[0] */
295 __u32 l_i_reserved2;
296 } linux2;
297 struct {
298 __u8 h_i_frag; /* Fragment number */
299 __u8 h_i_fsize; /* Fragment size */
300 __u16 h_i_mode_high;
301 __u16 h_i_uid_high;
302 __u16 h_i_gid_high;
303 __u32 h_i_author;
304 } hurd2;
305 struct {
306 __u8 m_i_frag; /* Fragment number */
307 __u8 m_i_fsize; /* Fragment size */
308 __u16 m_pad1;
309 __u32 m_i_reserved2[2];
310 } masix2;
311 } osd2; /* OS dependent 2 */
312 __le16 i_extra_isize;
313 __le16 i_pad1;
314};
315
316#define i_size_high i_dir_acl
317
318#define i_reserved1 osd1.linux1.l_i_reserved1
319#define i_frag osd2.linux2.l_i_frag
320#define i_fsize osd2.linux2.l_i_fsize
321#define i_uid_low i_uid
322#define i_gid_low i_gid
323#define i_uid_high osd2.linux2.l_i_uid_high
324#define i_gid_high osd2.linux2.l_i_gid_high
325#define i_reserved2 osd2.linux2.l_i_reserved2
326
327/*
328 * File system states
329 */
330#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */
331#define EXT3_ERROR_FS 0x0002 /* Errors detected */
332#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */
333
334/*
335 * Misc. filesystem flags
336 */
337#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
338#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
339#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
340
341/*
342 * Mount flags
343 */
344#define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */
345/* EXT3_MOUNT_OLDALLOC was there */
346#define EXT3_MOUNT_GRPID 0x00004 /* Create files with directory's group */
347#define EXT3_MOUNT_DEBUG 0x00008 /* Some debugging messages */
348#define EXT3_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
349#define EXT3_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
350#define EXT3_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
351#define EXT3_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
352#define EXT3_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
353#define EXT3_MOUNT_ABORT 0x00200 /* Fatal error detected */
354#define EXT3_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
355#define EXT3_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
356#define EXT3_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
357#define EXT3_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
358#define EXT3_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
359#define EXT3_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
360#define EXT3_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
361#define EXT3_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
362#define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
363#define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
364#define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */
365#define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
366#define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
367#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write
368 * error in ordered mode */
369
370/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
371#ifndef _LINUX_EXT2_FS_H
372#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt
373#define set_opt(o, opt) o |= EXT3_MOUNT_##opt
374#define test_opt(sb, opt) (EXT3_SB(sb)->s_mount_opt & \
375 EXT3_MOUNT_##opt)
376#else
377#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD
378#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT
379#define EXT2_MOUNT_DATA_FLAGS EXT3_MOUNT_DATA_FLAGS
380#endif
381
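The clear_opt/set_opt/test_opt helpers above are thin wrappers over single-word
bit arithmetic on s_mount_opt. A minimal userspace sketch of the same pattern
(the toy_sb structure and main() are illustrative assumptions, not kernel code):

#include <stdio.h>

/* Mount options live in one bit-flag word, as in ext3_sb_info. */
#define EXT3_MOUNT_GRPID   0x00004
#define EXT3_MOUNT_BARRIER 0x20000

struct toy_sb { unsigned long s_mount_opt; };

#define set_opt(o, opt)   ((o) |= EXT3_MOUNT_##opt)
#define clear_opt(o, opt) ((o) &= ~EXT3_MOUNT_##opt)
#define test_opt(sb, opt) ((sb)->s_mount_opt & EXT3_MOUNT_##opt)

int main(void)
{
	struct toy_sb sb = { 0 };

	set_opt(sb.s_mount_opt, BARRIER);	/* e.g. mount -o barrier=1 */
	if (test_opt(&sb, BARRIER))
		printf("barriers enabled\n");
	clear_opt(sb.s_mount_opt, BARRIER);	/* e.g. remount -o barrier=0 */
	return 0;
}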
382#define ext3_set_bit __set_bit_le
383#define ext3_set_bit_atomic ext2_set_bit_atomic
384#define ext3_clear_bit __clear_bit_le
385#define ext3_clear_bit_atomic ext2_clear_bit_atomic
386#define ext3_test_bit test_bit_le
387#define ext3_find_next_zero_bit find_next_zero_bit_le
388
389/*
390 * Maximal mount counts between two filesystem checks
391 */
392#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
393#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */
394
395/*
396 * Behaviour when detecting errors
397 */
398#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */
399#define EXT3_ERRORS_RO 2 /* Remount fs read-only */
400#define EXT3_ERRORS_PANIC 3 /* Panic */
401#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE
402
403/*
404 * Structure of the super block
405 */
406struct ext3_super_block {
407/*00*/ __le32 s_inodes_count; /* Inodes count */
408 __le32 s_blocks_count; /* Blocks count */
409 __le32 s_r_blocks_count; /* Reserved blocks count */
410 __le32 s_free_blocks_count; /* Free blocks count */
411/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
412 __le32 s_first_data_block; /* First Data Block */
413 __le32 s_log_block_size; /* Block size */
414 __le32 s_log_frag_size; /* Fragment size */
415/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
416 __le32 s_frags_per_group; /* # Fragments per group */
417 __le32 s_inodes_per_group; /* # Inodes per group */
418 __le32 s_mtime; /* Mount time */
419/*30*/ __le32 s_wtime; /* Write time */
420 __le16 s_mnt_count; /* Mount count */
421 __le16 s_max_mnt_count; /* Maximal mount count */
422 __le16 s_magic; /* Magic signature */
423 __le16 s_state; /* File system state */
424 __le16 s_errors; /* Behaviour when detecting errors */
425 __le16 s_minor_rev_level; /* minor revision level */
426/*40*/ __le32 s_lastcheck; /* time of last check */
427 __le32 s_checkinterval; /* max. time between checks */
428 __le32 s_creator_os; /* OS */
429 __le32 s_rev_level; /* Revision level */
430/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
431 __le16 s_def_resgid; /* Default gid for reserved blocks */
432 /*
433 * These fields are for EXT3_DYNAMIC_REV superblocks only.
434 *
435 * Note: the difference between the compatible feature set and
436 * the incompatible feature set is that if there is a bit set
437 * in the incompatible feature set that the kernel doesn't
438 * know about, it should refuse to mount the filesystem.
439 *
440 * e2fsck's requirements are more strict; if it doesn't know
441 * about a feature in either the compatible or incompatible
442 * feature set, it must abort and not try to meddle with
443 * things it doesn't understand...
444 */
445 __le32 s_first_ino; /* First non-reserved inode */
446 __le16 s_inode_size; /* size of inode structure */
447 __le16 s_block_group_nr; /* block group # of this superblock */
448 __le32 s_feature_compat; /* compatible feature set */
449/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
450 __le32 s_feature_ro_compat; /* readonly-compatible feature set */
451/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
452/*78*/ char s_volume_name[16]; /* volume name */
453/*88*/ char s_last_mounted[64]; /* directory where last mounted */
454/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
455 /*
456 * Performance hints. Directory preallocation should only
457 * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
458 */
459 __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
460 __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
461 __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
462 /*
463 * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
464 */
465/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
466/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
467 __le32 s_journal_dev; /* device number of journal file */
468 __le32 s_last_orphan; /* start of list of inodes to delete */
469 __le32 s_hash_seed[4]; /* HTREE hash seed */
470 __u8 s_def_hash_version; /* Default hash version to use */
471 __u8 s_reserved_char_pad;
472 __u16 s_reserved_word_pad;
473 __le32 s_default_mount_opts;
474 __le32 s_first_meta_bg; /* First metablock block group */
475 __le32 s_mkfs_time; /* When the filesystem was created */
476 __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
477 /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */
478/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
479 __le32 s_r_blocks_count_hi; /* Reserved blocks count */
480 __le32 s_free_blocks_count_hi; /* Free blocks count */
481 __le16 s_min_extra_isize; /* All inodes have at least # bytes */
482 __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
483 __le32 s_flags; /* Miscellaneous flags */
484 __le16 s_raid_stride; /* RAID stride */
485 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
486 __le64 s_mmp_block; /* Block for multi-mount protection */
487 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
488 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
489 __u8 s_reserved_char_pad2;
490 __le16 s_reserved_pad;
491 __u32 s_reserved[162]; /* Padding to the end of the block */
492};
493
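All on-disk superblock fields are little-endian, and the /*NN*/ markers above
are byte offsets from the start of the structure, which itself sits 1024 bytes
into the device. A hedged userspace sketch that reads the superblock from a
filesystem image and checks s_magic (offset 0x38) against the well-known
ext2/ext3 magic 0xEF53; working on an image file rather than the kernel's
buffer cache is an assumption of the sketch:

#include <stdint.h>
#include <stdio.h>

/* Returns 0 if the image carries the ext2/ext3 magic, 1 if not, -1 on I/O error. */
int check_ext3_magic(const char *image)
{
	uint8_t sb[1024];
	FILE *f = fopen(image, "rb");

	if (!f)
		return -1;
	/* The primary superblock starts at byte offset 1024. */
	if (fseek(f, 1024L, SEEK_SET) != 0 ||
	    fread(sb, 1, sizeof(sb), f) != sizeof(sb)) {
		fclose(f);
		return -1;
	}
	fclose(f);

	/* __le16 s_magic at offset 0x38: assemble it byte-wise. */
	uint16_t magic = (uint16_t)sb[0x38] | ((uint16_t)sb[0x39] << 8);
	return magic == 0xEF53 ? 0 : 1;
}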
494/* data type for block offset of block group */
495typedef int ext3_grpblk_t;
496
497/* data type for filesystem-wide blocks number */
498typedef unsigned long ext3_fsblk_t;
499
500#define E3FSBLK "%lu"
501
502struct ext3_reserve_window {
503 ext3_fsblk_t _rsv_start; /* First byte reserved */
504 ext3_fsblk_t _rsv_end; /* Last byte reserved or 0 */
505};
506
507struct ext3_reserve_window_node {
508 struct rb_node rsv_node;
509 __u32 rsv_goal_size;
510 __u32 rsv_alloc_hit;
511 struct ext3_reserve_window rsv_window;
512};
513
514struct ext3_block_alloc_info {
515 /* information about reservation window */
516 struct ext3_reserve_window_node rsv_window_node;
517 /*
518 * was i_next_alloc_block in ext3_inode_info
519 * is the logical (file-relative) number of the
520 * most-recently-allocated block in this file.
521 * We use this for detecting linearly ascending allocation requests.
522 */
523 __u32 last_alloc_logical_block;
524 /*
525 * Was i_next_alloc_goal in ext3_inode_info
526 * is the *physical* companion to i_next_alloc_block.
527 * it is the physical block number of the block which was most-recently
528 * allocated to this file. This gives us the goal (target) for the next
529 * allocation when we detect linearly ascending requests.
530 */
531 ext3_fsblk_t last_alloc_physical_block;
532};
533
534#define rsv_start rsv_window._rsv_start
535#define rsv_end rsv_window._rsv_end
536
537/*
538 * third extended file system inode data in memory
539 */
540struct ext3_inode_info {
541 __le32 i_data[15]; /* unconverted */
542 __u32 i_flags;
543#ifdef EXT3_FRAGMENTS
544 __u32 i_faddr;
545 __u8 i_frag_no;
546 __u8 i_frag_size;
547#endif
548 ext3_fsblk_t i_file_acl;
549 __u32 i_dir_acl;
550 __u32 i_dtime;
551
552 /*
553 * i_block_group is the number of the block group which contains
554 * this file's inode. Constant across the lifetime of the inode,
555 * it is used for making block allocation decisions - we try to
556 * place a file's data blocks near its inode block, and new inodes
557 * near to their parent directory's inode.
558 */
559 __u32 i_block_group;
560 unsigned long i_state_flags; /* Dynamic state flags for ext3 */
561
562 /* block reservation info */
563 struct ext3_block_alloc_info *i_block_alloc_info;
564
565 __u32 i_dir_start_lookup;
566#ifdef CONFIG_EXT3_FS_XATTR
567 /*
568 * Extended attributes can be read independently of the main file
569 * data. Taking i_mutex even when reading would cause contention
570 * between readers of EAs and writers of regular file data, so
571 * instead we synchronize on xattr_sem when reading or changing
572 * EAs.
573 */
574 struct rw_semaphore xattr_sem;
575#endif
576
577 struct list_head i_orphan; /* unlinked but open inodes */
578
579 /*
580 * i_disksize keeps track of what the inode size is ON DISK, not
581 * in memory. During truncate, i_size is set to the new size by
582 * the VFS prior to calling ext3_truncate(), but the filesystem won't
583 * set i_disksize to 0 until the truncate is actually under way.
584 *
585 * The intent is that i_disksize always represents the blocks which
586 * are used by this file. This allows recovery to restart truncate
587 * on orphans if we crash during truncate. We actually write i_disksize
588 * into the on-disk inode when writing inodes out, instead of i_size.
589 *
590 * The only time when i_disksize and i_size may be different is when
591 * a truncate is in progress. The only things which change i_disksize
592 * are ext3_get_block (growth) and ext3_truncate (shrinkth).
593 */
594 loff_t i_disksize;
595
596 /* on-disk additional length */
597 __u16 i_extra_isize;
598
599 /*
600 * truncate_mutex is for serialising ext3_truncate() against
601 * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's
602 * data tree are chopped off during truncate. We can't do that in
603 * ext3 because whenever we perform intermediate commits during
604 * truncate, the inode and all the metadata blocks *must* be in a
605 * consistent state which allows truncation of the orphans to restart
606 * during recovery. Hence we must fix the get_block-vs-truncate race
607 * by other means, so we have truncate_mutex.
608 */
609 struct mutex truncate_mutex;
610
611 /*
612 * Transactions that contain inode's metadata needed to complete
613 * fsync and fdatasync, respectively.
614 */
615 atomic_t i_sync_tid;
616 atomic_t i_datasync_tid;
617
618#ifdef CONFIG_QUOTA
619 struct dquot *i_dquot[MAXQUOTAS];
620#endif
621
622 struct inode vfs_inode;
623};
624
625/*
626 * third extended-fs super-block data in memory
627 */
628struct ext3_sb_info {
629 unsigned long s_frag_size; /* Size of a fragment in bytes */
630 unsigned long s_frags_per_block;/* Number of fragments per block */
631 unsigned long s_inodes_per_block;/* Number of inodes per block */
632 unsigned long s_frags_per_group;/* Number of fragments in a group */
633 unsigned long s_blocks_per_group;/* Number of blocks in a group */
634 unsigned long s_inodes_per_group;/* Number of inodes in a group */
635 unsigned long s_itb_per_group; /* Number of inode table blocks per group */
636 unsigned long s_gdb_count; /* Number of group descriptor blocks */
637 unsigned long s_desc_per_block; /* Number of group descriptors per block */
638 unsigned long s_groups_count; /* Number of groups in the fs */
639 unsigned long s_overhead_last; /* Last calculated overhead */
640 unsigned long s_blocks_last; /* Last seen block count */
641 struct buffer_head * s_sbh; /* Buffer containing the super block */
642 struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
643 struct buffer_head ** s_group_desc;
644 unsigned long s_mount_opt;
645 ext3_fsblk_t s_sb_block;
646 kuid_t s_resuid;
647 kgid_t s_resgid;
648 unsigned short s_mount_state;
649 unsigned short s_pad;
650 int s_addr_per_block_bits;
651 int s_desc_per_block_bits;
652 int s_inode_size;
653 int s_first_ino;
654 spinlock_t s_next_gen_lock;
655 u32 s_next_generation;
656 u32 s_hash_seed[4];
657 int s_def_hash_version;
658 int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
659 struct percpu_counter s_freeblocks_counter;
660 struct percpu_counter s_freeinodes_counter;
661 struct percpu_counter s_dirs_counter;
662 struct blockgroup_lock *s_blockgroup_lock;
663
664 /* root of the per fs reservation window tree */
665 spinlock_t s_rsv_window_lock;
666 struct rb_root s_rsv_window_root;
667 struct ext3_reserve_window_node s_rsv_window_head;
668
669 /* Journaling */
670 struct inode * s_journal_inode;
671 struct journal_s * s_journal;
672 struct list_head s_orphan;
673 struct mutex s_orphan_lock;
674 struct mutex s_resize_lock;
675 unsigned long s_commit_interval;
676 struct block_device *journal_bdev;
677#ifdef CONFIG_QUOTA
678 char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
679 int s_jquota_fmt; /* Format of quota to use */
680#endif
681};
682
683static inline spinlock_t *
684sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
685{
686 return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
687}
688
689static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
690{
691 return sb->s_fs_info;
692}
693static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
694{
695 return container_of(inode, struct ext3_inode_info, vfs_inode);
696}
697
698static inline int ext3_valid_inum(struct super_block *sb, unsigned long ino)
699{
700 return ino == EXT3_ROOT_INO ||
701 ino == EXT3_JOURNAL_INO ||
702 ino == EXT3_RESIZE_INO ||
703 (ino >= EXT3_FIRST_INO(sb) &&
704 ino <= le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count));
705}
706
707/*
708 * Inode dynamic state flags
709 */
710enum {
711 EXT3_STATE_JDATA, /* journaled data exists */
712 EXT3_STATE_NEW, /* inode is newly created */
713 EXT3_STATE_XATTR, /* has in-inode xattrs */
714 EXT3_STATE_FLUSH_ON_CLOSE, /* flush dirty pages on close */
715};
716
717static inline int ext3_test_inode_state(struct inode *inode, int bit)
718{
719 return test_bit(bit, &EXT3_I(inode)->i_state_flags);
720}
721
722static inline void ext3_set_inode_state(struct inode *inode, int bit)
723{
724 set_bit(bit, &EXT3_I(inode)->i_state_flags);
725}
726
727static inline void ext3_clear_inode_state(struct inode *inode, int bit)
728{
729 clear_bit(bit, &EXT3_I(inode)->i_state_flags);
730}
731
732#define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
733
734/*
735 * Codes for operating systems
736 */
737#define EXT3_OS_LINUX 0
738#define EXT3_OS_HURD 1
739#define EXT3_OS_MASIX 2
740#define EXT3_OS_FREEBSD 3
741#define EXT3_OS_LITES 4
742
743/*
744 * Revision levels
745 */
746#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */
747#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
748
749#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV
750#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV
751
752#define EXT3_GOOD_OLD_INODE_SIZE 128
753
754/*
755 * Feature set definitions
756 */
757
758#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \
759 ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
760#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \
761 ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
762#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \
763 ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
764#define EXT3_SET_COMPAT_FEATURE(sb,mask) \
765 EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
766#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \
767 EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
768#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \
769 EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
770#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \
771 EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
772#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
773 EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
774#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \
775 EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
776
777#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001
778#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002
779#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004
780#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008
781#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010
782#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020
783
784#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
785#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
786#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
787
788#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001
789#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002
790#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
791#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
792#define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
793
794#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
795#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
796 EXT3_FEATURE_INCOMPAT_RECOVER| \
797 EXT3_FEATURE_INCOMPAT_META_BG)
798#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
799 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
800 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
801
802/*
803 * Default values for user and/or group using reserved blocks
804 */
805#define EXT3_DEF_RESUID 0
806#define EXT3_DEF_RESGID 0
807
808/*
809 * Default mount options
810 */
811#define EXT3_DEFM_DEBUG 0x0001
812#define EXT3_DEFM_BSDGROUPS 0x0002
813#define EXT3_DEFM_XATTR_USER 0x0004
814#define EXT3_DEFM_ACL 0x0008
815#define EXT3_DEFM_UID16 0x0010
816#define EXT3_DEFM_JMODE 0x0060
817#define EXT3_DEFM_JMODE_DATA 0x0020
818#define EXT3_DEFM_JMODE_ORDERED 0x0040
819#define EXT3_DEFM_JMODE_WBACK 0x0060
820
821/*
822 * Structure of a directory entry
823 */
824#define EXT3_NAME_LEN 255
825
826struct ext3_dir_entry {
827 __le32 inode; /* Inode number */
828 __le16 rec_len; /* Directory entry length */
829 __le16 name_len; /* Name length */
830 char name[EXT3_NAME_LEN]; /* File name */
831};
832
833/*
834 * The new version of the directory entry. Since EXT3 structures are
835 * stored in Intel byte order, and the name_len field could never be
836 * bigger than 255 chars, it's safe to reclaim the extra byte for the
837 * file_type field.
838 */
839struct ext3_dir_entry_2 {
840 __le32 inode; /* Inode number */
841 __le16 rec_len; /* Directory entry length */
842 __u8 name_len; /* Name length */
843 __u8 file_type;
844 char name[EXT3_NAME_LEN]; /* File name */
845};
846
847/*
848 * Ext3 directory file types. Only the low 3 bits are used. The
849 * other bits are reserved for now.
850 */
851#define EXT3_FT_UNKNOWN 0
852#define EXT3_FT_REG_FILE 1
853#define EXT3_FT_DIR 2
854#define EXT3_FT_CHRDEV 3
855#define EXT3_FT_BLKDEV 4
856#define EXT3_FT_FIFO 5
857#define EXT3_FT_SOCK 6
858#define EXT3_FT_SYMLINK 7
859
860#define EXT3_FT_MAX 8
861
862/*
863 * EXT3_DIR_PAD defines the directory entries boundaries
864 *
865 * NOTE: It must be a multiple of 4
866 */
867#define EXT3_DIR_PAD 4
868#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
869#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
870 ~EXT3_DIR_ROUND)
871#define EXT3_MAX_REC_LEN ((1<<16)-1)
872
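EXT3_DIR_REC_LEN rounds the 8 fixed header bytes of an entry (inode, rec_len,
name_len, file_type) plus the name length up to a 4-byte boundary. A standalone
check of the arithmetic:

#include <stdio.h>

#define EXT3_DIR_PAD	4
#define EXT3_DIR_ROUND	(EXT3_DIR_PAD - 1)
#define EXT3_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT3_DIR_ROUND) & \
					 ~EXT3_DIR_ROUND)

int main(void)
{
	printf("%d\n", EXT3_DIR_REC_LEN(1));	/* (1+8+3) & ~3 = 12 */
	printf("%d\n", EXT3_DIR_REC_LEN(3));	/* "foo" also needs 12 */
	printf("%d\n", EXT3_DIR_REC_LEN(5));	/* (5+8+3) & ~3 = 16 */
	printf("%d\n", EXT3_DIR_REC_LEN(255));	/* longest name: 264 */
	return 0;
}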
873/*
874 * Tests against MAX_REC_LEN etc were put in place for 64k block
875 * sizes; if that is not possible on this arch, we can skip
876 * those tests and speed things up.
877 */
878static inline unsigned ext3_rec_len_from_disk(__le16 dlen)
879{
880 unsigned len = le16_to_cpu(dlen);
881
882#if (PAGE_CACHE_SIZE >= 65536)
883 if (len == EXT3_MAX_REC_LEN)
884 return 1 << 16;
885#endif
886 return len;
887}
888
889static inline __le16 ext3_rec_len_to_disk(unsigned len)
890{
891#if (PAGE_CACHE_SIZE >= 65536)
892 if (len == (1 << 16))
893 return cpu_to_le16(EXT3_MAX_REC_LEN);
894 else if (len > (1 << 16))
895 BUG();
896#endif
897 return cpu_to_le16(len);
898}
899
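With 64 KiB blocks a rec_len spanning a whole block would be 65536, which does
not fit in a __le16, so it is stored as EXT3_MAX_REC_LEN (0xFFFF) and mapped
back on read. A userspace sketch of the round trip, using uint16_t in place of
__le16 and ignoring byte order (an assumption of the sketch):

#include <assert.h>
#include <stdint.h>

#define EXT3_MAX_REC_LEN ((1 << 16) - 1)

static uint16_t rec_len_to_disk(unsigned int len)
{
	if (len == (1 << 16))
		return EXT3_MAX_REC_LEN;	/* 65536 encoded as 0xFFFF */
	assert(len < (1 << 16));
	return (uint16_t)len;
}

static unsigned int rec_len_from_disk(uint16_t dlen)
{
	if (dlen == EXT3_MAX_REC_LEN)
		return 1 << 16;			/* 0xFFFF decoded back */
	return dlen;
}

int main(void)
{
	assert(rec_len_from_disk(rec_len_to_disk(12)) == 12);
	assert(rec_len_from_disk(rec_len_to_disk(1 << 16)) == (1 << 16));
	return 0;
}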
900/*
901 * Hash Tree Directory indexing
902 * (c) Daniel Phillips, 2001
903 */
904
905#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
906 EXT3_FEATURE_COMPAT_DIR_INDEX) && \
907 (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
908#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
909#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
910
911/* Legal values for the dx_root hash_version field: */
912
913#define DX_HASH_LEGACY 0
914#define DX_HASH_HALF_MD4 1
915#define DX_HASH_TEA 2
916#define DX_HASH_LEGACY_UNSIGNED 3
917#define DX_HASH_HALF_MD4_UNSIGNED 4
918#define DX_HASH_TEA_UNSIGNED 5
919
920/* hash info structure used by the directory hash */
921struct dx_hash_info
922{
923 u32 hash;
924 u32 minor_hash;
925 int hash_version;
926 u32 *seed;
927};
928
929
930/* 32 and 64 bit signed EOF for dx directories */
931#define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
932#define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
933
934
935/*
936 * Control parameters used by ext3_htree_next_block
937 */
938#define HASH_NB_ALWAYS 1
939
940
941/*
942 * Describe an inode's exact location on disk and in memory
943 */
944struct ext3_iloc
945{
946 struct buffer_head *bh;
947 unsigned long offset;
948 unsigned long block_group;
949};
950
951static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc)
952{
953 return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset);
954}
955
956/*
957 * This structure is stuffed into the struct file's private_data field
958 * for directories. It is where we put information so that we can do
959 * readdir operations in hash tree order.
960 */
961struct dir_private_info {
962 struct rb_root root;
963 struct rb_node *curr_node;
964 struct fname *extra_fname;
965 loff_t last_pos;
966 __u32 curr_hash;
967 __u32 curr_minor_hash;
968 __u32 next_hash;
969};
970
971/* calculate the first block number of the group */
972static inline ext3_fsblk_t
973ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
974{
975 return group_no * (ext3_fsblk_t)EXT3_BLOCKS_PER_GROUP(sb) +
976 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block);
977}
978
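ext3_group_first_block_no() is pure arithmetic: first_block(g) = g *
EXT3_BLOCKS_PER_GROUP(sb) + s_first_data_block. As a worked example with
hypothetical geometry: a 1 KiB-block filesystem typically has s_first_data_block
= 1 and 8192 blocks per group, so group 2 starts at block 2 * 8192 + 1 = 16385;
a 4 KiB-block filesystem has s_first_data_block = 0 and 32768 blocks per group,
putting group 2 at block 65536. The geometry figures are illustrative, not
taken from this code.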
979/*
980 * Special error return code only used by dx_probe() and its callers.
981 */
982#define ERR_BAD_DX_DIR -75000
983
984/*
985 * Function prototypes
986 */
987
988/*
989 * Ok, these declarations are also in <linux/kernel.h> but none of the
990 * ext3 source programs needs to include it so they are duplicated here.
991 */
992# define NORET_TYPE /**/
993# define ATTRIB_NORET __attribute__((noreturn))
994# define NORET_AND noreturn,
995
996/* balloc.c */
997extern int ext3_bg_has_super(struct super_block *sb, int group);
998extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
999extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
1000 ext3_fsblk_t goal, int *errp);
1001extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
1002 ext3_fsblk_t goal, unsigned long *count, int *errp);
1003extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
1004 ext3_fsblk_t block, unsigned long count);
1005extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
1006 ext3_fsblk_t block, unsigned long count,
1007 unsigned long *pdquot_freed_blocks);
1008extern ext3_fsblk_t ext3_count_free_blocks (struct super_block *);
1009extern void ext3_check_blocks_bitmap (struct super_block *);
1010extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
1011 unsigned int block_group,
1012 struct buffer_head ** bh);
1013extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
1014extern void ext3_init_block_alloc_info(struct inode *);
1015extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
1016extern int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range);
1017
1018/* dir.c */
1019extern int ext3_check_dir_entry(const char *, struct inode *,
1020 struct ext3_dir_entry_2 *,
1021 struct buffer_head *, unsigned long);
1022extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
1023 __u32 minor_hash,
1024 struct ext3_dir_entry_2 *dirent);
1025extern void ext3_htree_free_dir_info(struct dir_private_info *p);
1026
1027/* fsync.c */
1028extern int ext3_sync_file(struct file *, loff_t, loff_t, int);
1029
1030/* hash.c */
1031extern int ext3fs_dirhash(const char *name, int len, struct
1032 dx_hash_info *hinfo);
1033
1034/* ialloc.c */
1035extern struct inode * ext3_new_inode (handle_t *, struct inode *,
1036 const struct qstr *, umode_t);
1037extern void ext3_free_inode (handle_t *, struct inode *);
1038extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
1039extern unsigned long ext3_count_free_inodes (struct super_block *);
1040extern unsigned long ext3_count_dirs (struct super_block *);
1041extern void ext3_check_inodes_bitmap (struct super_block *);
1042extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
1043
1044
1045/* inode.c */
1046int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
1047 struct buffer_head *bh, ext3_fsblk_t blocknr);
1048struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
1049struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
1050int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
1051 sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result,
1052 int create);
1053
1054extern struct inode *ext3_iget(struct super_block *, unsigned long);
1055extern int ext3_write_inode (struct inode *, struct writeback_control *);
1056extern int ext3_setattr (struct dentry *, struct iattr *);
1057extern void ext3_evict_inode (struct inode *);
1058extern int ext3_sync_inode (handle_t *, struct inode *);
1059extern void ext3_discard_reservation (struct inode *);
1060extern void ext3_dirty_inode(struct inode *, int);
1061extern int ext3_change_inode_journal_flag(struct inode *, int);
1062extern int ext3_get_inode_loc(struct inode *, struct ext3_iloc *);
1063extern int ext3_can_truncate(struct inode *inode);
1064extern void ext3_truncate(struct inode *inode);
1065extern void ext3_set_inode_flags(struct inode *);
1066extern void ext3_get_inode_flags(struct ext3_inode_info *);
1067extern void ext3_set_aops(struct inode *inode);
1068extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1069 u64 start, u64 len);
1070
1071/* ioctl.c */
1072extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
1073extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
1074
1075/* namei.c */
1076extern int ext3_orphan_add(handle_t *, struct inode *);
1077extern int ext3_orphan_del(handle_t *, struct inode *);
1078extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
1079 __u32 start_minor_hash, __u32 *next_hash);
1080
1081/* resize.c */
1082extern int ext3_group_add(struct super_block *sb,
1083 struct ext3_new_group_data *input);
1084extern int ext3_group_extend(struct super_block *sb,
1085 struct ext3_super_block *es,
1086 ext3_fsblk_t n_blocks_count);
1087
1088/* super.c */
1089extern __printf(3, 4)
1090void ext3_error(struct super_block *, const char *, const char *, ...);
1091extern void __ext3_std_error (struct super_block *, const char *, int);
1092extern __printf(3, 4)
1093void ext3_abort(struct super_block *, const char *, const char *, ...);
1094extern __printf(3, 4)
1095void ext3_warning(struct super_block *, const char *, const char *, ...);
1096extern __printf(3, 4)
1097void ext3_msg(struct super_block *, const char *, const char *, ...);
1098extern void ext3_update_dynamic_rev (struct super_block *sb);
1099
1100#define ext3_std_error(sb, errno) \
1101do { \
1102 if ((errno)) \
1103 __ext3_std_error((sb), __func__, (errno)); \
1104} while (0)
1105
1106/*
1107 * Inodes and files operations
1108 */
1109
1110/* dir.c */
1111extern const struct file_operations ext3_dir_operations;
1112
1113/* file.c */
1114extern const struct inode_operations ext3_file_inode_operations;
1115extern const struct file_operations ext3_file_operations;
1116
1117/* namei.c */
1118extern const struct inode_operations ext3_dir_inode_operations;
1119extern const struct inode_operations ext3_special_inode_operations;
1120
1121/* symlink.c */
1122extern const struct inode_operations ext3_symlink_inode_operations;
1123extern const struct inode_operations ext3_fast_symlink_inode_operations;
1124
1125#define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal)
1126
1127/* Define the number of blocks we need to account to a transaction to
1128 * modify one block of data.
1129 *
1130 * We may have to touch one inode, one bitmap buffer, up to three
1131 * indirection blocks, the group and superblock summaries, and the data
1132 * block to complete the transaction. */
1133
1134#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
1135
1136/* Extended attribute operations touch at most two data buffers,
1137 * two bitmap buffers, and two group summaries, in addition to the inode
1138 * and the superblock, which are already accounted for. */
1139
1140#define EXT3_XATTR_TRANS_BLOCKS 6U
1141
1142/* Define the minimum size for a transaction which modifies data. This
1143 * needs to take into account the fact that we may end up modifying two
1144 * quota files too (one for the group, one for the user quota). The
1145 * superblock only gets updated once, of course, so don't bother
1146 * counting that again for the quota updates. */
1147
1148#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \
1149 EXT3_XATTR_TRANS_BLOCKS - 2 + \
1150 EXT3_MAXQUOTAS_TRANS_BLOCKS(sb))
1151
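Plugging in the constants: with quota compiled out (or the QUOTA mount option
unset) the quota term is 0, so EXT3_DATA_TRANS_BLOCKS(sb) = 8 + 6 - 2 = 12
journal credits per modified data block. With journalled quota active and
EXT3_MAXQUOTAS at its usual value of 2 (user + group), the quota term adds
2 * 2 = 4, giving 16. EXT3_MAXQUOTAS is not defined in this hunk, so treat the
second figure as an assumption.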
1152/* Delete operations potentially hit one directory's namespace plus an
1153 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
1154 * generous. We can grow the delete transaction later if necessary. */
1155
1156#define EXT3_DELETE_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) + 64)
1157
1158/* Define an arbitrary limit for the amount of data we will anticipate
1159 * writing to any given transaction. For unbounded transactions such as
1160 * write(2) and truncate(2) we can write more than this, but we always
1161 * start off at the maximum transaction size and grow the transaction
1162 * optimistically as we go. */
1163
1164#define EXT3_MAX_TRANS_DATA 64U
1165
1166/* We break up a large truncate or write transaction once the handle's
1167 * buffer credits get this low; we then need either to extend the
1168 * transaction or to start a new one. Reserve enough space here for
1169 * inode, bitmap, superblock, group and indirection updates for at least
1170 * one block, plus two quota updates. Quota allocations are not
1171 * needed. */
1172
1173#define EXT3_RESERVE_TRANS_BLOCKS 12U
1174
1175#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
1176
1177#ifdef CONFIG_QUOTA
1178/* Amount of blocks needed for quota update - we know that the structure was
1179 * allocated so we need to update only inode+data */
1180#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
1181/* Amount of blocks needed for quota insert/delete - we do some block writes
1182 * but inode, sb and group updates are done only once */
1183#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
1184 (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
1185#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
1186 (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
1187#else
1188#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
1189#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
1190#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
1191#endif
1192#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
1193#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
1194#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
1195
1196int
1197ext3_mark_iloc_dirty(handle_t *handle,
1198 struct inode *inode,
1199 struct ext3_iloc *iloc);
1200
1201/*
1202 * On success, we end up with an outstanding reference count against
1203 * iloc->bh. This _must_ be cleaned up later.
1204 */
1205
1206int ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
1207 struct ext3_iloc *iloc);
1208
1209int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
1210
1211/*
1212 * Wrapper functions with which ext3 calls into JBD. The intent here is
1213 * to allow these to be turned into appropriate stubs so ext3 can control
1214 * ext2 filesystems, so ext2+ext3 systems only need one fs. This work hasn't
1215 * been done yet.
1216 */
1217
1218static inline void ext3_journal_release_buffer(handle_t *handle,
1219 struct buffer_head *bh)
1220{
1221 journal_release_buffer(handle, bh);
1222}
1223
1224void ext3_journal_abort_handle(const char *caller, const char *err_fn,
1225 struct buffer_head *bh, handle_t *handle, int err);
1226
1227int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
1228 struct buffer_head *bh);
1229
1230int __ext3_journal_get_write_access(const char *where, handle_t *handle,
1231 struct buffer_head *bh);
1232
1233int __ext3_journal_forget(const char *where, handle_t *handle,
1234 struct buffer_head *bh);
1235
1236int __ext3_journal_revoke(const char *where, handle_t *handle,
1237 unsigned long blocknr, struct buffer_head *bh);
1238
1239int __ext3_journal_get_create_access(const char *where,
1240 handle_t *handle, struct buffer_head *bh);
1241
1242int __ext3_journal_dirty_metadata(const char *where,
1243 handle_t *handle, struct buffer_head *bh);
1244
1245#define ext3_journal_get_undo_access(handle, bh) \
1246 __ext3_journal_get_undo_access(__func__, (handle), (bh))
1247#define ext3_journal_get_write_access(handle, bh) \
1248 __ext3_journal_get_write_access(__func__, (handle), (bh))
1249#define ext3_journal_revoke(handle, blocknr, bh) \
1250 __ext3_journal_revoke(__func__, (handle), (blocknr), (bh))
1251#define ext3_journal_get_create_access(handle, bh) \
1252 __ext3_journal_get_create_access(__func__, (handle), (bh))
1253#define ext3_journal_dirty_metadata(handle, bh) \
1254 __ext3_journal_dirty_metadata(__func__, (handle), (bh))
1255#define ext3_journal_forget(handle, bh) \
1256 __ext3_journal_forget(__func__, (handle), (bh))
1257
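The macro layer above exists only to capture the call site: each ext3_journal_*
macro expands to its __ext3_journal_* counterpart with __func__ prepended, so
an aborted handle is reported against the real caller rather than the wrapper.
A generic userspace sketch of the same pattern (__do_work/do_work are
illustrative stand-ins):

#include <stdio.h>

/* The double-underscore helper takes an explicit location string... */
static int __do_work(const char *where, int arg)
{
	if (arg < 0) {
		fprintf(stderr, "do_work failed in %s (arg=%d)\n", where, arg);
		return -1;
	}
	return 0;
}

/* ...and the macro injects __func__ at every call site. */
#define do_work(arg) __do_work(__func__, (arg))

static int caller(void)
{
	return do_work(-1);	/* logs: "do_work failed in caller (arg=-1)" */
}

int main(void)
{
	return caller() ? 1 : 0;
}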
1258int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
1259
1260handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks);
1261int __ext3_journal_stop(const char *where, handle_t *handle);
1262
1263static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
1264{
1265 return ext3_journal_start_sb(inode->i_sb, nblocks);
1266}
1267
1268#define ext3_journal_stop(handle) \
1269 __ext3_journal_stop(__func__, (handle))
1270
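Together these helpers give the usual transaction shape in ext3: reserve
credits with ext3_journal_start(), make the journalled modifications, then
release the handle with ext3_journal_stop(). A hedged sketch of that lifecycle
for a metadata-only inode update, with error handling trimmed; this illustrates
the pattern and is not a copy of any specific ext3 function:

static int example_touch_inode(struct inode *inode)
{
	handle_t *handle;
	int err;

	/* Reserve one credit: only the inode's block will be dirtied. */
	handle = ext3_journal_start(inode, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... modify in-core inode fields here ... */

	err = ext3_mark_inode_dirty(handle, inode);	/* journal the inode */
	ext3_journal_stop(handle);			/* release the handle */
	return err;
}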
1271static inline handle_t *ext3_journal_current_handle(void)
1272{
1273 return journal_current_handle();
1274}
1275
1276static inline int ext3_journal_extend(handle_t *handle, int nblocks)
1277{
1278 return journal_extend(handle, nblocks);
1279}
1280
1281static inline int ext3_journal_restart(handle_t *handle, int nblocks)
1282{
1283 return journal_restart(handle, nblocks);
1284}
1285
1286static inline int ext3_journal_blocks_per_page(struct inode *inode)
1287{
1288 return journal_blocks_per_page(inode);
1289}
1290
1291static inline int ext3_journal_force_commit(journal_t *journal)
1292{
1293 return journal_force_commit(journal);
1294}
1295
1296/* super.c */
1297int ext3_force_commit(struct super_block *sb);
1298
1299static inline int ext3_should_journal_data(struct inode *inode)
1300{
1301 if (!S_ISREG(inode->i_mode))
1302 return 1;
1303 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
1304 return 1;
1305 if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
1306 return 1;
1307 return 0;
1308}
1309
1310static inline int ext3_should_order_data(struct inode *inode)
1311{
1312 if (!S_ISREG(inode->i_mode))
1313 return 0;
1314 if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
1315 return 0;
1316 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
1317 return 1;
1318 return 0;
1319}
1320
1321static inline int ext3_should_writeback_data(struct inode *inode)
1322{
1323 if (!S_ISREG(inode->i_mode))
1324 return 0;
1325 if (EXT3_I(inode)->i_flags & EXT3_JOURNAL_DATA_FL)
1326 return 0;
1327 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
1328 return 1;
1329 return 0;
1330}
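These three predicates encode a fixed precedence: non-regular files, and any
inode carrying EXT3_JOURNAL_DATA_FL (the per-file attribute behind chattr +j),
always get full data journalling; only otherwise does the filesystem-wide
DATA_FLAGS mount setting pick ordered or writeback. Exactly one predicate
returns 1 for any given inode, so a write path can dispatch on them; a small
sketch (example_pick_data_mode is an illustrative name, not an ext3 function):

static char example_pick_data_mode(struct inode *inode)
{
	if (ext3_should_journal_data(inode))
		return 'j';	/* all data goes through the journal */
	if (ext3_should_order_data(inode))
		return 'o';	/* data flushed before the commit */
	return 'w';		/* writeback: metadata-only ordering */
}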
1331
1332#include <trace/events/ext3.h>
diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c
deleted file mode 100644
index 785a3261a26c..000000000000
--- a/fs/ext3/ext3_jbd.c
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Interface between ext3 and JBD
3 */
4
5#include "ext3.h"
6
7int __ext3_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh)
9{
10 int err = journal_get_undo_access(handle, bh);
11 if (err)
12 ext3_journal_abort_handle(where, __func__, bh, handle,err);
13 return err;
14}
15
16int __ext3_journal_get_write_access(const char *where, handle_t *handle,
17 struct buffer_head *bh)
18{
19 int err = journal_get_write_access(handle, bh);
20 if (err)
21 ext3_journal_abort_handle(where, __func__, bh, handle,err);
22 return err;
23}
24
25int __ext3_journal_forget(const char *where, handle_t *handle,
26 struct buffer_head *bh)
27{
28 int err = journal_forget(handle, bh);
29 if (err)
30 ext3_journal_abort_handle(where, __func__, bh, handle,err);
31 return err;
32}
33
34int __ext3_journal_revoke(const char *where, handle_t *handle,
35 unsigned long blocknr, struct buffer_head *bh)
36{
37 int err = journal_revoke(handle, blocknr, bh);
38 if (err)
39 ext3_journal_abort_handle(where, __func__, bh, handle,err);
40 return err;
41}
42
43int __ext3_journal_get_create_access(const char *where,
44 handle_t *handle, struct buffer_head *bh)
45{
46 int err = journal_get_create_access(handle, bh);
47 if (err)
48 ext3_journal_abort_handle(where, __func__, bh, handle,err);
49 return err;
50}
51
52int __ext3_journal_dirty_metadata(const char *where,
53 handle_t *handle, struct buffer_head *bh)
54{
55 int err = journal_dirty_metadata(handle, bh);
56 if (err)
57 ext3_journal_abort_handle(where, __func__, bh, handle,err);
58 return err;
59}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
deleted file mode 100644
index 3b8f650de22c..000000000000
--- a/fs/ext3/file.c
+++ /dev/null
@@ -1,79 +0,0 @@
1/*
2 * linux/fs/ext3/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/quotaops.h>
22#include "ext3.h"
23#include "xattr.h"
24#include "acl.h"
25
26/*
27 * Called when an inode is released. Note that this is different
28 * from ext3_file_open: open gets called at every open, but release
29 * gets called only when /all/ the files are closed.
30 */
31static int ext3_release_file (struct inode * inode, struct file * filp)
32{
33 if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) {
34 filemap_flush(inode->i_mapping);
35 ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
36 }
37 /* if we are the last writer on the inode, drop the block reservation */
38 if ((filp->f_mode & FMODE_WRITE) &&
39 (atomic_read(&inode->i_writecount) == 1))
40 {
41 mutex_lock(&EXT3_I(inode)->truncate_mutex);
42 ext3_discard_reservation(inode);
43 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
44 }
45 if (is_dx(inode) && filp->private_data)
46 ext3_htree_free_dir_info(filp->private_data);
47
48 return 0;
49}
50
51const struct file_operations ext3_file_operations = {
52 .llseek = generic_file_llseek,
53 .read_iter = generic_file_read_iter,
54 .write_iter = generic_file_write_iter,
55 .unlocked_ioctl = ext3_ioctl,
56#ifdef CONFIG_COMPAT
57 .compat_ioctl = ext3_compat_ioctl,
58#endif
59 .mmap = generic_file_mmap,
60 .open = dquot_file_open,
61 .release = ext3_release_file,
62 .fsync = ext3_sync_file,
63 .splice_read = generic_file_splice_read,
64 .splice_write = iter_file_splice_write,
65};
66
67const struct inode_operations ext3_file_inode_operations = {
68 .setattr = ext3_setattr,
69#ifdef CONFIG_EXT3_FS_XATTR
70 .setxattr = generic_setxattr,
71 .getxattr = generic_getxattr,
72 .listxattr = ext3_listxattr,
73 .removexattr = generic_removexattr,
74#endif
75 .get_acl = ext3_get_acl,
76 .set_acl = ext3_set_acl,
77 .fiemap = ext3_fiemap,
78};
79
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
deleted file mode 100644
index 1cb9c7e10c6f..000000000000
--- a/fs/ext3/fsync.c
+++ /dev/null
@@ -1,109 +0,0 @@
1/*
2 * linux/fs/ext3/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext3fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/blkdev.h>
26#include <linux/writeback.h>
27#include "ext3.h"
28
29/*
30 * akpm: A new design for ext3_sync_file().
31 *
32 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
33 * There cannot be a transaction open by this task.
34 * Another task could have dirtied this inode. Its data can be in any
35 * state in the journalling system.
36 *
37 * What we do is just kick off a commit and wait on it. This will snapshot the
38 * inode to disk.
39 */
40
41int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
42{
43 struct inode *inode = file->f_mapping->host;
44 struct ext3_inode_info *ei = EXT3_I(inode);
45 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
46 int ret, needs_barrier = 0;
47 tid_t commit_tid;
48
49 trace_ext3_sync_file_enter(file, datasync);
50
51 if (inode->i_sb->s_flags & MS_RDONLY) {
52 /* Make sure that we read updated state */
53 smp_rmb();
54 if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
55 return -EROFS;
56 return 0;
57 }
58 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
59 if (ret)
60 goto out;
61
62 J_ASSERT(ext3_journal_current_handle() == NULL);
63
64 /*
65 * data=writeback,ordered:
66 * The caller's filemap_fdatawrite()/wait will sync the data.
67 * Metadata is in the journal, we wait for a proper transaction
68 * to commit here.
69 *
70 * data=journal:
71 * filemap_fdatawrite won't do anything (the buffers are clean).
72 * ext3_force_commit will write the file data into the journal and
73 * will wait on that.
74 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
75 * (they were dirtied by commit). But that's OK - the blocks are
76 * safe in-journal, which is all fsync() needs to ensure.
77 */
78 if (ext3_should_journal_data(inode)) {
79 ret = ext3_force_commit(inode->i_sb);
80 goto out;
81 }
82
83 if (datasync)
84 commit_tid = atomic_read(&ei->i_datasync_tid);
85 else
86 commit_tid = atomic_read(&ei->i_sync_tid);
87
88 if (test_opt(inode->i_sb, BARRIER) &&
89 !journal_trans_will_send_data_barrier(journal, commit_tid))
90 needs_barrier = 1;
91 log_start_commit(journal, commit_tid);
92 ret = log_wait_commit(journal, commit_tid);
93
94 /*
95 * In case we didn't commit a transaction, we have to flush
96 * disk caches manually so that data really is on persistent
97 * storage
98 */
99 if (needs_barrier) {
100 int err;
101
102 err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
103 if (!ret)
104 ret = err;
105 }
106out:
107 trace_ext3_sync_file_exit(inode, ret);
108 return ret;
109}
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
deleted file mode 100644
index ede315cdf126..000000000000
--- a/fs/ext3/hash.c
+++ /dev/null
@@ -1,206 +0,0 @@
1/*
2 * linux/fs/ext3/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include "ext3.h"
13#include <linux/cryptohash.h>
14
15#define DELTA 0x9E3779B9
16
17static void TEA_transform(__u32 buf[4], __u32 const in[])
18{
19 __u32 sum = 0;
20 __u32 b0 = buf[0], b1 = buf[1];
21 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
22 int n = 16;
23
24 do {
25 sum += DELTA;
26 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
27 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
28 } while(--n);
29
30 buf[0] += b0;
31 buf[1] += b1;
32}
33
34
35/* The old legacy hash */
36static __u32 dx_hack_hash_unsigned(const char *name, int len)
37{
38 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
39 const unsigned char *ucp = (const unsigned char *) name;
40
41 while (len--) {
42 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
43
44 if (hash & 0x80000000)
45 hash -= 0x7fffffff;
46 hash1 = hash0;
47 hash0 = hash;
48 }
49 return hash0 << 1;
50}
51
52static __u32 dx_hack_hash_signed(const char *name, int len)
53{
54 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
55 const signed char *scp = (const signed char *) name;
56
57 while (len--) {
58 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
59
60 if (hash & 0x80000000)
61 hash -= 0x7fffffff;
62 hash1 = hash0;
63 hash0 = hash;
64 }
65 return hash0 << 1;
66}
67
68static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
69{
70 __u32 pad, val;
71 int i;
72 const signed char *scp = (const signed char *) msg;
73
74 pad = (__u32)len | ((__u32)len << 8);
75 pad |= pad << 16;
76
77 val = pad;
78 if (len > num*4)
79 len = num * 4;
80 for (i = 0; i < len; i++) {
81 if ((i % 4) == 0)
82 val = pad;
83 val = ((int) scp[i]) + (val << 8);
84 if ((i % 4) == 3) {
85 *buf++ = val;
86 val = pad;
87 num--;
88 }
89 }
90 if (--num >= 0)
91 *buf++ = val;
92 while (--num >= 0)
93 *buf++ = pad;
94}
95
96static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
97{
98 __u32 pad, val;
99 int i;
100 const unsigned char *ucp = (const unsigned char *) msg;
101
102 pad = (__u32)len | ((__u32)len << 8);
103 pad |= pad << 16;
104
105 val = pad;
106 if (len > num*4)
107 len = num * 4;
108 for (i=0; i < len; i++) {
109 if ((i % 4) == 0)
110 val = pad;
111 val = ((int) ucp[i]) + (val << 8);
112 if ((i % 4) == 3) {
113 *buf++ = val;
114 val = pad;
115 num--;
116 }
117 }
118 if (--num >= 0)
119 *buf++ = val;
120 while (--num >= 0)
121 *buf++ = pad;
122}
123
124/*
125 * Returns the hash of a filename. If len is 0 and name is NULL, then
126 * this function can be used to test whether or not a hash version is
127 * supported.
128 *
129 * The seed is a 4-longword (32 bits each) "secret" which can be used to
130 * uniquify a hash. If the seed is all zeros, then some default seed
131 * may be used.
132 *
133 * A particular hash version specifies whether or not the seed is
134 * represented, and whether or not the returned hash is 32 bits or 64
135 * bits. 32 bit hashes will return 0 for the minor hash.
136 */
137int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
138{
139 __u32 hash;
140 __u32 minor_hash = 0;
141 const char *p;
142 int i;
143 __u32 in[8], buf[4];
144 void (*str2hashbuf)(const char *, int, __u32 *, int) =
145 str2hashbuf_signed;
146
147 /* Initialize the default seed for the hash checksum functions */
148 buf[0] = 0x67452301;
149 buf[1] = 0xefcdab89;
150 buf[2] = 0x98badcfe;
151 buf[3] = 0x10325476;
152
153 /* Check to see if the seed is all zeros */
154 if (hinfo->seed) {
155 for (i=0; i < 4; i++) {
156 if (hinfo->seed[i])
157 break;
158 }
159 if (i < 4)
160 memcpy(buf, hinfo->seed, sizeof(buf));
161 }
162
163 switch (hinfo->hash_version) {
164 case DX_HASH_LEGACY_UNSIGNED:
165 hash = dx_hack_hash_unsigned(name, len);
166 break;
167 case DX_HASH_LEGACY:
168 hash = dx_hack_hash_signed(name, len);
169 break;
170 case DX_HASH_HALF_MD4_UNSIGNED:
171 str2hashbuf = str2hashbuf_unsigned; /* fall through */
172 case DX_HASH_HALF_MD4:
173 p = name;
174 while (len > 0) {
175 (*str2hashbuf)(p, len, in, 8);
176 half_md4_transform(buf, in);
177 len -= 32;
178 p += 32;
179 }
180 minor_hash = buf[2];
181 hash = buf[1];
182 break;
183 case DX_HASH_TEA_UNSIGNED:
184 str2hashbuf = str2hashbuf_unsigned; /* fall through */
185 case DX_HASH_TEA:
186 p = name;
187 while (len > 0) {
188 (*str2hashbuf)(p, len, in, 4);
189 TEA_transform(buf, in);
190 len -= 16;
191 p += 16;
192 }
193 hash = buf[0];
194 minor_hash = buf[1];
195 break;
196 default:
197 hinfo->hash = 0;
198 return -1;
199 }
200 hash = hash & ~1;
201 if (hash == (EXT3_HTREE_EOF_32BIT << 1))
202 hash = (EXT3_HTREE_EOF_32BIT - 1) << 1;
203 hinfo->hash = hash;
204 hinfo->minor_hash = minor_hash;
205 return 0;
206}
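A hedged sketch of how a caller drives ext3fs_dirhash(): fill a dx_hash_info
with the hash version and HTREE seed from the in-memory superblock, then read
the result out of hash/minor_hash. This mirrors what the htree lookup path does
without being a verbatim copy (example_name_hash is an illustrative name):

static int example_name_hash(struct super_block *sb, const char *name,
			     int len, __u32 *hash, __u32 *minor)
{
	struct dx_hash_info hinfo;

	hinfo.hash_version = EXT3_SB(sb)->s_def_hash_version; /* e.g. DX_HASH_TEA */
	hinfo.seed = EXT3_SB(sb)->s_hash_seed;	/* per-filesystem HTREE seed */
	if (ext3fs_dirhash(name, len, &hinfo) < 0)
		return -1;			/* unsupported hash version */
	*hash = hinfo.hash;
	*minor = hinfo.minor_hash;
	return 0;
}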
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
deleted file mode 100644
index 3ad242e5840e..000000000000
--- a/fs/ext3/ialloc.c
+++ /dev/null
@@ -1,706 +0,0 @@
1/*
2 * linux/fs/ext3/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/quotaops.h>
16#include <linux/random.h>
17
18#include "ext3.h"
19#include "xattr.h"
20#include "acl.h"
21
22/*
23 * ialloc.c contains the inode allocation and deallocation routines
24 */
25
26/*
27 * The free inodes are managed by bitmaps. A file system contains several
28 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
29 * block for inodes, N blocks for the inode table and data blocks.
30 *
31 * The file system contains group descriptors which are located after the
32 * super block. Each descriptor contains the number of the bitmap block and
33 * the free blocks count in the block.
34 */
35
36
37/*
38 * Read the inode allocation bitmap for a given block_group from disk
39 * (via the buffer cache).
40 *
41 * Return buffer_head of bitmap on success or NULL.
42 */
43static struct buffer_head *
44read_inode_bitmap(struct super_block * sb, unsigned long block_group)
45{
46 struct ext3_group_desc *desc;
47 struct buffer_head *bh = NULL;
48
49 desc = ext3_get_group_desc(sb, block_group, NULL);
50 if (!desc)
51 goto error_out;
52
53 bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
54 if (!bh)
55 ext3_error(sb, "read_inode_bitmap",
56 "Cannot read inode bitmap - "
57 "block_group = %lu, inode_bitmap = %u",
58 block_group, le32_to_cpu(desc->bg_inode_bitmap));
59error_out:
60 return bh;
61}
62
63/*
64 * NOTE! When we get the inode, we're the only people
65 * that have access to it, and as such there are no
66 * race conditions we have to worry about. The inode
67 * is not on the hash-lists, and it cannot be reached
68 * through the filesystem because the directory entry
69 * has been deleted earlier.
70 *
71 * HOWEVER: we must make sure that we get no aliases,
72 * which means that we have to call "clear_inode()"
73 * _before_ we mark the inode not in use in the inode
74 * bitmaps. Otherwise a newly created file might use
75 * the same inode number (not actually the same pointer
76 * though), and then we'd have two inodes sharing the
77 * same inode number and space on the hard disk.
78 */
79void ext3_free_inode (handle_t *handle, struct inode * inode)
80{
81 struct super_block * sb = inode->i_sb;
82 int is_directory;
83 unsigned long ino;
84 struct buffer_head *bitmap_bh = NULL;
85 struct buffer_head *bh2;
86 unsigned long block_group;
87 unsigned long bit;
88 struct ext3_group_desc * gdp;
89 struct ext3_super_block * es;
90 struct ext3_sb_info *sbi;
91 int fatal = 0, err;
92
93 if (atomic_read(&inode->i_count) > 1) {
94 printk ("ext3_free_inode: inode has count=%d\n",
95 atomic_read(&inode->i_count));
96 return;
97 }
98 if (inode->i_nlink) {
99 printk ("ext3_free_inode: inode has nlink=%d\n",
100 inode->i_nlink);
101 return;
102 }
103 if (!sb) {
104 printk("ext3_free_inode: inode on nonexistent device\n");
105 return;
106 }
107 sbi = EXT3_SB(sb);
108
109 ino = inode->i_ino;
110 ext3_debug ("freeing inode %lu\n", ino);
111 trace_ext3_free_inode(inode);
112
113 is_directory = S_ISDIR(inode->i_mode);
114
115 es = EXT3_SB(sb)->s_es;
116 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
117 ext3_error (sb, "ext3_free_inode",
118 "reserved or nonexistent inode %lu", ino);
119 goto error_return;
120 }
121 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
122 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
123 bitmap_bh = read_inode_bitmap(sb, block_group);
124 if (!bitmap_bh)
125 goto error_return;
126
127 BUFFER_TRACE(bitmap_bh, "get_write_access");
128 fatal = ext3_journal_get_write_access(handle, bitmap_bh);
129 if (fatal)
130 goto error_return;
131
132 /* Ok, now we can actually update the inode bitmaps.. */
133 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
134 bit, bitmap_bh->b_data))
135 ext3_error (sb, "ext3_free_inode",
136 "bit already cleared for inode %lu", ino);
137 else {
138 gdp = ext3_get_group_desc (sb, block_group, &bh2);
139
140 BUFFER_TRACE(bh2, "get_write_access");
141 fatal = ext3_journal_get_write_access(handle, bh2);
142 if (fatal) goto error_return;
143
144 if (gdp) {
145 spin_lock(sb_bgl_lock(sbi, block_group));
146 le16_add_cpu(&gdp->bg_free_inodes_count, 1);
147 if (is_directory)
148 le16_add_cpu(&gdp->bg_used_dirs_count, -1);
149 spin_unlock(sb_bgl_lock(sbi, block_group));
150 percpu_counter_inc(&sbi->s_freeinodes_counter);
151 if (is_directory)
152 percpu_counter_dec(&sbi->s_dirs_counter);
153
154 }
155 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
156 err = ext3_journal_dirty_metadata(handle, bh2);
157 if (!fatal) fatal = err;
158 }
159 BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
160 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
161 if (!fatal)
162 fatal = err;
163
164error_return:
165 brelse(bitmap_bh);
166 ext3_std_error(sb, fatal);
167}
168
169/*
170 * Orlov's allocator for directories.
171 *
172 * We always try to spread first-level directories.
173 *
174 * If there are blockgroups with both free inode and free block counts
175 * not worse than average, we return the one with the smallest directory count.
176 * Otherwise we simply return a random group.
177 *
178 * For the rest, the rules look like this:
179 *
180 * It's OK to put a directory into a group unless
181 * it has too many directories already (max_dirs) or
182 * it has too few free inodes left (min_inodes) or
183 * it has too few free blocks left (min_blocks).
184 * Parent's group is preferred; if it doesn't satisfy these
185 * conditions we search cyclically through the rest. If none
186 * of the groups look good we just look for a group with more
187 * free inodes than average (starting at parent's group).
188 *
189 * Debt is incremented each time we allocate a directory and decremented
190 * when we allocate an inode, within 0--255.
191 */
192
193static int find_group_orlov(struct super_block *sb, struct inode *parent)
194{
195 int parent_group = EXT3_I(parent)->i_block_group;
196 struct ext3_sb_info *sbi = EXT3_SB(sb);
197 int ngroups = sbi->s_groups_count;
198 int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
199 unsigned int freei, avefreei;
200 ext3_fsblk_t freeb, avefreeb;
201 unsigned int ndirs;
202 int max_dirs, min_inodes;
203 ext3_grpblk_t min_blocks;
204 int group = -1, i;
205 struct ext3_group_desc *desc;
206
207 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
208 avefreei = freei / ngroups;
209 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
210 avefreeb = freeb / ngroups;
211 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
212
213 if ((parent == d_inode(sb->s_root)) ||
214 (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
215 int best_ndir = inodes_per_group;
216 int best_group = -1;
217
218 group = prandom_u32();
219 parent_group = (unsigned)group % ngroups;
220 for (i = 0; i < ngroups; i++) {
221 group = (parent_group + i) % ngroups;
222 desc = ext3_get_group_desc (sb, group, NULL);
223 if (!desc || !desc->bg_free_inodes_count)
224 continue;
225 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
226 continue;
227 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
228 continue;
229 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
230 continue;
231 best_group = group;
232 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
233 }
234 if (best_group >= 0)
235 return best_group;
236 goto fallback;
237 }
238
239 max_dirs = ndirs / ngroups + inodes_per_group / 16;
240 min_inodes = avefreei - inodes_per_group / 4;
241 min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
242
243 for (i = 0; i < ngroups; i++) {
244 group = (parent_group + i) % ngroups;
245 desc = ext3_get_group_desc (sb, group, NULL);
246 if (!desc || !desc->bg_free_inodes_count)
247 continue;
248 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
249 continue;
250 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
251 continue;
252 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
253 continue;
254 return group;
255 }
256
257fallback:
258 for (i = 0; i < ngroups; i++) {
259 group = (parent_group + i) % ngroups;
260 desc = ext3_get_group_desc (sb, group, NULL);
261 if (!desc || !desc->bg_free_inodes_count)
262 continue;
263 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
264 return group;
265 }
266
267 if (avefreei) {
268 /*
269 * The free-inodes counter is approximate, and for really small
270 * filesystems the above test can fail to find any blockgroups.
271 */
272 avefreei = 0;
273 goto fallback;
274 }
275
276 return -1;
277}
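
To make the threshold arithmetic above concrete, here is a throwaway user-space sketch (not driver code) that recomputes max_dirs, min_inodes and min_blocks; every filesystem parameter in it is invented for illustration:

#include <stdio.h>

int main(void)
{
	unsigned int ngroups = 64;              /* hypothetical */
	unsigned int inodes_per_group = 8192;   /* hypothetical */
	unsigned int blocks_per_group = 32768;  /* hypothetical */
	unsigned int freei = 262144, ndirs = 4096;
	unsigned long long freeb = 1048576;

	unsigned int avefreei = freei / ngroups;        /* 4096 */
	unsigned long long avefreeb = freeb / ngroups;  /* 16384 */

	int max_dirs = ndirs / ngroups + inodes_per_group / 16; /* 64 + 512 */
	int min_inodes = avefreei - inodes_per_group / 4;       /* 4096 - 2048 */
	long long min_blocks = avefreeb - blocks_per_group / 4; /* 16384 - 8192 */

	printf("max_dirs=%d min_inodes=%d min_blocks=%lld\n",
	       max_dirs, min_inodes, min_blocks);
	return 0;
}

A group is acceptable for a non-top-level directory only if it stays below max_dirs and above both minima.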
278
279static int find_group_other(struct super_block *sb, struct inode *parent)
280{
281 int parent_group = EXT3_I(parent)->i_block_group;
282 int ngroups = EXT3_SB(sb)->s_groups_count;
283 struct ext3_group_desc *desc;
284 int group, i;
285
286 /*
287 * Try to place the inode in its parent directory
288 */
289 group = parent_group;
290 desc = ext3_get_group_desc (sb, group, NULL);
291 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
292 le16_to_cpu(desc->bg_free_blocks_count))
293 return group;
294
295 /*
296 * We're going to place this inode in a different blockgroup from its
297 * parent. We want to cause files in a common directory to all land in
298 * the same blockgroup. But we want files which are in a different
299 * directory which shares a blockgroup with our parent to land in a
300 * different blockgroup.
301 *
302 * So add our directory's i_ino into the starting point for the hash.
303 */
304 group = (group + parent->i_ino) % ngroups;
305
306 /*
307 * Use a quadratic hash to find a group with a free inode and some free
308 * blocks.
309 */
310 for (i = 1; i < ngroups; i <<= 1) {
311 group += i;
312 if (group >= ngroups)
313 group -= ngroups;
314 desc = ext3_get_group_desc (sb, group, NULL);
315 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
316 le16_to_cpu(desc->bg_free_blocks_count))
317 return group;
318 }
319
320 /*
321 * That failed: try linear search for a free inode, even if that group
322 * has no free blocks.
323 */
324 group = parent_group;
325 for (i = 0; i < ngroups; i++) {
326 if (++group >= ngroups)
327 group = 0;
328 desc = ext3_get_group_desc (sb, group, NULL);
329 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
330 return group;
331 }
332
333 return -1;
334}
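
The quadratic hash in the middle loop visits offsets 1, 3, 7, 15, ... (2^k - 1) from the hashed starting group. A minimal user-space sketch replaying just that probe sequence, with invented parameters:

#include <stdio.h>

int main(void)
{
	int ngroups = 128, start = 40;  /* hypothetical values */
	int group = start, i;

	for (i = 1; i < ngroups; i <<= 1) {
		group += i;
		if (group >= ngroups)
			group -= ngroups;
		/* offsets from start: 1, 3, 7, 15, 31, 63, 127 */
		printf("probe group %d (offset %d)\n",
		       group, (group - start + ngroups) % ngroups);
	}
	return 0;
}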
335
336/*
337 * There are two policies for allocating an inode. If the new inode is
338 * a directory, then a forward search is made for a block group with both
339 * free space and a low directory-to-inode ratio; if that fails, then of
340 * the groups with above-average free space, that group with the fewest
341 * directories already is chosen.
342 *
343 * For other inodes, search forward from the parent directory's block
344 * group to find a free inode.
345 */
346struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
347 const struct qstr *qstr, umode_t mode)
348{
349 struct super_block *sb;
350 struct buffer_head *bitmap_bh = NULL;
351 struct buffer_head *bh2;
352 int group;
353 unsigned long ino = 0;
354 struct inode * inode;
355 struct ext3_group_desc * gdp = NULL;
356 struct ext3_super_block * es;
357 struct ext3_inode_info *ei;
358 struct ext3_sb_info *sbi;
359 int err = 0;
360 struct inode *ret;
361 int i;
362
363 /* Cannot create files in a deleted directory */
364 if (!dir || !dir->i_nlink)
365 return ERR_PTR(-EPERM);
366
367 sb = dir->i_sb;
368 trace_ext3_request_inode(dir, mode);
369 inode = new_inode(sb);
370 if (!inode)
371 return ERR_PTR(-ENOMEM);
372 ei = EXT3_I(inode);
373
374 sbi = EXT3_SB(sb);
375 es = sbi->s_es;
376 if (S_ISDIR(mode))
377 group = find_group_orlov(sb, dir);
378 else
379 group = find_group_other(sb, dir);
380
381 err = -ENOSPC;
382 if (group == -1)
383 goto out;
384
385 for (i = 0; i < sbi->s_groups_count; i++) {
386 err = -EIO;
387
388 gdp = ext3_get_group_desc(sb, group, &bh2);
389 if (!gdp)
390 goto fail;
391
392 brelse(bitmap_bh);
393 bitmap_bh = read_inode_bitmap(sb, group);
394 if (!bitmap_bh)
395 goto fail;
396
397 ino = 0;
398
399repeat_in_this_group:
400 ino = ext3_find_next_zero_bit((unsigned long *)
401 bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
402 if (ino < EXT3_INODES_PER_GROUP(sb)) {
403
404 BUFFER_TRACE(bitmap_bh, "get_write_access");
405 err = ext3_journal_get_write_access(handle, bitmap_bh);
406 if (err)
407 goto fail;
408
409 if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
410 ino, bitmap_bh->b_data)) {
411 /* we won it */
412 BUFFER_TRACE(bitmap_bh,
413 "call ext3_journal_dirty_metadata");
414 err = ext3_journal_dirty_metadata(handle,
415 bitmap_bh);
416 if (err)
417 goto fail;
418 goto got;
419 }
420 /* we lost it */
421 journal_release_buffer(handle, bitmap_bh);
422
423 if (++ino < EXT3_INODES_PER_GROUP(sb))
424 goto repeat_in_this_group;
425 }
426
427 /*
428 * This case is possible in a concurrent environment. It is very
429 * rare. We cannot repeat the find_group_xxx() call, since it
430 * would simply return the same blockgroup: the group descriptor
431 * metadata has not yet been updated. So we just move on to the
432 * next blockgroup.
433 */
434 if (++group == sbi->s_groups_count)
435 group = 0;
436 }
437 err = -ENOSPC;
438 goto out;
439
440got:
441 ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
442 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
443 ext3_error (sb, "ext3_new_inode",
444 "reserved inode or inode > inodes count - "
445 "block_group = %d, inode=%lu", group, ino);
446 err = -EIO;
447 goto fail;
448 }
449
450 BUFFER_TRACE(bh2, "get_write_access");
451 err = ext3_journal_get_write_access(handle, bh2);
452 if (err) goto fail;
453 spin_lock(sb_bgl_lock(sbi, group));
454 le16_add_cpu(&gdp->bg_free_inodes_count, -1);
455 if (S_ISDIR(mode)) {
456 le16_add_cpu(&gdp->bg_used_dirs_count, 1);
457 }
458 spin_unlock(sb_bgl_lock(sbi, group));
459 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
460 err = ext3_journal_dirty_metadata(handle, bh2);
461 if (err) goto fail;
462
463 percpu_counter_dec(&sbi->s_freeinodes_counter);
464 if (S_ISDIR(mode))
465 percpu_counter_inc(&sbi->s_dirs_counter);
466
467
468 if (test_opt(sb, GRPID)) {
469 inode->i_mode = mode;
470 inode->i_uid = current_fsuid();
471 inode->i_gid = dir->i_gid;
472 } else
473 inode_init_owner(inode, dir, mode);
474
475 inode->i_ino = ino;
476 /* This is the optimal IO size (for stat), not the fs block size */
477 inode->i_blocks = 0;
478 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
479
480 memset(ei->i_data, 0, sizeof(ei->i_data));
481 ei->i_dir_start_lookup = 0;
482 ei->i_disksize = 0;
483
484 ei->i_flags =
485 ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
486#ifdef EXT3_FRAGMENTS
487 ei->i_faddr = 0;
488 ei->i_frag_no = 0;
489 ei->i_frag_size = 0;
490#endif
491 ei->i_file_acl = 0;
492 ei->i_dir_acl = 0;
493 ei->i_dtime = 0;
494 ei->i_block_alloc_info = NULL;
495 ei->i_block_group = group;
496
497 ext3_set_inode_flags(inode);
498 if (IS_DIRSYNC(inode))
499 handle->h_sync = 1;
500 if (insert_inode_locked(inode) < 0) {
501 /*
502 * Likely a bitmap corruption caused the inode to be allocated
503 * twice.
504 */
505 err = -EIO;
506 goto fail;
507 }
508 spin_lock(&sbi->s_next_gen_lock);
509 inode->i_generation = sbi->s_next_generation++;
510 spin_unlock(&sbi->s_next_gen_lock);
511
512 ei->i_state_flags = 0;
513 ext3_set_inode_state(inode, EXT3_STATE_NEW);
514
515 /* See comment in ext3_iget for explanation */
516 if (ino >= EXT3_FIRST_INO(sb) + 1 &&
517 EXT3_INODE_SIZE(sb) > EXT3_GOOD_OLD_INODE_SIZE) {
518 ei->i_extra_isize =
519 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE;
520 } else {
521 ei->i_extra_isize = 0;
522 }
523
524 ret = inode;
525 dquot_initialize(inode);
526 err = dquot_alloc_inode(inode);
527 if (err)
528 goto fail_drop;
529
530 err = ext3_init_acl(handle, inode, dir);
531 if (err)
532 goto fail_free_drop;
533
534 err = ext3_init_security(handle, inode, dir, qstr);
535 if (err)
536 goto fail_free_drop;
537
538 err = ext3_mark_inode_dirty(handle, inode);
539 if (err) {
540 ext3_std_error(sb, err);
541 goto fail_free_drop;
542 }
543
544 ext3_debug("allocating inode %lu\n", inode->i_ino);
545 trace_ext3_allocate_inode(inode, dir, mode);
546 goto really_out;
547fail:
548 ext3_std_error(sb, err);
549out:
550 iput(inode);
551 ret = ERR_PTR(err);
552really_out:
553 brelse(bitmap_bh);
554 return ret;
555
556fail_free_drop:
557 dquot_free_inode(inode);
558
559fail_drop:
560 dquot_drop(inode);
561 inode->i_flags |= S_NOQUOTA;
562 clear_nlink(inode);
563 unlock_new_inode(inode);
564 iput(inode);
565 brelse(bitmap_bh);
566 return ERR_PTR(err);
567}
568
569/* Verify that we are loading a valid orphan from disk */
570struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
571{
572 unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
573 unsigned long block_group;
574 int bit;
575 struct buffer_head *bitmap_bh;
576 struct inode *inode = NULL;
577 long err = -EIO;
578
579 /* Error cases - e2fsck has already cleaned up for us */
580 if (ino > max_ino) {
581 ext3_warning(sb, __func__,
582 "bad orphan ino %lu! e2fsck was run?", ino);
583 goto error;
584 }
585
586 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
587 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
588 bitmap_bh = read_inode_bitmap(sb, block_group);
589 if (!bitmap_bh) {
590 ext3_warning(sb, __func__,
591 "inode bitmap error for orphan %lu", ino);
592 goto error;
593 }
594
595 /* Having the inode bit set should be a 100% indicator that this
596 * is a valid orphan (no e2fsck run on fs). Orphans also include
597 * inodes that were being truncated, so we can't check i_nlink==0.
598 */
599 if (!ext3_test_bit(bit, bitmap_bh->b_data))
600 goto bad_orphan;
601
602 inode = ext3_iget(sb, ino);
603 if (IS_ERR(inode))
604 goto iget_failed;
605
606 /*
607 * If the orphan has i_nlink > 0 then it must be truncatable;
608 * otherwise it won't be removed from the orphan list during
609 * processing and an infinite loop will result.
610 */
611 if (inode->i_nlink && !ext3_can_truncate(inode))
612 goto bad_orphan;
613
614 if (NEXT_ORPHAN(inode) > max_ino)
615 goto bad_orphan;
616 brelse(bitmap_bh);
617 return inode;
618
619iget_failed:
620 err = PTR_ERR(inode);
621 inode = NULL;
622bad_orphan:
623 ext3_warning(sb, __func__,
624 "bad orphan inode %lu! e2fsck was run?", ino);
625 printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
626 bit, (unsigned long long)bitmap_bh->b_blocknr,
627 ext3_test_bit(bit, bitmap_bh->b_data));
628 printk(KERN_NOTICE "inode=%p\n", inode);
629 if (inode) {
630 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
631 is_bad_inode(inode));
632 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
633 NEXT_ORPHAN(inode));
634 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
635 printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
636 /* Avoid freeing blocks if we got a bad deleted inode */
637 if (inode->i_nlink == 0)
638 inode->i_blocks = 0;
639 iput(inode);
640 }
641 brelse(bitmap_bh);
642error:
643 return ERR_PTR(err);
644}
645
646unsigned long ext3_count_free_inodes (struct super_block * sb)
647{
648 unsigned long desc_count;
649 struct ext3_group_desc *gdp;
650 int i;
651#ifdef EXT3FS_DEBUG
652 struct ext3_super_block *es;
653 unsigned long bitmap_count, x;
654 struct buffer_head *bitmap_bh = NULL;
655
656 es = EXT3_SB(sb)->s_es;
657 desc_count = 0;
658 bitmap_count = 0;
659 gdp = NULL;
660 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
661 gdp = ext3_get_group_desc (sb, i, NULL);
662 if (!gdp)
663 continue;
664 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
665 brelse(bitmap_bh);
666 bitmap_bh = read_inode_bitmap(sb, i);
667 if (!bitmap_bh)
668 continue;
669
670 x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8);
671 printk("group %d: stored = %d, counted = %lu\n",
672 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
673 bitmap_count += x;
674 }
675 brelse(bitmap_bh);
676 printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
677 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
678 return desc_count;
679#else
680 desc_count = 0;
681 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
682 gdp = ext3_get_group_desc (sb, i, NULL);
683 if (!gdp)
684 continue;
685 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
686 cond_resched();
687 }
688 return desc_count;
689#endif
690}
691
692/* Called at mount-time, super-block is locked */
693unsigned long ext3_count_dirs (struct super_block * sb)
694{
695 unsigned long count = 0;
696 int i;
697
698 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
699 struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL);
700 if (!gdp)
701 continue;
702 count += le16_to_cpu(gdp->bg_used_dirs_count);
703 }
704 return count;
705}
706
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
deleted file mode 100644
index 6c7e5468a2f8..000000000000
--- a/fs/ext3/inode.c
+++ /dev/null
@@ -1,3574 +0,0 @@
1/*
2 * linux/fs/ext3/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */
24
25#include <linux/highuid.h>
26#include <linux/quotaops.h>
27#include <linux/writeback.h>
28#include <linux/mpage.h>
29#include <linux/namei.h>
30#include <linux/uio.h>
31#include "ext3.h"
32#include "xattr.h"
33#include "acl.h"
34
35static int ext3_writepage_trans_blocks(struct inode *inode);
36static int ext3_block_truncate_page(struct inode *inode, loff_t from);
37
38/*
39 * Test whether an inode is a fast symlink.
40 */
41static int ext3_inode_is_fast_symlink(struct inode *inode)
42{
43 int ea_blocks = EXT3_I(inode)->i_file_acl ?
44 (inode->i_sb->s_blocksize >> 9) : 0;
45
46 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
47}
48
49/*
50 * The ext3 forget function must perform a revoke if we are freeing data
51 * which has been journaled. Metadata (eg. indirect blocks) must be
52 * revoked in all cases.
53 *
54 * "bh" may be NULL: a metadata block may have been freed from memory
55 * but there may still be a record of it in the journal, and that record
56 * still needs to be revoked.
57 */
58int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
59 struct buffer_head *bh, ext3_fsblk_t blocknr)
60{
61 int err;
62
63 might_sleep();
64
65 trace_ext3_forget(inode, is_metadata, blocknr);
66 BUFFER_TRACE(bh, "enter");
67
68 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
69 "data mode %lx\n",
70 bh, is_metadata, inode->i_mode,
71 test_opt(inode->i_sb, DATA_FLAGS));
72
73 /* Never use the revoke function if we are doing full data
74 * journaling: there is no need to, and a V1 superblock won't
75 * support it. Otherwise, only skip the revoke on un-journaled
76 * data blocks. */
77
78 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
79 (!is_metadata && !ext3_should_journal_data(inode))) {
80 if (bh) {
81 BUFFER_TRACE(bh, "call journal_forget");
82 return ext3_journal_forget(handle, bh);
83 }
84 return 0;
85 }
86
87 /*
88 * data!=journal && (is_metadata || should_journal_data(inode))
89 */
90 BUFFER_TRACE(bh, "call ext3_journal_revoke");
91 err = ext3_journal_revoke(handle, blocknr, bh);
92 if (err)
93 ext3_abort(inode->i_sb, __func__,
94 "error %d when attempting revoke", err);
95 BUFFER_TRACE(bh, "exit");
96 return err;
97}
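
The revoke-vs-forget decision above reduces to a small predicate. A standalone restatement (an illustrative sketch, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the decision made in ext3_forget() above: a revoke record
 * is required unless the whole filesystem runs data=journal, or the
 * block is an un-journaled data block. */
static bool forget_needs_revoke(bool full_data_journal, bool is_metadata,
				bool inode_journals_data)
{
	if (full_data_journal)
		return false;	/* never revoke in data=journal mode */
	return is_metadata || inode_journals_data;
}

int main(void)
{
	/* metadata block on a data=ordered filesystem -> revoke needed */
	printf("%d\n", forget_needs_revoke(false, true, false));  /* 1 */
	/* plain data block, inode not journaling data -> no revoke */
	printf("%d\n", forget_needs_revoke(false, false, false)); /* 0 */
	return 0;
}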
98
99/*
100 * Work out how many blocks we need to proceed with the next chunk of a
101 * truncate transaction.
102 */
103static unsigned long blocks_for_truncate(struct inode *inode)
104{
105 unsigned long needed;
106
107 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
108
109 /* Give ourselves just enough room to cope with inodes in which
110 * i_blocks is corrupt: we've seen disk corruptions in the past
111 * which resulted in random data in an inode which looked enough
112 * like a regular file for ext3 to try to delete it. Things
113 * will go a bit crazy if that happens, but at least we should
114 * try not to panic the whole kernel. */
115 if (needed < 2)
116 needed = 2;
117
118 /* But we need to bound the transaction so we don't overflow the
119 * journal. */
120 if (needed > EXT3_MAX_TRANS_DATA)
121 needed = EXT3_MAX_TRANS_DATA;
122
123 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
124}
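
As a worked example of the sizing above, assume a hypothetical 4 KB-block filesystem (blocksize_bits = 12); i_blocks counts 512-byte sectors, so the shift is 12 - 9 = 3:

#include <stdio.h>

int main(void)
{
	unsigned long i_blocks = 80;  /* hypothetical: 80 sectors = 10 blocks */
	int blocksize_bits = 12;      /* 4 KB filesystem blocks */
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < 2)               /* guard against corrupt i_blocks */
		needed = 2;
	/* the handle would ask for EXT3_DATA_TRANS_BLOCKS(sb) + needed,
	 * with needed capped at EXT3_MAX_TRANS_DATA for huge files */
	printf("extra credits for this truncate chunk: %lu\n", needed);
	return 0;
}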
125
126/*
127 * Truncate transactions can be complex and absolutely huge. So we need to
128 * be able to restart the transaction at a convenient checkpoint to make
129 * sure we don't overflow the journal.
130 *
131 * start_transaction gets us a new handle for a truncate transaction,
132 * and extend_transaction tries to extend the existing one a bit. If
133 * extend fails, we need to propagate the failure up and restart the
134 * transaction in the top-level truncate loop. --sct
135 */
136static handle_t *start_transaction(struct inode *inode)
137{
138 handle_t *result;
139
140 result = ext3_journal_start(inode, blocks_for_truncate(inode));
141 if (!IS_ERR(result))
142 return result;
143
144 ext3_std_error(inode->i_sb, PTR_ERR(result));
145 return result;
146}
147
148/*
149 * Try to extend this transaction for the purposes of truncation.
150 *
151 * Returns 0 if we managed to create more room. If we can't create more
152 * room, and the transaction must be restarted we return 1.
153 */
154static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
155{
156 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
157 return 0;
158 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
159 return 0;
160 return 1;
161}
162
163/*
164 * Restart the transaction associated with *handle. This does a commit,
165 * so before we call here everything must be consistently dirtied against
166 * this transaction.
167 */
168static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
169{
170 int ret;
171
172 jbd_debug(2, "restarting handle %p\n", handle);
173 /*
174 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
175 * At this moment, get_block can be called only for blocks inside
176 * i_size since page cache has been already dropped and writes are
177 * blocked by i_mutex. So we can safely drop the truncate_mutex.
178 */
179 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
180 ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
181 mutex_lock(&EXT3_I(inode)->truncate_mutex);
182 return ret;
183}
184
185/*
186 * Called at inode eviction from icache
187 */
188void ext3_evict_inode (struct inode *inode)
189{
190 struct ext3_inode_info *ei = EXT3_I(inode);
191 struct ext3_block_alloc_info *rsv;
192 handle_t *handle;
193 int want_delete = 0;
194
195 trace_ext3_evict_inode(inode);
196 if (!inode->i_nlink && !is_bad_inode(inode)) {
197 dquot_initialize(inode);
198 want_delete = 1;
199 }
200
201 /*
202 * When journalling data, dirty buffers are tracked only in the journal.
203 * So although mm thinks everything is clean and ready for reaping the
204 * inode might still have some pages to write in the running
205 * transaction or waiting to be checkpointed. Thus calling
206 * journal_invalidatepage() (via truncate_inode_pages()) to discard
207 * these buffers can cause data loss. Also even if we did not discard
208 * these buffers, we would have no way to find them after the inode
209 * is reaped, and thus the user could see stale data when reading
210 * them before the transaction is checkpointed. So be careful and
211 * force everything to disk here... We use ei->i_datasync_tid to
212 * store the newest transaction containing inode's data.
213 *
214 * Note that directories do not have this problem because they don't
215 * use page cache.
216 *
217 * The s_journal check handles the case when ext3_get_journal() fails
218 * and puts the journal inode.
219 */
220 if (inode->i_nlink && ext3_should_journal_data(inode) &&
221 EXT3_SB(inode->i_sb)->s_journal &&
222 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
223 inode->i_ino != EXT3_JOURNAL_INO) {
224 tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
225 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
226
227 log_start_commit(journal, commit_tid);
228 log_wait_commit(journal, commit_tid);
229 filemap_write_and_wait(&inode->i_data);
230 }
231 truncate_inode_pages_final(&inode->i_data);
232
233 ext3_discard_reservation(inode);
234 rsv = ei->i_block_alloc_info;
235 ei->i_block_alloc_info = NULL;
236 if (unlikely(rsv))
237 kfree(rsv);
238
239 if (!want_delete)
240 goto no_delete;
241
242 handle = start_transaction(inode);
243 if (IS_ERR(handle)) {
244 /*
245 * If we're going to skip the normal cleanup, we still need to
246 * make sure that the in-core orphan linked list is properly
247 * cleaned up.
248 */
249 ext3_orphan_del(NULL, inode);
250 goto no_delete;
251 }
252
253 if (IS_SYNC(inode))
254 handle->h_sync = 1;
255 inode->i_size = 0;
256 if (inode->i_blocks)
257 ext3_truncate(inode);
258 /*
259 * Kill off the orphan record created when the inode lost the last
260 * link. Note that ext3_orphan_del() has to be able to cope with the
261 * deletion of a non-existent orphan - ext3_truncate() could
262 * have removed the record.
263 */
264 ext3_orphan_del(handle, inode);
265 ei->i_dtime = get_seconds();
266
267 /*
268 * One subtle ordering requirement: if anything has gone wrong
269 * (transaction abort, IO errors, whatever), then we can still
270 * do these next steps (the fs will already have been marked as
271 * having errors), but we can't free the inode if the mark_dirty
272 * fails.
273 */
274 if (ext3_mark_inode_dirty(handle, inode)) {
275 /* If that failed, just dquot_drop() and be done with that */
276 dquot_drop(inode);
277 clear_inode(inode);
278 } else {
279 ext3_xattr_delete_inode(handle, inode);
280 dquot_free_inode(inode);
281 dquot_drop(inode);
282 clear_inode(inode);
283 ext3_free_inode(handle, inode);
284 }
285 ext3_journal_stop(handle);
286 return;
287no_delete:
288 clear_inode(inode);
289 dquot_drop(inode);
290}
291
292typedef struct {
293 __le32 *p;
294 __le32 key;
295 struct buffer_head *bh;
296} Indirect;
297
298static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
299{
300 p->key = *(p->p = v);
301 p->bh = bh;
302}
303
304static int verify_chain(Indirect *from, Indirect *to)
305{
306 while (from <= to && from->key == *from->p)
307 from++;
308 return (from > to);
309}
310
311/**
312 * ext3_block_to_path - parse the block number into array of offsets
313 * @inode: inode in question (we are only interested in its superblock)
314 * @i_block: block number to be parsed
315 * @offsets: array to store the offsets in
316 * @boundary: set this non-zero if the referred-to block is likely to be
317 * followed (on disk) by an indirect block.
318 *
319 * To store the locations of a file's data, ext3 uses a data structure
320 * common to UNIX filesystems - a tree of pointers anchored in the inode,
321 * with data blocks at the leaves and indirect blocks in intermediate nodes.
322 * This function translates the block number into a path in that tree -
323 * the return value is the path length and @offsets[n] is the offset of
324 * the pointer to the (n+1)th node in the nth one. If @i_block is out of
325 * range (negative or too large), a warning is printed and zero is returned.
326 *
327 * Note: function doesn't find node addresses, so no IO is needed. All
328 * we need to know is the capacity of indirect blocks (taken from the
329 * inode->i_sb).
330 */
331
332/*
333 * Portability note: the last comparison (check that we fit into triple
334 * indirect block) is spelled differently, because otherwise on an
335 * architecture with 32-bit longs and 8Kb pages we might get into trouble
336 * if our filesystem had 8Kb blocks. We might use long long, but that would
337 * kill us on x86. Oh, well, at least the sign propagation does not matter -
338 * i_block would have to be negative in the very beginning, so we would not
339 * get there at all.
340 */
341
342static int ext3_block_to_path(struct inode *inode,
343 long i_block, int offsets[4], int *boundary)
344{
345 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
346 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
347 const long direct_blocks = EXT3_NDIR_BLOCKS,
348 indirect_blocks = ptrs,
349 double_blocks = (1 << (ptrs_bits * 2));
350 int n = 0;
351 int final = 0;
352
353 if (i_block < 0) {
354 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
355 } else if (i_block < direct_blocks) {
356 offsets[n++] = i_block;
357 final = direct_blocks;
358 } else if ((i_block -= direct_blocks) < indirect_blocks) {
359 offsets[n++] = EXT3_IND_BLOCK;
360 offsets[n++] = i_block;
361 final = ptrs;
362 } else if ((i_block -= indirect_blocks) < double_blocks) {
363 offsets[n++] = EXT3_DIND_BLOCK;
364 offsets[n++] = i_block >> ptrs_bits;
365 offsets[n++] = i_block & (ptrs - 1);
366 final = ptrs;
367 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
368 offsets[n++] = EXT3_TIND_BLOCK;
369 offsets[n++] = i_block >> (ptrs_bits * 2);
370 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
371 offsets[n++] = i_block & (ptrs - 1);
372 final = ptrs;
373 } else {
374 ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
375 }
376 if (boundary)
377 *boundary = final - 1 - (i_block & (ptrs - 1));
378 return n;
379}
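
A worked example of the translation above, for a hypothetical 1 KB-block filesystem (256 pointers per indirect block) and logical block 1000; this mirrors the branch structure but is plain user-space code:

#include <stdio.h>

int main(void)
{
	long i_block = 1000;                    /* hypothetical logical block */
	const int ptrs = 256, ptrs_bits = 8;    /* 1 KB blocks: 256 pointers */
	const int direct_blocks = 12;
	int offsets[4] = {0}, n = 0, i;

	if (i_block < direct_blocks) {
		offsets[n++] = i_block;
	} else if ((i_block -= direct_blocks) < ptrs) {
		offsets[n++] = 12;                      /* EXT3_IND_BLOCK */
		offsets[n++] = i_block;
	} else if ((i_block -= ptrs) < (1 << (ptrs_bits * 2))) {
		offsets[n++] = 13;                      /* EXT3_DIND_BLOCK */
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
	}
	printf("depth %d, path:", n);
	for (i = 0; i < n; i++)
		printf(" %d", offsets[i]);
	printf("\n");   /* block 1000 -> depth 3, path: 13 2 220 */
	return 0;
}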
380
381/**
382 * ext3_get_branch - read the chain of indirect blocks leading to data
383 * @inode: inode in question
384 * @depth: depth of the chain (1 - direct pointer, etc.)
385 * @offsets: offsets of pointers in inode/indirect blocks
386 * @chain: place to store the result
387 * @err: here we store the error value
388 *
389 * Function fills the array of triples <key, p, bh> and returns %NULL
390 * if everything went OK or the pointer to the last filled triple
391 * (incomplete one) otherwise. Upon the return chain[i].key contains
392 * the number of (i+1)-th block in the chain (as it is stored in memory,
393 * i.e. little-endian 32-bit), chain[i].p contains the address of that
394 * number (it points into struct inode for i==0 and into the bh->b_data
395 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
396 * block for i>0 and NULL for i==0. In other words, it holds the block
397 * numbers of the chain, addresses they were taken from (and where we can
398 * verify that chain did not change) and buffer_heads hosting these
399 * numbers.
400 *
401 * Function stops when it stumbles upon zero pointer (absent block)
402 * (pointer to last triple returned, *@err == 0)
403 * or when it gets an IO error reading an indirect block
404 * (ditto, *@err == -EIO)
405 * or when it notices that chain had been changed while it was reading
406 * (ditto, *@err == -EAGAIN)
407 * or when it reads all @depth-1 indirect blocks successfully and finds
408 * the whole chain, all way to the data (returns %NULL, *err == 0).
409 */
410static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
411 Indirect chain[4], int *err)
412{
413 struct super_block *sb = inode->i_sb;
414 Indirect *p = chain;
415 struct buffer_head *bh;
416
417 *err = 0;
418 /* i_data is not going away, no lock needed */
419 add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
420 if (!p->key)
421 goto no_block;
422 while (--depth) {
423 bh = sb_bread(sb, le32_to_cpu(p->key));
424 if (!bh)
425 goto failure;
426 /* Reader: pointers */
427 if (!verify_chain(chain, p))
428 goto changed;
429 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
430 /* Reader: end */
431 if (!p->key)
432 goto no_block;
433 }
434 return NULL;
435
436changed:
437 brelse(bh);
438 *err = -EAGAIN;
439 goto no_block;
440failure:
441 *err = -EIO;
442no_block:
443 return p;
444}
445
446/**
447 * ext3_find_near - find a place for allocation with sufficient locality
448 * @inode: owner
449 * @ind: descriptor of indirect block.
450 *
451 * This function returns the preferred place for block allocation.
452 * It is used when heuristic for sequential allocation fails.
453 * Rules are:
454 * + if there is a block to the left of our position - allocate near it.
455 * + if pointer will live in indirect block - allocate near that block.
456 * + if pointer will live in inode - allocate in the same
457 * cylinder group.
458 *
459 * In the latter case we colour the starting block by the caller's PID to
460 * prevent it from clashing with concurrent allocations for a different inode
461 * in the same block group. The PID is used here so that functionally related
462 * files will be close-by on-disk.
463 *
464 * Caller must make sure that @ind is valid and will stay that way.
465 */
466static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
467{
468 struct ext3_inode_info *ei = EXT3_I(inode);
469 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
470 __le32 *p;
471 ext3_fsblk_t bg_start;
472 ext3_grpblk_t colour;
473
474 /* Try to find previous block */
475 for (p = ind->p - 1; p >= start; p--) {
476 if (*p)
477 return le32_to_cpu(*p);
478 }
479
480 /* No such thing, so let's try location of indirect block */
481 if (ind->bh)
482 return ind->bh->b_blocknr;
483
484 /*
485 * Is it going to be referred to from the inode itself? OK, just put it
486 * into the same cylinder group then.
487 */
488 bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
489 colour = (current->pid % 16) *
490 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
491 return bg_start + colour;
492}
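
The PID colouring in the in-inode case is easy to check by hand; a tiny sketch with an invented pid, group start and group size:

#include <stdio.h>

int main(void)
{
	unsigned long blocks_per_group = 32768;  /* hypothetical */
	int pid = 1234;                          /* hypothetical caller */
	unsigned long bg_start = 1048576;        /* hypothetical group start */
	unsigned long colour = (pid % 16) * (blocks_per_group / 16);

	/* 1234 % 16 = 2, so colour = 2 * 2048 = 4096 */
	printf("goal = %lu\n", bg_start + colour);
	return 0;
}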
493
494/**
495 * ext3_find_goal - find a preferred place for allocation.
496 * @inode: owner
497 * @block: block we want
498 * @partial: pointer to the last triple within a chain
499 *
500 * Normally this function finds the preferred place for block allocation
501 * and returns it.
502 */
503
504static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
505 Indirect *partial)
506{
507 struct ext3_block_alloc_info *block_i;
508
509 block_i = EXT3_I(inode)->i_block_alloc_info;
510
511 /*
512 * try the heuristic for sequential allocation,
513 * failing that at least try to get decent locality.
514 */
515 if (block_i && (block == block_i->last_alloc_logical_block + 1)
516 && (block_i->last_alloc_physical_block != 0)) {
517 return block_i->last_alloc_physical_block + 1;
518 }
519
520 return ext3_find_near(inode, partial);
521}
522
523/**
524 * ext3_blks_to_allocate - Look up the block map and count the number
525 * of direct blocks that need to be allocated for the given branch.
526 *
527 * @branch: chain of indirect blocks
528 * @k: number of blocks needed for indirect blocks
529 * @blks: number of data blocks to be mapped.
530 * @blocks_to_boundary: the offset in the indirect block
531 *
532 * return the total number of blocks to be allocated, including the
533 * direct and indirect blocks.
534 */
535static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
536 int blocks_to_boundary)
537{
538 unsigned long count = 0;
539
540 /*
541 * Simple case: the [t,d]indirect block(s) have not been allocated yet,
542 * so clearly the blocks on that path have not been allocated either.
543 */
544 if (k > 0) {
545 /* right now we don't handle cross boundary allocation */
546 if (blks < blocks_to_boundary + 1)
547 count += blks;
548 else
549 count += blocks_to_boundary + 1;
550 return count;
551 }
552
553 count++;
554 while (count < blks && count <= blocks_to_boundary &&
555 le32_to_cpu(*(branch[0].p + count)) == 0) {
556 count++;
557 }
558 return count;
559}
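
The k == 0 branch simply counts how many consecutive pointers after the first are still holes. A user-space sketch with a fabricated indirect-block map:

#include <stdio.h>

int main(void)
{
	/* Fake indirect-block contents: first pointer just allocated,
	 * then three holes, then an allocated block (all hypothetical). */
	unsigned int map[6] = { 900, 0, 0, 0, 905, 0 };
	unsigned long blks = 8, count = 1;
	int blocks_to_boundary = 5;

	while (count < blks && count <= (unsigned long)blocks_to_boundary &&
	       map[count] == 0)
		count++;
	printf("direct blocks to allocate: %lu\n", count); /* prints 4 */
	return 0;
}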
560
561/**
562 * ext3_alloc_blocks - multiple allocate blocks needed for a branch
563 * @handle: handle for this transaction
564 * @inode: owner
565 * @goal: preferred place for allocation
566 * @indirect_blks: the number of blocks we need to allocate for indirect
567 * blocks
568 * @blks: number of blocks we need to allocate for direct blocks
569 * @new_blocks: on return it will store the new block numbers for
570 * the indirect blocks (if needed) and the first direct block.
571 * @err: here we store the error value
572 *
573 * return the number of direct blocks allocated
574 */
575static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
576 ext3_fsblk_t goal, int indirect_blks, int blks,
577 ext3_fsblk_t new_blocks[4], int *err)
578{
579 int target, i;
580 unsigned long count = 0;
581 int index = 0;
582 ext3_fsblk_t current_block = 0;
583 int ret = 0;
584
585 /*
586 * Here we try to allocate the requested multiple blocks at once,
587 * on a best-effort basis.
588 * To build a branch, we have to allocate blocks for
589 * the indirect blocks (if not allocated yet), and at least
590 * the first direct block of this branch. That's the
591 * minimum number of blocks we need to allocate (required).
592 */
593 target = blks + indirect_blks;
594
595 while (1) {
596 count = target;
597 /* allocating blocks for indirect blocks and direct blocks */
598 current_block = ext3_new_blocks(handle, inode, goal, &count, err);
599 if (*err)
600 goto failed_out;
601
602 target -= count;
603 /* allocate blocks for indirect blocks */
604 while (index < indirect_blks && count) {
605 new_blocks[index++] = current_block++;
606 count--;
607 }
608
609 if (count > 0)
610 break;
611 }
612
613 /* save the new block number for the first direct block */
614 new_blocks[index] = current_block;
615
616 /* total number of blocks allocated for direct blocks */
617 ret = count;
618 *err = 0;
619 return ret;
620failed_out:
621 for (i = 0; i < index; i++)
622 ext3_free_blocks(handle, inode, new_blocks[i], 1);
623 return ret;
624}
625
626/**
627 * ext3_alloc_branch - allocate and set up a chain of blocks.
628 * @handle: handle for this transaction
629 * @inode: owner
630 * @indirect_blks: number of allocated indirect blocks
631 * @blks: number of allocated direct blocks
632 * @goal: preferred place for allocation
633 * @offsets: offsets (in the blocks) to store the pointers to next.
634 * @branch: place to store the chain in.
635 *
636 * This function allocates blocks, zeroes out all but the last one,
637 * links them into chain and (if we are synchronous) writes them to disk.
638 * In other words, it prepares a branch that can be spliced onto the
639 * inode. It stores the information about that chain in the branch[], in
640 * the same format as ext3_get_branch() would do. We are calling it after
641 * we had read the existing part of chain and partial points to the last
642 * triple of that (one with zero ->key). Upon the exit we have the same
643 * picture as after the successful ext3_get_block(), except that in one
644 * place chain is disconnected - *branch->p is still zero (we did not
645 * set the last link), but branch->key contains the number that should
646 * be placed into *branch->p to fill that gap.
647 *
648 * If allocation fails we free all blocks we've allocated (and forget
649 * their buffer_heads) and return the error value from the failed
650 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
651 * as described above and return 0.
652 */
653static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
654 int indirect_blks, int *blks, ext3_fsblk_t goal,
655 int *offsets, Indirect *branch)
656{
657 int blocksize = inode->i_sb->s_blocksize;
658 int i, n = 0;
659 int err = 0;
660 struct buffer_head *bh;
661 int num;
662 ext3_fsblk_t new_blocks[4];
663 ext3_fsblk_t current_block;
664
665 num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
666 *blks, new_blocks, &err);
667 if (err)
668 return err;
669
670 branch[0].key = cpu_to_le32(new_blocks[0]);
671 /*
672 * metadata blocks and data blocks are allocated.
673 */
674 for (n = 1; n <= indirect_blks; n++) {
675 /*
676 * Get buffer_head for parent block, zero it out
677 * and set the pointer to new one, then send
678 * parent to disk.
679 */
680 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
681 if (unlikely(!bh)) {
682 err = -ENOMEM;
683 goto failed;
684 }
685 branch[n].bh = bh;
686 lock_buffer(bh);
687 BUFFER_TRACE(bh, "call get_create_access");
688 err = ext3_journal_get_create_access(handle, bh);
689 if (err) {
690 unlock_buffer(bh);
691 brelse(bh);
692 goto failed;
693 }
694
695 memset(bh->b_data, 0, blocksize);
696 branch[n].p = (__le32 *) bh->b_data + offsets[n];
697 branch[n].key = cpu_to_le32(new_blocks[n]);
698 *branch[n].p = branch[n].key;
699 if (n == indirect_blks) {
700 current_block = new_blocks[n];
701 /*
702 * End of chain, update the last new metablock of
703 * the chain to point to the new allocated
704 * data blocks numbers
705 */
706 for (i = 1; i < num; i++)
707 *(branch[n].p + i) = cpu_to_le32(++current_block);
708 }
709 BUFFER_TRACE(bh, "marking uptodate");
710 set_buffer_uptodate(bh);
711 unlock_buffer(bh);
712
713 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
714 err = ext3_journal_dirty_metadata(handle, bh);
715 if (err)
716 goto failed;
717 }
718 *blks = num;
719 return err;
720failed:
721 /* Allocation failed, free what we already allocated */
722 for (i = 1; i <= n ; i++) {
723 BUFFER_TRACE(branch[i].bh, "call journal_forget");
724 ext3_journal_forget(handle, branch[i].bh);
725 }
726 for (i = 0; i < indirect_blks; i++)
727 ext3_free_blocks(handle, inode, new_blocks[i], 1);
728
729 ext3_free_blocks(handle, inode, new_blocks[i], num);
730
731 return err;
732}
733
734/**
735 * ext3_splice_branch - splice the allocated branch onto inode.
736 * @handle: handle for this transaction
737 * @inode: owner
738 * @block: (logical) number of block we are adding
739 * @where: location of missing link
740 * @num: number of indirect blocks we are adding
741 * @blks: number of direct blocks we are adding
742 *
743 * This function fills the missing link and does all housekeeping needed in
744 * inode (->i_blocks, etc.). In case of success we end up with the full
745 * chain to new block and return 0.
746 */
747static int ext3_splice_branch(handle_t *handle, struct inode *inode,
748 long block, Indirect *where, int num, int blks)
749{
750 int i;
751 int err = 0;
752 struct ext3_block_alloc_info *block_i;
753 ext3_fsblk_t current_block;
754 struct ext3_inode_info *ei = EXT3_I(inode);
755 struct timespec now;
756
757 block_i = ei->i_block_alloc_info;
758 /*
759 * If we're splicing into a [td]indirect block (as opposed to the
760 * inode) then we need to get write access to the [td]indirect block
761 * before the splice.
762 */
763 if (where->bh) {
764 BUFFER_TRACE(where->bh, "get_write_access");
765 err = ext3_journal_get_write_access(handle, where->bh);
766 if (err)
767 goto err_out;
768 }
769 /* That's it */
770
771 *where->p = where->key;
772
773 /*
774 * Update the host buffer_head or inode to point to the just-allocated
775 * direct blocks
776 */
777 if (num == 0 && blks > 1) {
778 current_block = le32_to_cpu(where->key) + 1;
779 for (i = 1; i < blks; i++)
780 *(where->p + i) = cpu_to_le32(current_block++);
781 }
782
783 /*
784 * Update the most recently allocated logical & physical block
785 * in i_block_alloc_info, to help find the proper goal block for the
786 * next allocation
787 */
788 if (block_i) {
789 block_i->last_alloc_logical_block = block + blks - 1;
790 block_i->last_alloc_physical_block =
791 le32_to_cpu(where[num].key) + blks - 1;
792 }
793
794 /* We are done with atomic stuff, now do the rest of housekeeping */
795 now = CURRENT_TIME_SEC;
796 if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
797 inode->i_ctime = now;
798 ext3_mark_inode_dirty(handle, inode);
799 }
800 /* ext3_mark_inode_dirty already updated i_sync_tid */
801 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
802
803 /* had we spliced it onto indirect block? */
804 if (where->bh) {
805 /*
806 * If we spliced it onto an indirect block, we haven't
807 * altered the inode. Note however that if it is being spliced
808 * onto an indirect block at the very end of the file (the
809 * file is growing) then we *will* alter the inode to reflect
810 * the new i_size. But that is not done here - it is done in
811 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
812 */
813 jbd_debug(5, "splicing indirect only\n");
814 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
815 err = ext3_journal_dirty_metadata(handle, where->bh);
816 if (err)
817 goto err_out;
818 } else {
819 /*
820 * OK, we spliced it into the inode itself on a direct block.
821 * Inode was dirtied above.
822 */
823 jbd_debug(5, "splicing direct\n");
824 }
825 return err;
826
827err_out:
828 for (i = 1; i <= num; i++) {
829 BUFFER_TRACE(where[i].bh, "call journal_forget");
830 ext3_journal_forget(handle, where[i].bh);
831 ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
832 }
833 ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
834
835 return err;
836}
837
838/*
839 * Allocation strategy is simple: if we have to allocate something, we will
840 * have to go the whole way to the leaf. So let's do it before attaching anything
841 * to tree, set linkage between the newborn blocks, write them if sync is
842 * required, recheck the path, free and repeat if check fails, otherwise
843 * set the last missing link (that will protect us from any truncate-generated
844 * removals - all blocks on the path are immune now) and possibly force the
845 * write on the parent block.
846 * That has a nice additional property: no special recovery from the failed
847 * allocations is needed - we simply release blocks and do not touch anything
848 * reachable from inode.
849 *
850 * `handle' can be NULL if create == 0.
851 *
852 * The BKL may not be held on entry here. Be sure to take it early.
853 * return > 0, # of blocks mapped or allocated.
854 * return = 0, if plain lookup failed.
855 * return < 0, error case.
856 */
857int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
858 sector_t iblock, unsigned long maxblocks,
859 struct buffer_head *bh_result,
860 int create)
861{
862 int err = -EIO;
863 int offsets[4];
864 Indirect chain[4];
865 Indirect *partial;
866 ext3_fsblk_t goal;
867 int indirect_blks;
868 int blocks_to_boundary = 0;
869 int depth;
870 struct ext3_inode_info *ei = EXT3_I(inode);
871 int count = 0;
872 ext3_fsblk_t first_block = 0;
873
874
875 trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
876 J_ASSERT(handle != NULL || create == 0);
877 depth = ext3_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
878
879 if (depth == 0)
880 goto out;
881
882 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
883
884 /* Simplest case - block found, no allocation needed */
885 if (!partial) {
886 first_block = le32_to_cpu(chain[depth - 1].key);
887 clear_buffer_new(bh_result);
888 count++;
889 /* map more blocks */
890 while (count < maxblocks && count <= blocks_to_boundary) {
891 ext3_fsblk_t blk;
892
893 if (!verify_chain(chain, chain + depth - 1)) {
894 /*
895 * The indirect block might have been removed by
896 * truncate while we were reading it.
897 * Handling of that case: forget what we've
898 * got now, and flag the error as -EAGAIN so the
899 * chain will be re-read below.
900 */
901 err = -EAGAIN;
902 count = 0;
903 break;
904 }
905 blk = le32_to_cpu(*(chain[depth-1].p + count));
906
907 if (blk == first_block + count)
908 count++;
909 else
910 break;
911 }
912 if (err != -EAGAIN)
913 goto got_it;
914 }
915
916 /* Next simple case - plain lookup or failed read of indirect block */
917 if (!create || err == -EIO)
918 goto cleanup;
919
920 /*
921 * Block out ext3_truncate while we alter the tree
922 */
923 mutex_lock(&ei->truncate_mutex);
924
925 /*
926 * If an indirect block was missing while we were reading
927 * the chain (ext3_get_branch() returned -EAGAIN), or
928 * if the chain has changed after we grabbed the mutex
929 * (either because another process truncated this branch, or
930 * another get_block allocated this branch), re-read the chain to see
931 * whether the requested block has been allocated or not.
932 *
933 * Since we already block the truncate/other get_block
934 * at this point, we will have the current copy of the chain when we
935 * splice the branch into the tree.
936 */
937 if (err == -EAGAIN || !verify_chain(chain, partial)) {
938 while (partial > chain) {
939 brelse(partial->bh);
940 partial--;
941 }
942 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
943 if (!partial) {
944 count++;
945 mutex_unlock(&ei->truncate_mutex);
946 if (err)
947 goto cleanup;
948 clear_buffer_new(bh_result);
949 goto got_it;
950 }
951 }
952
953 /*
954 * Okay, we need to do block allocation. Lazily initialize the block
955 * allocation info here if necessary
956 */
957 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
958 ext3_init_block_alloc_info(inode);
959
960 goal = ext3_find_goal(inode, iblock, partial);
961
962 /* the number of blocks need to allocate for [d,t]indirect blocks */
963 indirect_blks = (chain + depth) - partial - 1;
964
965 /*
966 * Next look up the indirect map to count the total number of
967 * direct blocks to allocate for this branch.
968 */
969 count = ext3_blks_to_allocate(partial, indirect_blks,
970 maxblocks, blocks_to_boundary);
971 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
972 offsets + (partial - chain), partial);
973
974 /*
975 * The ext3_splice_branch call will free and forget any buffers
976 * on the new chain if there is a failure, but that risks using
977 * up transaction credits, especially for bitmaps where the
978 * credits cannot be returned. Can we handle this somehow? We
979 * may need to return -EAGAIN upwards in the worst case. --sct
980 */
981 if (!err)
982 err = ext3_splice_branch(handle, inode, iblock,
983 partial, indirect_blks, count);
984 mutex_unlock(&ei->truncate_mutex);
985 if (err)
986 goto cleanup;
987
988 set_buffer_new(bh_result);
989got_it:
990 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
991 if (count > blocks_to_boundary)
992 set_buffer_boundary(bh_result);
993 err = count;
994 /* Clean up and exit */
995 partial = chain + depth - 1; /* the whole chain */
996cleanup:
997 while (partial > chain) {
998 BUFFER_TRACE(partial->bh, "call brelse");
999 brelse(partial->bh);
1000 partial--;
1001 }
1002 BUFFER_TRACE(bh_result, "returned");
1003out:
1004 trace_ext3_get_blocks_exit(inode, iblock,
1005 depth ? le32_to_cpu(chain[depth-1].key) : 0,
1006 count, err);
1007 return err;
1008}
1009
1010/* Maximum number of blocks we map for direct IO at once. */
1011#define DIO_MAX_BLOCKS 4096
1012/*
1013 * Number of credits we need for writing DIO_MAX_BLOCKS:
1014 * We need sb + group descriptor + bitmap + inode -> 4
1015 * For B blocks with A block pointers per block we need:
1016 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
1017 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
1018 */
1019#define DIO_CREDITS 25
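
The arithmetic behind the 25 can be checked directly with the B = 4096, A = 256 values from the comment; a throwaway sketch:

#include <stdio.h>

int main(void)
{
	int B = 4096, A = 256;  /* values from the comment above */
	int fixed = 4;          /* sb + group descriptor + bitmap + inode */
	int credits = fixed + 1        /* triple indirect */
		    + (B / A / A + 2)  /* doubly indirect: 0 + 2 */
		    + (B / A + 2);     /* indirect: 16 + 2 */

	printf("DIO credits = %d\n", credits); /* prints 25 */
	return 0;
}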
1020
1021static int ext3_get_block(struct inode *inode, sector_t iblock,
1022 struct buffer_head *bh_result, int create)
1023{
1024 handle_t *handle = ext3_journal_current_handle();
1025 int ret = 0, started = 0;
1026 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1027
1028 if (create && !handle) { /* Direct IO write... */
1029 if (max_blocks > DIO_MAX_BLOCKS)
1030 max_blocks = DIO_MAX_BLOCKS;
1031 handle = ext3_journal_start(inode, DIO_CREDITS +
1032 EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
1033 if (IS_ERR(handle)) {
1034 ret = PTR_ERR(handle);
1035 goto out;
1036 }
1037 started = 1;
1038 }
1039
1040 ret = ext3_get_blocks_handle(handle, inode, iblock,
1041 max_blocks, bh_result, create);
1042 if (ret > 0) {
1043 bh_result->b_size = (ret << inode->i_blkbits);
1044 ret = 0;
1045 }
1046 if (started)
1047 ext3_journal_stop(handle);
1048out:
1049 return ret;
1050}
1051
1052int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1053 u64 start, u64 len)
1054{
1055 return generic_block_fiemap(inode, fieinfo, start, len,
1056 ext3_get_block);
1057}
1058
1059/*
1060 * `handle' can be NULL if create is zero
1061 */
1062struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1063 long block, int create, int *errp)
1064{
1065 struct buffer_head dummy;
1066 int fatal = 0, err;
1067
1068 J_ASSERT(handle != NULL || create == 0);
1069
1070 dummy.b_state = 0;
1071 dummy.b_blocknr = -1000;
1072 buffer_trace_init(&dummy.b_history);
1073 err = ext3_get_blocks_handle(handle, inode, block, 1,
1074 &dummy, create);
1075 /*
1076 * ext3_get_blocks_handle() returns the number of blocks
1077 * mapped, or 0 in the case of a hole.
1078 */
1079 if (err > 0) {
1080 WARN_ON(err > 1);
1081 err = 0;
1082 }
1083 *errp = err;
1084 if (!err && buffer_mapped(&dummy)) {
1085 struct buffer_head *bh;
1086 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1087 if (unlikely(!bh)) {
1088 *errp = -ENOMEM;
1089 goto err;
1090 }
1091 if (buffer_new(&dummy)) {
1092 J_ASSERT(create != 0);
1093 J_ASSERT(handle != NULL);
1094
1095 /*
1096 * Now that we do not always journal data, we should
1097 * keep in mind whether this should always journal the
1098 * new buffer as metadata. For now, regular file
1099 * writes use ext3_get_block instead, so it's not a
1100 * problem.
1101 */
1102 lock_buffer(bh);
1103 BUFFER_TRACE(bh, "call get_create_access");
1104 fatal = ext3_journal_get_create_access(handle, bh);
1105 if (!fatal && !buffer_uptodate(bh)) {
1106 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1107 set_buffer_uptodate(bh);
1108 }
1109 unlock_buffer(bh);
1110 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1111 err = ext3_journal_dirty_metadata(handle, bh);
1112 if (!fatal)
1113 fatal = err;
1114 } else {
1115 BUFFER_TRACE(bh, "not a new buffer");
1116 }
1117 if (fatal) {
1118 *errp = fatal;
1119 brelse(bh);
1120 bh = NULL;
1121 }
1122 return bh;
1123 }
1124err:
1125 return NULL;
1126}
1127
1128struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1129 int block, int create, int *err)
1130{
1131 struct buffer_head * bh;
1132
1133 bh = ext3_getblk(handle, inode, block, create, err);
1134 if (!bh)
1135 return bh;
1136 if (bh_uptodate_or_lock(bh))
1137 return bh;
1138 get_bh(bh);
1139 bh->b_end_io = end_buffer_read_sync;
1140 submit_bh(READ | REQ_META | REQ_PRIO, bh);
1141 wait_on_buffer(bh);
1142 if (buffer_uptodate(bh))
1143 return bh;
1144 put_bh(bh);
1145 *err = -EIO;
1146 return NULL;
1147}
1148
1149static int walk_page_buffers(handle_t *handle,
1150 struct buffer_head *head,
1151 unsigned from,
1152 unsigned to,
1153 int *partial,
1154 int (*fn)(handle_t *handle,
1155 struct buffer_head *bh))
1156{
1157 struct buffer_head *bh;
1158 unsigned block_start, block_end;
1159 unsigned blocksize = head->b_size;
1160 int err, ret = 0;
1161 struct buffer_head *next;
1162
1163 for (bh = head, block_start = 0;
1164 ret == 0 && (bh != head || !block_start);
1165 block_start = block_end, bh = next)
1166 {
1167 next = bh->b_this_page;
1168 block_end = block_start + blocksize;
1169 if (block_end <= from || block_start >= to) {
1170 if (partial && !buffer_uptodate(bh))
1171 *partial = 1;
1172 continue;
1173 }
1174 err = (*fn)(handle, bh);
1175 if (!ret)
1176 ret = err;
1177 }
1178 return ret;
1179}
1180
1181/*
1182 * To preserve ordering, it is essential that the hole instantiation and
1183 * the data write be encapsulated in a single transaction. We cannot
1184 * close off a transaction and start a new one between the ext3_get_block()
1185 * and the commit_write(). So the start of prepare_write() is the
1186 * right place to do the journal_start.
1187 *
1188 * Also, this function can nest inside ext3_writepage() ->
1189 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1190 * has generated enough buffer credits to do the whole page. So we won't
1191 * block on the journal in that case, which is good, because the caller may
1192 * be PF_MEMALLOC.
1193 *
1194 * By accident, ext3 can be reentered when a transaction is open via
1195 * quota file writes. If we were to commit the transaction while thus
1196 * reentered, there can be a deadlock - we would be holding a quota
1197 * lock, and the commit would never complete if another thread had a
1198 * transaction open and was blocking on the quota lock - a ranking
1199 * violation.
1200 *
1201 * So what we do is to rely on the fact that journal_stop/journal_start
1202 * will _not_ run commit under these circumstances because handle->h_ref
1203 * is elevated. We'll still have enough credits for the tiny quotafile
1204 * write.
1205 */
1206static int do_journal_get_write_access(handle_t *handle,
1207 struct buffer_head *bh)
1208{
1209 int dirty = buffer_dirty(bh);
1210 int ret;
1211
1212 if (!buffer_mapped(bh) || buffer_freed(bh))
1213 return 0;
1214 /*
1215 * __block_prepare_write() could have dirtied some buffers. Clear
1216 * the dirty bit, as journal_get_write_access() would otherwise
1217 * complain about fs integrity issues. Setting of the dirty bit
1218 * by __block_prepare_write() isn't a real problem here as we clear
1219 * the bit before releasing the page lock and thus writeback cannot
1220 * ever write the buffer.
1221 */
1222 if (dirty)
1223 clear_buffer_dirty(bh);
1224 ret = ext3_journal_get_write_access(handle, bh);
1225 if (!ret && dirty)
1226 ret = ext3_journal_dirty_metadata(handle, bh);
1227 return ret;
1228}
1229
1230/*
1231 * Truncate blocks that were not used by write. We have to truncate the
1232 * pagecache as well so that corresponding buffers get properly unmapped.
1233 */
1234static void ext3_truncate_failed_write(struct inode *inode)
1235{
1236 truncate_inode_pages(inode->i_mapping, inode->i_size);
1237 ext3_truncate(inode);
1238}
1239
1240/*
1241 * Truncate blocks that were not used by direct IO write. We have to zero out
1242 * the last file block as well because direct IO might have written to it.
1243 */
1244static void ext3_truncate_failed_direct_write(struct inode *inode)
1245{
1246 ext3_block_truncate_page(inode, inode->i_size);
1247 ext3_truncate(inode);
1248}
1249
1250static int ext3_write_begin(struct file *file, struct address_space *mapping,
1251 loff_t pos, unsigned len, unsigned flags,
1252 struct page **pagep, void **fsdata)
1253{
1254 struct inode *inode = mapping->host;
1255 int ret;
1256 handle_t *handle;
1257 int retries = 0;
1258 struct page *page;
1259 pgoff_t index;
1260 unsigned from, to;
1261 /* Reserve one more block for addition to the orphan list in case
1262 * we allocate blocks but the write fails for some reason */
1263 int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
1264
1265 trace_ext3_write_begin(inode, pos, len, flags);
1266
1267 index = pos >> PAGE_CACHE_SHIFT;
1268 from = pos & (PAGE_CACHE_SIZE - 1);
1269 to = from + len;
1270
1271retry:
1272 page = grab_cache_page_write_begin(mapping, index, flags);
1273 if (!page)
1274 return -ENOMEM;
1275 *pagep = page;
1276
1277 handle = ext3_journal_start(inode, needed_blocks);
1278 if (IS_ERR(handle)) {
1279 unlock_page(page);
1280 page_cache_release(page);
1281 ret = PTR_ERR(handle);
1282 goto out;
1283 }
1284 ret = __block_write_begin(page, pos, len, ext3_get_block);
1285 if (ret)
1286 goto write_begin_failed;
1287
1288 if (ext3_should_journal_data(inode)) {
1289 ret = walk_page_buffers(handle, page_buffers(page),
1290 from, to, NULL, do_journal_get_write_access);
1291 }
1292write_begin_failed:
1293 if (ret) {
1294 /*
1295 * block_write_begin may have instantiated a few blocks
1296 * outside i_size. Trim these off again. Don't need
1297 * i_size_read because we hold i_mutex.
1298 *
1299 * Add inode to orphan list in case we crash before truncate
1300 * finishes. Do this only if ext3_can_truncate() agrees so
1301 * that orphan processing code is happy.
1302 */
1303 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1304 ext3_orphan_add(handle, inode);
1305 ext3_journal_stop(handle);
1306 unlock_page(page);
1307 page_cache_release(page);
1308 if (pos + len > inode->i_size)
1309 ext3_truncate_failed_write(inode);
1310 }
1311 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1312 goto retry;
1313out:
1314 return ret;
1315}
1316
1317
1318int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1319{
1320 int err = journal_dirty_data(handle, bh);
1321 if (err)
1322 ext3_journal_abort_handle(__func__, __func__,
1323 bh, handle, err);
1324 return err;
1325}
1326
1327/* For ordered writepage and write_end functions */
1328static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1329{
1330 /*
1331 * The write could have mapped the buffer but not copied the data in
1332 * yet, so avoid filing such a buffer into a transaction.
1333 */
1334 if (buffer_mapped(bh) && buffer_uptodate(bh))
1335 return ext3_journal_dirty_data(handle, bh);
1336 return 0;
1337}
1338
1339/* For write_end() in data=journal mode */
1340static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1341{
1342 if (!buffer_mapped(bh) || buffer_freed(bh))
1343 return 0;
1344 set_buffer_uptodate(bh);
1345 return ext3_journal_dirty_metadata(handle, bh);
1346}
1347
1348/*
1349 * This is nasty and subtle: ext3_write_begin() could have allocated blocks
1350 * for the whole page but later we failed to copy the data in. Update inode
1351 * size according to what we managed to copy. The rest is going to be
1352 * truncated in write_end function.
1353 */
1354static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
1355{
1356 /* What matters to us is i_disksize. We don't write i_size anywhere */
1357 if (pos + copied > inode->i_size)
1358 i_size_write(inode, pos + copied);
1359 if (pos + copied > EXT3_I(inode)->i_disksize) {
1360 EXT3_I(inode)->i_disksize = pos + copied;
1361 mark_inode_dirty(inode);
1362 }
1363}
1364
1365/*
1366 * We need to pick up the new inode size which generic_commit_write gave us.
1367 * `file' can be NULL - eg, when called from page_symlink().
1368 *
1369 * ext3 never places buffers on inode->i_mapping->private_list. Metadata
1370 * buffers are managed internally.
1371 */
1372static int ext3_ordered_write_end(struct file *file,
1373 struct address_space *mapping,
1374 loff_t pos, unsigned len, unsigned copied,
1375 struct page *page, void *fsdata)
1376{
1377 handle_t *handle = ext3_journal_current_handle();
1378 struct inode *inode = file->f_mapping->host;
1379 unsigned from, to;
1380 int ret = 0, ret2;
1381
1382 trace_ext3_ordered_write_end(inode, pos, len, copied);
1383 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1384
1385 from = pos & (PAGE_CACHE_SIZE - 1);
1386 to = from + copied;
1387 ret = walk_page_buffers(handle, page_buffers(page),
1388 from, to, NULL, journal_dirty_data_fn);
1389
1390 if (ret == 0)
1391 update_file_sizes(inode, pos, copied);
1392 /*
1393 * There may be allocated blocks outside of i_size because
1394 * we failed to copy some data. Prepare for truncate.
1395 */
1396 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1397 ext3_orphan_add(handle, inode);
1398 ret2 = ext3_journal_stop(handle);
1399 if (!ret)
1400 ret = ret2;
1401 unlock_page(page);
1402 page_cache_release(page);
1403
1404 if (pos + len > inode->i_size)
1405 ext3_truncate_failed_write(inode);
1406 return ret ? ret : copied;
1407}
1408
1409static int ext3_writeback_write_end(struct file *file,
1410 struct address_space *mapping,
1411 loff_t pos, unsigned len, unsigned copied,
1412 struct page *page, void *fsdata)
1413{
1414 handle_t *handle = ext3_journal_current_handle();
1415 struct inode *inode = file->f_mapping->host;
1416 int ret;
1417
1418 trace_ext3_writeback_write_end(inode, pos, len, copied);
1419 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1420 update_file_sizes(inode, pos, copied);
1421 /*
1422 * There may be allocated blocks outside of i_size because
1423 * we failed to copy some data. Prepare for truncate.
1424 */
1425 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1426 ext3_orphan_add(handle, inode);
1427 ret = ext3_journal_stop(handle);
1428 unlock_page(page);
1429 page_cache_release(page);
1430
1431 if (pos + len > inode->i_size)
1432 ext3_truncate_failed_write(inode);
1433 return ret ? ret : copied;
1434}
1435
1436static int ext3_journalled_write_end(struct file *file,
1437 struct address_space *mapping,
1438 loff_t pos, unsigned len, unsigned copied,
1439 struct page *page, void *fsdata)
1440{
1441 handle_t *handle = ext3_journal_current_handle();
1442 struct inode *inode = mapping->host;
1443 struct ext3_inode_info *ei = EXT3_I(inode);
1444 int ret = 0, ret2;
1445 int partial = 0;
1446 unsigned from, to;
1447
1448 trace_ext3_journalled_write_end(inode, pos, len, copied);
1449 from = pos & (PAGE_CACHE_SIZE - 1);
1450 to = from + len;
1451
1452 if (copied < len) {
1453 if (!PageUptodate(page))
1454 copied = 0;
1455 page_zero_new_buffers(page, from + copied, to);
1456 to = from + copied;
1457 }
1458
1459 ret = walk_page_buffers(handle, page_buffers(page), from,
1460 to, &partial, write_end_fn);
1461 if (!partial)
1462 SetPageUptodate(page);
1463
1464 if (pos + copied > inode->i_size)
1465 i_size_write(inode, pos + copied);
1466 /*
1467 * There may be allocated blocks outside of i_size because
1468 * we failed to copy some data. Prepare for truncate.
1469 */
1470 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1471 ext3_orphan_add(handle, inode);
1472 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1473 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
1474 if (inode->i_size > ei->i_disksize) {
1475 ei->i_disksize = inode->i_size;
1476 ret2 = ext3_mark_inode_dirty(handle, inode);
1477 if (!ret)
1478 ret = ret2;
1479 }
1480
1481 ret2 = ext3_journal_stop(handle);
1482 if (!ret)
1483 ret = ret2;
1484 unlock_page(page);
1485 page_cache_release(page);
1486
1487 if (pos + len > inode->i_size)
1488 ext3_truncate_failed_write(inode);
1489 return ret ? ret : copied;
1490}
1491
1492/*
1493 * bmap() is special. It gets used by applications such as lilo and by
1494 * the swapper to find the on-disk block of a specific piece of data.
1495 *
1496 * Naturally, this is dangerous if the block concerned is still in the
1497 * journal. If somebody makes a swapfile on an ext3 data-journaling
1498 * filesystem and enables swap, then they may get a nasty shock when the
1499 * data getting swapped to that swapfile suddenly gets overwritten by
1500 * the original zeros written out previously to the journal and
1501 * awaiting writeback in the kernel's buffer cache.
1502 *
1503 * So, if we see any bmap calls here on a modified, data-journaled file,
1504 * take extra steps to flush any blocks which might be in the cache.
1505 */
1506static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1507{
1508 struct inode *inode = mapping->host;
1509 journal_t *journal;
1510 int err;
1511
1512 if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) {
1513 /*
1514 * This is a REALLY heavyweight approach, but the use of
1515 * bmap on dirty files is expected to be extremely rare:
1516 * only if we run lilo or swapon on a freshly made file
1517 * do we expect this to happen.
1518 *
1519 * (bmap requires CAP_SYS_RAWIO so this does not
1520 * represent an unprivileged user DOS attack --- we'd be
1521 * in trouble if mortal users could trigger this path at
1522 * will.)
1523 *
1524 * NB. EXT3_STATE_JDATA is not set on files other than
1525 * regular files. If somebody wants to bmap a directory
1526 * or symlink and gets confused because the buffer
1527 * hasn't yet been flushed to disk, they deserve
1528 * everything they get.
1529 */
1530
1531 ext3_clear_inode_state(inode, EXT3_STATE_JDATA);
1532 journal = EXT3_JOURNAL(inode);
1533 journal_lock_updates(journal);
1534 err = journal_flush(journal);
1535 journal_unlock_updates(journal);
1536
1537 if (err)
1538 return 0;
1539 }
1540
1541 return generic_block_bmap(mapping, block, ext3_get_block);
1542}
1543
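/*
 * Editor's sketch (not part of the original file): the userspace view of
 * ->bmap. Tools like lilo resolve a file-relative block to an on-disk block
 * with the FIBMAP ioctl, which lands in ext3_bmap() above. FIBMAP takes a
 * logical block number in *arg and returns the physical block number there;
 * it needs CAP_SYS_RAWIO, as the comment above notes.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIBMAP */

int main(int argc, char **argv)
{
	int block = 0;		/* logical block 0 of the file */
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FIBMAP, &block) < 0) {
		perror("FIBMAP");
		close(fd);
		return 1;
	}
	printf("logical block 0 -> physical block %d\n", block);
	close(fd);
	return 0;
}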
1544static int bget_one(handle_t *handle, struct buffer_head *bh)
1545{
1546 get_bh(bh);
1547 return 0;
1548}
1549
1550static int bput_one(handle_t *handle, struct buffer_head *bh)
1551{
1552 put_bh(bh);
1553 return 0;
1554}
1555
1556static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1557{
1558 return !buffer_mapped(bh);
1559}
1560
1561/*
1562 * Note that whenever we need to map blocks we start a transaction even if
1563 * we're not journalling data. This is to preserve ordering: any hole
1564 * instantiation within __block_write_full_page -> ext3_get_block() should be
1565 * journalled along with the data so we don't crash and then get metadata which
1566 * refers to old data.
1567 *
1568 * In all journalling modes block_write_full_page() will start the I/O.
1569 *
1570 * We don't honour synchronous mounts for writepage(). That would be
1571 * disastrous. Any write() or metadata operation will sync the fs for
1572 * us.
1573 */
1574static int ext3_ordered_writepage(struct page *page,
1575 struct writeback_control *wbc)
1576{
1577 struct inode *inode = page->mapping->host;
1578 struct buffer_head *page_bufs;
1579 handle_t *handle = NULL;
1580 int ret = 0;
1581 int err;
1582
1583 J_ASSERT(PageLocked(page));
1584 /*
1585 * We don't want to warn for emergency remount. The condition is
1586 * ordered to avoid dereferencing inode->i_sb in non-error case to
1587 * avoid slow-downs.
1588 */
1589 WARN_ON_ONCE(IS_RDONLY(inode) &&
1590 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1591
1592 /*
1593 * We give up here if we're reentered, because it might be for a
1594 * different filesystem.
1595 */
1596 if (ext3_journal_current_handle())
1597 goto out_fail;
1598
1599 trace_ext3_ordered_writepage(page);
1600 if (!page_has_buffers(page)) {
1601 create_empty_buffers(page, inode->i_sb->s_blocksize,
1602 (1 << BH_Dirty)|(1 << BH_Uptodate));
1603 page_bufs = page_buffers(page);
1604 } else {
1605 page_bufs = page_buffers(page);
1606 if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
1607 NULL, buffer_unmapped)) {
1608 /* Provide NULL get_block() to catch bugs if buffers
1609 * weren't really mapped */
1610 return block_write_full_page(page, NULL, wbc);
1611 }
1612 }
1613 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1614
1615 if (IS_ERR(handle)) {
1616 ret = PTR_ERR(handle);
1617 goto out_fail;
1618 }
1619
1620 walk_page_buffers(handle, page_bufs, 0,
1621 PAGE_CACHE_SIZE, NULL, bget_one);
1622
1623 ret = block_write_full_page(page, ext3_get_block, wbc);
1624
1625 /*
1626 * The page can become unlocked at any point now, and
1627 * truncate can then come in and change things. So we
1628 * can't touch *page from now on. But *page_bufs is
1629 * safe due to elevated refcount.
1630 */
1631
1632 /*
1633 * And attach them to the current transaction. But only if
1634 * block_write_full_page() succeeded. Otherwise they are unmapped,
1635 * and generally junk.
1636 */
1637 if (ret == 0)
1638 ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1639 NULL, journal_dirty_data_fn);
1640 walk_page_buffers(handle, page_bufs, 0,
1641 PAGE_CACHE_SIZE, NULL, bput_one);
1642 err = ext3_journal_stop(handle);
1643 if (!ret)
1644 ret = err;
1645 return ret;
1646
1647out_fail:
1648 redirty_page_for_writepage(wbc, page);
1649 unlock_page(page);
1650 return ret;
1651}
1652
1653static int ext3_writeback_writepage(struct page *page,
1654 struct writeback_control *wbc)
1655{
1656 struct inode *inode = page->mapping->host;
1657 handle_t *handle = NULL;
1658 int ret = 0;
1659 int err;
1660
1661 J_ASSERT(PageLocked(page));
1662 /*
1663 * We don't want to warn for emergency remount. The condition is
1664 * ordered to avoid dereferencing inode->i_sb in non-error case to
1665 * avoid slow-downs.
1666 */
1667 WARN_ON_ONCE(IS_RDONLY(inode) &&
1668 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1669
1670 if (ext3_journal_current_handle())
1671 goto out_fail;
1672
1673 trace_ext3_writeback_writepage(page);
1674 if (page_has_buffers(page)) {
1675 if (!walk_page_buffers(NULL, page_buffers(page), 0,
1676 PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
1677 /* Provide NULL get_block() to catch bugs if buffers
1678 * weren't really mapped */
1679 return block_write_full_page(page, NULL, wbc);
1680 }
1681 }
1682
1683 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1684 if (IS_ERR(handle)) {
1685 ret = PTR_ERR(handle);
1686 goto out_fail;
1687 }
1688
1689 ret = block_write_full_page(page, ext3_get_block, wbc);
1690
1691 err = ext3_journal_stop(handle);
1692 if (!ret)
1693 ret = err;
1694 return ret;
1695
1696out_fail:
1697 redirty_page_for_writepage(wbc, page);
1698 unlock_page(page);
1699 return ret;
1700}
1701
1702static int ext3_journalled_writepage(struct page *page,
1703 struct writeback_control *wbc)
1704{
1705 struct inode *inode = page->mapping->host;
1706 handle_t *handle = NULL;
1707 int ret = 0;
1708 int err;
1709
1710 J_ASSERT(PageLocked(page));
1711 /*
1712 * We don't want to warn for emergency remount. The condition is
1713 * ordered to avoid dereferencing inode->i_sb in non-error case to
1714 * avoid slow-downs.
1715 */
1716 WARN_ON_ONCE(IS_RDONLY(inode) &&
1717 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1718
1719 trace_ext3_journalled_writepage(page);
1720 if (!page_has_buffers(page) || PageChecked(page)) {
1721 if (ext3_journal_current_handle())
1722 goto no_write;
1723
1724 handle = ext3_journal_start(inode,
1725 ext3_writepage_trans_blocks(inode));
1726 if (IS_ERR(handle)) {
1727 ret = PTR_ERR(handle);
1728 goto no_write;
1729 }
1730 /*
1731 * It's mmapped pagecache. Add buffers and journal it. There
1732 * doesn't seem much point in redirtying the page here.
1733 */
1734 ClearPageChecked(page);
1735 ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
1736 ext3_get_block);
1737 if (ret != 0) {
1738 ext3_journal_stop(handle);
1739 goto out_unlock;
1740 }
1741 ret = walk_page_buffers(handle, page_buffers(page), 0,
1742 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1743
1744 err = walk_page_buffers(handle, page_buffers(page), 0,
1745 PAGE_CACHE_SIZE, NULL, write_end_fn);
1746 if (ret == 0)
1747 ret = err;
1748 ext3_set_inode_state(inode, EXT3_STATE_JDATA);
1749 atomic_set(&EXT3_I(inode)->i_datasync_tid,
1750 handle->h_transaction->t_tid);
1751 unlock_page(page);
1752 err = ext3_journal_stop(handle);
1753 if (!ret)
1754 ret = err;
1755 } else {
1756 /*
1757 * It is a page full of checkpoint-mode buffers. Go and write
1758 * them. They should have been already mapped when they went
1759 * to the journal so provide NULL get_block function to catch
1760 * errors.
1761 */
1762 ret = block_write_full_page(page, NULL, wbc);
1763 }
1764out:
1765 return ret;
1766
1767no_write:
1768 redirty_page_for_writepage(wbc, page);
1769out_unlock:
1770 unlock_page(page);
1771 goto out;
1772}
1773
1774static int ext3_readpage(struct file *file, struct page *page)
1775{
1776 trace_ext3_readpage(page);
1777 return mpage_readpage(page, ext3_get_block);
1778}
1779
1780static int
1781ext3_readpages(struct file *file, struct address_space *mapping,
1782 struct list_head *pages, unsigned nr_pages)
1783{
1784 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1785}
1786
1787static void ext3_invalidatepage(struct page *page, unsigned int offset,
1788 unsigned int length)
1789{
1790 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1791
1792 trace_ext3_invalidatepage(page, offset, length);
1793
1794 /*
1795 * If it's a full truncate we just forget about the pending dirtying
1796 */
1797 if (offset == 0 && length == PAGE_CACHE_SIZE)
1798 ClearPageChecked(page);
1799
1800 journal_invalidatepage(journal, page, offset, length);
1801}
1802
1803static int ext3_releasepage(struct page *page, gfp_t wait)
1804{
1805 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1806
1807 trace_ext3_releasepage(page);
1808 WARN_ON(PageChecked(page));
1809 if (!page_has_buffers(page))
1810 return 0;
1811 return journal_try_to_free_buffers(journal, page, wait);
1812}
1813
1814/*
1815 * If the O_DIRECT write will extend the file then add this inode to the
1816 * orphan list, so recovery will truncate it back to the original size
1817 * if the machine crashes during the write.
1818 *
1819 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1820 * crashes then stale disk data _may_ be exposed inside the file. But the
1821 * current VFS code falls back to the buffered path in that case, so we are safe.
1822 */
1823static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
1824 loff_t offset)
1825{
1826 struct file *file = iocb->ki_filp;
1827 struct inode *inode = file->f_mapping->host;
1828 struct ext3_inode_info *ei = EXT3_I(inode);
1829 handle_t *handle;
1830 ssize_t ret;
1831 int orphan = 0;
1832 size_t count = iov_iter_count(iter);
1833 int retries = 0;
1834
1835 trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
1836
1837 if (iov_iter_rw(iter) == WRITE) {
1838 loff_t final_size = offset + count;
1839
1840 if (final_size > inode->i_size) {
1841 /* Credits for sb + inode write */
1842 handle = ext3_journal_start(inode, 2);
1843 if (IS_ERR(handle)) {
1844 ret = PTR_ERR(handle);
1845 goto out;
1846 }
1847 ret = ext3_orphan_add(handle, inode);
1848 if (ret) {
1849 ext3_journal_stop(handle);
1850 goto out;
1851 }
1852 orphan = 1;
1853 ei->i_disksize = inode->i_size;
1854 ext3_journal_stop(handle);
1855 }
1856 }
1857
1858retry:
1859 ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block);
1860 /*
1861 * In case of error extending write may have instantiated a few
1862 * blocks outside i_size. Trim these off again.
1863 */
1864 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
1865 loff_t isize = i_size_read(inode);
1866 loff_t end = offset + count;
1867
1868 if (end > isize)
1869 ext3_truncate_failed_direct_write(inode);
1870 }
1871 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1872 goto retry;
1873
1874 if (orphan) {
1875 int err;
1876
1877 /* Credits for sb + inode write */
1878 handle = ext3_journal_start(inode, 2);
1879 if (IS_ERR(handle)) {
1880 /* This is really bad luck. We've written the data
1881 * but cannot extend i_size. Truncate allocated blocks
1882 * and pretend the write failed... */
1883 ext3_truncate_failed_direct_write(inode);
1884 ret = PTR_ERR(handle);
1885 if (inode->i_nlink)
1886 ext3_orphan_del(NULL, inode);
1887 goto out;
1888 }
1889 if (inode->i_nlink)
1890 ext3_orphan_del(handle, inode);
1891 if (ret > 0) {
1892 loff_t end = offset + ret;
1893 if (end > inode->i_size) {
1894 ei->i_disksize = end;
1895 i_size_write(inode, end);
1896 /*
1897 * We're going to return a positive `ret'
1898 * here due to non-zero-length I/O, so there's
1899 * no way of reporting error returns from
1900 * ext3_mark_inode_dirty() to userspace. So
1901 * ignore it.
1902 */
1903 ext3_mark_inode_dirty(handle, inode);
1904 }
1905 }
1906 err = ext3_journal_stop(handle);
1907 if (ret == 0)
1908 ret = err;
1909 }
1910out:
1911 trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
1912 return ret;
1913}
1914
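/*
 * Editor's sketch (not part of the original file): the kind of userspace
 * I/O that reaches ext3_direct_IO(). O_DIRECT requires the buffer, offset
 * and length to be suitably aligned (typically to the logical block size);
 * posix_memalign() provides the aligned buffer. The 4096-byte alignment
 * here is an assumption, not something the ext3 code above mandates.
 */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int direct_write_example(const char *path)
{
	void *buf;
	int fd, ret = -1;

	if (posix_memalign(&buf, 4096, 4096))
		return -1;
	memset(buf, 'x', 4096);

	fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd >= 0) {
		/* One aligned block; an extending write like this is what
		 * makes ext3_direct_IO() add the inode to the orphan list. */
		ret = (pwrite(fd, buf, 4096, 0) == 4096) ? 0 : -1;
		close(fd);
	}
	free(buf);
	return ret;
}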
1915/*
1916 * Pages can be marked dirty completely asynchronously from ext3's journalling
1917 * activity, by filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1918 * much here because ->set_page_dirty is called under VFS locks. The page is
1919 * not necessarily locked.
1920 *
1921 * We cannot just dirty the page and leave attached buffers clean, because the
1922 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1923 * or jbddirty because all the journalling code will explode.
1924 *
1925 * So what we do is to mark the page "pending dirty" and next time writepage
1926 * is called, propagate that into the buffers appropriately.
1927 */
1928static int ext3_journalled_set_page_dirty(struct page *page)
1929{
1930 SetPageChecked(page);
1931 return __set_page_dirty_nobuffers(page);
1932}
1933
1934static const struct address_space_operations ext3_ordered_aops = {
1935 .readpage = ext3_readpage,
1936 .readpages = ext3_readpages,
1937 .writepage = ext3_ordered_writepage,
1938 .write_begin = ext3_write_begin,
1939 .write_end = ext3_ordered_write_end,
1940 .bmap = ext3_bmap,
1941 .invalidatepage = ext3_invalidatepage,
1942 .releasepage = ext3_releasepage,
1943 .direct_IO = ext3_direct_IO,
1944 .migratepage = buffer_migrate_page,
1945 .is_partially_uptodate = block_is_partially_uptodate,
1946 .is_dirty_writeback = buffer_check_dirty_writeback,
1947 .error_remove_page = generic_error_remove_page,
1948};
1949
1950static const struct address_space_operations ext3_writeback_aops = {
1951 .readpage = ext3_readpage,
1952 .readpages = ext3_readpages,
1953 .writepage = ext3_writeback_writepage,
1954 .write_begin = ext3_write_begin,
1955 .write_end = ext3_writeback_write_end,
1956 .bmap = ext3_bmap,
1957 .invalidatepage = ext3_invalidatepage,
1958 .releasepage = ext3_releasepage,
1959 .direct_IO = ext3_direct_IO,
1960 .migratepage = buffer_migrate_page,
1961 .is_partially_uptodate = block_is_partially_uptodate,
1962 .error_remove_page = generic_error_remove_page,
1963};
1964
1965static const struct address_space_operations ext3_journalled_aops = {
1966 .readpage = ext3_readpage,
1967 .readpages = ext3_readpages,
1968 .writepage = ext3_journalled_writepage,
1969 .write_begin = ext3_write_begin,
1970 .write_end = ext3_journalled_write_end,
1971 .set_page_dirty = ext3_journalled_set_page_dirty,
1972 .bmap = ext3_bmap,
1973 .invalidatepage = ext3_invalidatepage,
1974 .releasepage = ext3_releasepage,
1975 .is_partially_uptodate = block_is_partially_uptodate,
1976 .error_remove_page = generic_error_remove_page,
1977};
1978
1979void ext3_set_aops(struct inode *inode)
1980{
1981 if (ext3_should_order_data(inode))
1982 inode->i_mapping->a_ops = &ext3_ordered_aops;
1983 else if (ext3_should_writeback_data(inode))
1984 inode->i_mapping->a_ops = &ext3_writeback_aops;
1985 else
1986 inode->i_mapping->a_ops = &ext3_journalled_aops;
1987}
1988
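/*
 * Editor's sketch (not part of the original file): the aops choice made by
 * ext3_set_aops() above is driven by the data= mount option. A minimal
 * mount(2) call selecting journalled data mode might look like this; the
 * device and mountpoint paths are placeholders.
 */
#include <sys/mount.h>

int mount_ext3_journalled(void)
{
	/* data=journal routes writes through ext3_journalled_aops */
	return mount("/dev/sdb1", "/mnt", "ext3", 0, "data=journal");
}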
1989/*
1990 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1991 * up to the end of the block which corresponds to `from'.
1992 * This is required during truncate. We need to physically zero the tail end
1993 * of that block so it doesn't yield old data if the file is later grown.
1994 */
1995static int ext3_block_truncate_page(struct inode *inode, loff_t from)
1996{
1997 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1998 unsigned offset = from & (PAGE_CACHE_SIZE - 1);
1999 unsigned blocksize, iblock, length, pos;
2000 struct page *page;
2001 handle_t *handle = NULL;
2002 struct buffer_head *bh;
2003 int err = 0;
2004
2005 /* Truncated on block boundary - nothing to do */
2006 blocksize = inode->i_sb->s_blocksize;
2007 if ((from & (blocksize - 1)) == 0)
2008 return 0;
2009
2010 page = grab_cache_page(inode->i_mapping, index);
2011 if (!page)
2012 return -ENOMEM;
2013 length = blocksize - (offset & (blocksize - 1));
2014 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
2015
2016 if (!page_has_buffers(page))
2017 create_empty_buffers(page, blocksize, 0);
2018
2019 /* Find the buffer that contains "offset" */
2020 bh = page_buffers(page);
2021 pos = blocksize;
2022 while (offset >= pos) {
2023 bh = bh->b_this_page;
2024 iblock++;
2025 pos += blocksize;
2026 }
2027
2028 err = 0;
2029 if (buffer_freed(bh)) {
2030 BUFFER_TRACE(bh, "freed: skip");
2031 goto unlock;
2032 }
2033
2034 if (!buffer_mapped(bh)) {
2035 BUFFER_TRACE(bh, "unmapped");
2036 ext3_get_block(inode, iblock, bh, 0);
2037 /* unmapped? It's a hole - nothing to do */
2038 if (!buffer_mapped(bh)) {
2039 BUFFER_TRACE(bh, "still unmapped");
2040 goto unlock;
2041 }
2042 }
2043
2044 /* Ok, it's mapped. Make sure it's up-to-date */
2045 if (PageUptodate(page))
2046 set_buffer_uptodate(bh);
2047
2048 if (!bh_uptodate_or_lock(bh)) {
2049 err = bh_submit_read(bh);
2050 /* Uhhuh. Read error. Complain and punt. */
2051 if (err)
2052 goto unlock;
2053 }
2054
2055 /* data=writeback mode doesn't need transaction to zero-out data */
2056 if (!ext3_should_writeback_data(inode)) {
2057 /* We journal at most one block */
2058 handle = ext3_journal_start(inode, 1);
2059 if (IS_ERR(handle)) {
2060 clear_highpage(page);
2061 flush_dcache_page(page);
2062 err = PTR_ERR(handle);
2063 goto unlock;
2064 }
2065 }
2066
2067 if (ext3_should_journal_data(inode)) {
2068 BUFFER_TRACE(bh, "get write access");
2069 err = ext3_journal_get_write_access(handle, bh);
2070 if (err)
2071 goto stop;
2072 }
2073
2074 zero_user(page, offset, length);
2075 BUFFER_TRACE(bh, "zeroed end of block");
2076
2077 err = 0;
2078 if (ext3_should_journal_data(inode)) {
2079 err = ext3_journal_dirty_metadata(handle, bh);
2080 } else {
2081 if (ext3_should_order_data(inode))
2082 err = ext3_journal_dirty_data(handle, bh);
2083 mark_buffer_dirty(bh);
2084 }
2085stop:
2086 if (handle)
2087 ext3_journal_stop(handle);
2088
2089unlock:
2090 unlock_page(page);
2091 page_cache_release(page);
2092 return err;
2093}
2094
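/*
 * Editor's sketch (not part of the original file): the tail-zeroing range
 * computed by ext3_block_truncate_page() above. Given a truncation point
 * `from' and a block size, only the bytes from `from' to the end of its
 * block need zeroing; a block-aligned `from' needs no work at all.
 */
#include <stdio.h>

int main(void)
{
	unsigned blocksize = 1024;
	unsigned long long from = 5000;	/* new i_size */
	unsigned offset_in_block = from & (blocksize - 1);

	if (offset_in_block == 0) {
		printf("block aligned, nothing to zero\n");
	} else {
		unsigned length = blocksize - offset_in_block;
		/* from=5000, bs=1024: zero 120 bytes at offset 904 of block 4 */
		printf("zero %u bytes starting at file offset %llu\n",
		       length, from);
	}
	return 0;
}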
2095/*
2096 * Probably it should be a library function... search for first non-zero word
2097 * or memcmp with zero_page, whatever is better for particular architecture.
2098 * Linus?
2099 */
2100static inline int all_zeroes(__le32 *p, __le32 *q)
2101{
2102 while (p < q)
2103 if (*p++)
2104 return 0;
2105 return 1;
2106}
2107
2108/**
2109 * ext3_find_shared - find the indirect blocks for partial truncation.
2110 * @inode: inode in question
2111 * @depth: depth of the affected branch
2112 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
2113 * @chain: place to store the pointers to partial indirect blocks
2114 * @top: place to the (detached) top of branch
2115 *
2116 * This is a helper function used by ext3_truncate().
2117 *
2118 * When we do truncate() we may have to clean the ends of several
2119 * indirect blocks but leave the blocks themselves alive. A block is
2120 * partially truncated if some data below the new i_size is still
2121 * referenced from it (and it is on the path to the first completely
2122 * truncated data block). We have to free the top of that path along
2123 * with everything to the right of the path. Since no allocation
2124 * past the truncation point is possible until ext3_truncate()
2125 * finishes, we may safely do the latter, but top of branch may
2126 * require special attention - pageout below the truncation point
2127 * might try to populate it.
2128 *
2129 * We atomically detach the top of branch from the tree, store the
2130 * block number of its root in *@top, pointers to buffer_heads of
2131 * partially truncated blocks - in @chain[].bh and pointers to
2132 * their last elements that should not be removed - in
2133 * @chain[].p. Return value is the pointer to last filled element
2134 * of @chain.
2135 *
2136 * The work left to the caller is the actual freeing of subtrees:
2137 * a) free the subtree starting from *@top
2138 * b) free the subtrees whose roots are stored in
2139 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
2140 * c) free the subtrees growing from the inode past the @chain[0].
2141 * (no partially truncated stuff there). */
2142
2143static Indirect *ext3_find_shared(struct inode *inode, int depth,
2144 int offsets[4], Indirect chain[4], __le32 *top)
2145{
2146 Indirect *partial, *p;
2147 int k, err;
2148
2149 *top = 0;
2150 /* Make k index the deepest non-null offset + 1 */
2151 for (k = depth; k > 1 && !offsets[k-1]; k--)
2152 ;
2153 partial = ext3_get_branch(inode, k, offsets, chain, &err);
2154 /* Writer: pointers */
2155 if (!partial)
2156 partial = chain + k-1;
2157 /*
2158 * If the branch acquired continuation since we've looked at it -
2159 * fine, it should all survive and (new) top doesn't belong to us.
2160 */
2161 if (!partial->key && *partial->p)
2162 /* Writer: end */
2163 goto no_top;
2164 for (p = partial; p > chain && all_zeroes((__le32 *)p->bh->b_data, p->p); p--)
2165 ;
2166 /*
2167 * OK, we've found the last block that must survive. The rest of our
2168 * branch should be detached before unlocking. However, if that rest
2169 * of branch is all ours and does not grow immediately from the inode
2170 * it's easier to cheat and just decrement partial->p.
2171 */
2172 if (p == chain + k - 1 && p > chain) {
2173 p->p--;
2174 } else {
2175 *top = *p->p;
2176 /* Nope, don't do this in ext3. Must leave the tree intact */
2177#if 0
2178 *p->p = 0;
2179#endif
2180 }
2181 /* Writer: end */
2182
2183 while (partial > p) {
2184 brelse(partial->bh);
2185 partial--;
2186 }
2187no_top:
2188 return partial;
2189}
2190
2191/*
2192 * Zero a number of block pointers in either an inode or an indirect block.
2193 * If we restart the transaction we must again get write access to the
2194 * indirect block for further modification.
2195 *
2196 * We release `count' blocks on disk, but (last - first) may be greater
2197 * than `count' because there can be holes in there.
2198 */
2199static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2200 struct buffer_head *bh, ext3_fsblk_t block_to_free,
2201 unsigned long count, __le32 *first, __le32 *last)
2202{
2203 __le32 *p;
2204 if (try_to_extend_transaction(handle, inode)) {
2205 if (bh) {
2206 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2207 if (ext3_journal_dirty_metadata(handle, bh))
2208 return;
2209 }
2210 ext3_mark_inode_dirty(handle, inode);
2211 truncate_restart_transaction(handle, inode);
2212 if (bh) {
2213 BUFFER_TRACE(bh, "retaking write access");
2214 if (ext3_journal_get_write_access(handle, bh))
2215 return;
2216 }
2217 }
2218
2219 /*
2220 * Any buffers which are on the journal will be in memory. We find
2221 * them on the hash table so journal_revoke() will run journal_forget()
2222 * on them. We've already detached each block from the file, so
2223 * bforget() in journal_forget() should be safe.
2224 *
2225 * AKPM: turn on bforget in journal_forget()!!!
2226 */
2227 for (p = first; p < last; p++) {
2228 u32 nr = le32_to_cpu(*p);
2229 if (nr) {
2230 struct buffer_head *bh;
2231
2232 *p = 0;
2233 bh = sb_find_get_block(inode->i_sb, nr);
2234 ext3_forget(handle, 0, inode, bh, nr);
2235 }
2236 }
2237
2238 ext3_free_blocks(handle, inode, block_to_free, count);
2239}
2240
2241/**
2242 * ext3_free_data - free a list of data blocks
2243 * @handle: handle for this transaction
2244 * @inode: inode we are dealing with
2245 * @this_bh: indirect buffer_head which contains *@first and *@last
2246 * @first: array of block numbers
2247 * @last: points immediately past the end of array
2248 *
2249 * We are freeing all blocks referred from that array (numbers are stored as
2250 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2251 *
2252 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2253 * blocks are contiguous then releasing them at one time will only affect one
2254 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2255 * actually use a lot of journal space.
2256 *
2257 * @this_bh will be %NULL if @first and @last point into the inode's direct
2258 * block pointers.
2259 */
2260static void ext3_free_data(handle_t *handle, struct inode *inode,
2261 struct buffer_head *this_bh,
2262 __le32 *first, __le32 *last)
2263{
2264 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
2265 unsigned long count = 0; /* Number of blocks in the run */
2266 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2267 corresponding to
2268 block_to_free */
2269 ext3_fsblk_t nr; /* Current block # */
2270 __le32 *p; /* Pointer into inode/ind
2271 for current block */
2272 int err;
2273
2274 if (this_bh) { /* For indirect block */
2275 BUFFER_TRACE(this_bh, "get_write_access");
2276 err = ext3_journal_get_write_access(handle, this_bh);
2277 /* Important: if we can't update the indirect pointers
2278 * to the blocks, we can't free them. */
2279 if (err)
2280 return;
2281 }
2282
2283 for (p = first; p < last; p++) {
2284 nr = le32_to_cpu(*p);
2285 if (nr) {
2286 /* accumulate blocks to free if they're contiguous */
2287 if (count == 0) {
2288 block_to_free = nr;
2289 block_to_free_p = p;
2290 count = 1;
2291 } else if (nr == block_to_free + count) {
2292 count++;
2293 } else {
2294 ext3_clear_blocks(handle, inode, this_bh,
2295 block_to_free,
2296 count, block_to_free_p, p);
2297 block_to_free = nr;
2298 block_to_free_p = p;
2299 count = 1;
2300 }
2301 }
2302 }
2303
2304 if (count > 0)
2305 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
2306 count, block_to_free_p, p);
2307
2308 if (this_bh) {
2309 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
2310
2311 /*
2312 * The buffer head should have an attached journal head at this
2313 * point. However, if the data is corrupted and an indirect
2314 * block pointed to itself, it would have been detached when
2315 * the block was cleared. Check for this instead of OOPSing.
2316 */
2317 if (bh2jh(this_bh))
2318 ext3_journal_dirty_metadata(handle, this_bh);
2319 else
2320 ext3_error(inode->i_sb, "ext3_free_data",
2321 "circular indirect block detected, "
2322 "inode=%lu, block=%llu",
2323 inode->i_ino,
2324 (unsigned long long)this_bh->b_blocknr);
2325 }
2326}
2327
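/*
 * Editor's sketch (not part of the original file): the run-accumulation
 * pattern used by ext3_free_data() above, reduced to plain arrays. Each
 * maximal run of consecutive block numbers triggers one "free" call, which
 * is why contiguous files touch few bitmap blocks per transaction.
 */
#include <stdio.h>

static void free_run(unsigned start, unsigned long count)
{
	printf("free %lu block(s) starting at %u\n", count, start);
}

static void free_blocks(const unsigned *p, const unsigned *last)
{
	unsigned start = 0;
	unsigned long count = 0;

	for (; p < last; p++) {
		if (!*p)
			continue;	/* hole: 0 means "no block" */
		if (count && *p == start + count) {
			count++;	/* extends the current run */
		} else {
			if (count)
				free_run(start, count);
			start = *p;	/* begin a new run */
			count = 1;
		}
	}
	if (count)
		free_run(start, count);
}

int main(void)
{
	unsigned blocks[] = { 100, 101, 102, 0, 200, 201 };

	free_blocks(blocks, blocks + 6);	/* frees 100..102, 200..201 */
	return 0;
}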
2328/**
2329 * ext3_free_branches - free an array of branches
2330 * @handle: JBD handle for this transaction
2331 * @inode: inode we are dealing with
2332 * @parent_bh: the buffer_head which contains *@first and *@last
2333 * @first: array of block numbers
2334 * @last: pointer immediately past the end of array
2335 * @depth: depth of the branches to free
2336 *
2337 * We are freeing all blocks referred from these branches (numbers are
2338 * stored as little-endian 32-bit) and updating @inode->i_blocks
2339 * appropriately.
2340 */
2341static void ext3_free_branches(handle_t *handle, struct inode *inode,
2342 struct buffer_head *parent_bh,
2343 __le32 *first, __le32 *last, int depth)
2344{
2345 ext3_fsblk_t nr;
2346 __le32 *p;
2347
2348 if (is_handle_aborted(handle))
2349 return;
2350
2351 if (depth--) {
2352 struct buffer_head *bh;
2353 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2354 p = last;
2355 while (--p >= first) {
2356 nr = le32_to_cpu(*p);
2357 if (!nr)
2358 continue; /* A hole */
2359
2360 /* Go read the buffer for the next level down */
2361 bh = sb_bread(inode->i_sb, nr);
2362
2363 /*
2364 * A read failure? Report error and clear slot
2365 * (should be rare).
2366 */
2367 if (!bh) {
2368 ext3_error(inode->i_sb, "ext3_free_branches",
2369 "Read failure, inode=%lu, block="E3FSBLK,
2370 inode->i_ino, nr);
2371 continue;
2372 }
2373
2374 /* This zaps the entire block. Bottom up. */
2375 BUFFER_TRACE(bh, "free child branches");
2376 ext3_free_branches(handle, inode, bh,
2377 (__le32*)bh->b_data,
2378 (__le32*)bh->b_data + addr_per_block,
2379 depth);
2380
2381 /*
2382 * Everything below this pointer has been
2383 * released. Now let this top-of-subtree go.
2384 *
2385 * We want the freeing of this indirect block to be
2386 * atomic in the journal with the updating of the
2387 * bitmap block which owns it. So make some room in
2388 * the journal.
2389 *
2390 * We zero the parent pointer *after* freeing its
2391 * pointee in the bitmaps, so if extend_transaction()
2392 * for some reason fails to put the bitmap changes and
2393 * the release into the same transaction, recovery
2394 * will merely complain about releasing a free block,
2395 * rather than leaking blocks.
2396 */
2397 if (is_handle_aborted(handle))
2398 return;
2399 if (try_to_extend_transaction(handle, inode)) {
2400 ext3_mark_inode_dirty(handle, inode);
2401 truncate_restart_transaction(handle, inode);
2402 }
2403
2404 /*
2405 * We've probably journalled the indirect block several
2406 * times during the truncate. But it's no longer
2407 * needed and we now drop it from the transaction via
2408 * journal_revoke().
2409 *
2410 * That's easy if it's exclusively part of this
2411 * transaction. But if it's part of the committing
2412 * transaction then journal_forget() will simply
2413 * brelse() it. That means that if the underlying
2414 * block is reallocated in ext3_get_block(),
2415 * unmap_underlying_metadata() will find this block
2416 * and will try to get rid of it. damn, damn. Thus
2417 * we don't allow a block to be reallocated until
2418 * a transaction freeing it has fully committed.
2419 *
2420 * We also have to make sure journal replay after a
2421 * crash does not overwrite non-journaled data blocks
2422 * with old metadata when the block got reallocated for
2423 * data. Thus we have to store a revoke record for a
2424 * block in the same transaction in which we free the
2425 * block.
2426 */
2427 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2428
2429 ext3_free_blocks(handle, inode, nr, 1);
2430
2431 if (parent_bh) {
2432 /*
2433 * The block which we have just freed is
2434 * pointed to by an indirect block: journal it
2435 */
2436 BUFFER_TRACE(parent_bh, "get_write_access");
2437 if (!ext3_journal_get_write_access(handle,
2438 parent_bh)){
2439 *p = 0;
2440 BUFFER_TRACE(parent_bh,
2441 "call ext3_journal_dirty_metadata");
2442 ext3_journal_dirty_metadata(handle,
2443 parent_bh);
2444 }
2445 }
2446 }
2447 } else {
2448 /* We have reached the bottom of the tree. */
2449 BUFFER_TRACE(parent_bh, "free data blocks");
2450 ext3_free_data(handle, inode, parent_bh, first, last);
2451 }
2452}
2453
2454int ext3_can_truncate(struct inode *inode)
2455{
2456 if (S_ISREG(inode->i_mode))
2457 return 1;
2458 if (S_ISDIR(inode->i_mode))
2459 return 1;
2460 if (S_ISLNK(inode->i_mode))
2461 return !ext3_inode_is_fast_symlink(inode);
2462 return 0;
2463}
2464
2465/*
2466 * ext3_truncate()
2467 *
2468 * We block out ext3_get_block() block instantiations across the entire
2469 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2470 * simultaneously on behalf of the same inode.
2471 *
2472 * As we work through the truncate and commit bits of it to the journal there
2473 * is one core, guiding principle: the file's tree must always be consistent on
2474 * disk. We must be able to restart the truncate after a crash.
2475 *
2476 * The file's tree may be transiently inconsistent in memory (although it
2477 * probably isn't), but whenever we close off and commit a journal transaction,
2478 * the contents of (the filesystem + the journal) must be consistent and
2479 * restartable. It's pretty simple, really: bottom up, right to left (although
2480 * left-to-right works OK too).
2481 *
2482 * Note that at recovery time, journal replay occurs *before* the restart of
2483 * truncate against the orphan inode list.
2484 *
2485 * The committed inode has the new, desired i_size (which is the same as
2486 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2487 * that this inode's truncate did not complete and it will again call
2488 * ext3_truncate() to have another go. So there will be instantiated blocks
2489 * to the right of the truncation point in a crashed ext3 filesystem. But
2490 * that's fine - as long as they are linked from the inode, the post-crash
2491 * ext3_truncate() run will find them and release them.
2492 */
2493void ext3_truncate(struct inode *inode)
2494{
2495 handle_t *handle;
2496 struct ext3_inode_info *ei = EXT3_I(inode);
2497 __le32 *i_data = ei->i_data;
2498 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2499 int offsets[4];
2500 Indirect chain[4];
2501 Indirect *partial;
2502 __le32 nr = 0;
2503 int n;
2504 long last_block;
2505 unsigned blocksize = inode->i_sb->s_blocksize;
2506
2507 trace_ext3_truncate_enter(inode);
2508
2509 if (!ext3_can_truncate(inode))
2510 goto out_notrans;
2511
2512 if (inode->i_size == 0 && ext3_should_writeback_data(inode))
2513 ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
2514
2515 handle = start_transaction(inode);
2516 if (IS_ERR(handle))
2517 goto out_notrans;
2518
2519 last_block = (inode->i_size + blocksize-1)
2520 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2521 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2522 if (n == 0)
2523 goto out_stop; /* error */
2524
2525 /*
2526 * OK. This truncate is going to happen. We add the inode to the
2527 * orphan list, so that if this truncate spans multiple transactions,
2528 * and we crash, we will resume the truncate when the filesystem
2529 * recovers. It also marks the inode dirty, to catch the new size.
2530 *
2531 * Implication: the file must always be in a sane, consistent
2532 * truncatable state while each transaction commits.
2533 */
2534 if (ext3_orphan_add(handle, inode))
2535 goto out_stop;
2536
2537 /*
2538 * The orphan list entry will now protect us from any crash which
2539 * occurs before the truncate completes, so it is now safe to propagate
2540 * the new, shorter inode size (held for now in i_size) into the
2541 * on-disk inode. We do this via i_disksize, which is the value which
2542 * ext3 *really* writes onto the disk inode.
2543 */
2544 ei->i_disksize = inode->i_size;
2545
2546 /*
2547 * From here we block out all ext3_get_block() callers who want to
2548 * modify the block allocation tree.
2549 */
2550 mutex_lock(&ei->truncate_mutex);
2551
2552 if (n == 1) { /* direct blocks */
2553 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2554 i_data + EXT3_NDIR_BLOCKS);
2555 goto do_indirects;
2556 }
2557
2558 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2559 /* Kill the top of shared branch (not detached) */
2560 if (nr) {
2561 if (partial == chain) {
2562 /* Shared branch grows from the inode */
2563 ext3_free_branches(handle, inode, NULL,
2564 &nr, &nr+1, (chain+n-1) - partial);
2565 *partial->p = 0;
2566 /*
2567 * We mark the inode dirty prior to restart,
2568 * and prior to stop. No need for it here.
2569 */
2570 } else {
2571 /* Shared branch grows from an indirect block */
2572 ext3_free_branches(handle, inode, partial->bh,
2573 partial->p,
2574 partial->p+1, (chain+n-1) - partial);
2575 }
2576 }
2577 /* Clear the ends of indirect blocks on the shared branch */
2578 while (partial > chain) {
2579 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2580 (__le32*)partial->bh->b_data+addr_per_block,
2581 (chain+n-1) - partial);
2582 BUFFER_TRACE(partial->bh, "call brelse");
2583 brelse(partial->bh);
2584 partial--;
2585 }
2586do_indirects:
2587 /* Kill the remaining (whole) subtrees */
2588 switch (offsets[0]) {
2589 default:
2590 nr = i_data[EXT3_IND_BLOCK];
2591 if (nr) {
2592 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2593 i_data[EXT3_IND_BLOCK] = 0;
2594 }
2595 case EXT3_IND_BLOCK:
2596 nr = i_data[EXT3_DIND_BLOCK];
2597 if (nr) {
2598 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2599 i_data[EXT3_DIND_BLOCK] = 0;
2600 }
2601 case EXT3_DIND_BLOCK:
2602 nr = i_data[EXT3_TIND_BLOCK];
2603 if (nr) {
2604 ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2605 i_data[EXT3_TIND_BLOCK] = 0;
2606 }
2607 case EXT3_TIND_BLOCK:
2608 ;
2609 }
2610
2611 ext3_discard_reservation(inode);
2612
2613 mutex_unlock(&ei->truncate_mutex);
2614 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2615 ext3_mark_inode_dirty(handle, inode);
2616
2617 /*
2618 * In a multi-transaction truncate, we only make the final transaction
2619 * synchronous
2620 */
2621 if (IS_SYNC(inode))
2622 handle->h_sync = 1;
2623out_stop:
2624 /*
2625 * If this was a simple ftruncate(), and the file will remain alive
2626 * then we need to clear up the orphan record which we created above.
2627 * However, if this was a real unlink then we were called by
2628 * ext3_evict_inode(), and we allow that function to clean up the
2629 * orphan info for us.
2630 */
2631 if (inode->i_nlink)
2632 ext3_orphan_del(handle, inode);
2633
2634 ext3_journal_stop(handle);
2635 trace_ext3_truncate_exit(inode);
2636 return;
2637out_notrans:
2638 /*
2639 * Delete the inode from orphan list so that it doesn't stay there
2640 * forever and trigger assertion on umount.
2641 */
2642 if (inode->i_nlink)
2643 ext3_orphan_del(NULL, inode);
2644 trace_ext3_truncate_exit(inode);
2645}
2646
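/*
 * Editor's sketch (not part of the original file): the offsets[] path that
 * ext3_truncate() gets from ext3_block_to_path(), recomputed in userspace.
 * Assumes the classic indirect layout: 12 direct pointers, then one
 * indirect, one double-indirect and one triple-indirect tree, with
 * addr_per_block pointers per indirect block (256 for 1k blocks).
 */
#include <stdio.h>

#define NDIR 12

static int block_to_path(long block, long addr_per_block, int offsets[4])
{
	int n = 0;

	if (block < NDIR) {
		offsets[n++] = block;
	} else if ((block -= NDIR) < addr_per_block) {
		offsets[n++] = NDIR;		/* EXT3_IND_BLOCK */
		offsets[n++] = block;
	} else if ((block -= addr_per_block) < addr_per_block * addr_per_block) {
		offsets[n++] = NDIR + 1;	/* EXT3_DIND_BLOCK */
		offsets[n++] = block / addr_per_block;
		offsets[n++] = block % addr_per_block;
	} else {
		block -= addr_per_block * addr_per_block;
		offsets[n++] = NDIR + 2;	/* EXT3_TIND_BLOCK */
		offsets[n++] = block / (addr_per_block * addr_per_block);
		offsets[n++] = (block / addr_per_block) % addr_per_block;
		offsets[n++] = block % addr_per_block;
	}
	return n;	/* depth of the branch, as used by ext3_truncate() */
}

int main(void)
{
	int offsets[4], i;
	int n = block_to_path(300, 256, offsets);	/* 1k blocks */

	for (i = 0; i < n; i++)
		printf("offsets[%d] = %d\n", i, offsets[i]);
	return 0;
}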
2647static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
2648 unsigned long ino, struct ext3_iloc *iloc)
2649{
2650 unsigned long block_group;
2651 unsigned long offset;
2652 ext3_fsblk_t block;
2653 struct ext3_group_desc *gdp;
2654
2655 if (!ext3_valid_inum(sb, ino)) {
2656 /*
2657 * This error is already checked for in namei.c unless we are
2658 * looking at an NFS filehandle, in which case no error
2659 * report is needed
2660 */
2661 return 0;
2662 }
2663
2664 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2665 gdp = ext3_get_group_desc(sb, block_group, NULL);
2666 if (!gdp)
2667 return 0;
2668 /*
2669 * Figure out the offset within the block group inode table
2670 */
2671 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2672 EXT3_INODE_SIZE(sb);
2673 block = le32_to_cpu(gdp->bg_inode_table) +
2674 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2675
2676 iloc->block_group = block_group;
2677 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2678 return block;
2679}
2680
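/*
 * Editor's sketch (not part of the original file): the inode-location
 * arithmetic from ext3_get_inode_block() above, with made-up but typical
 * geometry (1k blocks, 128-byte inodes, 2008 inodes per group). Inode
 * numbers start at 1, hence the ino - 1 throughout.
 */
#include <stdio.h>

int main(void)
{
	unsigned long ino = 5000;
	unsigned long inodes_per_group = 2008;
	unsigned long inode_size = 128;
	unsigned long block_size = 1024;

	unsigned long group = (ino - 1) / inodes_per_group;
	unsigned long byte_off = ((ino - 1) % inodes_per_group) * inode_size;
	unsigned long block_in_table = byte_off / block_size;
	unsigned long off_in_block = byte_off % block_size;

	/* The group descriptor then supplies bg_inode_table, the first
	 * block of this group's inode table. */
	printf("group %lu, table block %lu, offset %lu\n",
	       group, block_in_table, off_in_block);
	return 0;
}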
2681/*
2682 * ext3_get_inode_loc returns with an extra refcount against the inode's
2683 * underlying buffer_head on success. If 'in_mem' is true, we have all
2684 * data in memory that is needed to recreate the on-disk version of this
2685 * inode.
2686 */
2687static int __ext3_get_inode_loc(struct inode *inode,
2688 struct ext3_iloc *iloc, int in_mem)
2689{
2690 ext3_fsblk_t block;
2691 struct buffer_head *bh;
2692
2693 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2694 if (!block)
2695 return -EIO;
2696
2697 bh = sb_getblk(inode->i_sb, block);
2698 if (unlikely(!bh)) {
2699 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2700 "unable to read inode block - "
2701 "inode=%lu, block="E3FSBLK,
2702 inode->i_ino, block);
2703 return -ENOMEM;
2704 }
2705 if (!buffer_uptodate(bh)) {
2706 lock_buffer(bh);
2707
2708 /*
2709 * If the buffer has the write error flag, we have failed
2710 * to write out another inode in the same block. In this
2711 * case, we don't have to read the block because we may
2712 * read the old inode data successfully.
2713 */
2714 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
2715 set_buffer_uptodate(bh);
2716
2717 if (buffer_uptodate(bh)) {
2718 /* someone brought it uptodate while we waited */
2719 unlock_buffer(bh);
2720 goto has_buffer;
2721 }
2722
2723 /*
2724 * If we have all information of the inode in memory and this
2725 * is the only valid inode in the block, we need not read the
2726 * block.
2727 */
2728 if (in_mem) {
2729 struct buffer_head *bitmap_bh;
2730 struct ext3_group_desc *desc;
2731 int inodes_per_buffer;
2732 int inode_offset, i;
2733 int block_group;
2734 int start;
2735
2736 block_group = (inode->i_ino - 1) /
2737 EXT3_INODES_PER_GROUP(inode->i_sb);
2738 inodes_per_buffer = bh->b_size /
2739 EXT3_INODE_SIZE(inode->i_sb);
2740 inode_offset = ((inode->i_ino - 1) %
2741 EXT3_INODES_PER_GROUP(inode->i_sb));
2742 start = inode_offset & ~(inodes_per_buffer - 1);
2743
2744 /* Is the inode bitmap in cache? */
2745 desc = ext3_get_group_desc(inode->i_sb,
2746 block_group, NULL);
2747 if (!desc)
2748 goto make_io;
2749
2750 bitmap_bh = sb_getblk(inode->i_sb,
2751 le32_to_cpu(desc->bg_inode_bitmap));
2752 if (unlikely(!bitmap_bh))
2753 goto make_io;
2754
2755 /*
2756 * If the inode bitmap isn't in cache then the
2757 * optimisation may end up performing two reads instead
2758 * of one, so skip it.
2759 */
2760 if (!buffer_uptodate(bitmap_bh)) {
2761 brelse(bitmap_bh);
2762 goto make_io;
2763 }
2764 for (i = start; i < start + inodes_per_buffer; i++) {
2765 if (i == inode_offset)
2766 continue;
2767 if (ext3_test_bit(i, bitmap_bh->b_data))
2768 break;
2769 }
2770 brelse(bitmap_bh);
2771 if (i == start + inodes_per_buffer) {
2772 /* all other inodes are free, so skip I/O */
2773 memset(bh->b_data, 0, bh->b_size);
2774 set_buffer_uptodate(bh);
2775 unlock_buffer(bh);
2776 goto has_buffer;
2777 }
2778 }
2779
2780make_io:
2781 /*
2782 * There are other valid inodes in the buffer, this inode
2783 * has in-inode xattrs, or we don't have this inode in memory.
2784 * Read the block from disk.
2785 */
2786 trace_ext3_load_inode(inode);
2787 get_bh(bh);
2788 bh->b_end_io = end_buffer_read_sync;
2789 submit_bh(READ | REQ_META | REQ_PRIO, bh);
2790 wait_on_buffer(bh);
2791 if (!buffer_uptodate(bh)) {
2792 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2793 "unable to read inode block - "
2794 "inode=%lu, block="E3FSBLK,
2795 inode->i_ino, block);
2796 brelse(bh);
2797 return -EIO;
2798 }
2799 }
2800has_buffer:
2801 iloc->bh = bh;
2802 return 0;
2803}
2804
2805int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2806{
2807 /* We have all inode data except xattrs in memory here. */
2808 return __ext3_get_inode_loc(inode, iloc,
2809 !ext3_test_inode_state(inode, EXT3_STATE_XATTR));
2810}
2811
2812void ext3_set_inode_flags(struct inode *inode)
2813{
2814 unsigned int flags = EXT3_I(inode)->i_flags;
2815
2816 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2817 if (flags & EXT3_SYNC_FL)
2818 inode->i_flags |= S_SYNC;
2819 if (flags & EXT3_APPEND_FL)
2820 inode->i_flags |= S_APPEND;
2821 if (flags & EXT3_IMMUTABLE_FL)
2822 inode->i_flags |= S_IMMUTABLE;
2823 if (flags & EXT3_NOATIME_FL)
2824 inode->i_flags |= S_NOATIME;
2825 if (flags & EXT3_DIRSYNC_FL)
2826 inode->i_flags |= S_DIRSYNC;
2827}
2828
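/*
 * Editor's sketch (not part of the original file): the same flag bits seen
 * from userspace. chattr/lsattr manipulate EXT3_APPEND_FL and friends via
 * the FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls, which is what keeps them in
 * sync with the VFS S_* flags translated above.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_APPEND_FL, ... */

int main(int argc, char **argv)
{
	long flags;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
		printf("append-only: %s\n",
		       (flags & FS_APPEND_FL) ? "yes" : "no");
	close(fd);
	return 0;
}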
2829/* Propagate flags from i_flags to EXT3_I(inode)->i_flags */
2830void ext3_get_inode_flags(struct ext3_inode_info *ei)
2831{
2832 unsigned int flags = ei->vfs_inode.i_flags;
2833
2834 ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL|
2835 EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL);
2836 if (flags & S_SYNC)
2837 ei->i_flags |= EXT3_SYNC_FL;
2838 if (flags & S_APPEND)
2839 ei->i_flags |= EXT3_APPEND_FL;
2840 if (flags & S_IMMUTABLE)
2841 ei->i_flags |= EXT3_IMMUTABLE_FL;
2842 if (flags & S_NOATIME)
2843 ei->i_flags |= EXT3_NOATIME_FL;
2844 if (flags & S_DIRSYNC)
2845 ei->i_flags |= EXT3_DIRSYNC_FL;
2846}
2847
2848struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2849{
2850 struct ext3_iloc iloc;
2851 struct ext3_inode *raw_inode;
2852 struct ext3_inode_info *ei;
2853 struct buffer_head *bh;
2854 struct inode *inode;
2855 journal_t *journal = EXT3_SB(sb)->s_journal;
2856 transaction_t *transaction;
2857 long ret;
2858 int block;
2859 uid_t i_uid;
2860 gid_t i_gid;
2861
2862 inode = iget_locked(sb, ino);
2863 if (!inode)
2864 return ERR_PTR(-ENOMEM);
2865 if (!(inode->i_state & I_NEW))
2866 return inode;
2867
2868 ei = EXT3_I(inode);
2869 ei->i_block_alloc_info = NULL;
2870
2871 ret = __ext3_get_inode_loc(inode, &iloc, 0);
2872 if (ret < 0)
2873 goto bad_inode;
2874 bh = iloc.bh;
2875 raw_inode = ext3_raw_inode(&iloc);
2876 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2877 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2878 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2879 if (!(test_opt(inode->i_sb, NO_UID32))) {
2880 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2881 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2882 }
2883 i_uid_write(inode, i_uid);
2884 i_gid_write(inode, i_gid);
2885 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
2886 inode->i_size = le32_to_cpu(raw_inode->i_size);
2887 inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
2888 inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
2889 inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
2890 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2891
2892 ei->i_state_flags = 0;
2893 ei->i_dir_start_lookup = 0;
2894 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2895 /* We now have enough fields to check if the inode was active or not.
2896 * This is needed because nfsd might try to access dead inodes;
2897 * the test is the same one that e2fsck uses.
2898 * NeilBrown 1999oct15
2899 */
2900 if (inode->i_nlink == 0) {
2901 if (inode->i_mode == 0 ||
2902 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2903 /* this inode is deleted */
2904 brelse(bh);
2905 ret = -ESTALE;
2906 goto bad_inode;
2907 }
2908 /* The only unlinked inodes we let through here have
2909 * valid i_mode and are being read by the orphan
2910 * recovery code: that's fine, we're about to complete
2911 * the process of deleting those. */
2912 }
2913 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2914 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2915#ifdef EXT3_FRAGMENTS
2916 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2917 ei->i_frag_no = raw_inode->i_frag;
2918 ei->i_frag_size = raw_inode->i_fsize;
2919#endif
2920 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2921 if (!S_ISREG(inode->i_mode)) {
2922 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2923 } else {
2924 inode->i_size |=
2925 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2926 }
2927 ei->i_disksize = inode->i_size;
2928 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2929 ei->i_block_group = iloc.block_group;
2930 /*
2931 * NOTE! The in-memory inode i_data array is in little-endian order
2932 * even on big-endian machines: we do NOT byteswap the block numbers!
2933 */
2934 for (block = 0; block < EXT3_N_BLOCKS; block++)
2935 ei->i_data[block] = raw_inode->i_block[block];
2936 INIT_LIST_HEAD(&ei->i_orphan);
2937
2938 /*
2939 * Set transaction id's of transactions that have to be committed
2940 * to finish f[data]sync. We set them to currently running transaction
2941 * as we cannot be sure that the inode or some of its metadata isn't
2942 * part of the transaction - the inode could have been reclaimed and
2943 * now it is reread from disk.
2944 */
2945 if (journal) {
2946 tid_t tid;
2947
2948 spin_lock(&journal->j_state_lock);
2949 if (journal->j_running_transaction)
2950 transaction = journal->j_running_transaction;
2951 else
2952 transaction = journal->j_committing_transaction;
2953 if (transaction)
2954 tid = transaction->t_tid;
2955 else
2956 tid = journal->j_commit_sequence;
2957 spin_unlock(&journal->j_state_lock);
2958 atomic_set(&ei->i_sync_tid, tid);
2959 atomic_set(&ei->i_datasync_tid, tid);
2960 }
2961
2962 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2963 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2964 /*
2965 * When mke2fs creates big inodes it does not zero out
2966 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2967 * so ignore those first few inodes.
2968 */
2969 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2970 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2971 EXT3_INODE_SIZE(inode->i_sb)) {
2972 brelse(bh);
2973 ret = -EIO;
2974 goto bad_inode;
2975 }
2976 if (ei->i_extra_isize == 0) {
2977 /* The extra space is currently unused. Use it. */
2978 ei->i_extra_isize = sizeof(struct ext3_inode) -
2979 EXT3_GOOD_OLD_INODE_SIZE;
2980 } else {
2981 __le32 *magic = (void *)raw_inode +
2982 EXT3_GOOD_OLD_INODE_SIZE +
2983 ei->i_extra_isize;
2984 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2985 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
2986 }
2987 } else
2988 ei->i_extra_isize = 0;
2989
2990 if (S_ISREG(inode->i_mode)) {
2991 inode->i_op = &ext3_file_inode_operations;
2992 inode->i_fop = &ext3_file_operations;
2993 ext3_set_aops(inode);
2994 } else if (S_ISDIR(inode->i_mode)) {
2995 inode->i_op = &ext3_dir_inode_operations;
2996 inode->i_fop = &ext3_dir_operations;
2997 } else if (S_ISLNK(inode->i_mode)) {
2998 if (ext3_inode_is_fast_symlink(inode)) {
2999 inode->i_op = &ext3_fast_symlink_inode_operations;
3000 nd_terminate_link(ei->i_data, inode->i_size,
3001 sizeof(ei->i_data) - 1);
3002 inode->i_link = (char *)ei->i_data;
3003 } else {
3004 inode->i_op = &ext3_symlink_inode_operations;
3005 ext3_set_aops(inode);
3006 }
3007 } else {
3008 inode->i_op = &ext3_special_inode_operations;
3009 if (raw_inode->i_block[0])
3010 init_special_inode(inode, inode->i_mode,
3011 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
3012 else
3013 init_special_inode(inode, inode->i_mode,
3014 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3015 }
3016 brelse(iloc.bh);
3017 ext3_set_inode_flags(inode);
3018 unlock_new_inode(inode);
3019 return inode;
3020
3021bad_inode:
3022 iget_failed(inode);
3023 return ERR_PTR(ret);
3024}
3025
3026/*
3027 * Post the struct inode info into an on-disk inode location in the
3028 * buffer-cache. This gobbles the caller's reference to the
3029 * buffer_head in the inode location struct.
3030 *
3031 * The caller must have write access to iloc->bh.
3032 */
3033static int ext3_do_update_inode(handle_t *handle,
3034 struct inode *inode,
3035 struct ext3_iloc *iloc)
3036{
3037 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
3038 struct ext3_inode_info *ei = EXT3_I(inode);
3039 struct buffer_head *bh = iloc->bh;
3040 int err = 0, rc, block;
3041 int need_datasync = 0;
3042 __le32 disksize;
3043 uid_t i_uid;
3044 gid_t i_gid;
3045
3046again:
3047 /* we can't allow multiple procs in here at once, it's a bit racy */
3048 lock_buffer(bh);
3049
3050 /* For fields not tracked in the in-memory inode,
3051 * initialise them to zero for new inodes. */
3052 if (ext3_test_inode_state(inode, EXT3_STATE_NEW))
3053 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
3054
3055 ext3_get_inode_flags(ei);
3056 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
3057 i_uid = i_uid_read(inode);
3058 i_gid = i_gid_read(inode);
3059 if (!(test_opt(inode->i_sb, NO_UID32))) {
3060 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
3061 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
3062/*
3063 * Fix up interoperability with old kernels. Otherwise, old inodes get
3064 * re-used with the upper 16 bits of the uid/gid intact
3065 */
3066 if (!ei->i_dtime) {
3067 raw_inode->i_uid_high =
3068 cpu_to_le16(high_16_bits(i_uid));
3069 raw_inode->i_gid_high =
3070 cpu_to_le16(high_16_bits(i_gid));
3071 } else {
3072 raw_inode->i_uid_high = 0;
3073 raw_inode->i_gid_high = 0;
3074 }
3075 } else {
3076 raw_inode->i_uid_low =
3077 cpu_to_le16(fs_high2lowuid(i_uid));
3078 raw_inode->i_gid_low =
3079 cpu_to_le16(fs_high2lowgid(i_gid));
3080 raw_inode->i_uid_high = 0;
3081 raw_inode->i_gid_high = 0;
3082 }
3083 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3084 disksize = cpu_to_le32(ei->i_disksize);
3085 if (disksize != raw_inode->i_size) {
3086 need_datasync = 1;
3087 raw_inode->i_size = disksize;
3088 }
3089 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
3090 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
3091 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
3092 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
3093 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
3094 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
3095#ifdef EXT3_FRAGMENTS
3096 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
3097 raw_inode->i_frag = ei->i_frag_no;
3098 raw_inode->i_fsize = ei->i_frag_size;
3099#endif
3100 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
3101 if (!S_ISREG(inode->i_mode)) {
3102 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
3103 } else {
3104 disksize = cpu_to_le32(ei->i_disksize >> 32);
3105 if (disksize != raw_inode->i_size_high) {
3106 raw_inode->i_size_high = disksize;
3107 need_datasync = 1;
3108 }
3109 if (ei->i_disksize > 0x7fffffffULL) {
3110 struct super_block *sb = inode->i_sb;
3111 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
3112 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
3113 EXT3_SB(sb)->s_es->s_rev_level ==
3114 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
3115 /* If this is the first large file
3116 * created, add a flag to the superblock.
3117 */
3118 unlock_buffer(bh);
3119 err = ext3_journal_get_write_access(handle,
3120 EXT3_SB(sb)->s_sbh);
3121 if (err)
3122 goto out_brelse;
3123
3124 ext3_update_dynamic_rev(sb);
3125 EXT3_SET_RO_COMPAT_FEATURE(sb,
3126 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
3127 handle->h_sync = 1;
3128 err = ext3_journal_dirty_metadata(handle,
3129 EXT3_SB(sb)->s_sbh);
3130 /* get our lock and start over */
3131 goto again;
3132 }
3133 }
3134 }
3135 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
3136 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
3137 if (old_valid_dev(inode->i_rdev)) {
3138 raw_inode->i_block[0] =
3139 cpu_to_le32(old_encode_dev(inode->i_rdev));
3140 raw_inode->i_block[1] = 0;
3141 } else {
3142 raw_inode->i_block[0] = 0;
3143 raw_inode->i_block[1] =
3144 cpu_to_le32(new_encode_dev(inode->i_rdev));
3145 raw_inode->i_block[2] = 0;
3146 }
3147 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
3148 raw_inode->i_block[block] = ei->i_data[block];
3149
3150 if (ei->i_extra_isize)
3151 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
3152
3153 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
3154 unlock_buffer(bh);
3155 rc = ext3_journal_dirty_metadata(handle, bh);
3156 if (!err)
3157 err = rc;
3158 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
3159
3160 atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
3161 if (need_datasync)
3162 atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
3163out_brelse:
3164 brelse (bh);
3165 ext3_std_error(inode->i_sb, err);
3166 return err;
3167}
3168
3169/*
3170 * ext3_write_inode()
3171 *
3172 * We are called from a few places:
3173 *
3174 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
3175 * Here, there will be no transaction running. We wait for any running
3176 * transaction to commit.
3177 *
3178 * - Within flush work (for sys_sync(), kupdate and such).
3179 * We wait on commit, if told to.
3180 *
3181 * - Within iput_final() -> write_inode_now()
3182 * We wait on commit, if told to.
3183 *
3184 * In all cases it is actually safe for us to return without doing anything,
3185 * because the inode has been copied into a raw inode buffer in
3186 * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
3187 * writeback.
3188 *
3189 * Note that we are absolutely dependent upon all inode dirtiers doing the
3190 * right thing: they *must* call mark_inode_dirty() after dirtying info in
3191 * which we are interested.
3192 *
3193 * It would be a bug for them to not do this. The code:
3194 *
3195 * mark_inode_dirty(inode)
3196 * stuff();
3197 * inode->i_size = expr;
3198 *
3199 * is in error because write_inode() could occur while `stuff()' is running,
3200 * and the new i_size will be lost. Plus the inode will no longer be on the
3201 * superblock's dirty inode list.
3202 */
3203int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3204{
3205 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
3206 return 0;
3207
3208 if (ext3_journal_current_handle()) {
3209 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
3210 dump_stack();
3211 return -EIO;
3212 }
3213
3214 /*
3215 * No need to force transaction in WB_SYNC_NONE mode. Also
3216 * ext3_sync_fs() will force the commit after everything is
3217 * written.
3218 */
3219 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
3220 return 0;
3221
3222 return ext3_force_commit(inode->i_sb);
3223}
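
Restating the rule from the comment above as code (a sketch only, not taken from this file): every in-core modification must happen before the mark_inode_dirty() call, so a racing write_inode() can never snapshot the raw inode without the new i_size:

	stuff();
	inode->i_size = expr;		/* finish dirtying in-core fields... */
	mark_inode_dirty(inode);	/* ...then publish them for writeback */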
3224
3225/*
3226 * ext3_setattr()
3227 *
3228 * Called from notify_change.
3229 *
3230 * We want to trap VFS attempts to truncate the file as soon as
3231 * possible. In particular, we want to make sure that when the VFS
3232 * shrinks i_size, we put the inode on the orphan list and modify
3233 * i_disksize immediately, so that during the subsequent flushing of
3234 * dirty pages and freeing of disk blocks, we can guarantee that any
3235 * commit will leave the blocks being flushed in an unused state on
3236 * disk. (On recovery, the inode will get truncated and the blocks will
3237 * be freed, so we have a strong guarantee that no future commit will
3238 * leave these blocks visible to the user.)
3239 *
3240 * Called with inode->sem down.
3241 */
3242int ext3_setattr(struct dentry *dentry, struct iattr *attr)
3243{
3244 struct inode *inode = d_inode(dentry);
3245 int error, rc = 0;
3246 const unsigned int ia_valid = attr->ia_valid;
3247
3248 error = inode_change_ok(inode, attr);
3249 if (error)
3250 return error;
3251
3252 if (is_quota_modification(inode, attr))
3253 dquot_initialize(inode);
3254 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
3255 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
3256 handle_t *handle;
3257
3258 /* (user+group)*(old+new) structure, inode write (sb,
3259 * inode block, ? - but truncate inode update has it) */
3260 handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
3261 EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
3262 if (IS_ERR(handle)) {
3263 error = PTR_ERR(handle);
3264 goto err_out;
3265 }
3266 error = dquot_transfer(inode, attr);
3267 if (error) {
3268 ext3_journal_stop(handle);
3269 return error;
3270 }
3271 /* Update corresponding info in inode so that everything is in
3272 * one transaction */
3273 if (attr->ia_valid & ATTR_UID)
3274 inode->i_uid = attr->ia_uid;
3275 if (attr->ia_valid & ATTR_GID)
3276 inode->i_gid = attr->ia_gid;
3277 error = ext3_mark_inode_dirty(handle, inode);
3278 ext3_journal_stop(handle);
3279 }
3280
3281 if (attr->ia_valid & ATTR_SIZE)
3282 inode_dio_wait(inode);
3283
3284 if (S_ISREG(inode->i_mode) &&
3285 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
3286 handle_t *handle;
3287
3288 handle = ext3_journal_start(inode, 3);
3289 if (IS_ERR(handle)) {
3290 error = PTR_ERR(handle);
3291 goto err_out;
3292 }
3293
3294 error = ext3_orphan_add(handle, inode);
3295 if (error) {
3296 ext3_journal_stop(handle);
3297 goto err_out;
3298 }
3299 EXT3_I(inode)->i_disksize = attr->ia_size;
3300 error = ext3_mark_inode_dirty(handle, inode);
3301 ext3_journal_stop(handle);
3302 if (error) {
3303 /* Some hard fs error must have happened. Bail out. */
3304 ext3_orphan_del(NULL, inode);
3305 goto err_out;
3306 }
3307 rc = ext3_block_truncate_page(inode, attr->ia_size);
3308 if (rc) {
3309 /* Cleanup orphan list and exit */
3310 handle = ext3_journal_start(inode, 3);
3311 if (IS_ERR(handle)) {
3312 ext3_orphan_del(NULL, inode);
3313 goto err_out;
3314 }
3315 ext3_orphan_del(handle, inode);
3316 ext3_journal_stop(handle);
3317 goto err_out;
3318 }
3319 }
3320
3321 if ((attr->ia_valid & ATTR_SIZE) &&
3322 attr->ia_size != i_size_read(inode)) {
3323 truncate_setsize(inode, attr->ia_size);
3324 ext3_truncate(inode);
3325 }
3326
3327 setattr_copy(inode, attr);
3328 mark_inode_dirty(inode);
3329
3330 if (ia_valid & ATTR_MODE)
3331 rc = posix_acl_chmod(inode, inode->i_mode);
3332
3333err_out:
3334 ext3_std_error(inode->i_sb, error);
3335 if (!error)
3336 error = rc;
3337 return error;
3338}
3339
3340
3341/*
3342 * How many blocks doth make a writepage()?
3343 *
3344 * With N blocks per page, it may be:
3345 * N data blocks
3346 *	2 indirect blocks
3347 *	2 dindirect blocks
3348 *	1 tindirect block
3349 * N+5 bitmap blocks (from the above)
3350 * N+5 group descriptor summary blocks
3351 * 1 inode block
3352 * 1 superblock.
3353 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3354 *
3355 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3356 *
3357 * With ordered or writeback data it's the same, less the N data blocks.
3358 *
3359 * If the inode's direct blocks can hold an integral number of pages then a
3360 * page cannot straddle two indirect blocks, and we can only touch one indirect
3361 * and dindirect block, and the "5" above becomes "3".
3362 *
3363 * This still overestimates under most circumstances. If we were to pass the
3364 * start and end offsets in here as well we could do block_to_path() on each
3365 * block and work out the exact number of indirects which are touched. Pah.
3366 */
3367
3368static int ext3_writepage_trans_blocks(struct inode *inode)
3369{
3370 int bpp = ext3_journal_blocks_per_page(inode);
3371 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3372 int ret;
3373
3374 if (ext3_should_journal_data(inode))
3375 ret = 3 * (bpp + indirects) + 2;
3376 else
3377 ret = 2 * (bpp + indirects) + indirects + 2;
3378
3379#ifdef CONFIG_QUOTA
3380	/* We know that the structure was already allocated during dquot_initialize,
3381	 * so we will be updating only the data blocks + inodes */
3382 ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
3383#endif
3384
3385 return ret;
3386}
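
Worked example (illustrative): with 4K blocks and 4K pages, bpp == 1 and EXT3_NDIR_BLOCKS (12) % 1 == 0, so indirects == 3. A data=journal inode therefore reserves 3 * (1 + 3) + 2 = 14 credits per page, an ordered/writeback inode 2 * (1 + 3) + 3 + 2 = 13, plus EXT3_MAXQUOTAS_TRANS_BLOCKS() on quota-enabled kernels.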
3387
3388/*
3389 * The caller must have previously called ext3_reserve_inode_write().
3390 * Given this, we know that the caller already has write access to iloc->bh.
3391 */
3392int ext3_mark_iloc_dirty(handle_t *handle,
3393 struct inode *inode, struct ext3_iloc *iloc)
3394{
3395 int err = 0;
3396
3397 /* the do_update_inode consumes one bh->b_count */
3398 get_bh(iloc->bh);
3399
3400 /* ext3_do_update_inode() does journal_dirty_metadata */
3401 err = ext3_do_update_inode(handle, inode, iloc);
3402 put_bh(iloc->bh);
3403 return err;
3404}
3405
3406/*
3407 * On success, we end up with an outstanding reference count against
3408 * iloc->bh. This _must_ be cleaned up later.
3409 */
3410
3411int
3412ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3413 struct ext3_iloc *iloc)
3414{
3415 int err = 0;
3416 if (handle) {
3417 err = ext3_get_inode_loc(inode, iloc);
3418 if (!err) {
3419 BUFFER_TRACE(iloc->bh, "get_write_access");
3420 err = ext3_journal_get_write_access(handle, iloc->bh);
3421 if (err) {
3422 brelse(iloc->bh);
3423 iloc->bh = NULL;
3424 }
3425 }
3426 }
3427 ext3_std_error(inode->i_sb, err);
3428 return err;
3429}
3430
3431/*
3432 * What we do here is to mark the in-core inode as clean with respect to inode
3433 * dirtiness (it may still be data-dirty).
3434 * This means that the in-core inode may be reaped by prune_icache
3435 * without having to perform any I/O. This is a very good thing,
3436 * because *any* task may call prune_icache - even ones which
3437 * have a transaction open against a different journal.
3438 *
3439 * Is this cheating? Not really. Sure, we haven't written the
3440 * inode out, but prune_icache isn't a user-visible syncing function.
3441 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3442 * we start and wait on commits.
3443 */
3444int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3445{
3446 struct ext3_iloc iloc;
3447 int err;
3448
3449 might_sleep();
3450 trace_ext3_mark_inode_dirty(inode, _RET_IP_);
3451 err = ext3_reserve_inode_write(handle, inode, &iloc);
3452 if (!err)
3453 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3454 return err;
3455}
3456
3457/*
3458 * ext3_dirty_inode() is called from __mark_inode_dirty()
3459 *
3460 * We're really interested in the case where a file is being extended.
3461 * i_size has been changed by generic_commit_write() and we thus need
3462 * to include the updated inode in the current transaction.
3463 *
3464 * Also, dquot_alloc_space() will always dirty the inode when blocks
3465 * are allocated to the file.
3466 *
3467 * If the inode is marked synchronous, we don't honour that here - doing
3468 * so would cause a commit on atime updates, which we don't bother doing.
3469 * We handle synchronous inodes at the highest possible level.
3470 */
3471void ext3_dirty_inode(struct inode *inode, int flags)
3472{
3473 handle_t *current_handle = ext3_journal_current_handle();
3474 handle_t *handle;
3475
3476 handle = ext3_journal_start(inode, 2);
3477 if (IS_ERR(handle))
3478 goto out;
3479 if (current_handle &&
3480 current_handle->h_transaction != handle->h_transaction) {
3481 /* This task has a transaction open against a different fs */
3482 printk(KERN_EMERG "%s: transactions do not match!\n",
3483 __func__);
3484 } else {
3485 jbd_debug(5, "marking dirty. outer handle=%p\n",
3486 current_handle);
3487 ext3_mark_inode_dirty(handle, inode);
3488 }
3489 ext3_journal_stop(handle);
3490out:
3491 return;
3492}
3493
3494#if 0
3495/*
3496 * Bind an inode's backing buffer_head into this transaction, to prevent
3497 * it from being flushed to disk early. Unlike
3498 * ext3_reserve_inode_write, this leaves behind no bh reference and
3499 * returns no iloc structure, so the caller needs to repeat the iloc
3500 * lookup to mark the inode dirty later.
3501 */
3502static int ext3_pin_inode(handle_t *handle, struct inode *inode)
3503{
3504 struct ext3_iloc iloc;
3505
3506 int err = 0;
3507 if (handle) {
3508 err = ext3_get_inode_loc(inode, &iloc);
3509 if (!err) {
3510 BUFFER_TRACE(iloc.bh, "get_write_access");
3511 err = journal_get_write_access(handle, iloc.bh);
3512 if (!err)
3513 err = ext3_journal_dirty_metadata(handle,
3514 iloc.bh);
3515 brelse(iloc.bh);
3516 }
3517 }
3518 ext3_std_error(inode->i_sb, err);
3519 return err;
3520}
3521#endif
3522
3523int ext3_change_inode_journal_flag(struct inode *inode, int val)
3524{
3525 journal_t *journal;
3526 handle_t *handle;
3527 int err;
3528
3529 /*
3530 * We have to be very careful here: changing a data block's
3531 * journaling status dynamically is dangerous. If we write a
3532 * data block to the journal, change the status and then delete
3533 * that block, we risk forgetting to revoke the old log record
3534 * from the journal and so a subsequent replay can corrupt data.
3535 * So, first we make sure that the journal is empty and that
3536 * nobody is changing anything.
3537 */
3538
3539 journal = EXT3_JOURNAL(inode);
3540 if (is_journal_aborted(journal))
3541 return -EROFS;
3542
3543 journal_lock_updates(journal);
3544 journal_flush(journal);
3545
3546 /*
3547 * OK, there are no updates running now, and all cached data is
3548 * synced to disk. We are now in a completely consistent state
3549 * which doesn't have anything in the journal, and we know that
3550 * no filesystem updates are running, so it is safe to modify
3551 * the inode's in-core data-journaling state flag now.
3552 */
3553
3554 if (val)
3555 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3556 else
3557 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3558 ext3_set_aops(inode);
3559
3560 journal_unlock_updates(journal);
3561
3562 /* Finally we can mark the inode as dirty. */
3563
3564 handle = ext3_journal_start(inode, 1);
3565 if (IS_ERR(handle))
3566 return PTR_ERR(handle);
3567
3568 err = ext3_mark_inode_dirty(handle, inode);
3569 handle->h_sync = 1;
3570 ext3_journal_stop(handle);
3571 ext3_std_error(inode->i_sb, err);
3572
3573 return err;
3574}
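
From userspace this path is normally reached through the SETFLAGS ioctl below, which is what "chattr +j" does. A hedged sketch using the generic flag names from <linux/fs.h>, which share values with the EXT3_IOC_* definitions (illustrative only, not part of this file):

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
        int fd, flags;  /* the kernel reads/writes an int despite the ioctl encoding */

        if (argc < 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
                perror(argv[1]);
                return 1;
        }
        flags |= FS_JOURNAL_DATA_FL;            /* requires CAP_SYS_RESOURCE */
        if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0) {
                perror("FS_IOC_SETFLAGS");
                return 1;
        }
        return 0;
}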
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
deleted file mode 100644
index 4d96e9a64532..000000000000
--- a/fs/ext3/ioctl.c
+++ /dev/null
@@ -1,327 +0,0 @@
1/*
2 * linux/fs/ext3/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/mount.h>
11#include <linux/compat.h>
12#include <asm/uaccess.h>
13#include "ext3.h"
14
15long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
16{
17 struct inode *inode = file_inode(filp);
18 struct ext3_inode_info *ei = EXT3_I(inode);
19 unsigned int flags;
20 unsigned short rsv_window_size;
21
22 ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
23
24 switch (cmd) {
25 case EXT3_IOC_GETFLAGS:
26 ext3_get_inode_flags(ei);
27 flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
28 return put_user(flags, (int __user *) arg);
29 case EXT3_IOC_SETFLAGS: {
30 handle_t *handle = NULL;
31 int err;
32 struct ext3_iloc iloc;
33 unsigned int oldflags;
34 unsigned int jflag;
35
36 if (!inode_owner_or_capable(inode))
37 return -EACCES;
38
39 if (get_user(flags, (int __user *) arg))
40 return -EFAULT;
41
42 err = mnt_want_write_file(filp);
43 if (err)
44 return err;
45
46 flags = ext3_mask_flags(inode->i_mode, flags);
47
48 mutex_lock(&inode->i_mutex);
49
50		/* Is it a quota file? Do not allow the user to mess with it */
51 err = -EPERM;
52 if (IS_NOQUOTA(inode))
53 goto flags_out;
54
55 oldflags = ei->i_flags;
56
57 /* The JOURNAL_DATA flag is modifiable only by root */
58 jflag = flags & EXT3_JOURNAL_DATA_FL;
59
60 /*
61		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
62		 * a process with the relevant capability.
63 *
64 * This test looks nicer. Thanks to Pauline Middelink
65 */
66 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
67 if (!capable(CAP_LINUX_IMMUTABLE))
68 goto flags_out;
69 }
70
71 /*
72		 * The JOURNAL_DATA flag can only be changed by
73		 * a process with the relevant capability.
74 */
75 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
76 if (!capable(CAP_SYS_RESOURCE))
77 goto flags_out;
78 }
79
80 handle = ext3_journal_start(inode, 1);
81 if (IS_ERR(handle)) {
82 err = PTR_ERR(handle);
83 goto flags_out;
84 }
85 if (IS_SYNC(inode))
86 handle->h_sync = 1;
87 err = ext3_reserve_inode_write(handle, inode, &iloc);
88 if (err)
89 goto flags_err;
90
91 flags = flags & EXT3_FL_USER_MODIFIABLE;
92 flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
93 ei->i_flags = flags;
94
95 ext3_set_inode_flags(inode);
96 inode->i_ctime = CURRENT_TIME_SEC;
97
98 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
99flags_err:
100 ext3_journal_stop(handle);
101 if (err)
102 goto flags_out;
103
104 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
105 err = ext3_change_inode_journal_flag(inode, jflag);
106flags_out:
107 mutex_unlock(&inode->i_mutex);
108 mnt_drop_write_file(filp);
109 return err;
110 }
111 case EXT3_IOC_GETVERSION:
112 case EXT3_IOC_GETVERSION_OLD:
113 return put_user(inode->i_generation, (int __user *) arg);
114 case EXT3_IOC_SETVERSION:
115 case EXT3_IOC_SETVERSION_OLD: {
116 handle_t *handle;
117 struct ext3_iloc iloc;
118 __u32 generation;
119 int err;
120
121 if (!inode_owner_or_capable(inode))
122 return -EPERM;
123
124 err = mnt_want_write_file(filp);
125 if (err)
126 return err;
127 if (get_user(generation, (int __user *) arg)) {
128 err = -EFAULT;
129 goto setversion_out;
130 }
131
132 mutex_lock(&inode->i_mutex);
133 handle = ext3_journal_start(inode, 1);
134 if (IS_ERR(handle)) {
135 err = PTR_ERR(handle);
136 goto unlock_out;
137 }
138 err = ext3_reserve_inode_write(handle, inode, &iloc);
139 if (err == 0) {
140 inode->i_ctime = CURRENT_TIME_SEC;
141 inode->i_generation = generation;
142 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
143 }
144 ext3_journal_stop(handle);
145
146unlock_out:
147 mutex_unlock(&inode->i_mutex);
148setversion_out:
149 mnt_drop_write_file(filp);
150 return err;
151 }
152 case EXT3_IOC_GETRSVSZ:
153 if (test_opt(inode->i_sb, RESERVATION)
154 && S_ISREG(inode->i_mode)
155 && ei->i_block_alloc_info) {
156 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
157 return put_user(rsv_window_size, (int __user *)arg);
158 }
159 return -ENOTTY;
160 case EXT3_IOC_SETRSVSZ: {
161 int err;
162
163		if (!test_opt(inode->i_sb, RESERVATION) || !S_ISREG(inode->i_mode))
164 return -ENOTTY;
165
166 err = mnt_want_write_file(filp);
167 if (err)
168 return err;
169
170 if (!inode_owner_or_capable(inode)) {
171 err = -EACCES;
172 goto setrsvsz_out;
173 }
174
175 if (get_user(rsv_window_size, (int __user *)arg)) {
176 err = -EFAULT;
177 goto setrsvsz_out;
178 }
179
180 if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
181 rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
182
183 /*
184		 * need to allocate the reservation structure for this inode
185		 * before setting the window size
186 */
187 mutex_lock(&ei->truncate_mutex);
188 if (!ei->i_block_alloc_info)
189 ext3_init_block_alloc_info(inode);
190
191		if (ei->i_block_alloc_info) {
192 struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
193 rsv->rsv_goal_size = rsv_window_size;
194 }
195 mutex_unlock(&ei->truncate_mutex);
196setrsvsz_out:
197 mnt_drop_write_file(filp);
198 return err;
199 }
200 case EXT3_IOC_GROUP_EXTEND: {
201 ext3_fsblk_t n_blocks_count;
202 struct super_block *sb = inode->i_sb;
203 int err, err2;
204
205 if (!capable(CAP_SYS_RESOURCE))
206 return -EPERM;
207
208 err = mnt_want_write_file(filp);
209 if (err)
210 return err;
211
212 if (get_user(n_blocks_count, (__u32 __user *)arg)) {
213 err = -EFAULT;
214 goto group_extend_out;
215 }
216 err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
217 journal_lock_updates(EXT3_SB(sb)->s_journal);
218 err2 = journal_flush(EXT3_SB(sb)->s_journal);
219 journal_unlock_updates(EXT3_SB(sb)->s_journal);
220 if (err == 0)
221 err = err2;
222group_extend_out:
223 mnt_drop_write_file(filp);
224 return err;
225 }
226 case EXT3_IOC_GROUP_ADD: {
227 struct ext3_new_group_data input;
228 struct super_block *sb = inode->i_sb;
229 int err, err2;
230
231 if (!capable(CAP_SYS_RESOURCE))
232 return -EPERM;
233
234 err = mnt_want_write_file(filp);
235 if (err)
236 return err;
237
238 if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
239 sizeof(input))) {
240 err = -EFAULT;
241 goto group_add_out;
242 }
243
244 err = ext3_group_add(sb, &input);
245 journal_lock_updates(EXT3_SB(sb)->s_journal);
246 err2 = journal_flush(EXT3_SB(sb)->s_journal);
247 journal_unlock_updates(EXT3_SB(sb)->s_journal);
248 if (err == 0)
249 err = err2;
250group_add_out:
251 mnt_drop_write_file(filp);
252 return err;
253 }
254 case FITRIM: {
255
256 struct super_block *sb = inode->i_sb;
257 struct fstrim_range range;
258 int ret = 0;
259
260 if (!capable(CAP_SYS_ADMIN))
261 return -EPERM;
262
263 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
264 sizeof(range)))
265 return -EFAULT;
266
267 ret = ext3_trim_fs(sb, &range);
268 if (ret < 0)
269 return ret;
270
271 if (copy_to_user((struct fstrim_range __user *)arg, &range,
272 sizeof(range)))
273 return -EFAULT;
274
275 return 0;
276 }
277
278 default:
279 return -ENOTTY;
280 }
281}
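
For the FITRIM case above, the userspace side looks roughly like this (an illustrative sketch of what fstrim(8) does; struct fstrim_range comes from <linux/fs.h>):

#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        memset(&range, 0, sizeof(range));
        range.len = ULLONG_MAX;                 /* trim the whole filesystem */
        if (ioctl(fd, FITRIM, &range) < 0) {    /* requires CAP_SYS_ADMIN */
                perror("FITRIM");
                return 1;
        }
        /* on return, range.len holds the number of bytes actually trimmed */
        printf("%llu bytes trimmed\n", (unsigned long long)range.len);
        return 0;
}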
282
283#ifdef CONFIG_COMPAT
284long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
285{
286	/* These are just misnamed; they actually get/put an int from/to user space */
287 switch (cmd) {
288 case EXT3_IOC32_GETFLAGS:
289 cmd = EXT3_IOC_GETFLAGS;
290 break;
291 case EXT3_IOC32_SETFLAGS:
292 cmd = EXT3_IOC_SETFLAGS;
293 break;
294 case EXT3_IOC32_GETVERSION:
295 cmd = EXT3_IOC_GETVERSION;
296 break;
297 case EXT3_IOC32_SETVERSION:
298 cmd = EXT3_IOC_SETVERSION;
299 break;
300 case EXT3_IOC32_GROUP_EXTEND:
301 cmd = EXT3_IOC_GROUP_EXTEND;
302 break;
303 case EXT3_IOC32_GETVERSION_OLD:
304 cmd = EXT3_IOC_GETVERSION_OLD;
305 break;
306 case EXT3_IOC32_SETVERSION_OLD:
307 cmd = EXT3_IOC_SETVERSION_OLD;
308 break;
309#ifdef CONFIG_JBD_DEBUG
310 case EXT3_IOC32_WAIT_FOR_READONLY:
311 cmd = EXT3_IOC_WAIT_FOR_READONLY;
312 break;
313#endif
314 case EXT3_IOC32_GETRSVSZ:
315 cmd = EXT3_IOC_GETRSVSZ;
316 break;
317 case EXT3_IOC32_SETRSVSZ:
318 cmd = EXT3_IOC_SETRSVSZ;
319 break;
320 case EXT3_IOC_GROUP_ADD:
321 break;
322 default:
323 return -ENOIOCTLCMD;
324 }
325 return ext3_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
326}
327#endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
deleted file mode 100644
index c9e767cd4b67..000000000000
--- a/fs/ext3/namei.c
+++ /dev/null
@@ -1,2586 +0,0 @@
1/*
2 * linux/fs/ext3/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/quotaops.h>
28#include "ext3.h"
29#include "namei.h"
30#include "xattr.h"
31#include "acl.h"
32
33/*
34 * define how far ahead to read directories while searching them.
35 */
36#define NAMEI_RA_CHUNKS 2
37#define NAMEI_RA_BLOCKS 4
38#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
39
40static struct buffer_head *ext3_append(handle_t *handle,
41 struct inode *inode,
42 u32 *block, int *err)
43{
44 struct buffer_head *bh;
45
46 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
47
48 if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
49 inode->i_size += inode->i_sb->s_blocksize;
50 EXT3_I(inode)->i_disksize = inode->i_size;
51 *err = ext3_journal_get_write_access(handle, bh);
52 if (*err) {
53 brelse(bh);
54 bh = NULL;
55 }
56 }
57 return bh;
58}
59
60#ifndef assert
61#define assert(test) J_ASSERT(test)
62#endif
63
64#ifdef DX_DEBUG
65#define dxtrace(command) command
66#else
67#define dxtrace(command)
68#endif
69
70struct fake_dirent
71{
72 __le32 inode;
73 __le16 rec_len;
74 u8 name_len;
75 u8 file_type;
76};
77
78struct dx_countlimit
79{
80 __le16 limit;
81 __le16 count;
82};
83
84struct dx_entry
85{
86 __le32 hash;
87 __le32 block;
88};
89
90/*
91 * dx_root_info is laid out so that if it should somehow get overlaid by a
92 * dirent the two low bits of the hash version will be zero. Therefore, the
93 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
94 */
95
96struct dx_root
97{
98 struct fake_dirent dot;
99 char dot_name[4];
100 struct fake_dirent dotdot;
101 char dotdot_name[4];
102 struct dx_root_info
103 {
104 __le32 reserved_zero;
105 u8 hash_version;
106 u8 info_length; /* 8 */
107 u8 indirect_levels;
108 u8 unused_flags;
109 }
110 info;
111 struct dx_entry entries[0];
112};
113
114struct dx_node
115{
116 struct fake_dirent fake;
117 struct dx_entry entries[0];
118};
119
120
121struct dx_frame
122{
123 struct buffer_head *bh;
124 struct dx_entry *entries;
125 struct dx_entry *at;
126};
127
128struct dx_map_entry
129{
130 u32 hash;
131 u16 offs;
132 u16 size;
133};
134
135static inline unsigned dx_get_block (struct dx_entry *entry);
136static void dx_set_block (struct dx_entry *entry, unsigned value);
137static inline unsigned dx_get_hash (struct dx_entry *entry);
138static void dx_set_hash (struct dx_entry *entry, unsigned value);
139static unsigned dx_get_count (struct dx_entry *entries);
140static unsigned dx_get_limit (struct dx_entry *entries);
141static void dx_set_count (struct dx_entry *entries, unsigned value);
142static void dx_set_limit (struct dx_entry *entries, unsigned value);
143static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
144static unsigned dx_node_limit (struct inode *dir);
145static struct dx_frame *dx_probe(struct qstr *entry,
146 struct inode *dir,
147 struct dx_hash_info *hinfo,
148 struct dx_frame *frame,
149 int *err);
150static void dx_release (struct dx_frame *frames);
151static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
152 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
153static void dx_sort_map(struct dx_map_entry *map, unsigned count);
154static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
155 struct dx_map_entry *offsets, int count);
156static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
157static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
158static int ext3_htree_next_block(struct inode *dir, __u32 hash,
159 struct dx_frame *frame,
160 struct dx_frame *frames,
161 __u32 *start_hash);
162static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
163 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
164 int *err);
165static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
166 struct inode *inode);
167
168/*
169 * p is at least 6 bytes before the end of page
170 */
171static inline struct ext3_dir_entry_2 *
172ext3_next_entry(struct ext3_dir_entry_2 *p)
173{
174 return (struct ext3_dir_entry_2 *)((char *)p +
175 ext3_rec_len_from_disk(p->rec_len));
176}
177
178/*
179 * Future: use high four bits of block for coalesce-on-delete flags
180 * Mask them off for now.
181 */
182
183static inline unsigned dx_get_block (struct dx_entry *entry)
184{
185 return le32_to_cpu(entry->block) & 0x00ffffff;
186}
187
188static inline void dx_set_block (struct dx_entry *entry, unsigned value)
189{
190 entry->block = cpu_to_le32(value);
191}
192
193static inline unsigned dx_get_hash (struct dx_entry *entry)
194{
195 return le32_to_cpu(entry->hash);
196}
197
198static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
199{
200 entry->hash = cpu_to_le32(value);
201}
202
203static inline unsigned dx_get_count (struct dx_entry *entries)
204{
205 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
206}
207
208static inline unsigned dx_get_limit (struct dx_entry *entries)
209{
210 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
211}
212
213static inline void dx_set_count (struct dx_entry *entries, unsigned value)
214{
215 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
216}
217
218static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
219{
220 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
221}
222
223static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
224{
225 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
226 EXT3_DIR_REC_LEN(2) - infosize;
227 return entry_space / sizeof(struct dx_entry);
228}
229
230static inline unsigned dx_node_limit (struct inode *dir)
231{
232 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
233 return entry_space / sizeof(struct dx_entry);
234}
235
236/*
237 * Debug
238 */
239#ifdef DX_DEBUG
240static void dx_show_index (char * label, struct dx_entry *entries)
241{
242 int i, n = dx_get_count (entries);
243 printk("%s index ", label);
244 for (i = 0; i < n; i++)
245 {
246 printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
247 }
248 printk("\n");
249}
250
251struct stats
252{
253 unsigned names;
254 unsigned space;
255 unsigned bcount;
256};
257
258static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
259 int size, int show_names)
260{
261 unsigned names = 0, space = 0;
262 char *base = (char *) de;
263 struct dx_hash_info h = *hinfo;
264
265 printk("names: ");
266 while ((char *) de < base + size)
267 {
268 if (de->inode)
269 {
270 if (show_names)
271 {
272 int len = de->name_len;
273 char *name = de->name;
274 while (len--) printk("%c", *name++);
275 ext3fs_dirhash(de->name, de->name_len, &h);
276 printk(":%x.%u ", h.hash,
277 (unsigned) ((char *) de - base));
278 }
279 space += EXT3_DIR_REC_LEN(de->name_len);
280 names++;
281 }
282 de = ext3_next_entry(de);
283 }
284 printk("(%i)\n", names);
285 return (struct stats) { names, space, 1 };
286}
287
288struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
289 struct dx_entry *entries, int levels)
290{
291 unsigned blocksize = dir->i_sb->s_blocksize;
292 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
293 unsigned bcount = 0;
294 struct buffer_head *bh;
295 int err;
296 printk("%i indexed blocks...\n", count);
297 for (i = 0; i < count; i++, entries++)
298 {
299 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
300 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
301 struct stats stats;
302 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
303		if (!(bh = ext3_bread(NULL, dir, block, 0, &err))) continue;
304 stats = levels?
305 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
306 dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
307 names += stats.names;
308 space += stats.space;
309 bcount += stats.bcount;
310 brelse (bh);
311 }
312 if (bcount)
313 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
314 names, space/bcount,(space/bcount)*100/blocksize);
315 return (struct stats) { names, space, bcount};
316}
317#endif /* DX_DEBUG */
318
319/*
320 * Probe for a directory leaf block to search.
321 *
322 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
323 * error in the directory index, and the caller should fall back to
324 * searching the directory normally. The callers of dx_probe **MUST**
325 * check for this error code, and make sure it never gets reflected
326 * back to userspace.
327 */
328static struct dx_frame *
329dx_probe(struct qstr *entry, struct inode *dir,
330 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
331{
332 unsigned count, indirect;
333 struct dx_entry *at, *entries, *p, *q, *m;
334 struct dx_root *root;
335 struct buffer_head *bh;
336 struct dx_frame *frame = frame_in;
337 u32 hash;
338
339 frame->bh = NULL;
340 if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
341 *err = ERR_BAD_DX_DIR;
342 goto fail;
343 }
344 root = (struct dx_root *) bh->b_data;
345 if (root->info.hash_version != DX_HASH_TEA &&
346 root->info.hash_version != DX_HASH_HALF_MD4 &&
347 root->info.hash_version != DX_HASH_LEGACY) {
348 ext3_warning(dir->i_sb, __func__,
349 "Unrecognised inode hash code %d",
350 root->info.hash_version);
351 brelse(bh);
352 *err = ERR_BAD_DX_DIR;
353 goto fail;
354 }
355 hinfo->hash_version = root->info.hash_version;
356 if (hinfo->hash_version <= DX_HASH_TEA)
357 hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
358 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
359 if (entry)
360 ext3fs_dirhash(entry->name, entry->len, hinfo);
361 hash = hinfo->hash;
362
363 if (root->info.unused_flags & 1) {
364 ext3_warning(dir->i_sb, __func__,
365 "Unimplemented inode hash flags: %#06x",
366 root->info.unused_flags);
367 brelse(bh);
368 *err = ERR_BAD_DX_DIR;
369 goto fail;
370 }
371
372 if ((indirect = root->info.indirect_levels) > 1) {
373 ext3_warning(dir->i_sb, __func__,
374 "Unimplemented inode hash depth: %#06x",
375 root->info.indirect_levels);
376 brelse(bh);
377 *err = ERR_BAD_DX_DIR;
378 goto fail;
379 }
380
381 entries = (struct dx_entry *) (((char *)&root->info) +
382 root->info.info_length);
383
384 if (dx_get_limit(entries) != dx_root_limit(dir,
385 root->info.info_length)) {
386 ext3_warning(dir->i_sb, __func__,
387 "dx entry: limit != root limit");
388 brelse(bh);
389 *err = ERR_BAD_DX_DIR;
390 goto fail;
391 }
392
393 dxtrace (printk("Look up %x", hash));
394 while (1)
395 {
396 count = dx_get_count(entries);
397 if (!count || count > dx_get_limit(entries)) {
398 ext3_warning(dir->i_sb, __func__,
399 "dx entry: no count or count > limit");
400 brelse(bh);
401 *err = ERR_BAD_DX_DIR;
402 goto fail2;
403 }
404
405 p = entries + 1;
406 q = entries + count - 1;
407 while (p <= q)
408 {
409 m = p + (q - p)/2;
410 dxtrace(printk("."));
411 if (dx_get_hash(m) > hash)
412 q = m - 1;
413 else
414 p = m + 1;
415 }
416
417 if (0) // linear search cross check
418 {
419 unsigned n = count - 1;
420 at = entries;
421 while (n--)
422 {
423 dxtrace(printk(","));
424 if (dx_get_hash(++at) > hash)
425 {
426 at--;
427 break;
428 }
429 }
430 assert (at == p - 1);
431 }
432
433 at = p - 1;
434 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
435 frame->bh = bh;
436 frame->entries = entries;
437 frame->at = at;
438 if (!indirect--) return frame;
439 if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
440 *err = ERR_BAD_DX_DIR;
441 goto fail2;
442 }
443 at = entries = ((struct dx_node *) bh->b_data)->entries;
444 if (dx_get_limit(entries) != dx_node_limit (dir)) {
445 ext3_warning(dir->i_sb, __func__,
446 "dx entry: limit != node limit");
447 brelse(bh);
448 *err = ERR_BAD_DX_DIR;
449 goto fail2;
450 }
451 frame++;
452 frame->bh = NULL;
453 }
454fail2:
455 while (frame >= frame_in) {
456 brelse(frame->bh);
457 frame--;
458 }
459fail:
460 if (*err == ERR_BAD_DX_DIR)
461 ext3_warning(dir->i_sb, __func__,
462 "Corrupt dir inode %ld, running e2fsck is "
463 "recommended.", dir->i_ino);
464 return NULL;
465}
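
The interior-node lookup above is a lower-bound binary search over slots 1..count-1; entry 0 carries no hash and catches every key below entries[1].hash. A standalone sketch of the same invariant (illustrative only, not from this file):

/*
 * Return the index of the last slot whose hash is <= key, i.e. the
 * child block dx_probe() descends into. Slot 0 covers everything
 * below hashes[1], so the search starts at 1.
 */
static unsigned dx_lookup(const unsigned *hashes, unsigned count, unsigned key)
{
        unsigned lo = 1, hi = count - 1;

        while (lo <= hi) {
                unsigned mid = lo + (hi - lo) / 2;

                if (hashes[mid] > key)
                        hi = mid - 1;
                else
                        lo = mid + 1;
        }
        return lo - 1;
}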
466
467static void dx_release (struct dx_frame *frames)
468{
469 if (frames[0].bh == NULL)
470 return;
471
472 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
473 brelse(frames[1].bh);
474 brelse(frames[0].bh);
475}
476
477/*
478 * This function increments the frame pointer to search the next leaf
479 * block, and reads in the necessary intervening nodes if the search
480 * is to continue. Whether or not the search continues is
481 * controlled by the hash parameter. If the hash value is even, then
482 * the search is only continued if the next block starts with that
483 * hash value. This is used if we are searching for a specific file.
484 *
485 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
486 *
487 * This function returns 1 if the caller should continue to search,
488 * or 0 if it should not. If there is an error reading one of the
489 * index blocks, it will return a negative error code.
490 *
491 * If start_hash is non-null, it will be filled in with the starting
492 * hash of the next page.
493 */
494static int ext3_htree_next_block(struct inode *dir, __u32 hash,
495 struct dx_frame *frame,
496 struct dx_frame *frames,
497 __u32 *start_hash)
498{
499 struct dx_frame *p;
500 struct buffer_head *bh;
501 int err, num_frames = 0;
502 __u32 bhash;
503
504 p = frame;
505 /*
506 * Find the next leaf page by incrementing the frame pointer.
507 * If we run out of entries in the interior node, loop around and
508	 * increment the pointer in the parent node. When we break out of
509	 * this loop, num_frames indicates the number of interior
510	 * nodes that need to be read.
511 */
512 while (1) {
513 if (++(p->at) < p->entries + dx_get_count(p->entries))
514 break;
515 if (p == frames)
516 return 0;
517 num_frames++;
518 p--;
519 }
520
521 /*
522 * If the hash is 1, then continue only if the next page has a
523 * continuation hash of any value. This is used for readdir
524 * handling. Otherwise, check to see if the hash matches the
525	 * desired continuation hash. If it doesn't, return since
526	 * there's no point in reading the successive index pages.
527 */
528 bhash = dx_get_hash(p->at);
529 if (start_hash)
530 *start_hash = bhash;
531 if ((hash & 1) == 0) {
532 if ((bhash & ~1) != hash)
533 return 0;
534 }
535 /*
536 * If the hash is HASH_NB_ALWAYS, we always go to the next
537 * block so no check is necessary
538 */
539 while (num_frames--) {
540 if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
541 0, &err)))
542 return err; /* Failure */
543 p++;
544 brelse (p->bh);
545 p->bh = bh;
546 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
547 }
548 return 1;
549}
550
551
552/*
553 * This function fills a red-black tree with information from a
554 * directory block. It returns the number of directory entries loaded
555 * into the tree. If there is an error it is returned in err.
556 */
557static int htree_dirblock_to_tree(struct file *dir_file,
558 struct inode *dir, int block,
559 struct dx_hash_info *hinfo,
560 __u32 start_hash, __u32 start_minor_hash)
561{
562 struct buffer_head *bh;
563 struct ext3_dir_entry_2 *de, *top;
564 int err = 0, count = 0;
565
566 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
567
568 if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
569 return err;
570
571 de = (struct ext3_dir_entry_2 *) bh->b_data;
572 top = (struct ext3_dir_entry_2 *) ((char *) de +
573 dir->i_sb->s_blocksize -
574 EXT3_DIR_REC_LEN(0));
575 for (; de < top; de = ext3_next_entry(de)) {
576 if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
577 (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
578 +((char *)de - bh->b_data))) {
579 /* silently ignore the rest of the block */
580 break;
581 }
582 ext3fs_dirhash(de->name, de->name_len, hinfo);
583 if ((hinfo->hash < start_hash) ||
584 ((hinfo->hash == start_hash) &&
585 (hinfo->minor_hash < start_minor_hash)))
586 continue;
587 if (de->inode == 0)
588 continue;
589 if ((err = ext3_htree_store_dirent(dir_file,
590 hinfo->hash, hinfo->minor_hash, de)) != 0) {
591 brelse(bh);
592 return err;
593 }
594 count++;
595 }
596 brelse(bh);
597 return count;
598}
599
600
601/*
602 * This function fills a red-black tree with information from a
603 * directory. We start scanning the directory in hash order, starting
604 * at start_hash and start_minor_hash.
605 *
606 * This function returns the number of entries inserted into the tree,
607 * or a negative error code.
608 */
609int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
610 __u32 start_minor_hash, __u32 *next_hash)
611{
612 struct dx_hash_info hinfo;
613 struct ext3_dir_entry_2 *de;
614 struct dx_frame frames[2], *frame;
615 struct inode *dir;
616 int block, err;
617 int count = 0;
618 int ret;
619 __u32 hashval;
620
621 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
622 start_minor_hash));
623 dir = file_inode(dir_file);
624 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
625 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
626 if (hinfo.hash_version <= DX_HASH_TEA)
627 hinfo.hash_version +=
628 EXT3_SB(dir->i_sb)->s_hash_unsigned;
629 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
630 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
631 start_hash, start_minor_hash);
632 *next_hash = ~0;
633 return count;
634 }
635 hinfo.hash = start_hash;
636 hinfo.minor_hash = 0;
637 frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
638 if (!frame)
639 return err;
640
641 /* Add '.' and '..' from the htree header */
642 if (!start_hash && !start_minor_hash) {
643 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
644 if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
645 goto errout;
646 count++;
647 }
648	if (start_hash < 2 || (start_hash == 2 && start_minor_hash == 0)) {
649 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
650 de = ext3_next_entry(de);
651 if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
652 goto errout;
653 count++;
654 }
655
656 while (1) {
657 block = dx_get_block(frame->at);
658 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
659 start_hash, start_minor_hash);
660 if (ret < 0) {
661 err = ret;
662 goto errout;
663 }
664 count += ret;
665 hashval = ~0;
666 ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
667 frame, frames, &hashval);
668 *next_hash = hashval;
669 if (ret < 0) {
670 err = ret;
671 goto errout;
672 }
673 /*
674 * Stop if: (a) there are no more entries, or
675 * (b) we have inserted at least one entry and the
676 * next hash value is not a continuation
677 */
678 if ((ret == 0) ||
679 (count && ((hashval & 1) == 0)))
680 break;
681 }
682 dx_release(frames);
683 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
684 count, *next_hash));
685 return count;
686errout:
687 dx_release(frames);
688 return (err);
689}
690
691
692/*
693 * Directory block splitting, compacting
694 */
695
696/*
697 * Create map of hash values, offsets, and sizes, stored at end of block.
698 * Returns number of entries mapped.
699 */
700static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
701 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
702{
703 int count = 0;
704 char *base = (char *) de;
705 struct dx_hash_info h = *hinfo;
706
707 while ((char *) de < base + blocksize)
708 {
709 if (de->name_len && de->inode) {
710 ext3fs_dirhash(de->name, de->name_len, &h);
711 map_tail--;
712 map_tail->hash = h.hash;
713 map_tail->offs = (u16) ((char *) de - base);
714 map_tail->size = le16_to_cpu(de->rec_len);
715 count++;
716 cond_resched();
717 }
718 /* XXX: do we need to check rec_len == 0 case? -Chris */
719 de = ext3_next_entry(de);
720 }
721 return count;
722}
723
724/* Sort map by hash value */
725static void dx_sort_map (struct dx_map_entry *map, unsigned count)
726{
727 struct dx_map_entry *p, *q, *top = map + count - 1;
728 int more;
729 /* Combsort until bubble sort doesn't suck */
730 while (count > 2)
731 {
732 count = count*10/13;
733 if (count - 9 < 2) /* 9, 10 -> 11 */
734 count = 11;
735 for (p = top, q = p - count; q >= map; p--, q--)
736 if (p->hash < q->hash)
737 swap(*p, *q);
738 }
739 /* Garden variety bubble sort */
740 do {
741 more = 0;
742 q = top;
743 while (q-- > map)
744 {
745 if (q[1].hash >= q[0].hash)
746 continue;
747 swap(*(q+1), *q);
748 more = 1;
749 }
750 } while(more);
751}
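
For intuition (illustrative): because count is unsigned, the `count - 9 < 2' test fires only for 9 and 10, which the classic combsort rule bumps to 11. Starting from a 100-entry map the comb gaps run 76, 58, 44, 33, 25, 19, 14, 11, 8, 6, 4, 3, 2, after which the bubble pass finishes off the nearly-sorted remainder.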
752
753static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
754{
755 struct dx_entry *entries = frame->entries;
756 struct dx_entry *old = frame->at, *new = old + 1;
757 int count = dx_get_count(entries);
758
759 assert(count < dx_get_limit(entries));
760 assert(old < entries + count);
761 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
762 dx_set_hash(new, hash);
763 dx_set_block(new, block);
764 dx_set_count(entries, count + 1);
765}
766
767static void ext3_update_dx_flag(struct inode *inode)
768{
769 if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
770 EXT3_FEATURE_COMPAT_DIR_INDEX))
771 EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
772}
773
774/*
775 * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
776 *
777 * `len <= EXT3_NAME_LEN' is guaranteed by caller.
778 * `de != NULL' is guaranteed by caller.
779 */
780static inline int ext3_match (int len, const char * const name,
781 struct ext3_dir_entry_2 * de)
782{
783 if (len != de->name_len)
784 return 0;
785 if (!de->inode)
786 return 0;
787 return !memcmp(name, de->name, len);
788}
789
790/*
791 * Returns 0 if not found, -1 on failure, and 1 on success
792 */
793static inline int search_dirblock(struct buffer_head * bh,
794 struct inode *dir,
795 struct qstr *child,
796 unsigned long offset,
797 struct ext3_dir_entry_2 ** res_dir)
798{
799 struct ext3_dir_entry_2 * de;
800 char * dlimit;
801 int de_len;
802 const char *name = child->name;
803 int namelen = child->len;
804
805 de = (struct ext3_dir_entry_2 *) bh->b_data;
806 dlimit = bh->b_data + dir->i_sb->s_blocksize;
807 while ((char *) de < dlimit) {
808 /* this code is executed quadratically often */
809 /* do minimal checking `by hand' */
810
811 if ((char *) de + namelen <= dlimit &&
812 ext3_match (namelen, name, de)) {
813 /* found a match - just to be sure, do a full check */
814 if (!ext3_check_dir_entry("ext3_find_entry",
815 dir, de, bh, offset))
816 return -1;
817 *res_dir = de;
818 return 1;
819 }
820 /* prevent looping on a bad block */
821 de_len = ext3_rec_len_from_disk(de->rec_len);
822 if (de_len <= 0)
823 return -1;
824 offset += de_len;
825 de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
826 }
827 return 0;
828}
829
830
831/*
832 * ext3_find_entry()
833 *
834 * finds an entry in the specified directory with the wanted name. It
835 * returns the cache buffer in which the entry was found, and the entry
836 * itself (as a parameter - res_dir). It does NOT read the inode of the
837 * entry - you'll have to do that yourself if you want to.
838 *
839 * The returned buffer_head has ->b_count elevated. The caller is expected
840 * to brelse() it when appropriate.
841 */
842static struct buffer_head *ext3_find_entry(struct inode *dir,
843 struct qstr *entry,
844 struct ext3_dir_entry_2 **res_dir)
845{
846 struct super_block * sb;
847 struct buffer_head * bh_use[NAMEI_RA_SIZE];
848 struct buffer_head * bh, *ret = NULL;
849 unsigned long start, block, b;
850 const u8 *name = entry->name;
851 int ra_max = 0; /* Number of bh's in the readahead
852 buffer, bh_use[] */
853 int ra_ptr = 0; /* Current index into readahead
854 buffer */
855 int num = 0;
856 int nblocks, i, err;
857 int namelen;
858
859 *res_dir = NULL;
860 sb = dir->i_sb;
861 namelen = entry->len;
862 if (namelen > EXT3_NAME_LEN)
863 return NULL;
864 if ((namelen <= 2) && (name[0] == '.') &&
865 (name[1] == '.' || name[1] == 0)) {
866 /*
867 * "." or ".." will only be in the first block
868 * NFS may look up ".."; "." should be handled by the VFS
869 */
870 block = start = 0;
871 nblocks = 1;
872 goto restart;
873 }
874 if (is_dx(dir)) {
875 bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
876 /*
877 * On success, or if the error was file not found,
878 * return. Otherwise, fall back to doing a search the
879 * old fashioned way.
880 */
881 if (bh || (err != ERR_BAD_DX_DIR))
882 return bh;
883 dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
884 }
885 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
886 start = EXT3_I(dir)->i_dir_start_lookup;
887 if (start >= nblocks)
888 start = 0;
889 block = start;
890restart:
891 do {
892 /*
893 * We deal with the read-ahead logic here.
894 */
895 if (ra_ptr >= ra_max) {
896 /* Refill the readahead buffer */
897 ra_ptr = 0;
898 b = block;
899 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
900 /*
901 * Terminate if we reach the end of the
902 * directory and must wrap, or if our
903 * search has finished at this block.
904 */
905 if (b >= nblocks || (num && block == start)) {
906 bh_use[ra_max] = NULL;
907 break;
908 }
909 num++;
910 bh = ext3_getblk(NULL, dir, b++, 0, &err);
911 bh_use[ra_max] = bh;
912 if (bh && !bh_uptodate_or_lock(bh)) {
913 get_bh(bh);
914 bh->b_end_io = end_buffer_read_sync;
915 submit_bh(READ | REQ_META | REQ_PRIO,
916 bh);
917 }
918 }
919 }
920 if ((bh = bh_use[ra_ptr++]) == NULL)
921 goto next;
922 wait_on_buffer(bh);
923 if (!buffer_uptodate(bh)) {
924 /* read error, skip block & hope for the best */
925 ext3_error(sb, __func__, "reading directory #%lu "
926 "offset %lu", dir->i_ino, block);
927 brelse(bh);
928 goto next;
929 }
930 i = search_dirblock(bh, dir, entry,
931 block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
932 if (i == 1) {
933 EXT3_I(dir)->i_dir_start_lookup = block;
934 ret = bh;
935 goto cleanup_and_exit;
936 } else {
937 brelse(bh);
938 if (i < 0)
939 goto cleanup_and_exit;
940 }
941 next:
942 if (++block >= nblocks)
943 block = 0;
944 } while (block != start);
945
946 /*
947 * If the directory has grown while we were searching, then
948 * search the last part of the directory before giving up.
949 */
950 block = nblocks;
951 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
952 if (block < nblocks) {
953 start = 0;
954 goto restart;
955 }
956
957cleanup_and_exit:
958 /* Clean up the read-ahead blocks */
959 for (; ra_ptr < ra_max; ra_ptr++)
960 brelse (bh_use[ra_ptr]);
961 return ret;
962}
963
964static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
965 struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
966 int *err)
967{
968 struct super_block *sb = dir->i_sb;
969 struct dx_hash_info hinfo;
970 struct dx_frame frames[2], *frame;
971 struct buffer_head *bh;
972 unsigned long block;
973 int retval;
974
975 if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
976 return NULL;
977 do {
978 block = dx_get_block(frame->at);
979 if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
980 goto errout;
981
982 retval = search_dirblock(bh, dir, entry,
983 block << EXT3_BLOCK_SIZE_BITS(sb),
984 res_dir);
985 if (retval == 1) {
986 dx_release(frames);
987 return bh;
988 }
989 brelse(bh);
990 if (retval == -1) {
991 *err = ERR_BAD_DX_DIR;
992 goto errout;
993 }
994
995 /* Check to see if we should continue to search */
996 retval = ext3_htree_next_block(dir, hinfo.hash, frame,
997 frames, NULL);
998 if (retval < 0) {
999 ext3_warning(sb, __func__,
1000 "error reading index page in directory #%lu",
1001 dir->i_ino);
1002 *err = retval;
1003 goto errout;
1004 }
1005 } while (retval == 1);
1006
1007 *err = -ENOENT;
1008errout:
1009 dxtrace(printk("%s not found\n", entry->name));
1010 dx_release (frames);
1011 return NULL;
1012}
1013
1014static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
1015{
1016 struct inode * inode;
1017 struct ext3_dir_entry_2 * de;
1018 struct buffer_head * bh;
1019
1020 if (dentry->d_name.len > EXT3_NAME_LEN)
1021 return ERR_PTR(-ENAMETOOLONG);
1022
1023 bh = ext3_find_entry(dir, &dentry->d_name, &de);
1024 inode = NULL;
1025 if (bh) {
1026 unsigned long ino = le32_to_cpu(de->inode);
1027 brelse (bh);
1028 if (!ext3_valid_inum(dir->i_sb, ino)) {
1029 ext3_error(dir->i_sb, "ext3_lookup",
1030 "bad inode number: %lu", ino);
1031 return ERR_PTR(-EIO);
1032 }
1033 inode = ext3_iget(dir->i_sb, ino);
1034 if (inode == ERR_PTR(-ESTALE)) {
1035 ext3_error(dir->i_sb, __func__,
1036 "deleted inode referenced: %lu",
1037 ino);
1038 return ERR_PTR(-EIO);
1039 }
1040 }
1041 return d_splice_alias(inode, dentry);
1042}
1043
1044
1045struct dentry *ext3_get_parent(struct dentry *child)
1046{
1047 unsigned long ino;
1048 struct qstr dotdot = QSTR_INIT("..", 2);
1049 struct ext3_dir_entry_2 * de;
1050 struct buffer_head *bh;
1051
1052 bh = ext3_find_entry(d_inode(child), &dotdot, &de);
1053 if (!bh)
1054 return ERR_PTR(-ENOENT);
1055 ino = le32_to_cpu(de->inode);
1056 brelse(bh);
1057
1058 if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) {
1059 ext3_error(d_inode(child)->i_sb, "ext3_get_parent",
1060 "bad inode number: %lu", ino);
1061 return ERR_PTR(-EIO);
1062 }
1063
1064 return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino));
1065}
1066
1067#define S_SHIFT 12
1068static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
1069 [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE,
1070 [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR,
1071 [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV,
1072 [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV,
1073 [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO,
1074 [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK,
1075 [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK,
1076};
1077
1078static inline void ext3_set_de_type(struct super_block *sb,
1079 struct ext3_dir_entry_2 *de,
1080 umode_t mode) {
1081 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
1082 de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1083}
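
A worked instance (illustrative): S_IFREG is 0x8000, so (mode & S_IFMT) >> 12 selects slot 8 of the table and a regular file's dirent gets file_type = EXT3_FT_REG_FILE; S_IFDIR (0x4000) selects slot 4, yielding EXT3_FT_DIR. Without EXT3_FEATURE_INCOMPAT_FILETYPE the byte is simply left untouched.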
1084
1085/*
1086 * Move count entries from end of map between two memory locations.
1087 * Returns pointer to last entry moved.
1088 */
1089static struct ext3_dir_entry_2 *
1090dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1091{
1092 unsigned rec_len = 0;
1093
1094 while (count--) {
1095 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
1096 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1097 memcpy (to, de, rec_len);
1098 ((struct ext3_dir_entry_2 *) to)->rec_len =
1099 ext3_rec_len_to_disk(rec_len);
1100 de->inode = 0;
1101 map++;
1102 to += rec_len;
1103 }
1104 return (struct ext3_dir_entry_2 *) (to - rec_len);
1105}
1106
1107/*
1108 * Compact each dir entry in the range to the minimal rec_len.
1109 * Returns pointer to last entry in range.
1110 */
1111static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
1112{
1113 struct ext3_dir_entry_2 *next, *to, *prev;
1114 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
1115 unsigned rec_len = 0;
1116
1117 prev = to = de;
1118 while ((char *)de < base + blocksize) {
1119 next = ext3_next_entry(de);
1120 if (de->inode && de->name_len) {
1121 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1122 if (de > to)
1123 memmove(to, de, rec_len);
1124 to->rec_len = ext3_rec_len_to_disk(rec_len);
1125 prev = to;
1126 to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
1127 }
1128 de = next;
1129 }
1130 return prev;
1131}
1132
1133/*
1134 * Split a full leaf block to make room for a new dir entry.
1135 * Allocate a new block, and move entries so that they are approx. equally full.
1136 * Returns pointer to de in block into which the new entry will be inserted.
1137 */
1138static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1139 struct buffer_head **bh,struct dx_frame *frame,
1140 struct dx_hash_info *hinfo, int *error)
1141{
1142 unsigned blocksize = dir->i_sb->s_blocksize;
1143 unsigned count, continued;
1144 struct buffer_head *bh2;
1145 u32 newblock;
1146 u32 hash2;
1147 struct dx_map_entry *map;
1148 char *data1 = (*bh)->b_data, *data2;
1149 unsigned split, move, size;
1150 struct ext3_dir_entry_2 *de = NULL, *de2;
1151 int err = 0, i;
1152
1153 bh2 = ext3_append (handle, dir, &newblock, &err);
1154 if (!(bh2)) {
1155 brelse(*bh);
1156 *bh = NULL;
1157 goto errout;
1158 }
1159
1160 BUFFER_TRACE(*bh, "get_write_access");
1161 err = ext3_journal_get_write_access(handle, *bh);
1162 if (err)
1163 goto journal_error;
1164
1165 BUFFER_TRACE(frame->bh, "get_write_access");
1166 err = ext3_journal_get_write_access(handle, frame->bh);
1167 if (err)
1168 goto journal_error;
1169
1170 data2 = bh2->b_data;
1171
1172	/* create the map at the end of the data2 block */
1173 map = (struct dx_map_entry *) (data2 + blocksize);
1174 count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
1175 blocksize, hinfo, map);
1176 map -= count;
1177 dx_sort_map (map, count);
1178 /* Split the existing block in the middle, size-wise */
1179 size = 0;
1180 move = 0;
1181 for (i = count-1; i >= 0; i--) {
1182 /* is more than half of this entry in 2nd half of the block? */
1183 if (size + map[i].size/2 > blocksize/2)
1184 break;
1185 size += map[i].size;
1186 move++;
1187 }
1188 /* map index at which we will split */
1189 split = count - move;
1190 hash2 = map[split].hash;
1191 continued = hash2 == map[split - 1].hash;
1192 dxtrace(printk("Split block %i at %x, %i/%i\n",
1193 dx_get_block(frame->at), hash2, split, count-split));
1194
1195 /* Fancy dance to stay within two buffers */
1196 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1197 de = dx_pack_dirents(data1,blocksize);
1198 de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1199 de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
1200 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
1201 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
1202
1203 /* Which block gets the new entry? */
1204 if (hinfo->hash >= hash2)
1205 {
1206 swap(*bh, bh2);
1207 de = de2;
1208 }
1209 dx_insert_block (frame, hash2 + continued, newblock);
1210 err = ext3_journal_dirty_metadata (handle, bh2);
1211 if (err)
1212 goto journal_error;
1213 err = ext3_journal_dirty_metadata (handle, frame->bh);
1214 if (err)
1215 goto journal_error;
1216 brelse (bh2);
1217 dxtrace(dx_show_index ("frame", frame->entries));
1218 return de;
1219
1220journal_error:
1221 brelse(*bh);
1222 brelse(bh2);
1223 *bh = NULL;
1224 ext3_std_error(dir->i_sb, err);
1225errout:
1226 *error = err;
1227 return NULL;
1228}
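
/*
 * A standalone restatement of the split-point choice in do_split()
 * above: the map entries are hash-sorted, and we walk from the tail,
 * earmarking entries for the new block until roughly half the block's
 * bytes would stay behind.  Names and types here are illustrative,
 * not part of the original driver.
 */
struct map_ent_sketch { unsigned hash; unsigned size; };

static unsigned pick_split(const struct map_ent_sketch *map, unsigned count,
			   unsigned blocksize)
{
	unsigned size = 0, move = 0;
	int i;

	for (i = count - 1; i >= 0; i--) {
		/* is more than half of this entry in the 2nd half? */
		if (size + map[i].size / 2 > blocksize / 2)
			break;
		size += map[i].size;
		move++;
	}
	return count - move;	/* map index at which to split */
}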
1229
1230
1231/*
1232 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1233 * it points to a directory entry which is guaranteed to be large
1234 * enough for the new directory entry.  If de is NULL, then
1235 * add_dirent_to_buf will attempt to search the directory block for
1236 * space.  It will return -ENOSPC if no space is available, -EIO on
1237 * a corrupted directory block, and -EEXIST if the entry already exists.
1238 *
1239 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1240 * all other cases bh is released.
1241 */
1242static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1243 struct inode *inode, struct ext3_dir_entry_2 *de,
1244 struct buffer_head * bh)
1245{
1246 struct inode *dir = d_inode(dentry->d_parent);
1247 const char *name = dentry->d_name.name;
1248 int namelen = dentry->d_name.len;
1249 unsigned long offset = 0;
1250 unsigned short reclen;
1251 int nlen, rlen, err;
1252 char *top;
1253
1254 reclen = EXT3_DIR_REC_LEN(namelen);
1255 if (!de) {
1256 de = (struct ext3_dir_entry_2 *)bh->b_data;
1257 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1258 while ((char *) de <= top) {
1259 if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
1260 bh, offset)) {
1261 brelse (bh);
1262 return -EIO;
1263 }
1264 if (ext3_match (namelen, name, de)) {
1265 brelse (bh);
1266 return -EEXIST;
1267 }
1268 nlen = EXT3_DIR_REC_LEN(de->name_len);
1269 rlen = ext3_rec_len_from_disk(de->rec_len);
1270 if ((de->inode? rlen - nlen: rlen) >= reclen)
1271 break;
1272 de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
1273 offset += rlen;
1274 }
1275 if ((char *) de > top)
1276 return -ENOSPC;
1277 }
1278 BUFFER_TRACE(bh, "get_write_access");
1279 err = ext3_journal_get_write_access(handle, bh);
1280 if (err) {
1281 ext3_std_error(dir->i_sb, err);
1282 brelse(bh);
1283 return err;
1284 }
1285
1286 /* By now the buffer is marked for journaling */
1287 nlen = EXT3_DIR_REC_LEN(de->name_len);
1288 rlen = ext3_rec_len_from_disk(de->rec_len);
1289 if (de->inode) {
1290 struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
1291 de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
1292 de->rec_len = ext3_rec_len_to_disk(nlen);
1293 de = de1;
1294 }
1295 de->file_type = EXT3_FT_UNKNOWN;
1296 if (inode) {
1297 de->inode = cpu_to_le32(inode->i_ino);
1298 ext3_set_de_type(dir->i_sb, de, inode->i_mode);
1299 } else
1300 de->inode = 0;
1301 de->name_len = namelen;
1302 memcpy (de->name, name, namelen);
1303 /*
1304 * XXX shouldn't update any times until successful
1305 * completion of syscall, but too many callers depend
1306 * on this.
1307 *
1308 * XXX similarly, too many callers depend on
1309 * ext3_new_inode() setting the times, but error
1310 * recovery deletes the inode, so the worst that can
1311 * happen is that the times are slightly out of date
1312 * and/or different from the directory change time.
1313 */
1314 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1315 ext3_update_dx_flag(dir);
1316 dir->i_version++;
1317 ext3_mark_inode_dirty(handle, dir);
1318 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1319 err = ext3_journal_dirty_metadata(handle, bh);
1320 if (err)
1321 ext3_std_error(dir->i_sb, err);
1322 brelse(bh);
1323 return 0;
1324}
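
/*
 * A sketch of the space test used in the scan above: a live entry of
 * name length n occupies EXT3_DIR_REC_LEN(n) bytes but may own a larger
 * rec_len; the trailing slack (or the whole record, if the entry is
 * unused) must fit the new record.  The rounding macro mirrors the
 * usual ext3 definition and is an assumption here; illustrative only.
 */
#define REC_LEN_SKETCH(name_len) (((name_len) + 8 + 3) & ~3u)

static int entry_has_room(unsigned inode, unsigned name_len,
			  unsigned rec_len, unsigned needed)
{
	unsigned used = REC_LEN_SKETCH(name_len);

	/* live entry: only the slack after it is free; dead entry: all of it */
	return (inode ? rec_len - used : rec_len) >= needed;
}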
1325
1326/*
1327 * This converts a one block unindexed directory to a 3 block indexed
1328 * directory, and adds the dentry to the indexed directory.
1329 */
1330static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1331 struct inode *inode, struct buffer_head *bh)
1332{
1333 struct inode *dir = d_inode(dentry->d_parent);
1334 const char *name = dentry->d_name.name;
1335 int namelen = dentry->d_name.len;
1336 struct buffer_head *bh2;
1337 struct dx_root *root;
1338 struct dx_frame frames[2], *frame;
1339 struct dx_entry *entries;
1340 struct ext3_dir_entry_2 *de, *de2;
1341 char *data1, *top;
1342 unsigned len;
1343 int retval;
1344 unsigned blocksize;
1345 struct dx_hash_info hinfo;
1346 u32 block;
1347 struct fake_dirent *fde;
1348
1349 blocksize = dir->i_sb->s_blocksize;
1350 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1351 retval = ext3_journal_get_write_access(handle, bh);
1352 if (retval) {
1353 ext3_std_error(dir->i_sb, retval);
1354 brelse(bh);
1355 return retval;
1356 }
1357 root = (struct dx_root *) bh->b_data;
1358
1359 /* The 0th block becomes the root, move the dirents out */
1360 fde = &root->dotdot;
1361 de = (struct ext3_dir_entry_2 *)((char *)fde +
1362 ext3_rec_len_from_disk(fde->rec_len));
1363 if ((char *) de >= (((char *) root) + blocksize)) {
1364 ext3_error(dir->i_sb, __func__,
1365 "invalid rec_len for '..' in inode %lu",
1366 dir->i_ino);
1367 brelse(bh);
1368 return -EIO;
1369 }
1370 len = ((char *) root) + blocksize - (char *) de;
1371
1372 bh2 = ext3_append (handle, dir, &block, &retval);
1373 if (!(bh2)) {
1374 brelse(bh);
1375 return retval;
1376 }
1377 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1378 data1 = bh2->b_data;
1379
1380 memcpy (data1, de, len);
1381 de = (struct ext3_dir_entry_2 *) data1;
1382 top = data1 + len;
1383 while ((char *)(de2 = ext3_next_entry(de)) < top)
1384 de = de2;
1385 de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1386 /* Initialize the root; the dot dirents already exist */
1387 de = (struct ext3_dir_entry_2 *) (&root->dotdot);
1388 de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
1389 memset (&root->info, 0, sizeof(root->info));
1390 root->info.info_length = sizeof(root->info);
1391 root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
1392 entries = root->entries;
1393 dx_set_block (entries, 1);
1394 dx_set_count (entries, 1);
1395 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1396
1397 /* Initialize as for dx_probe */
1398 hinfo.hash_version = root->info.hash_version;
1399 if (hinfo.hash_version <= DX_HASH_TEA)
1400 hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1401 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1402 ext3fs_dirhash(name, namelen, &hinfo);
1403 frame = frames;
1404 frame->entries = entries;
1405 frame->at = entries;
1406 frame->bh = bh;
1407 bh = bh2;
1408 /*
1409 * Mark buffers dirty here so that if do_split() fails we write a
1410 * consistent set of buffers to disk.
1411 */
1412 ext3_journal_dirty_metadata(handle, frame->bh);
1413 ext3_journal_dirty_metadata(handle, bh);
1414 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1415 if (!de) {
1416 ext3_mark_inode_dirty(handle, dir);
1417 dx_release(frames);
1418 return retval;
1419 }
1420 dx_release(frames);
1421
1422 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1423}
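
/*
 * After the conversion above, block 0 keeps only ".", "..", the
 * dx_root_info header and the index entries; the old dirents move to
 * block 1 and do_split() fills block 2.  A sketch of the resulting
 * root fanout, assuming 12-byte "."/".." records and an 8-byte info
 * header as in ext3 (for 4K blocks this gives (4096 - 32) / 8 = 508
 * index slots); illustrative only.
 */
static unsigned dx_root_limit_sketch(unsigned blocksize)
{
	unsigned dot    = (1 + 8 + 3) & ~3u;	/* "."  record -> 12 bytes */
	unsigned dotdot = (2 + 8 + 3) & ~3u;	/* ".." record -> 12 bytes */
	unsigned info   = 8;			/* dx_root_info */

	return (blocksize - dot - dotdot - info) / 8;	/* 8-byte dx_entries */
}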
1424
1425/*
1426 * ext3_add_entry()
1427 *
1428 * adds a file entry to the specified directory, using the same
1429 * semantics as ext3_find_entry().  Returns 0 on success, a negative errno on failure.
1430 *
1431 * NOTE!! The inode part of 'de' is left at 0 - which means you
1432 * may not sleep between calling this and putting something into
1433 * the entry, as someone else might have used it while you slept.
1434 */
1435static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1436 struct inode *inode)
1437{
1438 struct inode *dir = d_inode(dentry->d_parent);
1439 struct buffer_head * bh;
1440 struct ext3_dir_entry_2 *de;
1441 struct super_block * sb;
1442 int retval;
1443 int dx_fallback=0;
1444 unsigned blocksize;
1445 u32 block, blocks;
1446
1447 sb = dir->i_sb;
1448 blocksize = sb->s_blocksize;
1449 if (!dentry->d_name.len)
1450 return -EINVAL;
1451 if (is_dx(dir)) {
1452 retval = ext3_dx_add_entry(handle, dentry, inode);
1453 if (!retval || (retval != ERR_BAD_DX_DIR))
1454 return retval;
1455 EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
1456 dx_fallback++;
1457 ext3_mark_inode_dirty(handle, dir);
1458 }
1459 blocks = dir->i_size >> sb->s_blocksize_bits;
1460 for (block = 0; block < blocks; block++) {
1461 if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
1462 return retval;
1463
1464 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1465 if (retval != -ENOSPC)
1466 return retval;
1467
1468 if (blocks == 1 && !dx_fallback &&
1469 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
1470 return make_indexed_dir(handle, dentry, inode, bh);
1471 brelse(bh);
1472 }
1473 bh = ext3_append(handle, dir, &block, &retval);
1474 if (!bh)
1475 return retval;
1476 de = (struct ext3_dir_entry_2 *) bh->b_data;
1477 de->inode = 0;
1478 de->rec_len = ext3_rec_len_to_disk(blocksize);
1479 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1480}
1481
1482/*
1483 * Returns 0 for success, or a negative error value
1484 */
1485static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1486 struct inode *inode)
1487{
1488 struct dx_frame frames[2], *frame;
1489 struct dx_entry *entries, *at;
1490 struct dx_hash_info hinfo;
1491 struct buffer_head * bh;
1492 struct inode *dir = d_inode(dentry->d_parent);
1493 struct super_block * sb = dir->i_sb;
1494 struct ext3_dir_entry_2 *de;
1495 int err;
1496
1497 frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1498 if (!frame)
1499 return err;
1500 entries = frame->entries;
1501 at = frame->at;
1502
1503 if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
1504 goto cleanup;
1505
1506 BUFFER_TRACE(bh, "get_write_access");
1507 err = ext3_journal_get_write_access(handle, bh);
1508 if (err)
1509 goto journal_error;
1510
1511 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1512 if (err != -ENOSPC) {
1513 bh = NULL;
1514 goto cleanup;
1515 }
1516
1517 /* Block full, should compress but for now just split */
1518 dxtrace(printk("using %u of %u node entries\n",
1519 dx_get_count(entries), dx_get_limit(entries)));
1520 /* Need to split index? */
1521 if (dx_get_count(entries) == dx_get_limit(entries)) {
1522 u32 newblock;
1523 unsigned icount = dx_get_count(entries);
1524 int levels = frame - frames;
1525 struct dx_entry *entries2;
1526 struct dx_node *node2;
1527 struct buffer_head *bh2;
1528
1529 if (levels && (dx_get_count(frames->entries) ==
1530 dx_get_limit(frames->entries))) {
1531 ext3_warning(sb, __func__,
1532 "Directory index full!");
1533 err = -ENOSPC;
1534 goto cleanup;
1535 }
1536 bh2 = ext3_append (handle, dir, &newblock, &err);
1537 if (!(bh2))
1538 goto cleanup;
1539 node2 = (struct dx_node *)(bh2->b_data);
1540 entries2 = node2->entries;
1541 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1542 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1543 BUFFER_TRACE(frame->bh, "get_write_access");
1544 err = ext3_journal_get_write_access(handle, frame->bh);
1545 if (err)
1546 goto journal_error;
1547 if (levels) {
1548 unsigned icount1 = icount/2, icount2 = icount - icount1;
1549 unsigned hash2 = dx_get_hash(entries + icount1);
1550 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1551
1552			BUFFER_TRACE(frames[0].bh, "get_write_access"); /* index root */
1553 err = ext3_journal_get_write_access(handle,
1554 frames[0].bh);
1555 if (err)
1556 goto journal_error;
1557
1558 memcpy ((char *) entries2, (char *) (entries + icount1),
1559 icount2 * sizeof(struct dx_entry));
1560 dx_set_count (entries, icount1);
1561 dx_set_count (entries2, icount2);
1562 dx_set_limit (entries2, dx_node_limit(dir));
1563
1564 /* Which index block gets the new entry? */
1565 if (at - entries >= icount1) {
1566 frame->at = at = at - entries - icount1 + entries2;
1567 frame->entries = entries = entries2;
1568 swap(frame->bh, bh2);
1569 }
1570 dx_insert_block (frames + 0, hash2, newblock);
1571 dxtrace(dx_show_index ("node", frames[1].entries));
1572 dxtrace(dx_show_index ("node",
1573 ((struct dx_node *) bh2->b_data)->entries));
1574 err = ext3_journal_dirty_metadata(handle, bh2);
1575 if (err)
1576 goto journal_error;
1577 brelse (bh2);
1578 } else {
1579 dxtrace(printk("Creating second level index...\n"));
1580 memcpy((char *) entries2, (char *) entries,
1581 icount * sizeof(struct dx_entry));
1582 dx_set_limit(entries2, dx_node_limit(dir));
1583
1584 /* Set up root */
1585 dx_set_count(entries, 1);
1586 dx_set_block(entries + 0, newblock);
1587 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1588
1589 /* Add new access path frame */
1590 frame = frames + 1;
1591 frame->at = at = at - entries + entries2;
1592 frame->entries = entries = entries2;
1593 frame->bh = bh2;
1594 err = ext3_journal_get_write_access(handle,
1595 frame->bh);
1596 if (err)
1597 goto journal_error;
1598 }
1599 err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1600 if (err)
1601 goto journal_error;
1602 }
1603 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1604 if (!de)
1605 goto cleanup;
1606 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1607 bh = NULL;
1608 goto cleanup;
1609
1610journal_error:
1611 ext3_std_error(dir->i_sb, err);
1612cleanup:
1613 if (bh)
1614 brelse(bh);
1615 dx_release(frames);
1616 return err;
1617}
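
/*
 * A back-of-the-envelope sketch of why the "Directory index full!"
 * case above is rare: each dx_entry is 8 bytes (hash + block), so with
 * 4K blocks an interior node holds about 511 entries and the root
 * about 508, i.e. two index levels address roughly a quarter million
 * leaf blocks.  The overhead figures are assumptions; exact limits
 * come from dx_root_limit()/dx_node_limit().
 */
static unsigned long htree_capacity_estimate(unsigned blocksize,
					     unsigned avg_rec_len)
{
	unsigned long root = (blocksize - 32) / 8;	/* assumed root overhead */
	unsigned long node = (blocksize - 8) / 8;	/* fake dirent header */
	unsigned long per_leaf = blocksize / avg_rec_len;

	return root * node * per_leaf;	/* max entries with 2 index levels */
}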
1618
1619/*
1620 * ext3_delete_entry deletes a directory entry by merging it with the
1621 * previous entry
1622 */
1623static int ext3_delete_entry (handle_t *handle,
1624 struct inode * dir,
1625 struct ext3_dir_entry_2 * de_del,
1626 struct buffer_head * bh)
1627{
1628 struct ext3_dir_entry_2 * de, * pde;
1629 int i;
1630
1631 i = 0;
1632 pde = NULL;
1633 de = (struct ext3_dir_entry_2 *) bh->b_data;
1634 while (i < bh->b_size) {
1635 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1636 return -EIO;
1637 if (de == de_del) {
1638 int err;
1639
1640 BUFFER_TRACE(bh, "get_write_access");
1641 err = ext3_journal_get_write_access(handle, bh);
1642 if (err)
1643 goto journal_error;
1644
1645 if (pde)
1646 pde->rec_len = ext3_rec_len_to_disk(
1647 ext3_rec_len_from_disk(pde->rec_len) +
1648 ext3_rec_len_from_disk(de->rec_len));
1649 else
1650 de->inode = 0;
1651 dir->i_version++;
1652 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1653 err = ext3_journal_dirty_metadata(handle, bh);
1654 if (err) {
1655journal_error:
1656 ext3_std_error(dir->i_sb, err);
1657 return err;
1658 }
1659 return 0;
1660 }
1661 i += ext3_rec_len_from_disk(de->rec_len);
1662 pde = de;
1663 de = ext3_next_entry(de);
1664 }
1665 return -ENOENT;
1666}
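
/*
 * Deletion above never shifts bytes: an illustrative model of the two
 * cases, assuming an on-disk rec_len that is 16 bits wide as in
 * ext2/3.  Not part of the original driver.
 */
struct fake_de_sketch { unsigned short rec_len; unsigned inode; };

static void delete_by_merge(struct fake_de_sketch *prev,
			    struct fake_de_sketch *de)
{
	if (prev)
		/* the previous record absorbs the deleted one */
		prev->rec_len += de->rec_len;
	else
		/* first record in the block: just mark it unused */
		de->inode = 0;
}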
1667
1668static int ext3_add_nondir(handle_t *handle,
1669 struct dentry *dentry, struct inode *inode)
1670{
1671 int err = ext3_add_entry(handle, dentry, inode);
1672 if (!err) {
1673 ext3_mark_inode_dirty(handle, inode);
1674 unlock_new_inode(inode);
1675 d_instantiate(dentry, inode);
1676 return 0;
1677 }
1678 drop_nlink(inode);
1679 unlock_new_inode(inode);
1680 iput(inode);
1681 return err;
1682}
1683
1684/*
1685 * By the time this is called, we already have created
1686 * the directory cache entry for the new file, but it
1687 * is so far negative - it has no inode.
1688 *
1689 * If the create succeeds, we fill in the inode information
1690 * with d_instantiate().
1691 */
1692static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
1693 bool excl)
1694{
1695 handle_t *handle;
1696 struct inode * inode;
1697 int err, retries = 0;
1698
1699 dquot_initialize(dir);
1700
1701retry:
1702 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1703 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1704 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1705 if (IS_ERR(handle))
1706 return PTR_ERR(handle);
1707
1708 if (IS_DIRSYNC(dir))
1709 handle->h_sync = 1;
1710
1711 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1712 err = PTR_ERR(inode);
1713 if (!IS_ERR(inode)) {
1714 inode->i_op = &ext3_file_inode_operations;
1715 inode->i_fop = &ext3_file_operations;
1716 ext3_set_aops(inode);
1717 err = ext3_add_nondir(handle, dentry, inode);
1718 }
1719 ext3_journal_stop(handle);
1720 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1721 goto retry;
1722 return err;
1723}
1724
1725static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1726 umode_t mode, dev_t rdev)
1727{
1728 handle_t *handle;
1729 struct inode *inode;
1730 int err, retries = 0;
1731
1732 if (!new_valid_dev(rdev))
1733 return -EINVAL;
1734
1735 dquot_initialize(dir);
1736
1737retry:
1738 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1739 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1740 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1741 if (IS_ERR(handle))
1742 return PTR_ERR(handle);
1743
1744 if (IS_DIRSYNC(dir))
1745 handle->h_sync = 1;
1746
1747 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1748 err = PTR_ERR(inode);
1749 if (!IS_ERR(inode)) {
1750 init_special_inode(inode, inode->i_mode, rdev);
1751#ifdef CONFIG_EXT3_FS_XATTR
1752 inode->i_op = &ext3_special_inode_operations;
1753#endif
1754 err = ext3_add_nondir(handle, dentry, inode);
1755 }
1756 ext3_journal_stop(handle);
1757 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1758 goto retry;
1759 return err;
1760}
1761
1762static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1763{
1764 handle_t *handle;
1765 struct inode *inode;
1766 int err, retries = 0;
1767
1768 dquot_initialize(dir);
1769
1770retry:
1771 handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
1772 4 + EXT3_XATTR_TRANS_BLOCKS);
1773
1774 if (IS_ERR(handle))
1775 return PTR_ERR(handle);
1776
1777 inode = ext3_new_inode (handle, dir, NULL, mode);
1778 err = PTR_ERR(inode);
1779 if (!IS_ERR(inode)) {
1780 inode->i_op = &ext3_file_inode_operations;
1781 inode->i_fop = &ext3_file_operations;
1782 ext3_set_aops(inode);
1783 d_tmpfile(dentry, inode);
1784 err = ext3_orphan_add(handle, inode);
1785 if (err)
1786 goto err_unlock_inode;
1787 mark_inode_dirty(inode);
1788 unlock_new_inode(inode);
1789 }
1790 ext3_journal_stop(handle);
1791 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1792 goto retry;
1793 return err;
1794err_unlock_inode:
1795 ext3_journal_stop(handle);
1796 unlock_new_inode(inode);
1797 return err;
1798}
1799
1800static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
1801{
1802 handle_t *handle;
1803 struct inode * inode;
1804 struct buffer_head * dir_block = NULL;
1805 struct ext3_dir_entry_2 * de;
1806 int err, retries = 0;
1807
1808 if (dir->i_nlink >= EXT3_LINK_MAX)
1809 return -EMLINK;
1810
1811 dquot_initialize(dir);
1812
1813retry:
1814 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1815 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1816 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1817 if (IS_ERR(handle))
1818 return PTR_ERR(handle);
1819
1820 if (IS_DIRSYNC(dir))
1821 handle->h_sync = 1;
1822
1823 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1824 err = PTR_ERR(inode);
1825 if (IS_ERR(inode))
1826 goto out_stop;
1827
1828 inode->i_op = &ext3_dir_inode_operations;
1829 inode->i_fop = &ext3_dir_operations;
1830 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1831 if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
1832 goto out_clear_inode;
1833
1834 BUFFER_TRACE(dir_block, "get_write_access");
1835 err = ext3_journal_get_write_access(handle, dir_block);
1836 if (err)
1837 goto out_clear_inode;
1838
1839 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1840 de->inode = cpu_to_le32(inode->i_ino);
1841 de->name_len = 1;
1842 de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
1843 strcpy (de->name, ".");
1844 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1845 de = ext3_next_entry(de);
1846 de->inode = cpu_to_le32(dir->i_ino);
1847 de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
1848 EXT3_DIR_REC_LEN(1));
1849 de->name_len = 2;
1850 strcpy (de->name, "..");
1851 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1852 set_nlink(inode, 2);
1853 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1854 err = ext3_journal_dirty_metadata(handle, dir_block);
1855 if (err)
1856 goto out_clear_inode;
1857
1858 err = ext3_mark_inode_dirty(handle, inode);
1859 if (!err)
1860 err = ext3_add_entry (handle, dentry, inode);
1861
1862 if (err) {
1863out_clear_inode:
1864 clear_nlink(inode);
1865 unlock_new_inode(inode);
1866 ext3_mark_inode_dirty(handle, inode);
1867 iput (inode);
1868 goto out_stop;
1869 }
1870 inc_nlink(dir);
1871 ext3_update_dx_flag(dir);
1872 err = ext3_mark_inode_dirty(handle, dir);
1873 if (err)
1874 goto out_clear_inode;
1875
1876 unlock_new_inode(inode);
1877 d_instantiate(dentry, inode);
1878out_stop:
1879 brelse(dir_block);
1880 ext3_journal_stop(handle);
1881 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1882 goto retry;
1883 return err;
1884}
1885
1886/*
1887 * routine to check that the specified directory is empty (for rmdir)
1888 */
1889static int empty_dir (struct inode * inode)
1890{
1891 unsigned long offset;
1892 struct buffer_head * bh;
1893 struct ext3_dir_entry_2 * de, * de1;
1894 struct super_block * sb;
1895 int err = 0;
1896
1897 sb = inode->i_sb;
1898 if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
1899 !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
1900 if (err)
1901 ext3_error(inode->i_sb, __func__,
1902 "error %d reading directory #%lu offset 0",
1903 err, inode->i_ino);
1904 else
1905 ext3_warning(inode->i_sb, __func__,
1906 "bad directory (dir #%lu) - no data block",
1907 inode->i_ino);
1908 return 1;
1909 }
1910 de = (struct ext3_dir_entry_2 *) bh->b_data;
1911 de1 = ext3_next_entry(de);
1912 if (le32_to_cpu(de->inode) != inode->i_ino ||
1913 !le32_to_cpu(de1->inode) ||
1914 strcmp (".", de->name) ||
1915 strcmp ("..", de1->name)) {
1916 ext3_warning (inode->i_sb, "empty_dir",
1917 "bad directory (dir #%lu) - no `.' or `..'",
1918 inode->i_ino);
1919 brelse (bh);
1920 return 1;
1921 }
1922 offset = ext3_rec_len_from_disk(de->rec_len) +
1923 ext3_rec_len_from_disk(de1->rec_len);
1924 de = ext3_next_entry(de1);
1925 while (offset < inode->i_size ) {
1926 if (!bh ||
1927 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1928 err = 0;
1929 brelse (bh);
1930 if (!(bh = ext3_dir_bread (NULL, inode,
1931 offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
1932 if (err)
1933 ext3_error(sb, __func__,
1934 "error %d reading directory"
1935 " #%lu offset %lu",
1936 err, inode->i_ino, offset);
1937 offset += sb->s_blocksize;
1938 continue;
1939 }
1940 de = (struct ext3_dir_entry_2 *) bh->b_data;
1941 }
1942 if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1943 de = (struct ext3_dir_entry_2 *)(bh->b_data +
1944 sb->s_blocksize);
1945 offset = (offset | (sb->s_blocksize - 1)) + 1;
1946 continue;
1947 }
1948 if (le32_to_cpu(de->inode)) {
1949 brelse (bh);
1950 return 0;
1951 }
1952 offset += ext3_rec_len_from_disk(de->rec_len);
1953 de = ext3_next_entry(de);
1954 }
1955 brelse (bh);
1956 return 1;
1957}
1958
1959/* ext3_orphan_add() links an unlinked or truncated inode into a list of
1960 * such inodes, starting at the superblock, in case we crash before the
1961 * file is closed/deleted, or in case the inode truncate spans multiple
1962 * transactions and the last transaction is not recovered after a crash.
1963 *
1964 * At filesystem recovery time, we walk this list deleting unlinked
1965 * inodes and truncating linked inodes in ext3_orphan_cleanup().
1966 */
1967int ext3_orphan_add(handle_t *handle, struct inode *inode)
1968{
1969 struct super_block *sb = inode->i_sb;
1970 struct ext3_iloc iloc;
1971 int err = 0, rc;
1972
1973 mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
1974 if (!list_empty(&EXT3_I(inode)->i_orphan))
1975 goto out_unlock;
1976
1977 /* Orphan handling is only valid for files with data blocks
1978 * being truncated, or files being unlinked. */
1979
1980 /* @@@ FIXME: Observation from aviro:
1981 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1982 * here (on s_orphan_lock), so race with ext3_link() which might bump
1983 * ->i_nlink. For, say it, character device. Not a regular file,
1984 * not a directory, not a symlink and ->i_nlink > 0.
1985 *
1986 * tytso, 4/25/2009: I'm not sure how that could happen;
1987 * shouldn't the fs core protect us from this sort of
1988 * unlink()/link() race?
1989 */
1990 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1991 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1992
1993 BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
1994 err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
1995 if (err)
1996 goto out_unlock;
1997
1998 err = ext3_reserve_inode_write(handle, inode, &iloc);
1999 if (err)
2000 goto out_unlock;
2001
2002 /* Insert this inode at the head of the on-disk orphan list... */
2003 NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
2004 EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2005 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
2006 rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
2007 if (!err)
2008 err = rc;
2009
2010 /* Only add to the head of the in-memory list if all the
2011 * previous operations succeeded. If the orphan_add is going to
2012 * fail (possibly taking the journal offline), we can't risk
2013 * leaving the inode on the orphan list: stray orphan-list
2014 * entries can cause panics at unmount time.
2015 *
2016 * This is safe: on error we're going to ignore the orphan list
2017 * anyway on the next recovery. */
2018 if (!err)
2019 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
2020
2021 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
2022 jbd_debug(4, "orphan inode %lu will point to %d\n",
2023 inode->i_ino, NEXT_ORPHAN(inode));
2024out_unlock:
2025 mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
2026 ext3_std_error(inode->i_sb, err);
2027 return err;
2028}
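
/*
 * A minimal model of the on-disk orphan list manipulated above: the
 * superblock holds the head inode number, and each orphan holds the
 * next one (NEXT_ORPHAN() reuses the inode's dtime field).  Insertion
 * is a plain singly-linked push.  Illustrative only.
 */
struct model_sb    { unsigned last_orphan; };
struct model_inode { unsigned ino; unsigned next_orphan; };

static void orphan_push(struct model_sb *sb, struct model_inode *inode)
{
	inode->next_orphan = sb->last_orphan;	/* point at the old head */
	sb->last_orphan = inode->ino;		/* become the new head */
}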
2029
2030/*
2031 * ext3_orphan_del() removes an unlinked or truncated inode from the list
2032 * of such inodes stored on disk, because it is finally being cleaned up.
2033 */
2034int ext3_orphan_del(handle_t *handle, struct inode *inode)
2035{
2036 struct list_head *prev;
2037 struct ext3_inode_info *ei = EXT3_I(inode);
2038 struct ext3_sb_info *sbi;
2039 unsigned long ino_next;
2040 struct ext3_iloc iloc;
2041 int err = 0;
2042
2043 mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2044 if (list_empty(&ei->i_orphan))
2045 goto out;
2046
2047 ino_next = NEXT_ORPHAN(inode);
2048 prev = ei->i_orphan.prev;
2049 sbi = EXT3_SB(inode->i_sb);
2050
2051 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
2052
2053 list_del_init(&ei->i_orphan);
2054
2055 /* If we're on an error path, we may not have a valid
2056 * transaction handle with which to update the orphan list on
2057 * disk, but we still need to remove the inode from the linked
2058 * list in memory. */
2059 if (!handle)
2060 goto out;
2061
2062 err = ext3_reserve_inode_write(handle, inode, &iloc);
2063 if (err)
2064 goto out_err;
2065
2066 if (prev == &sbi->s_orphan) {
2067 jbd_debug(4, "superblock will point to %lu\n", ino_next);
2068 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2069 err = ext3_journal_get_write_access(handle, sbi->s_sbh);
2070 if (err)
2071 goto out_brelse;
2072 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2073 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
2074 } else {
2075 struct ext3_iloc iloc2;
2076 struct inode *i_prev =
2077 &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
2078
2079 jbd_debug(4, "orphan inode %lu will point to %lu\n",
2080 i_prev->i_ino, ino_next);
2081 err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
2082 if (err)
2083 goto out_brelse;
2084 NEXT_ORPHAN(i_prev) = ino_next;
2085 err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
2086 }
2087 if (err)
2088 goto out_brelse;
2089 NEXT_ORPHAN(inode) = 0;
2090 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2091
2092out_err:
2093 ext3_std_error(inode->i_sb, err);
2094out:
2095 mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2096 return err;
2097
2098out_brelse:
2099 brelse(iloc.bh);
2100 goto out_err;
2101}
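
/*
 * The matching removal, continuing the model above: if the inode is
 * the list head the superblock pointer is rewritten, otherwise the
 * previous orphan's next pointer is.  Illustrative only.
 */
static void orphan_unlink(struct model_sb *sb, struct model_inode *prev,
			  struct model_inode *inode)
{
	if (!prev)				/* inode was the list head */
		sb->last_orphan = inode->next_orphan;
	else
		prev->next_orphan = inode->next_orphan;
	inode->next_orphan = 0;
}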
2102
2103static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2104{
2105 int retval;
2106 struct inode * inode;
2107 struct buffer_head * bh;
2108 struct ext3_dir_entry_2 * de;
2109 handle_t *handle;
2110
2111 /* Initialize quotas before so that eventual writes go in
2112 * separate transaction */
2113 dquot_initialize(dir);
2114 dquot_initialize(d_inode(dentry));
2115
2116 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2117 if (IS_ERR(handle))
2118 return PTR_ERR(handle);
2119
2120 retval = -ENOENT;
2121 bh = ext3_find_entry(dir, &dentry->d_name, &de);
2122 if (!bh)
2123 goto end_rmdir;
2124
2125 if (IS_DIRSYNC(dir))
2126 handle->h_sync = 1;
2127
2128 inode = d_inode(dentry);
2129
2130 retval = -EIO;
2131 if (le32_to_cpu(de->inode) != inode->i_ino)
2132 goto end_rmdir;
2133
2134 retval = -ENOTEMPTY;
2135 if (!empty_dir (inode))
2136 goto end_rmdir;
2137
2138 retval = ext3_delete_entry(handle, dir, de, bh);
2139 if (retval)
2140 goto end_rmdir;
2141 if (inode->i_nlink != 2)
2142 ext3_warning (inode->i_sb, "ext3_rmdir",
2143 "empty directory has nlink!=2 (%d)",
2144 inode->i_nlink);
2145 inode->i_version++;
2146 clear_nlink(inode);
2147 /* There's no need to set i_disksize: the fact that i_nlink is
2148 * zero will ensure that the right thing happens during any
2149 * recovery. */
2150 inode->i_size = 0;
2151 ext3_orphan_add(handle, inode);
2152 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2153 ext3_mark_inode_dirty(handle, inode);
2154 drop_nlink(dir);
2155 ext3_update_dx_flag(dir);
2156 ext3_mark_inode_dirty(handle, dir);
2157
2158end_rmdir:
2159 ext3_journal_stop(handle);
2160 brelse (bh);
2161 return retval;
2162}
2163
2164static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2165{
2166 int retval;
2167 struct inode * inode;
2168 struct buffer_head * bh;
2169 struct ext3_dir_entry_2 * de;
2170 handle_t *handle;
2171
2172 trace_ext3_unlink_enter(dir, dentry);
2173 /* Initialize quotas before so that eventual writes go
2174 * in separate transaction */
2175 dquot_initialize(dir);
2176 dquot_initialize(d_inode(dentry));
2177
2178 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2179 if (IS_ERR(handle))
2180 return PTR_ERR(handle);
2181
2182 if (IS_DIRSYNC(dir))
2183 handle->h_sync = 1;
2184
2185 retval = -ENOENT;
2186 bh = ext3_find_entry(dir, &dentry->d_name, &de);
2187 if (!bh)
2188 goto end_unlink;
2189
2190 inode = d_inode(dentry);
2191
2192 retval = -EIO;
2193 if (le32_to_cpu(de->inode) != inode->i_ino)
2194 goto end_unlink;
2195
2196 if (!inode->i_nlink) {
2197 ext3_warning (inode->i_sb, "ext3_unlink",
2198 "Deleting nonexistent file (%lu), %d",
2199 inode->i_ino, inode->i_nlink);
2200 set_nlink(inode, 1);
2201 }
2202 retval = ext3_delete_entry(handle, dir, de, bh);
2203 if (retval)
2204 goto end_unlink;
2205 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2206 ext3_update_dx_flag(dir);
2207 ext3_mark_inode_dirty(handle, dir);
2208 drop_nlink(inode);
2209 if (!inode->i_nlink)
2210 ext3_orphan_add(handle, inode);
2211 inode->i_ctime = dir->i_ctime;
2212 ext3_mark_inode_dirty(handle, inode);
2213 retval = 0;
2214
2215end_unlink:
2216 ext3_journal_stop(handle);
2217 brelse (bh);
2218 trace_ext3_unlink_exit(dentry, retval);
2219 return retval;
2220}
2221
2222static int ext3_symlink (struct inode * dir,
2223 struct dentry *dentry, const char * symname)
2224{
2225 handle_t *handle;
2226 struct inode * inode;
2227 int l, err, retries = 0;
2228 int credits;
2229
2230 l = strlen(symname)+1;
2231 if (l > dir->i_sb->s_blocksize)
2232 return -ENAMETOOLONG;
2233
2234 dquot_initialize(dir);
2235
2236 if (l > EXT3_N_BLOCKS * 4) {
2237 /*
2238 * For non-fast symlinks, we just allocate inode and put it on
2239 * orphan list in the first transaction => we need bitmap,
2240 * group descriptor, sb, inode block, quota blocks, and
2241 * possibly selinux xattr blocks.
2242 */
2243 credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2244 EXT3_XATTR_TRANS_BLOCKS;
2245 } else {
2246 /*
2247 * Fast symlink. We have to add entry to directory
2248 * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
2249 * allocate new inode (bitmap, group descriptor, inode block,
2250 * quota blocks, sb is already counted in previous macros).
2251 */
2252 credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2253 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2254 EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2255 }
2256retry:
2257 handle = ext3_journal_start(dir, credits);
2258 if (IS_ERR(handle))
2259 return PTR_ERR(handle);
2260
2261 if (IS_DIRSYNC(dir))
2262 handle->h_sync = 1;
2263
2264 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2265 err = PTR_ERR(inode);
2266 if (IS_ERR(inode))
2267 goto out_stop;
2268
2269 if (l > EXT3_N_BLOCKS * 4) {
2270 inode->i_op = &ext3_symlink_inode_operations;
2271 ext3_set_aops(inode);
2272 /*
2273 * We cannot call page_symlink() with transaction started
2274 * because it calls into ext3_write_begin() which acquires page
2275 * lock which ranks below transaction start (and it can also
2276 * wait for journal commit if we are running out of space). So
2277		 * we have to stop the transaction now and restart it once the
2278		 * symlink contents are written.
2279 *
2280		 * To keep the fs consistent in case of a crash, we have to put
2281		 * the inode on the orphan list in the meantime.
2282 */
2283 drop_nlink(inode);
2284 err = ext3_orphan_add(handle, inode);
2285 ext3_journal_stop(handle);
2286 if (err)
2287 goto err_drop_inode;
2288 err = __page_symlink(inode, symname, l, 1);
2289 if (err)
2290 goto err_drop_inode;
2291 /*
2292 * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
2293 * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2294 */
2295 handle = ext3_journal_start(dir,
2296 EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2297 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
2298 if (IS_ERR(handle)) {
2299 err = PTR_ERR(handle);
2300 goto err_drop_inode;
2301 }
2302 set_nlink(inode, 1);
2303 err = ext3_orphan_del(handle, inode);
2304 if (err) {
2305 ext3_journal_stop(handle);
2306 drop_nlink(inode);
2307 goto err_drop_inode;
2308 }
2309 } else {
2310 inode->i_op = &ext3_fast_symlink_inode_operations;
2311 inode->i_link = (char*)&EXT3_I(inode)->i_data;
2312 memcpy(inode->i_link, symname, l);
2313 inode->i_size = l-1;
2314 }
2315 EXT3_I(inode)->i_disksize = inode->i_size;
2316 err = ext3_add_nondir(handle, dentry, inode);
2317out_stop:
2318 ext3_journal_stop(handle);
2319 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2320 goto retry;
2321 return err;
2322err_drop_inode:
2323 unlock_new_inode(inode);
2324 iput(inode);
2325 return err;
2326}
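
/*
 * A sketch of the fast/slow symlink cutoff used above: EXT3_N_BLOCKS
 * is 15 (12 direct plus 3 indirect slots), each 4 bytes, so a target
 * of up to 60 bytes including the trailing NUL fits inside i_data and
 * needs no data block.  Illustrative only.
 */
#include <string.h>

static int is_fast_symlink_sketch(const char *symname)
{
	return strlen(symname) + 1 <= 15 * 4;	/* EXT3_N_BLOCKS * 4 */
}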
2327
2328static int ext3_link (struct dentry * old_dentry,
2329 struct inode * dir, struct dentry *dentry)
2330{
2331 handle_t *handle;
2332 struct inode *inode = d_inode(old_dentry);
2333 int err, retries = 0;
2334
2335 if (inode->i_nlink >= EXT3_LINK_MAX)
2336 return -EMLINK;
2337
2338 dquot_initialize(dir);
2339
2340retry:
2341 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2342 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
2343 if (IS_ERR(handle))
2344 return PTR_ERR(handle);
2345
2346 if (IS_DIRSYNC(dir))
2347 handle->h_sync = 1;
2348
2349 inode->i_ctime = CURRENT_TIME_SEC;
2350 inc_nlink(inode);
2351 ihold(inode);
2352
2353 err = ext3_add_entry(handle, dentry, inode);
2354 if (!err) {
2355 ext3_mark_inode_dirty(handle, inode);
2356		/* this can happen only for a tmpfile being
2357		 * linked for the first time
2358 */
2359 if (inode->i_nlink == 1)
2360 ext3_orphan_del(handle, inode);
2361 d_instantiate(dentry, inode);
2362 } else {
2363 drop_nlink(inode);
2364 iput(inode);
2365 }
2366 ext3_journal_stop(handle);
2367 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2368 goto retry;
2369 return err;
2370}
2371
2372#define PARENT_INO(buffer) \
2373 (ext3_next_entry((struct ext3_dir_entry_2 *)(buffer))->inode)
2374
2375/*
2376 * Anybody can rename anything with this: the permission checks are left to the
2377 * higher-level routines.
2378 */
2379static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2380 struct inode * new_dir,struct dentry *new_dentry)
2381{
2382 handle_t *handle;
2383 struct inode * old_inode, * new_inode;
2384 struct buffer_head * old_bh, * new_bh, * dir_bh;
2385 struct ext3_dir_entry_2 * old_de, * new_de;
2386 int retval, flush_file = 0;
2387
2388 dquot_initialize(old_dir);
2389 dquot_initialize(new_dir);
2390
2391 old_bh = new_bh = dir_bh = NULL;
2392
2393 /* Initialize quotas before so that eventual writes go
2394 * in separate transaction */
2395 if (d_really_is_positive(new_dentry))
2396 dquot_initialize(d_inode(new_dentry));
2397 handle = ext3_journal_start(old_dir, 2 *
2398 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2399 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
2400 if (IS_ERR(handle))
2401 return PTR_ERR(handle);
2402
2403 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2404 handle->h_sync = 1;
2405
2406 old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
2407 /*
2408	 * The check of the inode number is _not_ due to possible IO errors.
2409 * We might rmdir the source, keep it as pwd of some process
2410 * and merrily kill the link to whatever was created under the
2411 * same name. Goodbye sticky bit ;-<
2412 */
2413 old_inode = d_inode(old_dentry);
2414 retval = -ENOENT;
2415 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2416 goto end_rename;
2417
2418 new_inode = d_inode(new_dentry);
2419 new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
2420 if (new_bh) {
2421 if (!new_inode) {
2422 brelse (new_bh);
2423 new_bh = NULL;
2424 }
2425 }
2426 if (S_ISDIR(old_inode->i_mode)) {
2427 if (new_inode) {
2428 retval = -ENOTEMPTY;
2429 if (!empty_dir (new_inode))
2430 goto end_rename;
2431 }
2432 retval = -EIO;
2433 dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
2434 if (!dir_bh)
2435 goto end_rename;
2436 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2437 goto end_rename;
2438 retval = -EMLINK;
2439 if (!new_inode && new_dir!=old_dir &&
2440 new_dir->i_nlink >= EXT3_LINK_MAX)
2441 goto end_rename;
2442 }
2443 if (!new_bh) {
2444 retval = ext3_add_entry (handle, new_dentry, old_inode);
2445 if (retval)
2446 goto end_rename;
2447 } else {
2448 BUFFER_TRACE(new_bh, "get write access");
2449 retval = ext3_journal_get_write_access(handle, new_bh);
2450 if (retval)
2451 goto journal_error;
2452 new_de->inode = cpu_to_le32(old_inode->i_ino);
2453 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2454 EXT3_FEATURE_INCOMPAT_FILETYPE))
2455 new_de->file_type = old_de->file_type;
2456 new_dir->i_version++;
2457 new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2458 ext3_mark_inode_dirty(handle, new_dir);
2459 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2460 retval = ext3_journal_dirty_metadata(handle, new_bh);
2461 if (retval)
2462 goto journal_error;
2463 brelse(new_bh);
2464 new_bh = NULL;
2465 }
2466
2467 /*
2468 * Like most other Unix systems, set the ctime for inodes on a
2469 * rename.
2470 */
2471 old_inode->i_ctime = CURRENT_TIME_SEC;
2472 ext3_mark_inode_dirty(handle, old_inode);
2473
2474 /*
2475 * ok, that's it
2476 */
2477 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2478 old_de->name_len != old_dentry->d_name.len ||
2479 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2480 (retval = ext3_delete_entry(handle, old_dir,
2481 old_de, old_bh)) == -ENOENT) {
2482 /* old_de could have moved from under us during htree split, so
2483 * make sure that we are deleting the right entry. We might
2484 * also be pointing to a stale entry in the unused part of
2485 * old_bh so just checking inum and the name isn't enough. */
2486 struct buffer_head *old_bh2;
2487 struct ext3_dir_entry_2 *old_de2;
2488
2489 old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
2490 &old_de2);
2491 if (old_bh2) {
2492 retval = ext3_delete_entry(handle, old_dir,
2493 old_de2, old_bh2);
2494 brelse(old_bh2);
2495 }
2496 }
2497 if (retval) {
2498 ext3_warning(old_dir->i_sb, "ext3_rename",
2499 "Deleting old file (%lu), %d, error=%d",
2500 old_dir->i_ino, old_dir->i_nlink, retval);
2501 }
2502
2503 if (new_inode) {
2504 drop_nlink(new_inode);
2505 new_inode->i_ctime = CURRENT_TIME_SEC;
2506 }
2507 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2508 ext3_update_dx_flag(old_dir);
2509 if (dir_bh) {
2510 BUFFER_TRACE(dir_bh, "get_write_access");
2511 retval = ext3_journal_get_write_access(handle, dir_bh);
2512 if (retval)
2513 goto journal_error;
2514 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2515 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2516 retval = ext3_journal_dirty_metadata(handle, dir_bh);
2517 if (retval) {
2518journal_error:
2519 ext3_std_error(new_dir->i_sb, retval);
2520 goto end_rename;
2521 }
2522 drop_nlink(old_dir);
2523 if (new_inode) {
2524 drop_nlink(new_inode);
2525 } else {
2526 inc_nlink(new_dir);
2527 ext3_update_dx_flag(new_dir);
2528 ext3_mark_inode_dirty(handle, new_dir);
2529 }
2530 }
2531 ext3_mark_inode_dirty(handle, old_dir);
2532 if (new_inode) {
2533 ext3_mark_inode_dirty(handle, new_inode);
2534 if (!new_inode->i_nlink)
2535 ext3_orphan_add(handle, new_inode);
2536 if (ext3_should_writeback_data(new_inode))
2537 flush_file = 1;
2538 }
2539 retval = 0;
2540
2541end_rename:
2542 brelse (dir_bh);
2543 brelse (old_bh);
2544 brelse (new_bh);
2545 ext3_journal_stop(handle);
2546 if (retval == 0 && flush_file)
2547 filemap_flush(old_inode->i_mapping);
2548 return retval;
2549}
2550
2551/*
2552 * directories can handle most operations...
2553 */
2554const struct inode_operations ext3_dir_inode_operations = {
2555 .create = ext3_create,
2556 .lookup = ext3_lookup,
2557 .link = ext3_link,
2558 .unlink = ext3_unlink,
2559 .symlink = ext3_symlink,
2560 .mkdir = ext3_mkdir,
2561 .rmdir = ext3_rmdir,
2562 .mknod = ext3_mknod,
2563 .tmpfile = ext3_tmpfile,
2564 .rename = ext3_rename,
2565 .setattr = ext3_setattr,
2566#ifdef CONFIG_EXT3_FS_XATTR
2567 .setxattr = generic_setxattr,
2568 .getxattr = generic_getxattr,
2569 .listxattr = ext3_listxattr,
2570 .removexattr = generic_removexattr,
2571#endif
2572 .get_acl = ext3_get_acl,
2573 .set_acl = ext3_set_acl,
2574};
2575
2576const struct inode_operations ext3_special_inode_operations = {
2577 .setattr = ext3_setattr,
2578#ifdef CONFIG_EXT3_FS_XATTR
2579 .setxattr = generic_setxattr,
2580 .getxattr = generic_getxattr,
2581 .listxattr = ext3_listxattr,
2582 .removexattr = generic_removexattr,
2583#endif
2584 .get_acl = ext3_get_acl,
2585 .set_acl = ext3_set_acl,
2586};
diff --git a/fs/ext3/namei.h b/fs/ext3/namei.h
deleted file mode 100644
index 46304d8c9f0a..000000000000
--- a/fs/ext3/namei.h
+++ /dev/null
@@ -1,27 +0,0 @@
1/* linux/fs/ext3/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext3_get_parent(struct dentry *child);
9
10static inline struct buffer_head *ext3_dir_bread(handle_t *handle,
11 struct inode *inode,
12 int block, int create,
13 int *err)
14{
15 struct buffer_head *bh;
16
17 bh = ext3_bread(handle, inode, block, create, err);
18
19 if (!bh && !(*err)) {
20 *err = -EIO;
21 ext3_error(inode->i_sb, __func__,
22 "Directory hole detected on inode %lu\n",
23 inode->i_ino);
24 return NULL;
25 }
26 return bh;
27}
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
deleted file mode 100644
index 27105655502c..000000000000
--- a/fs/ext3/resize.c
+++ /dev/null
@@ -1,1117 +0,0 @@
1/*
2 * linux/fs/ext3/resize.c
3 *
4 * Support for resizing an ext3 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often used.
9 */
10
11
12#define EXT3FS_DEBUG
13
14#include "ext3.h"
15
16
17#define outside(b, first, last) ((b) < (first) || (b) >= (last))
18#define inside(b, first, last) ((b) >= (first) && (b) < (last))
19
20static int verify_group_input(struct super_block *sb,
21 struct ext3_new_group_data *input)
22{
23 struct ext3_sb_info *sbi = EXT3_SB(sb);
24 struct ext3_super_block *es = sbi->s_es;
25 ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
26 ext3_fsblk_t end = start + input->blocks_count;
27 unsigned group = input->group;
28 ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
29 unsigned overhead = ext3_bg_has_super(sb, group) ?
30 (1 + ext3_bg_num_gdb(sb, group) +
31 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
32 ext3_fsblk_t metaend = start + overhead;
33 struct buffer_head *bh = NULL;
34 ext3_grpblk_t free_blocks_count;
35 int err = -EINVAL;
36
37 input->free_blocks_count = free_blocks_count =
38 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
39
40 if (test_opt(sb, DEBUG))
41 printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
42 "(%d free, %u reserved)\n",
43 ext3_bg_has_super(sb, input->group) ? "normal" :
44 "no-super", input->group, input->blocks_count,
45 free_blocks_count, input->reserved_blocks);
46
47 if (group != sbi->s_groups_count)
48 ext3_warning(sb, __func__,
49 "Cannot add at group %u (only %lu groups)",
50 input->group, sbi->s_groups_count);
51 else if ((start - le32_to_cpu(es->s_first_data_block)) %
52 EXT3_BLOCKS_PER_GROUP(sb))
53 ext3_warning(sb, __func__, "Last group not full");
54 else if (input->reserved_blocks > input->blocks_count / 5)
55 ext3_warning(sb, __func__, "Reserved blocks too high (%u)",
56 input->reserved_blocks);
57 else if (free_blocks_count < 0)
58 ext3_warning(sb, __func__, "Bad blocks count %u",
59 input->blocks_count);
60 else if (!(bh = sb_bread(sb, end - 1)))
61 ext3_warning(sb, __func__,
62 "Cannot read last block ("E3FSBLK")",
63 end - 1);
64 else if (outside(input->block_bitmap, start, end))
65 ext3_warning(sb, __func__,
66 "Block bitmap not in group (block %u)",
67 input->block_bitmap);
68 else if (outside(input->inode_bitmap, start, end))
69 ext3_warning(sb, __func__,
70 "Inode bitmap not in group (block %u)",
71 input->inode_bitmap);
72 else if (outside(input->inode_table, start, end) ||
73 outside(itend - 1, start, end))
74 ext3_warning(sb, __func__,
75 "Inode table not in group (blocks %u-"E3FSBLK")",
76 input->inode_table, itend - 1);
77 else if (input->inode_bitmap == input->block_bitmap)
78 ext3_warning(sb, __func__,
79 "Block bitmap same as inode bitmap (%u)",
80 input->block_bitmap);
81 else if (inside(input->block_bitmap, input->inode_table, itend))
82 ext3_warning(sb, __func__,
83 "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
84 input->block_bitmap, input->inode_table, itend-1);
85 else if (inside(input->inode_bitmap, input->inode_table, itend))
86 ext3_warning(sb, __func__,
87 "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
88 input->inode_bitmap, input->inode_table, itend-1);
89 else if (inside(input->block_bitmap, start, metaend))
90 ext3_warning(sb, __func__,
91 "Block bitmap (%u) in GDT table"
92 " ("E3FSBLK"-"E3FSBLK")",
93 input->block_bitmap, start, metaend - 1);
94 else if (inside(input->inode_bitmap, start, metaend))
95 ext3_warning(sb, __func__,
96 "Inode bitmap (%u) in GDT table"
97 " ("E3FSBLK"-"E3FSBLK")",
98 input->inode_bitmap, start, metaend - 1);
99 else if (inside(input->inode_table, start, metaend) ||
100 inside(itend - 1, start, metaend))
101 ext3_warning(sb, __func__,
102 "Inode table (%u-"E3FSBLK") overlaps"
103 "GDT table ("E3FSBLK"-"E3FSBLK")",
104 input->inode_table, itend - 1, start, metaend - 1);
105 else
106 err = 0;
107 brelse(bh);
108
109 return err;
110}
111
112static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
113 ext3_fsblk_t blk)
114{
115 struct buffer_head *bh;
116 int err;
117
118 bh = sb_getblk(sb, blk);
119 if (unlikely(!bh))
120 return ERR_PTR(-ENOMEM);
121 if ((err = ext3_journal_get_write_access(handle, bh))) {
122 brelse(bh);
123 bh = ERR_PTR(err);
124 } else {
125 lock_buffer(bh);
126 memset(bh->b_data, 0, sb->s_blocksize);
127 set_buffer_uptodate(bh);
128 unlock_buffer(bh);
129 }
130
131 return bh;
132}
133
134/*
135 * To avoid calling the atomic setbit hundreds or thousands of times, we only
136 * need to use it within a single byte (to ensure we get endianness right).
137 * We can use memset for the rest of the bitmap as there are no other users.
138 */
139static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
140{
141 int i;
142
143 if (start_bit >= end_bit)
144 return;
145
146 ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
147 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
148 ext3_set_bit(i, bitmap);
149 if (i < end_bit)
150 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
151}
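
/*
 * A usage sketch for mark_bitmap_end() above, with assumed numbers:
 * 8192 bits per group but only 5003 valid blocks.  Bits 5003..5007 are
 * set one by one up to the byte boundary, then bytes 626..1023 are
 * filled with a single memset.  The bit op stands in for ext3_set_bit;
 * illustrative only.
 */
#include <string.h>

static void mark_end_sketch(char *bitmap)
{
	int start_bit = 5003, end_bit = 8192, i;

	/* set bits individually up to the next byte boundary ... */
	for (i = start_bit; i < ((start_bit + 7) & ~7); i++)
		bitmap[i >> 3] |= 1 << (i & 7);
	/* ... then fill the remaining whole bytes at once */
	memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}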
152
153/*
154 * If we have fewer than thresh credits, extend by EXT3_MAX_TRANS_DATA.
155 * If that fails, restart the transaction & regain write access for the
156 * buffer head which is used for block_bitmap modifications.
157 */
158static int extend_or_restart_transaction(handle_t *handle, int thresh,
159 struct buffer_head *bh)
160{
161 int err;
162
163 if (handle->h_buffer_credits >= thresh)
164 return 0;
165
166 err = ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA);
167 if (err < 0)
168 return err;
169 if (err) {
170 err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA);
171 if (err)
172 return err;
173 err = ext3_journal_get_write_access(handle, bh);
174 if (err)
175 return err;
176 }
177
178 return 0;
179}
180
181/*
182 * Set up the block and inode bitmaps, and the inode table for the new group.
183 * This doesn't need to be part of the main transaction, since we are only
184 * changing blocks outside the actual filesystem. We still do journaling to
185 * ensure the recovery is correct in case of a failure just after resize.
186 * If any part of this fails, we simply abort the resize.
187 */
188static int setup_new_group_blocks(struct super_block *sb,
189 struct ext3_new_group_data *input)
190{
191 struct ext3_sb_info *sbi = EXT3_SB(sb);
192 ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
193 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
194 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
195 unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
196 struct buffer_head *bh;
197 handle_t *handle;
198 ext3_fsblk_t block;
199 ext3_grpblk_t bit;
200 int i;
201 int err = 0, err2;
202
203 /* This transaction may be extended/restarted along the way */
204 handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
205
206 if (IS_ERR(handle))
207 return PTR_ERR(handle);
208
209 mutex_lock(&sbi->s_resize_lock);
210 if (input->group != sbi->s_groups_count) {
211 err = -EBUSY;
212 goto exit_journal;
213 }
214
215 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
216 err = PTR_ERR(bh);
217 goto exit_journal;
218 }
219
220 if (ext3_bg_has_super(sb, input->group)) {
221 ext3_debug("mark backup superblock %#04lx (+0)\n", start);
222 ext3_set_bit(0, bh->b_data);
223 }
224
225 /* Copy all of the GDT blocks into the backup in this group */
226 for (i = 0, bit = 1, block = start + 1;
227 i < gdblocks; i++, block++, bit++) {
228 struct buffer_head *gdb;
229
230 ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
231
232 err = extend_or_restart_transaction(handle, 1, bh);
233 if (err)
234 goto exit_bh;
235
236 gdb = sb_getblk(sb, block);
237 if (unlikely(!gdb)) {
238 err = -ENOMEM;
239 goto exit_bh;
240 }
241 if ((err = ext3_journal_get_write_access(handle, gdb))) {
242 brelse(gdb);
243 goto exit_bh;
244 }
245 lock_buffer(gdb);
246 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
247 set_buffer_uptodate(gdb);
248 unlock_buffer(gdb);
249 err = ext3_journal_dirty_metadata(handle, gdb);
250 if (err) {
251 brelse(gdb);
252 goto exit_bh;
253 }
254 ext3_set_bit(bit, bh->b_data);
255 brelse(gdb);
256 }
257
258 /* Zero out all of the reserved backup group descriptor table blocks */
259 for (i = 0, bit = gdblocks + 1, block = start + bit;
260 i < reserved_gdb; i++, block++, bit++) {
261 struct buffer_head *gdb;
262
263 ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
264
265 err = extend_or_restart_transaction(handle, 1, bh);
266 if (err)
267 goto exit_bh;
268
269 if (IS_ERR(gdb = bclean(handle, sb, block))) {
270 err = PTR_ERR(gdb);
271 goto exit_bh;
272 }
273 err = ext3_journal_dirty_metadata(handle, gdb);
274 if (err) {
275 brelse(gdb);
276 goto exit_bh;
277 }
278 ext3_set_bit(bit, bh->b_data);
279 brelse(gdb);
280 }
281 ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
282 input->block_bitmap - start);
283 ext3_set_bit(input->block_bitmap - start, bh->b_data);
284 ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
285 input->inode_bitmap - start);
286 ext3_set_bit(input->inode_bitmap - start, bh->b_data);
287
288 /* Zero out all of the inode table blocks */
289 for (i = 0, block = input->inode_table, bit = block - start;
290 i < sbi->s_itb_per_group; i++, bit++, block++) {
291 struct buffer_head *it;
292
293 ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
294
295 err = extend_or_restart_transaction(handle, 1, bh);
296 if (err)
297 goto exit_bh;
298
299 if (IS_ERR(it = bclean(handle, sb, block))) {
300 err = PTR_ERR(it);
301 goto exit_bh;
302 }
303 err = ext3_journal_dirty_metadata(handle, it);
304 if (err) {
305 brelse(it);
306 goto exit_bh;
307 }
308 brelse(it);
309 ext3_set_bit(bit, bh->b_data);
310 }
311
312 err = extend_or_restart_transaction(handle, 2, bh);
313 if (err)
314 goto exit_bh;
315
316 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
317 bh->b_data);
318 err = ext3_journal_dirty_metadata(handle, bh);
319 if (err)
320 goto exit_bh;
321 brelse(bh);
322
323 /* Mark unused entries in inode bitmap used */
324 ext3_debug("clear inode bitmap %#04x (+%ld)\n",
325 input->inode_bitmap, input->inode_bitmap - start);
326 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
327 err = PTR_ERR(bh);
328 goto exit_journal;
329 }
330
331 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
332 bh->b_data);
333 err = ext3_journal_dirty_metadata(handle, bh);
334exit_bh:
335 brelse(bh);
336
337exit_journal:
338 mutex_unlock(&sbi->s_resize_lock);
339 if ((err2 = ext3_journal_stop(handle)) && !err)
340 err = err2;
341
342 return err;
343}
344
345/*
346 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
347 * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before
348 * calling this for the first time. In a sparse filesystem it will be the
349 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
350 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
351 */
352static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
353 unsigned *five, unsigned *seven)
354{
355 unsigned *min = three;
356 int mult = 3;
357 unsigned ret;
358
359 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
360 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
361 ret = *min;
362 *min += 1;
363 return ret;
364 }
365
366 if (*five < *min) {
367 min = five;
368 mult = 5;
369 }
370 if (*seven < *min) {
371 min = seven;
372 mult = 7;
373 }
374
375 ret = *min;
376 *min *= mult;
377
378 return ret;
379}
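
The merge walk above can be checked in isolation. A minimal userspace sketch (illustrative only, not driver code; the loop bound is arbitrary) that reproduces the sparse_super sequence described in the comment:

#include <stdio.h>

/* Userspace model of the sparse_super branch of ext3_list_backups():
 * return the smallest of the three counters, then multiply that counter
 * by its own base, merging the powers of 3, 5 and 7 in ascending order. */
static unsigned list_backups(unsigned *three, unsigned *five, unsigned *seven)
{
	unsigned *min = three;
	unsigned mult = 3, ret;

	if (*five < *min) { min = five; mult = 5; }
	if (*seven < *min) { min = seven; mult = 7; }
	ret = *min;
	*min *= mult;
	return ret;
}

int main(void)
{
	unsigned three = 1, five = 5, seven = 7;
	int i;

	/* Prints: 1 3 5 7 9 25 27 49 81 125 */
	for (i = 0; i < 10; i++)
		printf("%u ", list_backups(&three, &five, &seven));
	printf("\n");
	return 0;
}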
380
381/*
382 * Check that all of the backup GDT blocks are held in the primary GDT block.
383 * It is assumed that they are stored in group order. Returns the number of
384 * groups in the current filesystem that have BACKUPS, or a negative error code.
385 */
386static int verify_reserved_gdb(struct super_block *sb,
387 struct buffer_head *primary)
388{
389 const ext3_fsblk_t blk = primary->b_blocknr;
390 const unsigned long end = EXT3_SB(sb)->s_groups_count;
391 unsigned three = 1;
392 unsigned five = 5;
393 unsigned seven = 7;
394 unsigned grp;
395 __le32 *p = (__le32 *)primary->b_data;
396 int gdbackups = 0;
397
398 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
399 if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
400 ext3_warning(sb, __func__,
401 "reserved GDT "E3FSBLK
402 " missing grp %d ("E3FSBLK")",
403 blk, grp,
404 grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
405 return -EINVAL;
406 }
407 if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb))
408 return -EFBIG;
409 }
410
411 return gdbackups;
412}
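
To make the check above concrete: for a reserved GDT block at absolute block blk, the copy backed up in group grp must sit at grp * EXT3_BLOCKS_PER_GROUP(sb) + blk, i.e. at the same offset inside every backup group. A short sketch with assumed geometry (32768 blocks per group, primary reserved GDT block at block 42; both values are illustrative, not taken from the code):

#include <stdio.h>

int main(void)
{
	const unsigned long blocks_per_group = 32768;	/* assumed 4k-block geometry */
	const unsigned long blk = 42;			/* assumed reserved GDT block */
	const unsigned long backups[] = { 1, 3, 5, 7, 9 }; /* first sparse backup groups */
	size_t i;

	/* Each __le32 entry of the primary block should point here: */
	for (i = 0; i < sizeof(backups) / sizeof(backups[0]); i++)
		printf("group %lu -> expected backup at block %lu\n",
		       backups[i], backups[i] * blocks_per_group + blk);
	return 0;
}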
413
414/*
415 * Called when we need to bring a reserved group descriptor table block into
416 * use from the resize inode. The primary copy of the new GDT block currently
417 * is an indirect block (under the double indirect block in the resize inode).
418 * The new backup GDT blocks will be stored as leaf blocks in this indirect
419 * block, in group order. Even though we know all the block numbers we need,
420 * we check to ensure that the resize inode has actually reserved these blocks.
421 *
422 * We don't need to update the block bitmaps because the blocks are still in use.
423 *
424 * We get all of the error cases out of the way, so that we are sure not to
425 * fail once we start modifying the data on disk, because JBD has no rollback.
426 */
427static int add_new_gdb(handle_t *handle, struct inode *inode,
428 struct ext3_new_group_data *input,
429 struct buffer_head **primary)
430{
431 struct super_block *sb = inode->i_sb;
432 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
433 unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
434 ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
435 struct buffer_head **o_group_desc, **n_group_desc;
436 struct buffer_head *dind;
437 int gdbackups;
438 struct ext3_iloc iloc;
439 __le32 *data;
440 int err;
441
442 if (test_opt(sb, DEBUG))
443 printk(KERN_DEBUG
444 "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n",
445 gdb_num);
446
447 /*
448 * If we are not using the primary superblock/GDT copy, don't resize,
449 * because the user tools have no way of handling this. Probably a
450 * bad time to do it anyway.
451 */
452 if (EXT3_SB(sb)->s_sbh->b_blocknr !=
453 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
454 ext3_warning(sb, __func__,
455 "won't resize using backup superblock at %llu",
456 (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
457 return -EPERM;
458 }
459
460 *primary = sb_bread(sb, gdblock);
461 if (!*primary)
462 return -EIO;
463
464 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
465 err = gdbackups;
466 goto exit_bh;
467 }
468
469 data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
470 dind = sb_bread(sb, le32_to_cpu(*data));
471 if (!dind) {
472 err = -EIO;
473 goto exit_bh;
474 }
475
476 data = (__le32 *)dind->b_data;
477 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
478 ext3_warning(sb, __func__,
479 "new group %u GDT block "E3FSBLK" not reserved",
480 input->group, gdblock);
481 err = -EINVAL;
482 goto exit_dind;
483 }
484
485 if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh)))
486 goto exit_dind;
487
488 if ((err = ext3_journal_get_write_access(handle, *primary)))
489 goto exit_sbh;
490
491 if ((err = ext3_journal_get_write_access(handle, dind)))
492 goto exit_primary;
493
494 /* ext3_reserve_inode_write() gets a reference on the iloc */
495 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
496 goto exit_dindj;
497
498 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
499 GFP_NOFS);
500 if (!n_group_desc) {
501 err = -ENOMEM;
502 ext3_warning(sb, __func__,
503 "not enough memory for %lu groups", gdb_num + 1);
504 goto exit_inode;
505 }
506
507 /*
508 * Finally, we have all of the possible failures behind us...
509 *
510 * Remove new GDT block from inode double-indirect block and clear out
511 * the new GDT block for use (which also "frees" the backup GDT blocks
512 * from the reserved inode). We don't need to change the bitmaps for
513 * these blocks, because they are marked as in-use from being in the
514 * reserved inode, and will become GDT blocks (primary and backup).
515 */
516 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
517 err = ext3_journal_dirty_metadata(handle, dind);
518 if (err)
519 goto exit_group_desc;
520 brelse(dind);
521 dind = NULL;
522 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
523 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
524 if (err)
525 goto exit_group_desc;
526 memset((*primary)->b_data, 0, sb->s_blocksize);
527 err = ext3_journal_dirty_metadata(handle, *primary);
528 if (err)
529 goto exit_group_desc;
530
531 o_group_desc = EXT3_SB(sb)->s_group_desc;
532 memcpy(n_group_desc, o_group_desc,
533 EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
534 n_group_desc[gdb_num] = *primary;
535 EXT3_SB(sb)->s_group_desc = n_group_desc;
536 EXT3_SB(sb)->s_gdb_count++;
537 kfree(o_group_desc);
538
539 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
540 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
541 if (err)
542 goto exit_inode;
543
544 return 0;
545
546exit_group_desc:
547 kfree(n_group_desc);
548exit_inode:
549 //ext3_journal_release_buffer(handle, iloc.bh);
550 brelse(iloc.bh);
551exit_dindj:
552 //ext3_journal_release_buffer(handle, dind);
553exit_primary:
554 //ext3_journal_release_buffer(handle, *primary);
555exit_sbh:
556 //ext3_journal_release_buffer(handle, EXT3_SB(sb)->s_sbh);
557exit_dind:
558 brelse(dind);
559exit_bh:
560 brelse(*primary);
561
562 ext3_debug("leaving with error %d\n", err);
563 return err;
564}
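
The indexing that add_new_gdb() relies on is worth spelling out. With 4k blocks a group descriptor is 32 bytes, so EXT3_DESC_PER_BLOCK is 128 and EXT3_ADDR_PER_BLOCK is 1024; the sketch below derives the three key values for a hypothetical group number (the sample numbers are assumptions for illustration):

#include <stdio.h>

int main(void)
{
	const unsigned desc_per_block = 4096 / 32;   /* EXT3_DESC_PER_BLOCK at 4k */
	const unsigned addr_per_block = 4096 / 4;    /* EXT3_ADDR_PER_BLOCK at 4k */
	const unsigned long sbh_blocknr = 0;         /* primary superblock's block */
	const unsigned long group = 1024;            /* assumed group being added */

	unsigned long gdb_num = group / desc_per_block;       /* which GDT block */
	unsigned long gdblock = sbh_blocknr + 1 + gdb_num;    /* its disk location */
	unsigned long dind_idx = gdb_num % addr_per_block;    /* slot verified in dind */

	printf("gdb_num=%lu gdblock=%lu dind index=%lu\n",
	       gdb_num, gdblock, dind_idx);
	return 0;
}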
565
566/*
567 * Called when we are adding a new group which has a backup copy of each of
568 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
569 * We need to add these reserved backup GDT blocks to the resize inode, so
570 * that they are kept for future resizing and not allocated to files.
571 *
572 * Each reserved backup GDT block will go into a different indirect block.
573 * The indirect blocks are actually the primary reserved GDT blocks,
574 * so we know in advance what their block numbers are. We only get the
575 * double-indirect block to verify it is pointing to the primary reserved
576 * GDT blocks so we don't overwrite a data block by accident. The reserved
577 * backup GDT blocks are stored in their reserved primary GDT block.
578 */
579static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
580 struct ext3_new_group_data *input)
581{
582 struct super_block *sb = inode->i_sb;
583 int reserved_gdb = le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks);
584 struct buffer_head **primary;
585 struct buffer_head *dind;
586 struct ext3_iloc iloc;
587 ext3_fsblk_t blk;
588 __le32 *data, *end;
589 int gdbackups = 0;
590 int res, i;
591 int err;
592
593 primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
594 if (!primary)
595 return -ENOMEM;
596
597 data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
598 dind = sb_bread(sb, le32_to_cpu(*data));
599 if (!dind) {
600 err = -EIO;
601 goto exit_free;
602 }
603
604 blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
605 data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
606 EXT3_ADDR_PER_BLOCK(sb));
607 end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
608
609 /* Get each reserved primary GDT block and verify it holds backups */
610 for (res = 0; res < reserved_gdb; res++, blk++) {
611 if (le32_to_cpu(*data) != blk) {
612 ext3_warning(sb, __func__,
613 "reserved block "E3FSBLK
614 " not at offset %ld",
615 blk,
616 (long)(data - (__le32 *)dind->b_data));
617 err = -EINVAL;
618 goto exit_bh;
619 }
620 primary[res] = sb_bread(sb, blk);
621 if (!primary[res]) {
622 err = -EIO;
623 goto exit_bh;
624 }
625 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
626 brelse(primary[res]);
627 err = gdbackups;
628 goto exit_bh;
629 }
630 if (++data >= end)
631 data = (__le32 *)dind->b_data;
632 }
633
634 for (i = 0; i < reserved_gdb; i++) {
635 if ((err = ext3_journal_get_write_access(handle, primary[i]))) {
636 /*
637 int j;
638 for (j = 0; j < i; j++)
639 ext3_journal_release_buffer(handle, primary[j]);
640 */
641 goto exit_bh;
642 }
643 }
644
645 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
646 goto exit_bh;
647
648 /*
649 * Finally we can add each of the reserved backup GDT blocks from
650 * the new group to its reserved primary GDT block.
651 */
652 blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
653 for (i = 0; i < reserved_gdb; i++) {
654 int err2;
655 data = (__le32 *)primary[i]->b_data;
656 /* printk("reserving backup %lu[%u] = %lu\n",
657 primary[i]->b_blocknr, gdbackups,
658 blk + primary[i]->b_blocknr); */
659 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
660 err2 = ext3_journal_dirty_metadata(handle, primary[i]);
661 if (!err)
662 err = err2;
663 }
664 inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
665 ext3_mark_iloc_dirty(handle, inode, &iloc);
666
667exit_bh:
668 while (--res >= 0)
669 brelse(primary[res]);
670 brelse(dind);
671
672exit_free:
673 kfree(primary);
674
675 return err;
676}
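
The placement rule in the final loop above mirrors verify_reserved_gdb(): the new group starts at input->group * EXT3_BLOCKS_PER_GROUP(sb), and the backup of each reserved primary GDT block lands at that base plus the primary's own block number, recorded at leaf index gdbackups. A tiny sketch with assumed numbers:

#include <stdio.h>

int main(void)
{
	const unsigned long blocks_per_group = 32768;	/* assumed geometry */
	const unsigned long new_group = 64;		/* assumed group being added */
	const unsigned long primary_blknr = 42;		/* assumed reserved GDT block */

	/* Backup copy of this reserved GDT block inside the new group: */
	printf("backup for block %lu in group %lu -> block %lu\n",
	       primary_blknr, new_group,
	       new_group * blocks_per_group + primary_blknr);
	return 0;
}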
677
678/*
679 * Update the backup copies of the ext3 metadata. These don't need to be part
680 * of the main resize transaction, because e2fsck will re-write them if there
681 * is a problem (basically only OOM will cause a problem). However, we
682 * _should_ update the backups if possible, in case the primary gets trashed
683 * for some reason and we need to run e2fsck from a backup superblock. The
684 * important part is that the new block and inode counts are in the backup
685 * superblocks, and the location of the new group metadata in the GDT backups.
686 *
687 * We do not need to take the s_resize_lock for this, because these
688 * blocks are not otherwise touched by the filesystem code when it is
689 * mounted. We don't need to worry about 'last' lagging behind
690 * sbi->s_groups_count, because the worst that can happen is that we
691 * do not copy the full number of backups at this time. The resize
692 * which changed s_groups_count will back them up again.
693 */
694static void update_backups(struct super_block *sb,
695 int blk_off, char *data, int size)
696{
697 struct ext3_sb_info *sbi = EXT3_SB(sb);
698 const unsigned long last = sbi->s_groups_count;
699 const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
700 unsigned three = 1;
701 unsigned five = 5;
702 unsigned seven = 7;
703 unsigned group;
704 int rest = sb->s_blocksize - size;
705 handle_t *handle;
706 int err = 0, err2;
707
708 handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
709 if (IS_ERR(handle)) {
710 group = 1;
711 err = PTR_ERR(handle);
712 goto exit_err;
713 }
714
715 while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) {
716 struct buffer_head *bh;
717
718 /* Out of journal space, and can't get more - abort - so sad */
719 if (handle->h_buffer_credits == 0 &&
720 ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) &&
721 (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA)))
722 break;
723
724 bh = sb_getblk(sb, group * bpg + blk_off);
725 if (unlikely(!bh)) {
726 err = -ENOMEM;
727 break;
728 }
729 ext3_debug("update metadata backup %#04lx\n",
730 (unsigned long)bh->b_blocknr);
731 if ((err = ext3_journal_get_write_access(handle, bh))) {
732 brelse(bh);
733 break;
734 }
735 lock_buffer(bh);
736 memcpy(bh->b_data, data, size);
737 if (rest)
738 memset(bh->b_data + size, 0, rest);
739 set_buffer_uptodate(bh);
740 unlock_buffer(bh);
741 err = ext3_journal_dirty_metadata(handle, bh);
742 brelse(bh);
743 if (err)
744 break;
745 }
746 if ((err2 = ext3_journal_stop(handle)) && !err)
747 err = err2;
748
749 /*
750 * Ugh! We need to have e2fsck write the backup copies. It is too
751 * late to revert the resize, and we shouldn't fail just because of
752 * the backup copies (they are only needed in case of corruption).
753 *
754 * However, if we got here we have a journal problem too, so we
755 * can't really start a transaction to mark the superblock.
756 * Chicken out and just set the flag in the hope it will be written
757 * to disk; if not, we will simply wait until the next fsck.
758 */
759exit_err:
760 if (err) {
761 ext3_warning(sb, __func__,
762 "can't update backup for group %d (err %d), "
763 "forcing fsck on next reboot", group, err);
764 sbi->s_mount_state &= ~EXT3_VALID_FS;
765 sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
766 mark_buffer_dirty(sbi->s_sbh);
767 }
768}
769
770/* Add group descriptor data to an existing or new group descriptor block.
771 * Ensure we handle all possible error conditions _before_ we start modifying
772 * the filesystem, because we cannot abort the transaction and not have it
773 * write the data to disk.
774 *
775 * If we are on a GDT block boundary, we need to get the reserved GDT block.
776 * Otherwise, we may need to add backup GDT blocks for a sparse group.
777 *
778 * We only need to hold the superblock lock while we are actually adding
779 * in the new group's counts to the superblock. Prior to that we have
780 * not really "added" the group at all. We re-check that we are still
781 * adding in the last group in case things have changed since verifying.
782 */
783int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
784{
785 struct ext3_sb_info *sbi = EXT3_SB(sb);
786 struct ext3_super_block *es = sbi->s_es;
787 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
788 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
789 struct buffer_head *primary = NULL;
790 struct ext3_group_desc *gdp;
791 struct inode *inode = NULL;
792 handle_t *handle;
793 int gdb_off, gdb_num;
794 int err, err2;
795
796 gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
797 gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb);
798
799 if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
800 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
801 ext3_warning(sb, __func__,
802 "Can't resize non-sparse filesystem further");
803 return -EPERM;
804 }
805
806 if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
807 le32_to_cpu(es->s_blocks_count)) {
808 ext3_warning(sb, __func__, "blocks_count overflow");
809 return -EINVAL;
810 }
811
812 if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
813 le32_to_cpu(es->s_inodes_count)) {
814 ext3_warning(sb, __func__, "inodes_count overflow");
815 return -EINVAL;
816 }
817
818 if (reserved_gdb || gdb_off == 0) {
819 if (!EXT3_HAS_COMPAT_FEATURE(sb,
820 EXT3_FEATURE_COMPAT_RESIZE_INODE)
821 || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
822 ext3_warning(sb, __func__,
823 "No reserved GDT blocks, can't resize");
824 return -EPERM;
825 }
826 inode = ext3_iget(sb, EXT3_RESIZE_INO);
827 if (IS_ERR(inode)) {
828 ext3_warning(sb, __func__,
829 "Error opening resize inode");
830 return PTR_ERR(inode);
831 }
832 }
833
834 if ((err = verify_group_input(sb, input)))
835 goto exit_put;
836
837 if ((err = setup_new_group_blocks(sb, input)))
838 goto exit_put;
839
840 /*
841 * We will always be modifying at least the superblock and a GDT
842 * block. If we are adding a group past the last current GDT block,
843 * we will also modify the inode and the dindirect block. If we
844 * are adding a group with superblock/GDT backups we will also
845 * modify each of the reserved GDT dindirect blocks.
846 */
847 handle = ext3_journal_start_sb(sb,
848 ext3_bg_has_super(sb, input->group) ?
849 3 + reserved_gdb : 4);
850 if (IS_ERR(handle)) {
851 err = PTR_ERR(handle);
852 goto exit_put;
853 }
854
855 mutex_lock(&sbi->s_resize_lock);
856 if (input->group != sbi->s_groups_count) {
857 ext3_warning(sb, __func__,
858 "multiple resizers run on filesystem!");
859 err = -EBUSY;
860 goto exit_journal;
861 }
862
863 if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh)))
864 goto exit_journal;
865
866 /*
867 * We will either add reserved group blocks to a backup group
868 * or remove reserved blocks for the first group in a new group block.
869 * Doing both would mean more complex code, and sane people don't
870 * use non-sparse filesystems anymore. This is already checked above.
871 */
872 if (gdb_off) {
873 primary = sbi->s_group_desc[gdb_num];
874 if ((err = ext3_journal_get_write_access(handle, primary)))
875 goto exit_journal;
876
877 if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) &&
878 (err = reserve_backup_gdb(handle, inode, input)))
879 goto exit_journal;
880 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
881 goto exit_journal;
882
883 /*
884 * OK, now we've set up the new group. Time to make it active.
885 *
886 * We do not lock all allocations via s_resize_lock,
887 * so we have to be safe wrt. concurrent accesses to the group
888 * data. So we need to be careful to set all of the relevant
889 * group descriptor data etc. *before* we enable the group.
890 *
891 * The key field here is sbi->s_groups_count: as long as
892 * that retains its old value, nobody is going to access the new
893 * group.
894 *
895 * So first we update all the descriptor metadata for the new
896 * group; then we update the total disk blocks count; then we
897 * update the groups count to enable the group; then finally we
898 * update the free space counts so that the system can start
899 * using the new disk blocks.
900 */
901
902 /* Update group descriptor block for new group */
903 gdp = (struct ext3_group_desc *)primary->b_data + gdb_off;
904
905 gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap);
906 gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap);
907 gdp->bg_inode_table = cpu_to_le32(input->inode_table);
908 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
909 gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
910
911 /*
912 * Make the new blocks and inodes valid next. We do this before
913 * increasing the group count so that once the group is enabled,
914 * all of its blocks and inodes are already valid.
915 *
916 * We always allocate group-by-group, then block-by-block or
917 * inode-by-inode within a group, so enabling these
918 * blocks/inodes before the group is live won't actually let us
919 * allocate the new space yet.
920 */
921 le32_add_cpu(&es->s_blocks_count, input->blocks_count);
922 le32_add_cpu(&es->s_inodes_count, EXT3_INODES_PER_GROUP(sb));
923
924 /*
925 * We need to protect s_groups_count against other CPUs seeing
926 * inconsistent state in the superblock.
927 *
928 * The precise rules we use are:
929 *
930 * * Writers of s_groups_count *must* hold s_resize_lock
931 * AND
932 * * Writers must perform a smp_wmb() after updating all dependent
933 * data and before modifying the groups count
934 *
935 * * Readers must hold s_resize_lock over the access
936 * OR
937 * * Readers must perform an smp_rmb() after reading the groups count
938 * and before reading any dependent data.
939 *
940 * NB. These rules can be relaxed when checking the group count
941 * while freeing data, as we can only allocate from a block
942 * group after serialising against the group count, and we can
943 * only then free after serialising in turn against that
944 * allocation.
945 */
946 smp_wmb();
947
948 /* Update the global fs size fields */
949 sbi->s_groups_count++;
950
951 err = ext3_journal_dirty_metadata(handle, primary);
952 if (err)
953 goto exit_journal;
954
955 /* Update the reserved block counts only once the new group is
956 * active. */
957 le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks);
958
959 /* Update the free space counts */
960 percpu_counter_add(&sbi->s_freeblocks_counter,
961 input->free_blocks_count);
962 percpu_counter_add(&sbi->s_freeinodes_counter,
963 EXT3_INODES_PER_GROUP(sb));
964
965 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
966
967exit_journal:
968 mutex_unlock(&sbi->s_resize_lock);
969 if ((err2 = ext3_journal_stop(handle)) && !err)
970 err = err2;
971 if (!err) {
972 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
973 sizeof(struct ext3_super_block));
974 update_backups(sb, primary->b_blocknr, primary->b_data,
975 primary->b_size);
976 }
977exit_put:
978 iput(inode);
979 return err;
980} /* ext3_group_add */
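
The ordering rules in the long comment above amount to a publish/subscribe pattern: fill in all per-group data, issue a write barrier, then bump the count; readers pair the count read with a read barrier. A userspace analogue using C11 release/acquire atomics (a sketch of the idea only; the driver itself uses smp_wmb()/smp_rmb() around a plain counter, and all names below are ours):

#include <stdatomic.h>
#include <stdio.h>

#define MAX_GROUPS 8

struct group_desc { unsigned long free_blocks; };

static struct group_desc groups[MAX_GROUPS];
static atomic_ulong groups_count = 1;

/* Writer: set up all dependent data first, then publish by storing the
 * new count with release semantics (the smp_wmb() analogue). */
static void add_group(unsigned long g, unsigned long free_blocks)
{
	groups[g].free_blocks = free_blocks;
	atomic_store_explicit(&groups_count, g + 1, memory_order_release);
}

/* Reader: load the count with acquire semantics (the "read the count,
 * then smp_rmb()" analogue) before touching any dependent data. */
static unsigned long group_free_blocks(unsigned long g)
{
	unsigned long n = atomic_load_explicit(&groups_count,
					       memory_order_acquire);

	return (g < n) ? groups[g].free_blocks : 0; /* 0: not visible yet */
}

int main(void)
{
	add_group(1, 32254);
	printf("group 1 free blocks: %lu\n", group_free_blocks(1));
	return 0;
}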
981
982/* Extend the filesystem to the new number of blocks specified. This entry
983 * point is only used to extend the current filesystem to the end of the last
984 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
985 * for emergencies (because it has no dependencies on reserved blocks).
986 *
987 * If we _really_ wanted, we could use default values to call ext3_group_add(),
988 * allowing the "remount" trick to work for arbitrary resizing, assuming enough
989 * GDT blocks are reserved to grow to the desired size.
990 */
991int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
992 ext3_fsblk_t n_blocks_count)
993{
994 ext3_fsblk_t o_blocks_count;
995 ext3_grpblk_t last;
996 ext3_grpblk_t add;
997 struct buffer_head * bh;
998 handle_t *handle;
999 int err;
1000 unsigned long freed_blocks;
1001
1002 /* We don't need to worry about locking wrt other resizers just
1003 * yet: we're going to revalidate es->s_blocks_count after
1004 * taking the s_resize_lock below. */
1005 o_blocks_count = le32_to_cpu(es->s_blocks_count);
1006
1007 if (test_opt(sb, DEBUG))
1008 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
1009 " up to "E3FSBLK" blocks\n",
1010 o_blocks_count, n_blocks_count);
1011
1012 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
1013 return 0;
1014
1015 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1016 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
1017 " too large to resize to "E3FSBLK" blocks safely\n",
1018 sb->s_id, n_blocks_count);
1019 if (sizeof(sector_t) < 8)
1020 ext3_warning(sb, __func__,
1021 "CONFIG_LBDAF not enabled\n");
1022 return -EINVAL;
1023 }
1024
1025 if (n_blocks_count < o_blocks_count) {
1026 ext3_warning(sb, __func__,
1027 "can't shrink FS - resize aborted");
1028 return -EBUSY;
1029 }
1030
1031 /* Handle the remaining blocks in the last group only. */
1032 last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) %
1033 EXT3_BLOCKS_PER_GROUP(sb);
1034
1035 if (last == 0) {
1036 ext3_warning(sb, __func__,
1037 "need to use ext2online to resize further");
1038 return -EPERM;
1039 }
1040
1041 add = EXT3_BLOCKS_PER_GROUP(sb) - last;
1042
1043 if (o_blocks_count + add < o_blocks_count) {
1044 ext3_warning(sb, __func__, "blocks_count overflow");
1045 return -EINVAL;
1046 }
1047
1048 if (o_blocks_count + add > n_blocks_count)
1049 add = n_blocks_count - o_blocks_count;
1050
1051 if (o_blocks_count + add < n_blocks_count)
1052 ext3_warning(sb, __func__,
1053 "will only finish group ("E3FSBLK
1054 " blocks, %u new)",
1055 o_blocks_count + add, add);
1056
1057 /* See if the device is actually as big as what was requested */
1058 bh = sb_bread(sb, o_blocks_count + add - 1);
1059 if (!bh) {
1060 ext3_warning(sb, __func__,
1061 "can't read last block, resize aborted");
1062 return -ENOSPC;
1063 }
1064 brelse(bh);
1065
1066 /* We will update the superblock, one block bitmap, and
1067 * one group descriptor via ext3_free_blocks().
1068 */
1069 handle = ext3_journal_start_sb(sb, 3);
1070 if (IS_ERR(handle)) {
1071 err = PTR_ERR(handle);
1072 ext3_warning(sb, __func__, "error %d on journal start", err);
1073 goto exit_put;
1074 }
1075
1076 mutex_lock(&EXT3_SB(sb)->s_resize_lock);
1077 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
1078 ext3_warning(sb, __func__,
1079 "multiple resizers run on filesystem!");
1080 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1081 ext3_journal_stop(handle);
1082 err = -EBUSY;
1083 goto exit_put;
1084 }
1085
1086 if ((err = ext3_journal_get_write_access(handle,
1087 EXT3_SB(sb)->s_sbh))) {
1088 ext3_warning(sb, __func__,
1089 "error %d on journal write access", err);
1090 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1091 ext3_journal_stop(handle);
1092 goto exit_put;
1093 }
1094 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1095 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1096 mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
1097 if (err) {
1098 ext3_warning(sb, __func__,
1099 "error %d on journal dirty metadata", err);
1100 ext3_journal_stop(handle);
1101 goto exit_put;
1102 }
1103 ext3_debug("freeing blocks "E3FSBLK" through "E3FSBLK"\n",
1104 o_blocks_count, o_blocks_count + add);
1105 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1106 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n",
1107 o_blocks_count, o_blocks_count + add);
1108 if ((err = ext3_journal_stop(handle)))
1109 goto exit_put;
1110 if (test_opt(sb, DEBUG))
1111 printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n",
1112 le32_to_cpu(es->s_blocks_count));
1113 update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es,
1114 sizeof(struct ext3_super_block));
1115exit_put:
1116 return err;
1117} /* ext3_group_extend */
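
The size arithmetic in ext3_group_extend() condenses to a few lines. A standalone sketch with assumed numbers (32768 blocks per group, first data block 0, current and requested sizes invented for illustration), including the unsigned wrap-around test the driver uses to detect "blocks_count overflow":

#include <stdio.h>

int main(void)
{
	const unsigned long blocks_per_group = 32768;	/* assumed geometry */
	const unsigned long first_data_block = 0;	/* 4k-block filesystems */
	unsigned long o_blocks_count = 1000000;		/* assumed current size */
	unsigned long n_blocks_count = 1048576;		/* assumed requested size */

	/* How far into its last group the filesystem currently ends. */
	unsigned long last = (o_blocks_count - first_data_block) %
			     blocks_per_group;
	/* Blocks needed to finish that group... */
	unsigned long add = blocks_per_group - last;

	/* ...guarding against unsigned wrap-around, as the driver does... */
	if (o_blocks_count + add < o_blocks_count) {
		printf("blocks_count overflow\n");
		return 1;
	}
	/* ...and never growing past what was requested. */
	if (o_blocks_count + add > n_blocks_count)
		add = n_blocks_count - o_blocks_count;

	printf("last=%lu add=%lu -> new size %lu\n",
	       last, add, o_blocks_count + add);
	return 0;
}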
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
deleted file mode 100644
index 5ed0044fbb37..000000000000
--- a/fs/ext3/super.c
+++ /dev/null
@@ -1,3165 +0,0 @@
1/*
2 * linux/fs/ext3/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/blkdev.h>
21#include <linux/parser.h>
22#include <linux/exportfs.h>
23#include <linux/statfs.h>
24#include <linux/random.h>
25#include <linux/mount.h>
26#include <linux/quotaops.h>
27#include <linux/seq_file.h>
28#include <linux/log2.h>
29#include <linux/cleancache.h>
30#include <linux/namei.h>
31
32#include <asm/uaccess.h>
33
34#define CREATE_TRACE_POINTS
35
36#include "ext3.h"
37#include "xattr.h"
38#include "acl.h"
39#include "namei.h"
40
41#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
42 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
43#else
44 #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
45#endif
46
47static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
50 unsigned int);
51static int ext3_commit_super(struct super_block *sb,
52 struct ext3_super_block *es,
53 int sync);
54static void ext3_mark_recovery_complete(struct super_block * sb,
55 struct ext3_super_block * es);
56static void ext3_clear_journal_err(struct super_block * sb,
57 struct ext3_super_block * es);
58static int ext3_sync_fs(struct super_block *sb, int wait);
59static const char *ext3_decode_error(struct super_block * sb, int errno,
60 char nbuf[16]);
61static int ext3_remount (struct super_block * sb, int * flags, char * data);
62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
63static int ext3_unfreeze(struct super_block *sb);
64static int ext3_freeze(struct super_block *sb);
65
66/*
67 * Wrappers for journal_start/end.
68 */
69handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
70{
71 journal_t *journal;
72
73 if (sb->s_flags & MS_RDONLY)
74 return ERR_PTR(-EROFS);
75
76 /* Special case here: if the journal has aborted behind our
77 * backs (e.g. EIO in the commit thread), then we still need to
78 * cleanly take the FS itself read-only. */
79 journal = EXT3_SB(sb)->s_journal;
80 if (is_journal_aborted(journal)) {
81 ext3_abort(sb, __func__,
82 "Detected aborted journal");
83 return ERR_PTR(-EROFS);
84 }
85
86 return journal_start(journal, nblocks);
87}
88
89int __ext3_journal_stop(const char *where, handle_t *handle)
90{
91 struct super_block *sb;
92 int err;
93 int rc;
94
95 sb = handle->h_transaction->t_journal->j_private;
96 err = handle->h_err;
97 rc = journal_stop(handle);
98
99 if (!err)
100 err = rc;
101 if (err)
102 __ext3_std_error(sb, where, err);
103 return err;
104}
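
The rule here is that an error already recorded on the handle takes precedence over the return value of journal_stop(). A trivial runnable model (userspace, names and sample errno values are ours):

#include <stdio.h>

/* Model of the "first error wins" rule in __ext3_journal_stop(). */
static int stop(int handle_err, int stop_rc)
{
	int err = handle_err;

	if (!err)
		err = stop_rc;
	return err;
}

int main(void)
{
	printf("%d\n", stop(0, 0));	/* 0: clean stop */
	printf("%d\n", stop(0, -5));	/* -EIO surfaced by journal_stop() */
	printf("%d\n", stop(-30, -5));	/* earlier -EROFS on the handle wins */
	return 0;
}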
105
106void ext3_journal_abort_handle(const char *caller, const char *err_fn,
107 struct buffer_head *bh, handle_t *handle, int err)
108{
109 char nbuf[16];
110 const char *errstr = ext3_decode_error(NULL, err, nbuf);
111
112 if (bh)
113 BUFFER_TRACE(bh, "abort");
114
115 if (!handle->h_err)
116 handle->h_err = err;
117
118 if (is_handle_aborted(handle))
119 return;
120
121 printk(KERN_ERR "EXT3-fs: %s: aborting transaction: %s in %s\n",
122 caller, errstr, err_fn);
123
124 journal_abort_handle(handle);
125}
126
127void ext3_msg(struct super_block *sb, const char *prefix,
128 const char *fmt, ...)
129{
130 struct va_format vaf;
131 va_list args;
132
133 va_start(args, fmt);
134
135 vaf.fmt = fmt;
136 vaf.va = &args;
137
138 printk("%sEXT3-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
139
140 va_end(args);
141}
142
143/* Deal with the reporting of failure conditions on a filesystem such as
144 * inconsistencies detected or read IO failures.
145 *
146 * On ext2, we can store the error state of the filesystem in the
147 * superblock. That is not possible on ext3, because we may have other
148 * write ordering constraints on the superblock which prevent us from
149 * writing it out straight away; and given that the journal is about to
150 * be aborted, we can't rely on the current, or future, transactions to
151 * write out the superblock safely.
152 *
153 * We'll just use the journal_abort() error code to record an error in
154 * the journal instead. On recovery, the journal will complain about
155 * that error until we've noted it down and cleared it.
156 */
157
158static void ext3_handle_error(struct super_block *sb)
159{
160 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
161
162 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
163 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
164
165 if (sb->s_flags & MS_RDONLY)
166 return;
167
168 if (!test_opt (sb, ERRORS_CONT)) {
169 journal_t *journal = EXT3_SB(sb)->s_journal;
170
171 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
172 if (journal)
173 journal_abort(journal, -EIO);
174 }
175 if (test_opt (sb, ERRORS_RO)) {
176 ext3_msg(sb, KERN_CRIT,
177 "error: remounting filesystem read-only");
178 /*
179 * Make sure the updated value of ->s_mount_state will be visible
180 * before the ->s_flags update.
181 */
182 smp_wmb();
183 sb->s_flags |= MS_RDONLY;
184 }
185 ext3_commit_super(sb, es, 1);
186 if (test_opt(sb, ERRORS_PANIC))
187 panic("EXT3-fs (%s): panic forced after error\n",
188 sb->s_id);
189}
190
191void ext3_error(struct super_block *sb, const char *function,
192 const char *fmt, ...)
193{
194 struct va_format vaf;
195 va_list args;
196
197 va_start(args, fmt);
198
199 vaf.fmt = fmt;
200 vaf.va = &args;
201
202 printk(KERN_CRIT "EXT3-fs error (device %s): %s: %pV\n",
203 sb->s_id, function, &vaf);
204
205 va_end(args);
206
207 ext3_handle_error(sb);
208}
209
210static const char *ext3_decode_error(struct super_block * sb, int errno,
211 char nbuf[16])
212{
213 char *errstr = NULL;
214
215 switch (errno) {
216 case -EIO:
217 errstr = "IO failure";
218 break;
219 case -ENOMEM:
220 errstr = "Out of memory";
221 break;
222 case -EROFS:
223 if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
224 errstr = "Journal has aborted";
225 else
226 errstr = "Readonly filesystem";
227 break;
228 default:
229 /* If the caller passed in an extra buffer for unknown
230 * errors, textualise them now. Else we just return
231 * NULL. */
232 if (nbuf) {
233 /* Check for truncated error codes... */
234 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
235 errstr = nbuf;
236 }
237 break;
238 }
239
240 return errstr;
241}
242
243/* __ext3_std_error decodes expected errors from journaling functions
244 * automatically and invokes the appropriate error response. */
245
246void __ext3_std_error (struct super_block * sb, const char * function,
247 int errno)
248{
249 char nbuf[16];
250 const char *errstr;
251
252 /* Special case: if the error is EROFS, and we're not already
253 * inside a transaction, then there's really no point in logging
254 * an error. */
255 if (errno == -EROFS && journal_current_handle() == NULL &&
256 (sb->s_flags & MS_RDONLY))
257 return;
258
259 errstr = ext3_decode_error(sb, errno, nbuf);
260 ext3_msg(sb, KERN_CRIT, "error in %s: %s", function, errstr);
261
262 ext3_handle_error(sb);
263}
264
265/*
266 * ext3_abort is a much stronger failure handler than ext3_error. The
267 * abort function may be used to deal with unrecoverable failures such
268 * as journal IO errors or ENOMEM at a critical moment in log management.
269 *
270 * We unconditionally force the filesystem into an ABORT|READONLY state,
271 * unless the error response on the fs has been set to panic in which
272 * case we take the easy way out and panic immediately.
273 */
274
275void ext3_abort(struct super_block *sb, const char *function,
276 const char *fmt, ...)
277{
278 struct va_format vaf;
279 va_list args;
280
281 va_start(args, fmt);
282
283 vaf.fmt = fmt;
284 vaf.va = &args;
285
286 printk(KERN_CRIT "EXT3-fs (%s): error: %s: %pV\n",
287 sb->s_id, function, &vaf);
288
289 va_end(args);
290
291 if (test_opt(sb, ERRORS_PANIC))
292 panic("EXT3-fs: panic from previous error\n");
293
294 if (sb->s_flags & MS_RDONLY)
295 return;
296
297 ext3_msg(sb, KERN_CRIT,
298 "error: remounting filesystem read-only");
299 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
300 set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
301 /*
302 * Make sure the updated value of ->s_mount_state will be visible
303 * before the ->s_flags update.
304 */
305 smp_wmb();
306 sb->s_flags |= MS_RDONLY;
307
308 if (EXT3_SB(sb)->s_journal)
309 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
310}
311
312void ext3_warning(struct super_block *sb, const char *function,
313 const char *fmt, ...)
314{
315 struct va_format vaf;
316 va_list args;
317
318 va_start(args, fmt);
319
320 vaf.fmt = fmt;
321 vaf.va = &args;
322
323 printk(KERN_WARNING "EXT3-fs (%s): warning: %s: %pV\n",
324 sb->s_id, function, &vaf);
325
326 va_end(args);
327}
328
329void ext3_update_dynamic_rev(struct super_block *sb)
330{
331 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
332
333 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
334 return;
335
336 ext3_msg(sb, KERN_WARNING,
337 "warning: updating to rev %d because of "
338 "new feature flag, running e2fsck is recommended",
339 EXT3_DYNAMIC_REV);
340
341 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
342 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
343 es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
344 /* leave es->s_feature_*compat flags alone */
345 /* es->s_uuid will be set by e2fsck if empty */
346
347 /*
348 * The rest of the superblock fields should be zero, and if not it
349 * means they are likely already in use, so leave them alone. We
350 * can leave it up to e2fsck to clean up any inconsistencies there.
351 */
352}
353
354/*
355 * Open the external journal device
356 */
357static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
358{
359 struct block_device *bdev;
360 char b[BDEVNAME_SIZE];
361
362 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
363 if (IS_ERR(bdev))
364 goto fail;
365 return bdev;
366
367fail:
368 ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
369 __bdevname(dev, b), PTR_ERR(bdev));
370
371 return NULL;
372}
373
374/*
375 * Release the journal device
376 */
377static void ext3_blkdev_put(struct block_device *bdev)
378{
379 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
380}
381
382static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
383{
384 struct block_device *bdev;
385 bdev = sbi->journal_bdev;
386 if (bdev) {
387 ext3_blkdev_put(bdev);
388 sbi->journal_bdev = NULL;
389 }
390}
391
392static inline struct inode *orphan_list_entry(struct list_head *l)
393{
394 return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode;
395}
396
397static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
398{
399 struct list_head *l;
400
401 ext3_msg(sb, KERN_ERR, "error: sb orphan head is %d",
402 le32_to_cpu(sbi->s_es->s_last_orphan));
403
404 ext3_msg(sb, KERN_ERR, "sb_info orphan list:");
405 list_for_each(l, &sbi->s_orphan) {
406 struct inode *inode = orphan_list_entry(l);
407 ext3_msg(sb, KERN_ERR, " "
408 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
409 inode->i_sb->s_id, inode->i_ino, inode,
410 inode->i_mode, inode->i_nlink,
411 NEXT_ORPHAN(inode));
412 }
413}
414
415static void ext3_put_super (struct super_block * sb)
416{
417 struct ext3_sb_info *sbi = EXT3_SB(sb);
418 struct ext3_super_block *es = sbi->s_es;
419 int i, err;
420
421 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
422 ext3_xattr_put_super(sb);
423 err = journal_destroy(sbi->s_journal);
424 sbi->s_journal = NULL;
425 if (err < 0)
426 ext3_abort(sb, __func__, "Couldn't clean up the journal");
427
428 if (!(sb->s_flags & MS_RDONLY)) {
429 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
430 es->s_state = cpu_to_le16(sbi->s_mount_state);
431 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
432 mark_buffer_dirty(sbi->s_sbh);
433 ext3_commit_super(sb, es, 1);
434 }
435
436 for (i = 0; i < sbi->s_gdb_count; i++)
437 brelse(sbi->s_group_desc[i]);
438 kfree(sbi->s_group_desc);
439 percpu_counter_destroy(&sbi->s_freeblocks_counter);
440 percpu_counter_destroy(&sbi->s_freeinodes_counter);
441 percpu_counter_destroy(&sbi->s_dirs_counter);
442 brelse(sbi->s_sbh);
443#ifdef CONFIG_QUOTA
444 for (i = 0; i < EXT3_MAXQUOTAS; i++)
445 kfree(sbi->s_qf_names[i]);
446#endif
447
448 /* Debugging code just in case the in-memory inode orphan list
449 * isn't empty. The on-disk one can be non-empty if we've
450 * detected an error and taken the fs readonly, but the
451 * in-memory list had better be clean by this point. */
452 if (!list_empty(&sbi->s_orphan))
453 dump_orphan_list(sb, sbi);
454 J_ASSERT(list_empty(&sbi->s_orphan));
455
456 invalidate_bdev(sb->s_bdev);
457 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
458 /*
459 * Invalidate the journal device's buffers. We don't want them
460 * floating about in memory - the physical journal device may be
461 * hotswapped, and it breaks the `ro-after' testing code.
462 */
463 sync_blockdev(sbi->journal_bdev);
464 invalidate_bdev(sbi->journal_bdev);
465 ext3_blkdev_remove(sbi);
466 }
467 sb->s_fs_info = NULL;
468 kfree(sbi->s_blockgroup_lock);
469 mutex_destroy(&sbi->s_orphan_lock);
470 mutex_destroy(&sbi->s_resize_lock);
471 kfree(sbi);
472}
473
474static struct kmem_cache *ext3_inode_cachep;
475
476/*
477 * Called inside transaction, so use GFP_NOFS
478 */
479static struct inode *ext3_alloc_inode(struct super_block *sb)
480{
481 struct ext3_inode_info *ei;
482
483 ei = kmem_cache_alloc(ext3_inode_cachep, GFP_NOFS);
484 if (!ei)
485 return NULL;
486 ei->i_block_alloc_info = NULL;
487 ei->vfs_inode.i_version = 1;
488 atomic_set(&ei->i_datasync_tid, 0);
489 atomic_set(&ei->i_sync_tid, 0);
490#ifdef CONFIG_QUOTA
491 memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
492#endif
493
494 return &ei->vfs_inode;
495}
496
497static int ext3_drop_inode(struct inode *inode)
498{
499 int drop = generic_drop_inode(inode);
500
501 trace_ext3_drop_inode(inode, drop);
502 return drop;
503}
504
505static void ext3_i_callback(struct rcu_head *head)
506{
507 struct inode *inode = container_of(head, struct inode, i_rcu);
508 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
509}
510
511static void ext3_destroy_inode(struct inode *inode)
512{
513 if (!list_empty(&(EXT3_I(inode)->i_orphan))) {
514 printk("EXT3 Inode %p: orphan list check failed!\n",
515 EXT3_I(inode));
516 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
517 EXT3_I(inode), sizeof(struct ext3_inode_info),
518 false);
519 dump_stack();
520 }
521 call_rcu(&inode->i_rcu, ext3_i_callback);
522}
523
524static void init_once(void *foo)
525{
526 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
527
528 INIT_LIST_HEAD(&ei->i_orphan);
529#ifdef CONFIG_EXT3_FS_XATTR
530 init_rwsem(&ei->xattr_sem);
531#endif
532 mutex_init(&ei->truncate_mutex);
533 inode_init_once(&ei->vfs_inode);
534}
535
536static int __init init_inodecache(void)
537{
538 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
539 sizeof(struct ext3_inode_info),
540 0, (SLAB_RECLAIM_ACCOUNT|
541 SLAB_MEM_SPREAD),
542 init_once);
543 if (ext3_inode_cachep == NULL)
544 return -ENOMEM;
545 return 0;
546}
547
548static void destroy_inodecache(void)
549{
550 /*
551 * Make sure all delayed rcu free inodes are flushed before we
552 * destroy cache.
553 */
554 rcu_barrier();
555 kmem_cache_destroy(ext3_inode_cachep);
556}
557
558static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
559{
560#if defined(CONFIG_QUOTA)
561 struct ext3_sb_info *sbi = EXT3_SB(sb);
562
563 if (sbi->s_jquota_fmt) {
564 char *fmtname = "";
565
566 switch (sbi->s_jquota_fmt) {
567 case QFMT_VFS_OLD:
568 fmtname = "vfsold";
569 break;
570 case QFMT_VFS_V0:
571 fmtname = "vfsv0";
572 break;
573 case QFMT_VFS_V1:
574 fmtname = "vfsv1";
575 break;
576 }
577 seq_printf(seq, ",jqfmt=%s", fmtname);
578 }
579
580 if (sbi->s_qf_names[USRQUOTA])
581 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
582
583 if (sbi->s_qf_names[GRPQUOTA])
584 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
585
586 if (test_opt(sb, USRQUOTA))
587 seq_puts(seq, ",usrquota");
588
589 if (test_opt(sb, GRPQUOTA))
590 seq_puts(seq, ",grpquota");
591#endif
592}
593
594static char *data_mode_string(unsigned long mode)
595{
596 switch (mode) {
597 case EXT3_MOUNT_JOURNAL_DATA:
598 return "journal";
599 case EXT3_MOUNT_ORDERED_DATA:
600 return "ordered";
601 case EXT3_MOUNT_WRITEBACK_DATA:
602 return "writeback";
603 }
604 return "unknown";
605}
606
607/*
608 * Show an option if
609 * - it's set to a non-default value OR
610 * - if the per-sb default is different from the global default
611 */
612static int ext3_show_options(struct seq_file *seq, struct dentry *root)
613{
614 struct super_block *sb = root->d_sb;
615 struct ext3_sb_info *sbi = EXT3_SB(sb);
616 struct ext3_super_block *es = sbi->s_es;
617 unsigned long def_mount_opts;
618
619 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
620
621 if (sbi->s_sb_block != 1)
622 seq_printf(seq, ",sb=%lu", sbi->s_sb_block);
623 if (test_opt(sb, MINIX_DF))
624 seq_puts(seq, ",minixdf");
625 if (test_opt(sb, GRPID))
626 seq_puts(seq, ",grpid");
627 if (!test_opt(sb, GRPID) && (def_mount_opts & EXT3_DEFM_BSDGROUPS))
628 seq_puts(seq, ",nogrpid");
629 if (!uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT3_DEF_RESUID)) ||
630 le16_to_cpu(es->s_def_resuid) != EXT3_DEF_RESUID) {
631 seq_printf(seq, ",resuid=%u",
632 from_kuid_munged(&init_user_ns, sbi->s_resuid));
633 }
634 if (!gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT3_DEF_RESGID)) ||
635 le16_to_cpu(es->s_def_resgid) != EXT3_DEF_RESGID) {
636 seq_printf(seq, ",resgid=%u",
637 from_kgid_munged(&init_user_ns, sbi->s_resgid));
638 }
639 if (test_opt(sb, ERRORS_RO)) {
640 int def_errors = le16_to_cpu(es->s_errors);
641
642 if (def_errors == EXT3_ERRORS_PANIC ||
643 def_errors == EXT3_ERRORS_CONTINUE) {
644 seq_puts(seq, ",errors=remount-ro");
645 }
646 }
647 if (test_opt(sb, ERRORS_CONT))
648 seq_puts(seq, ",errors=continue");
649 if (test_opt(sb, ERRORS_PANIC))
650 seq_puts(seq, ",errors=panic");
651 if (test_opt(sb, NO_UID32))
652 seq_puts(seq, ",nouid32");
653 if (test_opt(sb, DEBUG))
654 seq_puts(seq, ",debug");
655#ifdef CONFIG_EXT3_FS_XATTR
656 if (test_opt(sb, XATTR_USER))
657 seq_puts(seq, ",user_xattr");
658 if (!test_opt(sb, XATTR_USER) &&
659 (def_mount_opts & EXT3_DEFM_XATTR_USER)) {
660 seq_puts(seq, ",nouser_xattr");
661 }
662#endif
663#ifdef CONFIG_EXT3_FS_POSIX_ACL
664 if (test_opt(sb, POSIX_ACL))
665 seq_puts(seq, ",acl");
666 if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT3_DEFM_ACL))
667 seq_puts(seq, ",noacl");
668#endif
669 if (!test_opt(sb, RESERVATION))
670 seq_puts(seq, ",noreservation");
671 if (sbi->s_commit_interval) {
672 seq_printf(seq, ",commit=%u",
673 (unsigned) (sbi->s_commit_interval / HZ));
674 }
675
676 /*
677 * Always display barrier state so it's clear what the status is.
678 */
679 seq_puts(seq, ",barrier=");
680 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
681 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
682 if (test_opt(sb, DATA_ERR_ABORT))
683 seq_puts(seq, ",data_err=abort");
684
685 if (test_opt(sb, NOLOAD))
686 seq_puts(seq, ",norecovery");
687
688 ext3_show_quota_options(seq, sb);
689
690 return 0;
691}
692
693
694static struct inode *ext3_nfs_get_inode(struct super_block *sb,
695 u64 ino, u32 generation)
696{
697 struct inode *inode;
698
699 if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
700 return ERR_PTR(-ESTALE);
701 if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
702 return ERR_PTR(-ESTALE);
703
704 /* iget isn't really right if the inode is currently unallocated!!
705 *
706 * ext3_read_inode will return a bad_inode if the inode had been
707 * deleted, so we should be safe.
708 *
709 * Currently we don't know the generation for the parent directory, so
710 * a generation of 0 means "accept any"
711 */
712 inode = ext3_iget(sb, ino);
713 if (IS_ERR(inode))
714 return ERR_CAST(inode);
715 if (generation && inode->i_generation != generation) {
716 iput(inode);
717 return ERR_PTR(-ESTALE);
718 }
719
720 return inode;
721}
722
723static struct dentry *ext3_fh_to_dentry(struct super_block *sb, struct fid *fid,
724 int fh_len, int fh_type)
725{
726 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
727 ext3_nfs_get_inode);
728}
729
730static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
731 int fh_len, int fh_type)
732{
733 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
734 ext3_nfs_get_inode);
735}
736
737/*
738 * Try to release metadata pages (indirect blocks, directories) which are
739 * mapped via the block device. Since these pages could have journal heads
740 * which would prevent try_to_free_buffers() from freeing them, we must use
741 * the jbd layer's try_to_free_buffers() function to release them.
742 */
743static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
744 gfp_t wait)
745{
746 journal_t *journal = EXT3_SB(sb)->s_journal;
747
748 WARN_ON(PageChecked(page));
749 if (!page_has_buffers(page))
750 return 0;
751 if (journal)
752 return journal_try_to_free_buffers(journal, page,
753 wait & ~__GFP_WAIT);
754 return try_to_free_buffers(page);
755}
756
757#ifdef CONFIG_QUOTA
758#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
759#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
760
761static int ext3_write_dquot(struct dquot *dquot);
762static int ext3_acquire_dquot(struct dquot *dquot);
763static int ext3_release_dquot(struct dquot *dquot);
764static int ext3_mark_dquot_dirty(struct dquot *dquot);
765static int ext3_write_info(struct super_block *sb, int type);
766static int ext3_quota_on(struct super_block *sb, int type, int format_id,
767 struct path *path);
768static int ext3_quota_on_mount(struct super_block *sb, int type);
769static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
770 size_t len, loff_t off);
771static ssize_t ext3_quota_write(struct super_block *sb, int type,
772 const char *data, size_t len, loff_t off);
773static struct dquot **ext3_get_dquots(struct inode *inode)
774{
775 return EXT3_I(inode)->i_dquot;
776}
777
778static const struct dquot_operations ext3_quota_operations = {
779 .write_dquot = ext3_write_dquot,
780 .acquire_dquot = ext3_acquire_dquot,
781 .release_dquot = ext3_release_dquot,
782 .mark_dirty = ext3_mark_dquot_dirty,
783 .write_info = ext3_write_info,
784 .alloc_dquot = dquot_alloc,
785 .destroy_dquot = dquot_destroy,
786};
787
788static const struct quotactl_ops ext3_qctl_operations = {
789 .quota_on = ext3_quota_on,
790 .quota_off = dquot_quota_off,
791 .quota_sync = dquot_quota_sync,
792 .get_state = dquot_get_state,
793 .set_info = dquot_set_dqinfo,
794 .get_dqblk = dquot_get_dqblk,
795 .set_dqblk = dquot_set_dqblk
796};
797#endif
798
799static const struct super_operations ext3_sops = {
800 .alloc_inode = ext3_alloc_inode,
801 .destroy_inode = ext3_destroy_inode,
802 .write_inode = ext3_write_inode,
803 .dirty_inode = ext3_dirty_inode,
804 .drop_inode = ext3_drop_inode,
805 .evict_inode = ext3_evict_inode,
806 .put_super = ext3_put_super,
807 .sync_fs = ext3_sync_fs,
808 .freeze_fs = ext3_freeze,
809 .unfreeze_fs = ext3_unfreeze,
810 .statfs = ext3_statfs,
811 .remount_fs = ext3_remount,
812 .show_options = ext3_show_options,
813#ifdef CONFIG_QUOTA
814 .quota_read = ext3_quota_read,
815 .quota_write = ext3_quota_write,
816 .get_dquots = ext3_get_dquots,
817#endif
818 .bdev_try_to_free_page = bdev_try_to_free_page,
819};
820
821static const struct export_operations ext3_export_ops = {
822 .fh_to_dentry = ext3_fh_to_dentry,
823 .fh_to_parent = ext3_fh_to_parent,
824 .get_parent = ext3_get_parent,
825};
826
827enum {
828 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
829 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
830 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
831 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
832 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
833 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
834 Opt_journal_path,
835 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
836 Opt_data_err_abort, Opt_data_err_ignore,
837 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
838 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
839 Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
840 Opt_resize, Opt_usrquota, Opt_grpquota
841};
842
843static const match_table_t tokens = {
844 {Opt_bsd_df, "bsddf"},
845 {Opt_minix_df, "minixdf"},
846 {Opt_grpid, "grpid"},
847 {Opt_grpid, "bsdgroups"},
848 {Opt_nogrpid, "nogrpid"},
849 {Opt_nogrpid, "sysvgroups"},
850 {Opt_resgid, "resgid=%u"},
851 {Opt_resuid, "resuid=%u"},
852 {Opt_sb, "sb=%u"},
853 {Opt_err_cont, "errors=continue"},
854 {Opt_err_panic, "errors=panic"},
855 {Opt_err_ro, "errors=remount-ro"},
856 {Opt_nouid32, "nouid32"},
857 {Opt_nocheck, "nocheck"},
858 {Opt_nocheck, "check=none"},
859 {Opt_debug, "debug"},
860 {Opt_oldalloc, "oldalloc"},
861 {Opt_orlov, "orlov"},
862 {Opt_user_xattr, "user_xattr"},
863 {Opt_nouser_xattr, "nouser_xattr"},
864 {Opt_acl, "acl"},
865 {Opt_noacl, "noacl"},
866 {Opt_reservation, "reservation"},
867 {Opt_noreservation, "noreservation"},
868 {Opt_noload, "noload"},
869 {Opt_noload, "norecovery"},
870 {Opt_nobh, "nobh"},
871 {Opt_bh, "bh"},
872 {Opt_commit, "commit=%u"},
873 {Opt_journal_update, "journal=update"},
874 {Opt_journal_inum, "journal=%u"},
875 {Opt_journal_dev, "journal_dev=%u"},
876 {Opt_journal_path, "journal_path=%s"},
877 {Opt_abort, "abort"},
878 {Opt_data_journal, "data=journal"},
879 {Opt_data_ordered, "data=ordered"},
880 {Opt_data_writeback, "data=writeback"},
881 {Opt_data_err_abort, "data_err=abort"},
882 {Opt_data_err_ignore, "data_err=ignore"},
883 {Opt_offusrjquota, "usrjquota="},
884 {Opt_usrjquota, "usrjquota=%s"},
885 {Opt_offgrpjquota, "grpjquota="},
886 {Opt_grpjquota, "grpjquota=%s"},
887 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
888 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
889 {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
890 {Opt_grpquota, "grpquota"},
891 {Opt_noquota, "noquota"},
892 {Opt_quota, "quota"},
893 {Opt_usrquota, "usrquota"},
894 {Opt_barrier, "barrier=%u"},
895 {Opt_barrier, "barrier"},
896 {Opt_nobarrier, "nobarrier"},
897 {Opt_resize, "resize"},
898 {Opt_err, NULL},
899};
900
901static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
902{
903 ext3_fsblk_t sb_block;
904 char *options = (char *) *data;
905
906 if (!options || strncmp(options, "sb=", 3) != 0)
907 return 1; /* Default location */
908 options += 3;
909 /* todo: use simple_strtoll with >32bit ext3 */
910 sb_block = simple_strtoul(options, &options, 0);
911 if (*options && *options != ',') {
912 ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
913 (char *) *data);
914 return 1;
915 }
916 if (*options == ',')
917 options++;
918 *data = (void *) options;
919 return sb_block;
920}
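
A userspace model of the parse above (illustrative only): peel a leading "sb=<block>" off the option string, advance past it, and fall back to block 1, the default superblock location, on anything malformed:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Model of get_sb_block(); unlike the driver it reports errors to stdout. */
static unsigned long get_sb_block(char **data)
{
	char *options = *data;
	unsigned long sb_block;

	if (!options || strncmp(options, "sb=", 3) != 0)
		return 1;			/* default location */
	options += 3;
	sb_block = strtoul(options, &options, 0);
	if (*options && *options != ',') {
		printf("invalid sb specification\n");
		return 1;
	}
	if (*options == ',')
		options++;
	*data = options;			/* consume the option */
	return sb_block;
}

int main(void)
{
	char opts[] = "sb=32768,errors=remount-ro";
	char *p = opts;

	printf("sb block %lu, remaining \"%s\"\n", get_sb_block(&p), p);
	return 0;
}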
921
922#ifdef CONFIG_QUOTA
923static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
924{
925 struct ext3_sb_info *sbi = EXT3_SB(sb);
926 char *qname;
927
928 if (sb_any_quota_loaded(sb) &&
929 !sbi->s_qf_names[qtype]) {
930 ext3_msg(sb, KERN_ERR,
931 "Cannot change journaled "
932 "quota options when quota turned on");
933 return 0;
934 }
935 qname = match_strdup(args);
936 if (!qname) {
937 ext3_msg(sb, KERN_ERR,
938 "Not enough memory for storing quotafile name");
939 return 0;
940 }
941 if (sbi->s_qf_names[qtype]) {
942 int same = !strcmp(sbi->s_qf_names[qtype], qname);
943
944 kfree(qname);
945 if (!same) {
946 ext3_msg(sb, KERN_ERR,
947 "%s quota file already specified",
948 QTYPE2NAME(qtype));
949 }
950 return same;
951 }
952 if (strchr(qname, '/')) {
953 ext3_msg(sb, KERN_ERR,
954 "quotafile must be on filesystem root");
955 kfree(qname);
956 return 0;
957 }
958 sbi->s_qf_names[qtype] = qname;
959 set_opt(sbi->s_mount_opt, QUOTA);
960 return 1;
961}
962
963static int clear_qf_name(struct super_block *sb, int qtype)
964{
965 struct ext3_sb_info *sbi = EXT3_SB(sb);
966
967 if (sb_any_quota_loaded(sb) &&
968 sbi->s_qf_names[qtype]) {
969 ext3_msg(sb, KERN_ERR, "Cannot change journaled quota options"
970 " when quota turned on");
971 return 0;
972 }
973 if (sbi->s_qf_names[qtype]) {
974 kfree(sbi->s_qf_names[qtype]);
975 sbi->s_qf_names[qtype] = NULL;
976 }
977 return 1;
978}
979#endif
980
981static int parse_options (char *options, struct super_block *sb,
982 unsigned int *inum, unsigned long *journal_devnum,
983 ext3_fsblk_t *n_blocks_count, int is_remount)
984{
985 struct ext3_sb_info *sbi = EXT3_SB(sb);
986 char * p;
987 substring_t args[MAX_OPT_ARGS];
988 int data_opt = 0;
989 int option;
990 kuid_t uid;
991 kgid_t gid;
992 char *journal_path;
993 struct inode *journal_inode;
994 struct path path;
995 int error;
996
997#ifdef CONFIG_QUOTA
998 int qfmt;
999#endif
1000
1001 if (!options)
1002 return 1;
1003
1004 while ((p = strsep (&options, ",")) != NULL) {
1005 int token;
1006 if (!*p)
1007 continue;
1008 /*
1009 * Initialize args struct so we know whether arg was
1010 * found; some options take optional arguments.
1011 */
1012 args[0].to = args[0].from = NULL;
1013 token = match_token(p, tokens, args);
1014 switch (token) {
1015 case Opt_bsd_df:
1016 clear_opt (sbi->s_mount_opt, MINIX_DF);
1017 break;
1018 case Opt_minix_df:
1019 set_opt (sbi->s_mount_opt, MINIX_DF);
1020 break;
1021 case Opt_grpid:
1022 set_opt (sbi->s_mount_opt, GRPID);
1023 break;
1024 case Opt_nogrpid:
1025 clear_opt (sbi->s_mount_opt, GRPID);
1026 break;
1027 case Opt_resuid:
1028 if (match_int(&args[0], &option))
1029 return 0;
1030 uid = make_kuid(current_user_ns(), option);
1031 if (!uid_valid(uid)) {
1032 ext3_msg(sb, KERN_ERR, "Invalid uid value %d", option);
1033 return 0;
1034
1035 }
1036 sbi->s_resuid = uid;
1037 break;
1038 case Opt_resgid:
1039 if (match_int(&args[0], &option))
1040 return 0;
1041 gid = make_kgid(current_user_ns(), option);
1042 if (!gid_valid(gid)) {
1043 ext3_msg(sb, KERN_ERR, "Invalid gid value %d", option);
1044 return 0;
1045 }
1046 sbi->s_resgid = gid;
1047 break;
1048 case Opt_sb:
1049 /* handled by get_sb_block() instead of here */
1050 /* *sb_block = match_int(&args[0]); */
1051 break;
1052 case Opt_err_panic:
1053 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
1054 clear_opt (sbi->s_mount_opt, ERRORS_RO);
1055 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
1056 break;
1057 case Opt_err_ro:
1058 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
1059 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
1060 set_opt (sbi->s_mount_opt, ERRORS_RO);
1061 break;
1062 case Opt_err_cont:
1063 clear_opt (sbi->s_mount_opt, ERRORS_RO);
1064 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
1065 set_opt (sbi->s_mount_opt, ERRORS_CONT);
1066 break;
1067 case Opt_nouid32:
1068 set_opt (sbi->s_mount_opt, NO_UID32);
1069 break;
1070 case Opt_nocheck:
1071 clear_opt (sbi->s_mount_opt, CHECK);
1072 break;
1073 case Opt_debug:
1074 set_opt (sbi->s_mount_opt, DEBUG);
1075 break;
1076 case Opt_oldalloc:
1077 ext3_msg(sb, KERN_WARNING,
1078 "Ignoring deprecated oldalloc option");
1079 break;
1080 case Opt_orlov:
1081 ext3_msg(sb, KERN_WARNING,
1082 "Ignoring deprecated orlov option");
1083 break;
1084#ifdef CONFIG_EXT3_FS_XATTR
1085 case Opt_user_xattr:
1086 set_opt (sbi->s_mount_opt, XATTR_USER);
1087 break;
1088 case Opt_nouser_xattr:
1089 clear_opt (sbi->s_mount_opt, XATTR_USER);
1090 break;
1091#else
1092 case Opt_user_xattr:
1093 case Opt_nouser_xattr:
1094 ext3_msg(sb, KERN_INFO,
1095 "(no)user_xattr options not supported");
1096 break;
1097#endif
1098#ifdef CONFIG_EXT3_FS_POSIX_ACL
1099 case Opt_acl:
1100 set_opt(sbi->s_mount_opt, POSIX_ACL);
1101 break;
1102 case Opt_noacl:
1103 clear_opt(sbi->s_mount_opt, POSIX_ACL);
1104 break;
1105#else
1106 case Opt_acl:
1107 case Opt_noacl:
1108 ext3_msg(sb, KERN_INFO,
1109 "(no)acl options not supported");
1110 break;
1111#endif
1112 case Opt_reservation:
1113 set_opt(sbi->s_mount_opt, RESERVATION);
1114 break;
1115 case Opt_noreservation:
1116 clear_opt(sbi->s_mount_opt, RESERVATION);
1117 break;
1118 case Opt_journal_update:
1119 /* @@@ FIXME */
1120 /* Eventually we will want to be able to create
1121 a journal file here. For now, only allow the
1122 user to specify an existing inode to be the
1123 journal file. */
1124 if (is_remount) {
1125 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1126 "journal on remount");
1127 return 0;
1128 }
1129 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
1130 break;
1131 case Opt_journal_inum:
1132 if (is_remount) {
1133 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1134 "journal on remount");
1135 return 0;
1136 }
1137 if (match_int(&args[0], &option))
1138 return 0;
1139 *inum = option;
1140 break;
1141 case Opt_journal_dev:
1142 if (is_remount) {
1143 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1144 "journal on remount");
1145 return 0;
1146 }
1147 if (match_int(&args[0], &option))
1148 return 0;
1149 *journal_devnum = option;
1150 break;
1151 case Opt_journal_path:
1152 if (is_remount) {
1153 ext3_msg(sb, KERN_ERR, "error: cannot specify "
1154 "journal on remount");
1155 return 0;
1156 }
1157
1158 journal_path = match_strdup(&args[0]);
1159 if (!journal_path) {
1160 ext3_msg(sb, KERN_ERR, "error: could not dup "
1161 "journal device string");
1162 return 0;
1163 }
1164
1165 error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
1166 if (error) {
1167 ext3_msg(sb, KERN_ERR, "error: could not find "
1168 "journal device path: error %d", error);
1169 kfree(journal_path);
1170 return 0;
1171 }
1172
1173 journal_inode = d_inode(path.dentry);
1174 if (!S_ISBLK(journal_inode->i_mode)) {
1175 ext3_msg(sb, KERN_ERR, "error: journal path %s "
1176 "is not a block device", journal_path);
1177 path_put(&path);
1178 kfree(journal_path);
1179 return 0;
1180 }
1181
1182 *journal_devnum = new_encode_dev(journal_inode->i_rdev);
1183 path_put(&path);
1184 kfree(journal_path);
1185 break;
1186 case Opt_noload:
1187 set_opt (sbi->s_mount_opt, NOLOAD);
1188 break;
1189 case Opt_commit:
1190 if (match_int(&args[0], &option))
1191 return 0;
1192 if (option < 0)
1193 return 0;
1194 if (option == 0)
1195 option = JBD_DEFAULT_MAX_COMMIT_AGE;
1196 sbi->s_commit_interval = HZ * option;
1197 break;
1198 case Opt_data_journal:
1199 data_opt = EXT3_MOUNT_JOURNAL_DATA;
1200 goto datacheck;
1201 case Opt_data_ordered:
1202 data_opt = EXT3_MOUNT_ORDERED_DATA;
1203 goto datacheck;
1204 case Opt_data_writeback:
1205 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
1206 datacheck:
1207 if (is_remount) {
1208 if (test_opt(sb, DATA_FLAGS) == data_opt)
1209 break;
1210 ext3_msg(sb, KERN_ERR,
1211 "error: cannot change "
1212 "data mode on remount. The filesystem "
1213 "is mounted in data=%s mode and you "
1214 "try to remount it in data=%s mode.",
1215 data_mode_string(test_opt(sb,
1216 DATA_FLAGS)),
1217 data_mode_string(data_opt));
1218 return 0;
1219 } else {
1220 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
1221 sbi->s_mount_opt |= data_opt;
1222 }
1223 break;
1224 case Opt_data_err_abort:
1225 set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1226 break;
1227 case Opt_data_err_ignore:
1228 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1229 break;
1230#ifdef CONFIG_QUOTA
1231 case Opt_usrjquota:
1232 if (!set_qf_name(sb, USRQUOTA, &args[0]))
1233 return 0;
1234 break;
1235 case Opt_grpjquota:
1236 if (!set_qf_name(sb, GRPQUOTA, &args[0]))
1237 return 0;
1238 break;
1239 case Opt_offusrjquota:
1240 if (!clear_qf_name(sb, USRQUOTA))
1241 return 0;
1242 break;
1243 case Opt_offgrpjquota:
1244 if (!clear_qf_name(sb, GRPQUOTA))
1245 return 0;
1246 break;
1247 case Opt_jqfmt_vfsold:
1248 qfmt = QFMT_VFS_OLD;
1249 goto set_qf_format;
1250 case Opt_jqfmt_vfsv0:
1251 qfmt = QFMT_VFS_V0;
1252 goto set_qf_format;
1253 case Opt_jqfmt_vfsv1:
1254 qfmt = QFMT_VFS_V1;
1255set_qf_format:
1256 if (sb_any_quota_loaded(sb) &&
1257 sbi->s_jquota_fmt != qfmt) {
1258 ext3_msg(sb, KERN_ERR, "error: cannot change "
1259 "journaled quota options when "
1260 "quota turned on.");
1261 return 0;
1262 }
1263 sbi->s_jquota_fmt = qfmt;
1264 break;
1265 case Opt_quota:
1266 case Opt_usrquota:
1267 set_opt(sbi->s_mount_opt, QUOTA);
1268 set_opt(sbi->s_mount_opt, USRQUOTA);
1269 break;
1270 case Opt_grpquota:
1271 set_opt(sbi->s_mount_opt, QUOTA);
1272 set_opt(sbi->s_mount_opt, GRPQUOTA);
1273 break;
1274 case Opt_noquota:
1275 if (sb_any_quota_loaded(sb)) {
1276 ext3_msg(sb, KERN_ERR, "error: cannot change "
1277 "quota options when quota turned on.");
1278 return 0;
1279 }
1280 clear_opt(sbi->s_mount_opt, QUOTA);
1281 clear_opt(sbi->s_mount_opt, USRQUOTA);
1282 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1283 break;
1284#else
1285 case Opt_quota:
1286 case Opt_usrquota:
1287 case Opt_grpquota:
1288 ext3_msg(sb, KERN_ERR,
1289 "error: quota options not supported.");
1290 break;
1291 case Opt_usrjquota:
1292 case Opt_grpjquota:
1293 case Opt_offusrjquota:
1294 case Opt_offgrpjquota:
1295 case Opt_jqfmt_vfsold:
1296 case Opt_jqfmt_vfsv0:
1297 case Opt_jqfmt_vfsv1:
1298 ext3_msg(sb, KERN_ERR,
1299 "error: journaled quota options not "
1300 "supported.");
1301 break;
1302 case Opt_noquota:
1303 break;
1304#endif
1305 case Opt_abort:
1306 set_opt(sbi->s_mount_opt, ABORT);
1307 break;
1308 case Opt_nobarrier:
1309 clear_opt(sbi->s_mount_opt, BARRIER);
1310 break;
1311 case Opt_barrier:
1312 if (args[0].from) {
1313 if (match_int(&args[0], &option))
1314 return 0;
1315 } else
1316 option = 1; /* No argument, default to 1 */
1317 if (option)
1318 set_opt(sbi->s_mount_opt, BARRIER);
1319 else
1320 clear_opt(sbi->s_mount_opt, BARRIER);
1321 break;
1322 case Opt_ignore:
1323 break;
1324 case Opt_resize:
1325 if (!is_remount) {
1326 ext3_msg(sb, KERN_ERR,
1327 "error: resize option only available "
1328 "for remount");
1329 return 0;
1330 }
1331 if (match_int(&args[0], &option) != 0)
1332 return 0;
1333 *n_blocks_count = option;
1334 break;
1335 case Opt_nobh:
1336 ext3_msg(sb, KERN_WARNING,
1337 "warning: ignoring deprecated nobh option");
1338 break;
1339 case Opt_bh:
1340 ext3_msg(sb, KERN_WARNING,
1341 "warning: ignoring deprecated bh option");
1342 break;
1343 default:
1344 ext3_msg(sb, KERN_ERR,
1345 "error: unrecognized mount option \"%s\" "
1346 "or missing value", p);
1347 return 0;
1348 }
1349 }
1350#ifdef CONFIG_QUOTA
1351 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1352 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
1353 clear_opt(sbi->s_mount_opt, USRQUOTA);
1354 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
1355 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1356
1357 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
1358 ext3_msg(sb, KERN_ERR, "error: old and new quota "
1359 "format mixing.");
1360 return 0;
1361 }
1362
1363 if (!sbi->s_jquota_fmt) {
1364 ext3_msg(sb, KERN_ERR, "error: journaled quota format "
1365 "not specified.");
1366 return 0;
1367 }
1368 }
1369#endif
1370 return 1;
1371}
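
parse_options() is built on the match_token()/substring_t machinery from <linux/parser.h>; the ext3 tokens table itself lives in an earlier hunk of this file. A minimal sketch of that idiom with a made-up two-entry table (Opt_commit_demo, Opt_err_demo and demo_parse are illustrative names, not the real ext3 symbols):

    #include <linux/parser.h>
    #include <linux/string.h>

    enum { Opt_commit_demo, Opt_err_demo };

    static const match_table_t demo_tokens = {
            {Opt_commit_demo, "commit=%u"},  /* %u captures an int argument */
            {Opt_err_demo, NULL},            /* terminator / no match */
    };

    /* Sketch: pull "commit=<n>" out of a comma-separated option string. */
    static int demo_parse(char *options, int *interval)
    {
            substring_t args[MAX_OPT_ARGS];
            char *p;

            while ((p = strsep(&options, ",")) != NULL) {
                    int token, option;

                    if (!*p)
                            continue;
                    token = match_token(p, demo_tokens, args);
                    switch (token) {
                    case Opt_commit_demo:
                            if (match_int(&args[0], &option))
                                    return 0;
                            *interval = option;
                            break;
                    default:
                            return 0;       /* unrecognized option */
                    }
            }
            return 1;
    }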
1372
1373static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1374 int read_only)
1375{
1376 struct ext3_sb_info *sbi = EXT3_SB(sb);
1377 int res = 0;
1378
1379 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
1380 ext3_msg(sb, KERN_ERR,
1381 "error: revision level too high, "
1382 "forcing read-only mode");
1383 res = MS_RDONLY;
1384 }
1385 if (read_only)
1386 return res;
1387 if (!(sbi->s_mount_state & EXT3_VALID_FS))
1388 ext3_msg(sb, KERN_WARNING,
1389 "warning: mounting unchecked fs, "
1390 "running e2fsck is recommended");
1391 else if ((sbi->s_mount_state & EXT3_ERROR_FS))
1392 ext3_msg(sb, KERN_WARNING,
1393 "warning: mounting fs with errors, "
1394 "running e2fsck is recommended");
1395 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1396 le16_to_cpu(es->s_mnt_count) >=
1397 le16_to_cpu(es->s_max_mnt_count))
1398 ext3_msg(sb, KERN_WARNING,
1399 "warning: maximal mount count reached, "
1400 "running e2fsck is recommended");
1401 else if (le32_to_cpu(es->s_checkinterval) &&
1402 (le32_to_cpu(es->s_lastcheck) +
1403 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1404 ext3_msg(sb, KERN_WARNING,
1405 "warning: checktime reached, "
1406 "running e2fsck is recommended");
1407#if 0
1408 /* @@@ We _will_ want to clear the valid bit if we find
1409 inconsistencies, to force a fsck at reboot. But for
1410 a plain journaled filesystem we can keep it set as
1411 valid forever! :) */
1412 es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
1413#endif
1414 if (!le16_to_cpu(es->s_max_mnt_count))
1415 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
1416 le16_add_cpu(&es->s_mnt_count, 1);
1417 es->s_mtime = cpu_to_le32(get_seconds());
1418 ext3_update_dynamic_rev(sb);
1419 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
1420
1421 ext3_commit_super(sb, es, 1);
1422 if (test_opt(sb, DEBUG))
1423 ext3_msg(sb, KERN_INFO, "[bs=%lu, gc=%lu, "
1424 "bpg=%lu, ipg=%lu, mo=%04lx]",
1425 sb->s_blocksize,
1426 sbi->s_groups_count,
1427 EXT3_BLOCKS_PER_GROUP(sb),
1428 EXT3_INODES_PER_GROUP(sb),
1429 sbi->s_mount_opt);
1430
1431 if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
1432 char b[BDEVNAME_SIZE];
1433 ext3_msg(sb, KERN_INFO, "using external journal on %s",
1434 bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
1435 } else {
1436 ext3_msg(sb, KERN_INFO, "using internal journal");
1437 }
1438 cleancache_init_fs(sb);
1439 return res;
1440}
1441
1442/* Called at mount-time, super-block is locked */
1443static int ext3_check_descriptors(struct super_block *sb)
1444{
1445 struct ext3_sb_info *sbi = EXT3_SB(sb);
1446 int i;
1447
1448 ext3_debug ("Checking group descriptors");
1449
1450 for (i = 0; i < sbi->s_groups_count; i++) {
1451 struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL);
1452 ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i);
1453 ext3_fsblk_t last_block;
1454
1455 if (i == sbi->s_groups_count - 1)
1456 last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
1457 else
1458 last_block = first_block +
1459 (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1460
1461 if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
1462 le32_to_cpu(gdp->bg_block_bitmap) > last_block)
1463 {
1464 ext3_error (sb, "ext3_check_descriptors",
1465 "Block bitmap for group %d"
1466 " not in group (block %lu)!",
1467 i, (unsigned long)
1468 le32_to_cpu(gdp->bg_block_bitmap));
1469 return 0;
1470 }
1471 if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
1472 le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
1473 {
1474 ext3_error (sb, "ext3_check_descriptors",
1475 "Inode bitmap for group %d"
1476 " not in group (block %lu)!",
1477 i, (unsigned long)
1478 le32_to_cpu(gdp->bg_inode_bitmap));
1479 return 0;
1480 }
1481 if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
1482 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group - 1 >
1483 last_block)
1484 {
1485 ext3_error (sb, "ext3_check_descriptors",
1486 "Inode table for group %d"
1487 " not in group (block %lu)!",
1488 i, (unsigned long)
1489 le32_to_cpu(gdp->bg_inode_table));
1490 return 0;
1491 }
1492 }
1493
1494 sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb));
1495 sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb));
1496 return 1;
1497}
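
The first/last block computation above follows directly from the on-disk layout: group i starts at s_first_data_block + i * blocks_per_group, and the last group is truncated at s_blocks_count. A stand-alone sketch of that arithmetic with illustrative parameters (1kB blocks, so s_first_data_block is 1):

    #include <stdio.h>

    int main(void)
    {
            unsigned long first_data_block = 1;     /* 1 for 1kB blocks, 0 otherwise */
            unsigned long blocks_per_group = 8192;
            unsigned long blocks_count = 100000;    /* illustrative volume size */
            unsigned long ngroups, i;

            /* Same rounding ext3_fill_super() uses for s_groups_count. */
            ngroups = (blocks_count - first_data_block - 1) / blocks_per_group + 1;
            for (i = 0; i < ngroups; i++) {
                    unsigned long first = first_data_block + i * blocks_per_group;
                    unsigned long last = (i == ngroups - 1) ?
                            blocks_count - 1 : first + blocks_per_group - 1;
                    printf("group %lu: blocks %lu-%lu\n", i, first, last);
            }
            return 0;
    }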
1498
1499
1500/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
1501 * the superblock) which were deleted from all directories, but held open by
1502 * a process at the time of a crash. We walk the list and try to delete these
1503 * inodes at recovery time (only with a read-write filesystem).
1504 *
1505 * In order to keep the orphan inode chain consistent during traversal (in
1506 * case of crash during recovery), we link each inode into the superblock
1507 * orphan list_head and handle it the same way as an inode deletion during
1508 * normal operation (which journals the operations for us).
1509 *
1510 * We only do an iget() and an iput() on each inode, which is very safe if we
1511 * accidentally point at an in-use or already deleted inode. The worst that
1512 * can happen in this case is that we get a "bit already cleared" message from
1513 * ext3_free_inode(). The only reason we would point at a wrong inode is if
1514 * e2fsck was run on this filesystem, and it must have already done the orphan
1515 * inode cleanup for us, so we can safely abort without any further action.
1516 */
1517static void ext3_orphan_cleanup (struct super_block * sb,
1518 struct ext3_super_block * es)
1519{
1520 unsigned int s_flags = sb->s_flags;
1521 int nr_orphans = 0, nr_truncates = 0;
1522#ifdef CONFIG_QUOTA
1523 int i;
1524#endif
1525 if (!es->s_last_orphan) {
1526 jbd_debug(4, "no orphan inodes to clean up\n");
1527 return;
1528 }
1529
1530 if (bdev_read_only(sb->s_bdev)) {
1531 ext3_msg(sb, KERN_ERR, "error: write access "
1532 "unavailable, skipping orphan cleanup.");
1533 return;
1534 }
1535
1536 /* Check if feature set allows readwrite operations */
1537 if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
1538 ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
1539 "unknown ROCOMPAT features");
1540 return;
1541 }
1542
1543 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1544 /* don't clear list on RO mount w/ errors */
1545 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
1546 jbd_debug(1, "Errors on filesystem, "
1547 "clearing orphan list.\n");
1548 es->s_last_orphan = 0;
1549 }
1550 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1551 return;
1552 }
1553
1554 if (s_flags & MS_RDONLY) {
1555 ext3_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
1556 sb->s_flags &= ~MS_RDONLY;
1557 }
1558#ifdef CONFIG_QUOTA
1559 /* Needed for iput() to work correctly and not trash data */
1560 sb->s_flags |= MS_ACTIVE;
1561 /* Turn on quotas so that they are updated correctly */
1562 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1563 if (EXT3_SB(sb)->s_qf_names[i]) {
1564 int ret = ext3_quota_on_mount(sb, i);
1565 if (ret < 0)
1566 ext3_msg(sb, KERN_ERR,
1567 "error: cannot turn on journaled "
1568 "quota: %d", ret);
1569 }
1570 }
1571#endif
1572
1573 while (es->s_last_orphan) {
1574 struct inode *inode;
1575
1576 inode = ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
1577 if (IS_ERR(inode)) {
1578 es->s_last_orphan = 0;
1579 break;
1580 }
1581
1582 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1583 dquot_initialize(inode);
1584 if (inode->i_nlink) {
1585 printk(KERN_DEBUG
1586 "%s: truncating inode %lu to %Ld bytes\n",
1587 __func__, inode->i_ino, inode->i_size);
1588 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1589 inode->i_ino, inode->i_size);
1590 ext3_truncate(inode);
1591 nr_truncates++;
1592 } else {
1593 printk(KERN_DEBUG
1594 "%s: deleting unreferenced inode %lu\n",
1595 __func__, inode->i_ino);
1596 jbd_debug(2, "deleting unreferenced inode %lu\n",
1597 inode->i_ino);
1598 nr_orphans++;
1599 }
1600 iput(inode); /* The delete magic happens here! */
1601 }
1602
1603#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1604
1605 if (nr_orphans)
1606 ext3_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
1607 PLURAL(nr_orphans));
1608 if (nr_truncates)
1609 ext3_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
1610 PLURAL(nr_truncates));
1611#ifdef CONFIG_QUOTA
1612 /* Turn quotas off */
1613 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1614 if (sb_dqopt(sb)->files[i])
1615 dquot_quota_off(sb, i);
1616 }
1617#endif
1618 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1619}
1620
1621/*
1622 * Maximal file size. There is a direct, and {,double-,triple-}indirect
1623 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
1624 * We need to be 1 filesystem block less than the 2^32 sector limit.
1625 */
1626static loff_t ext3_max_size(int bits)
1627{
1628 loff_t res = EXT3_NDIR_BLOCKS;
1629 int meta_blocks;
1630 loff_t upper_limit;
1631
1632 /* This is calculated to be the largest file size for a
1633 * dense file, such that the total number of sectors in
1634 * the file, including data and all indirect blocks, does
1635 * not exceed 2^32 - 1, with the __u32 i_blocks field
1636 * representing the total number of 512-byte blocks of
1637 * the file.
1638 */
1639 upper_limit = (1LL << 32) - 1;
1640
1641 /* total blocks, in units of the filesystem block size */
1642 upper_limit >>= (bits - 9);
1643
1644
1645 /* indirect blocks */
1646 meta_blocks = 1;
1647 /* double indirect blocks */
1648 meta_blocks += 1 + (1LL << (bits-2));
1650 /* triple indirect blocks */
1650 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
1651
1652 upper_limit -= meta_blocks;
1653 upper_limit <<= bits;
1654
1655 res += 1LL << (bits-2);
1656 res += 1LL << (2*(bits-2));
1657 res += 1LL << (3*(bits-2));
1658 res <<= bits;
1659 if (res > upper_limit)
1660 res = upper_limit;
1661
1662 if (res > MAX_LFS_FILESIZE)
1663 res = MAX_LFS_FILESIZE;
1664
1665 return res;
1666}
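
A worked instance of this arithmetic for 4kB blocks (bits = 12), as a stand-alone sketch: the indirect-tree limit comes out near 4 TiB, but the i_blocks sector limit wins at roughly 2 TiB, the familiar ext3 maximum file size at that block size.

    #include <stdio.h>

    int main(void)
    {
            int bits = 12;                  /* 4kB filesystem blocks */
            long long meta, upper, tree;

            /* 2^32 - 1 sectors, converted to filesystem blocks */
            upper = ((1LL << 32) - 1) >> (bits - 9);
            /* single, double and triple indirect metadata blocks */
            meta = 1;
            meta += 1 + (1LL << (bits - 2));
            meta += 1 + (1LL << (bits - 2)) + (1LL << (2 * (bits - 2)));
            upper = (upper - meta) << bits;         /* ~2.0 TiB */

            tree = 12;                      /* EXT3_NDIR_BLOCKS direct blocks */
            tree += 1LL << (bits - 2);
            tree += 1LL << (2 * (bits - 2));
            tree += 1LL << (3 * (bits - 2));
            tree <<= bits;                          /* ~4.0 TiB */

            printf("tree limit %lld, sector limit %lld\n", tree, upper);
            return 0;
    }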
1667
1668static ext3_fsblk_t descriptor_loc(struct super_block *sb,
1669 ext3_fsblk_t logic_sb_block,
1670 int nr)
1671{
1672 struct ext3_sb_info *sbi = EXT3_SB(sb);
1673 unsigned long bg, first_meta_bg;
1674 int has_super = 0;
1675
1676 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1677
1678 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
1679 nr < first_meta_bg)
1680 return (logic_sb_block + nr + 1);
1681 bg = sbi->s_desc_per_block * nr;
1682 if (ext3_bg_has_super(sb, bg))
1683 has_super = 1;
1684 return (has_super + ext3_group_first_block_no(sb, bg));
1685}
1686
1687
1688static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1689{
1690 struct buffer_head * bh;
1691 struct ext3_super_block *es = NULL;
1692 struct ext3_sb_info *sbi;
1693 ext3_fsblk_t block;
1694 ext3_fsblk_t sb_block = get_sb_block(&data, sb);
1695 ext3_fsblk_t logic_sb_block;
1696 unsigned long offset = 0;
1697 unsigned int journal_inum = 0;
1698 unsigned long journal_devnum = 0;
1699 unsigned long def_mount_opts;
1700 struct inode *root;
1701 int blocksize;
1702 int hblock;
1703 int db_count;
1704 int i;
1705 int needs_recovery;
1706 int ret = -EINVAL;
1707 __le32 features;
1708 int err;
1709
1710 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1711 if (!sbi)
1712 return -ENOMEM;
1713
1714 sbi->s_blockgroup_lock =
1715 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
1716 if (!sbi->s_blockgroup_lock) {
1717 kfree(sbi);
1718 return -ENOMEM;
1719 }
1720 sb->s_fs_info = sbi;
1721 sbi->s_sb_block = sb_block;
1722
1723 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1724 if (!blocksize) {
1725 ext3_msg(sb, KERN_ERR, "error: unable to set blocksize");
1726 goto out_fail;
1727 }
1728
1729 /*
1730 * The ext3 superblock will not be buffer-aligned for block
1731 * sizes other than 1kB; calculate the offset from buffer start.
1732 */
1733 if (blocksize != EXT3_MIN_BLOCK_SIZE) {
1734 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1735 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1736 } else {
1737 logic_sb_block = sb_block;
1738 }
1739
1740 if (!(bh = sb_bread(sb, logic_sb_block))) {
1741 ext3_msg(sb, KERN_ERR, "error: unable to read superblock");
1742 goto out_fail;
1743 }
1744 /*
1745 * Note: s_es must be initialized as soon as possible because
1746 * some ext3 macros depend on its value.
1747 */
1748 es = (struct ext3_super_block *) (bh->b_data + offset);
1749 sbi->s_es = es;
1750 sb->s_magic = le16_to_cpu(es->s_magic);
1751 if (sb->s_magic != EXT3_SUPER_MAGIC)
1752 goto cantfind_ext3;
1753
1754 /* Set defaults before we parse the mount options */
1755 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1756 if (def_mount_opts & EXT3_DEFM_DEBUG)
1757 set_opt(sbi->s_mount_opt, DEBUG);
1758 if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
1759 set_opt(sbi->s_mount_opt, GRPID);
1760 if (def_mount_opts & EXT3_DEFM_UID16)
1761 set_opt(sbi->s_mount_opt, NO_UID32);
1762#ifdef CONFIG_EXT3_FS_XATTR
1763 if (def_mount_opts & EXT3_DEFM_XATTR_USER)
1764 set_opt(sbi->s_mount_opt, XATTR_USER);
1765#endif
1766#ifdef CONFIG_EXT3_FS_POSIX_ACL
1767 if (def_mount_opts & EXT3_DEFM_ACL)
1768 set_opt(sbi->s_mount_opt, POSIX_ACL);
1769#endif
1770 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1771 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1772 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1773 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1774 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1775 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
1776
1777 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1778 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1779 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_CONTINUE)
1780 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1781 else
1782 set_opt(sbi->s_mount_opt, ERRORS_RO);
1783
1784 sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
1785 sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
1786
1787 /* enable barriers by default */
1788 set_opt(sbi->s_mount_opt, BARRIER);
1789 set_opt(sbi->s_mount_opt, RESERVATION);
1790
1791 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1792 NULL, 0))
1793 goto failed_mount;
1794
1795 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1796 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
1797
1798 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1799 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
1800 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1801 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1802 ext3_msg(sb, KERN_WARNING,
1803 "warning: feature flags set on rev 0 fs, "
1804 "running e2fsck is recommended");
1805 /*
1806 * Check feature flags regardless of the revision level, since we
1807 * previously didn't change the revision level when setting the flags,
1808 * so there is a chance incompat flags are set on a rev 0 filesystem.
1809 */
1810 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
1811 if (features) {
1812 ext3_msg(sb, KERN_ERR,
1813 "error: couldn't mount because of unsupported "
1814 "optional features (%x)", le32_to_cpu(features));
1815 goto failed_mount;
1816 }
1817 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
1818 if (!(sb->s_flags & MS_RDONLY) && features) {
1819 ext3_msg(sb, KERN_ERR,
1820 "error: couldn't mount RDWR because of unsupported "
1821 "optional features (%x)", le32_to_cpu(features));
1822 goto failed_mount;
1823 }
1824 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1825
1826 if (blocksize < EXT3_MIN_BLOCK_SIZE ||
1827 blocksize > EXT3_MAX_BLOCK_SIZE) {
1828 ext3_msg(sb, KERN_ERR,
1829 "error: couldn't mount because of unsupported "
1830 "filesystem blocksize %d", blocksize);
1831 goto failed_mount;
1832 }
1833
1834 hblock = bdev_logical_block_size(sb->s_bdev);
1835 if (sb->s_blocksize != blocksize) {
1836 /*
1837 * Make sure the blocksize for the filesystem is at least
1838 * as large as the hardware sector size for the device.
1839 */
1840 if (blocksize < hblock) {
1841 ext3_msg(sb, KERN_ERR,
1842 "error: fsblocksize %d too small for "
1843 "hardware sectorsize %d", blocksize, hblock);
1844 goto failed_mount;
1845 }
1846
1847 brelse (bh);
1848 if (!sb_set_blocksize(sb, blocksize)) {
1849 ext3_msg(sb, KERN_ERR,
1850 "error: bad blocksize %d", blocksize);
1851 goto out_fail;
1852 }
1853 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1854 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1855 bh = sb_bread(sb, logic_sb_block);
1856 if (!bh) {
1857 ext3_msg(sb, KERN_ERR,
1858 "error: can't read superblock on 2nd try");
1859 goto failed_mount;
1860 }
1861 es = (struct ext3_super_block *)(bh->b_data + offset);
1862 sbi->s_es = es;
1863 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1864 ext3_msg(sb, KERN_ERR,
1865 "error: magic mismatch");
1866 goto failed_mount;
1867 }
1868 }
1869
1870 sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
1871
1872 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
1873 sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
1874 sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
1875 } else {
1876 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1877 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1878 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1879 (!is_power_of_2(sbi->s_inode_size)) ||
1880 (sbi->s_inode_size > blocksize)) {
1881 ext3_msg(sb, KERN_ERR,
1882 "error: unsupported inode size: %d",
1883 sbi->s_inode_size);
1884 goto failed_mount;
1885 }
1886 }
1887 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
1888 le32_to_cpu(es->s_log_frag_size);
1889 if (blocksize != sbi->s_frag_size) {
1890 ext3_msg(sb, KERN_ERR,
1891 "error: fragsize %lu != blocksize %u (unsupported)",
1892 sbi->s_frag_size, blocksize);
1893 goto failed_mount;
1894 }
1895 sbi->s_frags_per_block = 1;
1896 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1897 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1898 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1899 if (EXT3_INODE_SIZE(sb) == 0 || EXT3_INODES_PER_GROUP(sb) == 0)
1900 goto cantfind_ext3;
1901 sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
1902 if (sbi->s_inodes_per_block == 0)
1903 goto cantfind_ext3;
1904 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1905 sbi->s_inodes_per_block;
1906 sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
1907 sbi->s_sbh = bh;
1908 sbi->s_mount_state = le16_to_cpu(es->s_state);
1909 sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
1910 sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
1911 for (i = 0; i < 4; i++)
1912 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1913 sbi->s_def_hash_version = es->s_def_hash_version;
1914 i = le32_to_cpu(es->s_flags);
1915 if (i & EXT2_FLAGS_UNSIGNED_HASH)
1916 sbi->s_hash_unsigned = 3;
1917 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
1918#ifdef __CHAR_UNSIGNED__
1919 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
1920 sbi->s_hash_unsigned = 3;
1921#else
1922 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1923#endif
1924 }
1925
1926 if (sbi->s_blocks_per_group > blocksize * 8) {
1927 ext3_msg(sb, KERN_ERR,
1928 "#blocks per group too big: %lu",
1929 sbi->s_blocks_per_group);
1930 goto failed_mount;
1931 }
1932 if (sbi->s_frags_per_group > blocksize * 8) {
1933 ext3_msg(sb, KERN_ERR,
1934 "error: #fragments per group too big: %lu",
1935 sbi->s_frags_per_group);
1936 goto failed_mount;
1937 }
1938 if (sbi->s_inodes_per_group > blocksize * 8) {
1939 ext3_msg(sb, KERN_ERR,
1940 "error: #inodes per group too big: %lu",
1941 sbi->s_inodes_per_group);
1942 goto failed_mount;
1943 }
1944
1945 err = generic_check_addressable(sb->s_blocksize_bits,
1946 le32_to_cpu(es->s_blocks_count));
1947 if (err) {
1948 ext3_msg(sb, KERN_ERR,
1949 "error: filesystem is too large to mount safely");
1950 if (sizeof(sector_t) < 8)
1951 ext3_msg(sb, KERN_ERR,
1952 "error: CONFIG_LBDAF not enabled");
1953 ret = err;
1954 goto failed_mount;
1955 }
1956
1957 if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
1958 goto cantfind_ext3;
1959 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1960 le32_to_cpu(es->s_first_data_block) - 1)
1961 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1962 db_count = DIV_ROUND_UP(sbi->s_groups_count, EXT3_DESC_PER_BLOCK(sb));
1963 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1964 GFP_KERNEL);
1965 if (sbi->s_group_desc == NULL) {
1966 ext3_msg(sb, KERN_ERR,
1967 "error: not enough memory");
1968 ret = -ENOMEM;
1969 goto failed_mount;
1970 }
1971
1972 bgl_lock_init(sbi->s_blockgroup_lock);
1973
1974 for (i = 0; i < db_count; i++) {
1975 block = descriptor_loc(sb, logic_sb_block, i);
1976 sbi->s_group_desc[i] = sb_bread(sb, block);
1977 if (!sbi->s_group_desc[i]) {
1978 ext3_msg(sb, KERN_ERR,
1979 "error: can't read group descriptor %d", i);
1980 db_count = i;
1981 goto failed_mount2;
1982 }
1983 }
1984 if (!ext3_check_descriptors (sb)) {
1985 ext3_msg(sb, KERN_ERR,
1986 "error: group descriptors corrupted");
1987 goto failed_mount2;
1988 }
1989 sbi->s_gdb_count = db_count;
1990 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1991 spin_lock_init(&sbi->s_next_gen_lock);
1992
1993 /* per-filesystem reservation list head & lock */
1994 spin_lock_init(&sbi->s_rsv_window_lock);
1995 sbi->s_rsv_window_root = RB_ROOT;
1996 /* Add a single, static dummy reservation to the start of the
1997 * reservation window list --- it gives us a placeholder for
1998 * append-at-start-of-list which makes the allocation logic
1999 * _much_ simpler. */
2000 sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
2001 sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
2002 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
2003 sbi->s_rsv_window_head.rsv_goal_size = 0;
2004 ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
2005
2006 /*
2007 * set up enough so that it can read an inode
2008 */
2009 sb->s_op = &ext3_sops;
2010 sb->s_export_op = &ext3_export_ops;
2011 sb->s_xattr = ext3_xattr_handlers;
2012#ifdef CONFIG_QUOTA
2013 sb->s_qcop = &ext3_qctl_operations;
2014 sb->dq_op = &ext3_quota_operations;
2015 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
2016#endif
2017 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
2018 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
2019 mutex_init(&sbi->s_orphan_lock);
2020 mutex_init(&sbi->s_resize_lock);
2021
2022 sb->s_root = NULL;
2023
2024 needs_recovery = (es->s_last_orphan != 0 ||
2025 EXT3_HAS_INCOMPAT_FEATURE(sb,
2026 EXT3_FEATURE_INCOMPAT_RECOVER));
2027
2028 /*
2029 * The first inode we look at is the journal inode. Don't try
2030 * root first: it may be modified in the journal!
2031 */
2032 if (!test_opt(sb, NOLOAD) &&
2033 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
2034 if (ext3_load_journal(sb, es, journal_devnum))
2035 goto failed_mount2;
2036 } else if (journal_inum) {
2037 if (ext3_create_journal(sb, es, journal_inum))
2038 goto failed_mount2;
2039 } else {
2040 if (!silent)
2041 ext3_msg(sb, KERN_ERR,
2042 "error: no journal found. "
2043 "mounting ext3 over ext2?");
2044 goto failed_mount2;
2045 }
2046 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2047 ext3_count_free_blocks(sb), GFP_KERNEL);
2048 if (!err) {
2049 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2050 ext3_count_free_inodes(sb), GFP_KERNEL);
2051 }
2052 if (!err) {
2053 err = percpu_counter_init(&sbi->s_dirs_counter,
2054 ext3_count_dirs(sb), GFP_KERNEL);
2055 }
2056 if (err) {
2057 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
2058 ret = err;
2059 goto failed_mount3;
2060 }
2061
2062 /* We have now updated the journal if required, so we can
2063 * validate the data journaling mode. */
2064 switch (test_opt(sb, DATA_FLAGS)) {
2065 case 0:
2066 /* No mode set, assume a default based on the journal
2067 capabilities: ORDERED_DATA if the journal can
2068 cope, else JOURNAL_DATA */
2069 if (journal_check_available_features
2070 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
2071 set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
2072 else
2073 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
2074 break;
2075
2076 case EXT3_MOUNT_ORDERED_DATA:
2077 case EXT3_MOUNT_WRITEBACK_DATA:
2078 if (!journal_check_available_features
2079 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
2080 ext3_msg(sb, KERN_ERR,
2081 "error: journal does not support "
2082 "requested data journaling mode");
2083 goto failed_mount3;
2084 }
2085 default:
2086 break;
2087 }
2088
2089 /*
2090 * The journal_load will have done any necessary log recovery,
2091 * so we can safely mount the rest of the filesystem now.
2092 */
2093
2094 root = ext3_iget(sb, EXT3_ROOT_INO);
2095 if (IS_ERR(root)) {
2096 ext3_msg(sb, KERN_ERR, "error: get root inode failed");
2097 ret = PTR_ERR(root);
2098 goto failed_mount3;
2099 }
2100 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
2101 iput(root);
2102 ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
2103 goto failed_mount3;
2104 }
2105 sb->s_root = d_make_root(root);
2106 if (!sb->s_root) {
2107 ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
2108 ret = -ENOMEM;
2109 goto failed_mount3;
2110 }
2111
2112 if (ext3_setup_super(sb, es, sb->s_flags & MS_RDONLY))
2113 sb->s_flags |= MS_RDONLY;
2114
2115 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
2116 ext3_orphan_cleanup(sb, es);
2117 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
2118 if (needs_recovery) {
2119 ext3_mark_recovery_complete(sb, es);
2120 ext3_msg(sb, KERN_INFO, "recovery complete");
2121 }
2122 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
2123 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
2124 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
2125 "writeback");
2126
2127 return 0;
2128
2129cantfind_ext3:
2130 if (!silent)
2131 ext3_msg(sb, KERN_INFO,
2132 "error: can't find ext3 filesystem on dev %s.",
2133 sb->s_id);
2134 goto failed_mount;
2135
2136failed_mount3:
2137 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2138 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2139 percpu_counter_destroy(&sbi->s_dirs_counter);
2140 journal_destroy(sbi->s_journal);
2141failed_mount2:
2142 for (i = 0; i < db_count; i++)
2143 brelse(sbi->s_group_desc[i]);
2144 kfree(sbi->s_group_desc);
2145failed_mount:
2146#ifdef CONFIG_QUOTA
2147 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2148 kfree(sbi->s_qf_names[i]);
2149#endif
2150 ext3_blkdev_remove(sbi);
2151 brelse(bh);
2152out_fail:
2153 sb->s_fs_info = NULL;
2154 kfree(sbi->s_blockgroup_lock);
2155 kfree(sbi);
2156 return ret;
2157}
2158
2159/*
2160 * Setup any per-fs journal parameters now. We'll do this both on
2161 * initial mount, once the journal has been initialised but before we've
2162 * done any recovery; and again on any subsequent remount.
2163 */
2164static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
2165{
2166 struct ext3_sb_info *sbi = EXT3_SB(sb);
2167
2168 if (sbi->s_commit_interval)
2169 journal->j_commit_interval = sbi->s_commit_interval;
2170 /* We could also set up an ext3-specific default for the commit
2171 * interval here, but for now we'll just fall back to the jbd
2172 * default. */
2173
2174 spin_lock(&journal->j_state_lock);
2175 if (test_opt(sb, BARRIER))
2176 journal->j_flags |= JFS_BARRIER;
2177 else
2178 journal->j_flags &= ~JFS_BARRIER;
2179 if (test_opt(sb, DATA_ERR_ABORT))
2180 journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
2181 else
2182 journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
2183 spin_unlock(&journal->j_state_lock);
2184}
2185
2186static journal_t *ext3_get_journal(struct super_block *sb,
2187 unsigned int journal_inum)
2188{
2189 struct inode *journal_inode;
2190 journal_t *journal;
2191
2192 /* First, test for the existence of a valid inode on disk. Bad
2193 * things happen if we iget() an unused inode, as the subsequent
2194 * iput() will try to delete it. */
2195
2196 journal_inode = ext3_iget(sb, journal_inum);
2197 if (IS_ERR(journal_inode)) {
2198 ext3_msg(sb, KERN_ERR, "error: no journal found");
2199 return NULL;
2200 }
2201 if (!journal_inode->i_nlink) {
2202 make_bad_inode(journal_inode);
2203 iput(journal_inode);
2204 ext3_msg(sb, KERN_ERR, "error: journal inode is deleted");
2205 return NULL;
2206 }
2207
2208 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
2209 journal_inode, journal_inode->i_size);
2210 if (!S_ISREG(journal_inode->i_mode)) {
2211 ext3_msg(sb, KERN_ERR, "error: invalid journal inode");
2212 iput(journal_inode);
2213 return NULL;
2214 }
2215
2216 journal = journal_init_inode(journal_inode);
2217 if (!journal) {
2218 ext3_msg(sb, KERN_ERR, "error: could not load journal inode");
2219 iput(journal_inode);
2220 return NULL;
2221 }
2222 journal->j_private = sb;
2223 ext3_init_journal_params(sb, journal);
2224 return journal;
2225}
2226
2227static journal_t *ext3_get_dev_journal(struct super_block *sb,
2228 dev_t j_dev)
2229{
2230 struct buffer_head * bh;
2231 journal_t *journal;
2232 ext3_fsblk_t start;
2233 ext3_fsblk_t len;
2234 int hblock, blocksize;
2235 ext3_fsblk_t sb_block;
2236 unsigned long offset;
2237 struct ext3_super_block * es;
2238 struct block_device *bdev;
2239
2240 bdev = ext3_blkdev_get(j_dev, sb);
2241 if (bdev == NULL)
2242 return NULL;
2243
2244 blocksize = sb->s_blocksize;
2245 hblock = bdev_logical_block_size(bdev);
2246 if (blocksize < hblock) {
2247 ext3_msg(sb, KERN_ERR,
2248 "error: blocksize too small for journal device");
2249 goto out_bdev;
2250 }
2251
2252 sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
2253 offset = EXT3_MIN_BLOCK_SIZE % blocksize;
2254 set_blocksize(bdev, blocksize);
2255 if (!(bh = __bread(bdev, sb_block, blocksize))) {
2256 ext3_msg(sb, KERN_ERR, "error: couldn't read superblock of "
2257 "external journal");
2258 goto out_bdev;
2259 }
2260
2261 es = (struct ext3_super_block *) (bh->b_data + offset);
2262 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
2263 !(le32_to_cpu(es->s_feature_incompat) &
2264 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2265 ext3_msg(sb, KERN_ERR, "error: external journal has "
2266 "bad superblock");
2267 brelse(bh);
2268 goto out_bdev;
2269 }
2270
2271 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2272 ext3_msg(sb, KERN_ERR, "error: journal UUID does not match");
2273 brelse(bh);
2274 goto out_bdev;
2275 }
2276
2277 len = le32_to_cpu(es->s_blocks_count);
2278 start = sb_block + 1;
2279 brelse(bh); /* we're done with the superblock */
2280
2281 journal = journal_init_dev(bdev, sb->s_bdev,
2282 start, len, blocksize);
2283 if (!journal) {
2284 ext3_msg(sb, KERN_ERR,
2285 "error: failed to create device journal");
2286 goto out_bdev;
2287 }
2288 journal->j_private = sb;
2289 if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
2290 if (bh_submit_read(journal->j_sb_buffer)) {
2291 ext3_msg(sb, KERN_ERR, "I/O error on journal device");
2292 goto out_journal;
2293 }
2294 }
2295 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2296 ext3_msg(sb, KERN_ERR,
2297 "error: external journal has more than one "
2298 "user (unsupported) - %d",
2299 be32_to_cpu(journal->j_superblock->s_nr_users));
2300 goto out_journal;
2301 }
2302 EXT3_SB(sb)->journal_bdev = bdev;
2303 ext3_init_journal_params(sb, journal);
2304 return journal;
2305out_journal:
2306 journal_destroy(journal);
2307out_bdev:
2308 ext3_blkdev_put(bdev);
2309 return NULL;
2310}
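
As an aside, an external journal device like the one probed here is normally created with "mke2fs -O journal_dev <dev>" and attached with "tune2fs -J device=<dev>" (standard e2fsprogs usage; exact flags may vary by version); the UUID comparison above is what ties a filesystem to its journal device, and the s_nr_users check rejects journals shared by more than one filesystem.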
2311
2312static int ext3_load_journal(struct super_block *sb,
2313 struct ext3_super_block *es,
2314 unsigned long journal_devnum)
2315{
2316 journal_t *journal;
2317 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
2318 dev_t journal_dev;
2319 int err = 0;
2320 int really_read_only;
2321
2322 if (journal_devnum &&
2323 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2324 ext3_msg(sb, KERN_INFO, "external journal device major/minor "
2325 "numbers have changed");
2326 journal_dev = new_decode_dev(journal_devnum);
2327 } else
2328 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
2329
2330 really_read_only = bdev_read_only(sb->s_bdev);
2331
2332 /*
2333 * Are we loading a blank journal or performing recovery after a
2334 * crash? For recovery, we need to check in advance whether we
2335 * can get read-write access to the device.
2336 */
2337
2338 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
2339 if (sb->s_flags & MS_RDONLY) {
2340 ext3_msg(sb, KERN_INFO,
2341 "recovery required on readonly filesystem");
2342 if (really_read_only) {
2343 ext3_msg(sb, KERN_ERR, "error: write access "
2344 "unavailable, cannot proceed");
2345 return -EROFS;
2346 }
2347 ext3_msg(sb, KERN_INFO,
2348 "write access will be enabled during recovery");
2349 }
2350 }
2351
2352 if (journal_inum && journal_dev) {
2353 ext3_msg(sb, KERN_ERR, "error: filesystem has both journal "
2354 "and inode journals");
2355 return -EINVAL;
2356 }
2357
2358 if (journal_inum) {
2359 if (!(journal = ext3_get_journal(sb, journal_inum)))
2360 return -EINVAL;
2361 } else {
2362 if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
2363 return -EINVAL;
2364 }
2365
2366 if (!(journal->j_flags & JFS_BARRIER))
2367 printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
2368
2369 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2370 err = journal_update_format(journal);
2371 if (err) {
2372 ext3_msg(sb, KERN_ERR, "error updating journal");
2373 journal_destroy(journal);
2374 return err;
2375 }
2376 }
2377
2378 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
2379 err = journal_wipe(journal, !really_read_only);
2380 if (!err)
2381 err = journal_load(journal);
2382
2383 if (err) {
2384 ext3_msg(sb, KERN_ERR, "error loading journal");
2385 journal_destroy(journal);
2386 return err;
2387 }
2388
2389 EXT3_SB(sb)->s_journal = journal;
2390 ext3_clear_journal_err(sb, es);
2391
2392 if (!really_read_only && journal_devnum &&
2393 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2394 es->s_journal_dev = cpu_to_le32(journal_devnum);
2395
2396 /* Make sure we flush the recovery flag to disk. */
2397 ext3_commit_super(sb, es, 1);
2398 }
2399
2400 return 0;
2401}
2402
2403static int ext3_create_journal(struct super_block *sb,
2404 struct ext3_super_block *es,
2405 unsigned int journal_inum)
2406{
2407 journal_t *journal;
2408 int err;
2409
2410 if (sb->s_flags & MS_RDONLY) {
2411 ext3_msg(sb, KERN_ERR,
2412 "error: readonly filesystem when trying to "
2413 "create journal");
2414 return -EROFS;
2415 }
2416
2417 journal = ext3_get_journal(sb, journal_inum);
2418 if (!journal)
2419 return -EINVAL;
2420
2421 ext3_msg(sb, KERN_INFO, "creating new journal on inode %u",
2422 journal_inum);
2423
2424 err = journal_create(journal);
2425 if (err) {
2426 ext3_msg(sb, KERN_ERR, "error creating journal");
2427 journal_destroy(journal);
2428 return -EIO;
2429 }
2430
2431 EXT3_SB(sb)->s_journal = journal;
2432
2433 ext3_update_dynamic_rev(sb);
2434 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2435 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
2436
2437 es->s_journal_inum = cpu_to_le32(journal_inum);
2438
2439 /* Make sure we flush the recovery flag to disk. */
2440 ext3_commit_super(sb, es, 1);
2441
2442 return 0;
2443}
2444
2445static int ext3_commit_super(struct super_block *sb,
2446 struct ext3_super_block *es,
2447 int sync)
2448{
2449 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
2450 int error = 0;
2451
2452 if (!sbh)
2453 return error;
2454
2455 if (buffer_write_io_error(sbh)) {
2456 /*
2457 * Oh, dear. A previous attempt to write the
2458 * superblock failed. This could happen because the
2459 * USB device was yanked out. Or it could happen to
2460 * be a transient write error and maybe the block will
2461 * be remapped. Nothing we can do but to retry the
2462 * write and hope for the best.
2463 */
2464 ext3_msg(sb, KERN_ERR, "previous I/O error to "
2465 "superblock detected");
2466 clear_buffer_write_io_error(sbh);
2467 set_buffer_uptodate(sbh);
2468 }
2469 /*
2470 * If the file system is mounted read-only, don't update the
2471 * superblock write time. This avoids updating the superblock
2472 * write time when we are mounting the root file system
2473 * read/only but we need to replay the journal; at that point,
2474 * for people who are east of GMT and who make their clock
2475 * tick in localtime for Windows bug-for-bug compatibility,
2476 * the clock is set in the future, and this will cause e2fsck
2477 * to complain and force a full file system check.
2478 */
2479 if (!(sb->s_flags & MS_RDONLY))
2480 es->s_wtime = cpu_to_le32(get_seconds());
2481 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2482 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2483 BUFFER_TRACE(sbh, "marking dirty");
2484 mark_buffer_dirty(sbh);
2485 if (sync) {
2486 error = sync_dirty_buffer(sbh);
2487 if (buffer_write_io_error(sbh)) {
2488 ext3_msg(sb, KERN_ERR, "I/O error while writing "
2489 "superblock");
2490 clear_buffer_write_io_error(sbh);
2491 set_buffer_uptodate(sbh);
2492 }
2493 }
2494 return error;
2495}
2496
2497
2498/*
2499 * Have we just finished recovery? If so, and if we are mounting (or
2500 * remounting) the filesystem readonly, then we will end up with a
2501 * consistent fs on disk. Record that fact.
2502 */
2503static void ext3_mark_recovery_complete(struct super_block * sb,
2504 struct ext3_super_block * es)
2505{
2506 journal_t *journal = EXT3_SB(sb)->s_journal;
2507
2508 journal_lock_updates(journal);
2509 if (journal_flush(journal) < 0)
2510 goto out;
2511
2512 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
2513 sb->s_flags & MS_RDONLY) {
2514 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2515 ext3_commit_super(sb, es, 1);
2516 }
2517
2518out:
2519 journal_unlock_updates(journal);
2520}
2521
2522/*
2523 * If we are mounting (or read-write remounting) a filesystem whose journal
2524 * has recorded an error from a previous lifetime, move that error to the
2525 * main filesystem now.
2526 */
2527static void ext3_clear_journal_err(struct super_block *sb,
2528 struct ext3_super_block *es)
2529{
2530 journal_t *journal;
2531 int j_errno;
2532 const char *errstr;
2533
2534 journal = EXT3_SB(sb)->s_journal;
2535
2536 /*
2537 * Now check for any error status which may have been recorded in the
2538 * journal by a prior ext3_error() or ext3_abort()
2539 */
2540
2541 j_errno = journal_errno(journal);
2542 if (j_errno) {
2543 char nbuf[16];
2544
2545 errstr = ext3_decode_error(sb, j_errno, nbuf);
2546 ext3_warning(sb, __func__, "Filesystem error recorded "
2547 "from previous mount: %s", errstr);
2548 ext3_warning(sb, __func__, "Marking fs in need of "
2549 "filesystem check.");
2550
2551 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
2552 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
2553 ext3_commit_super (sb, es, 1);
2554
2555 journal_clear_err(journal);
2556 }
2557}
2558
2559/*
2560 * Force the running and committing transactions to commit,
2561 * and wait on the commit.
2562 */
2563int ext3_force_commit(struct super_block *sb)
2564{
2565 journal_t *journal;
2566 int ret;
2567
2568 if (sb->s_flags & MS_RDONLY)
2569 return 0;
2570
2571 journal = EXT3_SB(sb)->s_journal;
2572 ret = ext3_journal_force_commit(journal);
2573 return ret;
2574}
2575
2576static int ext3_sync_fs(struct super_block *sb, int wait)
2577{
2578 tid_t target;
2579
2580 trace_ext3_sync_fs(sb, wait);
2581 /*
2582 * Write back quota in the non-journaled quota case - journaled
2583 * quota has no dirty dquots.
2584 */
2585 dquot_writeback_dquots(sb, -1);
2586 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2587 if (wait)
2588 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2589 }
2590 return 0;
2591}
2592
2593/*
2594 * LVM calls this function before a (read-only) snapshot is created. This
2595 * gives us a chance to flush the journal completely and mark the fs clean.
2596 */
2597static int ext3_freeze(struct super_block *sb)
2598{
2599 int error = 0;
2600 journal_t *journal;
2601
2602 if (!(sb->s_flags & MS_RDONLY)) {
2603 journal = EXT3_SB(sb)->s_journal;
2604
2605 /* Now we set up the journal barrier. */
2606 journal_lock_updates(journal);
2607
2608 /*
2609 * We don't want to clear the needs_recovery flag if
2610 * flushing the journal failed.
2611 */
2612 error = journal_flush(journal);
2613 if (error < 0)
2614 goto out;
2615
2616 /* Journal blocked and flushed, clear needs_recovery flag. */
2617 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2618 error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2619 if (error)
2620 goto out;
2621 }
2622 return 0;
2623
2624out:
2625 journal_unlock_updates(journal);
2626 return error;
2627}
2628
2629/*
2630 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2631 * flag here, even though the filesystem is not technically dirty yet.
2632 */
2633static int ext3_unfreeze(struct super_block *sb)
2634{
2635 if (!(sb->s_flags & MS_RDONLY)) {
2636 /* Reset the needs_recovery flag before the fs is unlocked. */
2637 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2638 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2639 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2640 }
2641 return 0;
2642}
2643
2644static int ext3_remount (struct super_block * sb, int * flags, char * data)
2645{
2646 struct ext3_super_block * es;
2647 struct ext3_sb_info *sbi = EXT3_SB(sb);
2648 ext3_fsblk_t n_blocks_count = 0;
2649 unsigned long old_sb_flags;
2650 struct ext3_mount_options old_opts;
2651 int enable_quota = 0;
2652 int err;
2653#ifdef CONFIG_QUOTA
2654 int i;
2655#endif
2656
2657 sync_filesystem(sb);
2658
2659 /* Store the original options */
2660 old_sb_flags = sb->s_flags;
2661 old_opts.s_mount_opt = sbi->s_mount_opt;
2662 old_opts.s_resuid = sbi->s_resuid;
2663 old_opts.s_resgid = sbi->s_resgid;
2664 old_opts.s_commit_interval = sbi->s_commit_interval;
2665#ifdef CONFIG_QUOTA
2666 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2667 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2668 if (sbi->s_qf_names[i]) {
2669 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
2670 GFP_KERNEL);
2671 if (!old_opts.s_qf_names[i]) {
2672 int j;
2673
2674 for (j = 0; j < i; j++)
2675 kfree(old_opts.s_qf_names[j]);
2676 return -ENOMEM;
2677 }
2678 } else
2679 old_opts.s_qf_names[i] = NULL;
2680#endif
2681
2682 /*
2683 * Allow the "check" option to be passed as a remount option.
2684 */
2685 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2686 err = -EINVAL;
2687 goto restore_opts;
2688 }
2689
2690 if (test_opt(sb, ABORT))
2691 ext3_abort(sb, __func__, "Abort forced by user");
2692
2693 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2694 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
2695
2696 es = sbi->s_es;
2697
2698 ext3_init_journal_params(sb, sbi->s_journal);
2699
2700 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2701 n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
2702 if (test_opt(sb, ABORT)) {
2703 err = -EROFS;
2704 goto restore_opts;
2705 }
2706
2707 if (*flags & MS_RDONLY) {
2708 err = dquot_suspend(sb, -1);
2709 if (err < 0)
2710 goto restore_opts;
2711
2712 /*
2713 * First of all, the unconditional stuff we have to do
2714 * to disable replay of the journal when we next remount
2715 */
2716 sb->s_flags |= MS_RDONLY;
2717
2718 /*
2719 * OK, test if we are remounting a valid rw partition
2720 * readonly, and if so set the rdonly flag and then
2721 * mark the partition as valid again.
2722 */
2723 if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
2724 (sbi->s_mount_state & EXT3_VALID_FS))
2725 es->s_state = cpu_to_le16(sbi->s_mount_state);
2726
2727 ext3_mark_recovery_complete(sb, es);
2728 } else {
2729 __le32 ret;
2730 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
2731 ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
2732 ext3_msg(sb, KERN_WARNING,
2733 "warning: couldn't remount RDWR "
2734 "because of unsupported optional "
2735 "features (%x)", le32_to_cpu(ret));
2736 err = -EROFS;
2737 goto restore_opts;
2738 }
2739
2740 /*
2741 * If we have an unprocessed orphan list hanging
2742 * around from a previously readonly bdev mount,
2743 * require a full umount & mount for now.
2744 */
2745 if (es->s_last_orphan) {
2746 ext3_msg(sb, KERN_WARNING, "warning: couldn't "
2747 "remount RDWR because of unprocessed "
2748 "orphan inode list. Please "
2749 "umount & mount instead.");
2750 err = -EINVAL;
2751 goto restore_opts;
2752 }
2753
2754 /*
2755 * Mounting a RDONLY partition read-write, so reread
2756 * and store the current valid flag. (It may have
2757 * been changed by e2fsck since we originally mounted
2758 * the partition.)
2759 */
2760 ext3_clear_journal_err(sb, es);
2761 sbi->s_mount_state = le16_to_cpu(es->s_state);
2762 if ((err = ext3_group_extend(sb, es, n_blocks_count)))
2763 goto restore_opts;
2764 if (!ext3_setup_super (sb, es, 0))
2765 sb->s_flags &= ~MS_RDONLY;
2766 enable_quota = 1;
2767 }
2768 }
2769#ifdef CONFIG_QUOTA
2770 /* Release old quota file names */
2771 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2772 kfree(old_opts.s_qf_names[i]);
2773#endif
2774 if (enable_quota)
2775 dquot_resume(sb, -1);
2776 return 0;
2777restore_opts:
2778 sb->s_flags = old_sb_flags;
2779 sbi->s_mount_opt = old_opts.s_mount_opt;
2780 sbi->s_resuid = old_opts.s_resuid;
2781 sbi->s_resgid = old_opts.s_resgid;
2782 sbi->s_commit_interval = old_opts.s_commit_interval;
2783#ifdef CONFIG_QUOTA
2784 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2785 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
2786 kfree(sbi->s_qf_names[i]);
2787 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2788 }
2789#endif
2790 return err;
2791}
2792
2793static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2794{
2795 struct super_block *sb = dentry->d_sb;
2796 struct ext3_sb_info *sbi = EXT3_SB(sb);
2797 struct ext3_super_block *es = sbi->s_es;
2798 u64 fsid;
2799
2800 if (test_opt(sb, MINIX_DF)) {
2801 sbi->s_overhead_last = 0;
2802 } else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
2803 unsigned long ngroups = sbi->s_groups_count, i;
2804 ext3_fsblk_t overhead = 0;
2805 smp_rmb();
2806
2807 /*
2808 * Compute the overhead (FS structures). This is constant
2809 * for a given filesystem unless the number of block groups
2810 * changes so we cache the previous value until it does.
2811 */
2812
2813 /*
2814 * All of the blocks before first_data_block are
2815 * overhead
2816 */
2817 overhead = le32_to_cpu(es->s_first_data_block);
2818
2819 /*
2820 * Add the overhead attributed to the superblock and
2821 * block group descriptors. If the sparse superblocks
2822 * feature is turned on, then not all groups have this.
2823 */
2824 for (i = 0; i < ngroups; i++) {
2825 overhead += ext3_bg_has_super(sb, i) +
2826 ext3_bg_num_gdb(sb, i);
2827 cond_resched();
2828 }
2829
2830 /*
2831 * Every block group has an inode bitmap, a block
2832 * bitmap, and an inode table.
2833 */
2834 overhead += ngroups * (2 + sbi->s_itb_per_group);
2835
2836 /* Add the internal journal blocks as well */
2837 if (sbi->s_journal && !sbi->journal_bdev)
2838 overhead += sbi->s_journal->j_maxlen;
2839
2840 sbi->s_overhead_last = overhead;
2841 smp_wmb();
2842 sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
2843 }
2844
2845 buf->f_type = EXT3_SUPER_MAGIC;
2846 buf->f_bsize = sb->s_blocksize;
2847 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
2848 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
2849 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
2850 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
2851 buf->f_bavail = 0;
2852 buf->f_files = le32_to_cpu(es->s_inodes_count);
2853 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
2854 buf->f_namelen = EXT3_NAME_LEN;
2855 fsid = le64_to_cpup((void *)es->s_uuid) ^
2856 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
2857 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
2858 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
2859 return 0;
2860}
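
The cached overhead reduces to a short formula: all blocks before s_first_data_block, plus a superblock and group-descriptor copy for each group that carries one, plus two bitmap blocks and the inode table per group, plus the internal journal. A stand-alone sketch with purely illustrative parameters:

    #include <stdio.h>

    int main(void)
    {
            unsigned long ngroups = 100, itb_per_group = 128;
            unsigned long sb_copies = 10, gdb_blocks = 1;   /* illustrative */
            unsigned long journal_blocks = 8192;            /* internal journal */
            unsigned long overhead;

            overhead = 0;   /* s_first_data_block, 0 for 4kB blocks */
            overhead += sb_copies * (1 + gdb_blocks);       /* sb + descriptors */
            overhead += ngroups * (2 + itb_per_group);      /* bitmaps + itable */
            overhead += journal_blocks;
            printf("overhead: %lu blocks\n", overhead);
            return 0;
    }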
2861
2862/* Helper function for writing quotas on sync - we need to start a transaction
2863 * before the quota file is locked for write. Otherwise there are possible deadlocks:
2864 * Process 1 Process 2
2865 * ext3_create() quota_sync()
2866 * journal_start() write_dquot()
2867 * dquot_initialize() down(dqio_mutex)
2868 * down(dqio_mutex) journal_start()
2869 *
2870 */
2871
2872#ifdef CONFIG_QUOTA
2873
2874static inline struct inode *dquot_to_inode(struct dquot *dquot)
2875{
2876 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
2877}
2878
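/*
 * The quota helpers below share a common pattern: start a journal handle
 * with enough credits for the operation, call the generic dquot helper,
 * stop the handle, and return the first error encountered.
 */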
2879static int ext3_write_dquot(struct dquot *dquot)
2880{
2881 int ret, err;
2882 handle_t *handle;
2883 struct inode *inode;
2884
2885 inode = dquot_to_inode(dquot);
2886 handle = ext3_journal_start(inode,
2887 EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2888 if (IS_ERR(handle))
2889 return PTR_ERR(handle);
2890 ret = dquot_commit(dquot);
2891 err = ext3_journal_stop(handle);
2892 if (!ret)
2893 ret = err;
2894 return ret;
2895}
2896
2897static int ext3_acquire_dquot(struct dquot *dquot)
2898{
2899 int ret, err;
2900 handle_t *handle;
2901
2902 handle = ext3_journal_start(dquot_to_inode(dquot),
2903 EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2904 if (IS_ERR(handle))
2905 return PTR_ERR(handle);
2906 ret = dquot_acquire(dquot);
2907 err = ext3_journal_stop(handle);
2908 if (!ret)
2909 ret = err;
2910 return ret;
2911}
2912
2913static int ext3_release_dquot(struct dquot *dquot)
2914{
2915 int ret, err;
2916 handle_t *handle;
2917
2918 handle = ext3_journal_start(dquot_to_inode(dquot),
2919 EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2920 if (IS_ERR(handle)) {
2921 /* Release dquot anyway to avoid endless cycle in dqput() */
2922 dquot_release(dquot);
2923 return PTR_ERR(handle);
2924 }
2925 ret = dquot_release(dquot);
2926 err = ext3_journal_stop(handle);
2927 if (!ret)
2928 ret = err;
2929 return ret;
2930}
2931
2932static int ext3_mark_dquot_dirty(struct dquot *dquot)
2933{
2934 /* Are we journaling quotas? */
2935 if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2936 EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2937 dquot_mark_dquot_dirty(dquot);
2938 return ext3_write_dquot(dquot);
2939 } else {
2940 return dquot_mark_dquot_dirty(dquot);
2941 }
2942}
2943
2944static int ext3_write_info(struct super_block *sb, int type)
2945{
2946 int ret, err;
2947 handle_t *handle;
2948
2949 /* Data block + inode block */
2950 handle = ext3_journal_start(d_inode(sb->s_root), 2);
2951 if (IS_ERR(handle))
2952 return PTR_ERR(handle);
2953 ret = dquot_commit_info(sb, type);
2954 err = ext3_journal_stop(handle);
2955 if (!ret)
2956 ret = err;
2957 return ret;
2958}
2959
2960/*
2961 * Turn on quotas at mount time - we need to find the quota file by the
2962 * name stored in the superblock info and enable quotas on it.
2963 */
2964static int ext3_quota_on_mount(struct super_block *sb, int type)
2965{
2966 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2967 EXT3_SB(sb)->s_jquota_fmt, type);
2968}
2969
2970/*
2971 * Standard function to be called on quota_on
2972 */
2973static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2974 struct path *path)
2975{
2976 int err;
2977
2978 if (!test_opt(sb, QUOTA))
2979 return -EINVAL;
2980
2981 /* Quotafile not on the same filesystem? */
2982 if (path->dentry->d_sb != sb)
2983 return -EXDEV;
2984 /* Journaling quota? */
2985 if (EXT3_SB(sb)->s_qf_names[type]) {
2986		/* Quota file not in the fs root? */
2987 if (path->dentry->d_parent != sb->s_root)
2988 ext3_msg(sb, KERN_WARNING,
2989 "warning: Quota file not on filesystem root. "
2990 "Journaled quota will not work.");
2991 }
2992
2993 /*
2994	 * When we journal data on the quota file, we have to flush the journal to
2995	 * see all updates to the file when we bypass the pagecache...
2996 */
2997 if (ext3_should_journal_data(d_inode(path->dentry))) {
2998 /*
2999 * We don't need to lock updates but journal_flush() could
3000 * otherwise be livelocked...
3001 */
3002 journal_lock_updates(EXT3_SB(sb)->s_journal);
3003 err = journal_flush(EXT3_SB(sb)->s_journal);
3004 journal_unlock_updates(EXT3_SB(sb)->s_journal);
3005 if (err)
3006 return err;
3007 }
3008
3009 return dquot_quota_on(sb, type, format_id, path);
3010}
3011
3012/* Read data from the quota file - bypass the pagecache because we cannot
3013 * afford to acquire its locks. As quota files are never truncated and the
3014 * quota code itself serializes the operations (and no one else should touch
3015 * the files), we do not have to worry about races. */
3016static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
3017 size_t len, loff_t off)
3018{
3019 struct inode *inode = sb_dqopt(sb)->files[type];
3020 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
3021 int err = 0;
3022 int offset = off & (sb->s_blocksize - 1);
3023 int tocopy;
3024 size_t toread;
3025 struct buffer_head *bh;
3026 loff_t i_size = i_size_read(inode);
3027
3028 if (off > i_size)
3029 return 0;
3030 if (off+len > i_size)
3031 len = i_size-off;
3032 toread = len;
3033 while (toread > 0) {
3034 tocopy = sb->s_blocksize - offset < toread ?
3035 sb->s_blocksize - offset : toread;
3036 bh = ext3_bread(NULL, inode, blk, 0, &err);
3037 if (err)
3038 return err;
3039 if (!bh) /* A hole? */
3040 memset(data, 0, tocopy);
3041 else
3042 memcpy(data, bh->b_data+offset, tocopy);
3043 brelse(bh);
3044 offset = 0;
3045 toread -= tocopy;
3046 data += tocopy;
3047 blk++;
3048 }
3049 return len;
3050}
3051
3052/* Write to quotafile (we know the transaction is already started and has
3053 * enough credits) */
3054static ssize_t ext3_quota_write(struct super_block *sb, int type,
3055 const char *data, size_t len, loff_t off)
3056{
3057 struct inode *inode = sb_dqopt(sb)->files[type];
3058 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
3059 int err = 0;
3060 int offset = off & (sb->s_blocksize - 1);
3061 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
3062 struct buffer_head *bh;
3063 handle_t *handle = journal_current_handle();
3064
3065 if (!handle) {
3066 ext3_msg(sb, KERN_WARNING,
3067 "warning: quota write (off=%llu, len=%llu)"
3068			" cancelled because the transaction is not started.",
3069 (unsigned long long)off, (unsigned long long)len);
3070 return -EIO;
3071 }
3072
3073 /*
3074	 * Since we account for only one data block in the transaction
3075	 * credits, the write must not cross a block boundary.
3076 */
3077 if (sb->s_blocksize - offset < len) {
3078 ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
3079 " cancelled because not block aligned",
3080 (unsigned long long)off, (unsigned long long)len);
3081 return -EIO;
3082 }
3083 bh = ext3_bread(handle, inode, blk, 1, &err);
3084 if (!bh)
3085 goto out;
3086 if (journal_quota) {
3087 err = ext3_journal_get_write_access(handle, bh);
3088 if (err) {
3089 brelse(bh);
3090 goto out;
3091 }
3092 }
3093 lock_buffer(bh);
3094 memcpy(bh->b_data+offset, data, len);
3095 flush_dcache_page(bh->b_page);
3096 unlock_buffer(bh);
3097 if (journal_quota)
3098 err = ext3_journal_dirty_metadata(handle, bh);
3099 else {
3100 /* Always do at least ordered writes for quotas */
3101 err = ext3_journal_dirty_data(handle, bh);
3102 mark_buffer_dirty(bh);
3103 }
3104 brelse(bh);
3105out:
3106 if (err)
3107 return err;
3108 if (inode->i_size < off + len) {
3109 i_size_write(inode, off + len);
3110 EXT3_I(inode)->i_disksize = inode->i_size;
3111 }
3112 inode->i_version++;
3113 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
3114 ext3_mark_inode_dirty(handle, inode);
3115 return len;
3116}
3117
3118#endif
3119
3120static struct dentry *ext3_mount(struct file_system_type *fs_type,
3121 int flags, const char *dev_name, void *data)
3122{
3123 return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
3124}
3125
3126static struct file_system_type ext3_fs_type = {
3127 .owner = THIS_MODULE,
3128 .name = "ext3",
3129 .mount = ext3_mount,
3130 .kill_sb = kill_block_super,
3131 .fs_flags = FS_REQUIRES_DEV,
3132};
3133MODULE_ALIAS_FS("ext3");
3134
3135static int __init init_ext3_fs(void)
3136{
3137 int err = init_ext3_xattr();
3138 if (err)
3139 return err;
3140 err = init_inodecache();
3141 if (err)
3142 goto out1;
3143 err = register_filesystem(&ext3_fs_type);
3144 if (err)
3145 goto out;
3146 return 0;
3147out:
3148 destroy_inodecache();
3149out1:
3150 exit_ext3_xattr();
3151 return err;
3152}
3153
3154static void __exit exit_ext3_fs(void)
3155{
3156 unregister_filesystem(&ext3_fs_type);
3157 destroy_inodecache();
3158 exit_ext3_xattr();
3159}
3160
3161MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3162MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
3163MODULE_LICENSE("GPL");
3164module_init(init_ext3_fs)
3165module_exit(exit_ext3_fs)
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
deleted file mode 100644
index c08c59094ae6..000000000000
--- a/fs/ext3/symlink.c
+++ /dev/null
@@ -1,46 +0,0 @@
1/*
2 * linux/fs/ext3/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext3 symlink handling code
18 */
19
20#include "ext3.h"
21#include "xattr.h"
22
23const struct inode_operations ext3_symlink_inode_operations = {
24 .readlink = generic_readlink,
25 .follow_link = page_follow_link_light,
26 .put_link = page_put_link,
27 .setattr = ext3_setattr,
28#ifdef CONFIG_EXT3_FS_XATTR
29 .setxattr = generic_setxattr,
30 .getxattr = generic_getxattr,
31 .listxattr = ext3_listxattr,
32 .removexattr = generic_removexattr,
33#endif
34};
35
36const struct inode_operations ext3_fast_symlink_inode_operations = {
37 .readlink = generic_readlink,
38 .follow_link = simple_follow_link,
39 .setattr = ext3_setattr,
40#ifdef CONFIG_EXT3_FS_XATTR
41 .setxattr = generic_setxattr,
42 .getxattr = generic_getxattr,
43 .listxattr = ext3_listxattr,
44 .removexattr = generic_removexattr,
45#endif
46};
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
deleted file mode 100644
index 7cf36501ccf4..000000000000
--- a/fs/ext3/xattr.c
+++ /dev/null
@@ -1,1330 +0,0 @@
1/*
2 * linux/fs/ext3/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * Attributes in inodes and on blocks have different headers; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
52
53#include "ext3.h"
54#include <linux/mbcache.h>
55#include <linux/quotaops.h>
56#include "xattr.h"
57#include "acl.h"
58
59#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
60#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
61#define BFIRST(bh) ENTRY(BHDR(bh)+1)
62#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
63
64#define IHDR(inode, raw_inode) \
65 ((struct ext3_xattr_ibody_header *) \
66 ((void *)raw_inode + \
67 EXT3_GOOD_OLD_INODE_SIZE + \
68 EXT3_I(inode)->i_extra_isize))
69#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1))
70
71#ifdef EXT3_XATTR_DEBUG
72# define ea_idebug(inode, f...) do { \
73 printk(KERN_DEBUG "inode %s:%lu: ", \
74 inode->i_sb->s_id, inode->i_ino); \
75 printk(f); \
76 printk("\n"); \
77 } while (0)
78# define ea_bdebug(bh, f...) do { \
79 char b[BDEVNAME_SIZE]; \
80 printk(KERN_DEBUG "block %s:%lu: ", \
81 bdevname(bh->b_bdev, b), \
82 (unsigned long) bh->b_blocknr); \
83 printk(f); \
84 printk("\n"); \
85 } while (0)
86#else
87# define ea_idebug(f...)
88# define ea_bdebug(f...)
89#endif
90
91static void ext3_xattr_cache_insert(struct buffer_head *);
92static struct buffer_head *ext3_xattr_cache_find(struct inode *,
93 struct ext3_xattr_header *,
94 struct mb_cache_entry **);
95static void ext3_xattr_rehash(struct ext3_xattr_header *,
96 struct ext3_xattr_entry *);
97static int ext3_xattr_list(struct dentry *dentry, char *buffer,
98 size_t buffer_size);
99
100static struct mb_cache *ext3_xattr_cache;
101
102static const struct xattr_handler *ext3_xattr_handler_map[] = {
103 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
104#ifdef CONFIG_EXT3_FS_POSIX_ACL
105 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
106 [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
107#endif
108 [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
109#ifdef CONFIG_EXT3_FS_SECURITY
110 [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler,
111#endif
112};
113
114const struct xattr_handler *ext3_xattr_handlers[] = {
115 &ext3_xattr_user_handler,
116 &ext3_xattr_trusted_handler,
117#ifdef CONFIG_EXT3_FS_POSIX_ACL
118 &posix_acl_access_xattr_handler,
119 &posix_acl_default_xattr_handler,
120#endif
121#ifdef CONFIG_EXT3_FS_SECURITY
122 &ext3_xattr_security_handler,
123#endif
124 NULL
125};
126
127static inline const struct xattr_handler *
128ext3_xattr_handler(int name_index)
129{
130 const struct xattr_handler *handler = NULL;
131
132 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
133 handler = ext3_xattr_handler_map[name_index];
134 return handler;
135}
136
137/*
138 * Inode operation listxattr()
139 *
140 * d_inode(dentry)->i_mutex: don't care
141 */
142ssize_t
143ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
144{
145 return ext3_xattr_list(dentry, buffer, size);
146}
147
148static int
149ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end)
150{
151 while (!IS_LAST_ENTRY(entry)) {
152 struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
153 if ((void *)next >= end)
154 return -EIO;
155 entry = next;
156 }
157 return 0;
158}
159
160static inline int
161ext3_xattr_check_block(struct buffer_head *bh)
162{
163 int error;
164
165 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
166 BHDR(bh)->h_blocks != cpu_to_le32(1))
167 return -EIO;
168 error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
169 return error;
170}
171
172static inline int
173ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size)
174{
175 size_t value_size = le32_to_cpu(entry->e_value_size);
176
177 if (entry->e_value_block != 0 || value_size > size ||
178 le16_to_cpu(entry->e_value_offs) + value_size > size)
179 return -EIO;
180 return 0;
181}
182
183static int
184ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
185 const char *name, size_t size, int sorted)
186{
187 struct ext3_xattr_entry *entry;
188 size_t name_len;
189 int cmp = 1;
190
191 if (name == NULL)
192 return -EINVAL;
193 name_len = strlen(name);
194 entry = *pentry;
195 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
196 cmp = name_index - entry->e_name_index;
197 if (!cmp)
198 cmp = name_len - entry->e_name_len;
199 if (!cmp)
200 cmp = memcmp(name, entry->e_name, name_len);
201 if (cmp <= 0 && (sorted || cmp == 0))
202 break;
203 }
204 *pentry = entry;
205 if (!cmp && ext3_xattr_check_entry(entry, size))
206 return -EIO;
207 return cmp ? -ENODATA : 0;
208}
209
210static int
211ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
212 void *buffer, size_t buffer_size)
213{
214 struct buffer_head *bh = NULL;
215 struct ext3_xattr_entry *entry;
216 size_t size;
217 int error;
218
219 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
220 name_index, name, buffer, (long)buffer_size);
221
222 error = -ENODATA;
223 if (!EXT3_I(inode)->i_file_acl)
224 goto cleanup;
225 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
226 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
227 if (!bh)
228 goto cleanup;
229 ea_bdebug(bh, "b_count=%d, refcount=%d",
230 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
231 if (ext3_xattr_check_block(bh)) {
232bad_block: ext3_error(inode->i_sb, __func__,
233 "inode %lu: bad block "E3FSBLK, inode->i_ino,
234 EXT3_I(inode)->i_file_acl);
235 error = -EIO;
236 goto cleanup;
237 }
238 ext3_xattr_cache_insert(bh);
239 entry = BFIRST(bh);
240 error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
241 if (error == -EIO)
242 goto bad_block;
243 if (error)
244 goto cleanup;
245 size = le32_to_cpu(entry->e_value_size);
246 if (buffer) {
247 error = -ERANGE;
248 if (size > buffer_size)
249 goto cleanup;
250 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
251 size);
252 }
253 error = size;
254
255cleanup:
256 brelse(bh);
257 return error;
258}
259
260static int
261ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
262 void *buffer, size_t buffer_size)
263{
264 struct ext3_xattr_ibody_header *header;
265 struct ext3_xattr_entry *entry;
266 struct ext3_inode *raw_inode;
267 struct ext3_iloc iloc;
268 size_t size;
269 void *end;
270 int error;
271
272 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
273 return -ENODATA;
274 error = ext3_get_inode_loc(inode, &iloc);
275 if (error)
276 return error;
277 raw_inode = ext3_raw_inode(&iloc);
278 header = IHDR(inode, raw_inode);
279 entry = IFIRST(header);
280 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
281 error = ext3_xattr_check_names(entry, end);
282 if (error)
283 goto cleanup;
284 error = ext3_xattr_find_entry(&entry, name_index, name,
285 end - (void *)entry, 0);
286 if (error)
287 goto cleanup;
288 size = le32_to_cpu(entry->e_value_size);
289 if (buffer) {
290 error = -ERANGE;
291 if (size > buffer_size)
292 goto cleanup;
293 memcpy(buffer, (void *)IFIRST(header) +
294 le16_to_cpu(entry->e_value_offs), size);
295 }
296 error = size;
297
298cleanup:
299 brelse(iloc.bh);
300 return error;
301}
302
303/*
304 * ext3_xattr_get()
305 *
306 * Copy an extended attribute into the buffer provided, or compute
307 * the required buffer size. A NULL buffer means only the size is
308 * computed and returned.
309 *
310 * Returns a negative error number on failure, or the number of bytes
311 * used / required on success.
312 */
313int
314ext3_xattr_get(struct inode *inode, int name_index, const char *name,
315 void *buffer, size_t buffer_size)
316{
317 int error;
318
319 down_read(&EXT3_I(inode)->xattr_sem);
320 error = ext3_xattr_ibody_get(inode, name_index, name, buffer,
321 buffer_size);
322 if (error == -ENODATA)
323 error = ext3_xattr_block_get(inode, name_index, name, buffer,
324 buffer_size);
325 up_read(&EXT3_I(inode)->xattr_sem);
326 return error;
327}
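/*
 * Illustrative usage sketch (not from the original source; the attribute
 * name "foo" is hypothetical): callers typically probe the size with a
 * NULL buffer, then allocate and fetch.
 *
 *	len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, "foo", NULL, 0);
 *	if (len > 0) {
 *		buf = kmalloc(len, GFP_NOFS);
 *		if (buf)
 *			len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER,
 *					     "foo", buf, len);
 *	}
 */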
328
329static int
330ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
331 char *buffer, size_t buffer_size)
332{
333 size_t rest = buffer_size;
334
335 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
336 const struct xattr_handler *handler =
337 ext3_xattr_handler(entry->e_name_index);
338
339 if (handler) {
340 size_t size = handler->list(dentry, buffer, rest,
341 entry->e_name,
342 entry->e_name_len,
343 handler->flags);
344 if (buffer) {
345 if (size > rest)
346 return -ERANGE;
347 buffer += size;
348 }
349 rest -= size;
350 }
351 }
352 return buffer_size - rest;
353}
354
355static int
356ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
357{
358 struct inode *inode = d_inode(dentry);
359 struct buffer_head *bh = NULL;
360 int error;
361
362 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
363 buffer, (long)buffer_size);
364
365 error = 0;
366 if (!EXT3_I(inode)->i_file_acl)
367 goto cleanup;
368 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
369 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
370 error = -EIO;
371 if (!bh)
372 goto cleanup;
373 ea_bdebug(bh, "b_count=%d, refcount=%d",
374 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
375 if (ext3_xattr_check_block(bh)) {
376 ext3_error(inode->i_sb, __func__,
377 "inode %lu: bad block "E3FSBLK, inode->i_ino,
378 EXT3_I(inode)->i_file_acl);
379 error = -EIO;
380 goto cleanup;
381 }
382 ext3_xattr_cache_insert(bh);
383 error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
384
385cleanup:
386 brelse(bh);
387
388 return error;
389}
390
391static int
392ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
393{
394 struct inode *inode = d_inode(dentry);
395 struct ext3_xattr_ibody_header *header;
396 struct ext3_inode *raw_inode;
397 struct ext3_iloc iloc;
398 void *end;
399 int error;
400
401 if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR))
402 return 0;
403 error = ext3_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext3_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
409 error = ext3_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext3_xattr_list_entries(dentry, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext3_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer provided, or
424 * compute the required buffer size. A NULL buffer means only the
425 * size is computed and returned.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430static int
431ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
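	/*
	 * List the in-inode attribute names first, then append the names
	 * from the external block; if either part fails, the other
	 * contribution is zeroed so that the error code alone is returned.
	 */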
435 down_read(&EXT3_I(d_inode(dentry))->xattr_sem);
436 i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT3_I(d_inode(dentry))->xattr_sem);
449 return i_error + b_error;
450}
451
452/*
453 * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext3_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
463 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
464 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
465 }
466}
467
468/*
469 * Release the xattr block BH: If the reference count is > 1, decrement
470 * it; otherwise free the block.
471 */
472static void
473ext3_xattr_release_block(handle_t *handle, struct inode *inode,
474 struct buffer_head *bh)
475{
476 struct mb_cache_entry *ce = NULL;
477 int error = 0;
478
479 ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
480 error = ext3_journal_get_write_access(handle, bh);
481 if (error)
482 goto out;
483
484 lock_buffer(bh);
485
486 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
487 ea_bdebug(bh, "refcount now=0; freeing");
488 if (ce)
489 mb_cache_entry_free(ce);
490 ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
491 get_bh(bh);
492 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
493 } else {
494 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
495 error = ext3_journal_dirty_metadata(handle, bh);
496 if (IS_SYNC(inode))
497 handle->h_sync = 1;
498 dquot_free_block(inode, 1);
499 ea_bdebug(bh, "refcount now=%d; releasing",
500 le32_to_cpu(BHDR(bh)->h_refcount));
501 if (ce)
502 mb_cache_entry_release(ce);
503 }
504 unlock_buffer(bh);
505out:
506 ext3_std_error(inode->i_sb, error);
507 return;
508}
509
510struct ext3_xattr_info {
511 int name_index;
512 const char *name;
513 const void *value;
514 size_t value_len;
515};
516
517struct ext3_xattr_search {
518 struct ext3_xattr_entry *first;
519 void *base;
520 void *end;
521 struct ext3_xattr_entry *here;
522 int not_found;
523};
524
525static int
526ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
527{
528 struct ext3_xattr_entry *last;
529 size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
530
531 /* Compute min_offs and last. */
532 last = s->first;
533 for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) {
534 if (!last->e_value_block && last->e_value_size) {
535 size_t offs = le16_to_cpu(last->e_value_offs);
536 if (offs < min_offs)
537 min_offs = offs;
538 }
539 }
540 free = min_offs - ((void *)last - s->base) - sizeof(__u32);
541 if (!s->not_found) {
542 if (!s->here->e_value_block && s->here->e_value_size) {
543 size_t size = le32_to_cpu(s->here->e_value_size);
544 free += EXT3_XATTR_SIZE(size);
545 }
546 free += EXT3_XATTR_LEN(name_len);
547 }
548 if (i->value) {
549 if (free < EXT3_XATTR_LEN(name_len) +
550 EXT3_XATTR_SIZE(i->value_len))
551 return -ENOSPC;
552 }
553
554 if (i->value && s->not_found) {
555 /* Insert the new name. */
556 size_t size = EXT3_XATTR_LEN(name_len);
557 size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
558 memmove((void *)s->here + size, s->here, rest);
559 memset(s->here, 0, size);
560 s->here->e_name_index = i->name_index;
561 s->here->e_name_len = name_len;
562 memcpy(s->here->e_name, i->name, name_len);
563 } else {
564 if (!s->here->e_value_block && s->here->e_value_size) {
565 void *first_val = s->base + min_offs;
566 size_t offs = le16_to_cpu(s->here->e_value_offs);
567 void *val = s->base + offs;
568 size_t size = EXT3_XATTR_SIZE(
569 le32_to_cpu(s->here->e_value_size));
570
571 if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) {
572 /* The old and the new value have the same
573 size. Just replace. */
574 s->here->e_value_size =
575 cpu_to_le32(i->value_len);
576 memset(val + size - EXT3_XATTR_PAD, 0,
577 EXT3_XATTR_PAD); /* Clear pad bytes. */
578 memcpy(val, i->value, i->value_len);
579 return 0;
580 }
581
582 /* Remove the old value. */
583 memmove(first_val + size, first_val, val - first_val);
584 memset(first_val, 0, size);
585 s->here->e_value_size = 0;
586 s->here->e_value_offs = 0;
587 min_offs += size;
588
589 /* Adjust all value offsets. */
590 last = s->first;
591 while (!IS_LAST_ENTRY(last)) {
592 size_t o = le16_to_cpu(last->e_value_offs);
593 if (!last->e_value_block &&
594 last->e_value_size && o < offs)
595 last->e_value_offs =
596 cpu_to_le16(o + size);
597 last = EXT3_XATTR_NEXT(last);
598 }
599 }
600 if (!i->value) {
601 /* Remove the old name. */
602 size_t size = EXT3_XATTR_LEN(name_len);
603 last = ENTRY((void *)last - size);
604 memmove(s->here, (void *)s->here + size,
605 (void *)last - (void *)s->here + sizeof(__u32));
606 memset(last, 0, size);
607 }
608 }
609
610 if (i->value) {
611 /* Insert the new value. */
612 s->here->e_value_size = cpu_to_le32(i->value_len);
613 if (i->value_len) {
614 size_t size = EXT3_XATTR_SIZE(i->value_len);
615 void *val = s->base + min_offs - size;
616 s->here->e_value_offs = cpu_to_le16(min_offs - size);
617 memset(val + size - EXT3_XATTR_PAD, 0,
618 EXT3_XATTR_PAD); /* Clear the pad bytes. */
619 memcpy(val, i->value, i->value_len);
620 }
621 }
622 return 0;
623}
624
625struct ext3_xattr_block_find {
626 struct ext3_xattr_search s;
627 struct buffer_head *bh;
628};
629
630static int
631ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
632 struct ext3_xattr_block_find *bs)
633{
634 struct super_block *sb = inode->i_sb;
635 int error;
636
637 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
638 i->name_index, i->name, i->value, (long)i->value_len);
639
640 if (EXT3_I(inode)->i_file_acl) {
641 /* The inode already has an extended attribute block. */
642 bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
643 error = -EIO;
644 if (!bs->bh)
645 goto cleanup;
646 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
647 atomic_read(&(bs->bh->b_count)),
648 le32_to_cpu(BHDR(bs->bh)->h_refcount));
649 if (ext3_xattr_check_block(bs->bh)) {
650 ext3_error(sb, __func__,
651 "inode %lu: bad block "E3FSBLK, inode->i_ino,
652 EXT3_I(inode)->i_file_acl);
653 error = -EIO;
654 goto cleanup;
655 }
656 /* Find the named attribute. */
657 bs->s.base = BHDR(bs->bh);
658 bs->s.first = BFIRST(bs->bh);
659 bs->s.end = bs->bh->b_data + bs->bh->b_size;
660 bs->s.here = bs->s.first;
661 error = ext3_xattr_find_entry(&bs->s.here, i->name_index,
662 i->name, bs->bh->b_size, 1);
663 if (error && error != -ENODATA)
664 goto cleanup;
665 bs->s.not_found = error;
666 }
667 error = 0;
668
669cleanup:
670 return error;
671}
672
673static int
674ext3_xattr_block_set(handle_t *handle, struct inode *inode,
675 struct ext3_xattr_info *i,
676 struct ext3_xattr_block_find *bs)
677{
678 struct super_block *sb = inode->i_sb;
679 struct buffer_head *new_bh = NULL;
680 struct ext3_xattr_search *s = &bs->s;
681 struct mb_cache_entry *ce = NULL;
682 int error = 0;
683
684#define header(x) ((struct ext3_xattr_header *)(x))
685
686 if (i->value && i->value_len > sb->s_blocksize)
687 return -ENOSPC;
688 if (s->base) {
689 ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
690 bs->bh->b_blocknr);
691 error = ext3_journal_get_write_access(handle, bs->bh);
692 if (error)
693 goto cleanup;
694 lock_buffer(bs->bh);
695
696 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
697 if (ce) {
698 mb_cache_entry_free(ce);
699 ce = NULL;
700 }
701 ea_bdebug(bs->bh, "modifying in-place");
702 error = ext3_xattr_set_entry(i, s);
703 if (!error) {
704 if (!IS_LAST_ENTRY(s->first))
705 ext3_xattr_rehash(header(s->base),
706 s->here);
707 ext3_xattr_cache_insert(bs->bh);
708 }
709 unlock_buffer(bs->bh);
710 if (error == -EIO)
711 goto bad_block;
712 if (!error)
713 error = ext3_journal_dirty_metadata(handle,
714 bs->bh);
715 if (error)
716 goto cleanup;
717 goto inserted;
718 } else {
719 int offset = (char *)s->here - bs->bh->b_data;
720
721 unlock_buffer(bs->bh);
722 journal_release_buffer(handle, bs->bh);
723
724 if (ce) {
725 mb_cache_entry_release(ce);
726 ce = NULL;
727 }
728 ea_bdebug(bs->bh, "cloning");
729 s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
730 error = -ENOMEM;
731 if (s->base == NULL)
732 goto cleanup;
733 memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
734 s->first = ENTRY(header(s->base)+1);
735 header(s->base)->h_refcount = cpu_to_le32(1);
736 s->here = ENTRY(s->base + offset);
737 s->end = s->base + bs->bh->b_size;
738 }
739 } else {
740 /* Allocate a buffer where we construct the new block. */
741 s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
742 /* assert(header == s->base) */
743 error = -ENOMEM;
744 if (s->base == NULL)
745 goto cleanup;
746 header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
747 header(s->base)->h_blocks = cpu_to_le32(1);
748 header(s->base)->h_refcount = cpu_to_le32(1);
749 s->first = ENTRY(header(s->base)+1);
750 s->here = ENTRY(header(s->base)+1);
751 s->end = s->base + sb->s_blocksize;
752 }
753
754 error = ext3_xattr_set_entry(i, s);
755 if (error == -EIO)
756 goto bad_block;
757 if (error)
758 goto cleanup;
759 if (!IS_LAST_ENTRY(s->first))
760 ext3_xattr_rehash(header(s->base), s->here);
761
762inserted:
763 if (!IS_LAST_ENTRY(s->first)) {
764 new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce);
765 if (new_bh) {
766 /* We found an identical block in the cache. */
767 if (new_bh == bs->bh)
768 ea_bdebug(new_bh, "keeping");
769 else {
770 /* The old block is released after updating
771 the inode. */
772 error = dquot_alloc_block(inode, 1);
773 if (error)
774 goto cleanup;
775 error = ext3_journal_get_write_access(handle,
776 new_bh);
777 if (error)
778 goto cleanup_dquot;
779 lock_buffer(new_bh);
780 le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
781 ea_bdebug(new_bh, "reusing; refcount now=%d",
782 le32_to_cpu(BHDR(new_bh)->h_refcount));
783 unlock_buffer(new_bh);
784 error = ext3_journal_dirty_metadata(handle,
785 new_bh);
786 if (error)
787 goto cleanup_dquot;
788 }
789 mb_cache_entry_release(ce);
790 ce = NULL;
791 } else if (bs->bh && s->base == bs->bh->b_data) {
792 /* We were modifying this block in-place. */
793 ea_bdebug(bs->bh, "keeping this block");
794 new_bh = bs->bh;
795 get_bh(new_bh);
796 } else {
797 /* We need to allocate a new block */
798 ext3_fsblk_t goal = ext3_group_first_block_no(sb,
799 EXT3_I(inode)->i_block_group);
800 ext3_fsblk_t block;
801
802 /*
803			 * Protect us against concurrent allocations to the
804 * same inode from ext3_..._writepage(). Reservation
805 * code does not expect racing allocations.
806 */
807 mutex_lock(&EXT3_I(inode)->truncate_mutex);
808 block = ext3_new_block(handle, inode, goal, &error);
809 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
810 if (error)
811 goto cleanup;
812 ea_idebug(inode, "creating block %d", block);
813
814 new_bh = sb_getblk(sb, block);
815 if (unlikely(!new_bh)) {
816getblk_failed:
817 ext3_free_blocks(handle, inode, block, 1);
818 error = -ENOMEM;
819 goto cleanup;
820 }
821 lock_buffer(new_bh);
822 error = ext3_journal_get_create_access(handle, new_bh);
823 if (error) {
824 unlock_buffer(new_bh);
825 goto getblk_failed;
826 }
827 memcpy(new_bh->b_data, s->base, new_bh->b_size);
828 set_buffer_uptodate(new_bh);
829 unlock_buffer(new_bh);
830 ext3_xattr_cache_insert(new_bh);
831 error = ext3_journal_dirty_metadata(handle, new_bh);
832 if (error)
833 goto cleanup;
834 }
835 }
836
837 /* Update the inode. */
838 EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
839
840 /* Drop the previous xattr block. */
841 if (bs->bh && bs->bh != new_bh)
842 ext3_xattr_release_block(handle, inode, bs->bh);
843 error = 0;
844
845cleanup:
846 if (ce)
847 mb_cache_entry_release(ce);
848 brelse(new_bh);
849 if (!(bs->bh && s->base == bs->bh->b_data))
850 kfree(s->base);
851
852 return error;
853
854cleanup_dquot:
855 dquot_free_block(inode, 1);
856 goto cleanup;
857
858bad_block:
859 ext3_error(inode->i_sb, __func__,
860 "inode %lu: bad block "E3FSBLK, inode->i_ino,
861 EXT3_I(inode)->i_file_acl);
862 goto cleanup;
863
864#undef header
865}
866
867struct ext3_xattr_ibody_find {
868 struct ext3_xattr_search s;
869 struct ext3_iloc iloc;
870};
871
872static int
873ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
874 struct ext3_xattr_ibody_find *is)
875{
876 struct ext3_xattr_ibody_header *header;
877 struct ext3_inode *raw_inode;
878 int error;
879
880 if (EXT3_I(inode)->i_extra_isize == 0)
881 return 0;
882 raw_inode = ext3_raw_inode(&is->iloc);
883 header = IHDR(inode, raw_inode);
884 is->s.base = is->s.first = IFIRST(header);
885 is->s.here = is->s.first;
886 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
887 if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) {
888 error = ext3_xattr_check_names(IFIRST(header), is->s.end);
889 if (error)
890 return error;
891 /* Find the named attribute. */
892 error = ext3_xattr_find_entry(&is->s.here, i->name_index,
893 i->name, is->s.end -
894 (void *)is->s.base, 0);
895 if (error && error != -ENODATA)
896 return error;
897 is->s.not_found = error;
898 }
899 return 0;
900}
901
902static int
903ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
904 struct ext3_xattr_info *i,
905 struct ext3_xattr_ibody_find *is)
906{
907 struct ext3_xattr_ibody_header *header;
908 struct ext3_xattr_search *s = &is->s;
909 int error;
910
911 if (EXT3_I(inode)->i_extra_isize == 0)
912 return -ENOSPC;
913 error = ext3_xattr_set_entry(i, s);
914 if (error)
915 return error;
916 header = IHDR(inode, ext3_raw_inode(&is->iloc));
917 if (!IS_LAST_ENTRY(s->first)) {
918 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
919 ext3_set_inode_state(inode, EXT3_STATE_XATTR);
920 } else {
921 header->h_magic = cpu_to_le32(0);
922 ext3_clear_inode_state(inode, EXT3_STATE_XATTR);
923 }
924 return 0;
925}
926
927/*
928 * ext3_xattr_set_handle()
929 *
930 * Create, replace or remove an extended attribute for this inode. Value
931 * is NULL to remove an existing extended attribute, and non-NULL to
932 * either replace an existing extended attribute, or create a new extended
933 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
934 * specify that an extended attribute must exist or must not exist,
935 * respectively, prior to the call.
936 *
937 * Returns 0, or a negative error number on failure.
938 */
939int
940ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
941 const char *name, const void *value, size_t value_len,
942 int flags)
943{
944 struct ext3_xattr_info i = {
945 .name_index = name_index,
946 .name = name,
947 .value = value,
948 .value_len = value_len,
949
950 };
951 struct ext3_xattr_ibody_find is = {
952 .s = { .not_found = -ENODATA, },
953 };
954 struct ext3_xattr_block_find bs = {
955 .s = { .not_found = -ENODATA, },
956 };
957 int error;
958
959 if (!name)
960 return -EINVAL;
961 if (strlen(name) > 255)
962 return -ERANGE;
963 down_write(&EXT3_I(inode)->xattr_sem);
964 error = ext3_get_inode_loc(inode, &is.iloc);
965 if (error)
966 goto cleanup;
967
968 error = ext3_journal_get_write_access(handle, is.iloc.bh);
969 if (error)
970 goto cleanup;
971
972 if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) {
973 struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
974 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
975 ext3_clear_inode_state(inode, EXT3_STATE_NEW);
976 }
977
978 error = ext3_xattr_ibody_find(inode, &i, &is);
979 if (error)
980 goto cleanup;
981 if (is.s.not_found)
982 error = ext3_xattr_block_find(inode, &i, &bs);
983 if (error)
984 goto cleanup;
985 if (is.s.not_found && bs.s.not_found) {
986 error = -ENODATA;
987 if (flags & XATTR_REPLACE)
988 goto cleanup;
989 error = 0;
990 if (!value)
991 goto cleanup;
992 } else {
993 error = -EEXIST;
994 if (flags & XATTR_CREATE)
995 goto cleanup;
996 }
997 if (!value) {
998 if (!is.s.not_found)
999 error = ext3_xattr_ibody_set(handle, inode, &i, &is);
1000 else if (!bs.s.not_found)
1001 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1002 } else {
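		/*
		 * Try to store the attribute in the inode body first; on
		 * -ENOSPC fall back to the external block. On success, any
		 * stale copy in the other location is removed.
		 */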
1003 error = ext3_xattr_ibody_set(handle, inode, &i, &is);
1004 if (!error && !bs.s.not_found) {
1005 i.value = NULL;
1006 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1007 } else if (error == -ENOSPC) {
1008 if (EXT3_I(inode)->i_file_acl && !bs.s.base) {
1009 error = ext3_xattr_block_find(inode, &i, &bs);
1010 if (error)
1011 goto cleanup;
1012 }
1013 error = ext3_xattr_block_set(handle, inode, &i, &bs);
1014 if (error)
1015 goto cleanup;
1016 if (!is.s.not_found) {
1017 i.value = NULL;
1018 error = ext3_xattr_ibody_set(handle, inode, &i,
1019 &is);
1020 }
1021 }
1022 }
1023 if (!error) {
1024 ext3_xattr_update_super_block(handle, inode->i_sb);
1025 inode->i_ctime = CURRENT_TIME_SEC;
1026 error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
1027 /*
1028 * The bh is consumed by ext3_mark_iloc_dirty, even with
1029 * error != 0.
1030 */
1031 is.iloc.bh = NULL;
1032 if (IS_SYNC(inode))
1033 handle->h_sync = 1;
1034 }
1035
1036cleanup:
1037 brelse(is.iloc.bh);
1038 brelse(bs.bh);
1039 up_write(&EXT3_I(inode)->xattr_sem);
1040 return error;
1041}
1042
1043/*
1044 * ext3_xattr_set()
1045 *
1046 * Like ext3_xattr_set_handle, but start from an inode. This extended
1047 * attribute modification is a filesystem transaction by itself.
1048 *
1049 * Returns 0, or a negative error number on failure.
1050 */
1051int
1052ext3_xattr_set(struct inode *inode, int name_index, const char *name,
1053 const void *value, size_t value_len, int flags)
1054{
1055 handle_t *handle;
1056 int error, retries = 0;
1057
1058retry:
1059 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
1060 if (IS_ERR(handle)) {
1061 error = PTR_ERR(handle);
1062 } else {
1063 int error2;
1064
1065 error = ext3_xattr_set_handle(handle, inode, name_index, name,
1066 value, value_len, flags);
1067 error2 = ext3_journal_stop(handle);
1068 if (error == -ENOSPC &&
1069 ext3_should_retry_alloc(inode->i_sb, &retries))
1070 goto retry;
1071 if (error == 0)
1072 error = error2;
1073 }
1074
1075 return error;
1076}
1077
1078/*
1079 * ext3_xattr_delete_inode()
1080 *
1081 * Free extended attribute resources associated with this inode. This
1082 * is called immediately before an inode is freed. We have exclusive
1083 * access to the inode.
1084 */
1085void
1086ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
1087{
1088 struct buffer_head *bh = NULL;
1089
1090 if (!EXT3_I(inode)->i_file_acl)
1091 goto cleanup;
1092 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
1093 if (!bh) {
1094 ext3_error(inode->i_sb, __func__,
1095 "inode %lu: block "E3FSBLK" read error", inode->i_ino,
1096 EXT3_I(inode)->i_file_acl);
1097 goto cleanup;
1098 }
1099 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
1100 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1101 ext3_error(inode->i_sb, __func__,
1102 "inode %lu: bad block "E3FSBLK, inode->i_ino,
1103 EXT3_I(inode)->i_file_acl);
1104 goto cleanup;
1105 }
1106 ext3_xattr_release_block(handle, inode, bh);
1107 EXT3_I(inode)->i_file_acl = 0;
1108
1109cleanup:
1110 brelse(bh);
1111}
1112
1113/*
1114 * ext3_xattr_put_super()
1115 *
1116 * This is called when a file system is unmounted.
1117 */
1118void
1119ext3_xattr_put_super(struct super_block *sb)
1120{
1121 mb_cache_shrink(sb->s_bdev);
1122}
1123
1124/*
1125 * ext3_xattr_cache_insert()
1126 *
1127 * Create a new entry in the extended attribute cache, and insert
1128 * it unless such an entry is already in the cache.
1129 *
1130 * Errors (e.g. allocation failures) are silently ignored.
1131 */
1132static void
1133ext3_xattr_cache_insert(struct buffer_head *bh)
1134{
1135 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1136 struct mb_cache_entry *ce;
1137 int error;
1138
1139 ce = mb_cache_entry_alloc(ext3_xattr_cache, GFP_NOFS);
1140 if (!ce) {
1141 ea_bdebug(bh, "out of memory");
1142 return;
1143 }
1144 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1145 if (error) {
1146 mb_cache_entry_free(ce);
1147 if (error == -EBUSY) {
1148 ea_bdebug(bh, "already in cache");
1149 error = 0;
1150 }
1151 } else {
1152 ea_bdebug(bh, "inserting [%x]", (int)hash);
1153 mb_cache_entry_release(ce);
1154 }
1155}
1156
1157/*
1158 * ext3_xattr_cmp()
1159 *
1160 * Compare two extended attribute blocks for equality.
1161 *
1162 * Returns 0 if the blocks are equal, 1 if they differ, and
1163 * a negative error number on errors.
1164 */
1165static int
1166ext3_xattr_cmp(struct ext3_xattr_header *header1,
1167 struct ext3_xattr_header *header2)
1168{
1169 struct ext3_xattr_entry *entry1, *entry2;
1170
1171 entry1 = ENTRY(header1+1);
1172 entry2 = ENTRY(header2+1);
1173 while (!IS_LAST_ENTRY(entry1)) {
1174 if (IS_LAST_ENTRY(entry2))
1175 return 1;
1176 if (entry1->e_hash != entry2->e_hash ||
1177 entry1->e_name_index != entry2->e_name_index ||
1178 entry1->e_name_len != entry2->e_name_len ||
1179 entry1->e_value_size != entry2->e_value_size ||
1180 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1181 return 1;
1182 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1183 return -EIO;
1184 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1185 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1186 le32_to_cpu(entry1->e_value_size)))
1187 return 1;
1188
1189 entry1 = EXT3_XATTR_NEXT(entry1);
1190 entry2 = EXT3_XATTR_NEXT(entry2);
1191 }
1192 if (!IS_LAST_ENTRY(entry2))
1193 return 1;
1194 return 0;
1195}
1196
1197/*
1198 * ext3_xattr_cache_find()
1199 *
1200 * Find an identical extended attribute block.
1201 *
1202 * Returns a pointer to the block found, or NULL if such a block was
1203 * not found or an error occurred.
1204 */
1205static struct buffer_head *
1206ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
1207 struct mb_cache_entry **pce)
1208{
1209 __u32 hash = le32_to_cpu(header->h_hash);
1210 struct mb_cache_entry *ce;
1211
1212 if (!header->h_hash)
1213 return NULL; /* never share */
1214 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1215again:
1216 ce = mb_cache_entry_find_first(ext3_xattr_cache, inode->i_sb->s_bdev,
1217 hash);
1218 while (ce) {
1219 struct buffer_head *bh;
1220
1221 if (IS_ERR(ce)) {
1222 if (PTR_ERR(ce) == -EAGAIN)
1223 goto again;
1224 break;
1225 }
1226 bh = sb_bread(inode->i_sb, ce->e_block);
1227 if (!bh) {
1228 ext3_error(inode->i_sb, __func__,
1229 "inode %lu: block %lu read error",
1230 inode->i_ino, (unsigned long) ce->e_block);
1231 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1232 EXT3_XATTR_REFCOUNT_MAX) {
1233 ea_idebug(inode, "block %lu refcount %d>=%d",
1234 (unsigned long) ce->e_block,
1235 le32_to_cpu(BHDR(bh)->h_refcount),
1236 EXT3_XATTR_REFCOUNT_MAX);
1237 } else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) {
1238 *pce = ce;
1239 return bh;
1240 }
1241 brelse(bh);
1242 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
1243 }
1244 return NULL;
1245}
1246
1247#define NAME_HASH_SHIFT 5
1248#define VALUE_HASH_SHIFT 16
1249
1250/*
1251 * ext3_xattr_hash_entry()
1252 *
1253 * Compute the hash of an extended attribute.
1254 */
1255static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
1256 struct ext3_xattr_entry *entry)
1257{
1258 __u32 hash = 0;
1259 char *name = entry->e_name;
1260 int n;
1261
1262 for (n=0; n < entry->e_name_len; n++) {
1263 hash = (hash << NAME_HASH_SHIFT) ^
1264 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
1265 *name++;
1266 }
1267
1268 if (entry->e_value_block == 0 && entry->e_value_size != 0) {
1269 __le32 *value = (__le32 *)((char *)header +
1270 le16_to_cpu(entry->e_value_offs));
1271 for (n = (le32_to_cpu(entry->e_value_size) +
1272 EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
1273 hash = (hash << VALUE_HASH_SHIFT) ^
1274 (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
1275 le32_to_cpu(*value++);
1276 }
1277 }
1278 entry->e_hash = cpu_to_le32(hash);
1279}
1280
1281#undef NAME_HASH_SHIFT
1282#undef VALUE_HASH_SHIFT
1283
1284#define BLOCK_HASH_SHIFT 16
1285
1286/*
1287 * ext3_xattr_rehash()
1288 *
1289 * Re-compute the extended attribute hash value after an entry has changed.
1290 */
1291static void ext3_xattr_rehash(struct ext3_xattr_header *header,
1292 struct ext3_xattr_entry *entry)
1293{
1294 struct ext3_xattr_entry *here;
1295 __u32 hash = 0;
1296
1297 ext3_xattr_hash_entry(header, entry);
1298 here = ENTRY(header+1);
1299 while (!IS_LAST_ENTRY(here)) {
1300 if (!here->e_hash) {
1301 /* Block is not shared if an entry's hash value == 0 */
1302 hash = 0;
1303 break;
1304 }
1305 hash = (hash << BLOCK_HASH_SHIFT) ^
1306 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1307 le32_to_cpu(here->e_hash);
1308 here = EXT3_XATTR_NEXT(here);
1309 }
1310 header->h_hash = cpu_to_le32(hash);
1311}
1312
1313#undef BLOCK_HASH_SHIFT
1314
1315int __init
1316init_ext3_xattr(void)
1317{
1318 ext3_xattr_cache = mb_cache_create("ext3_xattr", 6);
1319 if (!ext3_xattr_cache)
1320 return -ENOMEM;
1321 return 0;
1322}
1323
1324void
1325exit_ext3_xattr(void)
1326{
1327 if (ext3_xattr_cache)
1328 mb_cache_destroy(ext3_xattr_cache);
1329 ext3_xattr_cache = NULL;
1330}
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
deleted file mode 100644
index 32e93ebf8031..000000000000
--- a/fs/ext3/xattr.h
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 File: fs/ext3/xattr.h
3
4 On-disk format of extended attributes for the ext3 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/xattr.h>
10
11/* Magic value in attribute blocks */
12#define EXT3_XATTR_MAGIC 0xEA020000
13
14/* Maximum number of references to one attribute block */
15#define EXT3_XATTR_REFCOUNT_MAX 1024
16
17/* Name indexes */
18#define EXT3_XATTR_INDEX_USER 1
19#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2
20#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3
21#define EXT3_XATTR_INDEX_TRUSTED 4
22#define EXT3_XATTR_INDEX_LUSTRE 5
23#define EXT3_XATTR_INDEX_SECURITY 6
24
25struct ext3_xattr_header {
26 __le32 h_magic; /* magic number for identification */
27 __le32 h_refcount; /* reference count */
28 __le32 h_blocks; /* number of disk blocks used */
29 __le32 h_hash; /* hash value of all attributes */
30 __u32 h_reserved[4]; /* zero right now */
31};
32
33struct ext3_xattr_ibody_header {
34 __le32 h_magic; /* magic number for identification */
35};
36
37struct ext3_xattr_entry {
38 __u8 e_name_len; /* length of name */
39 __u8 e_name_index; /* attribute name index */
40 __le16 e_value_offs; /* offset in disk block of value */
41	__le32	e_value_block;	/* disk block attribute is stored on (unused; must be 0) */
42 __le32 e_value_size; /* size of attribute value */
43 __le32 e_hash; /* hash value of name and value */
44 char e_name[0]; /* attribute name */
45};
46
47#define EXT3_XATTR_PAD_BITS 2
48#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS)
49#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1)
50#define EXT3_XATTR_LEN(name_len) \
51 (((name_len) + EXT3_XATTR_ROUND + \
52 sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
53#define EXT3_XATTR_NEXT(entry) \
54 ( (struct ext3_xattr_entry *)( \
55 (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
56#define EXT3_XATTR_SIZE(size) \
57 (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
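/*
 * Worked example (illustrative, not in the original header): with the
 * 16-byte struct ext3_xattr_entry above and EXT3_XATTR_PAD == 4, a
 * 3-byte name occupies EXT3_XATTR_LEN(3) == 20 bytes (the descriptor
 * plus the name padded to a multiple of 4), and a 5-byte value occupies
 * EXT3_XATTR_SIZE(5) == 8 bytes.
 */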
58
59# ifdef CONFIG_EXT3_FS_XATTR
60
61extern const struct xattr_handler ext3_xattr_user_handler;
62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern const struct xattr_handler ext3_xattr_security_handler;
64
65extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
66
67extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
68extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
69extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
70
71extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
72extern void ext3_xattr_put_super(struct super_block *);
73
74extern int init_ext3_xattr(void);
75extern void exit_ext3_xattr(void);
76
77extern const struct xattr_handler *ext3_xattr_handlers[];
78
79# else /* CONFIG_EXT3_FS_XATTR */
80
81static inline int
82ext3_xattr_get(struct inode *inode, int name_index, const char *name,
83 void *buffer, size_t size, int flags)
84{
85 return -EOPNOTSUPP;
86}
87
88static inline int
89ext3_xattr_set(struct inode *inode, int name_index, const char *name,
90 const void *value, size_t size, int flags)
91{
92 return -EOPNOTSUPP;
93}
94
95static inline int
96ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
97 const char *name, const void *value, size_t size, int flags)
98{
99 return -EOPNOTSUPP;
100}
101
102static inline void
103ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
104{
105}
106
107static inline void
108ext3_xattr_put_super(struct super_block *sb)
109{
110}
111
112static inline int
113init_ext3_xattr(void)
114{
115 return 0;
116}
117
118static inline void
119exit_ext3_xattr(void)
120{
121}
122
123#define ext3_xattr_handlers NULL
124
125# endif /* CONFIG_EXT3_FS_XATTR */
126
127#ifdef CONFIG_EXT3_FS_SECURITY
128extern int ext3_init_security(handle_t *handle, struct inode *inode,
129 struct inode *dir, const struct qstr *qstr);
130#else
131static inline int ext3_init_security(handle_t *handle, struct inode *inode,
132 struct inode *dir, const struct qstr *qstr)
133{
134 return 0;
135}
136#endif
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
deleted file mode 100644
index c9506d5e3b13..000000000000
--- a/fs/ext3/xattr_security.c
+++ /dev/null
@@ -1,78 +0,0 @@
1/*
2 * linux/fs/ext3/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/security.h>
7#include "ext3.h"
8#include "xattr.h"
9
10static size_t
11ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
12 const char *name, size_t name_len, int type)
13{
14 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
15 const size_t total_len = prefix_len + name_len + 1;
16
17
18 if (list && total_len <= list_size) {
19 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
20 memcpy(list+prefix_len, name, name_len);
21 list[prefix_len + name_len] = '\0';
22 }
23 return total_len;
24}
25
26static int
27ext3_xattr_security_get(struct dentry *dentry, const char *name,
28 void *buffer, size_t size, int type)
29{
30 if (strcmp(name, "") == 0)
31 return -EINVAL;
32 return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
33 name, buffer, size);
34}
35
36static int
37ext3_xattr_security_set(struct dentry *dentry, const char *name,
38 const void *value, size_t size, int flags, int type)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
43 name, value, size, flags);
44}
45
46static int ext3_initxattrs(struct inode *inode,
47 const struct xattr *xattr_array,
48 void *fs_info)
49{
50 const struct xattr *xattr;
51 handle_t *handle = fs_info;
52 int err = 0;
53
54 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
55 err = ext3_xattr_set_handle(handle, inode,
56 EXT3_XATTR_INDEX_SECURITY,
57 xattr->name, xattr->value,
58 xattr->value_len, 0);
59 if (err < 0)
60 break;
61 }
62 return err;
63}
64
65int
66ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
67 const struct qstr *qstr)
68{
69 return security_inode_init_security(inode, dir, qstr,
70 &ext3_initxattrs, handle);
71}
72
73const struct xattr_handler ext3_xattr_security_handler = {
74 .prefix = XATTR_SECURITY_PREFIX,
75 .list = ext3_xattr_security_list,
76 .get = ext3_xattr_security_get,
77 .set = ext3_xattr_security_set,
78};
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
deleted file mode 100644
index 206cc66dc285..000000000000
--- a/fs/ext3/xattr_trusted.c
+++ /dev/null
@@ -1,54 +0,0 @@
1/*
2 * linux/fs/ext3/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include "ext3.h"
9#include "xattr.h"
10
11static size_t
12ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
13 const char *name, size_t name_len, int type)
14{
15 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
16 const size_t total_len = prefix_len + name_len + 1;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return 0;
20
21 if (list && total_len <= list_size) {
22 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
23 memcpy(list+prefix_len, name, name_len);
24 list[prefix_len + name_len] = '\0';
25 }
26 return total_len;
27}
28
29static int
30ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
31 void *buffer, size_t size, int type)
32{
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35 return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED,
36 name, buffer, size);
37}
38
39static int
40ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
41 const void *value, size_t size, int flags, int type)
42{
43 if (strcmp(name, "") == 0)
44 return -EINVAL;
45 return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name,
46 value, size, flags);
47}
48
49const struct xattr_handler ext3_xattr_trusted_handler = {
50 .prefix = XATTR_TRUSTED_PREFIX,
51 .list = ext3_xattr_trusted_list,
52 .get = ext3_xattr_trusted_get,
53 .set = ext3_xattr_trusted_set,
54};
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
deleted file mode 100644
index 021508ad1616..000000000000
--- a/fs/ext3/xattr_user.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/fs/ext3/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include "ext3.h"
9#include "xattr.h"
10
11static size_t
12ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
13 const char *name, size_t name_len, int type)
14{
15 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
16 const size_t total_len = prefix_len + name_len + 1;
17
18 if (!test_opt(dentry->d_sb, XATTR_USER))
19 return 0;
20
21 if (list && total_len <= list_size) {
22 memcpy(list, XATTR_USER_PREFIX, prefix_len);
23 memcpy(list+prefix_len, name, name_len);
24 list[prefix_len + name_len] = '\0';
25 }
26 return total_len;
27}
28
29static int
30ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
31 size_t size, int type)
32{
33 if (strcmp(name, "") == 0)
34 return -EINVAL;
35 if (!test_opt(dentry->d_sb, XATTR_USER))
36 return -EOPNOTSUPP;
37 return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER,
38 name, buffer, size);
39}
40
41static int
42ext3_xattr_user_set(struct dentry *dentry, const char *name,
43 const void *value, size_t size, int flags, int type)
44{
45 if (strcmp(name, "") == 0)
46 return -EINVAL;
47 if (!test_opt(dentry->d_sb, XATTR_USER))
48 return -EOPNOTSUPP;
49 return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER,
50 name, value, size, flags);
51}
52
53const struct xattr_handler ext3_xattr_user_handler = {
54 .prefix = XATTR_USER_PREFIX,
55 .list = ext3_xattr_user_list,
56 .get = ext3_xattr_user_get,
57 .set = ext3_xattr_user_set,
58};