Diffstat (limited to 'fs/ext4')
-rw-r--r--   fs/ext4/Makefile              12
-rw-r--r--   fs/ext4/acl.c                551
-rw-r--r--   fs/ext4/acl.h                 81
-rw-r--r--   fs/ext4/balloc.c            1833
-rw-r--r--   fs/ext4/bitmap.c              32
-rw-r--r--   fs/ext4/dir.c                518
-rw-r--r--   fs/ext4/extents.c           2152
-rw-r--r--   fs/ext4/file.c               139
-rw-r--r--   fs/ext4/fsync.c               88
-rw-r--r--   fs/ext4/hash.c               152
-rw-r--r--   fs/ext4/ialloc.c             772
-rw-r--r--   fs/ext4/inode.c             3233
-rw-r--r--   fs/ext4/ioctl.c              306
-rw-r--r--   fs/ext4/namei.c             2395
-rw-r--r--   fs/ext4/namei.h                8
-rw-r--r--   fs/ext4/resize.c            1050
-rw-r--r--   fs/ext4/super.c             2829
-rw-r--r--   fs/ext4/symlink.c             54
-rw-r--r--   fs/ext4/xattr.c             1317
-rw-r--r--   fs/ext4/xattr.h              145
-rw-r--r--   fs/ext4/xattr_security.c      77
-rw-r--r--   fs/ext4/xattr_trusted.c       62
-rw-r--r--   fs/ext4/xattr_user.c          64
23 files changed, 17870 insertions, 0 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 000000000000..a6acb96ebeb9
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for the linux ext4-filesystem routines.
3#
4
5obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
9
10ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
12ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
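
As an illustration of how this kbuild fragment composes the module (the .config excerpt below is an assumed configuration, not part of the commit): with

    CONFIG_EXT4DEV_FS=m
    CONFIG_EXT4DEV_FS_XATTR=y
    CONFIG_EXT4DEV_FS_POSIX_ACL=y
    CONFIG_EXT4DEV_FS_SECURITY=y

kbuild links ext4dev.ko from the base objects listed in ext4dev-y plus xattr.o, xattr_user.o, xattr_trusted.o, acl.o and xattr_security.o; if CONFIG_EXT4DEV_FS is not set, nothing in this directory is built.
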
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 000000000000..9e882546d91a
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,551 @@
1/*
2 * linux/fs/ext4/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15#include "acl.h"
16
17/*
18 * Convert from filesystem to in-memory representation.
19 */
20static struct posix_acl *
21ext4_acl_from_disk(const void *value, size_t size)
22{
23 const char *end = (char *)value + size;
24 int n, count;
25 struct posix_acl *acl;
26
27 if (!value)
28 return NULL;
29 if (size < sizeof(ext4_acl_header))
30 return ERR_PTR(-EINVAL);
31 if (((ext4_acl_header *)value)->a_version !=
32 cpu_to_le32(EXT4_ACL_VERSION))
33 return ERR_PTR(-EINVAL);
34 value = (char *)value + sizeof(ext4_acl_header);
35 count = ext4_acl_count(size);
36 if (count < 0)
37 return ERR_PTR(-EINVAL);
38 if (count == 0)
39 return NULL;
40 acl = posix_acl_alloc(count, GFP_KERNEL);
41 if (!acl)
42 return ERR_PTR(-ENOMEM);
43 for (n=0; n < count; n++) {
44 ext4_acl_entry *entry =
45 (ext4_acl_entry *)value;
46 if ((char *)value + sizeof(ext4_acl_entry_short) > end)
47 goto fail;
48 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
49 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
50 switch(acl->a_entries[n].e_tag) {
51 case ACL_USER_OBJ:
52 case ACL_GROUP_OBJ:
53 case ACL_MASK:
54 case ACL_OTHER:
55 value = (char *)value +
56 sizeof(ext4_acl_entry_short);
57 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
58 break;
59
60 case ACL_USER:
61 case ACL_GROUP:
62 value = (char *)value + sizeof(ext4_acl_entry);
63 if ((char *)value > end)
64 goto fail;
65 acl->a_entries[n].e_id =
66 le32_to_cpu(entry->e_id);
67 break;
68
69 default:
70 goto fail;
71 }
72 }
73 if (value != end)
74 goto fail;
75 return acl;
76
77fail:
78 posix_acl_release(acl);
79 return ERR_PTR(-EINVAL);
80}
81
82/*
83 * Convert from in-memory to filesystem representation.
84 */
85static void *
86ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
87{
88 ext4_acl_header *ext_acl;
89 char *e;
90 size_t n;
91
92 *size = ext4_acl_size(acl->a_count);
93 ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
94 sizeof(ext4_acl_entry), GFP_KERNEL);
95 if (!ext_acl)
96 return ERR_PTR(-ENOMEM);
97 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
98 e = (char *)ext_acl + sizeof(ext4_acl_header);
99 for (n=0; n < acl->a_count; n++) {
100 ext4_acl_entry *entry = (ext4_acl_entry *)e;
101 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
102 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
103 switch(acl->a_entries[n].e_tag) {
104 case ACL_USER:
105 case ACL_GROUP:
106 entry->e_id =
107 cpu_to_le32(acl->a_entries[n].e_id);
108 e += sizeof(ext4_acl_entry);
109 break;
110
111 case ACL_USER_OBJ:
112 case ACL_GROUP_OBJ:
113 case ACL_MASK:
114 case ACL_OTHER:
115 e += sizeof(ext4_acl_entry_short);
116 break;
117
118 default:
119 goto fail;
120 }
121 }
122 return (char *)ext_acl;
123
124fail:
125 kfree(ext_acl);
126 return ERR_PTR(-EINVAL);
127}
128
129static inline struct posix_acl *
130ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
131{
132 struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
133
134 spin_lock(&inode->i_lock);
135 if (*i_acl != EXT4_ACL_NOT_CACHED)
136 acl = posix_acl_dup(*i_acl);
137 spin_unlock(&inode->i_lock);
138
139 return acl;
140}
141
142static inline void
143ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
144 struct posix_acl *acl)
145{
146 spin_lock(&inode->i_lock);
147 if (*i_acl != EXT4_ACL_NOT_CACHED)
148 posix_acl_release(*i_acl);
149 *i_acl = posix_acl_dup(acl);
150 spin_unlock(&inode->i_lock);
151}
152
153/*
154 * Inode operation get_posix_acl().
155 *
156 * inode->i_mutex: don't care
157 */
158static struct posix_acl *
159ext4_get_acl(struct inode *inode, int type)
160{
161 struct ext4_inode_info *ei = EXT4_I(inode);
162 int name_index;
163 char *value = NULL;
164 struct posix_acl *acl;
165 int retval;
166
167 if (!test_opt(inode->i_sb, POSIX_ACL))
168 return NULL;
169
170 switch(type) {
171 case ACL_TYPE_ACCESS:
172 acl = ext4_iget_acl(inode, &ei->i_acl);
173 if (acl != EXT4_ACL_NOT_CACHED)
174 return acl;
175 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
176 break;
177
178 case ACL_TYPE_DEFAULT:
179 acl = ext4_iget_acl(inode, &ei->i_default_acl);
180 if (acl != EXT4_ACL_NOT_CACHED)
181 return acl;
182 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
183 break;
184
185 default:
186 return ERR_PTR(-EINVAL);
187 }
188 retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
189 if (retval > 0) {
190 value = kmalloc(retval, GFP_KERNEL);
191 if (!value)
192 return ERR_PTR(-ENOMEM);
193 retval = ext4_xattr_get(inode, name_index, "", value, retval);
194 }
195 if (retval > 0)
196 acl = ext4_acl_from_disk(value, retval);
197 else if (retval == -ENODATA || retval == -ENOSYS)
198 acl = NULL;
199 else
200 acl = ERR_PTR(retval);
201 kfree(value);
202
203 if (!IS_ERR(acl)) {
204 switch(type) {
205 case ACL_TYPE_ACCESS:
206 ext4_iset_acl(inode, &ei->i_acl, acl);
207 break;
208
209 case ACL_TYPE_DEFAULT:
210 ext4_iset_acl(inode, &ei->i_default_acl, acl);
211 break;
212 }
213 }
214 return acl;
215}
216
217/*
218 * Set the access or default ACL of an inode.
219 *
220 * inode->i_mutex: down unless called from ext4_new_inode
221 */
222static int
223ext4_set_acl(handle_t *handle, struct inode *inode, int type,
224 struct posix_acl *acl)
225{
226 struct ext4_inode_info *ei = EXT4_I(inode);
227 int name_index;
228 void *value = NULL;
229 size_t size = 0;
230 int error;
231
232 if (S_ISLNK(inode->i_mode))
233 return -EOPNOTSUPP;
234
235 switch(type) {
236 case ACL_TYPE_ACCESS:
237 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
238 if (acl) {
239 mode_t mode = inode->i_mode;
240 error = posix_acl_equiv_mode(acl, &mode);
241 if (error < 0)
242 return error;
243 else {
244 inode->i_mode = mode;
245 ext4_mark_inode_dirty(handle, inode);
246 if (error == 0)
247 acl = NULL;
248 }
249 }
250 break;
251
252 case ACL_TYPE_DEFAULT:
253 name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
254 if (!S_ISDIR(inode->i_mode))
255 return acl ? -EACCES : 0;
256 break;
257
258 default:
259 return -EINVAL;
260 }
261 if (acl) {
262 value = ext4_acl_to_disk(acl, &size);
263 if (IS_ERR(value))
264 return (int)PTR_ERR(value);
265 }
266
267 error = ext4_xattr_set_handle(handle, inode, name_index, "",
268 value, size, 0);
269
270 kfree(value);
271 if (!error) {
272 switch(type) {
273 case ACL_TYPE_ACCESS:
274 ext4_iset_acl(inode, &ei->i_acl, acl);
275 break;
276
277 case ACL_TYPE_DEFAULT:
278 ext4_iset_acl(inode, &ei->i_default_acl, acl);
279 break;
280 }
281 }
282 return error;
283}
284
285static int
286ext4_check_acl(struct inode *inode, int mask)
287{
288 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
289
290 if (IS_ERR(acl))
291 return PTR_ERR(acl);
292 if (acl) {
293 int error = posix_acl_permission(inode, acl, mask);
294 posix_acl_release(acl);
295 return error;
296 }
297
298 return -EAGAIN;
299}
300
301int
302ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
303{
304 return generic_permission(inode, mask, ext4_check_acl);
305}
306
307/*
308 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
309 *
310 * dir->i_mutex: down
311 * inode->i_mutex: up (access to inode is still exclusive)
312 */
313int
314ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
315{
316 struct posix_acl *acl = NULL;
317 int error = 0;
318
319 if (!S_ISLNK(inode->i_mode)) {
320 if (test_opt(dir->i_sb, POSIX_ACL)) {
321 acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
322 if (IS_ERR(acl))
323 return PTR_ERR(acl);
324 }
325 if (!acl)
326 inode->i_mode &= ~current->fs->umask;
327 }
328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
329 struct posix_acl *clone;
330 mode_t mode;
331
332 if (S_ISDIR(inode->i_mode)) {
333 error = ext4_set_acl(handle, inode,
334 ACL_TYPE_DEFAULT, acl);
335 if (error)
336 goto cleanup;
337 }
338 clone = posix_acl_clone(acl, GFP_KERNEL);
339 error = -ENOMEM;
340 if (!clone)
341 goto cleanup;
342
343 mode = inode->i_mode;
344 error = posix_acl_create_masq(clone, &mode);
345 if (error >= 0) {
346 inode->i_mode = mode;
347 if (error > 0) {
348 /* This is an extended ACL */
349 error = ext4_set_acl(handle, inode,
350 ACL_TYPE_ACCESS, clone);
351 }
352 }
353 posix_acl_release(clone);
354 }
355cleanup:
356 posix_acl_release(acl);
357 return error;
358}
359
360/*
361 * Does chmod for an inode that may have an Access Control List. The
362 * inode->i_mode field must be updated to the desired value by the caller
363 * before calling this function.
364 * Returns 0 on success, or a negative error number.
365 *
366 * We change the ACL rather than storing some ACL entries in the file
367 * mode permission bits (which would be more efficient), because that
368 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
369 * for directories) are added. There are no more bits available in the
370 * file mode.
371 *
372 * inode->i_mutex: down
373 */
374int
375ext4_acl_chmod(struct inode *inode)
376{
377 struct posix_acl *acl, *clone;
378 int error;
379
380 if (S_ISLNK(inode->i_mode))
381 return -EOPNOTSUPP;
382 if (!test_opt(inode->i_sb, POSIX_ACL))
383 return 0;
384 acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
385 if (IS_ERR(acl) || !acl)
386 return PTR_ERR(acl);
387 clone = posix_acl_clone(acl, GFP_KERNEL);
388 posix_acl_release(acl);
389 if (!clone)
390 return -ENOMEM;
391 error = posix_acl_chmod_masq(clone, inode->i_mode);
392 if (!error) {
393 handle_t *handle;
394 int retries = 0;
395
396 retry:
397 handle = ext4_journal_start(inode,
398 EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
399 if (IS_ERR(handle)) {
400 error = PTR_ERR(handle);
401 ext4_std_error(inode->i_sb, error);
402 goto out;
403 }
404 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
405 ext4_journal_stop(handle);
406 if (error == -ENOSPC &&
407 ext4_should_retry_alloc(inode->i_sb, &retries))
408 goto retry;
409 }
410out:
411 posix_acl_release(clone);
412 return error;
413}
414
415/*
416 * Extended attribute handlers
417 */
418static size_t
419ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
420 const char *name, size_t name_len)
421{
422 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
423
424 if (!test_opt(inode->i_sb, POSIX_ACL))
425 return 0;
426 if (list && size <= list_len)
427 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
428 return size;
429}
430
431static size_t
432ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
433 const char *name, size_t name_len)
434{
435 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
436
437 if (!test_opt(inode->i_sb, POSIX_ACL))
438 return 0;
439 if (list && size <= list_len)
440 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
441 return size;
442}
443
444static int
445ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
446{
447 struct posix_acl *acl;
448 int error;
449
450 if (!test_opt(inode->i_sb, POSIX_ACL))
451 return -EOPNOTSUPP;
452
453 acl = ext4_get_acl(inode, type);
454 if (IS_ERR(acl))
455 return PTR_ERR(acl);
456 if (acl == NULL)
457 return -ENODATA;
458 error = posix_acl_to_xattr(acl, buffer, size);
459 posix_acl_release(acl);
460
461 return error;
462}
463
464static int
465ext4_xattr_get_acl_access(struct inode *inode, const char *name,
466 void *buffer, size_t size)
467{
468 if (strcmp(name, "") != 0)
469 return -EINVAL;
470 return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
471}
472
473static int
474ext4_xattr_get_acl_default(struct inode *inode, const char *name,
475 void *buffer, size_t size)
476{
477 if (strcmp(name, "") != 0)
478 return -EINVAL;
479 return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
480}
481
482static int
483ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
484 size_t size)
485{
486 handle_t *handle;
487 struct posix_acl *acl;
488 int error, retries = 0;
489
490 if (!test_opt(inode->i_sb, POSIX_ACL))
491 return -EOPNOTSUPP;
492 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
493 return -EPERM;
494
495 if (value) {
496 acl = posix_acl_from_xattr(value, size);
497 if (IS_ERR(acl))
498 return PTR_ERR(acl);
499 else if (acl) {
500 error = posix_acl_valid(acl);
501 if (error)
502 goto release_and_out;
503 }
504 } else
505 acl = NULL;
506
507retry:
508 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
509 if (IS_ERR(handle))
510 return PTR_ERR(handle);
511 error = ext4_set_acl(handle, inode, type, acl);
512 ext4_journal_stop(handle);
513 if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
514 goto retry;
515
516release_and_out:
517 posix_acl_release(acl);
518 return error;
519}
520
521static int
522ext4_xattr_set_acl_access(struct inode *inode, const char *name,
523 const void *value, size_t size, int flags)
524{
525 if (strcmp(name, "") != 0)
526 return -EINVAL;
527 return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
528}
529
530static int
531ext4_xattr_set_acl_default(struct inode *inode, const char *name,
532 const void *value, size_t size, int flags)
533{
534 if (strcmp(name, "") != 0)
535 return -EINVAL;
536 return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
537}
538
539struct xattr_handler ext4_xattr_acl_access_handler = {
540 .prefix = POSIX_ACL_XATTR_ACCESS,
541 .list = ext4_xattr_list_acl_access,
542 .get = ext4_xattr_get_acl_access,
543 .set = ext4_xattr_set_acl_access,
544};
545
546struct xattr_handler ext4_xattr_acl_default_handler = {
547 .prefix = POSIX_ACL_XATTR_DEFAULT,
548 .list = ext4_xattr_list_acl_default,
549 .get = ext4_xattr_get_acl_default,
550 .set = ext4_xattr_set_acl_default,
551};
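
These two handlers are what userspace ACL tools ultimately reach: on Linux, libacl encodes an ACL into the system.posix_acl_access or system.posix_acl_default extended attribute and issues setxattr(2), which the xattr layer dispatches by prefix to ext4_xattr_set_acl_access() or ext4_xattr_set_acl_default() above. A minimal userspace sketch of that path (this assumes libacl is available; the target file and the uid in the text form are placeholders, not anything defined by this commit):

    /* build with: cc -o setacl setacl.c -lacl */
    #include <sys/acl.h>    /* POSIX.1e draft ACL interface from libacl */
    #include <stdio.h>

    int main(int argc, char **argv)
    {
            acl_t acl;

            if (argc < 2) {
                    fprintf(stderr, "usage: %s <file>\n", argv[0]);
                    return 1;
            }
            /* owner rw-, group r--, other ---, plus uid 1000 r-- and a mask */
            acl = acl_from_text("u::rw-,g::r--,o::---,u:1000:r--,m::r--");
            if (acl == NULL) {
                    perror("acl_from_text");
                    return 1;
            }
            /* libacl packs this into the system.posix_acl_access xattr and calls
             * setxattr(2); on this filesystem that ends up in
             * ext4_xattr_set_acl_access(). */
            if (acl_set_file(argv[1], ACL_TYPE_ACCESS, acl) != 0)
                    perror("acl_set_file");
            acl_free(acl);
            return 0;
    }
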
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 000000000000..26a5c1abf147
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,81 @@
1/*
2 File: fs/ext4/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/posix_acl_xattr.h>
8
9#define EXT4_ACL_VERSION 0x0001
10
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext4_acl_entry;
16
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext4_acl_entry_short;
21
22typedef struct {
23 __le32 a_version;
24} ext4_acl_header;
25
26static inline size_t ext4_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext4_acl_header) +
30 count * sizeof(ext4_acl_entry_short);
31 } else {
32 return sizeof(ext4_acl_header) +
33 4 * sizeof(ext4_acl_entry_short) +
34 (count - 4) * sizeof(ext4_acl_entry);
35 }
36}
37
38static inline int ext4_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext4_acl_header);
42 s = size - 4 * sizeof(ext4_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext4_acl_entry_short))
45 return -1;
46 return size / sizeof(ext4_acl_entry_short);
47 } else {
48 if (s % sizeof(ext4_acl_entry))
49 return -1;
50 return s / sizeof(ext4_acl_entry) + 4;
51 }
52}
53
54#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
55
56/* Value for the i_acl and i_default_acl fields of struct ext4_inode_info
57 if the ACL has not been cached */
58#define EXT4_ACL_NOT_CACHED ((void *)-1)
59
60/* acl.c */
61extern int ext4_permission (struct inode *, int, struct nameidata *);
62extern int ext4_acl_chmod (struct inode *);
63extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
64
65#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */
66#include <linux/sched.h>
67#define ext4_permission NULL
68
69static inline int
70ext4_acl_chmod(struct inode *inode)
71{
72 return 0;
73}
74
75static inline int
76ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{
78 return 0;
79}
80#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */
81
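
The size arithmetic in ext4_acl_size() and ext4_acl_count() above follows directly from the structure definitions: the header is 4 bytes, a short entry (no e_id) is 4 bytes, and a full entry is 8 bytes. The functions assume that only the four required entries (ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK, ACL_OTHER) use the short form, while any named ACL_USER or ACL_GROUP entries use the full form. As a worked example with illustrative numbers, an ACL of six entries (the four required ones plus one ACL_USER and one ACL_GROUP) occupies

    ext4_acl_size(6) = 4 + 4 * 4 + (6 - 4) * 8 = 36 bytes

on disk, and ext4_acl_count(36) recovers the entry count: after subtracting the 4-byte header, s = 32 - 4 * 4 = 16, which divides evenly by 8, giving 16 / 8 + 4 = 6 entries.
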
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 000000000000..5d45582f9517
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,1833 @@
1/*
2 * linux/fs/ext4/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/time.h>
15#include <linux/capability.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/quotaops.h>
21#include <linux/buffer_head.h>
22
23/*
24 * balloc.c contains the blocks allocation and deallocation routines
25 */
26
27/*
28 * Calculate the block group number and offset, given a block number
29 */
30void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
31 unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
32{
33 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
34 ext4_grpblk_t offset;
35
36 blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
37 offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
38 if (offsetp)
39 *offsetp = offset;
40 if (blockgrpp)
41 *blockgrpp = blocknr;
42
43}
44
45/*
46 * The free blocks are managed by bitmaps. A file system contains several
47 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
48 * block for inodes, N blocks for the inode table and data blocks.
49 *
50 * The file system contains group descriptors which are located after the
51 * super block. Each descriptor contains the number of the bitmap block and
52 * the free blocks count in the block. The descriptors are loaded in memory
53 * when a file system is mounted (see ext4_read_super).
54 */
55
56
57#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
58
59/**
60 * ext4_get_group_desc() -- load group descriptor from disk
61 * @sb: super block
62 * @block_group: given block group
63 * @bh: pointer to the buffer head to store the block
64 * group descriptor
65 */
66struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
67 unsigned int block_group,
68 struct buffer_head ** bh)
69{
70 unsigned long group_desc;
71 unsigned long offset;
72 struct ext4_group_desc * desc;
73 struct ext4_sb_info *sbi = EXT4_SB(sb);
74
75 if (block_group >= sbi->s_groups_count) {
76 ext4_error (sb, "ext4_get_group_desc",
77 "block_group >= groups_count - "
78 "block_group = %d, groups_count = %lu",
79 block_group, sbi->s_groups_count);
80
81 return NULL;
82 }
83 smp_rmb();
84
85 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
86 offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
87 if (!sbi->s_group_desc[group_desc]) {
88 ext4_error (sb, "ext4_get_group_desc",
89 "Group descriptor not loaded - "
90 "block_group = %d, group_desc = %lu, desc = %lu",
91 block_group, group_desc, offset);
92 return NULL;
93 }
94
95 desc = (struct ext4_group_desc *)(
96 (__u8 *)sbi->s_group_desc[group_desc]->b_data +
97 offset * EXT4_DESC_SIZE(sb));
98 if (bh)
99 *bh = sbi->s_group_desc[group_desc];
100 return desc;
101}
102
103/**
104 * read_block_bitmap()
105 * @sb: super block
106 * @block_group: given block group
107 *
108 * Read the bitmap for a given block_group, reading into the specified
109 * slot in the superblock's bitmap cache.
110 *
111 * Return buffer_head on success or NULL in case of failure.
112 */
113static struct buffer_head *
114read_block_bitmap(struct super_block *sb, unsigned int block_group)
115{
116 struct ext4_group_desc * desc;
117 struct buffer_head * bh = NULL;
118
119 desc = ext4_get_group_desc (sb, block_group, NULL);
120 if (!desc)
121 goto error_out;
122 bh = sb_bread(sb, ext4_block_bitmap(sb, desc));
123 if (!bh)
124 ext4_error (sb, "read_block_bitmap",
125 "Cannot read block bitmap - "
126 "block_group = %d, block_bitmap = %llu",
127 block_group,
128 ext4_block_bitmap(sb, desc));
129error_out:
130 return bh;
131}
132/*
133 * The reservation window structure operations
134 * --------------------------------------------
135 * Operations include:
136 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
137 *
138 * We use a red-black tree to represent per-filesystem reservation
139 * windows.
140 *
141 */
142
143/**
144 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
145 * @rb_root: root of per-filesystem reservation rb tree
146 * @verbose: verbose mode
147 * @fn: function which wishes to dump the reservation map
148 *
149 * If verbose is turned on, it will print the whole block reservation
150 * windows(start, end). Otherwise, it will only print out the "bad" windows,
151 * those windows that overlap with their immediate neighbors.
152 */
153#if 1
154static void __rsv_window_dump(struct rb_root *root, int verbose,
155 const char *fn)
156{
157 struct rb_node *n;
158 struct ext4_reserve_window_node *rsv, *prev;
159 int bad;
160
161restart:
162 n = rb_first(root);
163 bad = 0;
164 prev = NULL;
165
166 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
167 while (n) {
168 rsv = list_entry(n, struct ext4_reserve_window_node, rsv_node);
169 if (verbose)
170 printk("reservation window 0x%p "
171 "start: %llu, end: %llu\n",
172 rsv, rsv->rsv_start, rsv->rsv_end);
173 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
174 printk("Bad reservation %p (start >= end)\n",
175 rsv);
176 bad = 1;
177 }
178 if (prev && prev->rsv_end >= rsv->rsv_start) {
179 printk("Bad reservation %p (prev->end >= start)\n",
180 rsv);
181 bad = 1;
182 }
183 if (bad) {
184 if (!verbose) {
185 printk("Restarting reservation walk in verbose mode\n");
186 verbose = 1;
187 goto restart;
188 }
189 }
190 n = rb_next(n);
191 prev = rsv;
192 }
193 printk("Window map complete.\n");
194 if (bad)
195 BUG();
196}
197#define rsv_window_dump(root, verbose) \
198 __rsv_window_dump((root), (verbose), __FUNCTION__)
199#else
200#define rsv_window_dump(root, verbose) do {} while (0)
201#endif
202
203/**
204 * goal_in_my_reservation()
205 * @rsv: inode's reservation window
206 * @grp_goal: given goal block relative to the allocation block group
207 * @group: the current allocation block group
208 * @sb: filesystem super block
209 *
210 * Test if the given goal block (group relative) is within the file's
211 * own block reservation window range.
212 *
213 * If the reservation window is outside the goal allocation group, return 0;
214 * grp_goal (given goal block) could be -1, which means no specific
215 * goal block. In this case, always return 1.
216 * If the goal block is within the reservation window, return 1;
217 * otherwise, return 0;
218 */
219static int
220goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
221 unsigned int group, struct super_block * sb)
222{
223 ext4_fsblk_t group_first_block, group_last_block;
224
225 group_first_block = ext4_group_first_block_no(sb, group);
226 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
227
228 if ((rsv->_rsv_start > group_last_block) ||
229 (rsv->_rsv_end < group_first_block))
230 return 0;
231 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
232 || (grp_goal + group_first_block > rsv->_rsv_end)))
233 return 0;
234 return 1;
235}
236
237/**
238 * search_reserve_window()
239 * @rb_root: root of reservation tree
240 * @goal: target allocation block
241 *
242 * Find the reserved window which includes the goal, or the previous one
243 * if the goal is not in any window.
244 * Returns NULL if there are no windows or if all windows start after the goal.
245 */
246static struct ext4_reserve_window_node *
247search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
248{
249 struct rb_node *n = root->rb_node;
250 struct ext4_reserve_window_node *rsv;
251
252 if (!n)
253 return NULL;
254
255 do {
256 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
257
258 if (goal < rsv->rsv_start)
259 n = n->rb_left;
260 else if (goal > rsv->rsv_end)
261 n = n->rb_right;
262 else
263 return rsv;
264 } while (n);
265 /*
266 * We've fallen off the end of the tree: the goal wasn't inside
267 * any particular node. OK, the previous node must be to one
268 * side of the interval containing the goal. If it's the RHS,
269 * we need to back up one.
270 */
271 if (rsv->rsv_start > goal) {
272 n = rb_prev(&rsv->rsv_node);
273 rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
274 }
275 return rsv;
276}
277
278/**
279 * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
280 * @sb: super block
281 * @rsv: reservation window to add
282 *
283 * Must be called with rsv_lock held.
284 */
285void ext4_rsv_window_add(struct super_block *sb,
286 struct ext4_reserve_window_node *rsv)
287{
288 struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
289 struct rb_node *node = &rsv->rsv_node;
290 ext4_fsblk_t start = rsv->rsv_start;
291
292 struct rb_node ** p = &root->rb_node;
293 struct rb_node * parent = NULL;
294 struct ext4_reserve_window_node *this;
295
296 while (*p)
297 {
298 parent = *p;
299 this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
300
301 if (start < this->rsv_start)
302 p = &(*p)->rb_left;
303 else if (start > this->rsv_end)
304 p = &(*p)->rb_right;
305 else {
306 rsv_window_dump(root, 1);
307 BUG();
308 }
309 }
310
311 rb_link_node(node, parent, p);
312 rb_insert_color(node, root);
313}
314
315/**
316 * rsv_window_remove() -- unlink a window from the reservation rb tree
317 * @sb: super block
318 * @rsv: reservation window to remove
319 *
320 * Mark the block reservation window as not allocated, and unlink it
321 * from the filesystem reservation window rb tree. Must be called with
322 * rsv_lock held.
323 */
324static void rsv_window_remove(struct super_block *sb,
325 struct ext4_reserve_window_node *rsv)
326{
327 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
328 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
329 rsv->rsv_alloc_hit = 0;
330 rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
331}
332
333/*
334 * rsv_is_empty() -- Check if the reservation window is allocated.
335 * @rsv: given reservation window to check
336 *
337 * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
338 */
339static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
340{
341 /* a valid reservation end block could not be 0 */
342 return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
343}
344
345/**
346 * ext4_init_block_alloc_info()
347 * @inode: file inode structure
348 *
349 * Allocate and initialize the reservation window structure, and
350 * then link the window to the ext4 inode structure.
351 *
352 * The reservation window structure is only dynamically allocated
353 * and linked to ext4 inode the first time the open file
354 * needs a new block. So, before every ext4_new_block(s) call, for
355 * regular files, we should check whether the reservation window
356 * structure exists or not. In the latter case, this function is called.
357 * Failure to do so will result in block reservation being turned off for that
358 * open file.
359 *
360 * This function is called from ext4_get_blocks_handle(), also called
361 * when setting the reservation window size through ioctl before the file
362 * is open for write (needs block allocation).
363 *
364 * Needs truncate_mutex protection prior to calling this function.
365 */
366void ext4_init_block_alloc_info(struct inode *inode)
367{
368 struct ext4_inode_info *ei = EXT4_I(inode);
369 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
370 struct super_block *sb = inode->i_sb;
371
372 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
373 if (block_i) {
374 struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
375
376 rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
377 rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
378
379 /*
380 * if filesystem is mounted with NORESERVATION, the goal
381 * reservation window size is set to zero to indicate
382 * block reservation is off
383 */
384 if (!test_opt(sb, RESERVATION))
385 rsv->rsv_goal_size = 0;
386 else
387 rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
388 rsv->rsv_alloc_hit = 0;
389 block_i->last_alloc_logical_block = 0;
390 block_i->last_alloc_physical_block = 0;
391 }
392 ei->i_block_alloc_info = block_i;
393}
394
395/**
396 * ext4_discard_reservation()
397 * @inode: inode
398 *
399 * Discard (free) the block reservation window on last file close, truncate,
400 * or last iput().
401 *
402 * It is being called in three cases:
403 * ext4_release_file(): last writer closes the file
404 * ext4_clear_inode(): last iput(), when nothing links to this file.
405 * ext4_truncate(): when the block indirect map is about to change.
406 *
407 */
408void ext4_discard_reservation(struct inode *inode)
409{
410 struct ext4_inode_info *ei = EXT4_I(inode);
411 struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
412 struct ext4_reserve_window_node *rsv;
413 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
414
415 if (!block_i)
416 return;
417
418 rsv = &block_i->rsv_window_node;
419 if (!rsv_is_empty(&rsv->rsv_window)) {
420 spin_lock(rsv_lock);
421 if (!rsv_is_empty(&rsv->rsv_window))
422 rsv_window_remove(inode->i_sb, rsv);
423 spin_unlock(rsv_lock);
424 }
425}
426
427/**
428 * ext4_free_blocks_sb() -- Free given blocks and update quota
429 * @handle: handle to this transaction
430 * @sb: super block
431 * @block: start physical block to free
432 * @count: number of blocks to free
433 * @pdquot_freed_blocks: pointer to quota
434 */
435void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
436 ext4_fsblk_t block, unsigned long count,
437 unsigned long *pdquot_freed_blocks)
438{
439 struct buffer_head *bitmap_bh = NULL;
440 struct buffer_head *gd_bh;
441 unsigned long block_group;
442 ext4_grpblk_t bit;
443 unsigned long i;
444 unsigned long overflow;
445 struct ext4_group_desc * desc;
446 struct ext4_super_block * es;
447 struct ext4_sb_info *sbi;
448 int err = 0, ret;
449 ext4_grpblk_t group_freed;
450
451 *pdquot_freed_blocks = 0;
452 sbi = EXT4_SB(sb);
453 es = sbi->s_es;
454 if (block < le32_to_cpu(es->s_first_data_block) ||
455 block + count < block ||
456 block + count > ext4_blocks_count(es)) {
457 ext4_error (sb, "ext4_free_blocks",
458 "Freeing blocks not in datazone - "
459 "block = %llu, count = %lu", block, count);
460 goto error_return;
461 }
462
463 ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
464
465do_more:
466 overflow = 0;
467 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
468 /*
469 * Check to see if we are freeing blocks across a group
470 * boundary.
471 */
472 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
473 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
474 count -= overflow;
475 }
476 brelse(bitmap_bh);
477 bitmap_bh = read_block_bitmap(sb, block_group);
478 if (!bitmap_bh)
479 goto error_return;
480 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
481 if (!desc)
482 goto error_return;
483
484 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
485 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
486 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
487 in_range(block + count - 1, ext4_inode_table(sb, desc),
488 sbi->s_itb_per_group))
489 ext4_error (sb, "ext4_free_blocks",
490 "Freeing blocks in system zones - "
491 "Block = %llu, count = %lu",
492 block, count);
493
494 /*
495 * We are about to start releasing blocks in the bitmap,
496 * so we need undo access.
497 */
498 /* @@@ check errors */
499 BUFFER_TRACE(bitmap_bh, "getting undo access");
500 err = ext4_journal_get_undo_access(handle, bitmap_bh);
501 if (err)
502 goto error_return;
503
504 /*
505 * We are about to modify some metadata. Call the journal APIs
506 * to unshare ->b_data if a currently-committing transaction is
507 * using it
508 */
509 BUFFER_TRACE(gd_bh, "get_write_access");
510 err = ext4_journal_get_write_access(handle, gd_bh);
511 if (err)
512 goto error_return;
513
514 jbd_lock_bh_state(bitmap_bh);
515
516 for (i = 0, group_freed = 0; i < count; i++) {
517 /*
518 * An HJ special. This is expensive...
519 */
520#ifdef CONFIG_JBD_DEBUG
521 jbd_unlock_bh_state(bitmap_bh);
522 {
523 struct buffer_head *debug_bh;
524 debug_bh = sb_find_get_block(sb, block + i);
525 if (debug_bh) {
526 BUFFER_TRACE(debug_bh, "Deleted!");
527 if (!bh2jh(bitmap_bh)->b_committed_data)
528 BUFFER_TRACE(debug_bh,
529         "No committed data in bitmap");
530 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
531 __brelse(debug_bh);
532 }
533 }
534 jbd_lock_bh_state(bitmap_bh);
535#endif
536 if (need_resched()) {
537 jbd_unlock_bh_state(bitmap_bh);
538 cond_resched();
539 jbd_lock_bh_state(bitmap_bh);
540 }
541 /* @@@ This prevents newly-allocated data from being
542 * freed and then reallocated within the same
543 * transaction.
544 *
545 * Ideally we would want to allow that to happen, but to
546 * do so requires making jbd2_journal_forget() capable of
547 * revoking the queued write of a data block, which
548 * implies blocking on the journal lock. *forget()
549 * cannot block due to truncate races.
550 *
551 * Eventually we can fix this by making jbd2_journal_forget()
552 * return a status indicating whether or not it was able
553 * to revoke the buffer. On successful revoke, it is
554 * safe not to set the allocation bit in the committed
555 * bitmap, because we know that there is no outstanding
556 * activity on the buffer any more and so it is safe to
557 * reallocate it.
558 */
559 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
560 J_ASSERT_BH(bitmap_bh,
561 bh2jh(bitmap_bh)->b_committed_data != NULL);
562 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
563 bh2jh(bitmap_bh)->b_committed_data);
564
565 /*
566 * We clear the bit in the bitmap after setting the committed
567 * data bit, because this is the reverse order to that which
568 * the allocator uses.
569 */
570 BUFFER_TRACE(bitmap_bh, "clear bit");
571 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
572 bit + i, bitmap_bh->b_data)) {
573 jbd_unlock_bh_state(bitmap_bh);
574 ext4_error(sb, __FUNCTION__,
575 "bit already cleared for block %llu",
576 (ext4_fsblk_t)(block + i));
577 jbd_lock_bh_state(bitmap_bh);
578 BUFFER_TRACE(bitmap_bh, "bit already cleared");
579 } else {
580 group_freed++;
581 }
582 }
583 jbd_unlock_bh_state(bitmap_bh);
584
585 spin_lock(sb_bgl_lock(sbi, block_group));
586 desc->bg_free_blocks_count =
587 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
588 group_freed);
589 spin_unlock(sb_bgl_lock(sbi, block_group));
590 percpu_counter_mod(&sbi->s_freeblocks_counter, count);
591
592 /* We dirtied the bitmap block */
593 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
594 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
595
596 /* And the group descriptor block */
597 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
598 ret = ext4_journal_dirty_metadata(handle, gd_bh);
599 if (!err) err = ret;
600 *pdquot_freed_blocks += group_freed;
601
602 if (overflow && !err) {
603 block += count;
604 count = overflow;
605 goto do_more;
606 }
607 sb->s_dirt = 1;
608error_return:
609 brelse(bitmap_bh);
610 ext4_std_error(sb, err);
611 return;
612}
613
614/**
615 * ext4_free_blocks() -- Free given blocks and update quota
616 * @handle: handle for this transaction
617 * @inode: inode
618 * @block: start physical block to free
619 * @count: number of blocks to free
620 */
621void ext4_free_blocks(handle_t *handle, struct inode *inode,
622 ext4_fsblk_t block, unsigned long count)
623{
624 struct super_block * sb;
625 unsigned long dquot_freed_blocks;
626
627 sb = inode->i_sb;
628 if (!sb) {
629 printk ("ext4_free_blocks: nonexistent device");
630 return;
631 }
632 ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
633 if (dquot_freed_blocks)
634 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
635 return;
636}
637
638/**
639 * ext4_test_allocatable()
640 * @nr: given block number (group relative) to test
641 * @bh: bufferhead containing the bitmap of the given block group
642 *
643 * For ext4 allocations, we must not reuse any blocks which are
644 * allocated in the bitmap buffer's "last committed data" copy. This
645 * prevents deletes from freeing up the page for reuse until we have
646 * committed the delete transaction.
647 *
648 * If we didn't do this, then deleting something and reallocating it as
649 * data would allow the old block to be overwritten before the
650 * transaction committed (because we force data to disk before commit).
651 * This would lead to corruption if we crashed between overwriting the
652 * data and committing the delete.
653 *
654 * @@@ We may want to make this allocation behaviour conditional on
655 * data-writes at some point, and disable it for metadata allocations or
656 * sync-data inodes.
657 */
658static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
659{
660 int ret;
661 struct journal_head *jh = bh2jh(bh);
662
663 if (ext4_test_bit(nr, bh->b_data))
664 return 0;
665
666 jbd_lock_bh_state(bh);
667 if (!jh->b_committed_data)
668 ret = 1;
669 else
670 ret = !ext4_test_bit(nr, jh->b_committed_data);
671 jbd_unlock_bh_state(bh);
672 return ret;
673}
674
675/**
676 * bitmap_search_next_usable_block()
677 * @start: the starting block (group relative) of the search
678 * @bh: bufferhead contains the block group bitmap
679 * @maxblocks: the ending block (group relative) of the reservation
680 *
681 * The bitmap search --- search forward alternately through the actual
682 * bitmap on disk and the last-committed copy in journal, until we find a
683 * bit free in both bitmaps.
684 */
685static ext4_grpblk_t
686bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
687 ext4_grpblk_t maxblocks)
688{
689 ext4_grpblk_t next;
690 struct journal_head *jh = bh2jh(bh);
691
692 while (start < maxblocks) {
693 next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
694 if (next >= maxblocks)
695 return -1;
696 if (ext4_test_allocatable(next, bh))
697 return next;
698 jbd_lock_bh_state(bh);
699 if (jh->b_committed_data)
700 start = ext4_find_next_zero_bit(jh->b_committed_data,
701 maxblocks, next);
702 jbd_unlock_bh_state(bh);
703 }
704 return -1;
705}
706
707/**
708 * find_next_usable_block()
709 * @start: the starting block (group relative) to find next
710 * allocatable block in bitmap.
711 * @bh: bufferhead contains the block group bitmap
712 * @maxblocks: the ending block (group relative) for the search
713 *
714 * Find an allocatable block in a bitmap. We honor both the bitmap and
715 * its last-committed copy (if that exists), and perform the "most
716 * appropriate allocation" algorithm of looking for a free block near
717 * the initial goal; then for a free byte somewhere in the bitmap; then
718 * for any free bit in the bitmap.
719 */
720static ext4_grpblk_t
721find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
722 ext4_grpblk_t maxblocks)
723{
724 ext4_grpblk_t here, next;
725 char *p, *r;
726
727 if (start > 0) {
728 /*
729 * The goal was occupied; search forward for a free
730 * block within the next XX blocks.
731 *
732 * end_goal is more or less random, but it has to be
733 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
734 * next 64-bit boundary is simple..
735 */
736 ext4_grpblk_t end_goal = (start + 63) & ~63;
737 if (end_goal > maxblocks)
738 end_goal = maxblocks;
739 here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
740 if (here < end_goal && ext4_test_allocatable(here, bh))
741 return here;
742 ext4_debug("Bit not found near goal\n");
743 }
744
745 here = start;
746 if (here < 0)
747 here = 0;
748
749 p = ((char *)bh->b_data) + (here >> 3);
750 r = memscan(p, 0, (maxblocks - here + 7) >> 3);
751 next = (r - ((char *)bh->b_data)) << 3;
752
753 if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
754 return next;
755
756 /*
757 * The bitmap search --- search forward alternately through the actual
758 * bitmap and the last-committed copy until we find a bit free in
759 * both
760 */
761 here = bitmap_search_next_usable_block(here, bh, maxblocks);
762 return here;
763}
764
765/**
766 * claim_block()
767 * @block: the free block (group relative) to allocate
768 * @bh: the bufferhead containing the block group bitmap
769 *
770 * We think we can allocate this block in this bitmap. Try to set the bit.
771 * If that succeeds then check that nobody has allocated and then freed the
772 * block since we saw that it was not marked in b_committed_data. If it _was_
773 * allocated and freed then clear the bit in the bitmap again and return
774 * zero (failure).
775 */
776static inline int
777claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
778{
779 struct journal_head *jh = bh2jh(bh);
780 int ret;
781
782 if (ext4_set_bit_atomic(lock, block, bh->b_data))
783 return 0;
784 jbd_lock_bh_state(bh);
785 if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
786 ext4_clear_bit_atomic(lock, block, bh->b_data);
787 ret = 0;
788 } else {
789 ret = 1;
790 }
791 jbd_unlock_bh_state(bh);
792 return ret;
793}
794
795/**
796 * ext4_try_to_allocate()
797 * @sb: superblock
798 * @handle: handle to this transaction
799 * @group: given allocation block group
800 * @bitmap_bh: bufferhead holds the block bitmap
801 * @grp_goal: given target block within the group
802 * @count: target number of blocks to allocate
803 * @my_rsv: reservation window
804 *
805 * Attempt to allocate blocks within a given range. Set the range of allocation
806 * first, then find the first free bit(s) in the bitmap (within the range),
807 * and finally allocate the blocks by claiming the found free bit(s) as allocated.
808 *
809 * To set the range of this allocation:
810 * if there is a reservation window, only try to allocate block(s) from the
811 * file's own reservation window;
812 * Otherwise, the allocation range starts from the given goal block and ends at
813 * the block group's last block.
814 *
815 * If we failed to allocate the desired block then we may end up crossing to a
816 * new bitmap. In that case we must release write access to the old one via
817 * ext4_journal_release_buffer(), else we'll run out of credits.
818 */
819static ext4_grpblk_t
820ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
821 struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
822 unsigned long *count, struct ext4_reserve_window *my_rsv)
823{
824 ext4_fsblk_t group_first_block;
825 ext4_grpblk_t start, end;
826 unsigned long num = 0;
827
828 /* we do allocation within the reservation window if we have a window */
829 if (my_rsv) {
830 group_first_block = ext4_group_first_block_no(sb, group);
831 if (my_rsv->_rsv_start >= group_first_block)
832 start = my_rsv->_rsv_start - group_first_block;
833 else
834 /* reservation window cross group boundary */
835 start = 0;
836 end = my_rsv->_rsv_end - group_first_block + 1;
837 if (end > EXT4_BLOCKS_PER_GROUP(sb))
838 /* reservation window crosses group boundary */
839 end = EXT4_BLOCKS_PER_GROUP(sb);
840 if ((start <= grp_goal) && (grp_goal < end))
841 start = grp_goal;
842 else
843 grp_goal = -1;
844 } else {
845 if (grp_goal > 0)
846 start = grp_goal;
847 else
848 start = 0;
849 end = EXT4_BLOCKS_PER_GROUP(sb);
850 }
851
852 BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
853
854repeat:
855 if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
856 grp_goal = find_next_usable_block(start, bitmap_bh, end);
857 if (grp_goal < 0)
858 goto fail_access;
859 if (!my_rsv) {
860 int i;
861
862 for (i = 0; i < 7 && grp_goal > start &&
863 ext4_test_allocatable(grp_goal - 1,
864 bitmap_bh);
865 i++, grp_goal--)
866 ;
867 }
868 }
869 start = grp_goal;
870
871 if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
872 grp_goal, bitmap_bh)) {
873 /*
874 * The block was allocated by another thread, or it was
875 * allocated and then freed by another thread
876 */
877 start++;
878 grp_goal++;
879 if (start >= end)
880 goto fail_access;
881 goto repeat;
882 }
883 num++;
884 grp_goal++;
885 while (num < *count && grp_goal < end
886 && ext4_test_allocatable(grp_goal, bitmap_bh)
887 && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
888 grp_goal, bitmap_bh)) {
889 num++;
890 grp_goal++;
891 }
892 *count = num;
893 return grp_goal - num;
894fail_access:
895 *count = num;
896 return -1;
897}
898
899/**
900 * find_next_reservable_window():
901 * find a reservable space within the given range.
902 * It does not allocate the reservation window for now:
903 * alloc_new_reservation() will do the work later.
904 *
905 * @search_head: the head of the searching list;
906 * This is not necessarily the list head of the whole filesystem
907 *
908 * We have both head and start_block to assist the search
909 * for the reservable space. The list starts from head,
910 * but we will shift to the place where start_block is,
911 * then start from there, when looking for a reservable space.
912 *
913 * @size: the target new reservation window size
914 *
915 * @group_first_block: the first block we consider to start
916 * the real search from
917 *
918 * @last_block:
919 * the maximum block number that our goal reservable space
920 * could start from. This is normally the last block in this
921 * group. The search ends when the start of the next possible
922 * reservable space is beyond this boundary.
923 * This could handle the cross boundary reservation window
924 * request.
925 *
926 * Basically we search within the given range (start_block, last_block),
927 * rather than the whole reservation tree,
928 * to find a free region that is of the requested size and has not
929 * been reserved.
930 *
931 */
932static int find_next_reservable_window(
933 struct ext4_reserve_window_node *search_head,
934 struct ext4_reserve_window_node *my_rsv,
935 struct super_block * sb,
936 ext4_fsblk_t start_block,
937 ext4_fsblk_t last_block)
938{
939 struct rb_node *next;
940 struct ext4_reserve_window_node *rsv, *prev;
941 ext4_fsblk_t cur;
942 int size = my_rsv->rsv_goal_size;
943
944 /* TODO: make the start of the reservation window byte-aligned */
945 /* cur = *start_block & ~7;*/
946 cur = start_block;
947 rsv = search_head;
948 if (!rsv)
949 return -1;
950
951 while (1) {
952 if (cur <= rsv->rsv_end)
953 cur = rsv->rsv_end + 1;
954
955 /* TODO?
956 * in the case we could not find a reservable space
957 * of the expected size, during the re-search we could
958 * remember the largest reservable space we have seen
959 * and return that one.
960 *
961 * For now it will fail if we could not find the reservable
962 * space with expected-size (or more)...
963 */
964 if (cur > last_block)
965 return -1; /* fail */
966
967 prev = rsv;
968 next = rb_next(&rsv->rsv_node);
969 rsv = list_entry(next,struct ext4_reserve_window_node,rsv_node);
970
971 /*
972 * Reached the last reservation, we can just append to the
973 * previous one.
974 */
975 if (!next)
976 break;
977
978 if (cur + size <= rsv->rsv_start) {
979 /*
980 * Found a reservable space big enough. We could
981 * have a reservation across the group boundary here
982 */
983 break;
984 }
985 }
986 /*
987 * We come here either:
988 * when we reach the end of the whole list and there is empty
989 * reservable space after the last entry in the list, in which
990 * case we append the new window to the end of the list;
991 *
992 * or when we found a reservable space in the middle of the list,
993 * in which case we return the reservation window that we could
994 * append to. Either way we succeed.
995 */
996
997 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
998 rsv_window_remove(sb, my_rsv);
999
1000 /*
1001 * Let's book the whole available window for now. We will check the
1002 * disk bitmap later and then, if there are free blocks, we will adjust
1003 * the window size if it's larger than requested.
1004 * Otherwise, we will remove this node from the tree the next time
1005 * we call find_next_reservable_window().
1006 */
1007 my_rsv->rsv_start = cur;
1008 my_rsv->rsv_end = cur + size - 1;
1009 my_rsv->rsv_alloc_hit = 0;
1010
1011 if (prev != my_rsv)
1012 ext4_rsv_window_add(sb, my_rsv);
1013
1014 return 0;
1015}
1016
1017/**
1018 * alloc_new_reservation()--allocate a new reservation window
1019 *
1020 * To make a new reservation, we search part of the filesystem
1021 * reservation list (the part of the list inside the group). We try to
1022 * allocate a new reservation window near the allocation goal,
1023 * or the beginning of the group, if there is no goal.
1024 *
1025 * We first find a reservable space after the goal, then from
1026 * there, we check the bitmap for the first free block after
1027 * it. If there is no free block until the end of group, then the
1028 * whole group is full, we failed. Otherwise, check if the free
1029 * block is inside the expected reservable space, if so, we
1030 * succeed.
1031 * If the first free block is outside the reservable space, then
1032 * start from the first free block, we search for next available
1033 * space, and go on.
1034 *
1035 * On success, a new reservation will be found and inserted into the list.
1036 * It contains at least one free block, and it does not overlap with other
1037 * reservation windows.
1038 *
1039 * failed: we failed to find a reservation window in this group
1040 *
1041 * @rsv: the reservation
1042 *
1043 * @grp_goal: The goal (group-relative). It is where the search for a
1044 * free reservable space should start from.
1045 * if we have a grp_goal (grp_goal > 0), then start from there;
1046 * with no grp_goal (grp_goal = -1), we start from the first block
1047 * of the group.
1048 *
1049 * @sb: the super block
1050 * @group: the group we are trying to allocate in
1051 * @bitmap_bh: the block group block bitmap
1052 *
1053 */
1054static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
1055 ext4_grpblk_t grp_goal, struct super_block *sb,
1056 unsigned int group, struct buffer_head *bitmap_bh)
1057{
1058 struct ext4_reserve_window_node *search_head;
1059 ext4_fsblk_t group_first_block, group_end_block, start_block;
1060 ext4_grpblk_t first_free_block;
1061 struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
1062 unsigned long size;
1063 int ret;
1064 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1065
1066 group_first_block = ext4_group_first_block_no(sb, group);
1067 group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1068
1069 if (grp_goal < 0)
1070 start_block = group_first_block;
1071 else
1072 start_block = grp_goal + group_first_block;
1073
1074 size = my_rsv->rsv_goal_size;
1075
1076 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1077 /*
1078 * if the old reservation crosses the group boundary
1079 * and the goal is inside the old reservation window,
1080 * we will come here when we just failed to allocate from
1081 * the first part of the window. We still have another part
1082 * that belongs to the next group. In this case, there is no
1083 * point in discarding our window and trying to allocate a new one
1084 * in this group (which will fail). We should
1085 * keep the reservation window and simply move on.
1086 *
1087 * Maybe we could shift the start block of the reservation
1088 * window to the first block of next group.
1089 */
1090
1091 if ((my_rsv->rsv_start <= group_end_block) &&
1092 (my_rsv->rsv_end > group_end_block) &&
1093 (start_block >= my_rsv->rsv_start))
1094 return -1;
1095
1096 if ((my_rsv->rsv_alloc_hit >
1097 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1098 /*
1099 * if the previous allocation hit ratio is
1100 * greater than 1/2, then we double the size of
1101 * the reservation window the next time,
1102 * otherwise we keep the same size window
1103 */
1104 size = size * 2;
1105 if (size > EXT4_MAX_RESERVE_BLOCKS)
1106 size = EXT4_MAX_RESERVE_BLOCKS;
1107 my_rsv->rsv_goal_size= size;
1108 }
1109 }
1110
1111 spin_lock(rsv_lock);
1112 /*
1113 * shift the search start to the window near the goal block
1114 */
1115 search_head = search_reserve_window(fs_rsv_root, start_block);
1116
1117 /*
1118 * find_next_reservable_window() simply finds a reservable window
1119 * inside the given range(start_block, group_end_block).
1120 *
1121 * To make sure the reservation window has a free bit inside it, we
1122 * need to check the bitmap after we found a reservable window.
1123 */
1124retry:
1125 ret = find_next_reservable_window(search_head, my_rsv, sb,
1126 start_block, group_end_block);
1127
1128 if (ret == -1) {
1129 if (!rsv_is_empty(&my_rsv->rsv_window))
1130 rsv_window_remove(sb, my_rsv);
1131 spin_unlock(rsv_lock);
1132 return -1;
1133 }
1134
1135 /*
1136 * On success, find_next_reservable_window() returns the
1137 * reservation window where there is a reservable space after it.
1138 * Before we reserve this reservable space, we need
1139 * to make sure there is at least a free block inside this region.
1140 *
1141 * We search the block bitmap and the copy of the
1142 * last committed bitmap alternately, until we find an allocatable
1143 * block. The search starts from the start block of the reservable space
1144 * we just found.
1145 */
1146 spin_unlock(rsv_lock);
1147 first_free_block = bitmap_search_next_usable_block(
1148 my_rsv->rsv_start - group_first_block,
1149 bitmap_bh, group_end_block - group_first_block + 1);
1150
1151 if (first_free_block < 0) {
1152 /*
1153 * no free block left on the bitmap, no point
1154 * to reserve the space. return failed.
1155 */
1156 spin_lock(rsv_lock);
1157 if (!rsv_is_empty(&my_rsv->rsv_window))
1158 rsv_window_remove(sb, my_rsv);
1159 spin_unlock(rsv_lock);
1160 return -1; /* failed */
1161 }
1162
1163 start_block = first_free_block + group_first_block;
1164 /*
1165 * check if the first free block is within the
1166 * free space we just reserved
1167 */
1168 if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
1169 return 0; /* success */
1170 /*
1171 * if the first free bit we found is out of the reservable space
1172 * continue search for next reservable space,
1173 * start from where the free block is,
1174 * we also shift the list head to where we stopped last time
1175 */
1176 search_head = my_rsv;
1177 spin_lock(rsv_lock);
1178 goto retry;
1179}
1180
1181/**
1182 * try_to_extend_reservation()
1183 * @my_rsv: given reservation window
1184 * @sb: super block
1185 * @size: the delta to extend
1186 *
1187 * Attempt to expand the reservation window so that it is large enough
1188 * to hold the required number of free blocks.
1189 *
1190 * Since ext4_try_to_allocate() will always allocate blocks within
1191 * the reservation window range, if the window size is too small,
1192 * multiple blocks allocation has to stop at the end of the reservation
1193 * window. To make this more efficient, given the total number of
1194 * blocks needed and the current size of the window, we try to
1195 * expand the reservation window size if necessary on a best-effort
1196 * basis before ext4_new_blocks() tries to allocate blocks.
1197 */
1198static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
1199 struct super_block *sb, int size)
1200{
1201 struct ext4_reserve_window_node *next_rsv;
1202 struct rb_node *next;
1203 spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
1204
1205 if (!spin_trylock(rsv_lock))
1206 return;
1207
1208 next = rb_next(&my_rsv->rsv_node);
1209
1210 if (!next)
1211 my_rsv->rsv_end += size;
1212 else {
1213 next_rsv = list_entry(next, struct ext4_reserve_window_node, rsv_node);
1214
1215 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1216 my_rsv->rsv_end += size;
1217 else
1218 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1219 }
1220 spin_unlock(rsv_lock);
1221}
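
A minimal worked example of the decision above (a sketch, not code from this file;
the window values are invented): suppose my_rsv currently covers blocks [100, 107]
and the next reservation in the tree starts at block 120, so the gap after my_rsv
is 120 - 107 - 1 = 12 blocks.

	try_to_extend_reservation(my_rsv, sb, 8);  /* 12 >= 8: rsv_end grows to 115          */
	try_to_extend_reservation(my_rsv, sb, 20); /* gap now too small: rsv_end capped at 119 */

If there were no window to the right at all, rsv_end would simply grow by size.
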
1222
1223/**
1224 * ext4_try_to_allocate_with_rsv()
1225 * @sb: superblock
1226 * @handle: handle to this transaction
1227 * @group: given allocation block group
1228 * @bitmap_bh: bufferhead holds the block bitmap
1229 * @grp_goal: given target block within the group
1230 * @count: target number of blocks to allocate
1231 * @my_rsv: reservation window
1232 * @errp: pointer to store the error code
1233 *
1234 * This is the main function used to allocate a new block and its reservation
1235 * window.
1236 *
1237 * Each time a new block allocation is needed, first try to allocate from
1238 * the inode's own reservation. If the inode does not have a reservation
1239 * window yet, then instead of first looking for a free bit in the bitmap
1240 * and then checking the reservation list to see whether that bit is inside
1241 * somebody else's window, we try to allocate a reservation window for the
1242 * inode starting from the goal, and then allocate within that window.
1243 *
1244 * This avoids having to search the reservation list over and over
1245 * again when somebody is looking for a free block (without a
1246 * reservation) and there are lots of free blocks, but they are all
1247 * reserved.
1248 *
1249 * We use a red-black tree for the per-filesystem reservation list.
1250 *
1251 */
1252static ext4_grpblk_t
1253ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1254 unsigned int group, struct buffer_head *bitmap_bh,
1255 ext4_grpblk_t grp_goal,
1256 struct ext4_reserve_window_node * my_rsv,
1257 unsigned long *count, int *errp)
1258{
1259 ext4_fsblk_t group_first_block, group_last_block;
1260 ext4_grpblk_t ret = 0;
1261 int fatal;
1262 unsigned long num = *count;
1263
1264 *errp = 0;
1265
1266 /*
1267 * Make sure we use undo access for the bitmap, because it is critical
1268 * that we do the frozen_data COW on bitmap buffers in all cases even
1269 * if the buffer is in BJ_Forget state in the committing transaction.
1270 */
1271 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1272 fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
1273 if (fatal) {
1274 *errp = fatal;
1275 return -1;
1276 }
1277
1278 /*
1279	 * We don't deal with reservations when the
1280	 * filesystem is mounted without reservations,
1281	 * or the file is not a regular file,
1282	 * or the last attempt to allocate a block with reservations turned on failed.
1283 */
1284	if (my_rsv == NULL) {
1285 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1286 grp_goal, count, NULL);
1287 goto out;
1288 }
1289 /*
1290	 * grp_goal is a group-relative block number (if there is a goal):
1291	 *	0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
1292	 * group_first_block is a filesystem-wide block number; it is the
1293	 * block number of the first block in this group
1294 */
1295 group_first_block = ext4_group_first_block_no(sb, group);
1296 group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1297
1298 /*
1299 * Basically we will allocate a new block from inode's reservation
1300 * window.
1301 *
1302 * We need to allocate a new reservation window, if:
1303 * a) inode does not have a reservation window; or
1304 * b) last attempt to allocate a block from existing reservation
1305 * failed; or
1306	 * c) we come here with a goal, and the goal falls outside the
1307	 *    existing reservation window.
1308	 *
1309	 * We do not need a new reservation window if we come here with a
1310	 * goal that is inside the existing window, or with no goal but an
1311	 * existing window; in those cases we allocate from that window directly.
1312 */
1313 while (1) {
1314 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1315 !goal_in_my_reservation(&my_rsv->rsv_window,
1316 grp_goal, group, sb)) {
1317 if (my_rsv->rsv_goal_size < *count)
1318 my_rsv->rsv_goal_size = *count;
1319 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1320 group, bitmap_bh);
1321 if (ret < 0)
1322 break; /* failed */
1323
1324 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1325 grp_goal, group, sb))
1326 grp_goal = -1;
1327 } else if (grp_goal > 0 &&
1328 (my_rsv->rsv_end-grp_goal+1) < *count)
1329 try_to_extend_reservation(my_rsv, sb,
1330 *count-my_rsv->rsv_end + grp_goal - 1);
1331
1332 if ((my_rsv->rsv_start > group_last_block) ||
1333 (my_rsv->rsv_end < group_first_block)) {
1334 rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
1335 BUG();
1336 }
1337 ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
1338 grp_goal, &num, &my_rsv->rsv_window);
1339 if (ret >= 0) {
1340 my_rsv->rsv_alloc_hit += num;
1341 *count = num;
1342 break; /* succeed */
1343 }
1344 num = *count;
1345 }
1346out:
1347 if (ret >= 0) {
1348 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1349 "bitmap block");
1350 fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
1351 if (fatal) {
1352 *errp = fatal;
1353 return -1;
1354 }
1355 return ret;
1356 }
1357
1358 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1359 ext4_journal_release_buffer(handle, bitmap_bh);
1360 return ret;
1361}
1362
1363/**
1364 * ext4_has_free_blocks()
1365 * @sbi: in-core super block structure.
1366 *
1367 * Check if filesystem has at least 1 free block available for allocation.
1368 */
1369static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
1370{
1371 ext4_fsblk_t free_blocks, root_blocks;
1372
1373 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1374 root_blocks = ext4_r_blocks_count(sbi->s_es);
1375 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1376 sbi->s_resuid != current->fsuid &&
1377 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1378 return 0;
1379 }
1380 return 1;
1381}
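
A hedged numeric sketch of the reserved-blocks check above (the values are
invented, not from this file):

	/*
	 * Suppose s_freeblocks_counter reads 400 and ext4_r_blocks_count() is 500.
	 * 400 < 500 + 1, so an ordinary task is refused (ext4_has_free_blocks()
	 * returns 0), while a task with CAP_SYS_RESOURCE, or whose fsuid/fsgid
	 * matches s_resuid/s_resgid, still gets 1 and may use the reserved tail.
	 */
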
1382
1383/**
1384 * ext4_should_retry_alloc()
1385 * @sb: super block
1386 * @retries: number of attempts that have been made
1387 *
1388 * ext4_should_retry_alloc() is called when ENOSPC is returned. If it
1389 * is profitable to retry the operation, this function will wait
1390 * for the current or committing transaction to complete, and then
1391 * return TRUE.
1392 *
1393 * If the total number of retries exceeds three, return FALSE.
1394 */
1395int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1396{
1397 if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
1398 return 0;
1399
1400 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1401
1402 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1403}
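
To show how this helper is meant to be used, here is a hedged sketch of a caller
retry loop; my_allocate_one_block() is a hypothetical function, not something
defined in this file, and the returned block number is ignored for brevity.

static int my_allocate_one_block(handle_t *handle, struct inode *inode,
				 ext4_fsblk_t goal)
{
	unsigned long count = 1;
	int err, retries = 0;

retry:
	ext4_new_blocks(handle, inode, goal, &count, &err);
	if (err == -ENOSPC &&
	    ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;	/* a forced commit may have released blocks */
	return err;
}
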
1404
1405/**
1406 * ext4_new_blocks() -- core block(s) allocation function
1407 * @handle: handle to this transaction
1408 * @inode: file inode
1409 * @goal: given target block(filesystem wide)
1410 * @count: target number of blocks to allocate
1411 * @errp: error code
1412 *
1413 * ext4_new_blocks uses a goal block to assist allocation. It first tries to
1414 * allocate block(s) from the block group that contains the goal block. If that
1415 * fails, it will try to allocate block(s) from other block groups without
1416 * any specific goal block.
1417 *
1418 */
1419ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1420 ext4_fsblk_t goal, unsigned long *count, int *errp)
1421{
1422 struct buffer_head *bitmap_bh = NULL;
1423 struct buffer_head *gdp_bh;
1424 unsigned long group_no;
1425 int goal_group;
1426 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1427 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1428	ext4_fsblk_t ret_block;		/* filesystem-wide allocated block */
1429 int bgi; /* blockgroup iteration index */
1430 int fatal = 0, err;
1431 int performed_allocation = 0;
1432 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1433 struct super_block *sb;
1434 struct ext4_group_desc *gdp;
1435 struct ext4_super_block *es;
1436 struct ext4_sb_info *sbi;
1437 struct ext4_reserve_window_node *my_rsv = NULL;
1438 struct ext4_block_alloc_info *block_i;
1439 unsigned short windowsz = 0;
1440#ifdef EXT4FS_DEBUG
1441 static int goal_hits, goal_attempts;
1442#endif
1443 unsigned long ngroups;
1444 unsigned long num = *count;
1445
1446 *errp = -ENOSPC;
1447 sb = inode->i_sb;
1448 if (!sb) {
1449		printk("ext4_new_blocks: nonexistent device\n");
1450 return 0;
1451 }
1452
1453 /*
1454 * Check quota for allocation of this block.
1455 */
1456 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1457 *errp = -EDQUOT;
1458 return 0;
1459 }
1460
1461 sbi = EXT4_SB(sb);
1462 es = EXT4_SB(sb)->s_es;
1463 ext4_debug("goal=%lu.\n", goal);
1464 /*
1465	 * Allocate a block from the reservation only when
1466	 * the filesystem is mounted with reservations (the default, -o reservation),
1467	 * it's a regular file, and
1468	 * the desired window size is greater than 0 (one could use the ioctl
1469	 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1470	 * reservations on that particular file).
1471 */
1472 block_i = EXT4_I(inode)->i_block_alloc_info;
1473 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1474 my_rsv = &block_i->rsv_window_node;
1475
1476 if (!ext4_has_free_blocks(sbi)) {
1477 *errp = -ENOSPC;
1478 goto out;
1479 }
1480
1481 /*
1482 * First, test whether the goal block is free.
1483 */
1484 if (goal < le32_to_cpu(es->s_first_data_block) ||
1485 goal >= ext4_blocks_count(es))
1486 goal = le32_to_cpu(es->s_first_data_block);
1487 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1488 goal_group = group_no;
1489retry_alloc:
1490 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1491 if (!gdp)
1492 goto io_error;
1493
1494 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1495 /*
1496	 * if there are not enough free blocks to make a new reservation,
1497	 * turn off reservations for this allocation
1498 */
1499 if (my_rsv && (free_blocks < windowsz)
1500 && (rsv_is_empty(&my_rsv->rsv_window)))
1501 my_rsv = NULL;
1502
1503 if (free_blocks > 0) {
1504 bitmap_bh = read_block_bitmap(sb, group_no);
1505 if (!bitmap_bh)
1506 goto io_error;
1507 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1508 group_no, bitmap_bh, grp_target_blk,
1509 my_rsv, &num, &fatal);
1510 if (fatal)
1511 goto out;
1512 if (grp_alloc_blk >= 0)
1513 goto allocated;
1514 }
1515
1516 ngroups = EXT4_SB(sb)->s_groups_count;
1517 smp_rmb();
1518
1519 /*
1520 * Now search the rest of the groups. We assume that
1521	 * group_no and gdp correctly point to the last group visited.
1522 */
1523 for (bgi = 0; bgi < ngroups; bgi++) {
1524 group_no++;
1525 if (group_no >= ngroups)
1526 group_no = 0;
1527 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1528 if (!gdp) {
1529 *errp = -EIO;
1530 goto out;
1531 }
1532 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1533 /*
1534 * skip this group if the number of
1535 * free blocks is less than half of the reservation
1536 * window size.
1537 */
1538 if (free_blocks <= (windowsz/2))
1539 continue;
1540
1541 brelse(bitmap_bh);
1542 bitmap_bh = read_block_bitmap(sb, group_no);
1543 if (!bitmap_bh)
1544 goto io_error;
1545 /*
1546 * try to allocate block(s) from this group, without a goal(-1).
1547 */
1548 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1549 group_no, bitmap_bh, -1, my_rsv,
1550 &num, &fatal);
1551 if (fatal)
1552 goto out;
1553 if (grp_alloc_blk >= 0)
1554 goto allocated;
1555 }
1556 /*
1557	 * We may end up with a bogus earlier ENOSPC error because the
1558	 * filesystem is "full" of reservations, but
1559	 * there may indeed be free blocks available on disk.
1560	 * In this case, we just forget about the reservations and
1561	 * do the block allocation without reservations.
1562 */
1563 if (my_rsv) {
1564 my_rsv = NULL;
1565 group_no = goal_group;
1566 goto retry_alloc;
1567 }
1568 /* No space left on the device */
1569 *errp = -ENOSPC;
1570 goto out;
1571
1572allocated:
1573
1574 ext4_debug("using block group %d(%d)\n",
1575 group_no, gdp->bg_free_blocks_count);
1576
1577 BUFFER_TRACE(gdp_bh, "get_write_access");
1578 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1579 if (fatal)
1580 goto out;
1581
1582 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1583
1584 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1585	    in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1586 in_range(ret_block, ext4_inode_table(sb, gdp),
1587 EXT4_SB(sb)->s_itb_per_group) ||
1588 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1589 EXT4_SB(sb)->s_itb_per_group))
1590 ext4_error(sb, "ext4_new_block",
1591 "Allocating block in system zone - "
1592 "blocks from %llu, length %lu",
1593 ret_block, num);
1594
1595 performed_allocation = 1;
1596
1597#ifdef CONFIG_JBD_DEBUG
1598 {
1599 struct buffer_head *debug_bh;
1600
1601 /* Record bitmap buffer state in the newly allocated block */
1602 debug_bh = sb_find_get_block(sb, ret_block);
1603 if (debug_bh) {
1604 BUFFER_TRACE(debug_bh, "state when allocated");
1605 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1606 brelse(debug_bh);
1607 }
1608 }
1609 jbd_lock_bh_state(bitmap_bh);
1610 spin_lock(sb_bgl_lock(sbi, group_no));
1611 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1612 int i;
1613
1614 for (i = 0; i < num; i++) {
1615 if (ext4_test_bit(grp_alloc_blk+i,
1616 bh2jh(bitmap_bh)->b_committed_data)) {
1617 printk("%s: block was unexpectedly set in "
1618 "b_committed_data\n", __FUNCTION__);
1619 }
1620 }
1621 }
1622 ext4_debug("found bit %d\n", grp_alloc_blk);
1623 spin_unlock(sb_bgl_lock(sbi, group_no));
1624 jbd_unlock_bh_state(bitmap_bh);
1625#endif
1626
1627 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1628 ext4_error(sb, "ext4_new_block",
1629 "block(%llu) >= blocks count(%llu) - "
1630 "block_group = %lu, es == %p ", ret_block,
1631 ext4_blocks_count(es), group_no, es);
1632 goto out;
1633 }
1634
1635 /*
1636 * It is up to the caller to add the new buffer to a journal
1637 * list of some description. We don't know in advance whether
1638 * the caller wants to use it as metadata or data.
1639 */
1640 ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
1641 ret_block, goal_hits, goal_attempts);
1642
1643 spin_lock(sb_bgl_lock(sbi, group_no));
1644 gdp->bg_free_blocks_count =
1645 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1646 spin_unlock(sb_bgl_lock(sbi, group_no));
1647 percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
1648
1649 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1650 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1651 if (!fatal)
1652 fatal = err;
1653
1654 sb->s_dirt = 1;
1655 if (fatal)
1656 goto out;
1657
1658 *errp = 0;
1659 brelse(bitmap_bh);
1660 DQUOT_FREE_BLOCK(inode, *count-num);
1661 *count = num;
1662 return ret_block;
1663
1664io_error:
1665 *errp = -EIO;
1666out:
1667 if (fatal) {
1668 *errp = fatal;
1669 ext4_std_error(sb, fatal);
1670 }
1671 /*
1672 * Undo the block allocation
1673 */
1674 if (!performed_allocation)
1675 DQUOT_FREE_BLOCK(inode, *count);
1676 brelse(bitmap_bh);
1677 return 0;
1678}
1679
1680ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1681 ext4_fsblk_t goal, int *errp)
1682{
1683 unsigned long count = 1;
1684
1685 return ext4_new_blocks(handle, inode, goal, &count, errp);
1686}
1687
1688/**
1689 * ext4_count_free_blocks() -- count filesystem free blocks
1690 * @sb: superblock
1691 *
1692 * Adds up the number of free blocks from each block group.
1693 */
1694ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
1695{
1696 ext4_fsblk_t desc_count;
1697 struct ext4_group_desc *gdp;
1698 int i;
1699 unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
1700#ifdef EXT4FS_DEBUG
1701 struct ext4_super_block *es;
1702 ext4_fsblk_t bitmap_count;
1703 unsigned long x;
1704 struct buffer_head *bitmap_bh = NULL;
1705
1706 es = EXT4_SB(sb)->s_es;
1707 desc_count = 0;
1708 bitmap_count = 0;
1709 gdp = NULL;
1710
1711 smp_rmb();
1712 for (i = 0; i < ngroups; i++) {
1713 gdp = ext4_get_group_desc(sb, i, NULL);
1714 if (!gdp)
1715 continue;
1716 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1717 brelse(bitmap_bh);
1718 bitmap_bh = read_block_bitmap(sb, i);
1719 if (bitmap_bh == NULL)
1720 continue;
1721
1722 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
1723 printk("group %d: stored = %d, counted = %lu\n",
1724 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1725 bitmap_count += x;
1726 }
1727 brelse(bitmap_bh);
1728 printk("ext4_count_free_blocks: stored = %llu"
1729 ", computed = %llu, %llu\n",
1730 EXT4_FREE_BLOCKS_COUNT(es),
1731 desc_count, bitmap_count);
1732 return bitmap_count;
1733#else
1734 desc_count = 0;
1735 smp_rmb();
1736 for (i = 0; i < ngroups; i++) {
1737 gdp = ext4_get_group_desc(sb, i, NULL);
1738 if (!gdp)
1739 continue;
1740 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1741 }
1742
1743 return desc_count;
1744#endif
1745}
1746
1747static inline int
1748block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
1749{
1750 ext4_grpblk_t offset;
1751
1752 ext4_get_group_no_and_offset(sb, block, NULL, &offset);
1753 return ext4_test_bit (offset, map);
1754}
1755
1756static inline int test_root(int a, int b)
1757{
1758 int num = b;
1759
1760 while (a > num)
1761 num *= b;
1762 return num == a;
1763}
1764
1765static int ext4_group_sparse(int group)
1766{
1767 if (group <= 1)
1768 return 1;
1769 if (!(group & 1))
1770 return 0;
1771 return (test_root(group, 7) || test_root(group, 5) ||
1772 test_root(group, 3));
1773}
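
With the sparse_super feature, only group 0, group 1, and groups whose number is a
power of 3, 5 or 7 keep superblock/descriptor backups. A small sketch of that rule
(not part of this file; my_list_sparse_groups() is a hypothetical name):

static void my_list_sparse_groups(int ngroups)
{
	int group;

	for (group = 0; group < ngroups; group++)
		if (ext4_group_sparse(group))
			printk("group %d carries a backup\n", group);
	/* for ngroups = 50 this prints groups 0 1 3 5 7 9 25 27 49 */
}
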
1774
1775/**
1776 * ext4_bg_has_super - number of blocks used by the superblock in group
1777 * @sb: superblock for filesystem
1778 * @group: group number to check
1779 *
1780 * Return the number of blocks used by the superblock (primary or backup)
1781 * in this group. Currently this will be only 0 or 1.
1782 */
1783int ext4_bg_has_super(struct super_block *sb, int group)
1784{
1785 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1786 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1787 !ext4_group_sparse(group))
1788 return 0;
1789 return 1;
1790}
1791
1792static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
1793{
1794 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1795 unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
1796 unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
1797
1798 if (group == first || group == first + 1 || group == last)
1799 return 1;
1800 return 0;
1801}
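
A worked example of the placement rule above (a sketch; it assumes 4 KB blocks and
32-byte group descriptors, so EXT4_DESC_PER_BLOCK(sb) would be 128):

	/*
	 * Metagroup 1 then covers groups 128..255: first = 128, last = 255,
	 * so ext4_bg_num_gdb_meta() returns 1 only for groups 128, 129 and
	 * 255, and 0 for every other group in that metagroup.
	 */
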
1802
1803static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
1804{
1805 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1806 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1807 !ext4_group_sparse(group))
1808 return 0;
1809 return EXT4_SB(sb)->s_gdb_count;
1810}
1811
1812/**
1813 * ext4_bg_num_gdb - number of blocks used by the group table in group
1814 * @sb: superblock for filesystem
1815 * @group: group number to check
1816 *
1817 * Return the number of blocks used by the group descriptor table
1818 * (primary or backup) in this group. In the future there may be a
1819 * different number of descriptor blocks in each group.
1820 */
1821unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
1822{
1823 unsigned long first_meta_bg =
1824 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
1825 unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
1826
1827 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
1828 metagroup < first_meta_bg)
1829 return ext4_bg_num_gdb_nometa(sb,group);
1830
1831 return ext4_bg_num_gdb_meta(sb,group);
1832
1833}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 000000000000..11e93c169bcf
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,32 @@
1/*
2 * linux/fs/ext4/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/buffer_head.h>
11#include <linux/jbd2.h>
12#include <linux/ext4_fs.h>
13
14#ifdef EXT4FS_DEBUG
15
16static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
19{
20 unsigned int i;
21 unsigned long sum = 0;
22
23 if (!map)
24 return (0);
25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum);
29}
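
Each nibblemap entry is the number of zero bits in the corresponding 4-bit value,
so one byte costs two table lookups. A hedged sketch (count_free_in_byte() is a
hypothetical helper, not in this file):

static unsigned int count_free_in_byte(unsigned char b)
{
	/* e.g. b = 0xe9 (1110 1001): nibblemap[0x9] = 2, nibblemap[0xe] = 1 -> 3 free bits */
	return nibblemap[b & 0xf] + nibblemap[(b >> 4) & 0xf];
}
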
30
31#endif /* EXT4FS_DEBUG */
32
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 000000000000..f8595787a70e
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,518 @@
1/*
2 * linux/fs/ext4/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/fs.h>
25#include <linux/jbd2.h>
26#include <linux/ext4_fs.h>
27#include <linux/buffer_head.h>
28#include <linux/smp_lock.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32static unsigned char ext4_filetype_table[] = {
33 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
34};
35
36static int ext4_readdir(struct file *, void *, filldir_t);
37static int ext4_dx_readdir(struct file * filp,
38 void * dirent, filldir_t filldir);
39static int ext4_release_dir (struct inode * inode,
40 struct file * filp);
41
42const struct file_operations ext4_dir_operations = {
43 .llseek = generic_file_llseek,
44 .read = generic_read_dir,
45 .readdir = ext4_readdir, /* we take BKL. needed?*/
46 .ioctl = ext4_ioctl, /* BKL held */
47#ifdef CONFIG_COMPAT
48 .compat_ioctl = ext4_compat_ioctl,
49#endif
50 .fsync = ext4_sync_file, /* BKL held */
51#ifdef CONFIG_EXT4_INDEX
52 .release = ext4_release_dir,
53#endif
54};
55
56
57static unsigned char get_dtype(struct super_block *sb, int filetype)
58{
59 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
60 (filetype >= EXT4_FT_MAX))
61 return DT_UNKNOWN;
62
63 return (ext4_filetype_table[filetype]);
64}
65
66
67int ext4_check_dir_entry (const char * function, struct inode * dir,
68 struct ext4_dir_entry_2 * de,
69 struct buffer_head * bh,
70 unsigned long offset)
71{
72 const char * error_msg = NULL;
73 const int rlen = le16_to_cpu(de->rec_len);
74
75 if (rlen < EXT4_DIR_REC_LEN(1))
76 error_msg = "rec_len is smaller than minimal";
77 else if (rlen % 4 != 0)
78 error_msg = "rec_len % 4 != 0";
79 else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
80 error_msg = "rec_len is too small for name_len";
81 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
82 error_msg = "directory entry across blocks";
83 else if (le32_to_cpu(de->inode) >
84 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
85 error_msg = "inode out of bounds";
86
87 if (error_msg != NULL)
88 ext4_error (dir->i_sb, function,
89 "bad entry in directory #%lu: %s - "
90 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
91 dir->i_ino, error_msg, offset,
92 (unsigned long) le32_to_cpu(de->inode),
93 rlen, de->name_len);
94 return error_msg == NULL ? 1 : 0;
95}
96
97static int ext4_readdir(struct file * filp,
98 void * dirent, filldir_t filldir)
99{
100 int error = 0;
101 unsigned long offset;
102 int i, stored;
103 struct ext4_dir_entry_2 *de;
104 struct super_block *sb;
105 int err;
106 struct inode *inode = filp->f_dentry->d_inode;
107 int ret = 0;
108
109 sb = inode->i_sb;
110
111#ifdef CONFIG_EXT4_INDEX
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) {
118 ret = err;
119 goto out;
120 }
121 /*
122 * We don't set the inode dirty flag since it's not
123		 * critical that it gets flushed back to the disk.
124 */
125 EXT4_I(filp->f_dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
126 }
127#endif
128 stored = 0;
129 offset = filp->f_pos & (sb->s_blocksize - 1);
130
131 while (!error && !stored && filp->f_pos < inode->i_size) {
132 unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
133 struct buffer_head map_bh;
134 struct buffer_head *bh = NULL;
135
136 map_bh.b_state = 0;
137 err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
138 if (err > 0) {
139 page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
140 &filp->f_ra,
141 filp,
142 map_bh.b_blocknr >>
143 (PAGE_CACHE_SHIFT - inode->i_blkbits),
144 1);
145 bh = ext4_bread(NULL, inode, blk, 0, &err);
146 }
147
148 /*
149 * We ignore I/O errors on directories so users have a chance
150 * of recovering data when there's a bad sector
151 */
152 if (!bh) {
153 ext4_error (sb, "ext4_readdir",
154 "directory #%lu contains a hole at offset %lu",
155 inode->i_ino, (unsigned long)filp->f_pos);
156 filp->f_pos += sb->s_blocksize - offset;
157 continue;
158 }
159
160revalidate:
161 /* If the dir block has changed since the last call to
162 * readdir(2), then we might be pointing to an invalid
163 * dirent right now. Scan from the start of the block
164 * to make sure. */
165 if (filp->f_version != inode->i_version) {
166 for (i = 0; i < sb->s_blocksize && i < offset; ) {
167 de = (struct ext4_dir_entry_2 *)
168 (bh->b_data + i);
169 /* It's too expensive to do a full
170 * dirent test each time round this
171 * loop, but we do have to test at
172 * least that it is non-zero. A
173 * failure will be detected in the
174 * dirent test below. */
175 if (le16_to_cpu(de->rec_len) <
176 EXT4_DIR_REC_LEN(1))
177 break;
178 i += le16_to_cpu(de->rec_len);
179 }
180 offset = i;
181 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
182 | offset;
183 filp->f_version = inode->i_version;
184 }
185
186 while (!error && filp->f_pos < inode->i_size
187 && offset < sb->s_blocksize) {
188 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
189 if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
190 bh, offset)) {
191 /*
192 * On error, skip the f_pos to the next block
193 */
194 filp->f_pos = (filp->f_pos |
195 (sb->s_blocksize - 1)) + 1;
196 brelse (bh);
197 ret = stored;
198 goto out;
199 }
200 offset += le16_to_cpu(de->rec_len);
201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section
203 * if the data destination is
204 * currently swapped out. So, use a
205 * version stamp to detect whether or
206 * not the directory has been modified
207 * during the copy operation.
208 */
209 unsigned long version = filp->f_version;
210
211 error = filldir(dirent, de->name,
212 de->name_len,
213 filp->f_pos,
214 le32_to_cpu(de->inode),
215 get_dtype(sb, de->file_type));
216 if (error)
217 break;
218 if (version != filp->f_version)
219 goto revalidate;
220 stored ++;
221 }
222 filp->f_pos += le16_to_cpu(de->rec_len);
223 }
224 offset = 0;
225 brelse (bh);
226 }
227out:
228 return ret;
229}
230
231#ifdef CONFIG_EXT4_INDEX
232/*
233 * These functions convert from the major/minor hash to an f_pos
234 * value.
235 *
236 * Currently we only use the major hash number. This is unfortunate, but
237 * on 32-bit machines, the same VFS interface is used for lseek and
238 * llseek, so if we use the 64 bit offset, then the 32-bit versions of
239 * lseek/telldir/seekdir will blow out spectacularly, and from within
240 * the ext2 low-level routine, we don't know if we're being called by
241 * a 64-bit version of the system call or the 32-bit version of the
242 * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
243 * cookie. Sigh.
244 */
245#define hash2pos(major, minor) (major >> 1)
246#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
247#define pos2min_hash(pos) (0)
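
A hedged round-trip example of these macros (example_hash_roundtrip() is a
hypothetical name and the hash values are invented):

static void example_hash_roundtrip(void)
{
	__u32 major = 0x5a4c2e10, back;
	loff_t pos;

	pos  = hash2pos(major, 0x1234abcd);	/* 0x2d261708: the minor hash is dropped        */
	back = pos2maj_hash(pos);		/* 0x5a4c2e10: bit 0 of the hash always comes back 0 */
	BUG_ON(back != (major & ~1));
}
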
248
249/*
250 * This structure holds the nodes of the red-black tree used to store
251 * the directory entry in hash order.
252 */
253struct fname {
254 __u32 hash;
255 __u32 minor_hash;
256 struct rb_node rb_hash;
257 struct fname *next;
258 __u32 inode;
259 __u8 name_len;
260 __u8 file_type;
261 char name[0];
262};
263
264/*
265 * This function implements a non-recursive way of freeing all of the
266 * nodes in the red-black tree.
267 */
268static void free_rb_tree_fname(struct rb_root *root)
269{
270 struct rb_node *n = root->rb_node;
271 struct rb_node *parent;
272 struct fname *fname;
273
274 while (n) {
275 /* Do the node's children first */
276 if ((n)->rb_left) {
277 n = n->rb_left;
278 continue;
279 }
280 if (n->rb_right) {
281 n = n->rb_right;
282 continue;
283 }
284 /*
285 * The node has no children; free it, and then zero
286 * out parent's link to it. Finally go to the
287 * beginning of the loop and try to free the parent
288 * node.
289 */
290 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) {
293 struct fname * old = fname;
294 fname = fname->next;
295 kfree (old);
296 }
297 if (!parent)
298 root->rb_node = NULL;
299 else if (parent->rb_left == n)
300 parent->rb_left = NULL;
301 else if (parent->rb_right == n)
302 parent->rb_right = NULL;
303 n = parent;
304 }
305 root->rb_node = NULL;
306}
307
308
309static struct dir_private_info *create_dir_info(loff_t pos)
310{
311 struct dir_private_info *p;
312
313 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
314 if (!p)
315 return NULL;
316 p->root.rb_node = NULL;
317 p->curr_node = NULL;
318 p->extra_fname = NULL;
319 p->last_pos = 0;
320 p->curr_hash = pos2maj_hash(pos);
321 p->curr_minor_hash = pos2min_hash(pos);
322 p->next_hash = 0;
323 return p;
324}
325
326void ext4_htree_free_dir_info(struct dir_private_info *p)
327{
328 free_rb_tree_fname(&p->root);
329 kfree(p);
330}
331
332/*
333 * Given a directory entry, enter it into the fname rb tree.
334 */
335int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
336 __u32 minor_hash,
337 struct ext4_dir_entry_2 *dirent)
338{
339 struct rb_node **p, *parent = NULL;
340 struct fname * fname, *new_fn;
341 struct dir_private_info *info;
342 int len;
343
344 info = (struct dir_private_info *) dir_file->private_data;
345 p = &info->root.rb_node;
346
347 /* Create and allocate the fname structure */
348 len = sizeof(struct fname) + dirent->name_len + 1;
349 new_fn = kzalloc(len, GFP_KERNEL);
350 if (!new_fn)
351 return -ENOMEM;
352 new_fn->hash = hash;
353 new_fn->minor_hash = minor_hash;
354 new_fn->inode = le32_to_cpu(dirent->inode);
355 new_fn->name_len = dirent->name_len;
356 new_fn->file_type = dirent->file_type;
357 memcpy(new_fn->name, dirent->name, dirent->name_len);
358 new_fn->name[dirent->name_len] = 0;
359
360 while (*p) {
361 parent = *p;
362 fname = rb_entry(parent, struct fname, rb_hash);
363
364 /*
365 * If the hash and minor hash match up, then we put
366 * them on a linked list. This rarely happens...
367 */
368 if ((new_fn->hash == fname->hash) &&
369 (new_fn->minor_hash == fname->minor_hash)) {
370 new_fn->next = fname->next;
371 fname->next = new_fn;
372 return 0;
373 }
374
375 if (new_fn->hash < fname->hash)
376 p = &(*p)->rb_left;
377 else if (new_fn->hash > fname->hash)
378 p = &(*p)->rb_right;
379 else if (new_fn->minor_hash < fname->minor_hash)
380 p = &(*p)->rb_left;
381 else /* if (new_fn->minor_hash > fname->minor_hash) */
382 p = &(*p)->rb_right;
383 }
384
385 rb_link_node(&new_fn->rb_hash, parent, p);
386 rb_insert_color(&new_fn->rb_hash, &info->root);
387 return 0;
388}
389
390
391
392/*
393 * This is a helper function for ext4_dx_readdir. It calls filldir
394 * for all entries on the fname linked list. (Normally there is only
395 * one entry on the linked list, unless there are 62 bit hash collisions.)
396 */
397static int call_filldir(struct file * filp, void * dirent,
398 filldir_t filldir, struct fname *fname)
399{
400 struct dir_private_info *info = filp->private_data;
401 loff_t curr_pos;
402 struct inode *inode = filp->f_dentry->d_inode;
403 struct super_block * sb;
404 int error;
405
406 sb = inode->i_sb;
407
408 if (!fname) {
409 printk("call_filldir: called with null fname?!?\n");
410 return 0;
411 }
412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
413 while (fname) {
414 error = filldir(dirent, fname->name,
415 fname->name_len, curr_pos,
416 fname->inode,
417 get_dtype(sb, fname->file_type));
418 if (error) {
419 filp->f_pos = curr_pos;
420 info->extra_fname = fname->next;
421 return error;
422 }
423 fname = fname->next;
424 }
425 return 0;
426}
427
428static int ext4_dx_readdir(struct file * filp,
429 void * dirent, filldir_t filldir)
430{
431 struct dir_private_info *info = filp->private_data;
432 struct inode *inode = filp->f_dentry->d_inode;
433 struct fname *fname;
434 int ret;
435
436 if (!info) {
437 info = create_dir_info(filp->f_pos);
438 if (!info)
439 return -ENOMEM;
440 filp->private_data = info;
441 }
442
443 if (filp->f_pos == EXT4_HTREE_EOF)
444 return 0; /* EOF */
445
446	/* Someone has messed with f_pos; reset the world */
447 if (info->last_pos != filp->f_pos) {
448 free_rb_tree_fname(&info->root);
449 info->curr_node = NULL;
450 info->extra_fname = NULL;
451 info->curr_hash = pos2maj_hash(filp->f_pos);
452 info->curr_minor_hash = pos2min_hash(filp->f_pos);
453 }
454
455 /*
456 * If there are any leftover names on the hash collision
457 * chain, return them first.
458 */
459 if (info->extra_fname &&
460 call_filldir(filp, dirent, filldir, info->extra_fname))
461 goto finished;
462
463 if (!info->curr_node)
464 info->curr_node = rb_first(&info->root);
465
466 while (1) {
467 /*
468 * Fill the rbtree if we have no more entries,
469 * or the inode has changed since we last read in the
470 * cached entries.
471 */
472 if ((!info->curr_node) ||
473 (filp->f_version != inode->i_version)) {
474 info->curr_node = NULL;
475 free_rb_tree_fname(&info->root);
476 filp->f_version = inode->i_version;
477 ret = ext4_htree_fill_tree(filp, info->curr_hash,
478 info->curr_minor_hash,
479 &info->next_hash);
480 if (ret < 0)
481 return ret;
482 if (ret == 0) {
483 filp->f_pos = EXT4_HTREE_EOF;
484 break;
485 }
486 info->curr_node = rb_first(&info->root);
487 }
488
489 fname = rb_entry(info->curr_node, struct fname, rb_hash);
490 info->curr_hash = fname->hash;
491 info->curr_minor_hash = fname->minor_hash;
492 if (call_filldir(filp, dirent, filldir, fname))
493 break;
494
495 info->curr_node = rb_next(info->curr_node);
496 if (!info->curr_node) {
497 if (info->next_hash == ~0) {
498 filp->f_pos = EXT4_HTREE_EOF;
499 break;
500 }
501 info->curr_hash = info->next_hash;
502 info->curr_minor_hash = 0;
503 }
504 }
505finished:
506 info->last_pos = filp->f_pos;
507 return 0;
508}
509
510static int ext4_release_dir (struct inode * inode, struct file * filp)
511{
512 if (filp->private_data)
513 ext4_htree_free_dir_info(filp->private_data);
514
515 return 0;
516}
517
518#endif
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
new file mode 100644
index 000000000000..2608dce18f3e
--- /dev/null
+++ b/fs/ext4/extents.c
@@ -0,0 +1,2152 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * Architecture independence:
6 * Copyright (c) 2005, Bull S.A.
7 * Written by Pierre Peiffer <pierre.peiffer@bull.net>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23/*
24 * Extents support for EXT4
25 *
26 * TODO:
27 * - ext4*_error() should be used in some situations
28 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29 * - smart tree reduction
30 */
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/time.h>
35#include <linux/ext4_jbd2.h>
36#include <linux/jbd.h>
37#include <linux/smp_lock.h>
38#include <linux/highuid.h>
39#include <linux/pagemap.h>
40#include <linux/quotaops.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/ext4_fs_extents.h>
44#include <asm/uaccess.h>
45
46
47/*
48 * ext_pblock:
49 * combine low and high parts of physical block number into ext4_fsblk_t
50 */
51static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
52{
53 ext4_fsblk_t block;
54
55 block = le32_to_cpu(ex->ee_start);
56 block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
57 return block;
58}
59
60/*
61 * idx_pblock:
62 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
63 */
64static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
65{
66 ext4_fsblk_t block;
67
68 block = le32_to_cpu(ix->ei_leaf);
69 block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
70 return block;
71}
72
73/*
74 * ext4_ext_store_pblock:
75 * stores a large physical block number into an extent struct,
76 * breaking it into parts
77 */
78static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
79{
80 ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff));
81 ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
82}
83
84/*
85 * ext4_idx_store_pblock:
86 * stores a large physical block number into an index struct,
87 * breaking it into parts
88 */
89static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
90{
91 ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff));
92 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
93}
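
A hedged round-trip sketch of the helpers above (example_pblock_roundtrip() is a
hypothetical name and the block number is invented), showing how a 48-bit physical
block number is split across ee_start and ee_start_hi and then recombined:

static void example_pblock_roundtrip(void)
{
	struct ext4_extent ex;
	ext4_fsblk_t pb = 0x000123456789ULL;	/* needs more than 32 bits */

	ext4_ext_store_pblock(&ex, pb);	/* ee_start = 0x23456789, ee_start_hi = 0x0001 */
	BUG_ON(ext_pblock(&ex) != pb);	/* the low and high halves recombine losslessly */
}
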
94
95static int ext4_ext_check_header(const char *function, struct inode *inode,
96 struct ext4_extent_header *eh)
97{
98 const char *error_msg = NULL;
99
100 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
101 error_msg = "invalid magic";
102 goto corrupted;
103 }
104 if (unlikely(eh->eh_max == 0)) {
105 error_msg = "invalid eh_max";
106 goto corrupted;
107 }
108 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
109 error_msg = "invalid eh_entries";
110 goto corrupted;
111 }
112 return 0;
113
114corrupted:
115 ext4_error(inode->i_sb, function,
116 "bad header in inode #%lu: %s - magic %x, "
117 "entries %u, max %u, depth %u",
118 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
119 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
120 le16_to_cpu(eh->eh_depth));
121
122 return -EIO;
123}
124
125static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
126{
127 int err;
128
129 if (handle->h_buffer_credits > needed)
130 return handle;
131 if (!ext4_journal_extend(handle, needed))
132 return handle;
133 err = ext4_journal_restart(handle, needed);
134
135 return handle;
136}
137
138/*
139 * could return:
140 * - EROFS
141 * - ENOMEM
142 */
143static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
144 struct ext4_ext_path *path)
145{
146 if (path->p_bh) {
147 /* path points to block */
148 return ext4_journal_get_write_access(handle, path->p_bh);
149 }
150 /* path points to leaf/index in inode body */
151 /* we use in-core data, no need to protect them */
152 return 0;
153}
154
155/*
156 * could return:
157 * - EROFS
158 * - ENOMEM
159 * - EIO
160 */
161static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
162 struct ext4_ext_path *path)
163{
164 int err;
165 if (path->p_bh) {
166 /* path points to block */
167 err = ext4_journal_dirty_metadata(handle, path->p_bh);
168 } else {
169 /* path points to leaf/index in inode body */
170 err = ext4_mark_inode_dirty(handle, inode);
171 }
172 return err;
173}
174
175static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
176 struct ext4_ext_path *path,
177 ext4_fsblk_t block)
178{
179 struct ext4_inode_info *ei = EXT4_I(inode);
180 ext4_fsblk_t bg_start;
181 ext4_grpblk_t colour;
182 int depth;
183
184 if (path) {
185 struct ext4_extent *ex;
186 depth = path->p_depth;
187
188 /* try to predict block placement */
189 if ((ex = path[depth].p_ext))
190 return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
191
192 /* it looks like index is empty;
193 * try to find starting block from index itself */
194 if (path[depth].p_bh)
195 return path[depth].p_bh->b_blocknr;
196 }
197
198 /* OK. use inode's group */
199 bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
200 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
201 colour = (current->pid % 16) *
202 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
203 return bg_start + colour + block;
204}
205
206static ext4_fsblk_t
207ext4_ext_new_block(handle_t *handle, struct inode *inode,
208 struct ext4_ext_path *path,
209 struct ext4_extent *ex, int *err)
210{
211 ext4_fsblk_t goal, newblock;
212
213 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
214 newblock = ext4_new_block(handle, inode, goal, err);
215 return newblock;
216}
217
218static inline int ext4_ext_space_block(struct inode *inode)
219{
220 int size;
221
222 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
223 / sizeof(struct ext4_extent);
224#ifdef AGRESSIVE_TEST
225 if (size > 6)
226 size = 6;
227#endif
228 return size;
229}
230
231static inline int ext4_ext_space_block_idx(struct inode *inode)
232{
233 int size;
234
235 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
236 / sizeof(struct ext4_extent_idx);
237#ifdef AGRESSIVE_TEST
238 if (size > 5)
239 size = 5;
240#endif
241 return size;
242}
243
244static inline int ext4_ext_space_root(struct inode *inode)
245{
246 int size;
247
248 size = sizeof(EXT4_I(inode)->i_data);
249 size -= sizeof(struct ext4_extent_header);
250 size /= sizeof(struct ext4_extent);
251#ifdef AGRESSIVE_TEST
252 if (size > 3)
253 size = 3;
254#endif
255 return size;
256}
257
258static inline int ext4_ext_space_root_idx(struct inode *inode)
259{
260 int size;
261
262 size = sizeof(EXT4_I(inode)->i_data);
263 size -= sizeof(struct ext4_extent_header);
264 size /= sizeof(struct ext4_extent_idx);
265#ifdef AGRESSIVE_TEST
266 if (size > 4)
267 size = 4;
268#endif
269 return size;
270}
271
272#ifdef EXT_DEBUG
273static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
274{
275 int k, l = path->p_depth;
276
277 ext_debug("path:");
278 for (k = 0; k <= l; k++, path++) {
279 if (path->p_idx) {
280 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
281 idx_pblock(path->p_idx));
282 } else if (path->p_ext) {
283 ext_debug(" %d:%d:%llu ",
284 le32_to_cpu(path->p_ext->ee_block),
285 le16_to_cpu(path->p_ext->ee_len),
286 ext_pblock(path->p_ext));
287 } else
288 ext_debug(" []");
289 }
290 ext_debug("\n");
291}
292
293static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
294{
295 int depth = ext_depth(inode);
296 struct ext4_extent_header *eh;
297 struct ext4_extent *ex;
298 int i;
299
300 if (!path)
301 return;
302
303 eh = path[depth].p_hdr;
304 ex = EXT_FIRST_EXTENT(eh);
305
306 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
307 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
308 le16_to_cpu(ex->ee_len), ext_pblock(ex));
309 }
310 ext_debug("\n");
311}
312#else
313#define ext4_ext_show_path(inode,path)
314#define ext4_ext_show_leaf(inode,path)
315#endif
316
317static void ext4_ext_drop_refs(struct ext4_ext_path *path)
318{
319 int depth = path->p_depth;
320 int i;
321
322 for (i = 0; i <= depth; i++, path++)
323 if (path->p_bh) {
324 brelse(path->p_bh);
325 path->p_bh = NULL;
326 }
327}
328
329/*
330 * ext4_ext_binsearch_idx:
331 * binary search for the closest index of the given block
332 */
333static void
334ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
335{
336 struct ext4_extent_header *eh = path->p_hdr;
337 struct ext4_extent_idx *r, *l, *m;
338
339 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
340 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
341 BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
342
343 ext_debug("binsearch for %d(idx): ", block);
344
345 l = EXT_FIRST_INDEX(eh) + 1;
346 r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1;
347 while (l <= r) {
348 m = l + (r - l) / 2;
349 if (block < le32_to_cpu(m->ei_block))
350 r = m - 1;
351 else
352 l = m + 1;
353 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block,
354 m, m->ei_block, r, r->ei_block);
355 }
356
357 path->p_idx = l - 1;
358 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
359		  idx_pblock(path->p_idx));
360
361#ifdef CHECK_BINSEARCH
362 {
363 struct ext4_extent_idx *chix, *ix;
364 int k;
365
366 chix = ix = EXT_FIRST_INDEX(eh);
367 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
368 if (k != 0 &&
369 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
370 printk("k=%d, ix=0x%p, first=0x%p\n", k,
371 ix, EXT_FIRST_INDEX(eh));
372 printk("%u <= %u\n",
373 le32_to_cpu(ix->ei_block),
374 le32_to_cpu(ix[-1].ei_block));
375 }
376 BUG_ON(k && le32_to_cpu(ix->ei_block)
377 <= le32_to_cpu(ix[-1].ei_block));
378 if (block < le32_to_cpu(ix->ei_block))
379 break;
380 chix = ix;
381 }
382 BUG_ON(chix != path->p_idx);
383 }
384#endif
385
386}
387
388/*
389 * ext4_ext_binsearch:
390 * binary search for closest extent of the given block
391 */
392static void
393ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
394{
395 struct ext4_extent_header *eh = path->p_hdr;
396 struct ext4_extent *r, *l, *m;
397
398 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
399 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
400
401 if (eh->eh_entries == 0) {
402 /*
403 * this leaf is empty:
404 * we get such a leaf in split/add case
405 */
406 return;
407 }
408
409 ext_debug("binsearch for %d: ", block);
410
411 l = EXT_FIRST_EXTENT(eh) + 1;
412 r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1;
413
414 while (l <= r) {
415 m = l + (r - l) / 2;
416 if (block < le32_to_cpu(m->ee_block))
417 r = m - 1;
418 else
419 l = m + 1;
420 ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block,
421 m, m->ee_block, r, r->ee_block);
422 }
423
424 path->p_ext = l - 1;
425 ext_debug(" -> %d:%llu:%d ",
426 le32_to_cpu(path->p_ext->ee_block),
427 ext_pblock(path->p_ext),
428 le16_to_cpu(path->p_ext->ee_len));
429
430#ifdef CHECK_BINSEARCH
431 {
432 struct ext4_extent *chex, *ex;
433 int k;
434
435 chex = ex = EXT_FIRST_EXTENT(eh);
436 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
437 BUG_ON(k && le32_to_cpu(ex->ee_block)
438 <= le32_to_cpu(ex[-1].ee_block));
439 if (block < le32_to_cpu(ex->ee_block))
440 break;
441 chex = ex;
442 }
443 BUG_ON(chex != path->p_ext);
444 }
445#endif
446
447}
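
A short worked example of the search convention used above (a sketch, not from
this file):

	/*
	 * With extents starting at logical blocks {0, 10, 20, 30}, a search for
	 * block 25 ends with l pointing at the extent starting at 30, so
	 * path->p_ext = l - 1 is the extent starting at 20 -- the rightmost
	 * extent with ee_block <= 25.  Because l starts at
	 * EXT_FIRST_EXTENT(eh) + 1, the result never falls before the first
	 * extent.
	 */
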
448
449int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
450{
451 struct ext4_extent_header *eh;
452
453 eh = ext_inode_hdr(inode);
454 eh->eh_depth = 0;
455 eh->eh_entries = 0;
456 eh->eh_magic = EXT4_EXT_MAGIC;
457 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
458 ext4_mark_inode_dirty(handle, inode);
459 ext4_ext_invalidate_cache(inode);
460 return 0;
461}
462
463struct ext4_ext_path *
464ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
465{
466 struct ext4_extent_header *eh;
467 struct buffer_head *bh;
468 short int depth, i, ppos = 0, alloc = 0;
469
470 eh = ext_inode_hdr(inode);
471 BUG_ON(eh == NULL);
472 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
473 return ERR_PTR(-EIO);
474
475 i = depth = ext_depth(inode);
476
477 /* account possible depth increase */
478 if (!path) {
479 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 2),
480 GFP_NOFS);
481 if (!path)
482 return ERR_PTR(-ENOMEM);
483 alloc = 1;
484 }
485 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
486 path[0].p_hdr = eh;
487
488 /* walk through the tree */
489 while (i) {
490 ext_debug("depth %d: num %d, max %d\n",
491 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
492 ext4_ext_binsearch_idx(inode, path + ppos, block);
493 path[ppos].p_block = idx_pblock(path[ppos].p_idx);
494 path[ppos].p_depth = i;
495 path[ppos].p_ext = NULL;
496
497 bh = sb_bread(inode->i_sb, path[ppos].p_block);
498 if (!bh)
499 goto err;
500
501 eh = ext_block_hdr(bh);
502 ppos++;
503 BUG_ON(ppos > depth);
504 path[ppos].p_bh = bh;
505 path[ppos].p_hdr = eh;
506 i--;
507
508 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
509 goto err;
510 }
511
512 path[ppos].p_depth = i;
513 path[ppos].p_hdr = eh;
514 path[ppos].p_ext = NULL;
515 path[ppos].p_idx = NULL;
516
517 if (ext4_ext_check_header(__FUNCTION__, inode, eh))
518 goto err;
519
520 /* find extent */
521 ext4_ext_binsearch(inode, path + ppos, block);
522
523 ext4_ext_show_path(inode, path);
524
525 return path;
526
527err:
528 ext4_ext_drop_refs(path);
529 if (alloc)
530 kfree(path);
531 return ERR_PTR(-EIO);
532}
533
534/*
535 * ext4_ext_insert_index:
536 * insert new index [@logical;@ptr] into the block at @curp;
537 * check where to insert: before @curp or after @curp
538 */
539static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
540 struct ext4_ext_path *curp,
541 int logical, ext4_fsblk_t ptr)
542{
543 struct ext4_extent_idx *ix;
544 int len, err;
545
546 if ((err = ext4_ext_get_access(handle, inode, curp)))
547 return err;
548
549 BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
550 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
551 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
552 /* insert after */
553 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
554 len = (len - 1) * sizeof(struct ext4_extent_idx);
555 len = len < 0 ? 0 : len;
556 ext_debug("insert new index %d after: %d. "
557 "move %d from 0x%p to 0x%p\n",
558 logical, ptr, len,
559 (curp->p_idx + 1), (curp->p_idx + 2));
560 memmove(curp->p_idx + 2, curp->p_idx + 1, len);
561 }
562 ix = curp->p_idx + 1;
563 } else {
564 /* insert before */
565 len = len * sizeof(struct ext4_extent_idx);
566 len = len < 0 ? 0 : len;
567 ext_debug("insert new index %d before: %d. "
568 "move %d from 0x%p to 0x%p\n",
569 logical, ptr, len,
570 curp->p_idx, (curp->p_idx + 1));
571 memmove(curp->p_idx + 1, curp->p_idx, len);
572 ix = curp->p_idx;
573 }
574
575 ix->ei_block = cpu_to_le32(logical);
576 ext4_idx_store_pblock(ix, ptr);
577 curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
578
579 BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
580 > le16_to_cpu(curp->p_hdr->eh_max));
581 BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
582
583 err = ext4_ext_dirty(handle, inode, curp);
584 ext4_std_error(inode->i_sb, err);
585
586 return err;
587}
588
589/*
590 * ext4_ext_split:
591 * inserts new subtree into the path, using free index entry
592 * at depth @at:
593 * - allocates all needed blocks (new leaf and all intermediate index blocks)
594 * - makes decision where to split
595 * - moves remaining extents and index entries (right to the split point)
596 * into the newly allocated blocks
597 * - initializes subtree
598 */
599static int ext4_ext_split(handle_t *handle, struct inode *inode,
600 struct ext4_ext_path *path,
601 struct ext4_extent *newext, int at)
602{
603 struct buffer_head *bh = NULL;
604 int depth = ext_depth(inode);
605 struct ext4_extent_header *neh;
606 struct ext4_extent_idx *fidx;
607 struct ext4_extent *ex;
608 int i = at, k, m, a;
609 ext4_fsblk_t newblock, oldblock;
610 __le32 border;
611 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
612 int err = 0;
613
614 /* make decision: where to split? */
615 /* FIXME: now decision is simplest: at current extent */
616
617 /* if current leaf will be split, then we should use
618 * border from split point */
619 BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
620 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
621 border = path[depth].p_ext[1].ee_block;
622 ext_debug("leaf will be split."
623 " next leaf starts at %d\n",
624 le32_to_cpu(border));
625 } else {
626 border = newext->ee_block;
627 ext_debug("leaf will be added."
628 " next leaf starts at %d\n",
629 le32_to_cpu(border));
630 }
631
632 /*
633	 * If an error occurs, we break processing
634	 * and mark the filesystem read-only. The index won't
635	 * be inserted and the tree will remain in a consistent
636	 * state. The next mount will repair the buffers too.
637 */
638
639 /*
640 * Get array to track all allocated blocks.
641	 * We need this to handle errors and to free the
642	 * allocated blocks on failure.
643 */
644 ablocks = kmalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
645 if (!ablocks)
646 return -ENOMEM;
647 memset(ablocks, 0, sizeof(ext4_fsblk_t) * depth);
648
649 /* allocate all needed blocks */
650 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
651 for (a = 0; a < depth - at; a++) {
652 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
653 if (newblock == 0)
654 goto cleanup;
655 ablocks[a] = newblock;
656 }
657
658 /* initialize new leaf */
659 newblock = ablocks[--a];
660 BUG_ON(newblock == 0);
661 bh = sb_getblk(inode->i_sb, newblock);
662 if (!bh) {
663 err = -EIO;
664 goto cleanup;
665 }
666 lock_buffer(bh);
667
668 if ((err = ext4_journal_get_create_access(handle, bh)))
669 goto cleanup;
670
671 neh = ext_block_hdr(bh);
672 neh->eh_entries = 0;
673 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
674 neh->eh_magic = EXT4_EXT_MAGIC;
675 neh->eh_depth = 0;
676 ex = EXT_FIRST_EXTENT(neh);
677
678 /* move remainder of path[depth] to the new leaf */
679 BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
680 /* start copy from next extent */
681 /* TODO: we could do it by single memmove */
682 m = 0;
683 path[depth].p_ext++;
684 while (path[depth].p_ext <=
685 EXT_MAX_EXTENT(path[depth].p_hdr)) {
686 ext_debug("move %d:%llu:%d in new leaf %llu\n",
687 le32_to_cpu(path[depth].p_ext->ee_block),
688 ext_pblock(path[depth].p_ext),
689 le16_to_cpu(path[depth].p_ext->ee_len),
690 newblock);
691 /*memmove(ex++, path[depth].p_ext++,
692 sizeof(struct ext4_extent));
693 neh->eh_entries++;*/
694 path[depth].p_ext++;
695 m++;
696 }
697 if (m) {
698 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
699 neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
700 }
701
702 set_buffer_uptodate(bh);
703 unlock_buffer(bh);
704
705 if ((err = ext4_journal_dirty_metadata(handle, bh)))
706 goto cleanup;
707 brelse(bh);
708 bh = NULL;
709
710 /* correct old leaf */
711 if (m) {
712 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
713 goto cleanup;
714 path[depth].p_hdr->eh_entries =
715 cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
716 if ((err = ext4_ext_dirty(handle, inode, path + depth)))
717 goto cleanup;
718
719 }
720
721 /* create intermediate indexes */
722 k = depth - at - 1;
723 BUG_ON(k < 0);
724 if (k)
725 ext_debug("create %d intermediate indices\n", k);
726 /* insert new index into current index block */
727 /* current depth stored in i var */
728 i = depth - 1;
729 while (k--) {
730 oldblock = newblock;
731 newblock = ablocks[--a];
732 bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
733 if (!bh) {
734 err = -EIO;
735 goto cleanup;
736 }
737 lock_buffer(bh);
738
739 if ((err = ext4_journal_get_create_access(handle, bh)))
740 goto cleanup;
741
742 neh = ext_block_hdr(bh);
743 neh->eh_entries = cpu_to_le16(1);
744 neh->eh_magic = EXT4_EXT_MAGIC;
745 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
746 neh->eh_depth = cpu_to_le16(depth - i);
747 fidx = EXT_FIRST_INDEX(neh);
748 fidx->ei_block = border;
749 ext4_idx_store_pblock(fidx, oldblock);
750
751 ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
752 newblock, (unsigned long) le32_to_cpu(border),
753 oldblock);
754 /* copy indexes */
755 m = 0;
756 path[i].p_idx++;
757
758 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
759 EXT_MAX_INDEX(path[i].p_hdr));
760 BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
761 EXT_LAST_INDEX(path[i].p_hdr));
762 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
763 ext_debug("%d: move %d:%d in new index %llu\n", i,
764 le32_to_cpu(path[i].p_idx->ei_block),
765 idx_pblock(path[i].p_idx),
766 newblock);
767 /*memmove(++fidx, path[i].p_idx++,
768 sizeof(struct ext4_extent_idx));
769 neh->eh_entries++;
770 BUG_ON(neh->eh_entries > neh->eh_max);*/
771 path[i].p_idx++;
772 m++;
773 }
774 if (m) {
775 memmove(++fidx, path[i].p_idx - m,
776 sizeof(struct ext4_extent_idx) * m);
777 neh->eh_entries =
778 cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
779 }
780 set_buffer_uptodate(bh);
781 unlock_buffer(bh);
782
783 if ((err = ext4_journal_dirty_metadata(handle, bh)))
784 goto cleanup;
785 brelse(bh);
786 bh = NULL;
787
788 /* correct old index */
789 if (m) {
790 err = ext4_ext_get_access(handle, inode, path + i);
791 if (err)
792 goto cleanup;
793 path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
794 err = ext4_ext_dirty(handle, inode, path + i);
795 if (err)
796 goto cleanup;
797 }
798
799 i--;
800 }
801
802 /* insert new index */
803 if (err)
804 goto cleanup;
805
806 err = ext4_ext_insert_index(handle, inode, path + at,
807 le32_to_cpu(border), newblock);
808
809cleanup:
810 if (bh) {
811 if (buffer_locked(bh))
812 unlock_buffer(bh);
813 brelse(bh);
814 }
815
816 if (err) {
817 /* free all allocated blocks in error case */
818 for (i = 0; i < depth; i++) {
819 if (!ablocks[i])
820 continue;
821 ext4_free_blocks(handle, inode, ablocks[i], 1);
822 }
823 }
824 kfree(ablocks);
825
826 return err;
827}
828
829/*
830 * ext4_ext_grow_indepth:
831 * implements tree growing procedure:
832 * - allocates new block
833 * - moves top-level data (index block or leaf) into the new block
834 * - initializes new top-level, creating index that points to the
835 * just created block
836 */
837static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
838 struct ext4_ext_path *path,
839 struct ext4_extent *newext)
840{
841 struct ext4_ext_path *curp = path;
842 struct ext4_extent_header *neh;
843 struct ext4_extent_idx *fidx;
844 struct buffer_head *bh;
845 ext4_fsblk_t newblock;
846 int err = 0;
847
848 newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
849 if (newblock == 0)
850 return err;
851
852 bh = sb_getblk(inode->i_sb, newblock);
853 if (!bh) {
854 err = -EIO;
855 ext4_std_error(inode->i_sb, err);
856 return err;
857 }
858 lock_buffer(bh);
859
860 if ((err = ext4_journal_get_create_access(handle, bh))) {
861 unlock_buffer(bh);
862 goto out;
863 }
864
865 /* move top-level index/leaf into new block */
866 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
867
868 /* set size of new block */
869 neh = ext_block_hdr(bh);
870 /* old root could have indexes or leaves
871 * so calculate e_max right way */
872 if (ext_depth(inode))
873 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
874 else
875 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
876 neh->eh_magic = EXT4_EXT_MAGIC;
877 set_buffer_uptodate(bh);
878 unlock_buffer(bh);
879
880 if ((err = ext4_journal_dirty_metadata(handle, bh)))
881 goto out;
882
883 /* create index in new top-level index: num,max,pointer */
884 if ((err = ext4_ext_get_access(handle, inode, curp)))
885 goto out;
886
887 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
888 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
889 curp->p_hdr->eh_entries = cpu_to_le16(1);
890 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
891 /* FIXME: it works, but actually path[0] can be index */
892 curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
893 ext4_idx_store_pblock(curp->p_idx, newblock);
894
895 neh = ext_inode_hdr(inode);
896 fidx = EXT_FIRST_INDEX(neh);
897 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
898 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
899 le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
900
901 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
902 err = ext4_ext_dirty(handle, inode, curp);
903out:
904 brelse(bh);
905
906 return err;
907}
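/*
 * Illustration (editor's note, not part of the original patch): growing
 * in depth keeps the root inside the inode's i_data and demotes its old
 * contents.  The whole root -- header plus entries -- is copied into the
 * freshly allocated block, the root is rewritten as a single index entry
 * pointing at that block, and eh_depth is bumped by one.  Only eh_max of
 * the copy differs, depending on whether the old root held extents or
 * index entries (ext4_ext_space_block() vs ext4_ext_space_block_idx()).
 */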
908
909/*
910 * ext4_ext_create_new_leaf:
911 * finds empty index and adds new leaf.
912 * if no free index is found, then it requests in-depth growing.
913 */
914static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
915 struct ext4_ext_path *path,
916 struct ext4_extent *newext)
917{
918 struct ext4_ext_path *curp;
919 int depth, i, err = 0;
920
921repeat:
922 i = depth = ext_depth(inode);
923
924 /* walk up to the tree and look for free index entry */
925 curp = path + depth;
926 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
927 i--;
928 curp--;
929 }
930
931 /* we use already allocated block for index block,
932 * so subsequent data blocks should be contiguous */
933 if (EXT_HAS_FREE_INDEX(curp)) {
934 /* if we found index with free entry, then use that
935 * entry: create all needed subtree and add new leaf */
936 err = ext4_ext_split(handle, inode, path, newext, i);
937
938 /* refill path */
939 ext4_ext_drop_refs(path);
940 path = ext4_ext_find_extent(inode,
941 le32_to_cpu(newext->ee_block),
942 path);
943 if (IS_ERR(path))
944 err = PTR_ERR(path);
945 } else {
946 /* tree is full, time to grow in depth */
947 err = ext4_ext_grow_indepth(handle, inode, path, newext);
948 if (err)
949 goto out;
950
951 /* refill path */
952 ext4_ext_drop_refs(path);
953 path = ext4_ext_find_extent(inode,
954 le32_to_cpu(newext->ee_block),
955 path);
956 if (IS_ERR(path)) {
957 err = PTR_ERR(path);
958 goto out;
959 }
960
961 /*
962 * only first (depth 0 -> 1) produces free space;
963 * in all other cases we have to split the grown tree
964 */
965 depth = ext_depth(inode);
966 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
967 /* now we need to split */
968 goto repeat;
969 }
970 }
971
972out:
973 return err;
974}
975
976/*
977 * ext4_ext_next_allocated_block:
978 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
979 * NOTE: it considers block number from index entry as
980 * allocated block. Thus, index entries have to be consistent
981 * with leaves.
982 */
983static unsigned long
984ext4_ext_next_allocated_block(struct ext4_ext_path *path)
985{
986 int depth;
987
988 BUG_ON(path == NULL);
989 depth = path->p_depth;
990
991 if (depth == 0 && path->p_ext == NULL)
992 return EXT_MAX_BLOCK;
993
994 while (depth >= 0) {
995 if (depth == path->p_depth) {
996 /* leaf */
997 if (path[depth].p_ext !=
998 EXT_LAST_EXTENT(path[depth].p_hdr))
999 return le32_to_cpu(path[depth].p_ext[1].ee_block);
1000 } else {
1001 /* index */
1002 if (path[depth].p_idx !=
1003 EXT_LAST_INDEX(path[depth].p_hdr))
1004 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1005 }
1006 depth--;
1007 }
1008
1009 return EXT_MAX_BLOCK;
1010}
1011
1012/*
1013 * ext4_ext_next_leaf_block:
1014 * returns first allocated block from next leaf or EXT_MAX_BLOCK
1015 */
1016static unsigned ext4_ext_next_leaf_block(struct inode *inode,
1017 struct ext4_ext_path *path)
1018{
1019 int depth;
1020
1021 BUG_ON(path == NULL);
1022 depth = path->p_depth;
1023
1024	/* a zero-depth tree has no leaf blocks at all */
1025 if (depth == 0)
1026 return EXT_MAX_BLOCK;
1027
1028 /* go to index block */
1029 depth--;
1030
1031 while (depth >= 0) {
1032 if (path[depth].p_idx !=
1033 EXT_LAST_INDEX(path[depth].p_hdr))
1034 return le32_to_cpu(path[depth].p_idx[1].ei_block);
1035 depth--;
1036 }
1037
1038 return EXT_MAX_BLOCK;
1039}
1040
1041/*
1042 * ext4_ext_correct_indexes:
1043 * if leaf gets modified and modified extent is first in the leaf,
1044 * then we have to correct all indexes above.
1045 * TODO: do we need to correct tree in all cases?
1046 */
1047int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1048 struct ext4_ext_path *path)
1049{
1050 struct ext4_extent_header *eh;
1051 int depth = ext_depth(inode);
1052 struct ext4_extent *ex;
1053 __le32 border;
1054 int k, err = 0;
1055
1056 eh = path[depth].p_hdr;
1057 ex = path[depth].p_ext;
1058 BUG_ON(ex == NULL);
1059 BUG_ON(eh == NULL);
1060
1061 if (depth == 0) {
1062 /* there is no tree at all */
1063 return 0;
1064 }
1065
1066 if (ex != EXT_FIRST_EXTENT(eh)) {
1067 /* we correct tree if first leaf got modified only */
1068 return 0;
1069 }
1070
1071 /*
1072 * TODO: we need correction if border is smaller than current one
1073 */
1074 k = depth - 1;
1075 border = path[depth].p_ext->ee_block;
1076 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1077 return err;
1078 path[k].p_idx->ei_block = border;
1079 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1080 return err;
1081
1082 while (k--) {
1083 /* change all left-side indexes */
1084 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1085 break;
1086 if ((err = ext4_ext_get_access(handle, inode, path + k)))
1087 break;
1088 path[k].p_idx->ei_block = border;
1089 if ((err = ext4_ext_dirty(handle, inode, path + k)))
1090 break;
1091 }
1092
1093 return err;
1094}
1095
1096static int inline
1097ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1098 struct ext4_extent *ex2)
1099{
1100 if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
1101 le32_to_cpu(ex2->ee_block))
1102 return 0;
1103
1104 /*
1105 * To allow future support for preallocated extents to be added
1106 * as an RO_COMPAT feature, refuse to merge two extents if
1107 * this can result in the top bit of ee_len being set.
1108 */
1109 if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
1110 return 0;
1111#ifdef AGRESSIVE_TEST
1112 if (le16_to_cpu(ex1->ee_len) >= 4)
1113 return 0;
1114#endif
1115
1116 if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
1117 return 1;
1118 return 0;
1119}
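/*
 * Worked example (editor's note, hypothetical numbers): an extent that
 * maps logical blocks 100..107 (ee_len 8) to physical blocks 5000..5007
 * can be merged with one that starts at logical 108 and physical 5008,
 * because both the logical and the physical ranges are contiguous and
 * the combined length stays within EXT_MAX_LEN.  A gap on either side,
 * or a combined length that would set the top bit of ee_len, makes the
 * function return 0.
 */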
1120
1121/*
1122 * ext4_ext_insert_extent:
1123 * tries to merge the requested extent into the existing extent or
1124 * inserts requested extent as new one into the tree,
1125 * creating new leaf in the no-space case.
1126 */
1127int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1128 struct ext4_ext_path *path,
1129 struct ext4_extent *newext)
1130{
1131 struct ext4_extent_header * eh;
1132 struct ext4_extent *ex, *fex;
1133 struct ext4_extent *nearex; /* nearest extent */
1134 struct ext4_ext_path *npath = NULL;
1135 int depth, len, err, next;
1136
1137 BUG_ON(newext->ee_len == 0);
1138 depth = ext_depth(inode);
1139 ex = path[depth].p_ext;
1140 BUG_ON(path[depth].p_hdr == NULL);
1141
1142 /* try to insert block into found extent and return */
1143 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1144 ext_debug("append %d block to %d:%d (from %llu)\n",
1145 le16_to_cpu(newext->ee_len),
1146 le32_to_cpu(ex->ee_block),
1147 le16_to_cpu(ex->ee_len), ext_pblock(ex));
1148 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1149 return err;
1150 ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
1151 + le16_to_cpu(newext->ee_len));
1152 eh = path[depth].p_hdr;
1153 nearex = ex;
1154 goto merge;
1155 }
1156
1157repeat:
1158 depth = ext_depth(inode);
1159 eh = path[depth].p_hdr;
1160 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1161 goto has_space;
1162
1163 /* probably next leaf has space for us? */
1164 fex = EXT_LAST_EXTENT(eh);
1165 next = ext4_ext_next_leaf_block(inode, path);
1166 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1167 && next != EXT_MAX_BLOCK) {
1168 ext_debug("next leaf block - %d\n", next);
1169 BUG_ON(npath != NULL);
1170 npath = ext4_ext_find_extent(inode, next, NULL);
1171 if (IS_ERR(npath))
1172 return PTR_ERR(npath);
1173 BUG_ON(npath->p_depth != path->p_depth);
1174 eh = npath[depth].p_hdr;
1175 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1176			ext_debug("next leaf isn't full(%d)\n",
1177 le16_to_cpu(eh->eh_entries));
1178 path = npath;
1179 goto repeat;
1180 }
1181 ext_debug("next leaf has no free space(%d,%d)\n",
1182 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1183 }
1184
1185 /*
1186 * There is no free space in the found leaf.
1187 * We're gonna add a new leaf in the tree.
1188 */
1189 err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1190 if (err)
1191 goto cleanup;
1192 depth = ext_depth(inode);
1193 eh = path[depth].p_hdr;
1194
1195has_space:
1196 nearex = path[depth].p_ext;
1197
1198 if ((err = ext4_ext_get_access(handle, inode, path + depth)))
1199 goto cleanup;
1200
1201 if (!nearex) {
1202 /* there is no extent in this leaf, create first one */
1203 ext_debug("first extent in the leaf: %d:%llu:%d\n",
1204 le32_to_cpu(newext->ee_block),
1205 ext_pblock(newext),
1206 le16_to_cpu(newext->ee_len));
1207 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1208 } else if (le32_to_cpu(newext->ee_block)
1209 > le32_to_cpu(nearex->ee_block)) {
1210/* BUG_ON(newext->ee_block == nearex->ee_block); */
1211 if (nearex != EXT_LAST_EXTENT(eh)) {
1212 len = EXT_MAX_EXTENT(eh) - nearex;
1213 len = (len - 1) * sizeof(struct ext4_extent);
1214 len = len < 0 ? 0 : len;
1215 ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
1216 "move %d from 0x%p to 0x%p\n",
1217 le32_to_cpu(newext->ee_block),
1218 ext_pblock(newext),
1219 le16_to_cpu(newext->ee_len),
1220 nearex, len, nearex + 1, nearex + 2);
1221 memmove(nearex + 2, nearex + 1, len);
1222 }
1223 path[depth].p_ext = nearex + 1;
1224 } else {
1225 BUG_ON(newext->ee_block == nearex->ee_block);
1226 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1227 len = len < 0 ? 0 : len;
1228 ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
1229 "move %d from 0x%p to 0x%p\n",
1230 le32_to_cpu(newext->ee_block),
1231 ext_pblock(newext),
1232 le16_to_cpu(newext->ee_len),
1233 nearex, len, nearex + 1, nearex + 2);
1234 memmove(nearex + 1, nearex, len);
1235 path[depth].p_ext = nearex;
1236 }
1237
1238 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
1239 nearex = path[depth].p_ext;
1240 nearex->ee_block = newext->ee_block;
1241 nearex->ee_start = newext->ee_start;
1242 nearex->ee_start_hi = newext->ee_start_hi;
1243 nearex->ee_len = newext->ee_len;
1244
1245merge:
1246 /* try to merge extents to the right */
1247 while (nearex < EXT_LAST_EXTENT(eh)) {
1248 if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
1249 break;
1250 /* merge with next extent! */
1251 nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
1252 + le16_to_cpu(nearex[1].ee_len));
1253 if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
1254 len = (EXT_LAST_EXTENT(eh) - nearex - 1)
1255 * sizeof(struct ext4_extent);
1256 memmove(nearex + 1, nearex + 2, len);
1257 }
1258 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1259 BUG_ON(eh->eh_entries == 0);
1260 }
1261
1262 /* try to merge extents to the left */
1263
1264 /* time to correct all indexes above */
1265 err = ext4_ext_correct_indexes(handle, inode, path);
1266 if (err)
1267 goto cleanup;
1268
1269 err = ext4_ext_dirty(handle, inode, path + depth);
1270
1271cleanup:
1272 if (npath) {
1273 ext4_ext_drop_refs(npath);
1274 kfree(npath);
1275 }
1276 ext4_ext_tree_changed(inode);
1277 ext4_ext_invalidate_cache(inode);
1278 return err;
1279}
1280
1281int ext4_ext_walk_space(struct inode *inode, unsigned long block,
1282 unsigned long num, ext_prepare_callback func,
1283 void *cbdata)
1284{
1285 struct ext4_ext_path *path = NULL;
1286 struct ext4_ext_cache cbex;
1287 struct ext4_extent *ex;
1288 unsigned long next, start = 0, end = 0;
1289 unsigned long last = block + num;
1290 int depth, exists, err = 0;
1291
1292 BUG_ON(func == NULL);
1293 BUG_ON(inode == NULL);
1294
1295 while (block < last && block != EXT_MAX_BLOCK) {
1296 num = last - block;
1297 /* find extent for this block */
1298 path = ext4_ext_find_extent(inode, block, path);
1299 if (IS_ERR(path)) {
1300 err = PTR_ERR(path);
1301 path = NULL;
1302 break;
1303 }
1304
1305 depth = ext_depth(inode);
1306 BUG_ON(path[depth].p_hdr == NULL);
1307 ex = path[depth].p_ext;
1308 next = ext4_ext_next_allocated_block(path);
1309
1310 exists = 0;
1311 if (!ex) {
1312 /* there is no extent yet, so try to allocate
1313 * all requested space */
1314 start = block;
1315 end = block + num;
1316 } else if (le32_to_cpu(ex->ee_block) > block) {
1317 /* need to allocate space before found extent */
1318 start = block;
1319 end = le32_to_cpu(ex->ee_block);
1320 if (block + num < end)
1321 end = block + num;
1322 } else if (block >=
1323 le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
1324 /* need to allocate space after found extent */
1325 start = block;
1326 end = block + num;
1327 if (end >= next)
1328 end = next;
1329 } else if (block >= le32_to_cpu(ex->ee_block)) {
1330 /*
1331 * some part of requested space is covered
1332 * by found extent
1333 */
1334 start = block;
1335 end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
1336 if (block + num < end)
1337 end = block + num;
1338 exists = 1;
1339 } else {
1340 BUG();
1341 }
1342 BUG_ON(end <= start);
1343
1344 if (!exists) {
1345 cbex.ec_block = start;
1346 cbex.ec_len = end - start;
1347 cbex.ec_start = 0;
1348 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1349 } else {
1350 cbex.ec_block = le32_to_cpu(ex->ee_block);
1351 cbex.ec_len = le16_to_cpu(ex->ee_len);
1352 cbex.ec_start = ext_pblock(ex);
1353 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1354 }
1355
1356 BUG_ON(cbex.ec_len == 0);
1357 err = func(inode, path, &cbex, cbdata);
1358 ext4_ext_drop_refs(path);
1359
1360 if (err < 0)
1361 break;
1362 if (err == EXT_REPEAT)
1363 continue;
1364 else if (err == EXT_BREAK) {
1365 err = 0;
1366 break;
1367 }
1368
1369 if (ext_depth(inode) != depth) {
1370 /* depth was changed. we have to realloc path */
1371 kfree(path);
1372 path = NULL;
1373 }
1374
1375 block = cbex.ec_block + cbex.ec_len;
1376 }
1377
1378 if (path) {
1379 ext4_ext_drop_refs(path);
1380 kfree(path);
1381 }
1382
1383 return err;
1384}
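/*
 * Usage sketch (editor's note, hypothetical callback): for every region
 * of [block, block+num) the callback is handed either a real extent
 * (EXT4_EXT_CACHE_EXTENT) or a gap (EXT4_EXT_CACHE_GAP) and steers the
 * walk through its return value -- a negative value stops the walk and
 * is returned, EXT_BREAK stops it successfully, EXT_REPEAT looks at the
 * same region again, anything else moves on:
 *
 *	static int my_cb(struct inode *inode, struct ext4_ext_path *path,
 *			 struct ext4_ext_cache *cbex, void *cbdata)
 *	{
 *		if (cbex->ec_type == EXT4_EXT_CACHE_EXTENT)
 *			count_mapped(cbdata, cbex->ec_len);
 *		return 0;
 *	}
 *
 *	err = ext4_ext_walk_space(inode, 0, num_blocks, my_cb, &stats);
 *
 * my_cb, count_mapped, num_blocks and stats are made-up names used only
 * for this illustration; the exact ext_prepare_callback typedef lives in
 * the extents header.
 */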
1385
1386static inline void
1387ext4_ext_put_in_cache(struct inode *inode, __u32 block,
1388 __u32 len, __u32 start, int type)
1389{
1390 struct ext4_ext_cache *cex;
1391 BUG_ON(len == 0);
1392 cex = &EXT4_I(inode)->i_cached_extent;
1393 cex->ec_type = type;
1394 cex->ec_block = block;
1395 cex->ec_len = len;
1396 cex->ec_start = start;
1397}
1398
1399/*
1400 * ext4_ext_put_gap_in_cache:
1401 * calculate boundaries of the gap that the requested block fits into
1402 * and cache this gap
1403 */
1404static inline void
1405ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1406 unsigned long block)
1407{
1408 int depth = ext_depth(inode);
1409 unsigned long lblock, len;
1410 struct ext4_extent *ex;
1411
1412 ex = path[depth].p_ext;
1413 if (ex == NULL) {
1414 /* there is no extent yet, so gap is [0;-] */
1415 lblock = 0;
1416 len = EXT_MAX_BLOCK;
1417 ext_debug("cache gap(whole file):");
1418 } else if (block < le32_to_cpu(ex->ee_block)) {
1419 lblock = block;
1420 len = le32_to_cpu(ex->ee_block) - block;
1421 ext_debug("cache gap(before): %lu [%lu:%lu]",
1422 (unsigned long) block,
1423 (unsigned long) le32_to_cpu(ex->ee_block),
1424 (unsigned long) le16_to_cpu(ex->ee_len));
1425 } else if (block >= le32_to_cpu(ex->ee_block)
1426 + le16_to_cpu(ex->ee_len)) {
1427 lblock = le32_to_cpu(ex->ee_block)
1428 + le16_to_cpu(ex->ee_len);
1429 len = ext4_ext_next_allocated_block(path);
1430 ext_debug("cache gap(after): [%lu:%lu] %lu",
1431 (unsigned long) le32_to_cpu(ex->ee_block),
1432 (unsigned long) le16_to_cpu(ex->ee_len),
1433 (unsigned long) block);
1434 BUG_ON(len == lblock);
1435 len = len - lblock;
1436 } else {
1437 lblock = len = 0;
1438 BUG();
1439 }
1440
1441 ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
1442 ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
1443}
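/*
 * Worked example (editor's note, hypothetical numbers): if the lookup
 * was for logical block 100 and the nearest extent in the leaf starts
 * at logical block 200, the cached gap is lblock 100, len 100.  Any
 * later lookup for blocks 100..199 is then answered straight from the
 * one-entry cache as EXT4_EXT_CACHE_GAP without another tree walk.
 */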
1444
1445static inline int
1446ext4_ext_in_cache(struct inode *inode, unsigned long block,
1447 struct ext4_extent *ex)
1448{
1449 struct ext4_ext_cache *cex;
1450
1451 cex = &EXT4_I(inode)->i_cached_extent;
1452
1453 /* has cache valid data? */
1454 if (cex->ec_type == EXT4_EXT_CACHE_NO)
1455 return EXT4_EXT_CACHE_NO;
1456
1457 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
1458 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
1459 if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
1460 ex->ee_block = cpu_to_le32(cex->ec_block);
1461 ext4_ext_store_pblock(ex, cex->ec_start);
1462 ex->ee_len = cpu_to_le16(cex->ec_len);
1463 ext_debug("%lu cached by %lu:%lu:%llu\n",
1464 (unsigned long) block,
1465 (unsigned long) cex->ec_block,
1466 (unsigned long) cex->ec_len,
1467 cex->ec_start);
1468 return cex->ec_type;
1469 }
1470
1471 /* not in cache */
1472 return EXT4_EXT_CACHE_NO;
1473}
1474
1475/*
1476 * ext4_ext_rm_idx:
1477 * removes index from the index block.
1478 * It's used in truncate case only, thus all requests are for
1479 * last index in the block only.
1480 */
1481int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1482 struct ext4_ext_path *path)
1483{
1484 struct buffer_head *bh;
1485 int err;
1486 ext4_fsblk_t leaf;
1487
1488 /* free index block */
1489 path--;
1490 leaf = idx_pblock(path->p_idx);
1491 BUG_ON(path->p_hdr->eh_entries == 0);
1492 if ((err = ext4_ext_get_access(handle, inode, path)))
1493 return err;
1494 path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
1495 if ((err = ext4_ext_dirty(handle, inode, path)))
1496 return err;
1497 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1498 bh = sb_find_get_block(inode->i_sb, leaf);
1499 ext4_forget(handle, 1, inode, bh, leaf);
1500 ext4_free_blocks(handle, inode, leaf, 1);
1501 return err;
1502}
1503
1504/*
1505 * ext4_ext_calc_credits_for_insert:
1506 * This routine returns max. credits that the extent tree can consume.
1507 * It should be OK for low-performance paths like ->writepage()
1508 * To allow many writing processes to fit into a single transaction,
1509 * the caller should calculate credits under truncate_mutex and
1510 * pass the actual path.
1511 */
1512int inline ext4_ext_calc_credits_for_insert(struct inode *inode,
1513 struct ext4_ext_path *path)
1514{
1515 int depth, needed;
1516
1517 if (path) {
1518 /* probably there is space in leaf? */
1519 depth = ext_depth(inode);
1520 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1521 < le16_to_cpu(path[depth].p_hdr->eh_max))
1522 return 1;
1523 }
1524
1525 /*
1526 * given 32-bit logical block (4294967296 blocks), max. tree
1527 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1528 * Let's also add one more level for imbalance.
1529 */
1530 depth = 5;
1531
1532 /* allocation of new data block(s) */
1533 needed = 2;
1534
1535 /*
1536 * tree can be full, so it would need to grow in depth:
1537 * allocation + old root + new root
1538 */
1539 needed += 2 + 1 + 1;
1540
1541 /*
1542 * Index split can happen, we would need:
1543 * allocate intermediate indexes (bitmap + group)
1544 * + change two blocks at each level, but root (already included)
1545 */
1546	needed += (depth * 2) + (depth * 2);
1547
1548 /* any allocation modifies superblock */
1549 needed += 1;
1550
1551 return needed;
1552}
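/*
 * Worked estimate (editor's note): with depth pessimistically fixed at
 * 5, the terms enumerated above come to 2 (new data blocks) + 4 (growth
 * in depth) + 20 (a split at every level: intermediate index allocation
 * plus two modified blocks per level) + 1 (superblock), i.e. 27 credits
 * for an insert without a known path.  When a path is supplied and the
 * leaf still has a free slot, a single credit is enough.
 */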
1553
1554static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1555 struct ext4_extent *ex,
1556 unsigned long from, unsigned long to)
1557{
1558 struct buffer_head *bh;
1559 int i;
1560
1561#ifdef EXTENTS_STATS
1562 {
1563 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1564 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1565 spin_lock(&sbi->s_ext_stats_lock);
1566 sbi->s_ext_blocks += ee_len;
1567 sbi->s_ext_extents++;
1568 if (ee_len < sbi->s_ext_min)
1569 sbi->s_ext_min = ee_len;
1570 if (ee_len > sbi->s_ext_max)
1571 sbi->s_ext_max = ee_len;
1572 if (ext_depth(inode) > sbi->s_depth_max)
1573 sbi->s_depth_max = ext_depth(inode);
1574 spin_unlock(&sbi->s_ext_stats_lock);
1575 }
1576#endif
1577 if (from >= le32_to_cpu(ex->ee_block)
1578 && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1579 /* tail removal */
1580 unsigned long num;
1581 ext4_fsblk_t start;
1582 num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
1583 start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
1584 ext_debug("free last %lu blocks starting %llu\n", num, start);
1585 for (i = 0; i < num; i++) {
1586 bh = sb_find_get_block(inode->i_sb, start + i);
1587 ext4_forget(handle, 0, inode, bh, start + i);
1588 }
1589 ext4_free_blocks(handle, inode, start, num);
1590 } else if (from == le32_to_cpu(ex->ee_block)
1591 && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
1592 printk("strange request: removal %lu-%lu from %u:%u\n",
1593 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1594 } else {
1595 printk("strange request: removal(2) %lu-%lu from %u:%u\n",
1596 from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
1597 }
1598 return 0;
1599}
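/*
 * Worked example (editor's note, hypothetical numbers): for an extent
 * that maps logical blocks 100..107 to physical blocks 5000..5007, a
 * call with from=104 and to=107 is a tail removal -- num = 100 + 8 -
 * 104 = 4, so physical blocks 5004..5007 are forgotten from the journal
 * and freed.  Head and middle removals are only reported as "strange
 * request" above, since there is no hole-punching support yet.
 */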
1600
1601static int
1602ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1603 struct ext4_ext_path *path, unsigned long start)
1604{
1605 int err = 0, correct_index = 0;
1606 int depth = ext_depth(inode), credits;
1607 struct ext4_extent_header *eh;
1608 unsigned a, b, block, num;
1609 unsigned long ex_ee_block;
1610 unsigned short ex_ee_len;
1611 struct ext4_extent *ex;
1612
1613 ext_debug("truncate since %lu in leaf\n", start);
1614 if (!path[depth].p_hdr)
1615 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
1616 eh = path[depth].p_hdr;
1617 BUG_ON(eh == NULL);
1618 BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
1619 BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
1620
1621 /* find where to start removing */
1622 ex = EXT_LAST_EXTENT(eh);
1623
1624 ex_ee_block = le32_to_cpu(ex->ee_block);
1625 ex_ee_len = le16_to_cpu(ex->ee_len);
1626
1627 while (ex >= EXT_FIRST_EXTENT(eh) &&
1628 ex_ee_block + ex_ee_len > start) {
1629 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
1630 path[depth].p_ext = ex;
1631
1632 a = ex_ee_block > start ? ex_ee_block : start;
1633 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
1634 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
1635
1636 ext_debug(" border %u:%u\n", a, b);
1637
1638 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
1639 block = 0;
1640 num = 0;
1641 BUG();
1642 } else if (a != ex_ee_block) {
1643 /* remove tail of the extent */
1644 block = ex_ee_block;
1645 num = a - block;
1646 } else if (b != ex_ee_block + ex_ee_len - 1) {
1647 /* remove head of the extent */
1648 block = a;
1649 num = b - a;
1650 /* there is no "make a hole" API yet */
1651 BUG();
1652 } else {
1653 /* remove whole extent: excellent! */
1654 block = ex_ee_block;
1655 num = 0;
1656 BUG_ON(a != ex_ee_block);
1657 BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1658 }
1659
1660 /* at present, extent can't cross block group: */
1661 /* leaf + bitmap + group desc + sb + inode */
1662 credits = 5;
1663 if (ex == EXT_FIRST_EXTENT(eh)) {
1664 correct_index = 1;
1665 credits += (ext_depth(inode)) + 1;
1666 }
1667#ifdef CONFIG_QUOTA
1668 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1669#endif
1670
1671 handle = ext4_ext_journal_restart(handle, credits);
1672 if (IS_ERR(handle)) {
1673 err = PTR_ERR(handle);
1674 goto out;
1675 }
1676
1677 err = ext4_ext_get_access(handle, inode, path + depth);
1678 if (err)
1679 goto out;
1680
1681 err = ext4_remove_blocks(handle, inode, ex, a, b);
1682 if (err)
1683 goto out;
1684
1685 if (num == 0) {
1686 /* this extent is removed; mark slot entirely unused */
1687 ext4_ext_store_pblock(ex, 0);
1688 eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
1689 }
1690
1691 ex->ee_block = cpu_to_le32(block);
1692 ex->ee_len = cpu_to_le16(num);
1693
1694 err = ext4_ext_dirty(handle, inode, path + depth);
1695 if (err)
1696 goto out;
1697
1698 ext_debug("new extent: %u:%u:%llu\n", block, num,
1699 ext_pblock(ex));
1700 ex--;
1701 ex_ee_block = le32_to_cpu(ex->ee_block);
1702 ex_ee_len = le16_to_cpu(ex->ee_len);
1703 }
1704
1705 if (correct_index && eh->eh_entries)
1706 err = ext4_ext_correct_indexes(handle, inode, path);
1707
1708 /* if this leaf is free, then we should
1709 * remove it from index block above */
1710 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
1711 err = ext4_ext_rm_idx(handle, inode, path + depth);
1712
1713out:
1714 return err;
1715}
1716
1717/*
1718 * ext4_ext_more_to_rm:
1719 * returns 1 if current index has to be freed (even partial)
1720 */
1721static int inline
1722ext4_ext_more_to_rm(struct ext4_ext_path *path)
1723{
1724 BUG_ON(path->p_idx == NULL);
1725
1726 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
1727 return 0;
1728
1729 /*
1730 * if truncate on deeper level happened, it wasn't partial,
1731 * so we have to consider current index for truncation
1732 */
1733 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
1734 return 0;
1735 return 1;
1736}
1737
1738int ext4_ext_remove_space(struct inode *inode, unsigned long start)
1739{
1740 struct super_block *sb = inode->i_sb;
1741 int depth = ext_depth(inode);
1742 struct ext4_ext_path *path;
1743 handle_t *handle;
1744 int i = 0, err = 0;
1745
1746 ext_debug("truncate since %lu\n", start);
1747
1748 /* probably first extent we're gonna free will be last in block */
1749 handle = ext4_journal_start(inode, depth + 1);
1750 if (IS_ERR(handle))
1751 return PTR_ERR(handle);
1752
1753 ext4_ext_invalidate_cache(inode);
1754
1755 /*
1756 * We start scanning from right side, freeing all the blocks
1757 * after i_size and walking into the tree depth-wise.
1758 */
1759 path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
1760 if (path == NULL) {
1761 ext4_journal_stop(handle);
1762 return -ENOMEM;
1763 }
1764 memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
1765 path[0].p_hdr = ext_inode_hdr(inode);
1766 if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
1767 err = -EIO;
1768 goto out;
1769 }
1770 path[0].p_depth = depth;
1771
1772 while (i >= 0 && err == 0) {
1773 if (i == depth) {
1774 /* this is leaf block */
1775 err = ext4_ext_rm_leaf(handle, inode, path, start);
1776 /* root level has p_bh == NULL, brelse() eats this */
1777 brelse(path[i].p_bh);
1778 path[i].p_bh = NULL;
1779 i--;
1780 continue;
1781 }
1782
1783 /* this is index block */
1784 if (!path[i].p_hdr) {
1785 ext_debug("initialize header\n");
1786 path[i].p_hdr = ext_block_hdr(path[i].p_bh);
1787 if (ext4_ext_check_header(__FUNCTION__, inode,
1788 path[i].p_hdr)) {
1789 err = -EIO;
1790 goto out;
1791 }
1792 }
1793
1794 BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
1795 > le16_to_cpu(path[i].p_hdr->eh_max));
1796 BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
1797
1798 if (!path[i].p_idx) {
1799 /* this level hasn't been touched yet */
1800 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
1801 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
1802 ext_debug("init index ptr: hdr 0x%p, num %d\n",
1803 path[i].p_hdr,
1804 le16_to_cpu(path[i].p_hdr->eh_entries));
1805 } else {
1806 /* we were already here, see at next index */
1807 path[i].p_idx--;
1808 }
1809
1810 ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
1811 i, EXT_FIRST_INDEX(path[i].p_hdr),
1812 path[i].p_idx);
1813 if (ext4_ext_more_to_rm(path + i)) {
1814 /* go to the next level */
1815 ext_debug("move to level %d (block %llu)\n",
1816 i + 1, idx_pblock(path[i].p_idx));
1817 memset(path + i + 1, 0, sizeof(*path));
1818 path[i+1].p_bh =
1819 sb_bread(sb, idx_pblock(path[i].p_idx));
1820 if (!path[i+1].p_bh) {
1821 /* should we reset i_size? */
1822 err = -EIO;
1823 break;
1824 }
1825
1826 /* save actual number of indexes since this
1827 * number is changed at the next iteration */
1828 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
1829 i++;
1830 } else {
1831 /* we finished processing this index, go up */
1832 if (path[i].p_hdr->eh_entries == 0 && i > 0) {
1833 /* index is empty, remove it;
1834 * handle must be already prepared by the
1835			 * ext4_ext_rm_leaf() */
1836 err = ext4_ext_rm_idx(handle, inode, path + i);
1837 }
1838 /* root level has p_bh == NULL, brelse() eats this */
1839 brelse(path[i].p_bh);
1840 path[i].p_bh = NULL;
1841 i--;
1842 ext_debug("return to level %d\n", i);
1843 }
1844 }
1845
1846 /* TODO: flexible tree reduction should be here */
1847 if (path->p_hdr->eh_entries == 0) {
1848 /*
1849 * truncate to zero freed all the tree,
1850 * so we need to correct eh_depth
1851 */
1852 err = ext4_ext_get_access(handle, inode, path);
1853 if (err == 0) {
1854 ext_inode_hdr(inode)->eh_depth = 0;
1855 ext_inode_hdr(inode)->eh_max =
1856 cpu_to_le16(ext4_ext_space_root(inode));
1857 err = ext4_ext_dirty(handle, inode, path);
1858 }
1859 }
1860out:
1861 ext4_ext_tree_changed(inode);
1862 ext4_ext_drop_refs(path);
1863 kfree(path);
1864 ext4_journal_stop(handle);
1865
1866 return err;
1867}
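/*
 * Editor's note: the loop above is an iterative depth-first walk that
 * uses the path[] array as an explicit stack.  i is the current level,
 * p_idx the position inside that level's index block and p_block the
 * entry count remembered when the level was entered (checked by
 * ext4_ext_more_to_rm()).  Leaves are trimmed by ext4_ext_rm_leaf();
 * an index block that ends up empty is removed on the way back up via
 * ext4_ext_rm_idx().
 */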
1868
1869/*
1870 * called at mount time
1871 */
1872void ext4_ext_init(struct super_block *sb)
1873{
1874 /*
1875 * possible initialization would be here
1876 */
1877
1878 if (test_opt(sb, EXTENTS)) {
1879 printk("EXT4-fs: file extents enabled");
1880#ifdef AGRESSIVE_TEST
1881 printk(", agressive tests");
1882#endif
1883#ifdef CHECK_BINSEARCH
1884 printk(", check binsearch");
1885#endif
1886#ifdef EXTENTS_STATS
1887 printk(", stats");
1888#endif
1889 printk("\n");
1890#ifdef EXTENTS_STATS
1891 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
1892 EXT4_SB(sb)->s_ext_min = 1 << 30;
1893 EXT4_SB(sb)->s_ext_max = 0;
1894#endif
1895 }
1896}
1897
1898/*
1899 * called at umount time
1900 */
1901void ext4_ext_release(struct super_block *sb)
1902{
1903 if (!test_opt(sb, EXTENTS))
1904 return;
1905
1906#ifdef EXTENTS_STATS
1907 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
1908 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
1910 sbi->s_ext_blocks, sbi->s_ext_extents,
1911 sbi->s_ext_blocks / sbi->s_ext_extents);
1912 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
1913 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
1914 }
1915#endif
1916}
1917
1918int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1919 ext4_fsblk_t iblock,
1920 unsigned long max_blocks, struct buffer_head *bh_result,
1921 int create, int extend_disksize)
1922{
1923 struct ext4_ext_path *path = NULL;
1924 struct ext4_extent newex, *ex;
1925 ext4_fsblk_t goal, newblock;
1926 int err = 0, depth;
1927 unsigned long allocated = 0;
1928
1929 __clear_bit(BH_New, &bh_result->b_state);
1930 ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
1931 max_blocks, (unsigned) inode->i_ino);
1932 mutex_lock(&EXT4_I(inode)->truncate_mutex);
1933
1934 /* check in cache */
1935 if ((goal = ext4_ext_in_cache(inode, iblock, &newex))) {
1936 if (goal == EXT4_EXT_CACHE_GAP) {
1937 if (!create) {
1938 /* block isn't allocated yet and
1939 * user doesn't want to allocate it */
1940 goto out2;
1941 }
1942 /* we should allocate requested block */
1943 } else if (goal == EXT4_EXT_CACHE_EXTENT) {
1944 /* block is already allocated */
1945 newblock = iblock
1946 - le32_to_cpu(newex.ee_block)
1947 + ext_pblock(&newex);
1948 /* number of remaining blocks in the extent */
1949 allocated = le16_to_cpu(newex.ee_len) -
1950 (iblock - le32_to_cpu(newex.ee_block));
1951 goto out;
1952 } else {
1953 BUG();
1954 }
1955 }
1956
1957 /* find extent for this block */
1958 path = ext4_ext_find_extent(inode, iblock, NULL);
1959 if (IS_ERR(path)) {
1960 err = PTR_ERR(path);
1961 path = NULL;
1962 goto out2;
1963 }
1964
1965 depth = ext_depth(inode);
1966
1967 /*
1968 * consistent leaf must not be empty;
1969 * this situation is possible, though, _during_ tree modification;
1970 * this is why assert can't be put in ext4_ext_find_extent()
1971 */
1972 BUG_ON(path[depth].p_ext == NULL && depth != 0);
1973
1974 if ((ex = path[depth].p_ext)) {
1975 unsigned long ee_block = le32_to_cpu(ex->ee_block);
1976 ext4_fsblk_t ee_start = ext_pblock(ex);
1977 unsigned short ee_len = le16_to_cpu(ex->ee_len);
1978
1979 /*
1980 * Allow future support for preallocated extents to be added
1981 * as an RO_COMPAT feature:
1982 * Uninitialized extents are treated as holes, except that
1983 * we avoid (fail) allocating new blocks during a write.
1984 */
1985 if (ee_len > EXT_MAX_LEN)
1986 goto out2;
1987 /* if found extent covers block, simply return it */
1988 if (iblock >= ee_block && iblock < ee_block + ee_len) {
1989 newblock = iblock - ee_block + ee_start;
1990 /* number of remaining blocks in the extent */
1991 allocated = ee_len - (iblock - ee_block);
1992 ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
1993 ee_block, ee_len, newblock);
1994 ext4_ext_put_in_cache(inode, ee_block, ee_len,
1995 ee_start, EXT4_EXT_CACHE_EXTENT);
1996 goto out;
1997 }
1998 }
1999
2000 /*
2001 * requested block isn't allocated yet;
2002 * we cannot try to create a block if the create flag is zero
2003 */
2004 if (!create) {
2005 /* put just found gap into cache to speed up
2006 * subsequent requests */
2007 ext4_ext_put_gap_in_cache(inode, path, iblock);
2008 goto out2;
2009 }
2010 /*
2011 * Okay, we need to do block allocation. Lazily initialize the block
2012 * allocation info here if necessary.
2013 */
2014 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2015 ext4_init_block_alloc_info(inode);
2016
2017 /* allocate new block */
2018 goal = ext4_ext_find_goal(inode, path, iblock);
2019 allocated = max_blocks;
2020 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
2021 if (!newblock)
2022 goto out2;
2023 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2024 goal, newblock, allocated);
2025
2026 /* try to insert new extent into found leaf and return */
2027 newex.ee_block = cpu_to_le32(iblock);
2028 ext4_ext_store_pblock(&newex, newblock);
2029 newex.ee_len = cpu_to_le16(allocated);
2030 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2031 if (err)
2032 goto out2;
2033
2034 if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
2035 EXT4_I(inode)->i_disksize = inode->i_size;
2036
2037 /* previous routine could use block we allocated */
2038 newblock = ext_pblock(&newex);
2039 __set_bit(BH_New, &bh_result->b_state);
2040
2041 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
2042 EXT4_EXT_CACHE_EXTENT);
2043out:
2044 if (allocated > max_blocks)
2045 allocated = max_blocks;
2046 ext4_ext_show_leaf(inode, path);
2047 __set_bit(BH_Mapped, &bh_result->b_state);
2048 bh_result->b_bdev = inode->i_sb->s_bdev;
2049 bh_result->b_blocknr = newblock;
2050out2:
2051 if (path) {
2052 ext4_ext_drop_refs(path);
2053 kfree(path);
2054 }
2055 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2056
2057 return err ? err : allocated;
2058}
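/*
 * Editor's note on the return convention: on success the function
 * returns the number of blocks mapped (capped at max_blocks) and fills
 * bh_result with the starting physical block; BH_New is set only when
 * blocks were freshly allocated.  With create == 0, a lookup that lands
 * in a hole returns 0, and the hole is remembered in the inode's
 * one-entry extent cache so that neighbouring lookups can skip the
 * tree walk.
 */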
2059
2060void ext4_ext_truncate(struct inode * inode, struct page *page)
2061{
2062 struct address_space *mapping = inode->i_mapping;
2063 struct super_block *sb = inode->i_sb;
2064 unsigned long last_block;
2065 handle_t *handle;
2066 int err = 0;
2067
2068 /*
2069 * probably first extent we're gonna free will be last in block
2070 */
2071 err = ext4_writepage_trans_blocks(inode) + 3;
2072 handle = ext4_journal_start(inode, err);
2073 if (IS_ERR(handle)) {
2074 if (page) {
2075 clear_highpage(page);
2076 flush_dcache_page(page);
2077 unlock_page(page);
2078 page_cache_release(page);
2079 }
2080 return;
2081 }
2082
2083 if (page)
2084 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2085
2086 mutex_lock(&EXT4_I(inode)->truncate_mutex);
2087 ext4_ext_invalidate_cache(inode);
2088
2089 /*
2090 * TODO: optimization is possible here.
2091 * Probably we need not scan at all,
2092 * because page truncation is enough.
2093 */
2094 if (ext4_orphan_add(handle, inode))
2095 goto out_stop;
2096
2097 /* we have to know where to truncate from in crash case */
2098 EXT4_I(inode)->i_disksize = inode->i_size;
2099 ext4_mark_inode_dirty(handle, inode);
2100
2101 last_block = (inode->i_size + sb->s_blocksize - 1)
2102 >> EXT4_BLOCK_SIZE_BITS(sb);
2103 err = ext4_ext_remove_space(inode, last_block);
2104
2105 /* In a multi-transaction truncate, we only make the final
2106 * transaction synchronous. */
2107 if (IS_SYNC(inode))
2108 handle->h_sync = 1;
2109
2110out_stop:
2111 /*
2112 * If this was a simple ftruncate() and the file will remain alive,
2113 * then we need to clear up the orphan record which we created above.
2114 * However, if this was a real unlink then we were called by
2115 * ext4_delete_inode(), and we allow that function to clean up the
2116 * orphan info for us.
2117 */
2118 if (inode->i_nlink)
2119 ext4_orphan_del(handle, inode);
2120
2121 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
2122 ext4_journal_stop(handle);
2123}
2124
2125/*
2126 * ext4_ext_writepage_trans_blocks:
2127 * calculate max number of blocks we could modify
2128 * in order to allocate new block for an inode
2129 */
2130int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2131{
2132 int needed;
2133
2134 needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2135
2136 /* caller wants to allocate num blocks, but note it includes sb */
2137 needed = needed * num - (num - 1);
2138
2139#ifdef CONFIG_QUOTA
2140 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2141#endif
2142
2143 return needed;
2144}
2145
2146EXPORT_SYMBOL(ext4_mark_inode_dirty);
2147EXPORT_SYMBOL(ext4_ext_invalidate_cache);
2148EXPORT_SYMBOL(ext4_ext_insert_extent);
2149EXPORT_SYMBOL(ext4_ext_walk_space);
2150EXPORT_SYMBOL(ext4_ext_find_goal);
2151EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
2152
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 000000000000..0b622c0624b7
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,139 @@
1/*
2 * linux/fs/ext4/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext4 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/time.h>
22#include <linux/fs.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include "xattr.h"
27#include "acl.h"
28
29/*
30 * Called when an inode is released. Note that this is different
31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed.
33 */
34static int ext4_release_file (struct inode * inode, struct file * filp)
35{
36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1))
39 {
40 mutex_lock(&EXT4_I(inode)->truncate_mutex);
41 ext4_discard_reservation(inode);
42 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
43 }
44 if (is_dx(inode) && filp->private_data)
45 ext4_htree_free_dir_info(filp->private_data);
46
47 return 0;
48}
49
50static ssize_t
51ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
52 unsigned long nr_segs, loff_t pos)
53{
54 struct file *file = iocb->ki_filp;
55 struct inode *inode = file->f_dentry->d_inode;
56 ssize_t ret;
57 int err;
58
59 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
60
61 /*
62 * Skip flushing if there was an error, or if nothing was written.
63 */
64 if (ret <= 0)
65 return ret;
66
67 /*
68 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
69 * journalling then we need to make sure that we force the transaction
70 * to disk to keep all metadata uptodate synchronously.
71 */
72 if (file->f_flags & O_SYNC) {
73 /*
74 * If we are non-data-journaled, then the dirty data has
75 * already been flushed to backing store by generic_osync_inode,
76 * and the inode has been flushed too if there have been any
77 * modifications other than mere timestamp updates.
78 *
79 * Open question --- do we care about flushing timestamps too
80 * if the inode is IS_SYNC?
81 */
82 if (!ext4_should_journal_data(inode))
83 return ret;
84
85 goto force_commit;
86 }
87
88 /*
89 * So we know that there has been no forced data flush. If the inode
90 * is marked IS_SYNC, we need to force one ourselves.
91 */
92 if (!IS_SYNC(inode))
93 return ret;
94
95 /*
96 * Open question #2 --- should we force data to disk here too? If we
97 * don't, the only impact is that data=writeback filesystems won't
98 * flush data to disk automatically on IS_SYNC, only metadata (but
99 * historically, that is what ext2 has done.)
100 */
101
102force_commit:
103 err = ext4_force_commit(inode->i_sb);
104 if (err)
105 return err;
106 return ret;
107}
108
109const struct file_operations ext4_file_operations = {
110 .llseek = generic_file_llseek,
111 .read = do_sync_read,
112 .write = do_sync_write,
113 .aio_read = generic_file_aio_read,
114 .aio_write = ext4_file_write,
115 .ioctl = ext4_ioctl,
116#ifdef CONFIG_COMPAT
117 .compat_ioctl = ext4_compat_ioctl,
118#endif
119 .mmap = generic_file_mmap,
120 .open = generic_file_open,
121 .release = ext4_release_file,
122 .fsync = ext4_sync_file,
123 .sendfile = generic_file_sendfile,
124 .splice_read = generic_file_splice_read,
125 .splice_write = generic_file_splice_write,
126};
127
128struct inode_operations ext4_file_inode_operations = {
129 .truncate = ext4_truncate,
130 .setattr = ext4_setattr,
131#ifdef CONFIG_EXT4DEV_FS_XATTR
132 .setxattr = generic_setxattr,
133 .getxattr = generic_getxattr,
134 .listxattr = ext4_listxattr,
135 .removexattr = generic_removexattr,
136#endif
137 .permission = ext4_permission,
138};
139
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 000000000000..2a167d7131fa
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,88 @@
1/*
2 * linux/fs/ext4/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext4fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/time.h>
26#include <linux/fs.h>
27#include <linux/sched.h>
28#include <linux/writeback.h>
29#include <linux/jbd2.h>
30#include <linux/ext4_fs.h>
31#include <linux/ext4_jbd2.h>
32
33/*
34 * akpm: A new design for ext4_sync_file().
35 *
36 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
37 * There cannot be a transaction open by this task.
38 * Another task could have dirtied this inode. Its data can be in any
39 * state in the journalling system.
40 *
41 * What we do is just kick off a commit and wait on it. This will snapshot the
42 * inode to disk.
43 */
44
45int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{
47 struct inode *inode = dentry->d_inode;
48 int ret = 0;
49
50 J_ASSERT(ext4_journal_current_handle() == 0);
51
52 /*
53 * data=writeback:
54 * The caller's filemap_fdatawrite()/wait will sync the data.
55 * sync_inode() will sync the metadata
56 *
57 * data=ordered:
58 * The caller's filemap_fdatawrite() will write the data and
59 * sync_inode() will write the inode if it is dirty. Then the caller's
60 * filemap_fdatawait() will wait on the pages.
61 *
62 * data=journal:
63 * filemap_fdatawrite won't do anything (the buffers are clean).
64 * ext4_force_commit will write the file data into the journal and
65 * will wait on that.
66 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
67 * (they were dirtied by commit). But that's OK - the blocks are
68 * safe in-journal, which is all fsync() needs to ensure.
69 */
70 if (ext4_should_journal_data(inode)) {
71 ret = ext4_force_commit(inode->i_sb);
72 goto out;
73 }
74
75 /*
76 * The VFS has written the file data. If the inode is unaltered
77 * then we need not start a commit.
78 */
79 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
80 struct writeback_control wbc = {
81 .sync_mode = WB_SYNC_ALL,
82 .nr_to_write = 0, /* sys_fsync did this */
83 };
84 ret = sync_inode(inode, &wbc);
85 }
86out:
87 return ret;
88}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 000000000000..a67966385e06
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,152 @@
1/*
2 * linux/fs/ext4/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include <linux/fs.h>
13#include <linux/jbd2.h>
14#include <linux/sched.h>
15#include <linux/ext4_fs.h>
16#include <linux/cryptohash.h>
17
18#define DELTA 0x9E3779B9
19
20static void TEA_transform(__u32 buf[4], __u32 const in[])
21{
22 __u32 sum = 0;
23 __u32 b0 = buf[0], b1 = buf[1];
24 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
25 int n = 16;
26
27 do {
28 sum += DELTA;
29 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
30 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
31 } while(--n);
32
33 buf[0] += b0;
34 buf[1] += b1;
35}
36
37
38/* The old legacy hash */
39static __u32 dx_hack_hash (const char *name, int len)
40{
41 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
42 while (len--) {
43 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
44
45 if (hash & 0x80000000) hash -= 0x7fffffff;
46 hash1 = hash0;
47 hash0 = hash;
48 }
49 return (hash0 << 1);
50}
51
52static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
53{
54 __u32 pad, val;
55 int i;
56
57 pad = (__u32)len | ((__u32)len << 8);
58 pad |= pad << 16;
59
60 val = pad;
61 if (len > num*4)
62 len = num * 4;
63 for (i=0; i < len; i++) {
64 if ((i % 4) == 0)
65 val = pad;
66 val = msg[i] + (val << 8);
67 if ((i % 4) == 3) {
68 *buf++ = val;
69 val = pad;
70 num--;
71 }
72 }
73 if (--num >= 0)
74 *buf++ = val;
75 while (--num >= 0)
76 *buf++ = pad;
77}
78
79/*
80 * Returns the hash of a filename. If len is 0 and name is NULL, then
81 * this function can be used to test whether or not a hash version is
82 * supported.
83 *
84 * The seed is a 4-longword (32 bits) "secret" which can be used to
85 * uniquify a hash. If the seed is all zeros, then some default seed
86 * may be used.
87 *
88 * A particular hash version specifies whether or not the seed is
89 * represented, and whether or not the returned hash is 32 bits or 64
90 * bits. 32 bit hashes will return 0 for the minor hash.
91 */
92int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
93{
94 __u32 hash;
95 __u32 minor_hash = 0;
96 const char *p;
97 int i;
98 __u32 in[8], buf[4];
99
100 /* Initialize the default seed for the hash checksum functions */
101 buf[0] = 0x67452301;
102 buf[1] = 0xefcdab89;
103 buf[2] = 0x98badcfe;
104 buf[3] = 0x10325476;
105
106	/* Check to see if the seed is all zeros */
107 if (hinfo->seed) {
108 for (i=0; i < 4; i++) {
109 if (hinfo->seed[i])
110 break;
111 }
112 if (i < 4)
113 memcpy(buf, hinfo->seed, sizeof(buf));
114 }
115
116 switch (hinfo->hash_version) {
117 case DX_HASH_LEGACY:
118 hash = dx_hack_hash(name, len);
119 break;
120 case DX_HASH_HALF_MD4:
121 p = name;
122 while (len > 0) {
123 str2hashbuf(p, len, in, 8);
124 half_md4_transform(buf, in);
125 len -= 32;
126 p += 32;
127 }
128 minor_hash = buf[2];
129 hash = buf[1];
130 break;
131 case DX_HASH_TEA:
132 p = name;
133 while (len > 0) {
134 str2hashbuf(p, len, in, 4);
135 TEA_transform(buf, in);
136 len -= 16;
137 p += 16;
138 }
139 hash = buf[0];
140 minor_hash = buf[1];
141 break;
142 default:
143 hinfo->hash = 0;
144 return -1;
145 }
146 hash = hash & ~1;
147 if (hash == (EXT4_HTREE_EOF << 1))
148 hash = (EXT4_HTREE_EOF-1) << 1;
149 hinfo->hash = hash;
150 hinfo->minor_hash = minor_hash;
151 return 0;
152}
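/*
 * Usage sketch (editor's note, hypothetical values): the caller picks a
 * hash version and an optional seed in a struct dx_hash_info and reads
 * the result back from the same structure:
 *
 *	__u32 seed[4] = { 0, 0, 0, 0 };		(all zeros => default seed)
 *	struct dx_hash_info hinfo;
 *
 *	hinfo.hash_version = DX_HASH_HALF_MD4;
 *	hinfo.seed = seed;
 *	if (ext4fs_dirhash(name, namelen, &hinfo) == 0)
 *		hinfo.hash and hinfo.minor_hash hold the major/minor hash
 *
 * name, namelen and the seed values are placeholders for illustration.
 */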
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 000000000000..c88b439ba5cd
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,772 @@
1/*
2 * linux/fs/ext4/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/ext4_fs.h>
19#include <linux/ext4_jbd2.h>
20#include <linux/stat.h>
21#include <linux/string.h>
22#include <linux/quotaops.h>
23#include <linux/buffer_head.h>
24#include <linux/random.h>
25#include <linux/bitops.h>
26#include <linux/blkdev.h>
27#include <asm/byteorder.h>
28
29#include "xattr.h"
30#include "acl.h"
31
32/*
33 * ialloc.c contains the inodes allocation and deallocation routines
34 */
35
36/*
37 * The free inodes are managed by bitmaps. A file system contains several
38 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
39 * block for inodes, N blocks for the inode table and data blocks.
40 *
41 * The file system contains group descriptors which are located after the
42 * super block. Each descriptor contains the number of the bitmap block and
43 * the free blocks count in the block.
44 */
45
46
47/*
48 * Read the inode allocation bitmap for a given block_group, reading
49 * into the specified slot in the superblock's bitmap cache.
50 *
51 * Return buffer_head of bitmap on success or NULL.
52 */
53static struct buffer_head *
54read_inode_bitmap(struct super_block * sb, unsigned long block_group)
55{
56 struct ext4_group_desc *desc;
57 struct buffer_head *bh = NULL;
58
59 desc = ext4_get_group_desc(sb, block_group, NULL);
60 if (!desc)
61 goto error_out;
62
63 bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
64 if (!bh)
65 ext4_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - "
67 "block_group = %lu, inode_bitmap = %llu",
68 block_group, ext4_inode_bitmap(sb, desc));
69error_out:
70 return bh;
71}
72
73/*
74 * NOTE! When we get the inode, we're the only people
75 * that have access to it, and as such there are no
76 * race conditions we have to worry about. The inode
77 * is not on the hash-lists, and it cannot be reached
78 * through the filesystem because the directory entry
79 * has been deleted earlier.
80 *
81 * HOWEVER: we must make sure that we get no aliases,
82 * which means that we have to call "clear_inode()"
83 * _before_ we mark the inode not in use in the inode
84 * bitmaps. Otherwise a newly created file might use
85 * the same inode number (not actually the same pointer
86 * though), and then we'd have two inodes sharing the
87 * same inode number and space on the harddisk.
88 */
89void ext4_free_inode (handle_t *handle, struct inode * inode)
90{
91 struct super_block * sb = inode->i_sb;
92 int is_directory;
93 unsigned long ino;
94 struct buffer_head *bitmap_bh = NULL;
95 struct buffer_head *bh2;
96 unsigned long block_group;
97 unsigned long bit;
98 struct ext4_group_desc * gdp;
99 struct ext4_super_block * es;
100 struct ext4_sb_info *sbi;
101 int fatal = 0, err;
102
103 if (atomic_read(&inode->i_count) > 1) {
104 printk ("ext4_free_inode: inode has count=%d\n",
105 atomic_read(&inode->i_count));
106 return;
107 }
108 if (inode->i_nlink) {
109 printk ("ext4_free_inode: inode has nlink=%d\n",
110 inode->i_nlink);
111 return;
112 }
113 if (!sb) {
114 printk("ext4_free_inode: inode on nonexistent device\n");
115 return;
116 }
117 sbi = EXT4_SB(sb);
118
119 ino = inode->i_ino;
120 ext4_debug ("freeing inode %lu\n", ino);
121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 DQUOT_INIT(inode);
127 ext4_xattr_delete_inode(handle, inode);
128 DQUOT_FREE_INODE(inode);
129 DQUOT_DROP(inode);
130
131 is_directory = S_ISDIR(inode->i_mode);
132
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT4_SB(sb)->s_es;
137 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext4_error (sb, "ext4_free_inode",
139 "reserved or nonexistent inode %lu", ino);
140 goto error_return;
141 }
142 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
143 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
144 bitmap_bh = read_inode_bitmap(sb, block_group);
145 if (!bitmap_bh)
146 goto error_return;
147
148 BUFFER_TRACE(bitmap_bh, "get_write_access");
149 fatal = ext4_journal_get_write_access(handle, bitmap_bh);
150 if (fatal)
151 goto error_return;
152
153 /* Ok, now we can actually update the inode bitmaps.. */
154 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
155 bit, bitmap_bh->b_data))
156 ext4_error (sb, "ext4_free_inode",
157 "bit already cleared for inode %lu", ino);
158 else {
159 gdp = ext4_get_group_desc (sb, block_group, &bh2);
160
161 BUFFER_TRACE(bh2, "get_write_access");
162 fatal = ext4_journal_get_write_access(handle, bh2);
163 if (fatal) goto error_return;
164
165 if (gdp) {
166 spin_lock(sb_bgl_lock(sbi, block_group));
167 gdp->bg_free_inodes_count = cpu_to_le16(
168 le16_to_cpu(gdp->bg_free_inodes_count) + 1);
169 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
172 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory)
175 percpu_counter_dec(&sbi->s_dirs_counter);
176
177 }
178 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
179 err = ext4_journal_dirty_metadata(handle, bh2);
180 if (!fatal) fatal = err;
181 }
182 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
183 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
184 if (!fatal)
185 fatal = err;
186 sb->s_dirt = 1;
187error_return:
188 brelse(bitmap_bh);
189 ext4_std_error(sb, fatal);
190}
191
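The group/bit arithmetic above recurs throughout this file: inode numbers are 1-based, so inode ino lives at bit (ino - 1) % EXT4_INODES_PER_GROUP(sb) of the bitmap of group (ino - 1) / EXT4_INODES_PER_GROUP(sb). A minimal userspace sketch of that mapping (illustrative only, not part of the patch; 8192 inodes per group is an assumed example value, not read from a real superblock):

#include <stdio.h>

int main(void)
{
	unsigned long inodes_per_group = 8192;	/* assumed example value */
	unsigned long ino = 12345;		/* hypothetical inode number */

	/* inode numbers start at 1, so subtract 1 before dividing */
	unsigned long block_group = (ino - 1) / inodes_per_group;
	unsigned long bit = (ino - 1) % inodes_per_group;

	printf("inode %lu -> group %lu, bit %lu\n", ino, block_group, bit);
	return 0;
}
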
192/*
193 * There are two policies for allocating an inode. If the new inode is
194 * a directory, then a forward search is made for a block group with both
195 * free space and a low directory-to-inode ratio; if that fails, then of
196 * the groups with above-average free space, the one with the fewest
197 * existing directories is chosen.
198 *
199 * For other inodes, search forward from the parent directory's block
200 * group to find a free inode.
201 */
202static int find_group_dir(struct super_block *sb, struct inode *parent)
203{
204 int ngroups = EXT4_SB(sb)->s_groups_count;
205 unsigned int freei, avefreei;
206 struct ext4_group_desc *desc, *best_desc = NULL;
207 struct buffer_head *bh;
208 int group, best_group = -1;
209
210 freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
211 avefreei = freei / ngroups;
212
213 for (group = 0; group < ngroups; group++) {
214 desc = ext4_get_group_desc (sb, group, &bh);
215 if (!desc || !desc->bg_free_inodes_count)
216 continue;
217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
218 continue;
219 if (!best_desc ||
220 (le16_to_cpu(desc->bg_free_blocks_count) >
221 le16_to_cpu(best_desc->bg_free_blocks_count))) {
222 best_group = group;
223 best_desc = desc;
224 }
225 }
226 return best_group;
227}
228
229/*
230 * Orlov's allocator for directories.
231 *
232 * We always try to spread first-level directories.
233 *
234 * If there are block groups whose free-inode and free-block counts are both
235 * no worse than average, we return the one with the smallest directory count.
236 * Otherwise we simply return a random group.
237 *
238 * For other directories, the rules are as follows:
239 *
240 * It's OK to put a directory into a group unless
241 * it has too many directories already (max_dirs) or
242 * it has too few free inodes left (min_inodes) or
243 * it has too few free blocks left (min_blocks) or
244 * it's already carrying too large a debt (max_debt).
245 * The parent's group is preferred; if it doesn't satisfy these
246 * conditions we search cyclically through the rest. If none
247 * of the groups look good we just look for a group with more
248 * free inodes than average (starting at parent's group).
249 *
250 * Debt is incremented each time we allocate a directory and decremented
251 * when we allocate an inode, within 0--255.
252 */
253
254#define INODE_COST 64
255#define BLOCK_COST 256
256
257static int find_group_orlov(struct super_block *sb, struct inode *parent)
258{
259 int parent_group = EXT4_I(parent)->i_block_group;
260 struct ext4_sb_info *sbi = EXT4_SB(sb);
261 struct ext4_super_block *es = sbi->s_es;
262 int ngroups = sbi->s_groups_count;
263 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
264 unsigned int freei, avefreei;
265 ext4_fsblk_t freeb, avefreeb;
266 ext4_fsblk_t blocks_per_dir;
267 unsigned int ndirs;
268 int max_debt, max_dirs, min_inodes;
269 ext4_grpblk_t min_blocks;
270 int group = -1, i;
271 struct ext4_group_desc *desc;
272 struct buffer_head *bh;
273
274 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
275 avefreei = freei / ngroups;
276 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
277 avefreeb = freeb;
278 do_div(avefreeb, ngroups);
279 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
280
281 if ((parent == sb->s_root->d_inode) ||
282 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
283 int best_ndir = inodes_per_group;
284 int best_group = -1;
285
286 get_random_bytes(&group, sizeof(group));
287 parent_group = (unsigned)group % ngroups;
288 for (i = 0; i < ngroups; i++) {
289 group = (parent_group + i) % ngroups;
290 desc = ext4_get_group_desc (sb, group, &bh);
291 if (!desc || !desc->bg_free_inodes_count)
292 continue;
293 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
294 continue;
295 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
296 continue;
297 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
298 continue;
299 best_group = group;
300 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
301 }
302 if (best_group >= 0)
303 return best_group;
304 goto fallback;
305 }
306
307 blocks_per_dir = ext4_blocks_count(es) - freeb;
308 do_div(blocks_per_dir, ndirs);
309
310 max_dirs = ndirs / ngroups + inodes_per_group / 16;
311 min_inodes = avefreei - inodes_per_group / 4;
312 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
313
314 max_debt = EXT4_BLOCKS_PER_GROUP(sb);
315 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
316 if (max_debt * INODE_COST > inodes_per_group)
317 max_debt = inodes_per_group / INODE_COST;
318 if (max_debt > 255)
319 max_debt = 255;
320 if (max_debt == 0)
321 max_debt = 1;
322
323 for (i = 0; i < ngroups; i++) {
324 group = (parent_group + i) % ngroups;
325 desc = ext4_get_group_desc (sb, group, &bh);
326 if (!desc || !desc->bg_free_inodes_count)
327 continue;
328 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
329 continue;
330 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
331 continue;
332 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
333 continue;
334 return group;
335 }
336
337fallback:
338 for (i = 0; i < ngroups; i++) {
339 group = (parent_group + i) % ngroups;
340 desc = ext4_get_group_desc (sb, group, &bh);
341 if (!desc || !desc->bg_free_inodes_count)
342 continue;
343 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
344 return group;
345 }
346
347 if (avefreei) {
348 /*
349 * The free-inodes counter is approximate, and for really small
350 * filesystems the above test can fail to find any blockgroups
351 */
352 avefreei = 0;
353 goto fallback;
354 }
355
356 return -1;
357}
358
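To make the debt thresholds above concrete, the following standalone sketch evaluates max_dirs, min_inodes, min_blocks and max_debt for one assumed filesystem shape (illustrative only; the group geometry and the free-inode/free-block/directory counters are made-up example numbers, not values read from a real superblock):

#include <stdio.h>

#define INODE_COST	64
#define BLOCK_COST	256

int main(void)
{
	/* assumed geometry: 128 groups, 8192 inodes and 32768 blocks per group */
	unsigned long ngroups = 128, inodes_per_group = 8192, blocks_per_group = 32768;
	/* hypothetical counters */
	unsigned long freei = 500000, freeb = 2000000, ndirs = 12000;

	unsigned long avefreei = freei / ngroups;
	unsigned long avefreeb = freeb / ngroups;
	unsigned long blocks_per_dir = (ngroups * blocks_per_group - freeb) / ndirs;

	long max_dirs = ndirs / ngroups + inodes_per_group / 16;
	long min_inodes = avefreei - inodes_per_group / 4;
	long min_blocks = avefreeb - blocks_per_group / 4;

	long max_debt = blocks_per_group /
		(blocks_per_dir > BLOCK_COST ? blocks_per_dir : BLOCK_COST);
	if (max_debt * INODE_COST > (long)inodes_per_group)
		max_debt = inodes_per_group / INODE_COST;
	if (max_debt > 255)
		max_debt = 255;
	if (max_debt == 0)
		max_debt = 1;

	/* with these numbers: max_dirs=605 min_inodes=1858 min_blocks=7433 max_debt=128 */
	printf("max_dirs=%ld min_inodes=%ld min_blocks=%ld max_debt=%ld\n",
	       max_dirs, min_inodes, min_blocks, max_debt);
	return 0;
}
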
359static int find_group_other(struct super_block *sb, struct inode *parent)
360{
361 int parent_group = EXT4_I(parent)->i_block_group;
362 int ngroups = EXT4_SB(sb)->s_groups_count;
363 struct ext4_group_desc *desc;
364 struct buffer_head *bh;
365 int group, i;
366
367 /*
368 * Try to place the inode in its parent directory
369 */
370 group = parent_group;
371 desc = ext4_get_group_desc (sb, group, &bh);
372 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
373 le16_to_cpu(desc->bg_free_blocks_count))
374 return group;
375
376 /*
377 * We're going to place this inode in a different blockgroup from its
378 * parent. We want to cause files in a common directory to all land in
379 * the same blockgroup. But we want files which are in a different
380 * directory which shares a blockgroup with our parent to land in a
381 * different blockgroup.
382 *
383 * So add our directory's i_ino into the starting point for the hash.
384 */
385 group = (group + parent->i_ino) % ngroups;
386
387 /*
388 * Use a quadratic hash to find a group with a free inode and some free
389 * blocks.
390 */
391 for (i = 1; i < ngroups; i <<= 1) {
392 group += i;
393 if (group >= ngroups)
394 group -= ngroups;
395 desc = ext4_get_group_desc (sb, group, &bh);
396 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
397 le16_to_cpu(desc->bg_free_blocks_count))
398 return group;
399 }
400
401 /*
402 * That failed: try linear search for a free inode, even if that group
403 * has no free blocks.
404 */
405 group = parent_group;
406 for (i = 0; i < ngroups; i++) {
407 if (++group >= ngroups)
408 group = 0;
409 desc = ext4_get_group_desc (sb, group, &bh);
410 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
411 return group;
412 }
413
414 return -1;
415}
416
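The quadratic probe above is easier to see in isolation: starting from the hashed group, the step doubles on every pass (1, 2, 4, 8, ...) and wraps modulo the group count, so a handful of probes covers groups spread across the whole filesystem. A standalone sketch, assuming 16 block groups and an arbitrary starting group (both are example values):

#include <stdio.h>

int main(void)
{
	int ngroups = 16;	/* assumed example value */
	int group = 5;		/* hypothetical start after hashing in the parent's i_ino */
	int i;

	for (i = 1; i < ngroups; i <<= 1) {
		group += i;
		if (group >= ngroups)
			group -= ngroups;
		printf("probe group %d (step %d)\n", group, i);
	}
	return 0;
}
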
417/*
418 * There are two policies for allocating an inode. If the new inode is
419 * a directory, then a forward search is made for a block group with both
420 * free space and a low directory-to-inode ratio; if that fails, then of
421 * the groups with above-average free space, the one with the fewest
422 * existing directories is chosen.
423 *
424 * For other inodes, search forward from the parent directory's block
425 * group to find a free inode.
426 */
427struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
428{
429 struct super_block *sb;
430 struct buffer_head *bitmap_bh = NULL;
431 struct buffer_head *bh2;
432 int group;
433 unsigned long ino = 0;
434 struct inode * inode;
435 struct ext4_group_desc * gdp = NULL;
436 struct ext4_super_block * es;
437 struct ext4_inode_info *ei;
438 struct ext4_sb_info *sbi;
439 int err = 0;
440 struct inode *ret;
441 int i;
442
443 /* Cannot create files in a deleted directory */
444 if (!dir || !dir->i_nlink)
445 return ERR_PTR(-EPERM);
446
447 sb = dir->i_sb;
448 inode = new_inode(sb);
449 if (!inode)
450 return ERR_PTR(-ENOMEM);
451 ei = EXT4_I(inode);
452
453 sbi = EXT4_SB(sb);
454 es = sbi->s_es;
455 if (S_ISDIR(mode)) {
456 if (test_opt (sb, OLDALLOC))
457 group = find_group_dir(sb, dir);
458 else
459 group = find_group_orlov(sb, dir);
460 } else
461 group = find_group_other(sb, dir);
462
463 err = -ENOSPC;
464 if (group == -1)
465 goto out;
466
467 for (i = 0; i < sbi->s_groups_count; i++) {
468 err = -EIO;
469
470 gdp = ext4_get_group_desc(sb, group, &bh2);
471 if (!gdp)
472 goto fail;
473
474 brelse(bitmap_bh);
475 bitmap_bh = read_inode_bitmap(sb, group);
476 if (!bitmap_bh)
477 goto fail;
478
479 ino = 0;
480
481repeat_in_this_group:
482 ino = ext4_find_next_zero_bit((unsigned long *)
483 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
484 if (ino < EXT4_INODES_PER_GROUP(sb)) {
485
486 BUFFER_TRACE(bitmap_bh, "get_write_access");
487 err = ext4_journal_get_write_access(handle, bitmap_bh);
488 if (err)
489 goto fail;
490
491 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
492 ino, bitmap_bh->b_data)) {
493 /* we won it */
494 BUFFER_TRACE(bitmap_bh,
495 "call ext4_journal_dirty_metadata");
496 err = ext4_journal_dirty_metadata(handle,
497 bitmap_bh);
498 if (err)
499 goto fail;
500 goto got;
501 }
502 /* we lost it */
503 jbd2_journal_release_buffer(handle, bitmap_bh);
504
505 if (++ino < EXT4_INODES_PER_GROUP(sb))
506 goto repeat_in_this_group;
507 }
508
509 /*
510 * This case is possible in concurrent environment. It is very
511 * rare. We cannot repeat the find_group_xxx() call because
512 * that will simply return the same blockgroup, because the
513 * group descriptor metadata has not yet been updated.
514 * So we just go onto the next blockgroup.
515 */
516 if (++group == sbi->s_groups_count)
517 group = 0;
518 }
519 err = -ENOSPC;
520 goto out;
521
522got:
523 ino += group * EXT4_INODES_PER_GROUP(sb) + 1;
524 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
525 ext4_error (sb, "ext4_new_inode",
526 "reserved inode or inode > inodes count - "
527 "block_group = %d, inode=%lu", group, ino);
528 err = -EIO;
529 goto fail;
530 }
531
532 BUFFER_TRACE(bh2, "get_write_access");
533 err = ext4_journal_get_write_access(handle, bh2);
534 if (err) goto fail;
535 spin_lock(sb_bgl_lock(sbi, group));
536 gdp->bg_free_inodes_count =
537 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
538 if (S_ISDIR(mode)) {
539 gdp->bg_used_dirs_count =
540 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
541 }
542 spin_unlock(sb_bgl_lock(sbi, group));
543 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
544 err = ext4_journal_dirty_metadata(handle, bh2);
545 if (err) goto fail;
546
547 percpu_counter_dec(&sbi->s_freeinodes_counter);
548 if (S_ISDIR(mode))
549 percpu_counter_inc(&sbi->s_dirs_counter);
550 sb->s_dirt = 1;
551
552 inode->i_uid = current->fsuid;
553 if (test_opt (sb, GRPID))
554 inode->i_gid = dir->i_gid;
555 else if (dir->i_mode & S_ISGID) {
556 inode->i_gid = dir->i_gid;
557 if (S_ISDIR(mode))
558 mode |= S_ISGID;
559 } else
560 inode->i_gid = current->fsgid;
561 inode->i_mode = mode;
562
563 inode->i_ino = ino;
564 /* This is the optimal IO size (for stat), not the fs block size */
565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567
568 memset(ei->i_data, 0, sizeof(ei->i_data));
569 ei->i_dir_start_lookup = 0;
570 ei->i_disksize = 0;
571
572 ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL;
573 if (S_ISLNK(mode))
574 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
575 /* dirsync only applies to directories */
576 if (!S_ISDIR(mode))
577 ei->i_flags &= ~EXT4_DIRSYNC_FL;
578#ifdef EXT4_FRAGMENTS
579 ei->i_faddr = 0;
580 ei->i_frag_no = 0;
581 ei->i_frag_size = 0;
582#endif
583 ei->i_file_acl = 0;
584 ei->i_dir_acl = 0;
585 ei->i_dtime = 0;
586 ei->i_block_alloc_info = NULL;
587 ei->i_block_group = group;
588
589 ext4_set_inode_flags(inode);
590 if (IS_DIRSYNC(inode))
591 handle->h_sync = 1;
592 insert_inode_hash(inode);
593 spin_lock(&sbi->s_next_gen_lock);
594 inode->i_generation = sbi->s_next_generation++;
595 spin_unlock(&sbi->s_next_gen_lock);
596
597 ei->i_state = EXT4_STATE_NEW;
598 ei->i_extra_isize =
599 (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
600 sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
601
602 ret = inode;
603 if(DQUOT_ALLOC_INODE(inode)) {
604 err = -EDQUOT;
605 goto fail_drop;
606 }
607
608 err = ext4_init_acl(handle, inode, dir);
609 if (err)
610 goto fail_free_drop;
611
612 err = ext4_init_security(handle,inode, dir);
613 if (err)
614 goto fail_free_drop;
615
616 err = ext4_mark_inode_dirty(handle, inode);
617 if (err) {
618 ext4_std_error(sb, err);
619 goto fail_free_drop;
620 }
621 if (test_opt(sb, EXTENTS)) {
622 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
623 ext4_ext_tree_init(handle, inode);
624 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
625 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
626 if (err) goto fail;
627 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
628 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
629 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
630 }
631 }
632
633 ext4_debug("allocating inode %lu\n", inode->i_ino);
634 goto really_out;
635fail:
636 ext4_std_error(sb, err);
637out:
638 iput(inode);
639 ret = ERR_PTR(err);
640really_out:
641 brelse(bitmap_bh);
642 return ret;
643
644fail_free_drop:
645 DQUOT_FREE_INODE(inode);
646
647fail_drop:
648 DQUOT_DROP(inode);
649 inode->i_flags |= S_NOQUOTA;
650 inode->i_nlink = 0;
651 iput(inode);
652 brelse(bitmap_bh);
653 return ERR_PTR(err);
654}
655
656/* Verify that we are loading a valid orphan from disk */
657struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
658{
659 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
660 unsigned long block_group;
661 int bit;
662 struct buffer_head *bitmap_bh = NULL;
663 struct inode *inode = NULL;
664
665 /* Error cases - e2fsck has already cleaned up for us */
666 if (ino > max_ino) {
667 ext4_warning(sb, __FUNCTION__,
668 "bad orphan ino %lu! e2fsck was run?", ino);
669 goto out;
670 }
671
672 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
673 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
674 bitmap_bh = read_inode_bitmap(sb, block_group);
675 if (!bitmap_bh) {
676 ext4_warning(sb, __FUNCTION__,
677 "inode bitmap error for orphan %lu", ino);
678 goto out;
679 }
680
681 /* Having the inode bit set should be a 100% indicator that this
682 * is a valid orphan (no e2fsck run on fs). Orphans also include
683 * inodes that were being truncated, so we can't check i_nlink==0.
684 */
685 if (!ext4_test_bit(bit, bitmap_bh->b_data) ||
686 !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
687 NEXT_ORPHAN(inode) > max_ino) {
688 ext4_warning(sb, __FUNCTION__,
689 "bad orphan inode %lu! e2fsck was run?", ino);
690 printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
691 bit, (unsigned long long)bitmap_bh->b_blocknr,
692 ext4_test_bit(bit, bitmap_bh->b_data));
693 printk(KERN_NOTICE "inode=%p\n", inode);
694 if (inode) {
695 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
696 is_bad_inode(inode));
697 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
698 NEXT_ORPHAN(inode));
699 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
700 }
701 /* Avoid freeing blocks if we got a bad deleted inode */
702 if (inode && inode->i_nlink == 0)
703 inode->i_blocks = 0;
704 iput(inode);
705 inode = NULL;
706 }
707out:
708 brelse(bitmap_bh);
709 return inode;
710}
711
712unsigned long ext4_count_free_inodes (struct super_block * sb)
713{
714 unsigned long desc_count;
715 struct ext4_group_desc *gdp;
716 int i;
717#ifdef EXT4FS_DEBUG
718 struct ext4_super_block *es;
719 unsigned long bitmap_count, x;
720 struct buffer_head *bitmap_bh = NULL;
721
722 es = EXT4_SB(sb)->s_es;
723 desc_count = 0;
724 bitmap_count = 0;
725 gdp = NULL;
726 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
727 gdp = ext4_get_group_desc (sb, i, NULL);
728 if (!gdp)
729 continue;
730 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
731 brelse(bitmap_bh);
732 bitmap_bh = read_inode_bitmap(sb, i);
733 if (!bitmap_bh)
734 continue;
735
736 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
737 printk("group %d: stored = %d, counted = %lu\n",
738 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
739 bitmap_count += x;
740 }
741 brelse(bitmap_bh);
742 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
743 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
744 return desc_count;
745#else
746 desc_count = 0;
747 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
748 gdp = ext4_get_group_desc (sb, i, NULL);
749 if (!gdp)
750 continue;
751 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
752 cond_resched();
753 }
754 return desc_count;
755#endif
756}
757
758/* Called at mount-time, super-block is locked */
759unsigned long ext4_count_dirs (struct super_block * sb)
760{
761 unsigned long count = 0;
762 int i;
763
764 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
765 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
766 if (!gdp)
767 continue;
768 count += le16_to_cpu(gdp->bg_used_dirs_count);
769 }
770 return count;
771}
772
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 000000000000..0a60ec5a16db
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,3233 @@
1/*
2 * linux/fs/ext4/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
23 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext4_jbd2.h>
29#include <linux/jbd2.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include <linux/bio.h>
40#include "xattr.h"
41#include "acl.h"
42
43/*
44 * Test whether an inode is a fast symlink.
45 */
46static int ext4_inode_is_fast_symlink(struct inode *inode)
47{
48 int ea_blocks = EXT4_I(inode)->i_file_acl ?
49 (inode->i_sb->s_blocksize >> 9) : 0;
50
51 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
52}
53
54/*
55 * The ext4 forget function must perform a revoke if we are freeing data
56 * which has been journaled. Metadata (eg. indirect blocks) must be
57 * revoked in all cases.
58 *
59 * "bh" may be NULL: a metadata block may have been freed from memory
60 * but there may still be a record of it in the journal, and that record
61 * still needs to be revoked.
62 */
63int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
64 struct buffer_head *bh, ext4_fsblk_t blocknr)
65{
66 int err;
67
68 might_sleep();
69
70 BUFFER_TRACE(bh, "enter");
71
72 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
73 "data mode %lx\n",
74 bh, is_metadata, inode->i_mode,
75 test_opt(inode->i_sb, DATA_FLAGS));
76
77 /* Never use the revoke function if we are doing full data
78 * journaling: there is no need to, and a V1 superblock won't
79 * support it. Otherwise, only skip the revoke on un-journaled
80 * data blocks. */
81
82 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
83 (!is_metadata && !ext4_should_journal_data(inode))) {
84 if (bh) {
85 BUFFER_TRACE(bh, "call jbd2_journal_forget");
86 return ext4_journal_forget(handle, bh);
87 }
88 return 0;
89 }
90
91 /*
92 * data!=journal && (is_metadata || should_journal_data(inode))
93 */
94 BUFFER_TRACE(bh, "call ext4_journal_revoke");
95 err = ext4_journal_revoke(handle, blocknr, bh);
96 if (err)
97 ext4_abort(inode->i_sb, __FUNCTION__,
98 "error %d when attempting revoke", err);
99 BUFFER_TRACE(bh, "exit");
100 return err;
101}
102
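The policy above reduces to a two-input predicate: revoke only when we are not running full data journaling and the block is either metadata or journaled data; otherwise a plain forget is enough. A tiny standalone restatement of that condition (not a kernel API, just the boolean logic):

#include <stdio.h>

/* returns 1 when the block must be revoked, 0 when a plain forget suffices */
static int needs_revoke(int data_journal_mode, int is_metadata, int journals_data)
{
	if (data_journal_mode || (!is_metadata && !journals_data))
		return 0;	/* forget is enough */
	return 1;		/* metadata, or journaled data, outside data=journal */
}

int main(void)
{
	printf("data=journal, metadata   -> revoke? %d\n", needs_revoke(1, 1, 1));
	printf("data=ordered, metadata   -> revoke? %d\n", needs_revoke(0, 1, 0));
	printf("data=ordered, plain data -> revoke? %d\n", needs_revoke(0, 0, 0));
	return 0;
}
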
103/*
104 * Work out how many blocks we need to proceed with the next chunk of a
105 * truncate transaction.
106 */
107static unsigned long blocks_for_truncate(struct inode *inode)
108{
109 unsigned long needed;
110
111 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
112
113 /* Give ourselves just enough room to cope with inodes in which
114 * i_blocks is corrupt: we've seen disk corruptions in the past
115 * which resulted in random data in an inode which looked enough
116 * like a regular file for ext4 to try to delete it. Things
117 * will go a bit crazy if that happens, but at least we should
118 * try not to panic the whole kernel. */
119 if (needed < 2)
120 needed = 2;
121
122 /* But we need to bound the transaction so we don't overflow the
123 * journal. */
124 if (needed > EXT4_MAX_TRANS_DATA)
125 needed = EXT4_MAX_TRANS_DATA;
126
127 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
128}
129
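As a concrete illustration of the estimate above: i_blocks counts 512-byte sectors, so it is shifted down by (blocksize_bits - 9) to get filesystem blocks, clamped to at least 2 and at most the journal bound, and then the per-transaction metadata credits are added. A standalone sketch with assumed constants (the stand-ins for EXT4_MAX_TRANS_DATA and EXT4_DATA_TRANS_BLOCKS() are example values, not the real mount's):

#include <stdio.h>

int main(void)
{
	unsigned long i_blocks = 4096;		/* hypothetical file: 4096 512-byte sectors */
	unsigned int blocksize_bits = 12;	/* 4KB blocks */
	unsigned long max_trans_data = 64;	/* assumed stand-in for EXT4_MAX_TRANS_DATA */
	unsigned long data_trans_blocks = 8;	/* assumed stand-in for EXT4_DATA_TRANS_BLOCKS() */

	unsigned long needed = i_blocks >> (blocksize_bits - 9);	/* 4096 >> 3 = 512 */
	if (needed < 2)
		needed = 2;			/* guard against a corrupt i_blocks */
	if (needed > max_trans_data)
		needed = max_trans_data;	/* bound the transaction size */

	printf("credits for the next truncate chunk: %lu\n", data_trans_blocks + needed);
	return 0;
}
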
130/*
131 * Truncate transactions can be complex and absolutely huge. So we need to
132 * be able to restart the transaction at a convenient checkpoint to make
133 * sure we don't overflow the journal.
134 *
135 * start_transaction gets us a new handle for a truncate transaction,
136 * and extend_transaction tries to extend the existing one a bit. If
137 * extend fails, we need to propagate the failure up and restart the
138 * transaction in the top-level truncate loop. --sct
139 */
140static handle_t *start_transaction(struct inode *inode)
141{
142 handle_t *result;
143
144 result = ext4_journal_start(inode, blocks_for_truncate(inode));
145 if (!IS_ERR(result))
146 return result;
147
148 ext4_std_error(inode->i_sb, PTR_ERR(result));
149 return result;
150}
151
152/*
153 * Try to extend this transaction for the purposes of truncation.
154 *
155 * Returns 0 if we managed to create more room. If we can't create more
156 * room, and the transaction must be restarted we return 1.
157 */
158static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
159{
160 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
161 return 0;
162 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
163 return 0;
164 return 1;
165}
166
167/*
168 * Restart the transaction associated with *handle. This does a commit,
169 * so before we call here everything must be consistently dirtied against
170 * this transaction.
171 */
172static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
173{
174 jbd_debug(2, "restarting handle %p\n", handle);
175 return ext4_journal_restart(handle, blocks_for_truncate(inode));
176}
177
178/*
179 * Called at the last iput() if i_nlink is zero.
180 */
181void ext4_delete_inode (struct inode * inode)
182{
183 handle_t *handle;
184
185 truncate_inode_pages(&inode->i_data, 0);
186
187 if (is_bad_inode(inode))
188 goto no_delete;
189
190 handle = start_transaction(inode);
191 if (IS_ERR(handle)) {
192 /*
193 * If we're going to skip the normal cleanup, we still need to
194 * make sure that the in-core orphan linked list is properly
195 * cleaned up.
196 */
197 ext4_orphan_del(NULL, inode);
198 goto no_delete;
199 }
200
201 if (IS_SYNC(inode))
202 handle->h_sync = 1;
203 inode->i_size = 0;
204 if (inode->i_blocks)
205 ext4_truncate(inode);
206 /*
207 * Kill off the orphan record which ext4_truncate created.
208 * AKPM: I think this can be inside the above `if'.
209 * Note that ext4_orphan_del() has to be able to cope with the
210 * deletion of a non-existent orphan - this is because we don't
211 * know if ext4_truncate() actually created an orphan record.
212 * (Well, we could do this if we need to, but heck - it works)
213 */
214 ext4_orphan_del(handle, inode);
215 EXT4_I(inode)->i_dtime = get_seconds();
216
217 /*
218 * One subtle ordering requirement: if anything has gone wrong
219 * (transaction abort, IO errors, whatever), then we can still
220 * do these next steps (the fs will already have been marked as
221 * having errors), but we can't free the inode if the mark_dirty
222 * fails.
223 */
224 if (ext4_mark_inode_dirty(handle, inode))
225 /* If that failed, just do the required in-core inode clear. */
226 clear_inode(inode);
227 else
228 ext4_free_inode(handle, inode);
229 ext4_journal_stop(handle);
230 return;
231no_delete:
232 clear_inode(inode); /* We must guarantee clearing of inode... */
233}
234
235typedef struct {
236 __le32 *p;
237 __le32 key;
238 struct buffer_head *bh;
239} Indirect;
240
241static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
242{
243 p->key = *(p->p = v);
244 p->bh = bh;
245}
246
247static int verify_chain(Indirect *from, Indirect *to)
248{
249 while (from <= to && from->key == *from->p)
250 from++;
251 return (from > to);
252}
253
254/**
255 * ext4_block_to_path - parse the block number into array of offsets
256 * @inode: inode in question (we are only interested in its superblock)
257 * @i_block: block number to be parsed
258 * @offsets: array to store the offsets in
259 * @boundary: set this non-zero if the referred-to block is likely to be
260 * followed (on disk) by an indirect block.
261 *
262 * To store the locations of a file's data, ext4 uses a data structure common
263 * to UNIX filesystems - a tree of pointers anchored in the inode, with
264 * data blocks at the leaves and indirect blocks in intermediate nodes.
265 * This function translates the block number into a path in that tree -
266 * the return value is the path length and @offsets[n] is the offset of
267 * the pointer to the (n+1)th node in the nth one. If @block is out of range
268 * (negative or too large), a warning is printed and zero is returned.
269 *
270 * Note: function doesn't find node addresses, so no IO is needed. All
271 * we need to know is the capacity of indirect blocks (taken from the
272 * inode->i_sb).
273 */
274
275/*
276 * Portability note: the last comparison (check that we fit into triple
277 * indirect block) is spelled differently, because otherwise on an
278 * architecture with 32-bit longs and 8Kb pages we might get into trouble
279 * if our filesystem had 8Kb blocks. We might use long long, but that would
280 * kill us on x86. Oh, well, at least the sign propagation does not matter -
281 * i_block would have to be negative in the very beginning, so we would not
282 * get there at all.
283 */
284
285static int ext4_block_to_path(struct inode *inode,
286 long i_block, int offsets[4], int *boundary)
287{
288 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
289 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
290 const long direct_blocks = EXT4_NDIR_BLOCKS,
291 indirect_blocks = ptrs,
292 double_blocks = (1 << (ptrs_bits * 2));
293 int n = 0;
294 int final = 0;
295
296 if (i_block < 0) {
297 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
298 } else if (i_block < direct_blocks) {
299 offsets[n++] = i_block;
300 final = direct_blocks;
301 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
302 offsets[n++] = EXT4_IND_BLOCK;
303 offsets[n++] = i_block;
304 final = ptrs;
305 } else if ((i_block -= indirect_blocks) < double_blocks) {
306 offsets[n++] = EXT4_DIND_BLOCK;
307 offsets[n++] = i_block >> ptrs_bits;
308 offsets[n++] = i_block & (ptrs - 1);
309 final = ptrs;
310 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
311 offsets[n++] = EXT4_TIND_BLOCK;
312 offsets[n++] = i_block >> (ptrs_bits * 2);
313 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
314 offsets[n++] = i_block & (ptrs - 1);
315 final = ptrs;
316 } else {
317 ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
318 }
319 if (boundary)
320 *boundary = final - 1 - (i_block & (ptrs - 1));
321 return n;
322}
323
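In numbers: with 4KB blocks each indirect block holds 1024 pointers, so logical blocks 0-11 are reached directly from the inode, 12-1035 through the single indirect block, 1036-1049611 through the double indirect block, and the remainder through the triple indirect block. A minimal userspace sketch of the same path computation (illustrative only; it hard-codes the 12 direct slots and a 4KB block size, and skips the boundary and out-of-range handling the kernel version does):

#include <stdio.h>

#define NDIR_BLOCKS	12	/* direct slots in the inode, as in EXT4_NDIR_BLOCKS */

static int block_to_path(long i_block, int ptrs_bits, int offsets[4])
{
	long ptrs = 1L << ptrs_bits;		/* pointers per indirect block */
	long double_blocks = 1L << (ptrs_bits * 2);
	int n = 0;

	if (i_block < NDIR_BLOCKS) {
		offsets[n++] = i_block;
	} else if ((i_block -= NDIR_BLOCKS) < ptrs) {
		offsets[n++] = NDIR_BLOCKS;		/* stand-in for EXT4_IND_BLOCK */
		offsets[n++] = i_block;
	} else if ((i_block -= ptrs) < double_blocks) {
		offsets[n++] = NDIR_BLOCKS + 1;		/* stand-in for EXT4_DIND_BLOCK */
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
	} else {
		i_block -= double_blocks;
		offsets[n++] = NDIR_BLOCKS + 2;		/* stand-in for EXT4_TIND_BLOCK */
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
	}
	return n;
}

int main(void)
{
	int offsets[4];
	/* 4KB blocks: 1024 four-byte pointers per block, i.e. ptrs_bits = 10 */
	int depth = block_to_path(2000, 10, offsets);
	int i;

	printf("logical block 2000 -> depth %d, path:", depth);	/* depth 3: 13, 0, 964 */
	for (i = 0; i < depth; i++)
		printf(" %d", offsets[i]);
	printf("\n");
	return 0;
}
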
324/**
325 * ext4_get_branch - read the chain of indirect blocks leading to data
326 * @inode: inode in question
327 * @depth: depth of the chain (1 - direct pointer, etc.)
328 * @offsets: offsets of pointers in inode/indirect blocks
329 * @chain: place to store the result
330 * @err: here we store the error value
331 *
332 * Function fills the array of triples <key, p, bh> and returns %NULL
333 * if everything went OK or the pointer to the last filled triple
334 * (incomplete one) otherwise. Upon the return chain[i].key contains
335 * the number of (i+1)-th block in the chain (as it is stored in memory,
336 * i.e. little-endian 32-bit), chain[i].p contains the address of that
337 * number (it points into struct inode for i==0 and into the bh->b_data
338 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
339 * block for i>0 and NULL for i==0. In other words, it holds the block
340 * numbers of the chain, addresses they were taken from (and where we can
341 * verify that chain did not change) and buffer_heads hosting these
342 * numbers.
343 *
344 * Function stops when it stumbles upon zero pointer (absent block)
345 * (pointer to last triple returned, *@err == 0)
346 * or when it gets an IO error reading an indirect block
347 * (ditto, *@err == -EIO)
348 * or when it notices that chain had been changed while it was reading
349 * (ditto, *@err == -EAGAIN)
350 * or when it reads all @depth-1 indirect blocks successfully and finds
351 * the whole chain, all the way to the data (returns %NULL, *err == 0).
352 */
353static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
354 Indirect chain[4], int *err)
355{
356 struct super_block *sb = inode->i_sb;
357 Indirect *p = chain;
358 struct buffer_head *bh;
359
360 *err = 0;
361 /* i_data is not going away, no lock needed */
362 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
363 if (!p->key)
364 goto no_block;
365 while (--depth) {
366 bh = sb_bread(sb, le32_to_cpu(p->key));
367 if (!bh)
368 goto failure;
369 /* Reader: pointers */
370 if (!verify_chain(chain, p))
371 goto changed;
372 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
373 /* Reader: end */
374 if (!p->key)
375 goto no_block;
376 }
377 return NULL;
378
379changed:
380 brelse(bh);
381 *err = -EAGAIN;
382 goto no_block;
383failure:
384 *err = -EIO;
385no_block:
386 return p;
387}
388
389/**
390 * ext4_find_near - find a place for allocation with sufficient locality
391 * @inode: owner
392 * @ind: descriptor of indirect block.
393 *
394 * This function returns the preferred place for block allocation.
395 * It is used when the heuristic for sequential allocation fails.
396 * The rules are:
397 * + if there is a block to the left of our position - allocate near it.
398 * + if pointer will live in indirect block - allocate near that block.
399 * + if pointer will live in inode - allocate in the same
400 * cylinder group.
401 *
402 * In the latter case we colour the starting block by the caller's PID to
403 * prevent it from clashing with concurrent allocations for a different inode
404 * in the same block group. The PID is used here so that functionally related
405 * files will be close-by on-disk.
406 *
407 * Caller must make sure that @ind is valid and will stay that way.
408 */
409static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
410{
411 struct ext4_inode_info *ei = EXT4_I(inode);
412 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
413 __le32 *p;
414 ext4_fsblk_t bg_start;
415 ext4_grpblk_t colour;
416
417 /* Try to find previous block */
418 for (p = ind->p - 1; p >= start; p--) {
419 if (*p)
420 return le32_to_cpu(*p);
421 }
422
423 /* No such thing, so let's try location of indirect block */
424 if (ind->bh)
425 return ind->bh->b_blocknr;
426
427 /*
428 * It is going to be referred to from the inode itself? OK, just put it
429 * into the same cylinder group then.
430 */
431 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
432 colour = (current->pid % 16) *
433 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
434 return bg_start + colour;
435}
436
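For the last fallback above, the colour simply spreads concurrent allocators across sixteen slices of the block group, so unrelated processes writing into the same group tend not to interleave their blocks. A standalone sketch of the arithmetic (the group size, group start and PID are all assumed example values):

#include <stdio.h>

int main(void)
{
	unsigned long blocks_per_group = 32768;	/* assumed example value */
	unsigned long bg_start = 1048576;	/* hypothetical first block of the group */
	int pid = 4242;				/* hypothetical caller PID */

	unsigned long colour = (pid % 16) * (blocks_per_group / 16);

	printf("allocation goal: %lu (offset %lu into the group)\n",
	       bg_start + colour, colour);
	return 0;
}
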
437/**
438 * ext4_find_goal - find a prefered place for allocation.
439 * @inode: owner
440 * @block: block we want
441 * @chain: chain of indirect blocks
442 * @partial: pointer to the last triple within a chain
443 *
444 * Normally this function finds the preferred place for block allocation
445 * and returns it; it falls back to ext4_find_near() when the sequential
446 * allocation heuristic does not apply.
447 */
448
449static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
450 Indirect chain[4], Indirect *partial)
451{
452 struct ext4_block_alloc_info *block_i;
453
454 block_i = EXT4_I(inode)->i_block_alloc_info;
455
456 /*
457 * try the heuristic for sequential allocation,
458 * failing that at least try to get decent locality.
459 */
460 if (block_i && (block == block_i->last_alloc_logical_block + 1)
461 && (block_i->last_alloc_physical_block != 0)) {
462 return block_i->last_alloc_physical_block + 1;
463 }
464
465 return ext4_find_near(inode, partial);
466}
467
468/**
469 * ext4_blks_to_allocate: Look up the block map and count the number
470 * of direct blocks that need to be allocated for the given branch.
471 *
472 * @branch: chain of indirect blocks
473 * @k: number of blocks needed for indirect blocks
474 * @blks: number of data blocks to be mapped.
475 * @blocks_to_boundary: the offset in the indirect block
476 *
477 * return the total number of blocks to be allocated, including the
478 * direct and indirect blocks.
479 */
480static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
481 int blocks_to_boundary)
482{
483 unsigned long count = 0;
484
485 /*
486	 * Simple case: the [t,d]indirect block(s) have not been allocated yet,
487	 * so clearly the blocks on that path have not been allocated either.
488 */
489 if (k > 0) {
490 /* right now we don't handle cross boundary allocation */
491 if (blks < blocks_to_boundary + 1)
492 count += blks;
493 else
494 count += blocks_to_boundary + 1;
495 return count;
496 }
497
498 count++;
499 while (count < blks && count <= blocks_to_boundary &&
500 le32_to_cpu(*(branch[0].p + count)) == 0) {
501 count++;
502 }
503 return count;
504}
505
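For example, when k > 0 (some [td]indirect blocks on the path are still missing) the count above is simply min(blks, blocks_to_boundary + 1), because the allocation never crosses the indirect-block boundary. A tiny arithmetic check of that case with assumed example values:

#include <stdio.h>

int main(void)
{
	unsigned long blks = 8;		/* data blocks requested (example) */
	int blocks_to_boundary = 3;	/* slots left before the boundary (example) */
	unsigned long count;

	/* mirrors the k > 0 branch above: do not cross the boundary */
	count = blks < (unsigned long)(blocks_to_boundary + 1) ?
		blks : (unsigned long)(blocks_to_boundary + 1);

	printf("direct blocks counted this round: %lu\n", count);	/* prints 4 */
	return 0;
}
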
506/**
507 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
508 * @indirect_blks: the number of blocks that need to be allocated for indirect
509 * blocks
510 *
511 * @new_blocks: on return it will store the new block numbers for
512 * the indirect blocks(if needed) and the first direct block,
513 * @blks: on return it will store the total number of allocated
514 * direct blocks
515 */
516static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
517 ext4_fsblk_t goal, int indirect_blks, int blks,
518 ext4_fsblk_t new_blocks[4], int *err)
519{
520 int target, i;
521 unsigned long count = 0;
522 int index = 0;
523 ext4_fsblk_t current_block = 0;
524 int ret = 0;
525
526 /*
527 * Here we try to allocate the requested multiple blocks at once,
528 * on a best-effort basis.
529 * To build a branch, we should allocate blocks for
530	 * the indirect blocks (if not allocated yet), and at least
531	 * the first direct block of this branch. That's the
532	 * minimum number of blocks we need to allocate (the required count).
533 */
534 target = blks + indirect_blks;
535
536 while (1) {
537 count = target;
538 /* allocating blocks for indirect blocks and direct blocks */
539 current_block = ext4_new_blocks(handle,inode,goal,&count,err);
540 if (*err)
541 goto failed_out;
542
543 target -= count;
544 /* allocate blocks for indirect blocks */
545 while (index < indirect_blks && count) {
546 new_blocks[index++] = current_block++;
547 count--;
548 }
549
550 if (count > 0)
551 break;
552 }
553
554 /* save the new block number for the first direct block */
555 new_blocks[index] = current_block;
556
557 /* total number of blocks allocated for direct blocks */
558 ret = count;
559 *err = 0;
560 return ret;
561failed_out:
562 for (i = 0; i <index; i++)
563 ext4_free_blocks(handle, inode, new_blocks[i], 1);
564 return ret;
565}
566
567/**
568 * ext4_alloc_branch - allocate and set up a chain of blocks.
569 * @inode: owner
570 * @indirect_blks: number of allocated indirect blocks
571 * @blks: number of allocated direct blocks
572 * @offsets: offsets (in the blocks) to store the pointers to next.
573 * @branch: place to store the chain in.
574 *
575 * This function allocates blocks, zeroes out all but the last one,
576 * links them into chain and (if we are synchronous) writes them to disk.
577 * In other words, it prepares a branch that can be spliced onto the
578 * inode. It stores the information about that chain in the branch[], in
579 * the same format as ext4_get_branch() would do. We are calling it after
580 * we had read the existing part of chain and partial points to the last
581 * triple of that (one with zero ->key). Upon the exit we have the same
582 * picture as after the successful ext4_get_block(), except that in one
583 * place chain is disconnected - *branch->p is still zero (we did not
584 * set the last link), but branch->key contains the number that should
585 * be placed into *branch->p to fill that gap.
586 *
587 * If allocation fails we free all blocks we've allocated (and forget
588 * their buffer_heads) and return the error value from the failed
589 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
590 * as described above and return 0.
591 */
592static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
593 int indirect_blks, int *blks, ext4_fsblk_t goal,
594 int *offsets, Indirect *branch)
595{
596 int blocksize = inode->i_sb->s_blocksize;
597 int i, n = 0;
598 int err = 0;
599 struct buffer_head *bh;
600 int num;
601 ext4_fsblk_t new_blocks[4];
602 ext4_fsblk_t current_block;
603
604 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
605 *blks, new_blocks, &err);
606 if (err)
607 return err;
608
609 branch[0].key = cpu_to_le32(new_blocks[0]);
610 /*
611 * metadata blocks and data blocks are allocated.
612 */
613 for (n = 1; n <= indirect_blks; n++) {
614 /*
615 * Get buffer_head for parent block, zero it out
616 * and set the pointer to new one, then send
617 * parent to disk.
618 */
619 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
620 branch[n].bh = bh;
621 lock_buffer(bh);
622 BUFFER_TRACE(bh, "call get_create_access");
623 err = ext4_journal_get_create_access(handle, bh);
624 if (err) {
625 unlock_buffer(bh);
626 brelse(bh);
627 goto failed;
628 }
629
630 memset(bh->b_data, 0, blocksize);
631 branch[n].p = (__le32 *) bh->b_data + offsets[n];
632 branch[n].key = cpu_to_le32(new_blocks[n]);
633 *branch[n].p = branch[n].key;
634 if ( n == indirect_blks) {
635 current_block = new_blocks[n];
636 /*
637 * End of chain, update the last new metablock of
638			 * the chain to point to the newly allocated
639			 * data block numbers
640 */
641 for (i=1; i < num; i++)
642 *(branch[n].p + i) = cpu_to_le32(++current_block);
643 }
644 BUFFER_TRACE(bh, "marking uptodate");
645 set_buffer_uptodate(bh);
646 unlock_buffer(bh);
647
648 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
649 err = ext4_journal_dirty_metadata(handle, bh);
650 if (err)
651 goto failed;
652 }
653 *blks = num;
654 return err;
655failed:
656 /* Allocation failed, free what we already allocated */
657 for (i = 1; i <= n ; i++) {
658 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
659 ext4_journal_forget(handle, branch[i].bh);
660 }
661 for (i = 0; i <indirect_blks; i++)
662 ext4_free_blocks(handle, inode, new_blocks[i], 1);
663
664 ext4_free_blocks(handle, inode, new_blocks[i], num);
665
666 return err;
667}
668
669/**
670 * ext4_splice_branch - splice the allocated branch onto inode.
671 * @inode: owner
672 * @block: (logical) number of block we are adding
673 * @chain: chain of indirect blocks (with a missing link - see
674 * ext4_alloc_branch)
675 * @where: location of missing link
676 * @num: number of indirect blocks we are adding
677 * @blks: number of direct blocks we are adding
678 *
679 * This function fills the missing link and does all housekeeping needed in
680 * inode (->i_blocks, etc.). In case of success we end up with the full
681 * chain to new block and return 0.
682 */
683static int ext4_splice_branch(handle_t *handle, struct inode *inode,
684 long block, Indirect *where, int num, int blks)
685{
686 int i;
687 int err = 0;
688 struct ext4_block_alloc_info *block_i;
689 ext4_fsblk_t current_block;
690
691 block_i = EXT4_I(inode)->i_block_alloc_info;
692 /*
693 * If we're splicing into a [td]indirect block (as opposed to the
694 * inode) then we need to get write access to the [td]indirect block
695 * before the splice.
696 */
697 if (where->bh) {
698 BUFFER_TRACE(where->bh, "get_write_access");
699 err = ext4_journal_get_write_access(handle, where->bh);
700 if (err)
701 goto err_out;
702 }
703 /* That's it */
704
705 *where->p = where->key;
706
707 /*
708	 * Update the host buffer_head or inode to point to the just-allocated
709	 * direct blocks
710 */
711 if (num == 0 && blks > 1) {
712 current_block = le32_to_cpu(where->key) + 1;
713 for (i = 1; i < blks; i++)
714 *(where->p + i ) = cpu_to_le32(current_block++);
715 }
716
717 /*
718 * update the most recently allocated logical & physical block
719	 * in i_block_alloc_info, to help find the proper goal block for the next
720 * allocation
721 */
722 if (block_i) {
723 block_i->last_alloc_logical_block = block + blks - 1;
724 block_i->last_alloc_physical_block =
725 le32_to_cpu(where[num].key) + blks - 1;
726 }
727
728 /* We are done with atomic stuff, now do the rest of housekeeping */
729
730 inode->i_ctime = CURRENT_TIME_SEC;
731 ext4_mark_inode_dirty(handle, inode);
732
733 /* had we spliced it onto indirect block? */
734 if (where->bh) {
735 /*
736 * If we spliced it onto an indirect block, we haven't
737 * altered the inode. Note however that if it is being spliced
738 * onto an indirect block at the very end of the file (the
739 * file is growing) then we *will* alter the inode to reflect
740 * the new i_size. But that is not done here - it is done in
741 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
742 */
743 jbd_debug(5, "splicing indirect only\n");
744 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
745 err = ext4_journal_dirty_metadata(handle, where->bh);
746 if (err)
747 goto err_out;
748 } else {
749 /*
750 * OK, we spliced it into the inode itself on a direct block.
751 * Inode was dirtied above.
752 */
753 jbd_debug(5, "splicing direct\n");
754 }
755 return err;
756
757err_out:
758 for (i = 1; i <= num; i++) {
759 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
760 ext4_journal_forget(handle, where[i].bh);
761 ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
762 }
763 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
764
765 return err;
766}
767
768/*
769 * Allocation strategy is simple: if we have to allocate something, we will
770 * have to go the whole way to leaf. So let's do it before attaching anything
771 * to tree, set linkage between the newborn blocks, write them if sync is
772 * required, recheck the path, free and repeat if check fails, otherwise
773 * set the last missing link (that will protect us from any truncate-generated
774 * removals - all blocks on the path are immune now) and possibly force the
775 * write on the parent block.
776 * That has a nice additional property: no special recovery from the failed
777 * allocations is needed - we simply release blocks and do not touch anything
778 * reachable from inode.
779 *
780 * `handle' can be NULL if create == 0.
781 *
782 * The BKL may not be held on entry here. Be sure to take it early.
783 * return > 0, # of blocks mapped or allocated.
784 * return = 0, if plain lookup failed.
785 * return < 0, error case.
786 */
787int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
788 sector_t iblock, unsigned long maxblocks,
789 struct buffer_head *bh_result,
790 int create, int extend_disksize)
791{
792 int err = -EIO;
793 int offsets[4];
794 Indirect chain[4];
795 Indirect *partial;
796 ext4_fsblk_t goal;
797 int indirect_blks;
798 int blocks_to_boundary = 0;
799 int depth;
800 struct ext4_inode_info *ei = EXT4_I(inode);
801 int count = 0;
802 ext4_fsblk_t first_block = 0;
803
804
805 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
806 J_ASSERT(handle != NULL || create == 0);
807 depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
808
809 if (depth == 0)
810 goto out;
811
812 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
813
814 /* Simplest case - block found, no allocation needed */
815 if (!partial) {
816 first_block = le32_to_cpu(chain[depth - 1].key);
817 clear_buffer_new(bh_result);
818 count++;
819 /*map more blocks*/
820 while (count < maxblocks && count <= blocks_to_boundary) {
821 ext4_fsblk_t blk;
822
823 if (!verify_chain(chain, partial)) {
824 /*
825 * Indirect block might be removed by
826 * truncate while we were reading it.
827 * Handling of that case: forget what we've
828 * got now. Flag the err as EAGAIN, so it
829 * will reread.
830 */
831 err = -EAGAIN;
832 count = 0;
833 break;
834 }
835 blk = le32_to_cpu(*(chain[depth-1].p + count));
836
837 if (blk == first_block + count)
838 count++;
839 else
840 break;
841 }
842 if (err != -EAGAIN)
843 goto got_it;
844 }
845
846 /* Next simple case - plain lookup or failed read of indirect block */
847 if (!create || err == -EIO)
848 goto cleanup;
849
850 mutex_lock(&ei->truncate_mutex);
851
852 /*
853 * If the indirect block is missing while we are reading
854	 * the chain (ext4_get_branch() returns -EAGAIN), or
855	 * if the chain has been changed after we grabbed the semaphore
856	 * (either because another process truncated this branch, or
857	 * another get_block allocated this branch), re-read the chain to see
858	 * whether the requested block has been allocated or not.
859 *
860 * Since we already block the truncate/other get_block
861 * at this point, we will have the current copy of the chain when we
862 * splice the branch into the tree.
863 */
864 if (err == -EAGAIN || !verify_chain(chain, partial)) {
865 while (partial > chain) {
866 brelse(partial->bh);
867 partial--;
868 }
869 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
870 if (!partial) {
871 count++;
872 mutex_unlock(&ei->truncate_mutex);
873 if (err)
874 goto cleanup;
875 clear_buffer_new(bh_result);
876 goto got_it;
877 }
878 }
879
880 /*
881 * Okay, we need to do block allocation. Lazily initialize the block
882 * allocation info here if necessary
883 */
884 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
885 ext4_init_block_alloc_info(inode);
886
887 goal = ext4_find_goal(inode, iblock, chain, partial);
888
889	/* the number of blocks we need to allocate for [d,t]indirect blocks */
890 indirect_blks = (chain + depth) - partial - 1;
891
892 /*
893	 * Next look up the indirect map to count the total number of
894 * direct blocks to allocate for this branch.
895 */
896 count = ext4_blks_to_allocate(partial, indirect_blks,
897 maxblocks, blocks_to_boundary);
898 /*
899 * Block out ext4_truncate while we alter the tree
900 */
901 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
902 offsets + (partial - chain), partial);
903
904 /*
905 * The ext4_splice_branch call will free and forget any buffers
906 * on the new chain if there is a failure, but that risks using
907 * up transaction credits, especially for bitmaps where the
908 * credits cannot be returned. Can we handle this somehow? We
909 * may need to return -EAGAIN upwards in the worst case. --sct
910 */
911 if (!err)
912 err = ext4_splice_branch(handle, inode, iblock,
913 partial, indirect_blks, count);
914 /*
915 * i_disksize growing is protected by truncate_mutex. Don't forget to
916 * protect it if you're about to implement concurrent
917 * ext4_get_block() -bzzz
918 */
919 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
920 ei->i_disksize = inode->i_size;
921 mutex_unlock(&ei->truncate_mutex);
922 if (err)
923 goto cleanup;
924
925 set_buffer_new(bh_result);
926got_it:
927 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
928 if (count > blocks_to_boundary)
929 set_buffer_boundary(bh_result);
930 err = count;
931 /* Clean up and exit */
932 partial = chain + depth - 1; /* the whole chain */
933cleanup:
934 while (partial > chain) {
935 BUFFER_TRACE(partial->bh, "call brelse");
936 brelse(partial->bh);
937 partial--;
938 }
939 BUFFER_TRACE(bh_result, "returned");
940out:
941 return err;
942}
943
944#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
945
946static int ext4_get_block(struct inode *inode, sector_t iblock,
947 struct buffer_head *bh_result, int create)
948{
949 handle_t *handle = journal_current_handle();
950 int ret = 0;
951 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
952
953 if (!create)
954 goto get_block; /* A read */
955
956 if (max_blocks == 1)
957 goto get_block; /* A single block get */
958
959 if (handle->h_transaction->t_state == T_LOCKED) {
960 /*
961 * Huge direct-io writes can hold off commits for long
962 * periods of time. Let this commit run.
963 */
964 ext4_journal_stop(handle);
965 handle = ext4_journal_start(inode, DIO_CREDITS);
966 if (IS_ERR(handle))
967 ret = PTR_ERR(handle);
968 goto get_block;
969 }
970
971 if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
972 /*
973 * Getting low on buffer credits...
974 */
975 ret = ext4_journal_extend(handle, DIO_CREDITS);
976 if (ret > 0) {
977 /*
978 * Couldn't extend the transaction. Start a new one.
979 */
980 ret = ext4_journal_restart(handle, DIO_CREDITS);
981 }
982 }
983
984get_block:
985 if (ret == 0) {
986 ret = ext4_get_blocks_wrap(handle, inode, iblock,
987 max_blocks, bh_result, create, 0);
988 if (ret > 0) {
989 bh_result->b_size = (ret << inode->i_blkbits);
990 ret = 0;
991 }
992 }
993 return ret;
994}
995
996/*
997 * `handle' can be NULL if create is zero
998 */
999struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1000 long block, int create, int *errp)
1001{
1002 struct buffer_head dummy;
1003 int fatal = 0, err;
1004
1005 J_ASSERT(handle != NULL || create == 0);
1006
1007 dummy.b_state = 0;
1008 dummy.b_blocknr = -1000;
1009 buffer_trace_init(&dummy.b_history);
1010 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1011 &dummy, create, 1);
1012 /*
1013	 * ext4_get_blocks_handle() returns the number of blocks
1014	 * mapped, or 0 in the case of a hole.
1015 */
1016 if (err > 0) {
1017 if (err > 1)
1018 WARN_ON(1);
1019 err = 0;
1020 }
1021 *errp = err;
1022 if (!err && buffer_mapped(&dummy)) {
1023 struct buffer_head *bh;
1024 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1025 if (!bh) {
1026 *errp = -EIO;
1027 goto err;
1028 }
1029 if (buffer_new(&dummy)) {
1030 J_ASSERT(create != 0);
1031 J_ASSERT(handle != 0);
1032
1033 /*
1034 * Now that we do not always journal data, we should
1035 * keep in mind whether this should always journal the
1036 * new buffer as metadata. For now, regular file
1037 * writes use ext4_get_block instead, so it's not a
1038 * problem.
1039 */
1040 lock_buffer(bh);
1041 BUFFER_TRACE(bh, "call get_create_access");
1042 fatal = ext4_journal_get_create_access(handle, bh);
1043 if (!fatal && !buffer_uptodate(bh)) {
1044 memset(bh->b_data,0,inode->i_sb->s_blocksize);
1045 set_buffer_uptodate(bh);
1046 }
1047 unlock_buffer(bh);
1048 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1049 err = ext4_journal_dirty_metadata(handle, bh);
1050 if (!fatal)
1051 fatal = err;
1052 } else {
1053 BUFFER_TRACE(bh, "not a new buffer");
1054 }
1055 if (fatal) {
1056 *errp = fatal;
1057 brelse(bh);
1058 bh = NULL;
1059 }
1060 return bh;
1061 }
1062err:
1063 return NULL;
1064}
1065
1066struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1067 int block, int create, int *err)
1068{
1069 struct buffer_head * bh;
1070
1071 bh = ext4_getblk(handle, inode, block, create, err);
1072 if (!bh)
1073 return bh;
1074 if (buffer_uptodate(bh))
1075 return bh;
1076 ll_rw_block(READ_META, 1, &bh);
1077 wait_on_buffer(bh);
1078 if (buffer_uptodate(bh))
1079 return bh;
1080 put_bh(bh);
1081 *err = -EIO;
1082 return NULL;
1083}
1084
1085static int walk_page_buffers( handle_t *handle,
1086 struct buffer_head *head,
1087 unsigned from,
1088 unsigned to,
1089 int *partial,
1090 int (*fn)( handle_t *handle,
1091 struct buffer_head *bh))
1092{
1093 struct buffer_head *bh;
1094 unsigned block_start, block_end;
1095 unsigned blocksize = head->b_size;
1096 int err, ret = 0;
1097 struct buffer_head *next;
1098
1099 for ( bh = head, block_start = 0;
1100 ret == 0 && (bh != head || !block_start);
1101 block_start = block_end, bh = next)
1102 {
1103 next = bh->b_this_page;
1104 block_end = block_start + blocksize;
1105 if (block_end <= from || block_start >= to) {
1106 if (partial && !buffer_uptodate(bh))
1107 *partial = 1;
1108 continue;
1109 }
1110 err = (*fn)(handle, bh);
1111 if (!ret)
1112 ret = err;
1113 }
1114 return ret;
1115}
1116
1117/*
1118 * To preserve ordering, it is essential that the hole instantiation and
1119 * the data write be encapsulated in a single transaction. We cannot
1120 * close off a transaction and start a new one between the ext4_get_block()
1121 * and the commit_write(). So doing the jbd2_journal_start at the start of
1122 * prepare_write() is the right place.
1123 *
1124 * Also, this function can nest inside ext4_writepage() ->
1125 * block_write_full_page(). In that case, we *know* that ext4_writepage()
1126 * has generated enough buffer credits to do the whole page. So we won't
1127 * block on the journal in that case, which is good, because the caller may
1128 * be PF_MEMALLOC.
1129 *
1130 * By accident, ext4 can be reentered when a transaction is open via
1131 * quota file writes. If we were to commit the transaction while thus
1132 * reentered, there can be a deadlock - we would be holding a quota
1133 * lock, and the commit would never complete if another thread had a
1134 * transaction open and was blocking on the quota lock - a ranking
1135 * violation.
1136 *
1137 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
1138 * will _not_ run commit under these circumstances because handle->h_ref
1139 * is elevated. We'll still have enough credits for the tiny quotafile
1140 * write.
1141 */
1142static int do_journal_get_write_access(handle_t *handle,
1143 struct buffer_head *bh)
1144{
1145 if (!buffer_mapped(bh) || buffer_freed(bh))
1146 return 0;
1147 return ext4_journal_get_write_access(handle, bh);
1148}
1149
1150static int ext4_prepare_write(struct file *file, struct page *page,
1151 unsigned from, unsigned to)
1152{
1153 struct inode *inode = page->mapping->host;
1154 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1155 handle_t *handle;
1156 int retries = 0;
1157
1158retry:
1159 handle = ext4_journal_start(inode, needed_blocks);
1160 if (IS_ERR(handle)) {
1161 ret = PTR_ERR(handle);
1162 goto out;
1163 }
1164 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1165 ret = nobh_prepare_write(page, from, to, ext4_get_block);
1166 else
1167 ret = block_prepare_write(page, from, to, ext4_get_block);
1168 if (ret)
1169 goto prepare_write_failed;
1170
1171 if (ext4_should_journal_data(inode)) {
1172 ret = walk_page_buffers(handle, page_buffers(page),
1173 from, to, NULL, do_journal_get_write_access);
1174 }
1175prepare_write_failed:
1176 if (ret)
1177 ext4_journal_stop(handle);
1178 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1179 goto retry;
1180out:
1181 return ret;
1182}
1183
1184int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1185{
1186 int err = jbd2_journal_dirty_data(handle, bh);
1187 if (err)
1188 ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1189 bh, handle,err);
1190 return err;
1191}
1192
1193/* For commit_write() in data=journal mode */
1194static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1195{
1196 if (!buffer_mapped(bh) || buffer_freed(bh))
1197 return 0;
1198 set_buffer_uptodate(bh);
1199 return ext4_journal_dirty_metadata(handle, bh);
1200}
1201
1202/*
1203 * We need to pick up the new inode size which generic_commit_write gave us;
1204 * `file' can be NULL - eg, when called from page_symlink().
1205 *
1206 * ext4 never places buffers on inode->i_mapping->private_list. metadata
1207 * buffers are managed internally.
1208 */
1209static int ext4_ordered_commit_write(struct file *file, struct page *page,
1210 unsigned from, unsigned to)
1211{
1212 handle_t *handle = ext4_journal_current_handle();
1213 struct inode *inode = page->mapping->host;
1214 int ret = 0, ret2;
1215
1216 ret = walk_page_buffers(handle, page_buffers(page),
1217 from, to, NULL, ext4_journal_dirty_data);
1218
1219 if (ret == 0) {
1220 /*
1221 * generic_commit_write() will run mark_inode_dirty() if i_size
1222		 * changes. So let's piggyback the i_disksize update into
1223		 * that mark_inode_dirty() call.
1224 */
1225 loff_t new_i_size;
1226
1227 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1228 if (new_i_size > EXT4_I(inode)->i_disksize)
1229 EXT4_I(inode)->i_disksize = new_i_size;
1230 ret = generic_commit_write(file, page, from, to);
1231 }
1232 ret2 = ext4_journal_stop(handle);
1233 if (!ret)
1234 ret = ret2;
1235 return ret;
1236}
1237
1238static int ext4_writeback_commit_write(struct file *file, struct page *page,
1239 unsigned from, unsigned to)
1240{
1241 handle_t *handle = ext4_journal_current_handle();
1242 struct inode *inode = page->mapping->host;
1243 int ret = 0, ret2;
1244 loff_t new_i_size;
1245
1246 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1247 if (new_i_size > EXT4_I(inode)->i_disksize)
1248 EXT4_I(inode)->i_disksize = new_i_size;
1249
1250 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1251 ret = nobh_commit_write(file, page, from, to);
1252 else
1253 ret = generic_commit_write(file, page, from, to);
1254
1255 ret2 = ext4_journal_stop(handle);
1256 if (!ret)
1257 ret = ret2;
1258 return ret;
1259}
1260
1261static int ext4_journalled_commit_write(struct file *file,
1262 struct page *page, unsigned from, unsigned to)
1263{
1264 handle_t *handle = ext4_journal_current_handle();
1265 struct inode *inode = page->mapping->host;
1266 int ret = 0, ret2;
1267 int partial = 0;
1268 loff_t pos;
1269
1270 /*
1271 * Here we duplicate the generic_commit_write() functionality
1272 */
1273 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1274
1275 ret = walk_page_buffers(handle, page_buffers(page), from,
1276 to, &partial, commit_write_fn);
1277 if (!partial)
1278 SetPageUptodate(page);
1279 if (pos > inode->i_size)
1280 i_size_write(inode, pos);
1281 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1282 if (inode->i_size > EXT4_I(inode)->i_disksize) {
1283 EXT4_I(inode)->i_disksize = inode->i_size;
1284 ret2 = ext4_mark_inode_dirty(handle, inode);
1285 if (!ret)
1286 ret = ret2;
1287 }
1288 ret2 = ext4_journal_stop(handle);
1289 if (!ret)
1290 ret = ret2;
1291 return ret;
1292}
1293
1294/*
1295 * bmap() is special. It gets used by applications such as lilo and by
1296 * the swapper to find the on-disk block of a specific piece of data.
1297 *
1298 * Naturally, this is dangerous if the block concerned is still in the
1299 * journal. If somebody makes a swapfile on an ext4 data-journaling
1300 * filesystem and enables swap, then they may get a nasty shock when the
1301 * data getting swapped to that swapfile suddenly gets overwritten by
1302 * the original zeros written out previously to the journal and
1303 * awaiting writeback in the kernel's buffer cache.
1304 *
1305 * So, if we see any bmap calls here on a modified, data-journaled file,
1306 * take extra steps to flush any blocks which might be in the cache.
1307 */
1308static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1309{
1310 struct inode *inode = mapping->host;
1311 journal_t *journal;
1312 int err;
1313
1314 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1315 /*
1316 * This is a REALLY heavyweight approach, but the use of
1317 * bmap on dirty files is expected to be extremely rare:
1318 * only if we run lilo or swapon on a freshly made file
1319 * do we expect this to happen.
1320 *
1321 * (bmap requires CAP_SYS_RAWIO so this does not
1322 * represent an unprivileged user DOS attack --- we'd be
1323 * in trouble if mortal users could trigger this path at
1324 * will.)
1325 *
1326 * NB. EXT4_STATE_JDATA is not set on files other than
1327 * regular files. If somebody wants to bmap a directory
1328 * or symlink and gets confused because the buffer
1329 * hasn't yet been flushed to disk, they deserve
1330 * everything they get.
1331 */
1332
1333 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
1334 journal = EXT4_JOURNAL(inode);
1335 jbd2_journal_lock_updates(journal);
1336 err = jbd2_journal_flush(journal);
1337 jbd2_journal_unlock_updates(journal);
1338
1339 if (err)
1340 return 0;
1341 }
1342
1343 return generic_block_bmap(mapping,block,ext4_get_block);
1344}
1345
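/*
 * For illustration only (not part of this patch): the bmap path above is
 * what backs the FIBMAP ioctl that boot loaders use to learn on-disk block
 * numbers.  A minimal userspace sketch; FIBMAP needs CAP_SYS_RAWIO, and the
 * file name here is just an assumed example.
 */
#include <fcntl.h>
#include <linux/fs.h>		/* FIBMAP */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int block = 0;		/* in: logical block of the file, out: fs block */
	int fd = open("/boot/vmlinuz", O_RDONLY);

	if (fd < 0 || ioctl(fd, FIBMAP, &block) < 0) {
		perror("FIBMAP");
		return 1;
	}
	printf("logical block 0 is filesystem block %d\n", block);
	close(fd);
	return 0;
}
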
1346static int bget_one(handle_t *handle, struct buffer_head *bh)
1347{
1348 get_bh(bh);
1349 return 0;
1350}
1351
1352static int bput_one(handle_t *handle, struct buffer_head *bh)
1353{
1354 put_bh(bh);
1355 return 0;
1356}
1357
1358static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1359{
1360 if (buffer_mapped(bh))
1361 return ext4_journal_dirty_data(handle, bh);
1362 return 0;
1363}
1364
1365/*
1366 * Note that we always start a transaction even if we're not journalling
1367 * data. This is to preserve ordering: any hole instantiation within
1368 * __block_write_full_page -> ext4_get_block() should be journalled
1369 * along with the data so we don't crash and then get metadata which
1370 * refers to old data.
1371 *
1372 * In all journalling modes block_write_full_page() will start the I/O.
1373 *
1374 * Problem:
1375 *
1376 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1377 * ext4_writepage()
1378 *
1379 * Similar for:
1380 *
1381 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1382 *
1383 * Same applies to ext4_get_block(). We will deadlock on various things like
1384 * lock_journal and i_truncate_mutex.
1385 *
1386 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1387 * allocations fail.
1388 *
1389 * 16May01: If we're reentered then journal_current_handle() will be
1390 * non-zero. We simply *return*.
1391 *
1392 * 1 July 2001: @@@ FIXME:
1393 * In journalled data mode, a data buffer may be metadata against the
1394 * current transaction. But the same file is part of a shared mapping
1395 * and someone does a writepage() on it.
1396 *
1397 * We will move the buffer onto the async_data list, but *after* it has
1398 * been dirtied. So there's a small window where we have dirty data on
1399 * BJ_Metadata.
1400 *
1401 * Note that this only applies to the last partial page in the file. The
1402 * bit which block_write_full_page() uses prepare/commit for. (That's
1403 * broken code anyway: it's wrong for msync()).
1404 *
1405 * It's a rare case: affects the final partial page, for journalled data
1406 * where the file is subject to both write() and writepage() in the same
1407 * transaction. To fix it we'll need a custom block_write_full_page().
1408 * We'll probably need that anyway for journalling writepage() output.
1409 *
1410 * We don't honour synchronous mounts for writepage(). That would be
1411 * disastrous. Any write() or metadata operation will sync the fs for
1412 * us.
1413 *
1414 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1415 * we don't need to open a transaction here.
1416 */
1417static int ext4_ordered_writepage(struct page *page,
1418 struct writeback_control *wbc)
1419{
1420 struct inode *inode = page->mapping->host;
1421 struct buffer_head *page_bufs;
1422 handle_t *handle = NULL;
1423 int ret = 0;
1424 int err;
1425
1426 J_ASSERT(PageLocked(page));
1427
1428 /*
1429 * We give up here if we're reentered, because it might be for a
1430 * different filesystem.
1431 */
1432 if (ext4_journal_current_handle())
1433 goto out_fail;
1434
1435 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1436
1437 if (IS_ERR(handle)) {
1438 ret = PTR_ERR(handle);
1439 goto out_fail;
1440 }
1441
1442 if (!page_has_buffers(page)) {
1443 create_empty_buffers(page, inode->i_sb->s_blocksize,
1444 (1 << BH_Dirty)|(1 << BH_Uptodate));
1445 }
1446 page_bufs = page_buffers(page);
1447 walk_page_buffers(handle, page_bufs, 0,
1448 PAGE_CACHE_SIZE, NULL, bget_one);
1449
1450 ret = block_write_full_page(page, ext4_get_block, wbc);
1451
1452 /*
1453 * The page can become unlocked at any point now, and
1454 * truncate can then come in and change things. So we
1455 * can't touch *page from now on. But *page_bufs is
1456 * safe due to elevated refcount.
1457 */
1458
1459 /*
1460 * And attach them to the current transaction. But only if
1461 * block_write_full_page() succeeded. Otherwise they are unmapped,
1462 * and generally junk.
1463 */
1464 if (ret == 0) {
1465 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1466 NULL, jbd2_journal_dirty_data_fn);
1467 if (!ret)
1468 ret = err;
1469 }
1470 walk_page_buffers(handle, page_bufs, 0,
1471 PAGE_CACHE_SIZE, NULL, bput_one);
1472 err = ext4_journal_stop(handle);
1473 if (!ret)
1474 ret = err;
1475 return ret;
1476
1477out_fail:
1478 redirty_page_for_writepage(wbc, page);
1479 unlock_page(page);
1480 return ret;
1481}
1482
1483static int ext4_writeback_writepage(struct page *page,
1484 struct writeback_control *wbc)
1485{
1486 struct inode *inode = page->mapping->host;
1487 handle_t *handle = NULL;
1488 int ret = 0;
1489 int err;
1490
1491 if (ext4_journal_current_handle())
1492 goto out_fail;
1493
1494 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1495 if (IS_ERR(handle)) {
1496 ret = PTR_ERR(handle);
1497 goto out_fail;
1498 }
1499
1500 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
1501 ret = nobh_writepage(page, ext4_get_block, wbc);
1502 else
1503 ret = block_write_full_page(page, ext4_get_block, wbc);
1504
1505 err = ext4_journal_stop(handle);
1506 if (!ret)
1507 ret = err;
1508 return ret;
1509
1510out_fail:
1511 redirty_page_for_writepage(wbc, page);
1512 unlock_page(page);
1513 return ret;
1514}
1515
1516static int ext4_journalled_writepage(struct page *page,
1517 struct writeback_control *wbc)
1518{
1519 struct inode *inode = page->mapping->host;
1520 handle_t *handle = NULL;
1521 int ret = 0;
1522 int err;
1523
1524 if (ext4_journal_current_handle())
1525 goto no_write;
1526
1527 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1528 if (IS_ERR(handle)) {
1529 ret = PTR_ERR(handle);
1530 goto no_write;
1531 }
1532
1533 if (!page_has_buffers(page) || PageChecked(page)) {
1534 /*
1535 * It's mmapped pagecache. Add buffers and journal it. There
1536 * doesn't seem much point in redirtying the page here.
1537 */
1538 ClearPageChecked(page);
1539 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1540 ext4_get_block);
1541 if (ret != 0) {
1542 ext4_journal_stop(handle);
1543 goto out_unlock;
1544 }
1545 ret = walk_page_buffers(handle, page_buffers(page), 0,
1546 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1547
1548 err = walk_page_buffers(handle, page_buffers(page), 0,
1549 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1550 if (ret == 0)
1551 ret = err;
1552 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1553 unlock_page(page);
1554 } else {
1555 /*
1556 * It may be a page full of checkpoint-mode buffers. We don't
1557 * really know unless we go poke around in the buffer_heads.
1558 * But block_write_full_page will do the right thing.
1559 */
1560 ret = block_write_full_page(page, ext4_get_block, wbc);
1561 }
1562 err = ext4_journal_stop(handle);
1563 if (!ret)
1564 ret = err;
1565out:
1566 return ret;
1567
1568no_write:
1569 redirty_page_for_writepage(wbc, page);
1570out_unlock:
1571 unlock_page(page);
1572 goto out;
1573}
1574
1575static int ext4_readpage(struct file *file, struct page *page)
1576{
1577 return mpage_readpage(page, ext4_get_block);
1578}
1579
1580static int
1581ext4_readpages(struct file *file, struct address_space *mapping,
1582 struct list_head *pages, unsigned nr_pages)
1583{
1584 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
1585}
1586
1587static void ext4_invalidatepage(struct page *page, unsigned long offset)
1588{
1589 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1590
1591 /*
1592 * If it's a full truncate we just forget about the pending dirtying
1593 */
1594 if (offset == 0)
1595 ClearPageChecked(page);
1596
1597 jbd2_journal_invalidatepage(journal, page, offset);
1598}
1599
1600static int ext4_releasepage(struct page *page, gfp_t wait)
1601{
1602 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
1603
1604 WARN_ON(PageChecked(page));
1605 if (!page_has_buffers(page))
1606 return 0;
1607 return jbd2_journal_try_to_free_buffers(journal, page, wait);
1608}
1609
1610/*
1611 * If the O_DIRECT write will extend the file then add this inode to the
1612 * orphan list. So recovery will truncate it back to the original size
1613 * if the machine crashes during the write.
1614 *
1615 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1616 * crashes then stale disk data _may_ be exposed inside the file.
1617 */
1618static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1619 const struct iovec *iov, loff_t offset,
1620 unsigned long nr_segs)
1621{
1622 struct file *file = iocb->ki_filp;
1623 struct inode *inode = file->f_mapping->host;
1624 struct ext4_inode_info *ei = EXT4_I(inode);
1625 handle_t *handle = NULL;
1626 ssize_t ret;
1627 int orphan = 0;
1628 size_t count = iov_length(iov, nr_segs);
1629
1630 if (rw == WRITE) {
1631 loff_t final_size = offset + count;
1632
1633 handle = ext4_journal_start(inode, DIO_CREDITS);
1634 if (IS_ERR(handle)) {
1635 ret = PTR_ERR(handle);
1636 goto out;
1637 }
1638 if (final_size > inode->i_size) {
1639 ret = ext4_orphan_add(handle, inode);
1640 if (ret)
1641 goto out_stop;
1642 orphan = 1;
1643 ei->i_disksize = inode->i_size;
1644 }
1645 }
1646
1647 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1648 offset, nr_segs,
1649 ext4_get_block, NULL);
1650
1651 /*
1652 * Reacquire the handle: ext4_get_block() can restart the transaction
1653 */
1654 handle = journal_current_handle();
1655
1656out_stop:
1657 if (handle) {
1658 int err;
1659
1660 if (orphan && inode->i_nlink)
1661 ext4_orphan_del(handle, inode);
1662 if (orphan && ret > 0) {
1663 loff_t end = offset + ret;
1664 if (end > inode->i_size) {
1665 ei->i_disksize = end;
1666 i_size_write(inode, end);
1667 /*
1668 * We're going to return a positive `ret'
1669 * here due to non-zero-length I/O, so there's
1670 * no way of reporting error returns from
1671 * ext4_mark_inode_dirty() to userspace. So
1672 * ignore it.
1673 */
1674 ext4_mark_inode_dirty(handle, inode);
1675 }
1676 }
1677 err = ext4_journal_stop(handle);
1678 if (ret == 0)
1679 ret = err;
1680 }
1681out:
1682 return ret;
1683}
1684
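/*
 * For illustration only (not part of this patch): a userspace sketch of the
 * kind of O_DIRECT write that reaches ext4_direct_IO() above.  The 4096-byte
 * alignment and the path are assumptions; O_DIRECT requires the buffer,
 * length and file offset to be suitably aligned.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("/mnt/test/dio-file", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);
	if (pwrite(fd, buf, 4096, 0) != 4096)	/* extending write: the orphan-list case */
		return 1;
	free(buf);
	close(fd);
	return 0;
}
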
1685/*
1686 * Pages can be marked dirty completely asynchronously from ext4's journalling
1687 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1688 * much here because ->set_page_dirty is called under VFS locks. The page is
1689 * not necessarily locked.
1690 *
1691 * We cannot just dirty the page and leave attached buffers clean, because the
1692 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1693 * or jbddirty because all the journalling code will explode.
1694 *
1695 * So what we do is to mark the page "pending dirty" and next time writepage
1696 * is called, propagate that into the buffers appropriately.
1697 */
1698static int ext4_journalled_set_page_dirty(struct page *page)
1699{
1700 SetPageChecked(page);
1701 return __set_page_dirty_nobuffers(page);
1702}
1703
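/*
 * For illustration only (not part of this patch): the asynchronous dirtying
 * described above typically comes from shared writable mappings.  A sketch,
 * assuming the file already spans at least one page; the path is an example.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/test/data", O_RDWR);
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memcpy(p, "dirtied via mmap", 16);	/* dirties the page with no syscall */
	msync(p, 4096, MS_SYNC);		/* pushes it through ->writepage */
	munmap(p, 4096);
	close(fd);
	return 0;
}
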
1704static const struct address_space_operations ext4_ordered_aops = {
1705 .readpage = ext4_readpage,
1706 .readpages = ext4_readpages,
1707 .writepage = ext4_ordered_writepage,
1708 .sync_page = block_sync_page,
1709 .prepare_write = ext4_prepare_write,
1710 .commit_write = ext4_ordered_commit_write,
1711 .bmap = ext4_bmap,
1712 .invalidatepage = ext4_invalidatepage,
1713 .releasepage = ext4_releasepage,
1714 .direct_IO = ext4_direct_IO,
1715 .migratepage = buffer_migrate_page,
1716};
1717
1718static const struct address_space_operations ext4_writeback_aops = {
1719 .readpage = ext4_readpage,
1720 .readpages = ext4_readpages,
1721 .writepage = ext4_writeback_writepage,
1722 .sync_page = block_sync_page,
1723 .prepare_write = ext4_prepare_write,
1724 .commit_write = ext4_writeback_commit_write,
1725 .bmap = ext4_bmap,
1726 .invalidatepage = ext4_invalidatepage,
1727 .releasepage = ext4_releasepage,
1728 .direct_IO = ext4_direct_IO,
1729 .migratepage = buffer_migrate_page,
1730};
1731
1732static const struct address_space_operations ext4_journalled_aops = {
1733 .readpage = ext4_readpage,
1734 .readpages = ext4_readpages,
1735 .writepage = ext4_journalled_writepage,
1736 .sync_page = block_sync_page,
1737 .prepare_write = ext4_prepare_write,
1738 .commit_write = ext4_journalled_commit_write,
1739 .set_page_dirty = ext4_journalled_set_page_dirty,
1740 .bmap = ext4_bmap,
1741 .invalidatepage = ext4_invalidatepage,
1742 .releasepage = ext4_releasepage,
1743};
1744
1745void ext4_set_aops(struct inode *inode)
1746{
1747 if (ext4_should_order_data(inode))
1748 inode->i_mapping->a_ops = &ext4_ordered_aops;
1749 else if (ext4_should_writeback_data(inode))
1750 inode->i_mapping->a_ops = &ext4_writeback_aops;
1751 else
1752 inode->i_mapping->a_ops = &ext4_journalled_aops;
1753}
1754
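/*
 * For illustration only (not part of this patch): which of the three aops
 * tables above gets used follows from the data journalling mount option.
 * A hedged sketch of requesting data=journal via mount(2); the device,
 * mount point and the "ext4dev" type string are assumptions.
 */
#include <sys/mount.h>

static int mount_data_journal(void)
{
	/* makes ext4_should_journal_data() true for regular files */
	return mount("/dev/sdb1", "/mnt/test", "ext4dev", 0, "data=journal");
}
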
1755/*
1756 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
1757 * up to the end of the block which corresponds to `from'.
1758 * This is required during truncate. We need to physically zero the tail end
1759 * of that block so it doesn't yield old data if the file is later grown.
1760 */
1761int ext4_block_truncate_page(handle_t *handle, struct page *page,
1762 struct address_space *mapping, loff_t from)
1763{
1764 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1765 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1766 unsigned blocksize, iblock, length, pos;
1767 struct inode *inode = mapping->host;
1768 struct buffer_head *bh;
1769 int err = 0;
1770 void *kaddr;
1771
1772 blocksize = inode->i_sb->s_blocksize;
1773 length = blocksize - (offset & (blocksize - 1));
1774 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1775
1776 /*
1777 * For "nobh" option, we can only work if we don't need to
1778	 * read in the page - otherwise we create buffers to do the IO.
1779 */
1780 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1781 ext4_should_writeback_data(inode) && PageUptodate(page)) {
1782 kaddr = kmap_atomic(page, KM_USER0);
1783 memset(kaddr + offset, 0, length);
1784 flush_dcache_page(page);
1785 kunmap_atomic(kaddr, KM_USER0);
1786 set_page_dirty(page);
1787 goto unlock;
1788 }
1789
1790 if (!page_has_buffers(page))
1791 create_empty_buffers(page, blocksize, 0);
1792
1793 /* Find the buffer that contains "offset" */
1794 bh = page_buffers(page);
1795 pos = blocksize;
1796 while (offset >= pos) {
1797 bh = bh->b_this_page;
1798 iblock++;
1799 pos += blocksize;
1800 }
1801
1802 err = 0;
1803 if (buffer_freed(bh)) {
1804 BUFFER_TRACE(bh, "freed: skip");
1805 goto unlock;
1806 }
1807
1808 if (!buffer_mapped(bh)) {
1809 BUFFER_TRACE(bh, "unmapped");
1810 ext4_get_block(inode, iblock, bh, 0);
1811 /* unmapped? It's a hole - nothing to do */
1812 if (!buffer_mapped(bh)) {
1813 BUFFER_TRACE(bh, "still unmapped");
1814 goto unlock;
1815 }
1816 }
1817
1818 /* Ok, it's mapped. Make sure it's up-to-date */
1819 if (PageUptodate(page))
1820 set_buffer_uptodate(bh);
1821
1822 if (!buffer_uptodate(bh)) {
1823 err = -EIO;
1824 ll_rw_block(READ, 1, &bh);
1825 wait_on_buffer(bh);
1826 /* Uhhuh. Read error. Complain and punt. */
1827 if (!buffer_uptodate(bh))
1828 goto unlock;
1829 }
1830
1831 if (ext4_should_journal_data(inode)) {
1832 BUFFER_TRACE(bh, "get write access");
1833 err = ext4_journal_get_write_access(handle, bh);
1834 if (err)
1835 goto unlock;
1836 }
1837
1838 kaddr = kmap_atomic(page, KM_USER0);
1839 memset(kaddr + offset, 0, length);
1840 flush_dcache_page(page);
1841 kunmap_atomic(kaddr, KM_USER0);
1842
1843 BUFFER_TRACE(bh, "zeroed end of block");
1844
1845 err = 0;
1846 if (ext4_should_journal_data(inode)) {
1847 err = ext4_journal_dirty_metadata(handle, bh);
1848 } else {
1849 if (ext4_should_order_data(inode))
1850 err = ext4_journal_dirty_data(handle, bh);
1851 mark_buffer_dirty(bh);
1852 }
1853
1854unlock:
1855 unlock_page(page);
1856 page_cache_release(page);
1857 return err;
1858}
1859
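/*
 * For illustration only (not part of this patch): the offset/length
 * arithmetic used by ext4_block_truncate_page() above, worked through with
 * an assumed 4096-byte page size and 1024-byte block size.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long from = 10000;			/* new i_size */
	unsigned blocksize = 1024, pagesize = 4096;
	unsigned offset = from & (pagesize - 1);		/* 1808: offset within the page */
	unsigned length = blocksize - (offset & (blocksize - 1)); /* 240 bytes to zero */

	printf("zero %u bytes at page offset %u\n", length, offset);
	return 0;
}
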
1860/*
1861 * Probably it should be a library function... search for first non-zero word
1862 * or memcmp with zero_page, whatever is better for particular architecture.
1863 * Linus?
1864 */
1865static inline int all_zeroes(__le32 *p, __le32 *q)
1866{
1867 while (p < q)
1868 if (*p++)
1869 return 0;
1870 return 1;
1871}
1872
1873/**
1874 * ext4_find_shared - find the indirect blocks for partial truncation.
1875 * @inode: inode in question
1876 * @depth: depth of the affected branch
1877 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
1878 * @chain: place to store the pointers to partial indirect blocks
1879 * @top: place to the (detached) top of branch
1880 *
1881 * This is a helper function used by ext4_truncate().
1882 *
1883 * When we do truncate() we may have to clean the ends of several
1884 * indirect blocks but leave the blocks themselves alive. Block is
1885 * partially truncated if some data below the new i_size is referenced
1886 * from it (and it is on the path to the first completely truncated
1887 * data block, indeed). We have to free the top of that path along
1888 * with everything to the right of the path. Since no allocation
1889 * past the truncation point is possible until ext4_truncate()
1890 * finishes, we may safely do the latter, but top of branch may
1891 * require special attention - pageout below the truncation point
1892 * might try to populate it.
1893 *
1894 * We atomically detach the top of branch from the tree, store the
1895 * block number of its root in *@top, pointers to buffer_heads of
1896 * partially truncated blocks - in @chain[].bh and pointers to
1897 * their last elements that should not be removed - in
1898 * @chain[].p. Return value is the pointer to last filled element
1899 * of @chain.
1900 *
1901 * The work left to caller to do the actual freeing of subtrees:
1902 * a) free the subtree starting from *@top
1903 * b) free the subtrees whose roots are stored in
1904 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1905 * c) free the subtrees growing from the inode past the @chain[0].
1906 * (no partially truncated stuff there). */
1907
1908static Indirect *ext4_find_shared(struct inode *inode, int depth,
1909 int offsets[4], Indirect chain[4], __le32 *top)
1910{
1911 Indirect *partial, *p;
1912 int k, err;
1913
1914 *top = 0;
1915	/* Make k index the deepest non-null offset + 1 */
1916 for (k = depth; k > 1 && !offsets[k-1]; k--)
1917 ;
1918 partial = ext4_get_branch(inode, k, offsets, chain, &err);
1919 /* Writer: pointers */
1920 if (!partial)
1921 partial = chain + k-1;
1922 /*
1923 * If the branch acquired continuation since we've looked at it -
1924 * fine, it should all survive and (new) top doesn't belong to us.
1925 */
1926 if (!partial->key && *partial->p)
1927 /* Writer: end */
1928 goto no_top;
1929 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1930 ;
1931 /*
1932 * OK, we've found the last block that must survive. The rest of our
1933 * branch should be detached before unlocking. However, if that rest
1934 * of branch is all ours and does not grow immediately from the inode
1935 * it's easier to cheat and just decrement partial->p.
1936 */
1937 if (p == chain + k - 1 && p > chain) {
1938 p->p--;
1939 } else {
1940 *top = *p->p;
1941 /* Nope, don't do this in ext4. Must leave the tree intact */
1942#if 0
1943 *p->p = 0;
1944#endif
1945 }
1946 /* Writer: end */
1947
1948 while(partial > p) {
1949 brelse(partial->bh);
1950 partial--;
1951 }
1952no_top:
1953 return partial;
1954}
1955
1956/*
1957 * Zero a number of block pointers in either an inode or an indirect block.
1958 * If we restart the transaction we must again get write access to the
1959 * indirect block for further modification.
1960 *
1961 * We release `count' blocks on disk, but (last - first) may be greater
1962 * than `count' because there can be holes in there.
1963 */
1964static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
1965 struct buffer_head *bh, ext4_fsblk_t block_to_free,
1966 unsigned long count, __le32 *first, __le32 *last)
1967{
1968 __le32 *p;
1969 if (try_to_extend_transaction(handle, inode)) {
1970 if (bh) {
1971 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1972 ext4_journal_dirty_metadata(handle, bh);
1973 }
1974 ext4_mark_inode_dirty(handle, inode);
1975 ext4_journal_test_restart(handle, inode);
1976 if (bh) {
1977 BUFFER_TRACE(bh, "retaking write access");
1978 ext4_journal_get_write_access(handle, bh);
1979 }
1980 }
1981
1982 /*
1983 * Any buffers which are on the journal will be in memory. We find
1984 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
1985 * on them. We've already detached each block from the file, so
1986 * bforget() in jbd2_journal_forget() should be safe.
1987 *
1988 * AKPM: turn on bforget in jbd2_journal_forget()!!!
1989 */
1990 for (p = first; p < last; p++) {
1991 u32 nr = le32_to_cpu(*p);
1992 if (nr) {
1993 struct buffer_head *bh;
1994
1995 *p = 0;
1996 bh = sb_find_get_block(inode->i_sb, nr);
1997 ext4_forget(handle, 0, inode, bh, nr);
1998 }
1999 }
2000
2001 ext4_free_blocks(handle, inode, block_to_free, count);
2002}
2003
2004/**
2005 * ext4_free_data - free a list of data blocks
2006 * @handle: handle for this transaction
2007 * @inode: inode we are dealing with
2008 * @this_bh: indirect buffer_head which contains *@first and *@last
2009 * @first: array of block numbers
2010 * @last: points immediately past the end of array
2011 *
2012 * We are freeing all blocks referenced from that array (numbers are stored as
2013 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2014 *
2015 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2016 * blocks are contiguous then releasing them at one time will only affect one
2017 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2018 * actually use a lot of journal space.
2019 *
2020 * @this_bh will be %NULL if @first and @last point into the inode's direct
2021 * block pointers.
2022 */
2023static void ext4_free_data(handle_t *handle, struct inode *inode,
2024 struct buffer_head *this_bh,
2025 __le32 *first, __le32 *last)
2026{
2027 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
2028 unsigned long count = 0; /* Number of blocks in the run */
2029 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2030 corresponding to
2031 block_to_free */
2032 ext4_fsblk_t nr; /* Current block # */
2033 __le32 *p; /* Pointer into inode/ind
2034 for current block */
2035 int err;
2036
2037 if (this_bh) { /* For indirect block */
2038 BUFFER_TRACE(this_bh, "get_write_access");
2039 err = ext4_journal_get_write_access(handle, this_bh);
2040 /* Important: if we can't update the indirect pointers
2041 * to the blocks, we can't free them. */
2042 if (err)
2043 return;
2044 }
2045
2046 for (p = first; p < last; p++) {
2047 nr = le32_to_cpu(*p);
2048 if (nr) {
2049 /* accumulate blocks to free if they're contiguous */
2050 if (count == 0) {
2051 block_to_free = nr;
2052 block_to_free_p = p;
2053 count = 1;
2054 } else if (nr == block_to_free + count) {
2055 count++;
2056 } else {
2057 ext4_clear_blocks(handle, inode, this_bh,
2058 block_to_free,
2059 count, block_to_free_p, p);
2060 block_to_free = nr;
2061 block_to_free_p = p;
2062 count = 1;
2063 }
2064 }
2065 }
2066
2067 if (count > 0)
2068 ext4_clear_blocks(handle, inode, this_bh, block_to_free,
2069 count, block_to_free_p, p);
2070
2071 if (this_bh) {
2072 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2073 ext4_journal_dirty_metadata(handle, this_bh);
2074 }
2075}
2076
2077/**
2078 * ext4_free_branches - free an array of branches
2079 * @handle: JBD handle for this transaction
2080 * @inode: inode we are dealing with
2081 * @parent_bh: the buffer_head which contains *@first and *@last
2082 * @first: array of block numbers
2083 * @last: pointer immediately past the end of array
2084 * @depth: depth of the branches to free
2085 *
2086 * We are freeing all blocks referenced from these branches (numbers are
2087 * stored as little-endian 32-bit) and updating @inode->i_blocks
2088 * appropriately.
2089 */
2090static void ext4_free_branches(handle_t *handle, struct inode *inode,
2091 struct buffer_head *parent_bh,
2092 __le32 *first, __le32 *last, int depth)
2093{
2094 ext4_fsblk_t nr;
2095 __le32 *p;
2096
2097 if (is_handle_aborted(handle))
2098 return;
2099
2100 if (depth--) {
2101 struct buffer_head *bh;
2102 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2103 p = last;
2104 while (--p >= first) {
2105 nr = le32_to_cpu(*p);
2106 if (!nr)
2107 continue; /* A hole */
2108
2109 /* Go read the buffer for the next level down */
2110 bh = sb_bread(inode->i_sb, nr);
2111
2112 /*
2113 * A read failure? Report error and clear slot
2114 * (should be rare).
2115 */
2116 if (!bh) {
2117 ext4_error(inode->i_sb, "ext4_free_branches",
2118 "Read failure, inode=%lu, block=%llu",
2119 inode->i_ino, nr);
2120 continue;
2121 }
2122
2123 /* This zaps the entire block. Bottom up. */
2124 BUFFER_TRACE(bh, "free child branches");
2125 ext4_free_branches(handle, inode, bh,
2126 (__le32*)bh->b_data,
2127 (__le32*)bh->b_data + addr_per_block,
2128 depth);
2129
2130 /*
2131 * We've probably journalled the indirect block several
2132 * times during the truncate. But it's no longer
2133 * needed and we now drop it from the transaction via
2134 * jbd2_journal_revoke().
2135 *
2136 * That's easy if it's exclusively part of this
2137 * transaction. But if it's part of the committing
2138 * transaction then jbd2_journal_forget() will simply
2139 * brelse() it. That means that if the underlying
2140 * block is reallocated in ext4_get_block(),
2141 * unmap_underlying_metadata() will find this block
2142 * and will try to get rid of it. damn, damn.
2143 *
2144 * If this block has already been committed to the
2145 * journal, a revoke record will be written. And
2146 * revoke records must be emitted *before* clearing
2147 * this block's bit in the bitmaps.
2148 */
2149 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
2150
2151 /*
2152			 * Everything below this pointer has been
2153 * released. Now let this top-of-subtree go.
2154 *
2155 * We want the freeing of this indirect block to be
2156 * atomic in the journal with the updating of the
2157 * bitmap block which owns it. So make some room in
2158 * the journal.
2159 *
2160 * We zero the parent pointer *after* freeing its
2161 * pointee in the bitmaps, so if extend_transaction()
2162 * for some reason fails to put the bitmap changes and
2163 * the release into the same transaction, recovery
2164 * will merely complain about releasing a free block,
2165 * rather than leaking blocks.
2166 */
2167 if (is_handle_aborted(handle))
2168 return;
2169 if (try_to_extend_transaction(handle, inode)) {
2170 ext4_mark_inode_dirty(handle, inode);
2171 ext4_journal_test_restart(handle, inode);
2172 }
2173
2174 ext4_free_blocks(handle, inode, nr, 1);
2175
2176 if (parent_bh) {
2177 /*
2178 * The block which we have just freed is
2179 * pointed to by an indirect block: journal it
2180 */
2181 BUFFER_TRACE(parent_bh, "get_write_access");
2182 if (!ext4_journal_get_write_access(handle,
2183 parent_bh)){
2184 *p = 0;
2185 BUFFER_TRACE(parent_bh,
2186 "call ext4_journal_dirty_metadata");
2187 ext4_journal_dirty_metadata(handle,
2188 parent_bh);
2189 }
2190 }
2191 }
2192 } else {
2193 /* We have reached the bottom of the tree. */
2194 BUFFER_TRACE(parent_bh, "free data blocks");
2195 ext4_free_data(handle, inode, parent_bh, first, last);
2196 }
2197}
2198
2199/*
2200 * ext4_truncate()
2201 *
2202 * We block out ext4_get_block() block instantiations across the entire
2203 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
2204 * simultaneously on behalf of the same inode.
2205 *
2206 * As we work through the truncate and commit bits of it to the journal there
2207 * is one core, guiding principle: the file's tree must always be consistent on
2208 * disk. We must be able to restart the truncate after a crash.
2209 *
2210 * The file's tree may be transiently inconsistent in memory (although it
2211 * probably isn't), but whenever we close off and commit a journal transaction,
2212 * the contents of (the filesystem + the journal) must be consistent and
2213 * restartable. It's pretty simple, really: bottom up, right to left (although
2214 * left-to-right works OK too).
2215 *
2216 * Note that at recovery time, journal replay occurs *before* the restart of
2217 * truncate against the orphan inode list.
2218 *
2219 * The committed inode has the new, desired i_size (which is the same as
2220 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
2221 * that this inode's truncate did not complete and it will again call
2222 * ext4_truncate() to have another go. So there will be instantiated blocks
2223 * to the right of the truncation point in a crashed ext4 filesystem. But
2224 * that's fine - as long as they are linked from the inode, the post-crash
2225 * ext4_truncate() run will find them and release them.
2226 */
2227void ext4_truncate(struct inode *inode)
2228{
2229 handle_t *handle;
2230 struct ext4_inode_info *ei = EXT4_I(inode);
2231 __le32 *i_data = ei->i_data;
2232 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2233 struct address_space *mapping = inode->i_mapping;
2234 int offsets[4];
2235 Indirect chain[4];
2236 Indirect *partial;
2237 __le32 nr = 0;
2238 int n;
2239 long last_block;
2240 unsigned blocksize = inode->i_sb->s_blocksize;
2241 struct page *page;
2242
2243 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2244 S_ISLNK(inode->i_mode)))
2245 return;
2246 if (ext4_inode_is_fast_symlink(inode))
2247 return;
2248 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2249 return;
2250
2251 /*
2252 * We have to lock the EOF page here, because lock_page() nests
2253 * outside jbd2_journal_start().
2254 */
2255 if ((inode->i_size & (blocksize - 1)) == 0) {
2256 /* Block boundary? Nothing to do */
2257 page = NULL;
2258 } else {
2259 page = grab_cache_page(mapping,
2260 inode->i_size >> PAGE_CACHE_SHIFT);
2261 if (!page)
2262 return;
2263 }
2264
2265 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2266 return ext4_ext_truncate(inode, page);
2267
2268 handle = start_transaction(inode);
2269 if (IS_ERR(handle)) {
2270 if (page) {
2271 clear_highpage(page);
2272 flush_dcache_page(page);
2273 unlock_page(page);
2274 page_cache_release(page);
2275 }
2276 return; /* AKPM: return what? */
2277 }
2278
2279 last_block = (inode->i_size + blocksize-1)
2280 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2281
2282 if (page)
2283 ext4_block_truncate_page(handle, page, mapping, inode->i_size);
2284
2285 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2286 if (n == 0)
2287 goto out_stop; /* error */
2288
2289 /*
2290 * OK. This truncate is going to happen. We add the inode to the
2291 * orphan list, so that if this truncate spans multiple transactions,
2292 * and we crash, we will resume the truncate when the filesystem
2293 * recovers. It also marks the inode dirty, to catch the new size.
2294 *
2295 * Implication: the file must always be in a sane, consistent
2296 * truncatable state while each transaction commits.
2297 */
2298 if (ext4_orphan_add(handle, inode))
2299 goto out_stop;
2300
2301 /*
2302 * The orphan list entry will now protect us from any crash which
2303 * occurs before the truncate completes, so it is now safe to propagate
2304 * the new, shorter inode size (held for now in i_size) into the
2305 * on-disk inode. We do this via i_disksize, which is the value which
2306 * ext4 *really* writes onto the disk inode.
2307 */
2308 ei->i_disksize = inode->i_size;
2309
2310 /*
2311 * From here we block out all ext4_get_block() callers who want to
2312 * modify the block allocation tree.
2313 */
2314 mutex_lock(&ei->truncate_mutex);
2315
2316 if (n == 1) { /* direct blocks */
2317 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2318 i_data + EXT4_NDIR_BLOCKS);
2319 goto do_indirects;
2320 }
2321
2322 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
2323 /* Kill the top of shared branch (not detached) */
2324 if (nr) {
2325 if (partial == chain) {
2326 /* Shared branch grows from the inode */
2327 ext4_free_branches(handle, inode, NULL,
2328 &nr, &nr+1, (chain+n-1) - partial);
2329 *partial->p = 0;
2330 /*
2331 * We mark the inode dirty prior to restart,
2332 * and prior to stop. No need for it here.
2333 */
2334 } else {
2335 /* Shared branch grows from an indirect block */
2336 BUFFER_TRACE(partial->bh, "get_write_access");
2337 ext4_free_branches(handle, inode, partial->bh,
2338 partial->p,
2339 partial->p+1, (chain+n-1) - partial);
2340 }
2341 }
2342 /* Clear the ends of indirect blocks on the shared branch */
2343 while (partial > chain) {
2344 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
2345 (__le32*)partial->bh->b_data+addr_per_block,
2346 (chain+n-1) - partial);
2347 BUFFER_TRACE(partial->bh, "call brelse");
2348 brelse (partial->bh);
2349 partial--;
2350 }
2351do_indirects:
2352 /* Kill the remaining (whole) subtrees */
2353 switch (offsets[0]) {
2354 default:
2355 nr = i_data[EXT4_IND_BLOCK];
2356 if (nr) {
2357 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
2358 i_data[EXT4_IND_BLOCK] = 0;
2359 }
2360 case EXT4_IND_BLOCK:
2361 nr = i_data[EXT4_DIND_BLOCK];
2362 if (nr) {
2363 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
2364 i_data[EXT4_DIND_BLOCK] = 0;
2365 }
2366 case EXT4_DIND_BLOCK:
2367 nr = i_data[EXT4_TIND_BLOCK];
2368 if (nr) {
2369 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
2370 i_data[EXT4_TIND_BLOCK] = 0;
2371 }
2372 case EXT4_TIND_BLOCK:
2373 ;
2374 }
2375
2376 ext4_discard_reservation(inode);
2377
2378 mutex_unlock(&ei->truncate_mutex);
2379 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2380 ext4_mark_inode_dirty(handle, inode);
2381
2382 /*
2383 * In a multi-transaction truncate, we only make the final transaction
2384 * synchronous
2385 */
2386 if (IS_SYNC(inode))
2387 handle->h_sync = 1;
2388out_stop:
2389 /*
2390 * If this was a simple ftruncate(), and the file will remain alive
2391 * then we need to clear up the orphan record which we created above.
2392 * However, if this was a real unlink then we were called by
2393 * ext4_delete_inode(), and we allow that function to clean up the
2394 * orphan info for us.
2395 */
2396 if (inode->i_nlink)
2397 ext4_orphan_del(handle, inode);
2398
2399 ext4_journal_stop(handle);
2400}
2401
2402static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
2403 unsigned long ino, struct ext4_iloc *iloc)
2404{
2405 unsigned long desc, group_desc, block_group;
2406 unsigned long offset;
2407 ext4_fsblk_t block;
2408 struct buffer_head *bh;
2409 struct ext4_group_desc * gdp;
2410
2411 if (!ext4_valid_inum(sb, ino)) {
2412 /*
2413 * This error is already checked for in namei.c unless we are
2414 * looking at an NFS filehandle, in which case no error
2415 * report is needed
2416 */
2417 return 0;
2418 }
2419
2420 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2421 if (block_group >= EXT4_SB(sb)->s_groups_count) {
2422 ext4_error(sb,"ext4_get_inode_block","group >= groups count");
2423 return 0;
2424 }
2425 smp_rmb();
2426 group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
2427 desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2428 bh = EXT4_SB(sb)->s_group_desc[group_desc];
2429 if (!bh) {
2430 ext4_error (sb, "ext4_get_inode_block",
2431 "Descriptor not loaded");
2432 return 0;
2433 }
2434
2435 gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
2436 desc * EXT4_DESC_SIZE(sb));
2437 /*
2438 * Figure out the offset within the block group inode table
2439 */
2440 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
2441 EXT4_INODE_SIZE(sb);
2442 block = ext4_inode_table(sb, gdp) +
2443 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
2444
2445 iloc->block_group = block_group;
2446 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
2447 return block;
2448}
2449
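/*
 * For illustration only (not part of this patch): the group/offset
 * computation in ext4_get_inode_block() above is plain arithmetic on
 * superblock parameters.  A worked example with assumed mkfs values
 * (8192 inodes per group, 256-byte inodes, 4096-byte blocks) for inode 50000.
 */
#include <stdio.h>

int main(void)
{
	unsigned long ino = 50000;
	unsigned long inodes_per_group = 8192;
	unsigned long inode_size = 256, block_size = 4096;

	unsigned long group = (ino - 1) / inodes_per_group;	/* 6 */
	unsigned long index = (ino - 1) % inodes_per_group;	/* 847 */
	unsigned long byte_off = index * inode_size;		/* 216832 */
	unsigned long tbl_block = byte_off / block_size;	/* block 52 of the inode table */
	unsigned long blk_off = byte_off % block_size;		/* 3840 bytes into that block */

	printf("group %lu, inode-table block %lu, offset %lu\n",
	       group, tbl_block, blk_off);
	return 0;
}
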
2450/*
2451 * ext4_get_inode_loc returns with an extra refcount against the inode's
2452 * underlying buffer_head on success. If 'in_mem' is true, we have all
2453 * data in memory that is needed to recreate the on-disk version of this
2454 * inode.
2455 */
2456static int __ext4_get_inode_loc(struct inode *inode,
2457 struct ext4_iloc *iloc, int in_mem)
2458{
2459 ext4_fsblk_t block;
2460 struct buffer_head *bh;
2461
2462 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2463 if (!block)
2464 return -EIO;
2465
2466 bh = sb_getblk(inode->i_sb, block);
2467 if (!bh) {
2468 ext4_error (inode->i_sb, "ext4_get_inode_loc",
2469 "unable to read inode block - "
2470 "inode=%lu, block=%llu",
2471 inode->i_ino, block);
2472 return -EIO;
2473 }
2474 if (!buffer_uptodate(bh)) {
2475 lock_buffer(bh);
2476 if (buffer_uptodate(bh)) {
2477 /* someone brought it uptodate while we waited */
2478 unlock_buffer(bh);
2479 goto has_buffer;
2480 }
2481
2482 /*
2483 * If we have all information of the inode in memory and this
2484 * is the only valid inode in the block, we need not read the
2485 * block.
2486 */
2487 if (in_mem) {
2488 struct buffer_head *bitmap_bh;
2489 struct ext4_group_desc *desc;
2490 int inodes_per_buffer;
2491 int inode_offset, i;
2492 int block_group;
2493 int start;
2494
2495 block_group = (inode->i_ino - 1) /
2496 EXT4_INODES_PER_GROUP(inode->i_sb);
2497 inodes_per_buffer = bh->b_size /
2498 EXT4_INODE_SIZE(inode->i_sb);
2499 inode_offset = ((inode->i_ino - 1) %
2500 EXT4_INODES_PER_GROUP(inode->i_sb));
2501 start = inode_offset & ~(inodes_per_buffer - 1);
2502
2503 /* Is the inode bitmap in cache? */
2504 desc = ext4_get_group_desc(inode->i_sb,
2505 block_group, NULL);
2506 if (!desc)
2507 goto make_io;
2508
2509 bitmap_bh = sb_getblk(inode->i_sb,
2510 ext4_inode_bitmap(inode->i_sb, desc));
2511 if (!bitmap_bh)
2512 goto make_io;
2513
2514 /*
2515 * If the inode bitmap isn't in cache then the
2516 * optimisation may end up performing two reads instead
2517 * of one, so skip it.
2518 */
2519 if (!buffer_uptodate(bitmap_bh)) {
2520 brelse(bitmap_bh);
2521 goto make_io;
2522 }
2523 for (i = start; i < start + inodes_per_buffer; i++) {
2524 if (i == inode_offset)
2525 continue;
2526 if (ext4_test_bit(i, bitmap_bh->b_data))
2527 break;
2528 }
2529 brelse(bitmap_bh);
2530 if (i == start + inodes_per_buffer) {
2531 /* all other inodes are free, so skip I/O */
2532 memset(bh->b_data, 0, bh->b_size);
2533 set_buffer_uptodate(bh);
2534 unlock_buffer(bh);
2535 goto has_buffer;
2536 }
2537 }
2538
2539make_io:
2540 /*
2541 * There are other valid inodes in the buffer, this inode
2542 * has in-inode xattrs, or we don't have this inode in memory.
2543 * Read the block from disk.
2544 */
2545 get_bh(bh);
2546 bh->b_end_io = end_buffer_read_sync;
2547 submit_bh(READ_META, bh);
2548 wait_on_buffer(bh);
2549 if (!buffer_uptodate(bh)) {
2550 ext4_error(inode->i_sb, "ext4_get_inode_loc",
2551 "unable to read inode block - "
2552 "inode=%lu, block=%llu",
2553 inode->i_ino, block);
2554 brelse(bh);
2555 return -EIO;
2556 }
2557 }
2558has_buffer:
2559 iloc->bh = bh;
2560 return 0;
2561}
2562
2563int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
2564{
2565 /* We have all inode data except xattrs in memory here. */
2566 return __ext4_get_inode_loc(inode, iloc,
2567 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
2568}
2569
2570void ext4_set_inode_flags(struct inode *inode)
2571{
2572 unsigned int flags = EXT4_I(inode)->i_flags;
2573
2574 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2575 if (flags & EXT4_SYNC_FL)
2576 inode->i_flags |= S_SYNC;
2577 if (flags & EXT4_APPEND_FL)
2578 inode->i_flags |= S_APPEND;
2579 if (flags & EXT4_IMMUTABLE_FL)
2580 inode->i_flags |= S_IMMUTABLE;
2581 if (flags & EXT4_NOATIME_FL)
2582 inode->i_flags |= S_NOATIME;
2583 if (flags & EXT4_DIRSYNC_FL)
2584 inode->i_flags |= S_DIRSYNC;
2585}
2586
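/*
 * For illustration only (not part of this patch): the EXT4_*_FL bits that
 * ext4_set_inode_flags() mirrors into i_flags are the per-file flags that
 * userspace reads with the FS_IOC_GETFLAGS ioctl (as chattr/lsattr do).
 * A hedged sketch; the path is an assumed example.
 */
#include <fcntl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_APPEND_FL, ... */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int flags = 0;
	int fd = open("/mnt/test/file", O_RDONLY);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
		return 1;
	printf("append-only: %s  immutable: %s\n",
	       (flags & FS_APPEND_FL) ? "yes" : "no",
	       (flags & FS_IMMUTABLE_FL) ? "yes" : "no");
	close(fd);
	return 0;
}
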
2587void ext4_read_inode(struct inode * inode)
2588{
2589 struct ext4_iloc iloc;
2590 struct ext4_inode *raw_inode;
2591 struct ext4_inode_info *ei = EXT4_I(inode);
2592 struct buffer_head *bh;
2593 int block;
2594
2595#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
2596 ei->i_acl = EXT4_ACL_NOT_CACHED;
2597 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
2598#endif
2599 ei->i_block_alloc_info = NULL;
2600
2601 if (__ext4_get_inode_loc(inode, &iloc, 0))
2602 goto bad_inode;
2603 bh = iloc.bh;
2604 raw_inode = ext4_raw_inode(&iloc);
2605 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2606 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2607 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2608 if(!(test_opt (inode->i_sb, NO_UID32))) {
2609 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2610 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2611 }
2612 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2613 inode->i_size = le32_to_cpu(raw_inode->i_size);
2614 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2615 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2616 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2617 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2618
2619 ei->i_state = 0;
2620 ei->i_dir_start_lookup = 0;
2621 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2622 /* We now have enough fields to check if the inode was active or not.
2623	 * This is needed because nfsd might try to access dead inodes;
2624	 * the test is the same one that e2fsck uses.
2625 * NeilBrown 1999oct15
2626 */
2627 if (inode->i_nlink == 0) {
2628 if (inode->i_mode == 0 ||
2629 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
2630 /* this inode is deleted */
2631 brelse (bh);
2632 goto bad_inode;
2633 }
2634 /* The only unlinked inodes we let through here have
2635 * valid i_mode and are being read by the orphan
2636 * recovery code: that's fine, we're about to complete
2637 * the process of deleting those. */
2638 }
2639 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2640 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2641#ifdef EXT4_FRAGMENTS
2642 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2643 ei->i_frag_no = raw_inode->i_frag;
2644 ei->i_frag_size = raw_inode->i_fsize;
2645#endif
2646 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2647 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2648 cpu_to_le32(EXT4_OS_HURD))
2649 ei->i_file_acl |=
2650 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
2651 if (!S_ISREG(inode->i_mode)) {
2652 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2653 } else {
2654 inode->i_size |=
2655 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2656 }
2657 ei->i_disksize = inode->i_size;
2658 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2659 ei->i_block_group = iloc.block_group;
2660 /*
2661 * NOTE! The in-memory inode i_data array is in little-endian order
2662 * even on big-endian machines: we do NOT byteswap the block numbers!
2663 */
2664 for (block = 0; block < EXT4_N_BLOCKS; block++)
2665 ei->i_data[block] = raw_inode->i_block[block];
2666 INIT_LIST_HEAD(&ei->i_orphan);
2667
2668 if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
2669 EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
2670 /*
2671 * When mke2fs creates big inodes it does not zero out
2672 * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
2673 * so ignore those first few inodes.
2674 */
2675 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2676 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2677 EXT4_INODE_SIZE(inode->i_sb))
2678 goto bad_inode;
2679 if (ei->i_extra_isize == 0) {
2680 /* The extra space is currently unused. Use it. */
2681 ei->i_extra_isize = sizeof(struct ext4_inode) -
2682 EXT4_GOOD_OLD_INODE_SIZE;
2683 } else {
2684 __le32 *magic = (void *)raw_inode +
2685 EXT4_GOOD_OLD_INODE_SIZE +
2686 ei->i_extra_isize;
2687 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
2688 ei->i_state |= EXT4_STATE_XATTR;
2689 }
2690 } else
2691 ei->i_extra_isize = 0;
2692
2693 if (S_ISREG(inode->i_mode)) {
2694 inode->i_op = &ext4_file_inode_operations;
2695 inode->i_fop = &ext4_file_operations;
2696 ext4_set_aops(inode);
2697 } else if (S_ISDIR(inode->i_mode)) {
2698 inode->i_op = &ext4_dir_inode_operations;
2699 inode->i_fop = &ext4_dir_operations;
2700 } else if (S_ISLNK(inode->i_mode)) {
2701 if (ext4_inode_is_fast_symlink(inode))
2702 inode->i_op = &ext4_fast_symlink_inode_operations;
2703 else {
2704 inode->i_op = &ext4_symlink_inode_operations;
2705 ext4_set_aops(inode);
2706 }
2707 } else {
2708 inode->i_op = &ext4_special_inode_operations;
2709 if (raw_inode->i_block[0])
2710 init_special_inode(inode, inode->i_mode,
2711 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2712 else
2713 init_special_inode(inode, inode->i_mode,
2714 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2715 }
2716 brelse (iloc.bh);
2717 ext4_set_inode_flags(inode);
2718 return;
2719
2720bad_inode:
2721 make_bad_inode(inode);
2722 return;
2723}
2724
2725/*
2726 * Post the struct inode info into an on-disk inode location in the
2727 * buffer-cache. This gobbles the caller's reference to the
2728 * buffer_head in the inode location struct.
2729 *
2730 * The caller must have write access to iloc->bh.
2731 */
2732static int ext4_do_update_inode(handle_t *handle,
2733 struct inode *inode,
2734 struct ext4_iloc *iloc)
2735{
2736 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
2737 struct ext4_inode_info *ei = EXT4_I(inode);
2738 struct buffer_head *bh = iloc->bh;
2739 int err = 0, rc, block;
2740
2741	/* For fields not tracked in the in-memory inode,
2742 * initialise them to zero for new inodes. */
2743 if (ei->i_state & EXT4_STATE_NEW)
2744 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
2745
2746 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2747 if(!(test_opt(inode->i_sb, NO_UID32))) {
2748 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2749 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2750/*
2751 * Fix up interoperability with old kernels. Otherwise, old inodes get
2752 * re-used with the upper 16 bits of the uid/gid intact
2753 */
2754 if(!ei->i_dtime) {
2755 raw_inode->i_uid_high =
2756 cpu_to_le16(high_16_bits(inode->i_uid));
2757 raw_inode->i_gid_high =
2758 cpu_to_le16(high_16_bits(inode->i_gid));
2759 } else {
2760 raw_inode->i_uid_high = 0;
2761 raw_inode->i_gid_high = 0;
2762 }
2763 } else {
2764 raw_inode->i_uid_low =
2765 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2766 raw_inode->i_gid_low =
2767 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2768 raw_inode->i_uid_high = 0;
2769 raw_inode->i_gid_high = 0;
2770 }
2771 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2772 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2773 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2774 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2775 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2776 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2777 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2778 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2779#ifdef EXT4_FRAGMENTS
2780 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2781 raw_inode->i_frag = ei->i_frag_no;
2782 raw_inode->i_fsize = ei->i_frag_size;
2783#endif
2784 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
2785 cpu_to_le32(EXT4_OS_HURD))
2786 raw_inode->i_file_acl_high =
2787 cpu_to_le16(ei->i_file_acl >> 32);
2788 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2789 if (!S_ISREG(inode->i_mode)) {
2790 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2791 } else {
2792 raw_inode->i_size_high =
2793 cpu_to_le32(ei->i_disksize >> 32);
2794 if (ei->i_disksize > 0x7fffffffULL) {
2795 struct super_block *sb = inode->i_sb;
2796 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
2797 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
2798 EXT4_SB(sb)->s_es->s_rev_level ==
2799 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
2800 /* If this is the first large file
2801 * created, add a flag to the superblock.
2802 */
2803 err = ext4_journal_get_write_access(handle,
2804 EXT4_SB(sb)->s_sbh);
2805 if (err)
2806 goto out_brelse;
2807 ext4_update_dynamic_rev(sb);
2808 EXT4_SET_RO_COMPAT_FEATURE(sb,
2809 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
2810 sb->s_dirt = 1;
2811 handle->h_sync = 1;
2812 err = ext4_journal_dirty_metadata(handle,
2813 EXT4_SB(sb)->s_sbh);
2814 }
2815 }
2816 }
2817 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2818 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2819 if (old_valid_dev(inode->i_rdev)) {
2820 raw_inode->i_block[0] =
2821 cpu_to_le32(old_encode_dev(inode->i_rdev));
2822 raw_inode->i_block[1] = 0;
2823 } else {
2824 raw_inode->i_block[0] = 0;
2825 raw_inode->i_block[1] =
2826 cpu_to_le32(new_encode_dev(inode->i_rdev));
2827 raw_inode->i_block[2] = 0;
2828 }
2829 } else for (block = 0; block < EXT4_N_BLOCKS; block++)
2830 raw_inode->i_block[block] = ei->i_data[block];
2831
2832 if (ei->i_extra_isize)
2833 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2834
2835 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
2836 rc = ext4_journal_dirty_metadata(handle, bh);
2837 if (!err)
2838 err = rc;
2839 ei->i_state &= ~EXT4_STATE_NEW;
2840
2841out_brelse:
2842 brelse (bh);
2843 ext4_std_error(inode->i_sb, err);
2844 return err;
2845}
2846
2847/*
2848 * ext4_write_inode()
2849 *
2850 * We are called from a few places:
2851 *
2852 * - Within generic_file_write() for O_SYNC files.
2853 * Here, there will be no transaction running. We wait for any running
2854 * transaction to commit.
2855 *
2856 * - Within sys_sync(), kupdate and such.
2857 * We wait on commit, if told to.
2858 *
2859 * - Within prune_icache() (PF_MEMALLOC == true)
2860 * Here we simply return. We can't afford to block kswapd on the
2861 * journal commit.
2862 *
2863 * In all cases it is actually safe for us to return without doing anything,
2864 * because the inode has been copied into a raw inode buffer in
2865 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2866 * knfsd.
2867 *
2868 * Note that we are absolutely dependent upon all inode dirtiers doing the
2869 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2870 * which we are interested.
2871 *
2872 * It would be a bug for them to not do this. The code:
2873 *
2874 * mark_inode_dirty(inode)
2875 * stuff();
2876 * inode->i_size = expr;
2877 *
2878 * is in error because a kswapd-driven write_inode() could occur while
2879 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2880 * will no longer be on the superblock's dirty inode list.
2881 */
2882int ext4_write_inode(struct inode *inode, int wait)
2883{
2884 if (current->flags & PF_MEMALLOC)
2885 return 0;
2886
2887 if (ext4_journal_current_handle()) {
2888 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2889 dump_stack();
2890 return -EIO;
2891 }
2892
2893 if (!wait)
2894 return 0;
2895
2896 return ext4_force_commit(inode->i_sb);
2897}
2898
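/*
 * For illustration only (not part of this patch): the O_SYNC case named in
 * the comment above, seen from userspace.  Each write() returns only after
 * the data and inode update have been committed; the path is an assumed
 * example.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *rec = "record\n";
	int fd = open("/mnt/test/log", O_WRONLY | O_CREAT | O_APPEND | O_SYNC, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, rec, strlen(rec)) != (ssize_t)strlen(rec))
		return 1;
	close(fd);
	return 0;
}
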
2899/*
2900 * ext4_setattr()
2901 *
2902 * Called from notify_change.
2903 *
2904 * We want to trap VFS attempts to truncate the file as soon as
2905 * possible. In particular, we want to make sure that when the VFS
2906 * shrinks i_size, we put the inode on the orphan list and modify
2907 * i_disksize immediately, so that during the subsequent flushing of
2908 * dirty pages and freeing of disk blocks, we can guarantee that any
2909 * commit will leave the blocks being flushed in an unused state on
2910 * disk. (On recovery, the inode will get truncated and the blocks will
2911 * be freed, so we have a strong guarantee that no future commit will
2912 * leave these blocks visible to the user.)
2913 *
2914 * Called with inode->sem down.
2915 */
2916int ext4_setattr(struct dentry *dentry, struct iattr *attr)
2917{
2918 struct inode *inode = dentry->d_inode;
2919 int error, rc = 0;
2920 const unsigned int ia_valid = attr->ia_valid;
2921
2922 error = inode_change_ok(inode, attr);
2923 if (error)
2924 return error;
2925
2926 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2927 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2928 handle_t *handle;
2929
2930 /* (user+group)*(old+new) structure, inode write (sb,
2931 * inode block, ? - but truncate inode update has it) */
2932 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
2933 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2934 if (IS_ERR(handle)) {
2935 error = PTR_ERR(handle);
2936 goto err_out;
2937 }
2938 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2939 if (error) {
2940 ext4_journal_stop(handle);
2941 return error;
2942 }
2943 /* Update corresponding info in inode so that everything is in
2944 * one transaction */
2945 if (attr->ia_valid & ATTR_UID)
2946 inode->i_uid = attr->ia_uid;
2947 if (attr->ia_valid & ATTR_GID)
2948 inode->i_gid = attr->ia_gid;
2949 error = ext4_mark_inode_dirty(handle, inode);
2950 ext4_journal_stop(handle);
2951 }
2952
2953 if (S_ISREG(inode->i_mode) &&
2954 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2955 handle_t *handle;
2956
2957 handle = ext4_journal_start(inode, 3);
2958 if (IS_ERR(handle)) {
2959 error = PTR_ERR(handle);
2960 goto err_out;
2961 }
2962
2963 error = ext4_orphan_add(handle, inode);
2964 EXT4_I(inode)->i_disksize = attr->ia_size;
2965 rc = ext4_mark_inode_dirty(handle, inode);
2966 if (!error)
2967 error = rc;
2968 ext4_journal_stop(handle);
2969 }
2970
2971 rc = inode_setattr(inode, attr);
2972
2973 /* If inode_setattr's call to ext4_truncate failed to get a
2974 * transaction handle at all, we need to clean up the in-core
2975 * orphan list manually. */
2976 if (inode->i_nlink)
2977 ext4_orphan_del(NULL, inode);
2978
2979 if (!rc && (ia_valid & ATTR_MODE))
2980 rc = ext4_acl_chmod(inode);
2981
2982err_out:
2983 ext4_std_error(inode->i_sb, error);
2984 if (!error)
2985 error = rc;
2986 return error;
2987}
2988
2989
2990/*
2991 * How many blocks doth make a writepage()?
2992 *
2993 * With N blocks per page, it may be:
2994 * N data blocks
2995 * 2 indirect block
2996 * 2 dindirect
2997 * 1 tindirect
2998 * N+5 bitmap blocks (from the above)
2999 * N+5 group descriptor summary blocks
3000 * 1 inode block
3001 * 1 superblock.
3002 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quota files
3003 *
3004 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
3005 *
3006 * With ordered or writeback data it's the same, less the N data blocks.
3007 *
3008 * If the inode's direct blocks can hold an integral number of pages then a
3009 * page cannot straddle two indirect blocks, and we can only touch one indirect
3010 * and dindirect block, and the "5" above becomes "3".
3011 *
3012 * This still overestimates under most circumstances. If we were to pass the
3013 * start and end offsets in here as well we could do block_to_path() on each
3014 * block and work out the exact number of indirects which are touched. Pah.
3015 */
3016
3017int ext4_writepage_trans_blocks(struct inode *inode)
3018{
3019 int bpp = ext4_journal_blocks_per_page(inode);
3020 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
3021 int ret;
3022
3023 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3024 return ext4_ext_writepage_trans_blocks(inode, bpp);
3025
3026 if (ext4_should_journal_data(inode))
3027 ret = 3 * (bpp + indirects) + 2;
3028 else
3029 ret = 2 * (bpp + indirects) + 2;
3030
3031#ifdef CONFIG_QUOTA
3032 /* We know that structure was already allocated during DQUOT_INIT so
3033 * we will be updating only the data blocks + inodes */
3034 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
3035#endif
3036
3037 return ret;
3038}
3039
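#if 0
/*
 * Worked example of the estimate computed below, assuming 4k pages and
 * 4k blocks (bpp == 1) on a non-extent inode: EXT4_NDIR_BLOCKS (12) is
 * a multiple of bpp, so indirects == 3.  Ordered/writeback data then
 * reserves 2 * (1 + 3) + 2 == 10 credits, data journaling reserves
 * 3 * (1 + 3) + 2 == 14, plus 2 * EXT4_QUOTA_TRANS_BLOCKS() when
 * CONFIG_QUOTA is enabled.
 */
#endif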
3040/*
3041 * The caller must have previously called ext4_reserve_inode_write().
3042 * Given this, we know that the caller already has write access to iloc->bh.
3043 */
3044int ext4_mark_iloc_dirty(handle_t *handle,
3045 struct inode *inode, struct ext4_iloc *iloc)
3046{
3047 int err = 0;
3048
3049 /* the do_update_inode consumes one bh->b_count */
3050 get_bh(iloc->bh);
3051
3052 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
3053 err = ext4_do_update_inode(handle, inode, iloc);
3054 put_bh(iloc->bh);
3055 return err;
3056}
3057
3058/*
3059 * On success, we end up with an outstanding reference count against
3060 * iloc->bh. This _must_ be cleaned up later.
3061 */
3062
3063int
3064ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
3065 struct ext4_iloc *iloc)
3066{
3067 int err = 0;
3068 if (handle) {
3069 err = ext4_get_inode_loc(inode, iloc);
3070 if (!err) {
3071 BUFFER_TRACE(iloc->bh, "get_write_access");
3072 err = ext4_journal_get_write_access(handle, iloc->bh);
3073 if (err) {
3074 brelse(iloc->bh);
3075 iloc->bh = NULL;
3076 }
3077 }
3078 }
3079 ext4_std_error(inode->i_sb, err);
3080 return err;
3081}
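#if 0
/*
 * A minimal sketch of how the outstanding iloc->bh reference noted
 * above is normally consumed (the caller below is hypothetical):
 * ext4_reserve_inode_write() takes the reference, and
 * ext4_mark_iloc_dirty() drops it while writing the inode back
 * through the journal.
 */
static int example_update_inode(handle_t *handle, struct inode *inode)
{
	struct ext4_iloc iloc;
	int err;

	err = ext4_reserve_inode_write(handle, inode, &iloc);
	if (err)
		return err;
	/* ... modify the in-core inode here ... */
	return ext4_mark_iloc_dirty(handle, inode, &iloc);
}
#endif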
3082
3083/*
3084 * What we do here is to mark the in-core inode as clean with respect to inode
3085 * dirtiness (it may still be data-dirty).
3086 * This means that the in-core inode may be reaped by prune_icache
3087 * without having to perform any I/O. This is a very good thing,
3088 * because *any* task may call prune_icache - even ones which
3089 * have a transaction open against a different journal.
3090 *
3091 * Is this cheating? Not really. Sure, we haven't written the
3092 * inode out, but prune_icache isn't a user-visible syncing function.
3093 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3094 * we start and wait on commits.
3095 *
3096 * Is this efficient/effective? Well, we're being nice to the system
3097 * by cleaning up our inodes proactively so they can be reaped
3098 * without I/O. But we are potentially leaving up to five seconds'
3099 * worth of inodes floating about which prune_icache wants us to
3100 * write out. One way to fix that would be to get prune_icache()
3101 * to do a write_super() to free up some memory. It has the desired
3102 * effect.
3103 */
3104int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
3105{
3106 struct ext4_iloc iloc;
3107 int err;
3108
3109 might_sleep();
3110 err = ext4_reserve_inode_write(handle, inode, &iloc);
3111 if (!err)
3112 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
3113 return err;
3114}
3115
3116/*
3117 * ext4_dirty_inode() is called from __mark_inode_dirty()
3118 *
3119 * We're really interested in the case where a file is being extended.
3120 * i_size has been changed by generic_commit_write() and we thus need
3121 * to include the updated inode in the current transaction.
3122 *
3123 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3124 * are allocated to the file.
3125 *
3126 * If the inode is marked synchronous, we don't honour that here - doing
3127 * so would cause a commit on atime updates, which we don't bother doing.
3128 * We handle synchronous inodes at the highest possible level.
3129 */
3130void ext4_dirty_inode(struct inode *inode)
3131{
3132 handle_t *current_handle = ext4_journal_current_handle();
3133 handle_t *handle;
3134
3135 handle = ext4_journal_start(inode, 2);
3136 if (IS_ERR(handle))
3137 goto out;
3138 if (current_handle &&
3139 current_handle->h_transaction != handle->h_transaction) {
3140 /* This task has a transaction open against a different fs */
3141 printk(KERN_EMERG "%s: transactions do not match!\n",
3142 __FUNCTION__);
3143 } else {
3144 jbd_debug(5, "marking dirty. outer handle=%p\n",
3145 current_handle);
3146 ext4_mark_inode_dirty(handle, inode);
3147 }
3148 ext4_journal_stop(handle);
3149out:
3150 return;
3151}
3152
3153#if 0
3154/*
3155 * Bind an inode's backing buffer_head into this transaction, to prevent
3156 * it from being flushed to disk early. Unlike
3157 * ext4_reserve_inode_write, this leaves behind no bh reference and
3158 * returns no iloc structure, so the caller needs to repeat the iloc
3159 * lookup to mark the inode dirty later.
3160 */
3161static int ext4_pin_inode(handle_t *handle, struct inode *inode)
3162{
3163 struct ext4_iloc iloc;
3164
3165 int err = 0;
3166 if (handle) {
3167 err = ext4_get_inode_loc(inode, &iloc);
3168 if (!err) {
3169 BUFFER_TRACE(iloc.bh, "get_write_access");
3170 err = jbd2_journal_get_write_access(handle, iloc.bh);
3171 if (!err)
3172 err = ext4_journal_dirty_metadata(handle,
3173 iloc.bh);
3174 brelse(iloc.bh);
3175 }
3176 }
3177 ext4_std_error(inode->i_sb, err);
3178 return err;
3179}
3180#endif
3181
3182int ext4_change_inode_journal_flag(struct inode *inode, int val)
3183{
3184 journal_t *journal;
3185 handle_t *handle;
3186 int err;
3187
3188 /*
3189 * We have to be very careful here: changing a data block's
3190 * journaling status dynamically is dangerous. If we write a
3191 * data block to the journal, change the status and then delete
3192 * that block, we risk forgetting to revoke the old log record
3193 * from the journal and so a subsequent replay can corrupt data.
3194 * So, first we make sure that the journal is empty and that
3195 * nobody is changing anything.
3196 */
3197
3198 journal = EXT4_JOURNAL(inode);
3199 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3200 return -EROFS;
3201
3202 jbd2_journal_lock_updates(journal);
3203 jbd2_journal_flush(journal);
3204
3205 /*
3206 * OK, there are no updates running now, and all cached data is
3207 * synced to disk. We are now in a completely consistent state
3208 * which doesn't have anything in the journal, and we know that
3209 * no filesystem updates are running, so it is safe to modify
3210 * the inode's in-core data-journaling state flag now.
3211 */
3212
3213 if (val)
3214 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
3215 else
3216 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
3217 ext4_set_aops(inode);
3218
3219 jbd2_journal_unlock_updates(journal);
3220
3221 /* Finally we can mark the inode as dirty. */
3222
3223 handle = ext4_journal_start(inode, 1);
3224 if (IS_ERR(handle))
3225 return PTR_ERR(handle);
3226
3227 err = ext4_mark_inode_dirty(handle, inode);
3228 handle->h_sync = 1;
3229 ext4_journal_stop(handle);
3230 ext4_std_error(inode->i_sb, err);
3231
3232 return err;
3233}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 000000000000..22a737c306c7
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,306 @@
1/*
2 * linux/fs/ext4/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/fs.h>
11#include <linux/jbd2.h>
12#include <linux/capability.h>
13#include <linux/ext4_fs.h>
14#include <linux/ext4_jbd2.h>
15#include <linux/time.h>
16#include <linux/compat.h>
17#include <linux/smp_lock.h>
18#include <asm/uaccess.h>
19
20int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
21 unsigned long arg)
22{
23 struct ext4_inode_info *ei = EXT4_I(inode);
24 unsigned int flags;
25 unsigned short rsv_window_size;
26
27 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
28
29 switch (cmd) {
30 case EXT4_IOC_GETFLAGS:
31 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
32 return put_user(flags, (int __user *) arg);
33 case EXT4_IOC_SETFLAGS: {
34 handle_t *handle = NULL;
35 int err;
36 struct ext4_iloc iloc;
37 unsigned int oldflags;
38 unsigned int jflag;
39
40 if (IS_RDONLY(inode))
41 return -EROFS;
42
43 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
44 return -EACCES;
45
46 if (get_user(flags, (int __user *) arg))
47 return -EFAULT;
48
49 if (!S_ISDIR(inode->i_mode))
50 flags &= ~EXT4_DIRSYNC_FL;
51
52 mutex_lock(&inode->i_mutex);
53 oldflags = ei->i_flags;
54
55 /* The JOURNAL_DATA flag is modifiable only by root */
56 jflag = flags & EXT4_JOURNAL_DATA_FL;
57
58 /*
59 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
60 * the relevant capability.
61 *
62 * This test looks nicer. Thanks to Pauline Middelink
63 */
64 if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
65 if (!capable(CAP_LINUX_IMMUTABLE)) {
66 mutex_unlock(&inode->i_mutex);
67 return -EPERM;
68 }
69 }
70
71 /*
72 * The JOURNAL_DATA flag can only be changed by
73 * the relevant capability.
74 */
75 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
76 if (!capable(CAP_SYS_RESOURCE)) {
77 mutex_unlock(&inode->i_mutex);
78 return -EPERM;
79 }
80 }
81
82
83 handle = ext4_journal_start(inode, 1);
84 if (IS_ERR(handle)) {
85 mutex_unlock(&inode->i_mutex);
86 return PTR_ERR(handle);
87 }
88 if (IS_SYNC(inode))
89 handle->h_sync = 1;
90 err = ext4_reserve_inode_write(handle, inode, &iloc);
91 if (err)
92 goto flags_err;
93
94 flags = flags & EXT4_FL_USER_MODIFIABLE;
95 flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
96 ei->i_flags = flags;
97
98 ext4_set_inode_flags(inode);
99 inode->i_ctime = CURRENT_TIME_SEC;
100
101 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
102flags_err:
103 ext4_journal_stop(handle);
104 if (err) {
105 mutex_unlock(&inode->i_mutex);
106 return err;
107 }
108
109 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
110 err = ext4_change_inode_journal_flag(inode, jflag);
111 mutex_unlock(&inode->i_mutex);
112 return err;
113 }
114 case EXT4_IOC_GETVERSION:
115 case EXT4_IOC_GETVERSION_OLD:
116 return put_user(inode->i_generation, (int __user *) arg);
117 case EXT4_IOC_SETVERSION:
118 case EXT4_IOC_SETVERSION_OLD: {
119 handle_t *handle;
120 struct ext4_iloc iloc;
121 __u32 generation;
122 int err;
123
124 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
125 return -EPERM;
126 if (IS_RDONLY(inode))
127 return -EROFS;
128 if (get_user(generation, (int __user *) arg))
129 return -EFAULT;
130
131 handle = ext4_journal_start(inode, 1);
132 if (IS_ERR(handle))
133 return PTR_ERR(handle);
134 err = ext4_reserve_inode_write(handle, inode, &iloc);
135 if (err == 0) {
136 inode->i_ctime = CURRENT_TIME_SEC;
137 inode->i_generation = generation;
138 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
139 }
140 ext4_journal_stop(handle);
141 return err;
142 }
143#ifdef CONFIG_JBD_DEBUG
144 case EXT4_IOC_WAIT_FOR_READONLY:
145 /*
146 * This is racy - by the time we're woken up and running,
147 * the superblock could be released. And the module could
148 * have been unloaded. So sue me.
149 *
150 * Returns 1 if it slept, else zero.
151 */
152 {
153 struct super_block *sb = inode->i_sb;
154 DECLARE_WAITQUEUE(wait, current);
155 int ret = 0;
156
157 set_current_state(TASK_INTERRUPTIBLE);
158 add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
159 if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
160 schedule();
161 ret = 1;
162 }
163 remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
164 return ret;
165 }
166#endif
167 case EXT4_IOC_GETRSVSZ:
168 if (test_opt(inode->i_sb, RESERVATION)
169 && S_ISREG(inode->i_mode)
170 && ei->i_block_alloc_info) {
171 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
172 return put_user(rsv_window_size, (int __user *)arg);
173 }
174 return -ENOTTY;
175 case EXT4_IOC_SETRSVSZ: {
176
177 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
178 return -ENOTTY;
179
180 if (IS_RDONLY(inode))
181 return -EROFS;
182
183 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
184 return -EACCES;
185
186 if (get_user(rsv_window_size, (int __user *)arg))
187 return -EFAULT;
188
189 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
190 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
191
192 /*
193 * need to allocate reservation structure for this inode
194		 * before setting the window size
195 */
196 mutex_lock(&ei->truncate_mutex);
197 if (!ei->i_block_alloc_info)
198 ext4_init_block_alloc_info(inode);
199
200 if (ei->i_block_alloc_info){
201 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
202 rsv->rsv_goal_size = rsv_window_size;
203 }
204 mutex_unlock(&ei->truncate_mutex);
205 return 0;
206 }
207 case EXT4_IOC_GROUP_EXTEND: {
208 ext4_fsblk_t n_blocks_count;
209 struct super_block *sb = inode->i_sb;
210 int err;
211
212 if (!capable(CAP_SYS_RESOURCE))
213 return -EPERM;
214
215 if (IS_RDONLY(inode))
216 return -EROFS;
217
218 if (get_user(n_blocks_count, (__u32 __user *)arg))
219 return -EFAULT;
220
221 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
222 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
223 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
224 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
225
226 return err;
227 }
228 case EXT4_IOC_GROUP_ADD: {
229 struct ext4_new_group_data input;
230 struct super_block *sb = inode->i_sb;
231 int err;
232
233 if (!capable(CAP_SYS_RESOURCE))
234 return -EPERM;
235
236 if (IS_RDONLY(inode))
237 return -EROFS;
238
239 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
240 sizeof(input)))
241 return -EFAULT;
242
243 err = ext4_group_add(sb, &input);
244 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
245 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
246 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
247
248 return err;
249 }
250
251 default:
252 return -ENOTTY;
253 }
254}
255
256#ifdef CONFIG_COMPAT
257long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
258{
259 struct inode *inode = file->f_dentry->d_inode;
260 int ret;
261
262 /* These are just misnamed, they actually get/put from/to user an int */
263 switch (cmd) {
264 case EXT4_IOC32_GETFLAGS:
265 cmd = EXT4_IOC_GETFLAGS;
266 break;
267 case EXT4_IOC32_SETFLAGS:
268 cmd = EXT4_IOC_SETFLAGS;
269 break;
270 case EXT4_IOC32_GETVERSION:
271 cmd = EXT4_IOC_GETVERSION;
272 break;
273 case EXT4_IOC32_SETVERSION:
274 cmd = EXT4_IOC_SETVERSION;
275 break;
276 case EXT4_IOC32_GROUP_EXTEND:
277 cmd = EXT4_IOC_GROUP_EXTEND;
278 break;
279 case EXT4_IOC32_GETVERSION_OLD:
280 cmd = EXT4_IOC_GETVERSION_OLD;
281 break;
282 case EXT4_IOC32_SETVERSION_OLD:
283 cmd = EXT4_IOC_SETVERSION_OLD;
284 break;
285#ifdef CONFIG_JBD_DEBUG
286 case EXT4_IOC32_WAIT_FOR_READONLY:
287 cmd = EXT4_IOC_WAIT_FOR_READONLY;
288 break;
289#endif
290 case EXT4_IOC32_GETRSVSZ:
291 cmd = EXT4_IOC_GETRSVSZ;
292 break;
293 case EXT4_IOC32_SETRSVSZ:
294 cmd = EXT4_IOC_SETRSVSZ;
295 break;
296 case EXT4_IOC_GROUP_ADD:
297 break;
298 default:
299 return -ENOIOCTLCMD;
300 }
301 lock_kernel();
302 ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
303 unlock_kernel();
304 return ret;
305}
306#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 000000000000..8b1bd03d20f5
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2395 @@
1/*
2 * linux/fs/ext4/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/fs.h>
28#include <linux/pagemap.h>
29#include <linux/jbd2.h>
30#include <linux/time.h>
31#include <linux/ext4_fs.h>
32#include <linux/ext4_jbd2.h>
33#include <linux/fcntl.h>
34#include <linux/stat.h>
35#include <linux/string.h>
36#include <linux/quotaops.h>
37#include <linux/buffer_head.h>
38#include <linux/bio.h>
39#include <linux/smp_lock.h>
40
41#include "namei.h"
42#include "xattr.h"
43#include "acl.h"
44
45/*
46 * define how far ahead to read directories while searching them.
47 */
48#define NAMEI_RA_CHUNKS 2
49#define NAMEI_RA_BLOCKS 4
50#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
51#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
52
53static struct buffer_head *ext4_append(handle_t *handle,
54 struct inode *inode,
55 u32 *block, int *err)
56{
57 struct buffer_head *bh;
58
59 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
60
61 if ((bh = ext4_bread(handle, inode, *block, 1, err))) {
62 inode->i_size += inode->i_sb->s_blocksize;
63 EXT4_I(inode)->i_disksize = inode->i_size;
64 ext4_journal_get_write_access(handle,bh);
65 }
66 return bh;
67}
68
69#ifndef assert
70#define assert(test) J_ASSERT(test)
71#endif
72
73#ifndef swap
74#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
75#endif
76
77#ifdef DX_DEBUG
78#define dxtrace(command) command
79#else
80#define dxtrace(command)
81#endif
82
83struct fake_dirent
84{
85 __le32 inode;
86 __le16 rec_len;
87 u8 name_len;
88 u8 file_type;
89};
90
91struct dx_countlimit
92{
93 __le16 limit;
94 __le16 count;
95};
96
97struct dx_entry
98{
99 __le32 hash;
100 __le32 block;
101};
102
103/*
104 * dx_root_info is laid out so that if it should somehow get overlaid by a
105 * dirent the two low bits of the hash version will be zero. Therefore, the
106 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
107 */
108
109struct dx_root
110{
111 struct fake_dirent dot;
112 char dot_name[4];
113 struct fake_dirent dotdot;
114 char dotdot_name[4];
115 struct dx_root_info
116 {
117 __le32 reserved_zero;
118 u8 hash_version;
119 u8 info_length; /* 8 */
120 u8 indirect_levels;
121 u8 unused_flags;
122 }
123 info;
124 struct dx_entry entries[0];
125};
126
127struct dx_node
128{
129 struct fake_dirent fake;
130 struct dx_entry entries[0];
131};
132
133
134struct dx_frame
135{
136 struct buffer_head *bh;
137 struct dx_entry *entries;
138 struct dx_entry *at;
139};
140
141struct dx_map_entry
142{
143 u32 hash;
144 u32 offs;
145};
146
147#ifdef CONFIG_EXT4_INDEX
148static inline unsigned dx_get_block (struct dx_entry *entry);
149static void dx_set_block (struct dx_entry *entry, unsigned value);
150static inline unsigned dx_get_hash (struct dx_entry *entry);
151static void dx_set_hash (struct dx_entry *entry, unsigned value);
152static unsigned dx_get_count (struct dx_entry *entries);
153static unsigned dx_get_limit (struct dx_entry *entries);
154static void dx_set_count (struct dx_entry *entries, unsigned value);
155static void dx_set_limit (struct dx_entry *entries, unsigned value);
156static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
157static unsigned dx_node_limit (struct inode *dir);
158static struct dx_frame *dx_probe(struct dentry *dentry,
159 struct inode *dir,
160 struct dx_hash_info *hinfo,
161 struct dx_frame *frame,
162 int *err);
163static void dx_release (struct dx_frame *frames);
164static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
168 struct dx_map_entry *offsets, int count);
169static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
170static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
171static int ext4_htree_next_block(struct inode *dir, __u32 hash,
172 struct dx_frame *frame,
173 struct dx_frame *frames,
174 __u32 *start_hash);
175static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
176 struct ext4_dir_entry_2 **res_dir, int *err);
177static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
178 struct inode *inode);
179
180/*
181 * Future: use high four bits of block for coalesce-on-delete flags
182 * Mask them off for now.
183 */
184
185static inline unsigned dx_get_block (struct dx_entry *entry)
186{
187 return le32_to_cpu(entry->block) & 0x00ffffff;
188}
189
190static inline void dx_set_block (struct dx_entry *entry, unsigned value)
191{
192 entry->block = cpu_to_le32(value);
193}
194
195static inline unsigned dx_get_hash (struct dx_entry *entry)
196{
197 return le32_to_cpu(entry->hash);
198}
199
200static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
201{
202 entry->hash = cpu_to_le32(value);
203}
204
205static inline unsigned dx_get_count (struct dx_entry *entries)
206{
207 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
208}
209
210static inline unsigned dx_get_limit (struct dx_entry *entries)
211{
212 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
213}
214
215static inline void dx_set_count (struct dx_entry *entries, unsigned value)
216{
217 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
218}
219
220static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
221{
222 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
223}
224
225static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
226{
227 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
228 EXT4_DIR_REC_LEN(2) - infosize;
229 return 0? 20: entry_space / sizeof(struct dx_entry);
230}
231
232static inline unsigned dx_node_limit (struct inode *dir)
233{
234 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
235 return 0? 22: entry_space / sizeof(struct dx_entry);
236}
237
238/*
239 * Debug
240 */
241#ifdef DX_DEBUG
242static void dx_show_index (char * label, struct dx_entry *entries)
243{
244 int i, n = dx_get_count (entries);
245 printk("%s index ", label);
246 for (i = 0; i < n; i++) {
247 printk("%x->%u ", i? dx_get_hash(entries + i) :
248 0, dx_get_block(entries + i));
249 }
250 printk("\n");
251}
252
253struct stats
254{
255 unsigned names;
256 unsigned space;
257 unsigned bcount;
258};
259
260static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
261 int size, int show_names)
262{
263 unsigned names = 0, space = 0;
264 char *base = (char *) de;
265 struct dx_hash_info h = *hinfo;
266
267 printk("names: ");
268 while ((char *) de < base + size)
269 {
270 if (de->inode)
271 {
272 if (show_names)
273 {
274 int len = de->name_len;
275 char *name = de->name;
276 while (len--) printk("%c", *name++);
277 ext4fs_dirhash(de->name, de->name_len, &h);
278 printk(":%x.%u ", h.hash,
279 ((char *) de - base));
280 }
281 space += EXT4_DIR_REC_LEN(de->name_len);
282 names++;
283 }
284 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
285 }
286 printk("(%i)\n", names);
287 return (struct stats) { names, space, 1 };
288}
289
290struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
291 struct dx_entry *entries, int levels)
292{
293 unsigned blocksize = dir->i_sb->s_blocksize;
294 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
295 unsigned bcount = 0;
296 struct buffer_head *bh;
297 int err;
298 printk("%i indexed blocks...\n", count);
299 for (i = 0; i < count; i++, entries++)
300 {
301 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
302 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
303 struct stats stats;
304 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
305 if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
306 stats = levels?
307 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
308 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
309 names += stats.names;
310 space += stats.space;
311 bcount += stats.bcount;
312 brelse (bh);
313 }
314 if (bcount)
315 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
316 names, space/bcount,(space/bcount)*100/blocksize);
317 return (struct stats) { names, space, bcount};
318}
319#endif /* DX_DEBUG */
320
321/*
322 * Probe for a directory leaf block to search.
323 *
324 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
325 * error in the directory index, and the caller should fall back to
326 * searching the directory normally. The callers of dx_probe **MUST**
327 * check for this error code, and make sure it never gets reflected
328 * back to userspace.
329 */
330static struct dx_frame *
331dx_probe(struct dentry *dentry, struct inode *dir,
332 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
333{
334 unsigned count, indirect;
335 struct dx_entry *at, *entries, *p, *q, *m;
336 struct dx_root *root;
337 struct buffer_head *bh;
338 struct dx_frame *frame = frame_in;
339 u32 hash;
340
341 frame->bh = NULL;
342 if (dentry)
343 dir = dentry->d_parent->d_inode;
344 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
345 goto fail;
346 root = (struct dx_root *) bh->b_data;
347 if (root->info.hash_version != DX_HASH_TEA &&
348 root->info.hash_version != DX_HASH_HALF_MD4 &&
349 root->info.hash_version != DX_HASH_LEGACY) {
350 ext4_warning(dir->i_sb, __FUNCTION__,
351 "Unrecognised inode hash code %d",
352 root->info.hash_version);
353 brelse(bh);
354 *err = ERR_BAD_DX_DIR;
355 goto fail;
356 }
357 hinfo->hash_version = root->info.hash_version;
358 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
359 if (dentry)
360 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
361 hash = hinfo->hash;
362
363 if (root->info.unused_flags & 1) {
364 ext4_warning(dir->i_sb, __FUNCTION__,
365 "Unimplemented inode hash flags: %#06x",
366 root->info.unused_flags);
367 brelse(bh);
368 *err = ERR_BAD_DX_DIR;
369 goto fail;
370 }
371
372 if ((indirect = root->info.indirect_levels) > 1) {
373 ext4_warning(dir->i_sb, __FUNCTION__,
374 "Unimplemented inode hash depth: %#06x",
375 root->info.indirect_levels);
376 brelse(bh);
377 *err = ERR_BAD_DX_DIR;
378 goto fail;
379 }
380
381 entries = (struct dx_entry *) (((char *)&root->info) +
382 root->info.info_length);
383 assert(dx_get_limit(entries) == dx_root_limit(dir,
384 root->info.info_length));
385 dxtrace (printk("Look up %x", hash));
386 while (1)
387 {
388 count = dx_get_count(entries);
389 assert (count && count <= dx_get_limit(entries));
390 p = entries + 1;
391 q = entries + count - 1;
392 while (p <= q)
393 {
394 m = p + (q - p)/2;
395 dxtrace(printk("."));
396 if (dx_get_hash(m) > hash)
397 q = m - 1;
398 else
399 p = m + 1;
400 }
401
402 if (0) // linear search cross check
403 {
404 unsigned n = count - 1;
405 at = entries;
406 while (n--)
407 {
408 dxtrace(printk(","));
409 if (dx_get_hash(++at) > hash)
410 {
411 at--;
412 break;
413 }
414 }
415 assert (at == p - 1);
416 }
417
418 at = p - 1;
419 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
420 frame->bh = bh;
421 frame->entries = entries;
422 frame->at = at;
423 if (!indirect--) return frame;
424 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
425 goto fail2;
426 at = entries = ((struct dx_node *) bh->b_data)->entries;
427 assert (dx_get_limit(entries) == dx_node_limit (dir));
428 frame++;
429 }
430fail2:
431 while (frame >= frame_in) {
432 brelse(frame->bh);
433 frame--;
434 }
435fail:
436 return NULL;
437}
438
439static void dx_release (struct dx_frame *frames)
440{
441 if (frames[0].bh == NULL)
442 return;
443
444 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
445 brelse(frames[1].bh);
446 brelse(frames[0].bh);
447}
448
449/*
450 * This function increments the frame pointer to search the next leaf
451 * block, and reads in the necessary intervening nodes if the search
452 * should be necessary. Whether or not the search is necessary is
453 * controlled by the hash parameter. If the hash value is even, then
454 * the search is only continued if the next block starts with that
455 * hash value. This is used if we are searching for a specific file.
456 *
457 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
458 *
459 * This function returns 1 if the caller should continue to search,
460 * or 0 if it should not. If there is an error reading one of the
461 * index blocks, it will return a negative error code.
462 *
463 * If start_hash is non-null, it will be filled in with the starting
464 * hash of the next page.
465 */
466static int ext4_htree_next_block(struct inode *dir, __u32 hash,
467 struct dx_frame *frame,
468 struct dx_frame *frames,
469 __u32 *start_hash)
470{
471 struct dx_frame *p;
472 struct buffer_head *bh;
473 int err, num_frames = 0;
474 __u32 bhash;
475
476 p = frame;
477 /*
478 * Find the next leaf page by incrementing the frame pointer.
479 * If we run out of entries in the interior node, loop around and
480 * increment pointer in the parent node. When we break out of
481 * this loop, num_frames indicates the number of interior
482 * nodes that need to be read.
483 */
484 while (1) {
485 if (++(p->at) < p->entries + dx_get_count(p->entries))
486 break;
487 if (p == frames)
488 return 0;
489 num_frames++;
490 p--;
491 }
492
493 /*
494 * If the hash is 1, then continue only if the next page has a
495 * continuation hash of any value. This is used for readdir
496 * handling. Otherwise, check to see if the hash matches the
497 * desired continuation hash. If it doesn't, return since
498 * there's no point in reading the successive index pages.
499 */
500 bhash = dx_get_hash(p->at);
501 if (start_hash)
502 *start_hash = bhash;
503 if ((hash & 1) == 0) {
504 if ((bhash & ~1) != hash)
505 return 0;
506 }
507 /*
508 * If the hash is HASH_NB_ALWAYS, we always go to the next
509 * block so no check is necessary
510 */
511 while (num_frames--) {
512 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
513 0, &err)))
514 return err; /* Failure */
515 p++;
516 brelse (p->bh);
517 p->bh = bh;
518 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
519 }
520 return 1;
521}
522
523
524/*
525 * p is at least 6 bytes before the end of page
526 */
527static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
528{
529 return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
530}
531
532/*
533 * This function fills a red-black tree with information from a
534 * directory block. It returns the number of directory entries loaded
535 * into the tree, or a negative error code.
536 */
537static int htree_dirblock_to_tree(struct file *dir_file,
538 struct inode *dir, int block,
539 struct dx_hash_info *hinfo,
540 __u32 start_hash, __u32 start_minor_hash)
541{
542 struct buffer_head *bh;
543 struct ext4_dir_entry_2 *de, *top;
544 int err, count = 0;
545
546 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
547 if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
548 return err;
549
550 de = (struct ext4_dir_entry_2 *) bh->b_data;
551 top = (struct ext4_dir_entry_2 *) ((char *) de +
552 dir->i_sb->s_blocksize -
553 EXT4_DIR_REC_LEN(0));
554 for (; de < top; de = ext4_next_entry(de)) {
555 ext4fs_dirhash(de->name, de->name_len, hinfo);
556 if ((hinfo->hash < start_hash) ||
557 ((hinfo->hash == start_hash) &&
558 (hinfo->minor_hash < start_minor_hash)))
559 continue;
560 if (de->inode == 0)
561 continue;
562 if ((err = ext4_htree_store_dirent(dir_file,
563 hinfo->hash, hinfo->minor_hash, de)) != 0) {
564 brelse(bh);
565 return err;
566 }
567 count++;
568 }
569 brelse(bh);
570 return count;
571}
572
573
574/*
575 * This function fills a red-black tree with information from a
576 * directory. We start scanning the directory in hash order, starting
577 * at start_hash and start_minor_hash.
578 *
579 * This function returns the number of entries inserted into the tree,
580 * or a negative error code.
581 */
582int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
583 __u32 start_minor_hash, __u32 *next_hash)
584{
585 struct dx_hash_info hinfo;
586 struct ext4_dir_entry_2 *de;
587 struct dx_frame frames[2], *frame;
588 struct inode *dir;
589 int block, err;
590 int count = 0;
591 int ret;
592 __u32 hashval;
593
594 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
595 start_minor_hash));
596 dir = dir_file->f_dentry->d_inode;
597 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
598 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
599 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
600 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
601 start_hash, start_minor_hash);
602 *next_hash = ~0;
603 return count;
604 }
605 hinfo.hash = start_hash;
606 hinfo.minor_hash = 0;
607 frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
608 if (!frame)
609 return err;
610
611 /* Add '.' and '..' from the htree header */
612 if (!start_hash && !start_minor_hash) {
613 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
614 if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
615 goto errout;
616 count++;
617 }
618 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
619 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
620 de = ext4_next_entry(de);
621 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
622 goto errout;
623 count++;
624 }
625
626 while (1) {
627 block = dx_get_block(frame->at);
628 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
629 start_hash, start_minor_hash);
630 if (ret < 0) {
631 err = ret;
632 goto errout;
633 }
634 count += ret;
635 hashval = ~0;
636 ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
637 frame, frames, &hashval);
638 *next_hash = hashval;
639 if (ret < 0) {
640 err = ret;
641 goto errout;
642 }
643 /*
644 * Stop if: (a) there are no more entries, or
645 * (b) we have inserted at least one entry and the
646 * next hash value is not a continuation
647 */
648 if ((ret == 0) ||
649 (count && ((hashval & 1) == 0)))
650 break;
651 }
652 dx_release(frames);
653 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
654 count, *next_hash));
655 return count;
656errout:
657 dx_release(frames);
658 return (err);
659}
660
661
662/*
663 * Directory block splitting, compacting
664 */
665
666static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
667 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
668{
669 int count = 0;
670 char *base = (char *) de;
671 struct dx_hash_info h = *hinfo;
672
673 while ((char *) de < base + size)
674 {
675 if (de->name_len && de->inode) {
676 ext4fs_dirhash(de->name, de->name_len, &h);
677 map_tail--;
678 map_tail->hash = h.hash;
679 map_tail->offs = (u32) ((char *) de - base);
680 count++;
681 cond_resched();
682 }
683 /* XXX: do we need to check rec_len == 0 case? -Chris */
684 de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
685 }
686 return count;
687}
688
689static void dx_sort_map (struct dx_map_entry *map, unsigned count)
690{
691 struct dx_map_entry *p, *q, *top = map + count - 1;
692 int more;
693 /* Combsort until bubble sort doesn't suck */
694 while (count > 2) {
695 count = count*10/13;
696 if (count - 9 < 2) /* 9, 10 -> 11 */
697 count = 11;
698 for (p = top, q = p - count; q >= map; p--, q--)
699 if (p->hash < q->hash)
700 swap(*p, *q);
701 }
702 /* Garden variety bubble sort */
703 do {
704 more = 0;
705 q = top;
706 while (q-- > map) {
707 if (q[1].hash >= q[0].hash)
708 continue;
709 swap(*(q+1), *q);
710 more = 1;
711 }
712 } while(more);
713}
714
715static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
716{
717 struct dx_entry *entries = frame->entries;
718 struct dx_entry *old = frame->at, *new = old + 1;
719 int count = dx_get_count(entries);
720
721 assert(count < dx_get_limit(entries));
722 assert(old < entries + count);
723 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
724 dx_set_hash(new, hash);
725 dx_set_block(new, block);
726 dx_set_count(entries, count + 1);
727}
728#endif
729
730
731static void ext4_update_dx_flag(struct inode *inode)
732{
733 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
734 EXT4_FEATURE_COMPAT_DIR_INDEX))
735 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
736}
737
738/*
739 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
740 *
741 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
742 * `de != NULL' is guaranteed by caller.
743 */
744static inline int ext4_match (int len, const char * const name,
745 struct ext4_dir_entry_2 * de)
746{
747 if (len != de->name_len)
748 return 0;
749 if (!de->inode)
750 return 0;
751 return !memcmp(name, de->name, len);
752}
753
754/*
755 * Returns 0 if not found, -1 on failure, and 1 on success
756 */
757static inline int search_dirblock(struct buffer_head * bh,
758 struct inode *dir,
759 struct dentry *dentry,
760 unsigned long offset,
761 struct ext4_dir_entry_2 ** res_dir)
762{
763 struct ext4_dir_entry_2 * de;
764 char * dlimit;
765 int de_len;
766 const char *name = dentry->d_name.name;
767 int namelen = dentry->d_name.len;
768
769 de = (struct ext4_dir_entry_2 *) bh->b_data;
770 dlimit = bh->b_data + dir->i_sb->s_blocksize;
771 while ((char *) de < dlimit) {
772 /* this code is executed quadratically often */
773 /* do minimal checking `by hand' */
774
775 if ((char *) de + namelen <= dlimit &&
776 ext4_match (namelen, name, de)) {
777 /* found a match - just to be sure, do a full check */
778 if (!ext4_check_dir_entry("ext4_find_entry",
779 dir, de, bh, offset))
780 return -1;
781 *res_dir = de;
782 return 1;
783 }
784 /* prevent looping on a bad block */
785 de_len = le16_to_cpu(de->rec_len);
786 if (de_len <= 0)
787 return -1;
788 offset += de_len;
789 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
790 }
791 return 0;
792}
793
794
795/*
796 * ext4_find_entry()
797 *
798 * finds an entry in the specified directory with the wanted name. It
799 * returns the cache buffer in which the entry was found, and the entry
800 * itself (as a parameter - res_dir). It does NOT read the inode of the
801 * entry - you'll have to do that yourself if you want to.
802 *
803 * The returned buffer_head has ->b_count elevated. The caller is expected
804 * to brelse() it when appropriate.
805 */
806static struct buffer_head * ext4_find_entry (struct dentry *dentry,
807 struct ext4_dir_entry_2 ** res_dir)
808{
809 struct super_block * sb;
810 struct buffer_head * bh_use[NAMEI_RA_SIZE];
811 struct buffer_head * bh, *ret = NULL;
812 unsigned long start, block, b;
813 int ra_max = 0; /* Number of bh's in the readahead
814 buffer, bh_use[] */
815 int ra_ptr = 0; /* Current index into readahead
816 buffer */
817 int num = 0;
818 int nblocks, i, err;
819 struct inode *dir = dentry->d_parent->d_inode;
820 int namelen;
821 const u8 *name;
822 unsigned blocksize;
823
824 *res_dir = NULL;
825 sb = dir->i_sb;
826 blocksize = sb->s_blocksize;
827 namelen = dentry->d_name.len;
828 name = dentry->d_name.name;
829 if (namelen > EXT4_NAME_LEN)
830 return NULL;
831#ifdef CONFIG_EXT4_INDEX
832 if (is_dx(dir)) {
833 bh = ext4_dx_find_entry(dentry, res_dir, &err);
834 /*
835 * On success, or if the error was file not found,
836 * return. Otherwise, fall back to doing a search the
837 * old fashioned way.
838 */
839 if (bh || (err != ERR_BAD_DX_DIR))
840 return bh;
841 dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
842 }
843#endif
844 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
845 start = EXT4_I(dir)->i_dir_start_lookup;
846 if (start >= nblocks)
847 start = 0;
848 block = start;
849restart:
850 do {
851 /*
852 * We deal with the read-ahead logic here.
853 */
854 if (ra_ptr >= ra_max) {
855 /* Refill the readahead buffer */
856 ra_ptr = 0;
857 b = block;
858 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
859 /*
860 * Terminate if we reach the end of the
861 * directory and must wrap, or if our
862 * search has finished at this block.
863 */
864 if (b >= nblocks || (num && block == start)) {
865 bh_use[ra_max] = NULL;
866 break;
867 }
868 num++;
869 bh = ext4_getblk(NULL, dir, b++, 0, &err);
870 bh_use[ra_max] = bh;
871 if (bh)
872 ll_rw_block(READ_META, 1, &bh);
873 }
874 }
875 if ((bh = bh_use[ra_ptr++]) == NULL)
876 goto next;
877 wait_on_buffer(bh);
878 if (!buffer_uptodate(bh)) {
879 /* read error, skip block & hope for the best */
880 ext4_error(sb, __FUNCTION__, "reading directory #%lu "
881 "offset %lu", dir->i_ino, block);
882 brelse(bh);
883 goto next;
884 }
885 i = search_dirblock(bh, dir, dentry,
886 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
887 if (i == 1) {
888 EXT4_I(dir)->i_dir_start_lookup = block;
889 ret = bh;
890 goto cleanup_and_exit;
891 } else {
892 brelse(bh);
893 if (i < 0)
894 goto cleanup_and_exit;
895 }
896 next:
897 if (++block >= nblocks)
898 block = 0;
899 } while (block != start);
900
901 /*
902 * If the directory has grown while we were searching, then
903 * search the last part of the directory before giving up.
904 */
905 block = nblocks;
906 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
907 if (block < nblocks) {
908 start = 0;
909 goto restart;
910 }
911
912cleanup_and_exit:
913 /* Clean up the read-ahead blocks */
914 for (; ra_ptr < ra_max; ra_ptr++)
915 brelse (bh_use[ra_ptr]);
916 return ret;
917}
918
919#ifdef CONFIG_EXT4_INDEX
920static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
921 struct ext4_dir_entry_2 **res_dir, int *err)
922{
923 struct super_block * sb;
924 struct dx_hash_info hinfo;
925 u32 hash;
926 struct dx_frame frames[2], *frame;
927 struct ext4_dir_entry_2 *de, *top;
928 struct buffer_head *bh;
929 unsigned long block;
930 int retval;
931 int namelen = dentry->d_name.len;
932 const u8 *name = dentry->d_name.name;
933 struct inode *dir = dentry->d_parent->d_inode;
934
935 sb = dir->i_sb;
936 /* NFS may look up ".." - look at dx_root directory block */
937 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
938 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
939 return NULL;
940 } else {
941 frame = frames;
942 frame->bh = NULL; /* for dx_release() */
943 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
944 dx_set_block(frame->at, 0); /* dx_root block is 0 */
945 }
946 hash = hinfo.hash;
947 do {
948 block = dx_get_block(frame->at);
949 if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
950 goto errout;
951 de = (struct ext4_dir_entry_2 *) bh->b_data;
952 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
953 EXT4_DIR_REC_LEN(0));
954 for (; de < top; de = ext4_next_entry(de))
955 if (ext4_match (namelen, name, de)) {
956 if (!ext4_check_dir_entry("ext4_find_entry",
957 dir, de, bh,
958 (block<<EXT4_BLOCK_SIZE_BITS(sb))
959 +((char *)de - bh->b_data))) {
960 brelse (bh);
961 goto errout;
962 }
963 *res_dir = de;
964 dx_release (frames);
965 return bh;
966 }
967 brelse (bh);
968 /* Check to see if we should continue to search */
969 retval = ext4_htree_next_block(dir, hash, frame,
970 frames, NULL);
971 if (retval < 0) {
972 ext4_warning(sb, __FUNCTION__,
973 "error reading index page in directory #%lu",
974 dir->i_ino);
975 *err = retval;
976 goto errout;
977 }
978 } while (retval == 1);
979
980 *err = -ENOENT;
981errout:
982 dxtrace(printk("%s not found\n", name));
983 dx_release (frames);
984 return NULL;
985}
986#endif
987
988static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
989{
990 struct inode * inode;
991 struct ext4_dir_entry_2 * de;
992 struct buffer_head * bh;
993
994 if (dentry->d_name.len > EXT4_NAME_LEN)
995 return ERR_PTR(-ENAMETOOLONG);
996
997 bh = ext4_find_entry(dentry, &de);
998 inode = NULL;
999 if (bh) {
1000 unsigned long ino = le32_to_cpu(de->inode);
1001 brelse (bh);
1002 if (!ext4_valid_inum(dir->i_sb, ino)) {
1003 ext4_error(dir->i_sb, "ext4_lookup",
1004 "bad inode number: %lu", ino);
1005 inode = NULL;
1006 } else
1007 inode = iget(dir->i_sb, ino);
1008
1009 if (!inode)
1010 return ERR_PTR(-EACCES);
1011 }
1012 return d_splice_alias(inode, dentry);
1013}
1014
1015
1016struct dentry *ext4_get_parent(struct dentry *child)
1017{
1018 unsigned long ino;
1019 struct dentry *parent;
1020 struct inode *inode;
1021 struct dentry dotdot;
1022 struct ext4_dir_entry_2 * de;
1023 struct buffer_head *bh;
1024
1025 dotdot.d_name.name = "..";
1026 dotdot.d_name.len = 2;
1027 dotdot.d_parent = child; /* confusing, isn't it! */
1028
1029 bh = ext4_find_entry(&dotdot, &de);
1030 inode = NULL;
1031 if (!bh)
1032 return ERR_PTR(-ENOENT);
1033 ino = le32_to_cpu(de->inode);
1034 brelse(bh);
1035
1036 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1037 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1038 "bad inode number: %lu", ino);
1039 inode = NULL;
1040 } else
1041 inode = iget(child->d_inode->i_sb, ino);
1042
1043 if (!inode)
1044 return ERR_PTR(-EACCES);
1045
1046 parent = d_alloc_anon(inode);
1047 if (!parent) {
1048 iput(inode);
1049 parent = ERR_PTR(-ENOMEM);
1050 }
1051 return parent;
1052}
1053
1054#define S_SHIFT 12
1055static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1056 [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
1057 [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
1058 [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
1059 [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
1060 [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
1061 [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
1062 [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
1063};
1064
1065static inline void ext4_set_de_type(struct super_block *sb,
1066 struct ext4_dir_entry_2 *de,
1067 umode_t mode) {
1068 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
1069 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1070}
1071
1072#ifdef CONFIG_EXT4_INDEX
1073static struct ext4_dir_entry_2 *
1074dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1075{
1076 unsigned rec_len = 0;
1077
1078 while (count--) {
1079 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
1080 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1081 memcpy (to, de, rec_len);
1082 ((struct ext4_dir_entry_2 *) to)->rec_len =
1083 cpu_to_le16(rec_len);
1084 de->inode = 0;
1085 map++;
1086 to += rec_len;
1087 }
1088 return (struct ext4_dir_entry_2 *) (to - rec_len);
1089}
1090
1091static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
1092{
1093 struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1094 unsigned rec_len = 0;
1095
1096 prev = to = de;
1097 while ((char*)de < base + size) {
1098 next = (struct ext4_dir_entry_2 *) ((char *) de +
1099 le16_to_cpu(de->rec_len));
1100 if (de->inode && de->name_len) {
1101 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1102 if (de > to)
1103 memmove(to, de, rec_len);
1104 to->rec_len = cpu_to_le16(rec_len);
1105 prev = to;
1106 to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1107 }
1108 de = next;
1109 }
1110 return prev;
1111}
1112
1113static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1114 struct buffer_head **bh,struct dx_frame *frame,
1115 struct dx_hash_info *hinfo, int *error)
1116{
1117 unsigned blocksize = dir->i_sb->s_blocksize;
1118 unsigned count, continued;
1119 struct buffer_head *bh2;
1120 u32 newblock;
1121 u32 hash2;
1122 struct dx_map_entry *map;
1123 char *data1 = (*bh)->b_data, *data2;
1124 unsigned split;
1125 struct ext4_dir_entry_2 *de = NULL, *de2;
1126 int err;
1127
1128 bh2 = ext4_append (handle, dir, &newblock, error);
1129 if (!(bh2)) {
1130 brelse(*bh);
1131 *bh = NULL;
1132 goto errout;
1133 }
1134
1135 BUFFER_TRACE(*bh, "get_write_access");
1136 err = ext4_journal_get_write_access(handle, *bh);
1137 if (err) {
1138 journal_error:
1139 brelse(*bh);
1140 brelse(bh2);
1141 *bh = NULL;
1142 ext4_std_error(dir->i_sb, err);
1143 goto errout;
1144 }
1145 BUFFER_TRACE(frame->bh, "get_write_access");
1146 err = ext4_journal_get_write_access(handle, frame->bh);
1147 if (err)
1148 goto journal_error;
1149
1150 data2 = bh2->b_data;
1151
1152 /* create map in the end of data2 block */
1153 map = (struct dx_map_entry *) (data2 + blocksize);
1154 count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
1155 blocksize, hinfo, map);
1156 map -= count;
1157 split = count/2; // need to adjust to actual middle
1158 dx_sort_map (map, count);
1159 hash2 = map[split].hash;
1160 continued = hash2 == map[split - 1].hash;
1161 dxtrace(printk("Split block %i at %x, %i/%i\n",
1162 dx_get_block(frame->at), hash2, split, count-split));
1163
1164 /* Fancy dance to stay within two buffers */
1165 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1166 de = dx_pack_dirents(data1,blocksize);
1167 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1168 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
1169 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1170 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1171
1172 /* Which block gets the new entry? */
1173 if (hinfo->hash >= hash2)
1174 {
1175 swap(*bh, bh2);
1176 de = de2;
1177 }
1178 dx_insert_block (frame, hash2 + continued, newblock);
1179 err = ext4_journal_dirty_metadata (handle, bh2);
1180 if (err)
1181 goto journal_error;
1182 err = ext4_journal_dirty_metadata (handle, frame->bh);
1183 if (err)
1184 goto journal_error;
1185 brelse (bh2);
1186 dxtrace(dx_show_index ("frame", frame->entries));
1187errout:
1188 return de;
1189}
1190#endif
1191
1192
1193/*
1194 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1195 * it points to a directory entry which is guaranteed to be large
1196 * enough for the new directory entry. If de is NULL, then
1197 * add_dirent_to_buf will attempt to search the directory block for
1198 * space. It will return -ENOSPC if no space is available, -EIO if
1199 * the block is corrupted, and -EEXIST if the directory entry already exists.
1200 *
1201 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1202 * all other cases bh is released.
1203 */
1204static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1205 struct inode *inode, struct ext4_dir_entry_2 *de,
1206 struct buffer_head * bh)
1207{
1208 struct inode *dir = dentry->d_parent->d_inode;
1209 const char *name = dentry->d_name.name;
1210 int namelen = dentry->d_name.len;
1211 unsigned long offset = 0;
1212 unsigned short reclen;
1213 int nlen, rlen, err;
1214 char *top;
1215
1216 reclen = EXT4_DIR_REC_LEN(namelen);
1217 if (!de) {
1218 de = (struct ext4_dir_entry_2 *)bh->b_data;
1219 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1220 while ((char *) de <= top) {
1221 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1222 bh, offset)) {
1223 brelse (bh);
1224 return -EIO;
1225 }
1226 if (ext4_match (namelen, name, de)) {
1227 brelse (bh);
1228 return -EEXIST;
1229 }
1230 nlen = EXT4_DIR_REC_LEN(de->name_len);
1231 rlen = le16_to_cpu(de->rec_len);
1232 if ((de->inode? rlen - nlen: rlen) >= reclen)
1233 break;
1234 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1235 offset += rlen;
1236 }
1237 if ((char *) de > top)
1238 return -ENOSPC;
1239 }
1240 BUFFER_TRACE(bh, "get_write_access");
1241 err = ext4_journal_get_write_access(handle, bh);
1242 if (err) {
1243 ext4_std_error(dir->i_sb, err);
1244 brelse(bh);
1245 return err;
1246 }
1247
1248 /* By now the buffer is marked for journaling */
1249 nlen = EXT4_DIR_REC_LEN(de->name_len);
1250 rlen = le16_to_cpu(de->rec_len);
1251 if (de->inode) {
1252 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1253 de1->rec_len = cpu_to_le16(rlen - nlen);
1254 de->rec_len = cpu_to_le16(nlen);
1255 de = de1;
1256 }
1257 de->file_type = EXT4_FT_UNKNOWN;
1258 if (inode) {
1259 de->inode = cpu_to_le32(inode->i_ino);
1260 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1261 } else
1262 de->inode = 0;
1263 de->name_len = namelen;
1264 memcpy (de->name, name, namelen);
1265 /*
1266 * XXX shouldn't update any times until successful
1267 * completion of syscall, but too many callers depend
1268 * on this.
1269 *
1270 * XXX similarly, too many callers depend on
1271 * ext4_new_inode() setting the times, but error
1272 * recovery deletes the inode, so the worst that can
1273 * happen is that the times are slightly out of date
1274 * and/or different from the directory change time.
1275 */
1276 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1277 ext4_update_dx_flag(dir);
1278 dir->i_version++;
1279 ext4_mark_inode_dirty(handle, dir);
1280 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1281 err = ext4_journal_dirty_metadata(handle, bh);
1282 if (err)
1283 ext4_std_error(dir->i_sb, err);
1284 brelse(bh);
1285 return 0;
1286}
1287
1288#ifdef CONFIG_EXT4_INDEX
1289/*
1290 * This converts a one block unindexed directory to a 3 block indexed
1291 * directory, and adds the dentry to the indexed directory.
1292 */
1293static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1294 struct inode *inode, struct buffer_head *bh)
1295{
1296 struct inode *dir = dentry->d_parent->d_inode;
1297 const char *name = dentry->d_name.name;
1298 int namelen = dentry->d_name.len;
1299 struct buffer_head *bh2;
1300 struct dx_root *root;
1301 struct dx_frame frames[2], *frame;
1302 struct dx_entry *entries;
1303 struct ext4_dir_entry_2 *de, *de2;
1304 char *data1, *top;
1305 unsigned len;
1306 int retval;
1307 unsigned blocksize;
1308 struct dx_hash_info hinfo;
1309 u32 block;
1310 struct fake_dirent *fde;
1311
1312 blocksize = dir->i_sb->s_blocksize;
1313 dxtrace(printk("Creating index\n"));
1314 retval = ext4_journal_get_write_access(handle, bh);
1315 if (retval) {
1316 ext4_std_error(dir->i_sb, retval);
1317 brelse(bh);
1318 return retval;
1319 }
1320 root = (struct dx_root *) bh->b_data;
1321
1322 bh2 = ext4_append (handle, dir, &block, &retval);
1323 if (!(bh2)) {
1324 brelse(bh);
1325 return retval;
1326 }
1327 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
1328 data1 = bh2->b_data;
1329
1330 /* The 0th block becomes the root, move the dirents out */
1331 fde = &root->dotdot;
1332 de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
1333 len = ((char *) root) + blocksize - (char *) de;
1334 memcpy (data1, de, len);
1335 de = (struct ext4_dir_entry_2 *) data1;
1336 top = data1 + len;
1337	while ((char *)(de2 = (void *) de + le16_to_cpu(de->rec_len)) < top)
1338 de = de2;
1339 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1340 /* Initialize the root; the dot dirents already exist */
1341 de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1342 de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
1343 memset (&root->info, 0, sizeof(root->info));
1344 root->info.info_length = sizeof(root->info);
1345 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1346 entries = root->entries;
1347 dx_set_block (entries, 1);
1348 dx_set_count (entries, 1);
1349 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1350
1351 /* Initialize as for dx_probe */
1352 hinfo.hash_version = root->info.hash_version;
1353 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1354 ext4fs_dirhash(name, namelen, &hinfo);
1355 frame = frames;
1356 frame->entries = entries;
1357 frame->at = entries;
1358 frame->bh = bh;
1359 bh = bh2;
1360 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1361 dx_release (frames);
1362 if (!(de))
1363 return retval;
1364
1365 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1366}
1367#endif
1368
1369/*
1370 * ext4_add_entry()
1371 *
1372 * adds a file entry to the specified directory, using the same
1373 * semantics as ext4_find_entry().  It returns 0 on success or a negative errno.
1374 *
1375 * NOTE!! The inode part of 'de' is left at 0 - which means you
1376 * may not sleep between calling this and putting something into
1377 * the entry, as someone else might have used it while you slept.
1378 */
1379static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
1380 struct inode *inode)
1381{
1382 struct inode *dir = dentry->d_parent->d_inode;
1383 unsigned long offset;
1384 struct buffer_head * bh;
1385 struct ext4_dir_entry_2 *de;
1386 struct super_block * sb;
1387 int retval;
1388#ifdef CONFIG_EXT4_INDEX
1389 int dx_fallback=0;
1390#endif
1391 unsigned blocksize;
1392 u32 block, blocks;
1393
1394 sb = dir->i_sb;
1395 blocksize = sb->s_blocksize;
1396 if (!dentry->d_name.len)
1397 return -EINVAL;
1398#ifdef CONFIG_EXT4_INDEX
1399 if (is_dx(dir)) {
1400 retval = ext4_dx_add_entry(handle, dentry, inode);
1401 if (!retval || (retval != ERR_BAD_DX_DIR))
1402 return retval;
1403 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
1404 dx_fallback++;
1405 ext4_mark_inode_dirty(handle, dir);
1406 }
1407#endif
1408 blocks = dir->i_size >> sb->s_blocksize_bits;
1409 for (block = 0, offset = 0; block < blocks; block++) {
1410 bh = ext4_bread(handle, dir, block, 0, &retval);
1411		if (!bh)
1412 return retval;
1413 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1414 if (retval != -ENOSPC)
1415 return retval;
1416
1417#ifdef CONFIG_EXT4_INDEX
1418 if (blocks == 1 && !dx_fallback &&
1419 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1420 return make_indexed_dir(handle, dentry, inode, bh);
1421#endif
1422 brelse(bh);
1423 }
1424 bh = ext4_append(handle, dir, &block, &retval);
1425 if (!bh)
1426 return retval;
1427 de = (struct ext4_dir_entry_2 *) bh->b_data;
1428 de->inode = 0;
1429 de->rec_len = cpu_to_le16(blocksize);
1430 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1431}
1432
1433#ifdef CONFIG_EXT4_INDEX
1434/*
1435 * Returns 0 for success, or a negative error value
1436 */
1437static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1438 struct inode *inode)
1439{
1440 struct dx_frame frames[2], *frame;
1441 struct dx_entry *entries, *at;
1442 struct dx_hash_info hinfo;
1443 struct buffer_head * bh;
1444 struct inode *dir = dentry->d_parent->d_inode;
1445 struct super_block * sb = dir->i_sb;
1446 struct ext4_dir_entry_2 *de;
1447 int err;
1448
1449 frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
1450 if (!frame)
1451 return err;
1452 entries = frame->entries;
1453 at = frame->at;
1454
1455 if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1456 goto cleanup;
1457
1458 BUFFER_TRACE(bh, "get_write_access");
1459 err = ext4_journal_get_write_access(handle, bh);
1460 if (err)
1461 goto journal_error;
1462
1463 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1464 if (err != -ENOSPC) {
1465 bh = NULL;
1466 goto cleanup;
1467 }
1468
1469 /* Block full, should compress but for now just split */
1470 dxtrace(printk("using %u of %u node entries\n",
1471 dx_get_count(entries), dx_get_limit(entries)));
1472 /* Need to split index? */
1473 if (dx_get_count(entries) == dx_get_limit(entries)) {
1474 u32 newblock;
1475 unsigned icount = dx_get_count(entries);
1476 int levels = frame - frames;
1477 struct dx_entry *entries2;
1478 struct dx_node *node2;
1479 struct buffer_head *bh2;
1480
1481 if (levels && (dx_get_count(frames->entries) ==
1482 dx_get_limit(frames->entries))) {
1483 ext4_warning(sb, __FUNCTION__,
1484 "Directory index full!");
1485 err = -ENOSPC;
1486 goto cleanup;
1487 }
1488 bh2 = ext4_append (handle, dir, &newblock, &err);
1489 if (!(bh2))
1490 goto cleanup;
1491 node2 = (struct dx_node *)(bh2->b_data);
1492 entries2 = node2->entries;
1493 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
1494 node2->fake.inode = 0;
1495 BUFFER_TRACE(frame->bh, "get_write_access");
1496 err = ext4_journal_get_write_access(handle, frame->bh);
1497 if (err)
1498 goto journal_error;
1499 if (levels) {
1500 unsigned icount1 = icount/2, icount2 = icount - icount1;
1501 unsigned hash2 = dx_get_hash(entries + icount1);
1502 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1503
1504 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1505 err = ext4_journal_get_write_access(handle,
1506 frames[0].bh);
1507 if (err)
1508 goto journal_error;
1509
1510 memcpy ((char *) entries2, (char *) (entries + icount1),
1511 icount2 * sizeof(struct dx_entry));
1512 dx_set_count (entries, icount1);
1513 dx_set_count (entries2, icount2);
1514 dx_set_limit (entries2, dx_node_limit(dir));
1515
1516 /* Which index block gets the new entry? */
1517 if (at - entries >= icount1) {
1518 frame->at = at = at - entries - icount1 + entries2;
1519 frame->entries = entries = entries2;
1520 swap(frame->bh, bh2);
1521 }
1522 dx_insert_block (frames + 0, hash2, newblock);
1523 dxtrace(dx_show_index ("node", frames[1].entries));
1524 dxtrace(dx_show_index ("node",
1525 ((struct dx_node *) bh2->b_data)->entries));
1526 err = ext4_journal_dirty_metadata(handle, bh2);
1527 if (err)
1528 goto journal_error;
1529 brelse (bh2);
1530 } else {
1531 dxtrace(printk("Creating second level index...\n"));
1532 memcpy((char *) entries2, (char *) entries,
1533 icount * sizeof(struct dx_entry));
1534 dx_set_limit(entries2, dx_node_limit(dir));
1535
1536 /* Set up root */
1537 dx_set_count(entries, 1);
1538 dx_set_block(entries + 0, newblock);
1539 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1540
1541 /* Add new access path frame */
1542 frame = frames + 1;
1543 frame->at = at = at - entries + entries2;
1544 frame->entries = entries = entries2;
1545 frame->bh = bh2;
1546 err = ext4_journal_get_write_access(handle,
1547 frame->bh);
1548 if (err)
1549 goto journal_error;
1550 }
1551 ext4_journal_dirty_metadata(handle, frames[0].bh);
1552 }
1553 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1554 if (!de)
1555 goto cleanup;
1556 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1557 bh = NULL;
1558 goto cleanup;
1559
1560journal_error:
1561 ext4_std_error(dir->i_sb, err);
1562cleanup:
1563 if (bh)
1564 brelse(bh);
1565 dx_release(frames);
1566 return err;
1567}
1568#endif
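
When an interior index block fills up, the split above is plain array arithmetic on the dx_entry pairs: the lower icount1 entries stay put, the upper icount2 are copied into the newly appended block, and the insertion point is re-aimed if it fell in the copied half (hash2, the first hash of the upper half, becomes the new key in the parent). Below is a stand-alone sketch with dx_entry reduced to a hash/block pair; every name and number in it is illustrative, not the kernel's own layout.

/* Sketch of the index split performed in ext4_dx_add_entry(). */
#include <stdio.h>
#include <string.h>

struct entry { unsigned hash, block; };	/* stand-in for struct dx_entry */

int main(void)
{
	struct entry old[8] = {
		{ 0x10, 1 }, { 0x20, 2 }, { 0x30, 3 }, { 0x40, 4 },
		{ 0x50, 5 }, { 0x60, 6 }, { 0x70, 7 }, { 0x80, 8 },
	};
	struct entry new_blk[8];
	unsigned icount = 8, icount1 = icount / 2, icount2 = icount - icount1;
	unsigned at = 6;		/* slot where the new key would land */
	unsigned hash2 = old[icount1].hash;

	/* copy the upper half into the freshly appended index block */
	memcpy(new_blk, old + icount1, icount2 * sizeof(struct entry));

	if (at >= icount1)		/* mirrors "at - entries >= icount1" */
		printf("insert goes to the new block, slot %u; parent key 0x%x\n",
		       at - icount1, hash2);
	else
		printf("insert stays in the old block, slot %u\n", at);
	return 0;
}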
1569
1570/*
1571 * ext4_delete_entry deletes a directory entry by merging it with the
1572 * previous entry
1573 */
1574static int ext4_delete_entry (handle_t *handle,
1575 struct inode * dir,
1576 struct ext4_dir_entry_2 * de_del,
1577 struct buffer_head * bh)
1578{
1579 struct ext4_dir_entry_2 * de, * pde;
1580 int i;
1581
1582 i = 0;
1583 pde = NULL;
1584 de = (struct ext4_dir_entry_2 *) bh->b_data;
1585 while (i < bh->b_size) {
1586 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
1587 return -EIO;
1588 if (de == de_del) {
1589 BUFFER_TRACE(bh, "get_write_access");
1590 ext4_journal_get_write_access(handle, bh);
1591 if (pde)
1592 pde->rec_len =
1593 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1594 le16_to_cpu(de->rec_len));
1595 else
1596 de->inode = 0;
1597 dir->i_version++;
1598 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
1599 ext4_journal_dirty_metadata(handle, bh);
1600 return 0;
1601 }
1602 i += le16_to_cpu(de->rec_len);
1603 pde = de;
1604 de = (struct ext4_dir_entry_2 *)
1605 ((char *) de + le16_to_cpu(de->rec_len));
1606 }
1607 return -ENOENT;
1608}
1609
1610/*
1611 * ext4_mark_inode_dirty is somewhat expensive, so unlike ext2 we
1612 * do not perform it in these functions. We perform it at the call site,
1613 * if it is needed.
1614 */
1615static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
1616{
1617 inc_nlink(inode);
1618}
1619
1620static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
1621{
1622 drop_nlink(inode);
1623}
1624
1625static int ext4_add_nondir(handle_t *handle,
1626 struct dentry *dentry, struct inode *inode)
1627{
1628 int err = ext4_add_entry(handle, dentry, inode);
1629 if (!err) {
1630 ext4_mark_inode_dirty(handle, inode);
1631 d_instantiate(dentry, inode);
1632 return 0;
1633 }
1634 ext4_dec_count(handle, inode);
1635 iput(inode);
1636 return err;
1637}
1638
1639/*
1640 * By the time this is called, we already have created
1641 * the directory cache entry for the new file, but it
1642 * is so far negative - it has no inode.
1643 *
1644 * If the create succeeds, we fill in the inode information
1645 * with d_instantiate().
1646 */
1647static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
1648 struct nameidata *nd)
1649{
1650 handle_t *handle;
1651 struct inode * inode;
1652 int err, retries = 0;
1653
1654retry:
1655 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1656 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1657 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1658 if (IS_ERR(handle))
1659 return PTR_ERR(handle);
1660
1661 if (IS_DIRSYNC(dir))
1662 handle->h_sync = 1;
1663
1664 inode = ext4_new_inode (handle, dir, mode);
1665 err = PTR_ERR(inode);
1666 if (!IS_ERR(inode)) {
1667 inode->i_op = &ext4_file_inode_operations;
1668 inode->i_fop = &ext4_file_operations;
1669 ext4_set_aops(inode);
1670 err = ext4_add_nondir(handle, dentry, inode);
1671 }
1672 ext4_journal_stop(handle);
1673 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1674 goto retry;
1675 return err;
1676}
1677
1678static int ext4_mknod (struct inode * dir, struct dentry *dentry,
1679 int mode, dev_t rdev)
1680{
1681 handle_t *handle;
1682 struct inode *inode;
1683 int err, retries = 0;
1684
1685 if (!new_valid_dev(rdev))
1686 return -EINVAL;
1687
1688retry:
1689 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1690 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1691 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1692 if (IS_ERR(handle))
1693 return PTR_ERR(handle);
1694
1695 if (IS_DIRSYNC(dir))
1696 handle->h_sync = 1;
1697
1698 inode = ext4_new_inode (handle, dir, mode);
1699 err = PTR_ERR(inode);
1700 if (!IS_ERR(inode)) {
1701 init_special_inode(inode, inode->i_mode, rdev);
1702#ifdef CONFIG_EXT4DEV_FS_XATTR
1703 inode->i_op = &ext4_special_inode_operations;
1704#endif
1705 err = ext4_add_nondir(handle, dentry, inode);
1706 }
1707 ext4_journal_stop(handle);
1708 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1709 goto retry;
1710 return err;
1711}
1712
1713static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1714{
1715 handle_t *handle;
1716 struct inode * inode;
1717 struct buffer_head * dir_block;
1718 struct ext4_dir_entry_2 * de;
1719 int err, retries = 0;
1720
1721 if (dir->i_nlink >= EXT4_LINK_MAX)
1722 return -EMLINK;
1723
1724retry:
1725 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1726 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1727 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
1728 if (IS_ERR(handle))
1729 return PTR_ERR(handle);
1730
1731 if (IS_DIRSYNC(dir))
1732 handle->h_sync = 1;
1733
1734 inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
1735 err = PTR_ERR(inode);
1736 if (IS_ERR(inode))
1737 goto out_stop;
1738
1739 inode->i_op = &ext4_dir_inode_operations;
1740 inode->i_fop = &ext4_dir_operations;
1741 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1742 dir_block = ext4_bread (handle, inode, 0, 1, &err);
1743 if (!dir_block) {
1744 drop_nlink(inode); /* is this nlink == 0? */
1745 ext4_mark_inode_dirty(handle, inode);
1746 iput (inode);
1747 goto out_stop;
1748 }
1749 BUFFER_TRACE(dir_block, "get_write_access");
1750 ext4_journal_get_write_access(handle, dir_block);
1751 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
1752 de->inode = cpu_to_le32(inode->i_ino);
1753 de->name_len = 1;
1754 de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
1755 strcpy (de->name, ".");
1756 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1757 de = (struct ext4_dir_entry_2 *)
1758 ((char *) de + le16_to_cpu(de->rec_len));
1759 de->inode = cpu_to_le32(dir->i_ino);
1760 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
1761 de->name_len = 2;
1762 strcpy (de->name, "..");
1763 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1764 inode->i_nlink = 2;
1765 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
1766 ext4_journal_dirty_metadata(handle, dir_block);
1767 brelse (dir_block);
1768 ext4_mark_inode_dirty(handle, inode);
1769 err = ext4_add_entry (handle, dentry, inode);
1770 if (err) {
1771 inode->i_nlink = 0;
1772 ext4_mark_inode_dirty(handle, inode);
1773 iput (inode);
1774 goto out_stop;
1775 }
1776 inc_nlink(dir);
1777 ext4_update_dx_flag(dir);
1778 ext4_mark_inode_dirty(handle, dir);
1779 d_instantiate(dentry, inode);
1780out_stop:
1781 ext4_journal_stop(handle);
1782 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
1783 goto retry;
1784 return err;
1785}
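
The two dirents written above always cover the whole first block: "." gets exactly EXT4_DIR_REC_LEN(1) bytes and ".." takes everything that is left, so the rec_len chain spans the block with no gaps. A small sketch of the resulting layout for a hypothetical 4096-byte block (the macro is restated for illustration only):

/* Layout of a fresh directory block as built by ext4_mkdir(). */
#include <stdio.h>

#define DIR_ROUND		3
#define DIR_REC_LEN(name_len)	(((name_len) + 8 + DIR_ROUND) & ~DIR_ROUND)

int main(void)
{
	unsigned blocksize = 4096;		/* hypothetical block size   */
	unsigned dot = DIR_REC_LEN(1);		/* "."  -> 12 bytes          */
	unsigned dotdot = blocksize - dot;	/* ".." -> the rest: 4084    */

	printf(".  rec_len = %u\n.. rec_len = %u (together: %u)\n",
	       dot, dotdot, dot + dotdot);
	return 0;
}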
1786
1787/*
1788 * routine to check that the specified directory is empty (for rmdir)
1789 */
1790static int empty_dir (struct inode * inode)
1791{
1792 unsigned long offset;
1793 struct buffer_head * bh;
1794 struct ext4_dir_entry_2 * de, * de1;
1795 struct super_block * sb;
1796 int err = 0;
1797
1798 sb = inode->i_sb;
1799 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1800 !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
1801 if (err)
1802 ext4_error(inode->i_sb, __FUNCTION__,
1803 "error %d reading directory #%lu offset 0",
1804 err, inode->i_ino);
1805 else
1806 ext4_warning(inode->i_sb, __FUNCTION__,
1807 "bad directory (dir #%lu) - no data block",
1808 inode->i_ino);
1809 return 1;
1810 }
1811 de = (struct ext4_dir_entry_2 *) bh->b_data;
1812 de1 = (struct ext4_dir_entry_2 *)
1813 ((char *) de + le16_to_cpu(de->rec_len));
1814 if (le32_to_cpu(de->inode) != inode->i_ino ||
1815 !le32_to_cpu(de1->inode) ||
1816 strcmp (".", de->name) ||
1817 strcmp ("..", de1->name)) {
1818 ext4_warning (inode->i_sb, "empty_dir",
1819 "bad directory (dir #%lu) - no `.' or `..'",
1820 inode->i_ino);
1821 brelse (bh);
1822 return 1;
1823 }
1824 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
1825 de = (struct ext4_dir_entry_2 *)
1826 ((char *) de1 + le16_to_cpu(de1->rec_len));
1827 while (offset < inode->i_size ) {
1828 if (!bh ||
1829 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1830 err = 0;
1831 brelse (bh);
1832 bh = ext4_bread (NULL, inode,
1833 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
1834 if (!bh) {
1835 if (err)
1836 ext4_error(sb, __FUNCTION__,
1837 "error %d reading directory"
1838 " #%lu offset %lu",
1839 err, inode->i_ino, offset);
1840 offset += sb->s_blocksize;
1841 continue;
1842 }
1843 de = (struct ext4_dir_entry_2 *) bh->b_data;
1844 }
1845 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1846 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1847 sb->s_blocksize);
1848 offset = (offset | (sb->s_blocksize - 1)) + 1;
1849 continue;
1850 }
1851 if (le32_to_cpu(de->inode)) {
1852 brelse (bh);
1853 return 0;
1854 }
1855 offset += le16_to_cpu(de->rec_len);
1856 de = (struct ext4_dir_entry_2 *)
1857 ((char *) de + le16_to_cpu(de->rec_len));
1858 }
1859 brelse (bh);
1860 return 1;
1861}
1862
1863/* ext4_orphan_add() links an unlinked or truncated inode into a list of
1864 * such inodes, starting at the superblock, in case we crash before the
1865 * file is closed/deleted, or in case the inode truncate spans multiple
1866 * transactions and the last transaction is not recovered after a crash.
1867 *
1868 * At filesystem recovery time, we walk this list deleting unlinked
1869 * inodes and truncating linked inodes in ext4_orphan_cleanup().
1870 */
1871int ext4_orphan_add(handle_t *handle, struct inode *inode)
1872{
1873 struct super_block *sb = inode->i_sb;
1874 struct ext4_iloc iloc;
1875 int err = 0, rc;
1876
1877 lock_super(sb);
1878 if (!list_empty(&EXT4_I(inode)->i_orphan))
1879 goto out_unlock;
1880
1881 /* Orphan handling is only valid for files with data blocks
1882 * being truncated, or files being unlinked. */
1883
1884 /* @@@ FIXME: Observation from aviro:
1885 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block
1886 * here (on lock_super()), so race with ext4_link() which might bump
1887 * ->i_nlink. Take, say, a character device: not a regular file,
1888 * not a directory, not a symlink and ->i_nlink > 0.
1889 */
1890 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1891 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1892
1893 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
1894 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
1895 if (err)
1896 goto out_unlock;
1897
1898 err = ext4_reserve_inode_write(handle, inode, &iloc);
1899 if (err)
1900 goto out_unlock;
1901
1902 /* Insert this inode at the head of the on-disk orphan list... */
1903 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1904 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1905 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1906 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1907 if (!err)
1908 err = rc;
1909
1910 /* Only add to the head of the in-memory list if all the
1911 * previous operations succeeded. If the orphan_add is going to
1912 * fail (possibly taking the journal offline), we can't risk
1913 * leaving the inode on the orphan list: stray orphan-list
1914 * entries can cause panics at unmount time.
1915 *
1916 * This is safe: on error we're going to ignore the orphan list
1917 * anyway on the next recovery. */
1918 if (!err)
1919 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1920
1921 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1922 jbd_debug(4, "orphan inode %lu will point to %d\n",
1923 inode->i_ino, NEXT_ORPHAN(inode));
1924out_unlock:
1925 unlock_super(sb);
1926 ext4_std_error(inode->i_sb, err);
1927 return err;
1928}
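
On disk the orphan list is a singly-linked stack: s_last_orphan in the superblock names the most recently added orphan, and each orphan's NEXT_ORPHAN() slot (a macro defined earlier in this file) names the orphan added before it. A minimal user-space sketch of the push performed above; the stub structures are stand-ins, not the real on-disk layout.

/* Sketch of the on-disk orphan list manipulated by ext4_orphan_add(). */
#include <stdio.h>

struct sb_stub    { unsigned s_last_orphan; };		/* head of the list */
struct inode_stub { unsigned ino, next_orphan; };	/* NEXT_ORPHAN slot */

static void orphan_add(struct sb_stub *sb, struct inode_stub *inode)
{
	inode->next_orphan = sb->s_last_orphan;	/* NEXT_ORPHAN(inode) = old head */
	sb->s_last_orphan = inode->ino;		/* superblock now points at us   */
}

int main(void)
{
	struct sb_stub sb = { 0 };
	struct inode_stub a = { 12, 0 }, b = { 97, 0 };

	orphan_add(&sb, &a);
	orphan_add(&sb, &b);
	printf("head = %u, chain: %u -> %u -> %u (0 ends the list)\n",
	       sb.s_last_orphan, b.ino, b.next_orphan, a.next_orphan);
	return 0;
}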
1929
1930/*
1931 * ext4_orphan_del() removes an unlinked or truncated inode from the list
1932 * of such inodes stored on disk, because it is finally being cleaned up.
1933 */
1934int ext4_orphan_del(handle_t *handle, struct inode *inode)
1935{
1936 struct list_head *prev;
1937 struct ext4_inode_info *ei = EXT4_I(inode);
1938 struct ext4_sb_info *sbi;
1939 unsigned long ino_next;
1940 struct ext4_iloc iloc;
1941 int err = 0;
1942
1943 lock_super(inode->i_sb);
1944 if (list_empty(&ei->i_orphan)) {
1945 unlock_super(inode->i_sb);
1946 return 0;
1947 }
1948
1949 ino_next = NEXT_ORPHAN(inode);
1950 prev = ei->i_orphan.prev;
1951 sbi = EXT4_SB(inode->i_sb);
1952
1953 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
1954
1955 list_del_init(&ei->i_orphan);
1956
1957 /* If we're on an error path, we may not have a valid
1958 * transaction handle with which to update the orphan list on
1959 * disk, but we still need to remove the inode from the linked
1960 * list in memory. */
1961 if (!handle)
1962 goto out;
1963
1964 err = ext4_reserve_inode_write(handle, inode, &iloc);
1965 if (err)
1966 goto out_err;
1967
1968 if (prev == &sbi->s_orphan) {
1969 jbd_debug(4, "superblock will point to %lu\n", ino_next);
1970 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
1971 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1972 if (err)
1973 goto out_brelse;
1974 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
1975 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
1976 } else {
1977 struct ext4_iloc iloc2;
1978 struct inode *i_prev =
1979 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
1980
1981 jbd_debug(4, "orphan inode %lu will point to %lu\n",
1982 i_prev->i_ino, ino_next);
1983 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
1984 if (err)
1985 goto out_brelse;
1986 NEXT_ORPHAN(i_prev) = ino_next;
1987 err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
1988 }
1989 if (err)
1990 goto out_brelse;
1991 NEXT_ORPHAN(inode) = 0;
1992 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
1993
1994out_err:
1995 ext4_std_error(inode->i_sb, err);
1996out:
1997 unlock_super(inode->i_sb);
1998 return err;
1999
2000out_brelse:
2001 brelse(iloc.bh);
2002 goto out_err;
2003}
2004
2005static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
2006{
2007 int retval;
2008 struct inode * inode;
2009 struct buffer_head * bh;
2010 struct ext4_dir_entry_2 * de;
2011 handle_t *handle;
2012
2013 /* Initialize quotas before so that eventual writes go in
2014 * separate transaction */
2015 DQUOT_INIT(dentry->d_inode);
2016 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2017 if (IS_ERR(handle))
2018 return PTR_ERR(handle);
2019
2020 retval = -ENOENT;
2021 bh = ext4_find_entry (dentry, &de);
2022 if (!bh)
2023 goto end_rmdir;
2024
2025 if (IS_DIRSYNC(dir))
2026 handle->h_sync = 1;
2027
2028 inode = dentry->d_inode;
2029
2030 retval = -EIO;
2031 if (le32_to_cpu(de->inode) != inode->i_ino)
2032 goto end_rmdir;
2033
2034 retval = -ENOTEMPTY;
2035 if (!empty_dir (inode))
2036 goto end_rmdir;
2037
2038 retval = ext4_delete_entry(handle, dir, de, bh);
2039 if (retval)
2040 goto end_rmdir;
2041 if (inode->i_nlink != 2)
2042 ext4_warning (inode->i_sb, "ext4_rmdir",
2043 "empty directory has nlink!=2 (%d)",
2044 inode->i_nlink);
2045 inode->i_version++;
2046 clear_nlink(inode);
2047 /* There's no need to set i_disksize: the fact that i_nlink is
2048 * zero will ensure that the right thing happens during any
2049 * recovery. */
2050 inode->i_size = 0;
2051 ext4_orphan_add(handle, inode);
2052 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2053 ext4_mark_inode_dirty(handle, inode);
2054 drop_nlink(dir);
2055 ext4_update_dx_flag(dir);
2056 ext4_mark_inode_dirty(handle, dir);
2057
2058end_rmdir:
2059 ext4_journal_stop(handle);
2060 brelse (bh);
2061 return retval;
2062}
2063
2064static int ext4_unlink(struct inode * dir, struct dentry *dentry)
2065{
2066 int retval;
2067 struct inode * inode;
2068 struct buffer_head * bh;
2069 struct ext4_dir_entry_2 * de;
2070 handle_t *handle;
2071
2072 /* Initialize quotas before so that eventual writes go
2073 * in separate transaction */
2074 DQUOT_INIT(dentry->d_inode);
2075 handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2076 if (IS_ERR(handle))
2077 return PTR_ERR(handle);
2078
2079 if (IS_DIRSYNC(dir))
2080 handle->h_sync = 1;
2081
2082 retval = -ENOENT;
2083 bh = ext4_find_entry (dentry, &de);
2084 if (!bh)
2085 goto end_unlink;
2086
2087 inode = dentry->d_inode;
2088
2089 retval = -EIO;
2090 if (le32_to_cpu(de->inode) != inode->i_ino)
2091 goto end_unlink;
2092
2093 if (!inode->i_nlink) {
2094 ext4_warning (inode->i_sb, "ext4_unlink",
2095 "Deleting nonexistent file (%lu), %d",
2096 inode->i_ino, inode->i_nlink);
2097 inode->i_nlink = 1;
2098 }
2099 retval = ext4_delete_entry(handle, dir, de, bh);
2100 if (retval)
2101 goto end_unlink;
2102 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2103 ext4_update_dx_flag(dir);
2104 ext4_mark_inode_dirty(handle, dir);
2105 drop_nlink(inode);
2106 if (!inode->i_nlink)
2107 ext4_orphan_add(handle, inode);
2108 inode->i_ctime = dir->i_ctime;
2109 ext4_mark_inode_dirty(handle, inode);
2110 retval = 0;
2111
2112end_unlink:
2113 ext4_journal_stop(handle);
2114 brelse (bh);
2115 return retval;
2116}
2117
2118static int ext4_symlink (struct inode * dir,
2119 struct dentry *dentry, const char * symname)
2120{
2121 handle_t *handle;
2122 struct inode * inode;
2123 int l, err, retries = 0;
2124
2125 l = strlen(symname)+1;
2126 if (l > dir->i_sb->s_blocksize)
2127 return -ENAMETOOLONG;
2128
2129retry:
2130 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2131 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2132 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
2133 if (IS_ERR(handle))
2134 return PTR_ERR(handle);
2135
2136 if (IS_DIRSYNC(dir))
2137 handle->h_sync = 1;
2138
2139 inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
2140 err = PTR_ERR(inode);
2141 if (IS_ERR(inode))
2142 goto out_stop;
2143
2144 if (l > sizeof (EXT4_I(inode)->i_data)) {
2145 inode->i_op = &ext4_symlink_inode_operations;
2146 ext4_set_aops(inode);
2147 /*
2148 * page_symlink() calls into ext4_prepare/commit_write.
2149 * We have a transaction open. All is sweetness. It also sets
2150 * i_size in generic_commit_write().
2151 */
2152 err = __page_symlink(inode, symname, l,
2153 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2154 if (err) {
2155 ext4_dec_count(handle, inode);
2156 ext4_mark_inode_dirty(handle, inode);
2157 iput (inode);
2158 goto out_stop;
2159 }
2160 } else {
2161 inode->i_op = &ext4_fast_symlink_inode_operations;
2162		memcpy((char *) &EXT4_I(inode)->i_data, symname, l);
2163 inode->i_size = l-1;
2164 }
2165 EXT4_I(inode)->i_disksize = inode->i_size;
2166 err = ext4_add_nondir(handle, dentry, inode);
2167out_stop:
2168 ext4_journal_stop(handle);
2169 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2170 goto retry;
2171 return err;
2172}
2173
2174static int ext4_link (struct dentry * old_dentry,
2175 struct inode * dir, struct dentry *dentry)
2176{
2177 handle_t *handle;
2178 struct inode *inode = old_dentry->d_inode;
2179 int err, retries = 0;
2180
2181 if (inode->i_nlink >= EXT4_LINK_MAX)
2182 return -EMLINK;
2183
2184retry:
2185 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2186 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
2187 if (IS_ERR(handle))
2188 return PTR_ERR(handle);
2189
2190 if (IS_DIRSYNC(dir))
2191 handle->h_sync = 1;
2192
2193 inode->i_ctime = CURRENT_TIME_SEC;
2194 ext4_inc_count(handle, inode);
2195 atomic_inc(&inode->i_count);
2196
2197 err = ext4_add_nondir(handle, dentry, inode);
2198 ext4_journal_stop(handle);
2199 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2200 goto retry;
2201 return err;
2202}
2203
2204#define PARENT_INO(buffer) \
2205 ((struct ext4_dir_entry_2 *) ((char *) buffer + \
2206 le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
2207
2208/*
2209 * Anybody can rename anything with this: the permission checks are left to the
2210 * higher-level routines.
2211 */
2212static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
2213 struct inode * new_dir,struct dentry *new_dentry)
2214{
2215 handle_t *handle;
2216 struct inode * old_inode, * new_inode;
2217 struct buffer_head * old_bh, * new_bh, * dir_bh;
2218 struct ext4_dir_entry_2 * old_de, * new_de;
2219 int retval;
2220
2221 old_bh = new_bh = dir_bh = NULL;
2222
2223 /* Initialize quotas before so that eventual writes go
2224 * in separate transaction */
2225 if (new_dentry->d_inode)
2226 DQUOT_INIT(new_dentry->d_inode);
2227 handle = ext4_journal_start(old_dir, 2 *
2228 EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2229 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
2230 if (IS_ERR(handle))
2231 return PTR_ERR(handle);
2232
2233 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2234 handle->h_sync = 1;
2235
2236 old_bh = ext4_find_entry (old_dentry, &old_de);
2237 /*
2238	 * The check of the inode number is _not_ due to possible IO errors.
2239 * We might rmdir the source, keep it as pwd of some process
2240 * and merrily kill the link to whatever was created under the
2241 * same name. Goodbye sticky bit ;-<
2242 */
2243 old_inode = old_dentry->d_inode;
2244 retval = -ENOENT;
2245 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2246 goto end_rename;
2247
2248 new_inode = new_dentry->d_inode;
2249 new_bh = ext4_find_entry (new_dentry, &new_de);
2250 if (new_bh) {
2251 if (!new_inode) {
2252 brelse (new_bh);
2253 new_bh = NULL;
2254 }
2255 }
2256 if (S_ISDIR(old_inode->i_mode)) {
2257 if (new_inode) {
2258 retval = -ENOTEMPTY;
2259 if (!empty_dir (new_inode))
2260 goto end_rename;
2261 }
2262 retval = -EIO;
2263 dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
2264 if (!dir_bh)
2265 goto end_rename;
2266 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2267 goto end_rename;
2268 retval = -EMLINK;
2269 if (!new_inode && new_dir!=old_dir &&
2270 new_dir->i_nlink >= EXT4_LINK_MAX)
2271 goto end_rename;
2272 }
2273 if (!new_bh) {
2274 retval = ext4_add_entry (handle, new_dentry, old_inode);
2275 if (retval)
2276 goto end_rename;
2277 } else {
2278 BUFFER_TRACE(new_bh, "get write access");
2279 ext4_journal_get_write_access(handle, new_bh);
2280 new_de->inode = cpu_to_le32(old_inode->i_ino);
2281 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2282 EXT4_FEATURE_INCOMPAT_FILETYPE))
2283 new_de->file_type = old_de->file_type;
2284 new_dir->i_version++;
2285 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
2286 ext4_journal_dirty_metadata(handle, new_bh);
2287 brelse(new_bh);
2288 new_bh = NULL;
2289 }
2290
2291 /*
2292 * Like most other Unix systems, set the ctime for inodes on a
2293 * rename.
2294 */
2295 old_inode->i_ctime = CURRENT_TIME_SEC;
2296 ext4_mark_inode_dirty(handle, old_inode);
2297
2298 /*
2299 * ok, that's it
2300 */
2301 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2302 old_de->name_len != old_dentry->d_name.len ||
2303 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2304 (retval = ext4_delete_entry(handle, old_dir,
2305 old_de, old_bh)) == -ENOENT) {
2306 /* old_de could have moved from under us during htree split, so
2307 * make sure that we are deleting the right entry. We might
2308 * also be pointing to a stale entry in the unused part of
2309 * old_bh so just checking inum and the name isn't enough. */
2310 struct buffer_head *old_bh2;
2311 struct ext4_dir_entry_2 *old_de2;
2312
2313 old_bh2 = ext4_find_entry(old_dentry, &old_de2);
2314 if (old_bh2) {
2315 retval = ext4_delete_entry(handle, old_dir,
2316 old_de2, old_bh2);
2317 brelse(old_bh2);
2318 }
2319 }
2320 if (retval) {
2321 ext4_warning(old_dir->i_sb, "ext4_rename",
2322 "Deleting old file (%lu), %d, error=%d",
2323 old_dir->i_ino, old_dir->i_nlink, retval);
2324 }
2325
2326 if (new_inode) {
2327 drop_nlink(new_inode);
2328 new_inode->i_ctime = CURRENT_TIME_SEC;
2329 }
2330 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2331 ext4_update_dx_flag(old_dir);
2332 if (dir_bh) {
2333 BUFFER_TRACE(dir_bh, "get_write_access");
2334 ext4_journal_get_write_access(handle, dir_bh);
2335 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2336 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
2337 ext4_journal_dirty_metadata(handle, dir_bh);
2338 drop_nlink(old_dir);
2339 if (new_inode) {
2340 drop_nlink(new_inode);
2341 } else {
2342 inc_nlink(new_dir);
2343 ext4_update_dx_flag(new_dir);
2344 ext4_mark_inode_dirty(handle, new_dir);
2345 }
2346 }
2347 ext4_mark_inode_dirty(handle, old_dir);
2348 if (new_inode) {
2349 ext4_mark_inode_dirty(handle, new_inode);
2350 if (!new_inode->i_nlink)
2351 ext4_orphan_add(handle, new_inode);
2352 }
2353 retval = 0;
2354
2355end_rename:
2356 brelse (dir_bh);
2357 brelse (old_bh);
2358 brelse (new_bh);
2359 ext4_journal_stop(handle);
2360 return retval;
2361}
2362
2363/*
2364 * directories can handle most operations...
2365 */
2366struct inode_operations ext4_dir_inode_operations = {
2367 .create = ext4_create,
2368 .lookup = ext4_lookup,
2369 .link = ext4_link,
2370 .unlink = ext4_unlink,
2371 .symlink = ext4_symlink,
2372 .mkdir = ext4_mkdir,
2373 .rmdir = ext4_rmdir,
2374 .mknod = ext4_mknod,
2375 .rename = ext4_rename,
2376 .setattr = ext4_setattr,
2377#ifdef CONFIG_EXT4DEV_FS_XATTR
2378 .setxattr = generic_setxattr,
2379 .getxattr = generic_getxattr,
2380 .listxattr = ext4_listxattr,
2381 .removexattr = generic_removexattr,
2382#endif
2383 .permission = ext4_permission,
2384};
2385
2386struct inode_operations ext4_special_inode_operations = {
2387 .setattr = ext4_setattr,
2388#ifdef CONFIG_EXT4DEV_FS_XATTR
2389 .setxattr = generic_setxattr,
2390 .getxattr = generic_getxattr,
2391 .listxattr = ext4_listxattr,
2392 .removexattr = generic_removexattr,
2393#endif
2394 .permission = ext4_permission,
2395};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
new file mode 100644
index 000000000000..5e4dfff36a00
--- /dev/null
+++ b/fs/ext4/namei.h
@@ -0,0 +1,8 @@
1/* linux/fs/ext4/namei.h
2 *
3 * Copyright (C) 2005 Simtec Electronics
4 * Ben Dooks <ben@simtec.co.uk>
5 *
6*/
7
8extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 000000000000..4fe49c3661b2
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1050 @@
1/*
2 * linux/fs/ext4/resize.c
3 *
4 * Support for resizing an ext4 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often in use.
9 */
10
11
12#define EXT4FS_DEBUG
13
14#include <linux/sched.h>
15#include <linux/smp_lock.h>
16#include <linux/ext4_jbd2.h>
17
18#include <linux/errno.h>
19#include <linux/slab.h>
20
21
22#define outside(b, first, last) ((b) < (first) || (b) >= (last))
23#define inside(b, first, last) ((b) >= (first) && (b) < (last))
24
25static int verify_group_input(struct super_block *sb,
26 struct ext4_new_group_data *input)
27{
28 struct ext4_sb_info *sbi = EXT4_SB(sb);
29 struct ext4_super_block *es = sbi->s_es;
30 ext4_fsblk_t start = ext4_blocks_count(es);
31 ext4_fsblk_t end = start + input->blocks_count;
32 unsigned group = input->group;
33 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
34 unsigned overhead = ext4_bg_has_super(sb, group) ?
35 (1 + ext4_bg_num_gdb(sb, group) +
36 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
37 ext4_fsblk_t metaend = start + overhead;
38 struct buffer_head *bh = NULL;
39 ext4_grpblk_t free_blocks_count, offset;
40 int err = -EINVAL;
41
42 input->free_blocks_count = free_blocks_count =
43 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
44
45 if (test_opt(sb, DEBUG))
46 printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
47 "(%d free, %u reserved)\n",
48 ext4_bg_has_super(sb, input->group) ? "normal" :
49 "no-super", input->group, input->blocks_count,
50 free_blocks_count, input->reserved_blocks);
51
52 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
53 if (group != sbi->s_groups_count)
54 ext4_warning(sb, __FUNCTION__,
55 "Cannot add at group %u (only %lu groups)",
56 input->group, sbi->s_groups_count);
57 else if (offset != 0)
58 ext4_warning(sb, __FUNCTION__, "Last group not full");
59 else if (input->reserved_blocks > input->blocks_count / 5)
60 ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
61 input->reserved_blocks);
62 else if (free_blocks_count < 0)
63 ext4_warning(sb, __FUNCTION__, "Bad blocks count %u",
64 input->blocks_count);
65 else if (!(bh = sb_bread(sb, end - 1)))
66 ext4_warning(sb, __FUNCTION__,
67 "Cannot read last block (%llu)",
68 end - 1);
69 else if (outside(input->block_bitmap, start, end))
70 ext4_warning(sb, __FUNCTION__,
71 "Block bitmap not in group (block %llu)",
72 (unsigned long long)input->block_bitmap);
73 else if (outside(input->inode_bitmap, start, end))
74 ext4_warning(sb, __FUNCTION__,
75 "Inode bitmap not in group (block %llu)",
76 (unsigned long long)input->inode_bitmap);
77 else if (outside(input->inode_table, start, end) ||
78 outside(itend - 1, start, end))
79 ext4_warning(sb, __FUNCTION__,
80 "Inode table not in group (blocks %llu-%llu)",
81 (unsigned long long)input->inode_table, itend - 1);
82 else if (input->inode_bitmap == input->block_bitmap)
83 ext4_warning(sb, __FUNCTION__,
84 "Block bitmap same as inode bitmap (%llu)",
85 (unsigned long long)input->block_bitmap);
86 else if (inside(input->block_bitmap, input->inode_table, itend))
87 ext4_warning(sb, __FUNCTION__,
88 "Block bitmap (%llu) in inode table (%llu-%llu)",
89 (unsigned long long)input->block_bitmap,
90 (unsigned long long)input->inode_table, itend - 1);
91 else if (inside(input->inode_bitmap, input->inode_table, itend))
92 ext4_warning(sb, __FUNCTION__,
93 "Inode bitmap (%llu) in inode table (%llu-%llu)",
94 (unsigned long long)input->inode_bitmap,
95 (unsigned long long)input->inode_table, itend - 1);
96 else if (inside(input->block_bitmap, start, metaend))
97 ext4_warning(sb, __FUNCTION__,
98 "Block bitmap (%llu) in GDT table"
99 " (%llu-%llu)",
100 (unsigned long long)input->block_bitmap,
101 start, metaend - 1);
102 else if (inside(input->inode_bitmap, start, metaend))
103 ext4_warning(sb, __FUNCTION__,
104 "Inode bitmap (%llu) in GDT table"
105 " (%llu-%llu)",
106 (unsigned long long)input->inode_bitmap,
107 start, metaend - 1);
108 else if (inside(input->inode_table, start, metaend) ||
109 inside(itend - 1, start, metaend))
110 ext4_warning(sb, __FUNCTION__,
111 "Inode table (%llu-%llu) overlaps"
112 "GDT table (%llu-%llu)",
113 (unsigned long long)input->inode_table,
114 itend - 1, start, metaend - 1);
115 else
116 err = 0;
117 brelse(bh);
118
119 return err;
120}
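
free_blocks_count above is simply the size of the new group minus the metadata every group must carry: its two bitmap blocks, its inode table, and, when the group holds a superblock backup, the backup superblock plus the group descriptor and reserved GDT blocks. A worked example of the same arithmetic with hypothetical but typical numbers (4 KiB blocks, 32768 blocks per group):

/* Worked example of verify_group_input()'s overhead arithmetic. */
#include <stdio.h>

int main(void)
{
	unsigned blocks_count  = 32768;	/* blocks handed to the new group     */
	unsigned itb_per_group = 512;	/* inode table blocks (hypothetical)  */
	unsigned gdb_blocks    = 1;	/* group descriptor blocks            */
	unsigned reserved_gdt  = 128;	/* reserved GDT blocks (hypothetical) */
	int	 has_super     = 1;	/* this group carries a backup sb     */

	unsigned overhead = has_super ? 1 + gdb_blocks + reserved_gdt : 0;
	unsigned free_blocks = blocks_count - 2 - overhead - itb_per_group;

	printf("overhead = %u, free_blocks_count = %u\n", overhead, free_blocks);
	return 0;
}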
121
122static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
123 ext4_fsblk_t blk)
124{
125 struct buffer_head *bh;
126 int err;
127
128 bh = sb_getblk(sb, blk);
129 if (!bh)
130 return ERR_PTR(-EIO);
131 if ((err = ext4_journal_get_write_access(handle, bh))) {
132 brelse(bh);
133 bh = ERR_PTR(err);
134 } else {
135 lock_buffer(bh);
136 memset(bh->b_data, 0, sb->s_blocksize);
137 set_buffer_uptodate(bh);
138 unlock_buffer(bh);
139 }
140
141 return bh;
142}
143
144/*
145 * To avoid calling the atomic setbit hundreds or thousands of times, we only
146 * need to use it within a single byte (to ensure we get endianness right).
147 * We can use memset for the rest of the bitmap as there are no other users.
148 */
149static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
150{
151 int i;
152
153 if (start_bit >= end_bit)
154 return;
155
156 ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
157 for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
158 ext4_set_bit(i, bitmap);
159 if (i < end_bit)
160 memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
161}
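
mark_bitmap_end() pads out the tail of a bitmap whose meaningful bits stop short of a whole group's worth: individual set-bit calls up to the next byte boundary, then one memset for the remaining whole bytes. A user-space rendition of the same loop (ext4_set_bit replaced by a plain little-endian helper; sizes made up) makes it easy to see which bits end up set:

/* User-space rendition of mark_bitmap_end(): mark [start_bit, end_bit) used. */
#include <stdio.h>
#include <string.h>

static void set_bit_le(int nr, unsigned char *map)
{
	map[nr >> 3] |= 1 << (nr & 7);
}

static void mark_end(int start_bit, int end_bit, unsigned char *bitmap)
{
	int i;

	if (start_bit >= end_bit)
		return;
	for (i = start_bit; i < ((start_bit + 7) & ~7); i++)
		set_bit_le(i, bitmap);		/* bit-at-a-time to the byte edge */
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}

int main(void)
{
	unsigned char bitmap[8] = { 0 };	/* a 64-bit toy bitmap            */

	mark_end(13, 64, bitmap);		/* pretend only 13 bits are real  */
	printf("byte 1 = %#x, byte 2 = %#x\n", bitmap[1], bitmap[2]);
	return 0;
}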
162
163/*
164 * Set up the block and inode bitmaps, and the inode table for the new group.
165 * This doesn't need to be part of the main transaction, since we are only
166 * changing blocks outside the actual filesystem. We still do journaling to
167 * ensure the recovery is correct in case of a failure just after resize.
168 * If any part of this fails, we simply abort the resize.
169 */
170static int setup_new_group_blocks(struct super_block *sb,
171 struct ext4_new_group_data *input)
172{
173 struct ext4_sb_info *sbi = EXT4_SB(sb);
174 ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
175 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
176 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
177 unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
178 struct buffer_head *bh;
179 handle_t *handle;
180 ext4_fsblk_t block;
181 ext4_grpblk_t bit;
182 int i;
183 int err = 0, err2;
184
185 handle = ext4_journal_start_sb(sb, reserved_gdb + gdblocks +
186 2 + sbi->s_itb_per_group);
187 if (IS_ERR(handle))
188 return PTR_ERR(handle);
189
190 lock_super(sb);
191 if (input->group != sbi->s_groups_count) {
192 err = -EBUSY;
193 goto exit_journal;
194 }
195
196 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
197 err = PTR_ERR(bh);
198 goto exit_journal;
199 }
200
201 if (ext4_bg_has_super(sb, input->group)) {
202 ext4_debug("mark backup superblock %#04lx (+0)\n", start);
203 ext4_set_bit(0, bh->b_data);
204 }
205
206 /* Copy all of the GDT blocks into the backup in this group */
207 for (i = 0, bit = 1, block = start + 1;
208 i < gdblocks; i++, block++, bit++) {
209 struct buffer_head *gdb;
210
211 ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
212
213 gdb = sb_getblk(sb, block);
214 if (!gdb) {
215 err = -EIO;
216 goto exit_bh;
217 }
218 if ((err = ext4_journal_get_write_access(handle, gdb))) {
219 brelse(gdb);
220 goto exit_bh;
221 }
222		lock_buffer(gdb);
223		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
224		set_buffer_uptodate(gdb);
225		unlock_buffer(gdb);
226 ext4_journal_dirty_metadata(handle, gdb);
227 ext4_set_bit(bit, bh->b_data);
228 brelse(gdb);
229 }
230
231 /* Zero out all of the reserved backup group descriptor table blocks */
232 for (i = 0, bit = gdblocks + 1, block = start + bit;
233 i < reserved_gdb; i++, block++, bit++) {
234 struct buffer_head *gdb;
235
236 ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
237
238 if (IS_ERR(gdb = bclean(handle, sb, block))) {
239			err = PTR_ERR(gdb);
240 goto exit_bh;
241 }
242 ext4_journal_dirty_metadata(handle, gdb);
243 ext4_set_bit(bit, bh->b_data);
244 brelse(gdb);
245 }
246 ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
247 input->block_bitmap - start);
248 ext4_set_bit(input->block_bitmap - start, bh->b_data);
249 ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
250 input->inode_bitmap - start);
251 ext4_set_bit(input->inode_bitmap - start, bh->b_data);
252
253 /* Zero out all of the inode table blocks */
254 for (i = 0, block = input->inode_table, bit = block - start;
255 i < sbi->s_itb_per_group; i++, bit++, block++) {
256 struct buffer_head *it;
257
258 ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
259 if (IS_ERR(it = bclean(handle, sb, block))) {
260 err = PTR_ERR(it);
261 goto exit_bh;
262 }
263 ext4_journal_dirty_metadata(handle, it);
264 brelse(it);
265 ext4_set_bit(bit, bh->b_data);
266 }
267 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
268 bh->b_data);
269 ext4_journal_dirty_metadata(handle, bh);
270 brelse(bh);
271
272 /* Mark unused entries in inode bitmap used */
273 ext4_debug("clear inode bitmap %#04x (+%ld)\n",
274 input->inode_bitmap, input->inode_bitmap - start);
275 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
276 err = PTR_ERR(bh);
277 goto exit_journal;
278 }
279
280 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
281 bh->b_data);
282 ext4_journal_dirty_metadata(handle, bh);
283exit_bh:
284 brelse(bh);
285
286exit_journal:
287 unlock_super(sb);
288 if ((err2 = ext4_journal_stop(handle)) && !err)
289 err = err2;
290
291 return err;
292}
293
294
295/*
296 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
297 * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before
298 * calling this for the first time. In a sparse filesystem it will be the
299 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
300 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
301 */
302static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
303 unsigned *five, unsigned *seven)
304{
305 unsigned *min = three;
306 int mult = 3;
307 unsigned ret;
308
309 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
310 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
311 ret = *min;
312 *min += 1;
313 return ret;
314 }
315
316 if (*five < *min) {
317 min = five;
318 mult = 5;
319 }
320 if (*seven < *min) {
321 min = seven;
322 mult = 7;
323 }
324
325 ret = *min;
326 *min *= mult;
327
328 return ret;
329}
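
ext4_list_backups() merges three streams of powers of 3, 5 and 7, always returning the smallest value not yet handed out; together with the rule checked in verify_reserved_gdb() below, that group N keeps its copy of primary block blk at N * EXT4_BLOCKS_PER_GROUP(sb) + blk, this is all the resize code needs to find every backup. A stand-alone driver for the walk, printing the sparse_super sequence; the group count is hypothetical:

/* Prints the sparse_super backup-group sequence 1, 3, 5, 7, 9, 25, 27, ... */
#include <stdio.h>

static unsigned list_backups(unsigned *three, unsigned *five, unsigned *seven)
{
	unsigned *min = three, mult = 3, ret;

	if (*five < *min)  { min = five;  mult = 5; }
	if (*seven < *min) { min = seven; mult = 7; }
	ret = *min;
	*min *= mult;
	return ret;
}

int main(void)
{
	unsigned three = 1, five = 5, seven = 7, grp;
	unsigned ngroups = 100;			/* hypothetical fs size */

	while ((grp = list_backups(&three, &five, &seven)) < ngroups)
		printf("%u ", grp);		/* 1 3 5 7 9 25 27 49 81 */
	printf("\n");
	return 0;
}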
330
331/*
332 * Check that all of the backup GDT blocks are held in the primary GDT block.
333 * It is assumed that they are stored in group order. Returns the number of
334 * groups in the current filesystem that have BACKUPS, or a negative error code.
335 */
336static int verify_reserved_gdb(struct super_block *sb,
337 struct buffer_head *primary)
338{
339 const ext4_fsblk_t blk = primary->b_blocknr;
340 const unsigned long end = EXT4_SB(sb)->s_groups_count;
341 unsigned three = 1;
342 unsigned five = 5;
343 unsigned seven = 7;
344 unsigned grp;
345 __le32 *p = (__le32 *)primary->b_data;
346 int gdbackups = 0;
347
348 while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
349 if (le32_to_cpu(*p++) !=
350 grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
351 ext4_warning(sb, __FUNCTION__,
352 "reserved GDT %llu"
353 " missing grp %d (%llu)",
354 blk, grp,
355 grp *
356 (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
357 blk);
358 return -EINVAL;
359 }
360 if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
361 return -EFBIG;
362 }
363
364 return gdbackups;
365}
366
367/*
368 * Called when we need to bring a reserved group descriptor table block into
369 * use from the resize inode. The primary copy of the new GDT block currently
370 * is an indirect block (under the double indirect block in the resize inode).
371 * The new backup GDT blocks will be stored as leaf blocks in this indirect
372 * block, in group order. Even though we know all the block numbers we need,
373 * we check to ensure that the resize inode has actually reserved these blocks.
374 *
375 * Don't need to update the block bitmaps because the blocks are still in use.
376 *
377 * We get all of the error cases out of the way, so that we are sure to not
378 * fail once we start modifying the data on disk, because JBD has no rollback.
379 */
380static int add_new_gdb(handle_t *handle, struct inode *inode,
381 struct ext4_new_group_data *input,
382 struct buffer_head **primary)
383{
384 struct super_block *sb = inode->i_sb;
385 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
386 unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
387 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
388 struct buffer_head **o_group_desc, **n_group_desc;
389 struct buffer_head *dind;
390 int gdbackups;
391 struct ext4_iloc iloc;
392 __le32 *data;
393 int err;
394
395 if (test_opt(sb, DEBUG))
396 printk(KERN_DEBUG
397 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
398 gdb_num);
399
400 /*
401 * If we are not using the primary superblock/GDT copy don't resize,
402 * because the user tools have no way of handling this. Probably a
403 * bad time to do it anyway.
404 */
405 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
406 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
407 ext4_warning(sb, __FUNCTION__,
408 "won't resize using backup superblock at %llu",
409 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
410 return -EPERM;
411 }
412
413 *primary = sb_bread(sb, gdblock);
414 if (!*primary)
415 return -EIO;
416
417 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
418 err = gdbackups;
419 goto exit_bh;
420 }
421
422 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
423 dind = sb_bread(sb, le32_to_cpu(*data));
424 if (!dind) {
425 err = -EIO;
426 goto exit_bh;
427 }
428
429 data = (__le32 *)dind->b_data;
430 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
431 ext4_warning(sb, __FUNCTION__,
432 "new group %u GDT block %llu not reserved",
433 input->group, gdblock);
434 err = -EINVAL;
435 goto exit_dind;
436 }
437
438 if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
439 goto exit_dind;
440
441 if ((err = ext4_journal_get_write_access(handle, *primary)))
442 goto exit_sbh;
443
444 if ((err = ext4_journal_get_write_access(handle, dind)))
445 goto exit_primary;
446
447 /* ext4_reserve_inode_write() gets a reference on the iloc */
448 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
449 goto exit_dindj;
450
451 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
452 GFP_KERNEL);
453 if (!n_group_desc) {
454 err = -ENOMEM;
455 ext4_warning (sb, __FUNCTION__,
456 "not enough memory for %lu groups", gdb_num + 1);
457 goto exit_inode;
458 }
459
460 /*
461 * Finally, we have all of the possible failures behind us...
462 *
463 * Remove new GDT block from inode double-indirect block and clear out
464 * the new GDT block for use (which also "frees" the backup GDT blocks
465 * from the reserved inode). We don't need to change the bitmaps for
466 * these blocks, because they are marked as in-use from being in the
467 * reserved inode, and will become GDT blocks (primary and backup).
468 */
469 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
470 ext4_journal_dirty_metadata(handle, dind);
471 brelse(dind);
472 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
473 ext4_mark_iloc_dirty(handle, inode, &iloc);
474 memset((*primary)->b_data, 0, sb->s_blocksize);
475 ext4_journal_dirty_metadata(handle, *primary);
476
477 o_group_desc = EXT4_SB(sb)->s_group_desc;
478 memcpy(n_group_desc, o_group_desc,
479 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
480 n_group_desc[gdb_num] = *primary;
481 EXT4_SB(sb)->s_group_desc = n_group_desc;
482 EXT4_SB(sb)->s_gdb_count++;
483 kfree(o_group_desc);
484
485 es->s_reserved_gdt_blocks =
486 cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
487 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
488
489 return 0;
490
491exit_inode:
492 //ext4_journal_release_buffer(handle, iloc.bh);
493 brelse(iloc.bh);
494exit_dindj:
495 //ext4_journal_release_buffer(handle, dind);
496exit_primary:
497 //ext4_journal_release_buffer(handle, *primary);
498exit_sbh:
499 //ext4_journal_release_buffer(handle, *primary);
500exit_dind:
501 brelse(dind);
502exit_bh:
503 brelse(*primary);
504
505 ext4_debug("leaving with error %d\n", err);
506 return err;
507}
508
509/*
510 * Called when we are adding a new group which has a backup copy of each of
511 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
512 * We need to add these reserved backup GDT blocks to the resize inode, so
513 * that they are kept for future resizing and not allocated to files.
514 *
515 * Each reserved backup GDT block will go into a different indirect block.
516 * The indirect blocks are actually the primary reserved GDT blocks,
517 * so we know in advance what their block numbers are. We only get the
518 * double-indirect block to verify it is pointing to the primary reserved
519 * GDT blocks so we don't overwrite a data block by accident. The reserved
520 * backup GDT blocks are stored in their reserved primary GDT block.
521 */
522static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
523 struct ext4_new_group_data *input)
524{
525 struct super_block *sb = inode->i_sb;
526	int reserved_gdb = le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
527 struct buffer_head **primary;
528 struct buffer_head *dind;
529 struct ext4_iloc iloc;
530 ext4_fsblk_t blk;
531 __le32 *data, *end;
532 int gdbackups = 0;
533 int res, i;
534 int err;
535
536 primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
537 if (!primary)
538 return -ENOMEM;
539
540 data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
541 dind = sb_bread(sb, le32_to_cpu(*data));
542 if (!dind) {
543 err = -EIO;
544 goto exit_free;
545 }
546
547 blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
548 data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
549 end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
550
551 /* Get each reserved primary GDT block and verify it holds backups */
552 for (res = 0; res < reserved_gdb; res++, blk++) {
553 if (le32_to_cpu(*data) != blk) {
554 ext4_warning(sb, __FUNCTION__,
555 "reserved block %llu"
556 " not at offset %ld",
557 blk,
558 (long)(data - (__le32 *)dind->b_data));
559 err = -EINVAL;
560 goto exit_bh;
561 }
562 primary[res] = sb_bread(sb, blk);
563 if (!primary[res]) {
564 err = -EIO;
565 goto exit_bh;
566 }
567 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
568 brelse(primary[res]);
569 err = gdbackups;
570 goto exit_bh;
571 }
572 if (++data >= end)
573 data = (__le32 *)dind->b_data;
574 }
575
576 for (i = 0; i < reserved_gdb; i++) {
577 if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
578 /*
579 int j;
580 for (j = 0; j < i; j++)
581 ext4_journal_release_buffer(handle, primary[j]);
582 */
583 goto exit_bh;
584 }
585 }
586
587 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
588 goto exit_bh;
589
590 /*
591 * Finally we can add each of the reserved backup GDT blocks from
592 * the new group to its reserved primary GDT block.
593 */
594 blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
595 for (i = 0; i < reserved_gdb; i++) {
596 int err2;
597 data = (__le32 *)primary[i]->b_data;
598 /* printk("reserving backup %lu[%u] = %lu\n",
599 primary[i]->b_blocknr, gdbackups,
600 blk + primary[i]->b_blocknr); */
601 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
602 err2 = ext4_journal_dirty_metadata(handle, primary[i]);
603 if (!err)
604 err = err2;
605 }
606 inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
607 ext4_mark_iloc_dirty(handle, inode, &iloc);
608
609exit_bh:
610 while (--res >= 0)
611 brelse(primary[res]);
612 brelse(dind);
613
614exit_free:
615 kfree(primary);
616
617 return err;
618}
619
620/*
621 * Update the backup copies of the ext4 metadata. These don't need to be part
622 * of the main resize transaction, because e2fsck will re-write them if there
623 * is a problem (basically only OOM will cause a problem). However, we
624 * _should_ update the backups if possible, in case the primary gets trashed
625 * for some reason and we need to run e2fsck from a backup superblock. The
626 * important part is that the new block and inode counts are in the backup
627 * superblocks, and the location of the new group metadata in the GDT backups.
628 *
629 * We do not need lock_super() for this, because these blocks are not
630 * otherwise touched by the filesystem code when it is mounted. We don't
631 * need to worry about last changing from sbi->s_groups_count, because the
632 * worst that can happen is that we do not copy the full number of backups
633 * at this time. The resize which changed s_groups_count will backup again.
634 */
635static void update_backups(struct super_block *sb,
636 int blk_off, char *data, int size)
637{
638 struct ext4_sb_info *sbi = EXT4_SB(sb);
639 const unsigned long last = sbi->s_groups_count;
640 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
641 unsigned three = 1;
642 unsigned five = 5;
643 unsigned seven = 7;
644 unsigned group;
645 int rest = sb->s_blocksize - size;
646 handle_t *handle;
647 int err = 0, err2;
648
649 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
650 if (IS_ERR(handle)) {
651 group = 1;
652 err = PTR_ERR(handle);
653 goto exit_err;
654 }
655
656 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
657 struct buffer_head *bh;
658
659 /* Out of journal space, and can't get more - abort - so sad */
660 if (handle->h_buffer_credits == 0 &&
661 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
662 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
663 break;
664
665 bh = sb_getblk(sb, group * bpg + blk_off);
666 if (!bh) {
667 err = -EIO;
668 break;
669 }
670 ext4_debug("update metadata backup %#04lx\n",
671 (unsigned long)bh->b_blocknr);
672 if ((err = ext4_journal_get_write_access(handle, bh)))
673 break;
674 lock_buffer(bh);
675 memcpy(bh->b_data, data, size);
676 if (rest)
677 memset(bh->b_data + size, 0, rest);
678 set_buffer_uptodate(bh);
679 unlock_buffer(bh);
680 ext4_journal_dirty_metadata(handle, bh);
681 brelse(bh);
682 }
683 if ((err2 = ext4_journal_stop(handle)) && !err)
684 err = err2;
685
686 /*
687 * Ugh! Need to have e2fsck write the backup copies. It is too
688 * late to revert the resize; we shouldn't fail just because of
689 * the backup copies (they are only needed in case of corruption).
690 *
691 * However, if we got here we have a journal problem too, so we
692 * can't really start a transaction to mark the superblock.
693 * Chicken out and just set the flag in the hope that it will be
694 * written to disk, and if not we will simply wait until the next fsck.
695 */
696exit_err:
697 if (err) {
698 ext4_warning(sb, __FUNCTION__,
699 "can't update backup for group %d (err %d), "
700 "forcing fsck on next reboot", group, err);
701 sbi->s_mount_state &= ~EXT4_VALID_FS;
702 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
703 mark_buffer_dirty(sbi->s_sbh);
704 }
705}
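The three/five/seven counters walked above (via ext4_list_backups()) give the classic sparse_super layout: besides group 0, backup superblocks and GDT copies live in group 1 and in every group whose number is a power of 3, 5 or 7. A minimal userspace sketch of that walk, for illustration only — next_backup_group() and the 100-group figure are assumptions, not part of this patch:

	#include <stdio.h>

	/* mirror of the three/five/seven walk used by update_backups() above */
	static unsigned long next_backup_group(unsigned long *three,
					       unsigned long *five,
					       unsigned long *seven)
	{
		unsigned long *min = three, mult = 3, group;

		if (*five < *min) {
			min = five;
			mult = 5;
		}
		if (*seven < *min) {
			min = seven;
			mult = 7;
		}
		group = *min;
		*min *= mult;
		return group;
	}

	int main(void)
	{
		unsigned long three = 1, five = 5, seven = 7, group;
		const unsigned long ngroups = 100;	/* hypothetical group count */

		/* prints: 1 3 5 7 9 25 27 49 81 */
		while ((group = next_backup_group(&three, &five, &seven)) < ngroups)
			printf("%lu ", group);
		printf("\n");
		return 0;
	}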
706
707/* Add group descriptor data to an existing or new group descriptor block.
708 * Ensure we handle all possible error conditions _before_ we start modifying
709 * the filesystem, because we cannot abort the transaction and not have it
710 * write the data to disk.
711 *
712 * If we are on a GDT block boundary, we need to get the reserved GDT block.
713 * Otherwise, we may need to add backup GDT blocks for a sparse group.
714 *
715 * We only need to hold the superblock lock while we are actually adding
716 * in the new group's counts to the superblock. Prior to that we have
717 * not really "added" the group at all. We re-check that we are still
718 * adding in the last group in case things have changed since verifying.
719 */
720int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
721{
722 struct ext4_sb_info *sbi = EXT4_SB(sb);
723 struct ext4_super_block *es = sbi->s_es;
724 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
725 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
726 struct buffer_head *primary = NULL;
727 struct ext4_group_desc *gdp;
728 struct inode *inode = NULL;
729 handle_t *handle;
730 int gdb_off, gdb_num;
731 int err, err2;
732
733 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
734 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
735
736 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
737 EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
738 ext4_warning(sb, __FUNCTION__,
739 "Can't resize non-sparse filesystem further");
740 return -EPERM;
741 }
742
743 if (ext4_blocks_count(es) + input->blocks_count <
744 ext4_blocks_count(es)) {
745 ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
746 return -EINVAL;
747 }
748
749 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
750 le32_to_cpu(es->s_inodes_count)) {
751 ext4_warning(sb, __FUNCTION__, "inodes_count overflow");
752 return -EINVAL;
753 }
754
755 if (reserved_gdb || gdb_off == 0) {
756 if (!EXT4_HAS_COMPAT_FEATURE(sb,
757 EXT4_FEATURE_COMPAT_RESIZE_INODE)){
758 ext4_warning(sb, __FUNCTION__,
759 "No reserved GDT blocks, can't resize");
760 return -EPERM;
761 }
762 inode = iget(sb, EXT4_RESIZE_INO);
763 if (!inode || is_bad_inode(inode)) {
764 ext4_warning(sb, __FUNCTION__,
765 "Error opening resize inode");
766 iput(inode);
767 return -ENOENT;
768 }
769 }
770
771 if ((err = verify_group_input(sb, input)))
772 goto exit_put;
773
774 if ((err = setup_new_group_blocks(sb, input)))
775 goto exit_put;
776
777 /*
778 * We will always be modifying at least the superblock and a GDT
779 * block. If we are adding a group past the last current GDT block,
780 * we will also modify the inode and the dindirect block. If we
781 * are adding a group with superblock/GDT backups we will also
782 * modify each of the reserved GDT dindirect blocks.
783 */
784 handle = ext4_journal_start_sb(sb,
785 ext4_bg_has_super(sb, input->group) ?
786 3 + reserved_gdb : 4);
787 if (IS_ERR(handle)) {
788 err = PTR_ERR(handle);
789 goto exit_put;
790 }
791
792 lock_super(sb);
793 if (input->group != sbi->s_groups_count) {
794 ext4_warning(sb, __FUNCTION__,
795 "multiple resizers run on filesystem!");
796 err = -EBUSY;
797 goto exit_journal;
798 }
799
800 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
801 goto exit_journal;
802
803 /*
804 * We will only either add reserved group blocks to a backup group
805 * or remove reserved blocks for the first group in a new group block.
806 * Doing both would mean more complex code, and sane people don't
807 * use non-sparse filesystems anymore. This is already checked above.
808 */
809 if (gdb_off) {
810 primary = sbi->s_group_desc[gdb_num];
811 if ((err = ext4_journal_get_write_access(handle, primary)))
812 goto exit_journal;
813
814 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
815 (err = reserve_backup_gdb(handle, inode, input)))
816 goto exit_journal;
817 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
818 goto exit_journal;
819
820 /*
821 * OK, now we've set up the new group. Time to make it active.
822 *
823 * Current kernels don't lock all allocations via lock_super(),
824 * so we have to be safe wrt. concurrent accesses to the group
825 * data. So we need to be careful to set all of the relevant
826 * group descriptor data etc. *before* we enable the group.
827 *
828 * The key field here is sbi->s_groups_count: as long as
829 * that retains its old value, nobody is going to access the new
830 * group.
831 *
832 * So first we update all the descriptor metadata for the new
833 * group; then we update the total disk blocks count; then we
834 * update the groups count to enable the group; then finally we
835 * update the free space counts so that the system can start
836 * using the new disk blocks.
837 */
838
839 /* Update group descriptor block for new group */
840 gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
841
842 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
843 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
844 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
845 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
846 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
847
848 /*
849 * Make the new blocks and inodes valid next. We do this before
850 * increasing the group count so that once the group is enabled,
851 * all of its blocks and inodes are already valid.
852 *
853 * We always allocate group-by-group, then block-by-block or
854 * inode-by-inode within a group, so enabling these
855 * blocks/inodes before the group is live won't actually let us
856 * allocate the new space yet.
857 */
858 ext4_blocks_count_set(es, ext4_blocks_count(es) +
859 input->blocks_count);
860 es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
861 EXT4_INODES_PER_GROUP(sb));
862
863 /*
864 * We need to protect s_groups_count against other CPUs seeing
865 * inconsistent state in the superblock.
866 *
867 * The precise rules we use are:
868 *
869 * * Writers of s_groups_count *must* hold lock_super
870 * AND
871 * * Writers must perform a smp_wmb() after updating all dependent
872 * data and before modifying the groups count
873 *
874 * * Readers must hold lock_super() over the access
875 * OR
876 * * Readers must perform an smp_rmb() after reading the groups count
877 * and before reading any dependent data.
878 *
879 * NB. These rules can be relaxed when checking the group count
880 * while freeing data, as we can only allocate from a block
881 * group after serialising against the group count, and we can
882 * only then free after serialising in turn against that
883 * allocation.
884 */
885 smp_wmb();
886
887 /* Update the global fs size fields */
888 sbi->s_groups_count++;
889
890 ext4_journal_dirty_metadata(handle, primary);
891
892 /* Update the reserved block counts only once the new group is
893 * active. */
894 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
895 input->reserved_blocks);
896
897 /* Update the free space counts */
898 percpu_counter_mod(&sbi->s_freeblocks_counter,
899 input->free_blocks_count);
900 percpu_counter_mod(&sbi->s_freeinodes_counter,
901 EXT4_INODES_PER_GROUP(sb));
902
903 ext4_journal_dirty_metadata(handle, sbi->s_sbh);
904 sb->s_dirt = 1;
905
906exit_journal:
907 unlock_super(sb);
908 if ((err2 = ext4_journal_stop(handle)) && !err)
909 err = err2;
910 if (!err) {
911 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
912 sizeof(struct ext4_super_block));
913 update_backups(sb, primary->b_blocknr, primary->b_data,
914 primary->b_size);
915 }
916exit_put:
917 iput(inode);
918 return err;
919} /* ext4_group_add */
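The smp_wmb()/s_groups_count ordering described above has a matching reader side. Below is a minimal sketch of a lock-free reader following the stated rule; sample_get_desc() is a made-up name, and ext4_get_group_desc() with its (sb, group, &bh) signature is assumed from the rest of ext4 rather than shown in this hunk:

	/* Sketch only: read the group count, then order that read before any
	 * dependent descriptor data; pairs with the smp_wmb() in
	 * ext4_group_add() above. */
	static struct ext4_group_desc *sample_get_desc(struct super_block *sb,
						       unsigned long group)
	{
		unsigned long ngroups = EXT4_SB(sb)->s_groups_count;

		if (group >= ngroups)
			return NULL;
		smp_rmb();
		return ext4_get_group_desc(sb, group, NULL);
	}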
920
921/* Extend the filesystem to the new number of blocks specified. This entry
922 * point is only used to extend the current filesystem to the end of the last
923 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
924 * for emergencies (because it has no dependencies on reserved blocks).
925 *
926 * If we _really_ wanted, we could use default values to call ext4_group_add()
927 * and allow the "remount" trick to work for arbitrary resizing, assuming enough
928 * GDT blocks are reserved to grow to the desired size.
929 */
930int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
931 ext4_fsblk_t n_blocks_count)
932{
933 ext4_fsblk_t o_blocks_count;
934 unsigned long o_groups_count;
935 ext4_grpblk_t last;
936 ext4_grpblk_t add;
937 struct buffer_head * bh;
938 handle_t *handle;
939 int err;
940 unsigned long freed_blocks;
941
942 /* We don't need to worry about locking wrt other resizers just
943 * yet: we're going to revalidate es->s_blocks_count after
944 * taking lock_super() below. */
945 o_blocks_count = ext4_blocks_count(es);
946 o_groups_count = EXT4_SB(sb)->s_groups_count;
947
948 if (test_opt(sb, DEBUG))
949 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
950 o_blocks_count, n_blocks_count);
951
952 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
953 return 0;
954
955 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
956 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
957 " too large to resize to %llu blocks safely\n",
958 sb->s_id, n_blocks_count);
959 if (sizeof(sector_t) < 8)
960 ext4_warning(sb, __FUNCTION__,
961 "CONFIG_LBD not enabled");
962 return -EINVAL;
963 }
964
965 if (n_blocks_count < o_blocks_count) {
966 ext4_warning(sb, __FUNCTION__,
967 "can't shrink FS - resize aborted");
968 return -EBUSY;
969 }
970
971 /* Handle the remaining blocks in the last group only. */
972 ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
973
974 if (last == 0) {
975 ext4_warning(sb, __FUNCTION__,
976 "need to use ext2online to resize further");
977 return -EPERM;
978 }
979
980 add = EXT4_BLOCKS_PER_GROUP(sb) - last;
981
982 if (o_blocks_count + add < o_blocks_count) {
983 ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
984 return -EINVAL;
985 }
986
987 if (o_blocks_count + add > n_blocks_count)
988 add = n_blocks_count - o_blocks_count;
989
990 if (o_blocks_count + add < n_blocks_count)
991 ext4_warning(sb, __FUNCTION__,
992 "will only finish group (%llu"
993 " blocks, %u new)",
994 o_blocks_count + add, add);
995
996 /* See if the device is actually as big as what was requested */
997 bh = sb_bread(sb, o_blocks_count + add -1);
998 if (!bh) {
999 ext4_warning(sb, __FUNCTION__,
1000 "can't read last block, resize aborted");
1001 return -ENOSPC;
1002 }
1003 brelse(bh);
1004
1005 /* We will update the superblock, one block bitmap, and
1006 * one group descriptor via ext4_free_blocks().
1007 */
1008 handle = ext4_journal_start_sb(sb, 3);
1009 if (IS_ERR(handle)) {
1010 err = PTR_ERR(handle);
1011 ext4_warning(sb, __FUNCTION__, "error %d on journal start",err);
1012 goto exit_put;
1013 }
1014
1015 lock_super(sb);
1016 if (o_blocks_count != ext4_blocks_count(es)) {
1017 ext4_warning(sb, __FUNCTION__,
1018 "multiple resizers run on filesystem!");
1019 unlock_super(sb);
1020 err = -EBUSY;
1021 goto exit_put;
1022 }
1023
1024 if ((err = ext4_journal_get_write_access(handle,
1025 EXT4_SB(sb)->s_sbh))) {
1026 ext4_warning(sb, __FUNCTION__,
1027 "error %d on journal write access", err);
1028 unlock_super(sb);
1029 ext4_journal_stop(handle);
1030 goto exit_put;
1031 }
1032 ext4_blocks_count_set(es, o_blocks_count + add);
1033 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
1034 sb->s_dirt = 1;
1035 unlock_super(sb);
1036 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1037 o_blocks_count + add);
1038 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1039 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1040 o_blocks_count + add);
1041 if ((err = ext4_journal_stop(handle)))
1042 goto exit_put;
1043 if (test_opt(sb, DEBUG))
1044 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1045 ext4_blocks_count(es));
1046 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
1047 sizeof(struct ext4_super_block));
1048exit_put:
1049 return err;
1050} /* ext4_group_extend */
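As a worked example of the arithmetic above (numbers invented for illustration, s_first_data_block ignored): with 4 KiB blocks EXT4_BLOCKS_PER_GROUP(sb) is 32768, so if the old size ends 1000 blocks into the last group,

	last = o_blocks_count % EXT4_BLOCKS_PER_GROUP(sb) = 1000
	add  = 32768 - 1000                               = 31768
	add  = min(add, n_blocks_count - o_blocks_count)     /* clamp to the request */

and those blocks are freed into the existing last group; growing past the group boundary has to go through ext4_group_add() instead.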
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 000000000000..b4b022aa2bc2
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,2829 @@
1/*
2 * linux/fs/ext4/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/fs.h>
22#include <linux/time.h>
23#include <linux/jbd2.h>
24#include <linux/ext4_fs.h>
25#include <linux/ext4_jbd2.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/blkdev.h>
29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h>
32#include <linux/vfs.h>
33#include <linux/random.h>
34#include <linux/mount.h>
35#include <linux/namei.h>
36#include <linux/quotaops.h>
37#include <linux/seq_file.h>
38
39#include <asm/uaccess.h>
40
41#include "xattr.h"
42#include "acl.h"
43#include "namei.h"
44
45static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
46 unsigned long journal_devnum);
47static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
48 unsigned int);
49static void ext4_commit_super (struct super_block * sb,
50 struct ext4_super_block * es,
51 int sync);
52static void ext4_mark_recovery_complete(struct super_block * sb,
53 struct ext4_super_block * es);
54static void ext4_clear_journal_err(struct super_block * sb,
55 struct ext4_super_block * es);
56static int ext4_sync_fs(struct super_block *sb, int wait);
57static const char *ext4_decode_error(struct super_block * sb, int errno,
58 char nbuf[16]);
59static int ext4_remount (struct super_block * sb, int * flags, char * data);
60static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
61static void ext4_unlockfs(struct super_block *sb);
62static void ext4_write_super (struct super_block * sb);
63static void ext4_write_super_lockfs(struct super_block *sb);
64
65
66ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
67 struct ext4_group_desc *bg)
68{
69 return le32_to_cpu(bg->bg_block_bitmap) |
70 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
71 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
72}
73
74ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
75 struct ext4_group_desc *bg)
76{
77 return le32_to_cpu(bg->bg_inode_bitmap) |
78 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
79 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
80}
81
82ext4_fsblk_t ext4_inode_table(struct super_block *sb,
83 struct ext4_group_desc *bg)
84{
85 return le32_to_cpu(bg->bg_inode_table) |
86 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
87 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
88}
89
90void ext4_block_bitmap_set(struct super_block *sb,
91 struct ext4_group_desc *bg, ext4_fsblk_t blk)
92{
93 bg->bg_block_bitmap = cpu_to_le32((u32)blk);
94 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
95 bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
96}
97
98void ext4_inode_bitmap_set(struct super_block *sb,
99 struct ext4_group_desc *bg, ext4_fsblk_t blk)
100{
101 bg->bg_inode_bitmap = cpu_to_le32((u32)blk);
102 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
103 bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
104}
105
106void ext4_inode_table_set(struct super_block *sb,
107 struct ext4_group_desc *bg, ext4_fsblk_t blk)
108{
109 bg->bg_inode_table = cpu_to_le32((u32)blk);
110 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
111 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
112}
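These setters split a 64-bit block number between the traditional 32-bit descriptor field and a *_hi field that is only written when the group descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT. A worked example (the block number is made up):

	blk                = 0x0000000123456789
	bg_block_bitmap    = cpu_to_le32(0x23456789)	/* low 32 bits */
	bg_block_bitmap_hi = cpu_to_le32(0x00000001)	/* high 32 bits, 64-bit
							   descriptors only */

With the old 32-byte descriptors the high half is neither stored nor read back, which is safe only while such filesystems stay below 2^32 blocks.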
113
114/*
115 * Wrappers for jbd2_journal_start/end.
116 *
117 * The only special thing we need to do here is to make sure that all
118 * journal_end calls result in the superblock being marked dirty, so
119 * that sync() will call the filesystem's write_super callback if
120 * appropriate.
121 */
122handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
123{
124 journal_t *journal;
125
126 if (sb->s_flags & MS_RDONLY)
127 return ERR_PTR(-EROFS);
128
129 /* Special case here: if the journal has aborted behind our
130 * backs (eg. EIO in the commit thread), then we still need to
131 * take the FS itself readonly cleanly. */
132 journal = EXT4_SB(sb)->s_journal;
133 if (is_journal_aborted(journal)) {
134 ext4_abort(sb, __FUNCTION__,
135 "Detected aborted journal");
136 return ERR_PTR(-EROFS);
137 }
138
139 return jbd2_journal_start(journal, nblocks);
140}
141
142/*
143 * The only special thing we need to do here is to make sure that all
144 * jbd2_journal_stop calls result in the superblock being marked dirty, so
145 * that sync() will call the filesystem's write_super callback if
146 * appropriate.
147 */
148int __ext4_journal_stop(const char *where, handle_t *handle)
149{
150 struct super_block *sb;
151 int err;
152 int rc;
153
154 sb = handle->h_transaction->t_journal->j_private;
155 err = handle->h_err;
156 rc = jbd2_journal_stop(handle);
157
158 if (!err)
159 err = rc;
160 if (err)
161 __ext4_std_error(sb, where, err);
162 return err;
163}
164
165void ext4_journal_abort_handle(const char *caller, const char *err_fn,
166 struct buffer_head *bh, handle_t *handle, int err)
167{
168 char nbuf[16];
169 const char *errstr = ext4_decode_error(NULL, err, nbuf);
170
171 if (bh)
172 BUFFER_TRACE(bh, "abort");
173
174 if (!handle->h_err)
175 handle->h_err = err;
176
177 if (is_handle_aborted(handle))
178 return;
179
180 printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
181 caller, errstr, err_fn);
182
183 jbd2_journal_abort_handle(handle);
184}
185
186/* Deal with the reporting of failure conditions on a filesystem such as
187 * inconsistencies detected or read IO failures.
188 *
189 * On ext2, we can store the error state of the filesystem in the
190 * superblock. That is not possible on ext4, because we may have other
191 * write ordering constraints on the superblock which prevent us from
192 * writing it out straight away; and given that the journal is about to
193 * be aborted, we can't rely on the current, or future, transactions to
194 * write out the superblock safely.
195 *
196 * We'll just use the jbd2_journal_abort() error code to record an error in
197 * the journal instead.  On recovery, the journal will complain about
198 * that error until we've noted it down and cleared it.
199 */
200
201static void ext4_handle_error(struct super_block *sb)
202{
203 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
204
205 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
206 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
207
208 if (sb->s_flags & MS_RDONLY)
209 return;
210
211 if (!test_opt (sb, ERRORS_CONT)) {
212 journal_t *journal = EXT4_SB(sb)->s_journal;
213
214 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
215 if (journal)
216 jbd2_journal_abort(journal, -EIO);
217 }
218 if (test_opt (sb, ERRORS_RO)) {
219 printk (KERN_CRIT "Remounting filesystem read-only\n");
220 sb->s_flags |= MS_RDONLY;
221 }
222 ext4_commit_super(sb, es, 1);
223 if (test_opt(sb, ERRORS_PANIC))
224 panic("EXT4-fs (device %s): panic forced after error\n",
225 sb->s_id);
226}
227
228void ext4_error (struct super_block * sb, const char * function,
229 const char * fmt, ...)
230{
231 va_list args;
232
233 va_start(args, fmt);
234 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
235 vprintk(fmt, args);
236 printk("\n");
237 va_end(args);
238
239 ext4_handle_error(sb);
240}
241
242static const char *ext4_decode_error(struct super_block * sb, int errno,
243 char nbuf[16])
244{
245 char *errstr = NULL;
246
247 switch (errno) {
248 case -EIO:
249 errstr = "IO failure";
250 break;
251 case -ENOMEM:
252 errstr = "Out of memory";
253 break;
254 case -EROFS:
255 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
256 errstr = "Journal has aborted";
257 else
258 errstr = "Readonly filesystem";
259 break;
260 default:
261 /* If the caller passed in an extra buffer for unknown
262 * errors, textualise them now. Else we just return
263 * NULL. */
264 if (nbuf) {
265 /* Check for truncated error codes... */
266 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
267 errstr = nbuf;
268 }
269 break;
270 }
271
272 return errstr;
273}
274
275/* __ext4_std_error decodes expected errors from journaling functions
276 * automatically and invokes the appropriate error response. */
277
278void __ext4_std_error (struct super_block * sb, const char * function,
279 int errno)
280{
281 char nbuf[16];
282 const char *errstr;
283
284 /* Special case: if the error is EROFS, and we're not already
285 * inside a transaction, then there's really no point in logging
286 * an error. */
287 if (errno == -EROFS && journal_current_handle() == NULL &&
288 (sb->s_flags & MS_RDONLY))
289 return;
290
291 errstr = ext4_decode_error(sb, errno, nbuf);
292 printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
293 sb->s_id, function, errstr);
294
295 ext4_handle_error(sb);
296}
297
298/*
299 * ext4_abort is a much stronger failure handler than ext4_error. The
300 * abort function may be used to deal with unrecoverable failures such
301 * as journal IO errors or ENOMEM at a critical moment in log management.
302 *
303 * We unconditionally force the filesystem into an ABORT|READONLY state,
304 * unless the error response on the fs has been set to panic in which
305 * case we take the easy way out and panic immediately.
306 */
307
308void ext4_abort (struct super_block * sb, const char * function,
309 const char * fmt, ...)
310{
311 va_list args;
312
313 printk (KERN_CRIT "ext4_abort called.\n");
314
315 va_start(args, fmt);
316 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
317 vprintk(fmt, args);
318 printk("\n");
319 va_end(args);
320
321 if (test_opt(sb, ERRORS_PANIC))
322 panic("EXT4-fs panic from previous error\n");
323
324 if (sb->s_flags & MS_RDONLY)
325 return;
326
327 printk(KERN_CRIT "Remounting filesystem read-only\n");
328 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
329 sb->s_flags |= MS_RDONLY;
330 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
331 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
332}
333
334void ext4_warning (struct super_block * sb, const char * function,
335 const char * fmt, ...)
336{
337 va_list args;
338
339 va_start(args, fmt);
340 printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
341 sb->s_id, function);
342 vprintk(fmt, args);
343 printk("\n");
344 va_end(args);
345}
346
347void ext4_update_dynamic_rev(struct super_block *sb)
348{
349 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
350
351 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
352 return;
353
354 ext4_warning(sb, __FUNCTION__,
355 "updating to rev %d because of new feature flag, "
356 "running e2fsck is recommended",
357 EXT4_DYNAMIC_REV);
358
359 es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
360 es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
361 es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
362 /* leave es->s_feature_*compat flags alone */
363 /* es->s_uuid will be set by e2fsck if empty */
364
365 /*
366 * The rest of the superblock fields should be zero, and if not it
367 * means they are likely already in use, so leave them alone. We
368 * can leave it up to e2fsck to clean up any inconsistencies there.
369 */
370}
371
372/*
373 * Open the external journal device
374 */
375static struct block_device *ext4_blkdev_get(dev_t dev)
376{
377 struct block_device *bdev;
378 char b[BDEVNAME_SIZE];
379
380 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
381 if (IS_ERR(bdev))
382 goto fail;
383 return bdev;
384
385fail:
386 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
387 __bdevname(dev, b), PTR_ERR(bdev));
388 return NULL;
389}
390
391/*
392 * Release the journal device
393 */
394static int ext4_blkdev_put(struct block_device *bdev)
395{
396 bd_release(bdev);
397 return blkdev_put(bdev);
398}
399
400static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
401{
402 struct block_device *bdev;
403 int ret = -ENODEV;
404
405 bdev = sbi->journal_bdev;
406 if (bdev) {
407 ret = ext4_blkdev_put(bdev);
408 sbi->journal_bdev = NULL;
409 }
410 return ret;
411}
412
413static inline struct inode *orphan_list_entry(struct list_head *l)
414{
415 return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
416}
417
418static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
419{
420 struct list_head *l;
421
422 printk(KERN_ERR "sb orphan head is %d\n",
423 le32_to_cpu(sbi->s_es->s_last_orphan));
424
425 printk(KERN_ERR "sb_info orphan list:\n");
426 list_for_each(l, &sbi->s_orphan) {
427 struct inode *inode = orphan_list_entry(l);
428 printk(KERN_ERR " "
429 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
430 inode->i_sb->s_id, inode->i_ino, inode,
431 inode->i_mode, inode->i_nlink,
432 NEXT_ORPHAN(inode));
433 }
434}
435
436static void ext4_put_super (struct super_block * sb)
437{
438 struct ext4_sb_info *sbi = EXT4_SB(sb);
439 struct ext4_super_block *es = sbi->s_es;
440 int i;
441
442 ext4_ext_release(sb);
443 ext4_xattr_put_super(sb);
444 jbd2_journal_destroy(sbi->s_journal);
445 if (!(sb->s_flags & MS_RDONLY)) {
446 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
447 es->s_state = cpu_to_le16(sbi->s_mount_state);
448 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
449 mark_buffer_dirty(sbi->s_sbh);
450 ext4_commit_super(sb, es, 1);
451 }
452
453 for (i = 0; i < sbi->s_gdb_count; i++)
454 brelse(sbi->s_group_desc[i]);
455 kfree(sbi->s_group_desc);
456 percpu_counter_destroy(&sbi->s_freeblocks_counter);
457 percpu_counter_destroy(&sbi->s_freeinodes_counter);
458 percpu_counter_destroy(&sbi->s_dirs_counter);
459 brelse(sbi->s_sbh);
460#ifdef CONFIG_QUOTA
461 for (i = 0; i < MAXQUOTAS; i++)
462 kfree(sbi->s_qf_names[i]);
463#endif
464
465 /* Debugging code just in case the in-memory inode orphan list
466 * isn't empty. The on-disk one can be non-empty if we've
467 * detected an error and taken the fs readonly, but the
468 * in-memory list had better be clean by this point. */
469 if (!list_empty(&sbi->s_orphan))
470 dump_orphan_list(sb, sbi);
471 J_ASSERT(list_empty(&sbi->s_orphan));
472
473 invalidate_bdev(sb->s_bdev, 0);
474 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
475 /*
476 * Invalidate the journal device's buffers. We don't want them
477 * floating about in memory - the physical journal device may be
478 * hotswapped, and it breaks the `ro-after' testing code.
479 */
480 sync_blockdev(sbi->journal_bdev);
481 invalidate_bdev(sbi->journal_bdev, 0);
482 ext4_blkdev_remove(sbi);
483 }
484 sb->s_fs_info = NULL;
485 kfree(sbi);
486 return;
487}
488
489static kmem_cache_t *ext4_inode_cachep;
490
491/*
492 * Called inside transaction, so use GFP_NOFS
493 */
494static struct inode *ext4_alloc_inode(struct super_block *sb)
495{
496 struct ext4_inode_info *ei;
497
498 ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
499 if (!ei)
500 return NULL;
501#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
502 ei->i_acl = EXT4_ACL_NOT_CACHED;
503 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
504#endif
505 ei->i_block_alloc_info = NULL;
506 ei->vfs_inode.i_version = 1;
507 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
508 return &ei->vfs_inode;
509}
510
511static void ext4_destroy_inode(struct inode *inode)
512{
513 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
514}
515
516static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
517{
518 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
519
520 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
521 SLAB_CTOR_CONSTRUCTOR) {
522 INIT_LIST_HEAD(&ei->i_orphan);
523#ifdef CONFIG_EXT4DEV_FS_XATTR
524 init_rwsem(&ei->xattr_sem);
525#endif
526 mutex_init(&ei->truncate_mutex);
527 inode_init_once(&ei->vfs_inode);
528 }
529}
530
531static int init_inodecache(void)
532{
533 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
534 sizeof(struct ext4_inode_info),
535 0, (SLAB_RECLAIM_ACCOUNT|
536 SLAB_MEM_SPREAD),
537 init_once, NULL);
538 if (ext4_inode_cachep == NULL)
539 return -ENOMEM;
540 return 0;
541}
542
543static void destroy_inodecache(void)
544{
545 kmem_cache_destroy(ext4_inode_cachep);
546}
547
548static void ext4_clear_inode(struct inode *inode)
549{
550 struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
551#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
552 if (EXT4_I(inode)->i_acl &&
553 EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
554 posix_acl_release(EXT4_I(inode)->i_acl);
555 EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
556 }
557 if (EXT4_I(inode)->i_default_acl &&
558 EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
559 posix_acl_release(EXT4_I(inode)->i_default_acl);
560 EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
561 }
562#endif
563 ext4_discard_reservation(inode);
564 EXT4_I(inode)->i_block_alloc_info = NULL;
565 if (unlikely(rsv))
566 kfree(rsv);
567}
568
569static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
570{
571#if defined(CONFIG_QUOTA)
572 struct ext4_sb_info *sbi = EXT4_SB(sb);
573
574 if (sbi->s_jquota_fmt)
575 seq_printf(seq, ",jqfmt=%s",
576 (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
577
578 if (sbi->s_qf_names[USRQUOTA])
579 seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
580
581 if (sbi->s_qf_names[GRPQUOTA])
582 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
583
584 if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
585 seq_puts(seq, ",usrquota");
586
587 if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
588 seq_puts(seq, ",grpquota");
589#endif
590}
591
592static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
593{
594 struct super_block *sb = vfs->mnt_sb;
595
596 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
597 seq_puts(seq, ",data=journal");
598 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
599 seq_puts(seq, ",data=ordered");
600 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
601 seq_puts(seq, ",data=writeback");
602
603 ext4_show_quota_options(seq, sb);
604
605 return 0;
606}
607
608
609static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
610{
611 __u32 *objp = vobjp;
612 unsigned long ino = objp[0];
613 __u32 generation = objp[1];
614 struct inode *inode;
615 struct dentry *result;
616
617 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
618 return ERR_PTR(-ESTALE);
619 if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
620 return ERR_PTR(-ESTALE);
621
622 /* iget isn't really right if the inode is currently unallocated!!
623 *
624 * ext4_read_inode will return a bad_inode if the inode had been
625 * deleted, so we should be safe.
626 *
627 * Currently we don't know the generation for parent directory, so
628 * a generation of 0 means "accept any"
629 */
630 inode = iget(sb, ino);
631 if (inode == NULL)
632 return ERR_PTR(-ENOMEM);
633 if (is_bad_inode(inode) ||
634 (generation && inode->i_generation != generation)) {
635 iput(inode);
636 return ERR_PTR(-ESTALE);
637 }
638 /* now to find a dentry.
639 * If possible, get a well-connected one
640 */
641 result = d_alloc_anon(inode);
642 if (!result) {
643 iput(inode);
644 return ERR_PTR(-ENOMEM);
645 }
646 return result;
647}
648
649#ifdef CONFIG_QUOTA
650#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
651#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
652
653static int ext4_dquot_initialize(struct inode *inode, int type);
654static int ext4_dquot_drop(struct inode *inode);
655static int ext4_write_dquot(struct dquot *dquot);
656static int ext4_acquire_dquot(struct dquot *dquot);
657static int ext4_release_dquot(struct dquot *dquot);
658static int ext4_mark_dquot_dirty(struct dquot *dquot);
659static int ext4_write_info(struct super_block *sb, int type);
660static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path);
661static int ext4_quota_on_mount(struct super_block *sb, int type);
662static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
663 size_t len, loff_t off);
664static ssize_t ext4_quota_write(struct super_block *sb, int type,
665 const char *data, size_t len, loff_t off);
666
667static struct dquot_operations ext4_quota_operations = {
668 .initialize = ext4_dquot_initialize,
669 .drop = ext4_dquot_drop,
670 .alloc_space = dquot_alloc_space,
671 .alloc_inode = dquot_alloc_inode,
672 .free_space = dquot_free_space,
673 .free_inode = dquot_free_inode,
674 .transfer = dquot_transfer,
675 .write_dquot = ext4_write_dquot,
676 .acquire_dquot = ext4_acquire_dquot,
677 .release_dquot = ext4_release_dquot,
678 .mark_dirty = ext4_mark_dquot_dirty,
679 .write_info = ext4_write_info
680};
681
682static struct quotactl_ops ext4_qctl_operations = {
683 .quota_on = ext4_quota_on,
684 .quota_off = vfs_quota_off,
685 .quota_sync = vfs_quota_sync,
686 .get_info = vfs_get_dqinfo,
687 .set_info = vfs_set_dqinfo,
688 .get_dqblk = vfs_get_dqblk,
689 .set_dqblk = vfs_set_dqblk
690};
691#endif
692
693static struct super_operations ext4_sops = {
694 .alloc_inode = ext4_alloc_inode,
695 .destroy_inode = ext4_destroy_inode,
696 .read_inode = ext4_read_inode,
697 .write_inode = ext4_write_inode,
698 .dirty_inode = ext4_dirty_inode,
699 .delete_inode = ext4_delete_inode,
700 .put_super = ext4_put_super,
701 .write_super = ext4_write_super,
702 .sync_fs = ext4_sync_fs,
703 .write_super_lockfs = ext4_write_super_lockfs,
704 .unlockfs = ext4_unlockfs,
705 .statfs = ext4_statfs,
706 .remount_fs = ext4_remount,
707 .clear_inode = ext4_clear_inode,
708 .show_options = ext4_show_options,
709#ifdef CONFIG_QUOTA
710 .quota_read = ext4_quota_read,
711 .quota_write = ext4_quota_write,
712#endif
713};
714
715static struct export_operations ext4_export_ops = {
716 .get_parent = ext4_get_parent,
717 .get_dentry = ext4_get_dentry,
718};
719
720enum {
721 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
722 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
723 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
724 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
725 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
726 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
727 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
728 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
729 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
730 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
731 Opt_grpquota, Opt_extents,
732};
733
734static match_table_t tokens = {
735 {Opt_bsd_df, "bsddf"},
736 {Opt_minix_df, "minixdf"},
737 {Opt_grpid, "grpid"},
738 {Opt_grpid, "bsdgroups"},
739 {Opt_nogrpid, "nogrpid"},
740 {Opt_nogrpid, "sysvgroups"},
741 {Opt_resgid, "resgid=%u"},
742 {Opt_resuid, "resuid=%u"},
743 {Opt_sb, "sb=%u"},
744 {Opt_err_cont, "errors=continue"},
745 {Opt_err_panic, "errors=panic"},
746 {Opt_err_ro, "errors=remount-ro"},
747 {Opt_nouid32, "nouid32"},
748 {Opt_nocheck, "nocheck"},
749 {Opt_nocheck, "check=none"},
750 {Opt_debug, "debug"},
751 {Opt_oldalloc, "oldalloc"},
752 {Opt_orlov, "orlov"},
753 {Opt_user_xattr, "user_xattr"},
754 {Opt_nouser_xattr, "nouser_xattr"},
755 {Opt_acl, "acl"},
756 {Opt_noacl, "noacl"},
757 {Opt_reservation, "reservation"},
758 {Opt_noreservation, "noreservation"},
759 {Opt_noload, "noload"},
760 {Opt_nobh, "nobh"},
761 {Opt_bh, "bh"},
762 {Opt_commit, "commit=%u"},
763 {Opt_journal_update, "journal=update"},
764 {Opt_journal_inum, "journal=%u"},
765 {Opt_journal_dev, "journal_dev=%u"},
766 {Opt_abort, "abort"},
767 {Opt_data_journal, "data=journal"},
768 {Opt_data_ordered, "data=ordered"},
769 {Opt_data_writeback, "data=writeback"},
770 {Opt_offusrjquota, "usrjquota="},
771 {Opt_usrjquota, "usrjquota=%s"},
772 {Opt_offgrpjquota, "grpjquota="},
773 {Opt_grpjquota, "grpjquota=%s"},
774 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
775 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
776 {Opt_grpquota, "grpquota"},
777 {Opt_noquota, "noquota"},
778 {Opt_quota, "quota"},
779 {Opt_usrquota, "usrquota"},
780 {Opt_barrier, "barrier=%u"},
781 {Opt_extents, "extents"},
782 {Opt_err, NULL},
783 {Opt_resize, "resize"},
784};
785
786static ext4_fsblk_t get_sb_block(void **data)
787{
788 ext4_fsblk_t sb_block;
789 char *options = (char *) *data;
790
791 if (!options || strncmp(options, "sb=", 3) != 0)
792 return 1; /* Default location */
793 options += 3;
794 /*todo: use simple_strtoll with >32bit ext4 */
795 sb_block = simple_strtoul(options, &options, 0);
796 if (*options && *options != ',') {
797 printk("EXT4-fs: Invalid sb specification: %s\n",
798 (char *) *data);
799 return 1;
800 }
801 if (*options == ',')
802 options++;
803 *data = (void *) options;
804 return sb_block;
805}
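A usage illustration (device, mount point and block number are examples, not from this patch): since this helper only looks at the very start of the option string and the value is taken in 1 KiB units, a backup superblock is selected with something like

	mount -o sb=8193,errors=remount-ro /dev/sdXN /mnt	# 1 KiB-block filesystem

where 8193 is the first backup copy in group 1; the consumed "sb=8193," prefix is skipped and the remaining options are handed on to parse_options() below.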
806
807static int parse_options (char *options, struct super_block *sb,
808 unsigned int *inum, unsigned long *journal_devnum,
809 ext4_fsblk_t *n_blocks_count, int is_remount)
810{
811 struct ext4_sb_info *sbi = EXT4_SB(sb);
812 char * p;
813 substring_t args[MAX_OPT_ARGS];
814 int data_opt = 0;
815 int option;
816#ifdef CONFIG_QUOTA
817 int qtype;
818 char *qname;
819#endif
820
821 if (!options)
822 return 1;
823
824 while ((p = strsep (&options, ",")) != NULL) {
825 int token;
826 if (!*p)
827 continue;
828
829 token = match_token(p, tokens, args);
830 switch (token) {
831 case Opt_bsd_df:
832 clear_opt (sbi->s_mount_opt, MINIX_DF);
833 break;
834 case Opt_minix_df:
835 set_opt (sbi->s_mount_opt, MINIX_DF);
836 break;
837 case Opt_grpid:
838 set_opt (sbi->s_mount_opt, GRPID);
839 break;
840 case Opt_nogrpid:
841 clear_opt (sbi->s_mount_opt, GRPID);
842 break;
843 case Opt_resuid:
844 if (match_int(&args[0], &option))
845 return 0;
846 sbi->s_resuid = option;
847 break;
848 case Opt_resgid:
849 if (match_int(&args[0], &option))
850 return 0;
851 sbi->s_resgid = option;
852 break;
853 case Opt_sb:
854 /* handled by get_sb_block() instead of here */
855 /* *sb_block = match_int(&args[0]); */
856 break;
857 case Opt_err_panic:
858 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
859 clear_opt (sbi->s_mount_opt, ERRORS_RO);
860 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
861 break;
862 case Opt_err_ro:
863 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
864 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
865 set_opt (sbi->s_mount_opt, ERRORS_RO);
866 break;
867 case Opt_err_cont:
868 clear_opt (sbi->s_mount_opt, ERRORS_RO);
869 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
870 set_opt (sbi->s_mount_opt, ERRORS_CONT);
871 break;
872 case Opt_nouid32:
873 set_opt (sbi->s_mount_opt, NO_UID32);
874 break;
875 case Opt_nocheck:
876 clear_opt (sbi->s_mount_opt, CHECK);
877 break;
878 case Opt_debug:
879 set_opt (sbi->s_mount_opt, DEBUG);
880 break;
881 case Opt_oldalloc:
882 set_opt (sbi->s_mount_opt, OLDALLOC);
883 break;
884 case Opt_orlov:
885 clear_opt (sbi->s_mount_opt, OLDALLOC);
886 break;
887#ifdef CONFIG_EXT4DEV_FS_XATTR
888 case Opt_user_xattr:
889 set_opt (sbi->s_mount_opt, XATTR_USER);
890 break;
891 case Opt_nouser_xattr:
892 clear_opt (sbi->s_mount_opt, XATTR_USER);
893 break;
894#else
895 case Opt_user_xattr:
896 case Opt_nouser_xattr:
897 printk("EXT4 (no)user_xattr options not supported\n");
898 break;
899#endif
900#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
901 case Opt_acl:
902 set_opt(sbi->s_mount_opt, POSIX_ACL);
903 break;
904 case Opt_noacl:
905 clear_opt(sbi->s_mount_opt, POSIX_ACL);
906 break;
907#else
908 case Opt_acl:
909 case Opt_noacl:
910 printk("EXT4 (no)acl options not supported\n");
911 break;
912#endif
913 case Opt_reservation:
914 set_opt(sbi->s_mount_opt, RESERVATION);
915 break;
916 case Opt_noreservation:
917 clear_opt(sbi->s_mount_opt, RESERVATION);
918 break;
919 case Opt_journal_update:
920 /* @@@ FIXME */
921 /* Eventually we will want to be able to create
922 a journal file here. For now, only allow the
923 user to specify an existing inode to be the
924 journal file. */
925 if (is_remount) {
926 printk(KERN_ERR "EXT4-fs: cannot specify "
927 "journal on remount\n");
928 return 0;
929 }
930 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
931 break;
932 case Opt_journal_inum:
933 if (is_remount) {
934 printk(KERN_ERR "EXT4-fs: cannot specify "
935 "journal on remount\n");
936 return 0;
937 }
938 if (match_int(&args[0], &option))
939 return 0;
940 *inum = option;
941 break;
942 case Opt_journal_dev:
943 if (is_remount) {
944 printk(KERN_ERR "EXT4-fs: cannot specify "
945 "journal on remount\n");
946 return 0;
947 }
948 if (match_int(&args[0], &option))
949 return 0;
950 *journal_devnum = option;
951 break;
952 case Opt_noload:
953 set_opt (sbi->s_mount_opt, NOLOAD);
954 break;
955 case Opt_commit:
956 if (match_int(&args[0], &option))
957 return 0;
958 if (option < 0)
959 return 0;
960 if (option == 0)
961 option = JBD_DEFAULT_MAX_COMMIT_AGE;
962 sbi->s_commit_interval = HZ * option;
963 break;
964 case Opt_data_journal:
965 data_opt = EXT4_MOUNT_JOURNAL_DATA;
966 goto datacheck;
967 case Opt_data_ordered:
968 data_opt = EXT4_MOUNT_ORDERED_DATA;
969 goto datacheck;
970 case Opt_data_writeback:
971 data_opt = EXT4_MOUNT_WRITEBACK_DATA;
972 datacheck:
973 if (is_remount) {
974 if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
975 != data_opt) {
976 printk(KERN_ERR
977 "EXT4-fs: cannot change data "
978 "mode on remount\n");
979 return 0;
980 }
981 } else {
982 sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
983 sbi->s_mount_opt |= data_opt;
984 }
985 break;
986#ifdef CONFIG_QUOTA
987 case Opt_usrjquota:
988 qtype = USRQUOTA;
989 goto set_qf_name;
990 case Opt_grpjquota:
991 qtype = GRPQUOTA;
992set_qf_name:
993 if (sb_any_quota_enabled(sb)) {
994 printk(KERN_ERR
995 "EXT4-fs: Cannot change journalled "
996 "quota options when quota turned on.\n");
997 return 0;
998 }
999 qname = match_strdup(&args[0]);
1000 if (!qname) {
1001 printk(KERN_ERR
1002 "EXT4-fs: not enough memory for "
1003 "storing quotafile name.\n");
1004 return 0;
1005 }
1006 if (sbi->s_qf_names[qtype] &&
1007 strcmp(sbi->s_qf_names[qtype], qname)) {
1008 printk(KERN_ERR
1009 "EXT4-fs: %s quota file already "
1010 "specified.\n", QTYPE2NAME(qtype));
1011 kfree(qname);
1012 return 0;
1013 }
1014 sbi->s_qf_names[qtype] = qname;
1015 if (strchr(sbi->s_qf_names[qtype], '/')) {
1016 printk(KERN_ERR
1017 "EXT4-fs: quotafile must be on "
1018 "filesystem root.\n");
1019 kfree(sbi->s_qf_names[qtype]);
1020 sbi->s_qf_names[qtype] = NULL;
1021 return 0;
1022 }
1023 set_opt(sbi->s_mount_opt, QUOTA);
1024 break;
1025 case Opt_offusrjquota:
1026 qtype = USRQUOTA;
1027 goto clear_qf_name;
1028 case Opt_offgrpjquota:
1029 qtype = GRPQUOTA;
1030clear_qf_name:
1031 if (sb_any_quota_enabled(sb)) {
1032 printk(KERN_ERR "EXT4-fs: Cannot change "
1033 "journalled quota options when "
1034 "quota turned on.\n");
1035 return 0;
1036 }
1037 /*
1038 * The space will be released later when all options
1039 * are confirmed to be correct
1040 */
1041 sbi->s_qf_names[qtype] = NULL;
1042 break;
1043 case Opt_jqfmt_vfsold:
1044 sbi->s_jquota_fmt = QFMT_VFS_OLD;
1045 break;
1046 case Opt_jqfmt_vfsv0:
1047 sbi->s_jquota_fmt = QFMT_VFS_V0;
1048 break;
1049 case Opt_quota:
1050 case Opt_usrquota:
1051 set_opt(sbi->s_mount_opt, QUOTA);
1052 set_opt(sbi->s_mount_opt, USRQUOTA);
1053 break;
1054 case Opt_grpquota:
1055 set_opt(sbi->s_mount_opt, QUOTA);
1056 set_opt(sbi->s_mount_opt, GRPQUOTA);
1057 break;
1058 case Opt_noquota:
1059 if (sb_any_quota_enabled(sb)) {
1060 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1061 "options when quota turned on.\n");
1062 return 0;
1063 }
1064 clear_opt(sbi->s_mount_opt, QUOTA);
1065 clear_opt(sbi->s_mount_opt, USRQUOTA);
1066 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1067 break;
1068#else
1069 case Opt_quota:
1070 case Opt_usrquota:
1071 case Opt_grpquota:
1072 case Opt_usrjquota:
1073 case Opt_grpjquota:
1074 case Opt_offusrjquota:
1075 case Opt_offgrpjquota:
1076 case Opt_jqfmt_vfsold:
1077 case Opt_jqfmt_vfsv0:
1078 printk(KERN_ERR
1079 "EXT4-fs: journalled quota options not "
1080 "supported.\n");
1081 break;
1082 case Opt_noquota:
1083 break;
1084#endif
1085 case Opt_abort:
1086 set_opt(sbi->s_mount_opt, ABORT);
1087 break;
1088 case Opt_barrier:
1089 if (match_int(&args[0], &option))
1090 return 0;
1091 if (option)
1092 set_opt(sbi->s_mount_opt, BARRIER);
1093 else
1094 clear_opt(sbi->s_mount_opt, BARRIER);
1095 break;
1096 case Opt_ignore:
1097 break;
1098 case Opt_resize:
1099 if (!is_remount) {
1100 printk("EXT4-fs: resize option only available "
1101 "for remount\n");
1102 return 0;
1103 }
1104 if (match_int(&args[0], &option) != 0)
1105 return 0;
1106 *n_blocks_count = option;
1107 break;
1108 case Opt_nobh:
1109 set_opt(sbi->s_mount_opt, NOBH);
1110 break;
1111 case Opt_bh:
1112 clear_opt(sbi->s_mount_opt, NOBH);
1113 break;
1114 case Opt_extents:
1115 set_opt (sbi->s_mount_opt, EXTENTS);
1116 break;
1117 default:
1118 printk (KERN_ERR
1119 "EXT4-fs: Unrecognized mount option \"%s\" "
1120 "or missing value\n", p);
1121 return 0;
1122 }
1123 }
1124#ifdef CONFIG_QUOTA
1125 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1126 if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
1127 sbi->s_qf_names[USRQUOTA])
1128 clear_opt(sbi->s_mount_opt, USRQUOTA);
1129
1130 if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
1131 sbi->s_qf_names[GRPQUOTA])
1132 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1133
1134 if ((sbi->s_qf_names[USRQUOTA] &&
1135 (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
1136 (sbi->s_qf_names[GRPQUOTA] &&
1137 (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
1138 printk(KERN_ERR "EXT4-fs: old and new quota "
1139 "format mixing.\n");
1140 return 0;
1141 }
1142
1143 if (!sbi->s_jquota_fmt) {
1144 printk(KERN_ERR "EXT4-fs: journalled quota format "
1145 "not specified.\n");
1146 return 0;
1147 }
1148 } else {
1149 if (sbi->s_jquota_fmt) {
1150 printk(KERN_ERR "EXT4-fs: journalled quota format "
1151 "specified with no journalling "
1152 "enabled.\n");
1153 return 0;
1154 }
1155 }
1156#endif
1157 return 1;
1158}
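A usage illustration of the journalled-quota consistency checks at the end of parse_options() (aquota.user is just a conventional example name, not from this patch):

	-o usrjquota=aquota.user,jqfmt=vfsv0,data=ordered    accepted
	-o usrjquota=aquota.user,data=ordered                rejected: "journalled quota format not specified"
	-o usrjquota=aquota.user,grpquota                    rejected: "old and new quota format mixing"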
1159
1160static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1161 int read_only)
1162{
1163 struct ext4_sb_info *sbi = EXT4_SB(sb);
1164 int res = 0;
1165
1166 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1167 printk (KERN_ERR "EXT4-fs warning: revision level too high, "
1168 "forcing read-only mode\n");
1169 res = MS_RDONLY;
1170 }
1171 if (read_only)
1172 return res;
1173 if (!(sbi->s_mount_state & EXT4_VALID_FS))
1174 printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
1175 "running e2fsck is recommended\n");
1176 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
1177 printk (KERN_WARNING
1178 "EXT4-fs warning: mounting fs with errors, "
1179 "running e2fsck is recommended\n");
1180 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1181 le16_to_cpu(es->s_mnt_count) >=
1182 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1183 printk (KERN_WARNING
1184 "EXT4-fs warning: maximal mount count reached, "
1185 "running e2fsck is recommended\n");
1186 else if (le32_to_cpu(es->s_checkinterval) &&
1187 (le32_to_cpu(es->s_lastcheck) +
1188 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1189 printk (KERN_WARNING
1190 "EXT4-fs warning: checktime reached, "
1191 "running e2fsck is recommended\n");
1192#if 0
1193 /* @@@ We _will_ want to clear the valid bit if we find
1194 * inconsistencies, to force a fsck at reboot. But for
1195 * a plain journaled filesystem we can keep it set as
1196 * valid forever! :)
1197 */
1198 es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS);
1199#endif
1200 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1201 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1202 es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
1203 es->s_mtime = cpu_to_le32(get_seconds());
1204 ext4_update_dynamic_rev(sb);
1205 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1206
1207 ext4_commit_super(sb, es, 1);
1208 if (test_opt(sb, DEBUG))
1209 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
1210 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1211 sb->s_blocksize,
1212 sbi->s_groups_count,
1213 EXT4_BLOCKS_PER_GROUP(sb),
1214 EXT4_INODES_PER_GROUP(sb),
1215 sbi->s_mount_opt);
1216
1217 printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
1218 if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
1219 char b[BDEVNAME_SIZE];
1220
1221 printk("external journal on %s\n",
1222 bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
1223 } else {
1224 printk("internal journal\n");
1225 }
1226 return res;
1227}
1228
1229/* Called at mount-time, super-block is locked */
1230static int ext4_check_descriptors (struct super_block * sb)
1231{
1232 struct ext4_sb_info *sbi = EXT4_SB(sb);
1233 ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1234 ext4_fsblk_t last_block;
1235 ext4_fsblk_t block_bitmap;
1236 ext4_fsblk_t inode_bitmap;
1237 ext4_fsblk_t inode_table;
1238 struct ext4_group_desc * gdp = NULL;
1239 int desc_block = 0;
1240 int i;
1241
1242 ext4_debug ("Checking group descriptors");
1243
1244 for (i = 0; i < sbi->s_groups_count; i++)
1245 {
1246 if (i == sbi->s_groups_count - 1)
1247 last_block = ext4_blocks_count(sbi->s_es) - 1;
1248 else
1249 last_block = first_block +
1250 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
1251
1252 if ((i % EXT4_DESC_PER_BLOCK(sb)) == 0)
1253 gdp = (struct ext4_group_desc *)
1254 sbi->s_group_desc[desc_block++]->b_data;
1255 block_bitmap = ext4_block_bitmap(sb, gdp);
1256 if (block_bitmap < first_block || block_bitmap > last_block)
1257 {
1258 ext4_error (sb, "ext4_check_descriptors",
1259 "Block bitmap for group %d"
1260 " not in group (block %llu)!",
1261 i, block_bitmap);
1262 return 0;
1263 }
1264 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1265 if (inode_bitmap < first_block || inode_bitmap > last_block)
1266 {
1267 ext4_error (sb, "ext4_check_descriptors",
1268 "Inode bitmap for group %d"
1269 " not in group (block %llu)!",
1270 i, inode_bitmap);
1271 return 0;
1272 }
1273 inode_table = ext4_inode_table(sb, gdp);
1274 if (inode_table < first_block ||
1275 inode_table + sbi->s_itb_per_group > last_block)
1276 {
1277 ext4_error (sb, "ext4_check_descriptors",
1278 "Inode table for group %d"
1279 " not in group (block %llu)!",
1280 i, inode_table);
1281 return 0;
1282 }
1283 first_block += EXT4_BLOCKS_PER_GROUP(sb);
1284 gdp = (struct ext4_group_desc *)
1285 ((__u8 *)gdp + EXT4_DESC_SIZE(sb));
1286 }
1287
1288 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
1289 sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
1290 return 1;
1291}
1292
1293
1294/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
1295 * the superblock) which were deleted from all directories, but held open by
1296 * a process at the time of a crash. We walk the list and try to delete these
1297 * inodes at recovery time (only with a read-write filesystem).
1298 *
1299 * In order to keep the orphan inode chain consistent during traversal (in
1300 * case of crash during recovery), we link each inode into the superblock
1301 * orphan list_head and handle it the same way as an inode deletion during
1302 * normal operation (which journals the operations for us).
1303 *
1304 * We only do an iget() and an iput() on each inode, which is very safe if we
1305 * accidentally point at an in-use or already deleted inode. The worst that
1306 * can happen in this case is that we get a "bit already cleared" message from
1307 * ext4_free_inode(). The only reason we would point at a wrong inode is if
1308 * e2fsck was run on this filesystem, and it must have already done the orphan
1309 * inode cleanup for us, so we can safely abort without any further action.
1310 */
1311static void ext4_orphan_cleanup (struct super_block * sb,
1312 struct ext4_super_block * es)
1313{
1314 unsigned int s_flags = sb->s_flags;
1315 int nr_orphans = 0, nr_truncates = 0;
1316#ifdef CONFIG_QUOTA
1317 int i;
1318#endif
1319 if (!es->s_last_orphan) {
1320 jbd_debug(4, "no orphan inodes to clean up\n");
1321 return;
1322 }
1323
1324 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
1325 if (es->s_last_orphan)
1326 jbd_debug(1, "Errors on filesystem, "
1327 "clearing orphan list.\n");
1328 es->s_last_orphan = 0;
1329 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1330 return;
1331 }
1332
1333 if (s_flags & MS_RDONLY) {
1334 printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
1335 sb->s_id);
1336 sb->s_flags &= ~MS_RDONLY;
1337 }
1338#ifdef CONFIG_QUOTA
1339 /* Needed for iput() to work correctly and not trash data */
1340 sb->s_flags |= MS_ACTIVE;
1341 /* Turn on quotas so that they are updated correctly */
1342 for (i = 0; i < MAXQUOTAS; i++) {
1343 if (EXT4_SB(sb)->s_qf_names[i]) {
1344 int ret = ext4_quota_on_mount(sb, i);
1345 if (ret < 0)
1346 printk(KERN_ERR
1347 "EXT4-fs: Cannot turn on journalled "
1348 "quota: error %d\n", ret);
1349 }
1350 }
1351#endif
1352
1353 while (es->s_last_orphan) {
1354 struct inode *inode;
1355
1356 if (!(inode =
1357 ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
1358 es->s_last_orphan = 0;
1359 break;
1360 }
1361
1362 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
1363 DQUOT_INIT(inode);
1364 if (inode->i_nlink) {
1365 printk(KERN_DEBUG
1366 "%s: truncating inode %lu to %Ld bytes\n",
1367 __FUNCTION__, inode->i_ino, inode->i_size);
1368 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1369 inode->i_ino, inode->i_size);
1370 ext4_truncate(inode);
1371 nr_truncates++;
1372 } else {
1373 printk(KERN_DEBUG
1374 "%s: deleting unreferenced inode %lu\n",
1375 __FUNCTION__, inode->i_ino);
1376 jbd_debug(2, "deleting unreferenced inode %lu\n",
1377 inode->i_ino);
1378 nr_orphans++;
1379 }
1380 iput(inode); /* The delete magic happens here! */
1381 }
1382
1383#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1384
1385 if (nr_orphans)
1386 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
1387 sb->s_id, PLURAL(nr_orphans));
1388 if (nr_truncates)
1389 printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
1390 sb->s_id, PLURAL(nr_truncates));
1391#ifdef CONFIG_QUOTA
1392 /* Turn quotas off */
1393 for (i = 0; i < MAXQUOTAS; i++) {
1394 if (sb_dqopt(sb)->files[i])
1395 vfs_quota_off(sb, i);
1396 }
1397#endif
1398 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1399}
1400
1401#define log2(n) ffz(~(n))
1402
1403/*
1404 * Maximal file size. There is a direct, and {,double-,triple-}indirect
1405 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
1406 * We need to be 1 filesystem block less than the 2^32 sector limit.
1407 */
1408static loff_t ext4_max_size(int bits)
1409{
1410 loff_t res = EXT4_NDIR_BLOCKS;
1411 /* This constant is calculated to be the largest file size for a
1412 * dense, 4k-blocksize file such that the total number of
1413 * sectors in the file, including data and all indirect blocks,
1414 * does not exceed 2^32. */
1415 const loff_t upper_limit = 0x1ff7fffd000LL;
1416
1417 res += 1LL << (bits-2);
1418 res += 1LL << (2*(bits-2));
1419 res += 1LL << (3*(bits-2));
1420 res <<= bits;
1421 if (res > upper_limit)
1422 res = upper_limit;
1423 return res;
1424}
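Plugging bits = 12 (4 KiB blocks) into the function above shows where the cap comes from; the figures below are derived from the formula and rounded:

	res  = 12 + 2^10 + 2^20 + 2^30   = 1,074,791,436 blocks
	res << 12                        ≈ 4.0 TiB   (indirect-tree limit)
	upper_limit = 0x1ff7fffd000      ≈ 2.0 TiB   (2^32 512-byte sectors,
						      minus indirect overhead)
	ext4_max_size(12)                ≈ 2.0 TiB   (the smaller of the two)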
1425
1426static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1427 ext4_fsblk_t logical_sb_block, int nr)
1428{
1429 struct ext4_sb_info *sbi = EXT4_SB(sb);
1430 unsigned long bg, first_meta_bg;
1431 int has_super = 0;
1432
1433 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1434
1435 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
1436 nr < first_meta_bg)
1437 return logical_sb_block + nr + 1;
1438 bg = sbi->s_desc_per_block * nr;
1439 if (ext4_bg_has_super(sb, bg))
1440 has_super = 1;
1441 return (has_super + ext4_group_first_block_no(sb, bg));
1442}
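A worked example of the lookup above (block numbers are illustrative): without META_BG, or for descriptor blocks below s_first_meta_bg, descriptor block nr simply follows the superblock, so on a 4 KiB filesystem with logical_sb_block = 0 descriptor block 3 is read from block 4. With META_BG, descriptor block nr lives at the start of group s_desc_per_block * nr, shifted by one block when that group also carries a superblock backup.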
1443
1444
1445static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1446{
1447 struct buffer_head * bh;
1448 struct ext4_super_block *es = NULL;
1449 struct ext4_sb_info *sbi;
1450 ext4_fsblk_t block;
1451 ext4_fsblk_t sb_block = get_sb_block(&data);
1452 ext4_fsblk_t logical_sb_block;
1453 unsigned long offset = 0;
1454 unsigned int journal_inum = 0;
1455 unsigned long journal_devnum = 0;
1456 unsigned long def_mount_opts;
1457 struct inode *root;
1458 int blocksize;
1459 int hblock;
1460 int db_count;
1461 int i;
1462 int needs_recovery;
1463 __le32 features;
1464 __u64 blocks_count;
1465
1466 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1467 if (!sbi)
1468 return -ENOMEM;
1469 sb->s_fs_info = sbi;
1470 sbi->s_mount_opt = 0;
1471 sbi->s_resuid = EXT4_DEF_RESUID;
1472 sbi->s_resgid = EXT4_DEF_RESGID;
1473
1474 unlock_kernel();
1475
1476 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
1477 if (!blocksize) {
1478 printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
1479 goto out_fail;
1480 }
1481
1482 /*
1483 * The ext4 superblock will not be buffer aligned for other than 1kB
1484 * block sizes. We need to calculate the offset from buffer start.
1485 */
1486 if (blocksize != EXT4_MIN_BLOCK_SIZE) {
1487 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1488 offset = do_div(logical_sb_block, blocksize);
1489 } else {
1490 logical_sb_block = sb_block;
1491 }
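	/*
	 * Worked example (illustrative): with the usual sb_block of 1 (the
	 * superblock lives 1024 bytes into the device) and a blocksize of
	 * 4096, logical_sb_block starts as 1 * 1024 = 1024; do_div() leaves
	 * logical_sb_block == 0 and returns offset == 1024, so the superblock
	 * is read from filesystem block 0 and found 1024 bytes into that
	 * buffer.
	 */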
1492
1493 if (!(bh = sb_bread(sb, logical_sb_block))) {
1494 printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
1495 goto out_fail;
1496 }
1497 /*
1498 * Note: s_es must be initialized as soon as possible because
1499	 * some ext4 macros depend on its value
1500 */
1501 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
1502 sbi->s_es = es;
1503 sb->s_magic = le16_to_cpu(es->s_magic);
1504 if (sb->s_magic != EXT4_SUPER_MAGIC)
1505 goto cantfind_ext4;
1506
1507 /* Set defaults before we parse the mount options */
1508 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1509 if (def_mount_opts & EXT4_DEFM_DEBUG)
1510 set_opt(sbi->s_mount_opt, DEBUG);
1511 if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
1512 set_opt(sbi->s_mount_opt, GRPID);
1513 if (def_mount_opts & EXT4_DEFM_UID16)
1514 set_opt(sbi->s_mount_opt, NO_UID32);
1515 if (def_mount_opts & EXT4_DEFM_XATTR_USER)
1516 set_opt(sbi->s_mount_opt, XATTR_USER);
1517 if (def_mount_opts & EXT4_DEFM_ACL)
1518 set_opt(sbi->s_mount_opt, POSIX_ACL);
1519 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
1520 sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
1521 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
1522 sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
1523 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
1524 sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
1525
1526 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
1527 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1528 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
1529 set_opt(sbi->s_mount_opt, ERRORS_RO);
1530 else
1531 set_opt(sbi->s_mount_opt, ERRORS_CONT);
1532
1533 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1534 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
1535
1536 set_opt(sbi->s_mount_opt, RESERVATION);
1537
1538 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1539 NULL, 0))
1540 goto failed_mount;
1541
1542 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1543 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1544
1545 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
1546 (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
1547 EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1548 EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1549 printk(KERN_WARNING
1550 "EXT4-fs warning: feature flags set on rev 0 fs, "
1551 "running e2fsck is recommended\n");
1552 /*
1553 * Check feature flags regardless of the revision level, since we
1554 * previously didn't change the revision level when setting the flags,
1555 * so there is a chance incompat flags are set on a rev 0 filesystem.
1556 */
1557 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
1558 if (features) {
1559 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
1560 "unsupported optional features (%x).\n",
1561 sb->s_id, le32_to_cpu(features));
1562 goto failed_mount;
1563 }
1564 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
1565 if (!(sb->s_flags & MS_RDONLY) && features) {
1566 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
1567 "unsupported optional features (%x).\n",
1568 sb->s_id, le32_to_cpu(features));
1569 goto failed_mount;
1570 }
1571 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1572
1573 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
1574 blocksize > EXT4_MAX_BLOCK_SIZE) {
1575 printk(KERN_ERR
1576 "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
1577 blocksize, sb->s_id);
1578 goto failed_mount;
1579 }
1580
1581 hblock = bdev_hardsect_size(sb->s_bdev);
1582 if (sb->s_blocksize != blocksize) {
1583 /*
1584		 * Make sure the blocksize for the filesystem is at least
1585		 * as large as the hardware sector size of the device.
1586 */
1587 if (blocksize < hblock) {
1588 printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
1589 "device blocksize %d.\n", blocksize, hblock);
1590 goto failed_mount;
1591 }
1592
1593 brelse (bh);
1594 sb_set_blocksize(sb, blocksize);
1595 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
1596 offset = do_div(logical_sb_block, blocksize);
1597 bh = sb_bread(sb, logical_sb_block);
1598 if (!bh) {
1599 printk(KERN_ERR
1600 "EXT4-fs: Can't read superblock on 2nd try.\n");
1601 goto failed_mount;
1602 }
1603 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
1604 sbi->s_es = es;
1605 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
1606 printk (KERN_ERR
1607 "EXT4-fs: Magic mismatch, very weird !\n");
1608 goto failed_mount;
1609 }
1610 }
1611
1612 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
1613
1614 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
1615 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
1616 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
1617 } else {
1618 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1619 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1620 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
1621 (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
1622 (sbi->s_inode_size > blocksize)) {
1623 printk (KERN_ERR
1624 "EXT4-fs: unsupported inode size: %d\n",
1625 sbi->s_inode_size);
1626 goto failed_mount;
1627 }
1628 }
1629 sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
1630 le32_to_cpu(es->s_log_frag_size);
1631 if (blocksize != sbi->s_frag_size) {
1632 printk(KERN_ERR
1633 "EXT4-fs: fragsize %lu != blocksize %u (unsupported)\n",
1634 sbi->s_frag_size, blocksize);
1635 goto failed_mount;
1636 }
1637 sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
1638 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
1639 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
1640 sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
1641 sbi->s_desc_size & (sbi->s_desc_size - 1)) {
1642 printk(KERN_ERR
1643 "EXT4-fs: unsupported descriptor size %lu\n",
1644 sbi->s_desc_size);
1645 goto failed_mount;
1646 }
1647 } else
1648 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
1649 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1650 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1651 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1652 if (EXT4_INODE_SIZE(sb) == 0)
1653 goto cantfind_ext4;
1654 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
1655 if (sbi->s_inodes_per_block == 0)
1656 goto cantfind_ext4;
1657 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1658 sbi->s_inodes_per_block;
1659 sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
1660 sbi->s_sbh = bh;
1661 sbi->s_mount_state = le16_to_cpu(es->s_state);
1662 sbi->s_addr_per_block_bits = log2(EXT4_ADDR_PER_BLOCK(sb));
1663 sbi->s_desc_per_block_bits = log2(EXT4_DESC_PER_BLOCK(sb));
1664 for (i=0; i < 4; i++)
1665 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1666 sbi->s_def_hash_version = es->s_def_hash_version;
1667
1668 if (sbi->s_blocks_per_group > blocksize * 8) {
1669 printk (KERN_ERR
1670 "EXT4-fs: #blocks per group too big: %lu\n",
1671 sbi->s_blocks_per_group);
1672 goto failed_mount;
1673 }
1674 if (sbi->s_frags_per_group > blocksize * 8) {
1675 printk (KERN_ERR
1676 "EXT4-fs: #fragments per group too big: %lu\n",
1677 sbi->s_frags_per_group);
1678 goto failed_mount;
1679 }
1680 if (sbi->s_inodes_per_group > blocksize * 8) {
1681 printk (KERN_ERR
1682 "EXT4-fs: #inodes per group too big: %lu\n",
1683 sbi->s_inodes_per_group);
1684 goto failed_mount;
1685 }
1686
1687 if (ext4_blocks_count(es) >
1688 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1689 printk(KERN_ERR "EXT4-fs: filesystem on %s:"
1690 " too large to mount safely\n", sb->s_id);
1691 if (sizeof(sector_t) < 8)
1692 printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
1693 "enabled\n");
1694 goto failed_mount;
1695 }
1696
1697 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
1698 goto cantfind_ext4;
1699 blocks_count = (ext4_blocks_count(es) -
1700 le32_to_cpu(es->s_first_data_block) +
1701 EXT4_BLOCKS_PER_GROUP(sb) - 1);
1702 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
1703 sbi->s_groups_count = blocks_count;
1704 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1705 EXT4_DESC_PER_BLOCK(sb);
1706 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1707 GFP_KERNEL);
1708 if (sbi->s_group_desc == NULL) {
1709 printk (KERN_ERR "EXT4-fs: not enough memory\n");
1710 goto failed_mount;
1711 }
1712
1713 bgl_lock_init(&sbi->s_blockgroup_lock);
1714
1715 for (i = 0; i < db_count; i++) {
1716 block = descriptor_loc(sb, logical_sb_block, i);
1717 sbi->s_group_desc[i] = sb_bread(sb, block);
1718 if (!sbi->s_group_desc[i]) {
1719 printk (KERN_ERR "EXT4-fs: "
1720 "can't read group descriptor %d\n", i);
1721 db_count = i;
1722 goto failed_mount2;
1723 }
1724 }
1725 if (!ext4_check_descriptors (sb)) {
1726 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
1727 goto failed_mount2;
1728 }
1729 sbi->s_gdb_count = db_count;
1730 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1731 spin_lock_init(&sbi->s_next_gen_lock);
1732
1733 percpu_counter_init(&sbi->s_freeblocks_counter,
1734 ext4_count_free_blocks(sb));
1735 percpu_counter_init(&sbi->s_freeinodes_counter,
1736 ext4_count_free_inodes(sb));
1737 percpu_counter_init(&sbi->s_dirs_counter,
1738 ext4_count_dirs(sb));
1739
1740	/* per filesystem reservation list head & lock */
1741 spin_lock_init(&sbi->s_rsv_window_lock);
1742 sbi->s_rsv_window_root = RB_ROOT;
1743 /* Add a single, static dummy reservation to the start of the
1744 * reservation window list --- it gives us a placeholder for
1745 * append-at-start-of-list which makes the allocation logic
1746 * _much_ simpler. */
1747 sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1748 sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
1749 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
1750 sbi->s_rsv_window_head.rsv_goal_size = 0;
1751 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
1752
1753 /*
1754 * set up enough so that it can read an inode
1755 */
1756 sb->s_op = &ext4_sops;
1757 sb->s_export_op = &ext4_export_ops;
1758 sb->s_xattr = ext4_xattr_handlers;
1759#ifdef CONFIG_QUOTA
1760 sb->s_qcop = &ext4_qctl_operations;
1761 sb->dq_op = &ext4_quota_operations;
1762#endif
1763 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1764
1765 sb->s_root = NULL;
1766
1767 needs_recovery = (es->s_last_orphan != 0 ||
1768 EXT4_HAS_INCOMPAT_FEATURE(sb,
1769 EXT4_FEATURE_INCOMPAT_RECOVER));
1770
1771 /*
1772 * The first inode we look at is the journal inode. Don't try
1773 * root first: it may be modified in the journal!
1774 */
1775 if (!test_opt(sb, NOLOAD) &&
1776 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
1777 if (ext4_load_journal(sb, es, journal_devnum))
1778 goto failed_mount3;
1779 } else if (journal_inum) {
1780 if (ext4_create_journal(sb, es, journal_inum))
1781 goto failed_mount3;
1782 } else {
1783 if (!silent)
1784 printk (KERN_ERR
1785 "ext4: No journal on filesystem on %s\n",
1786 sb->s_id);
1787 goto failed_mount3;
1788 }
1789
1790 /* We have now updated the journal if required, so we can
1791 * validate the data journaling mode. */
1792 switch (test_opt(sb, DATA_FLAGS)) {
1793 case 0:
1794 /* No mode set, assume a default based on the journal
1795 * capabilities: ORDERED_DATA if the journal can
1796 * cope, else JOURNAL_DATA
1797 */
1798 if (jbd2_journal_check_available_features
1799 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
1800 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1801 else
1802 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1803 break;
1804
1805 case EXT4_MOUNT_ORDERED_DATA:
1806 case EXT4_MOUNT_WRITEBACK_DATA:
1807 if (!jbd2_journal_check_available_features
1808 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
1809 printk(KERN_ERR "EXT4-fs: Journal does not support "
1810 "requested data journaling mode\n");
1811 goto failed_mount4;
1812 }
1813 default:
1814 break;
1815 }
1816
1817 if (test_opt(sb, NOBH)) {
1818 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
1819 printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
1820 "its supported only with writeback mode\n");
1821 clear_opt(sbi->s_mount_opt, NOBH);
1822 }
1823 }
1824 /*
1825 * The jbd2_journal_load will have done any necessary log recovery,
1826 * so we can safely mount the rest of the filesystem now.
1827 */
1828
1829 root = iget(sb, EXT4_ROOT_INO);
1830 sb->s_root = d_alloc_root(root);
1831 if (!sb->s_root) {
1832 printk(KERN_ERR "EXT4-fs: get root inode failed\n");
1833 iput(root);
1834 goto failed_mount4;
1835 }
1836 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1837 dput(sb->s_root);
1838 sb->s_root = NULL;
1839 printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
1840 goto failed_mount4;
1841 }
1842
1843 ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1844 /*
1845 * akpm: core read_super() calls in here with the superblock locked.
1846 * That deadlocks, because orphan cleanup needs to lock the superblock
1847 * in numerous places. Here we just pop the lock - it's relatively
1848 * harmless, because we are now ready to accept write_super() requests,
1849 * and aviro says that's the only reason for hanging onto the
1850 * superblock lock.
1851 */
1852 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
1853 ext4_orphan_cleanup(sb, es);
1854 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
1855 if (needs_recovery)
1856 printk (KERN_INFO "EXT4-fs: recovery complete.\n");
1857 ext4_mark_recovery_complete(sb, es);
1858 printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
1859 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
1860 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
1861 "writeback");
1862
1863 ext4_ext_init(sb);
1864
1865 lock_kernel();
1866 return 0;
1867
1868cantfind_ext4:
1869 if (!silent)
1870 printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
1871 sb->s_id);
1872 goto failed_mount;
1873
1874failed_mount4:
1875 jbd2_journal_destroy(sbi->s_journal);
1876failed_mount3:
1877 percpu_counter_destroy(&sbi->s_freeblocks_counter);
1878 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1879 percpu_counter_destroy(&sbi->s_dirs_counter);
1880failed_mount2:
1881 for (i = 0; i < db_count; i++)
1882 brelse(sbi->s_group_desc[i]);
1883 kfree(sbi->s_group_desc);
1884failed_mount:
1885#ifdef CONFIG_QUOTA
1886 for (i = 0; i < MAXQUOTAS; i++)
1887 kfree(sbi->s_qf_names[i]);
1888#endif
1889 ext4_blkdev_remove(sbi);
1890 brelse(bh);
1891out_fail:
1892 sb->s_fs_info = NULL;
1893 kfree(sbi);
1894 lock_kernel();
1895 return -EINVAL;
1896}
1897
1898/*
1899 * Setup any per-fs journal parameters now. We'll do this both on
1900 * initial mount, once the journal has been initialised but before we've
1901 * done any recovery; and again on any subsequent remount.
1902 */
1903static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
1904{
1905 struct ext4_sb_info *sbi = EXT4_SB(sb);
1906
1907 if (sbi->s_commit_interval)
1908 journal->j_commit_interval = sbi->s_commit_interval;
1909 /* We could also set up an ext4-specific default for the commit
1910 * interval here, but for now we'll just fall back to the jbd
1911 * default. */
1912
1913 spin_lock(&journal->j_state_lock);
1914 if (test_opt(sb, BARRIER))
1915 journal->j_flags |= JBD2_BARRIER;
1916 else
1917 journal->j_flags &= ~JBD2_BARRIER;
1918 spin_unlock(&journal->j_state_lock);
1919}
1920
1921static journal_t *ext4_get_journal(struct super_block *sb,
1922 unsigned int journal_inum)
1923{
1924 struct inode *journal_inode;
1925 journal_t *journal;
1926
1927 /* First, test for the existence of a valid inode on disk. Bad
1928 * things happen if we iget() an unused inode, as the subsequent
1929 * iput() will try to delete it. */
1930
1931 journal_inode = iget(sb, journal_inum);
1932 if (!journal_inode) {
1933 printk(KERN_ERR "EXT4-fs: no journal found.\n");
1934 return NULL;
1935 }
1936 if (!journal_inode->i_nlink) {
1937 make_bad_inode(journal_inode);
1938 iput(journal_inode);
1939 printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
1940 return NULL;
1941 }
1942
1943 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
1944 journal_inode, journal_inode->i_size);
1945 if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
1946 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
1947 iput(journal_inode);
1948 return NULL;
1949 }
1950
1951 journal = jbd2_journal_init_inode(journal_inode);
1952 if (!journal) {
1953 printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
1954 iput(journal_inode);
1955 return NULL;
1956 }
1957 journal->j_private = sb;
1958 ext4_init_journal_params(sb, journal);
1959 return journal;
1960}
1961
1962static journal_t *ext4_get_dev_journal(struct super_block *sb,
1963 dev_t j_dev)
1964{
1965 struct buffer_head * bh;
1966 journal_t *journal;
1967 ext4_fsblk_t start;
1968 ext4_fsblk_t len;
1969 int hblock, blocksize;
1970 ext4_fsblk_t sb_block;
1971 unsigned long offset;
1972 struct ext4_super_block * es;
1973 struct block_device *bdev;
1974
1975 bdev = ext4_blkdev_get(j_dev);
1976 if (bdev == NULL)
1977 return NULL;
1978
1979 if (bd_claim(bdev, sb)) {
1980 printk(KERN_ERR
1981 "EXT4: failed to claim external journal device.\n");
1982 blkdev_put(bdev);
1983 return NULL;
1984 }
1985
1986 blocksize = sb->s_blocksize;
1987 hblock = bdev_hardsect_size(bdev);
1988 if (blocksize < hblock) {
1989 printk(KERN_ERR
1990 "EXT4-fs: blocksize too small for journal device.\n");
1991 goto out_bdev;
1992 }
1993
1994 sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
1995 offset = EXT4_MIN_BLOCK_SIZE % blocksize;
1996 set_blocksize(bdev, blocksize);
1997 if (!(bh = __bread(bdev, sb_block, blocksize))) {
1998 printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
1999 "external journal\n");
2000 goto out_bdev;
2001 }
2002
2003 es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
2004 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
2005 !(le32_to_cpu(es->s_feature_incompat) &
2006 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
2007 printk(KERN_ERR "EXT4-fs: external journal has "
2008 "bad superblock\n");
2009 brelse(bh);
2010 goto out_bdev;
2011 }
2012
2013 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
2014 printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
2015 brelse(bh);
2016 goto out_bdev;
2017 }
2018
2019 len = ext4_blocks_count(es);
2020 start = sb_block + 1;
2021 brelse(bh); /* we're done with the superblock */
2022
2023 journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
2024 start, len, blocksize);
2025 if (!journal) {
2026 printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
2027 goto out_bdev;
2028 }
2029 journal->j_private = sb;
2030 ll_rw_block(READ, 1, &journal->j_sb_buffer);
2031 wait_on_buffer(journal->j_sb_buffer);
2032 if (!buffer_uptodate(journal->j_sb_buffer)) {
2033 printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
2034 goto out_journal;
2035 }
2036 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2037 printk(KERN_ERR "EXT4-fs: External journal has more than one "
2038 "user (unsupported) - %d\n",
2039 be32_to_cpu(journal->j_superblock->s_nr_users));
2040 goto out_journal;
2041 }
2042 EXT4_SB(sb)->journal_bdev = bdev;
2043 ext4_init_journal_params(sb, journal);
2044 return journal;
2045out_journal:
2046 jbd2_journal_destroy(journal);
2047out_bdev:
2048 ext4_blkdev_put(bdev);
2049 return NULL;
2050}
2051
2052static int ext4_load_journal(struct super_block *sb,
2053 struct ext4_super_block *es,
2054 unsigned long journal_devnum)
2055{
2056 journal_t *journal;
2057 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
2058 dev_t journal_dev;
2059 int err = 0;
2060 int really_read_only;
2061
2062 if (journal_devnum &&
2063 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2064 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
2065 "numbers have changed\n");
2066 journal_dev = new_decode_dev(journal_devnum);
2067 } else
2068 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
2069
2070 really_read_only = bdev_read_only(sb->s_bdev);
2071
2072 /*
2073 * Are we loading a blank journal or performing recovery after a
2074 * crash? For recovery, we need to check in advance whether we
2075 * can get read-write access to the device.
2076 */
2077
2078 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2079 if (sb->s_flags & MS_RDONLY) {
2080 printk(KERN_INFO "EXT4-fs: INFO: recovery "
2081 "required on readonly filesystem.\n");
2082 if (really_read_only) {
2083 printk(KERN_ERR "EXT4-fs: write access "
2084 "unavailable, cannot proceed.\n");
2085 return -EROFS;
2086 }
2087 printk (KERN_INFO "EXT4-fs: write access will "
2088 "be enabled during recovery.\n");
2089 }
2090 }
2091
2092 if (journal_inum && journal_dev) {
2093 printk(KERN_ERR "EXT4-fs: filesystem has both journal "
2094 "and inode journals!\n");
2095 return -EINVAL;
2096 }
2097
2098 if (journal_inum) {
2099 if (!(journal = ext4_get_journal(sb, journal_inum)))
2100 return -EINVAL;
2101 } else {
2102 if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
2103 return -EINVAL;
2104 }
2105
2106 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2107 err = jbd2_journal_update_format(journal);
2108 if (err) {
2109 printk(KERN_ERR "EXT4-fs: error updating journal.\n");
2110 jbd2_journal_destroy(journal);
2111 return err;
2112 }
2113 }
2114
2115 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
2116 err = jbd2_journal_wipe(journal, !really_read_only);
2117 if (!err)
2118 err = jbd2_journal_load(journal);
2119
2120 if (err) {
2121 printk(KERN_ERR "EXT4-fs: error loading journal.\n");
2122 jbd2_journal_destroy(journal);
2123 return err;
2124 }
2125
2126 EXT4_SB(sb)->s_journal = journal;
2127 ext4_clear_journal_err(sb, es);
2128
2129 if (journal_devnum &&
2130 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2131 es->s_journal_dev = cpu_to_le32(journal_devnum);
2132 sb->s_dirt = 1;
2133
2134 /* Make sure we flush the recovery flag to disk. */
2135 ext4_commit_super(sb, es, 1);
2136 }
2137
2138 return 0;
2139}
2140
2141static int ext4_create_journal(struct super_block * sb,
2142 struct ext4_super_block * es,
2143 unsigned int journal_inum)
2144{
2145 journal_t *journal;
2146
2147 if (sb->s_flags & MS_RDONLY) {
2148 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2149 "create journal.\n");
2150 return -EROFS;
2151 }
2152
2153 if (!(journal = ext4_get_journal(sb, journal_inum)))
2154 return -EINVAL;
2155
2156 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2157 journal_inum);
2158
2159 if (jbd2_journal_create(journal)) {
2160 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2161 jbd2_journal_destroy(journal);
2162 return -EIO;
2163 }
2164
2165 EXT4_SB(sb)->s_journal = journal;
2166
2167 ext4_update_dynamic_rev(sb);
2168 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2169 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2170
2171 es->s_journal_inum = cpu_to_le32(journal_inum);
2172 sb->s_dirt = 1;
2173
2174 /* Make sure we flush the recovery flag to disk. */
2175 ext4_commit_super(sb, es, 1);
2176
2177 return 0;
2178}
2179
2180static void ext4_commit_super (struct super_block * sb,
2181 struct ext4_super_block * es,
2182 int sync)
2183{
2184 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2185
2186 if (!sbh)
2187 return;
2188 es->s_wtime = cpu_to_le32(get_seconds());
2189 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2190 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2191 BUFFER_TRACE(sbh, "marking dirty");
2192 mark_buffer_dirty(sbh);
2193 if (sync)
2194 sync_dirty_buffer(sbh);
2195}
2196
2197
2198/*
2199 * Have we just finished recovery? If so, and if we are mounting (or
2200 * remounting) the filesystem readonly, then we will end up with a
2201 * consistent fs on disk. Record that fact.
2202 */
2203static void ext4_mark_recovery_complete(struct super_block * sb,
2204 struct ext4_super_block * es)
2205{
2206 journal_t *journal = EXT4_SB(sb)->s_journal;
2207
2208 jbd2_journal_lock_updates(journal);
2209 jbd2_journal_flush(journal);
2210 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2211 sb->s_flags & MS_RDONLY) {
2212 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2213 sb->s_dirt = 0;
2214 ext4_commit_super(sb, es, 1);
2215 }
2216 jbd2_journal_unlock_updates(journal);
2217}
2218
2219/*
2220 * If we are mounting (or read-write remounting) a filesystem whose journal
2221 * has recorded an error from a previous lifetime, move that error to the
2222 * main filesystem now.
2223 */
2224static void ext4_clear_journal_err(struct super_block * sb,
2225 struct ext4_super_block * es)
2226{
2227 journal_t *journal;
2228 int j_errno;
2229 const char *errstr;
2230
2231 journal = EXT4_SB(sb)->s_journal;
2232
2233 /*
2234 * Now check for any error status which may have been recorded in the
2235 * journal by a prior ext4_error() or ext4_abort()
2236 */
2237
2238 j_errno = jbd2_journal_errno(journal);
2239 if (j_errno) {
2240 char nbuf[16];
2241
2242 errstr = ext4_decode_error(sb, j_errno, nbuf);
2243 ext4_warning(sb, __FUNCTION__, "Filesystem error recorded "
2244 "from previous mount: %s", errstr);
2245 ext4_warning(sb, __FUNCTION__, "Marking fs in need of "
2246 "filesystem check.");
2247
2248 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2249 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2250 ext4_commit_super (sb, es, 1);
2251
2252 jbd2_journal_clear_err(journal);
2253 }
2254}
2255
2256/*
2257 * Force the running and committing transactions to commit,
2258 * and wait on the commit.
2259 */
2260int ext4_force_commit(struct super_block *sb)
2261{
2262 journal_t *journal;
2263 int ret;
2264
2265 if (sb->s_flags & MS_RDONLY)
2266 return 0;
2267
2268 journal = EXT4_SB(sb)->s_journal;
2269 sb->s_dirt = 0;
2270 ret = ext4_journal_force_commit(journal);
2271 return ret;
2272}
2273
2274/*
2275 * Ext4 always journals updates to the superblock itself, so we don't
2276 * have to propagate any other updates to the superblock on disk at this
2277 * point. Just start an async writeback to get the buffers on their way
2278 * to the disk.
2279 *
2280 * This implicitly triggers the writebehind on sync().
2281 */
2282
2283static void ext4_write_super (struct super_block * sb)
2284{
2285 if (mutex_trylock(&sb->s_lock) != 0)
2286 BUG();
2287 sb->s_dirt = 0;
2288}
2289
2290static int ext4_sync_fs(struct super_block *sb, int wait)
2291{
2292 tid_t target;
2293
2294 sb->s_dirt = 0;
2295 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2296 if (wait)
2297 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
2298 }
2299 return 0;
2300}
2301
2302/*
2303 * LVM calls this function before a (read-only) snapshot is created. This
2304 * gives us a chance to flush the journal completely and mark the fs clean.
2305 */
2306static void ext4_write_super_lockfs(struct super_block *sb)
2307{
2308 sb->s_dirt = 0;
2309
2310 if (!(sb->s_flags & MS_RDONLY)) {
2311 journal_t *journal = EXT4_SB(sb)->s_journal;
2312
2313 /* Now we set up the journal barrier. */
2314 jbd2_journal_lock_updates(journal);
2315 jbd2_journal_flush(journal);
2316
2317 /* Journal blocked and flushed, clear needs_recovery flag. */
2318 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2319 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2320 }
2321}
2322
2323/*
2324 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2325 * flag here, even though the filesystem is not technically dirty yet.
2326 */
2327static void ext4_unlockfs(struct super_block *sb)
2328{
2329 if (!(sb->s_flags & MS_RDONLY)) {
2330 lock_super(sb);
2331		/* Reset the needs_recovery flag before the fs is unlocked. */
2332 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2333 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
2334 unlock_super(sb);
2335 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2336 }
2337}
2338
2339static int ext4_remount (struct super_block * sb, int * flags, char * data)
2340{
2341 struct ext4_super_block * es;
2342 struct ext4_sb_info *sbi = EXT4_SB(sb);
2343 ext4_fsblk_t n_blocks_count = 0;
2344 unsigned long old_sb_flags;
2345 struct ext4_mount_options old_opts;
2346 int err;
2347#ifdef CONFIG_QUOTA
2348 int i;
2349#endif
2350
2351 /* Store the original options */
2352 old_sb_flags = sb->s_flags;
2353 old_opts.s_mount_opt = sbi->s_mount_opt;
2354 old_opts.s_resuid = sbi->s_resuid;
2355 old_opts.s_resgid = sbi->s_resgid;
2356 old_opts.s_commit_interval = sbi->s_commit_interval;
2357#ifdef CONFIG_QUOTA
2358 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2359 for (i = 0; i < MAXQUOTAS; i++)
2360 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2361#endif
2362
2363 /*
2364 * Allow the "check" option to be passed as a remount option.
2365 */
2366 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2367 err = -EINVAL;
2368 goto restore_opts;
2369 }
2370
2371 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
2372 ext4_abort(sb, __FUNCTION__, "Abort forced by user");
2373
2374 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2375 ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
2376
2377 es = sbi->s_es;
2378
2379 ext4_init_journal_params(sb, sbi->s_journal);
2380
2381 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2382 n_blocks_count > ext4_blocks_count(es)) {
2383 if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
2384 err = -EROFS;
2385 goto restore_opts;
2386 }
2387
2388 if (*flags & MS_RDONLY) {
2389 /*
2390 * First of all, the unconditional stuff we have to do
2391 * to disable replay of the journal when we next remount
2392 */
2393 sb->s_flags |= MS_RDONLY;
2394
2395 /*
2396 * OK, test if we are remounting a valid rw partition
2397 * readonly, and if so set the rdonly flag and then
2398 * mark the partition as valid again.
2399 */
2400 if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
2401 (sbi->s_mount_state & EXT4_VALID_FS))
2402 es->s_state = cpu_to_le16(sbi->s_mount_state);
2403
2404 ext4_mark_recovery_complete(sb, es);
2405 } else {
2406 __le32 ret;
2407 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2408 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
2409 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
2410 "remount RDWR because of unsupported "
2411 "optional features (%x).\n",
2412 sb->s_id, le32_to_cpu(ret));
2413 err = -EROFS;
2414 goto restore_opts;
2415 }
2416 /*
2417 * Mounting a RDONLY partition read-write, so reread
2418 * and store the current valid flag. (It may have
2419 * been changed by e2fsck since we originally mounted
2420 * the partition.)
2421 */
2422 ext4_clear_journal_err(sb, es);
2423 sbi->s_mount_state = le16_to_cpu(es->s_state);
2424 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
2425 goto restore_opts;
2426 if (!ext4_setup_super (sb, es, 0))
2427 sb->s_flags &= ~MS_RDONLY;
2428 }
2429 }
2430#ifdef CONFIG_QUOTA
2431 /* Release old quota file names */
2432 for (i = 0; i < MAXQUOTAS; i++)
2433 if (old_opts.s_qf_names[i] &&
2434 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2435 kfree(old_opts.s_qf_names[i]);
2436#endif
2437 return 0;
2438restore_opts:
2439 sb->s_flags = old_sb_flags;
2440 sbi->s_mount_opt = old_opts.s_mount_opt;
2441 sbi->s_resuid = old_opts.s_resuid;
2442 sbi->s_resgid = old_opts.s_resgid;
2443 sbi->s_commit_interval = old_opts.s_commit_interval;
2444#ifdef CONFIG_QUOTA
2445 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2446 for (i = 0; i < MAXQUOTAS; i++) {
2447 if (sbi->s_qf_names[i] &&
2448 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2449 kfree(sbi->s_qf_names[i]);
2450 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2451 }
2452#endif
2453 return err;
2454}
2455
2456static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
2457{
2458 struct super_block *sb = dentry->d_sb;
2459 struct ext4_sb_info *sbi = EXT4_SB(sb);
2460 struct ext4_super_block *es = sbi->s_es;
2461 ext4_fsblk_t overhead;
2462 int i;
2463
2464 if (test_opt (sb, MINIX_DF))
2465 overhead = 0;
2466 else {
2467 unsigned long ngroups;
2468 ngroups = EXT4_SB(sb)->s_groups_count;
2469 smp_rmb();
2470
2471 /*
2472 * Compute the overhead (FS structures)
2473 */
2474
2475 /*
2476 * All of the blocks before first_data_block are
2477 * overhead
2478 */
2479 overhead = le32_to_cpu(es->s_first_data_block);
2480
2481 /*
2482 * Add the overhead attributed to the superblock and
2483 * block group descriptors. If the sparse superblocks
2484 * feature is turned on, then not all groups have this.
2485 */
2486 for (i = 0; i < ngroups; i++) {
2487 overhead += ext4_bg_has_super(sb, i) +
2488 ext4_bg_num_gdb(sb, i);
2489 cond_resched();
2490 }
2491
2492 /*
2493 * Every block group has an inode bitmap, a block
2494 * bitmap, and an inode table.
2495 */
2496 overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group));
2497 }
2498
2499 buf->f_type = EXT4_SUPER_MAGIC;
2500 buf->f_bsize = sb->s_blocksize;
2501 buf->f_blocks = ext4_blocks_count(es) - overhead;
2502 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2503 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
2504 if (buf->f_bfree < ext4_r_blocks_count(es))
2505 buf->f_bavail = 0;
2506 buf->f_files = le32_to_cpu(es->s_inodes_count);
2507 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2508 buf->f_namelen = EXT4_NAME_LEN;
2509 return 0;
2510}
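As a concrete illustration of the overhead computed above, a toy userspace sketch assuming a hypothetical 8 GiB filesystem with 4k blocks, 32768 blocks per group, 256 inode-table blocks per group and, for simplicity (ignoring the sparse_super feature), a superblock backup plus one descriptor block in every group:

#include <stdio.h>

int main(void)
{
	unsigned long long total_blocks = (8ULL << 30) / 4096;	/* 8 GiB of 4k blocks */
	unsigned long blocks_per_group = 32768;
	unsigned long itb_per_group = 256;	/* e.g. 8192 inodes of 128 bytes each */
	unsigned long ngroups = (total_blocks + blocks_per_group - 1) / blocks_per_group;
	unsigned long long overhead = 0;	/* s_first_data_block is 0 for 4k blocks */
	unsigned long i;

	for (i = 0; i < ngroups; i++)
		overhead += 1 + 1;	/* superblock copy + group descriptor block */
	overhead += ngroups * (2 + itb_per_group);	/* bitmaps + inode table */
	printf("%lu groups: %llu overhead blocks out of %llu\n",
	       ngroups, overhead, total_blocks);
	return 0;
}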
2511
2512/* Helper function for writing quotas on sync - we need to start a transaction before the quota file
2513 * is locked for write. Otherwise there are possible deadlocks:
2514 * Process 1 Process 2
2515 * ext4_create() quota_sync()
2516 * jbd2_journal_start() write_dquot()
2517 * DQUOT_INIT() down(dqio_mutex)
2518 * down(dqio_mutex) jbd2_journal_start()
2519 *
2520 */
2521
2522#ifdef CONFIG_QUOTA
2523
2524static inline struct inode *dquot_to_inode(struct dquot *dquot)
2525{
2526 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
2527}
2528
2529static int ext4_dquot_initialize(struct inode *inode, int type)
2530{
2531 handle_t *handle;
2532 int ret, err;
2533
2534 /* We may create quota structure so we need to reserve enough blocks */
2535 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
2536 if (IS_ERR(handle))
2537 return PTR_ERR(handle);
2538 ret = dquot_initialize(inode, type);
2539 err = ext4_journal_stop(handle);
2540 if (!ret)
2541 ret = err;
2542 return ret;
2543}
2544
2545static int ext4_dquot_drop(struct inode *inode)
2546{
2547 handle_t *handle;
2548 int ret, err;
2549
2550 /* We may delete quota structure so we need to reserve enough blocks */
2551 handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
2552 if (IS_ERR(handle))
2553 return PTR_ERR(handle);
2554 ret = dquot_drop(inode);
2555 err = ext4_journal_stop(handle);
2556 if (!ret)
2557 ret = err;
2558 return ret;
2559}
2560
2561static int ext4_write_dquot(struct dquot *dquot)
2562{
2563 int ret, err;
2564 handle_t *handle;
2565 struct inode *inode;
2566
2567 inode = dquot_to_inode(dquot);
2568 handle = ext4_journal_start(inode,
2569 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2570 if (IS_ERR(handle))
2571 return PTR_ERR(handle);
2572 ret = dquot_commit(dquot);
2573 err = ext4_journal_stop(handle);
2574 if (!ret)
2575 ret = err;
2576 return ret;
2577}
2578
2579static int ext4_acquire_dquot(struct dquot *dquot)
2580{
2581 int ret, err;
2582 handle_t *handle;
2583
2584 handle = ext4_journal_start(dquot_to_inode(dquot),
2585 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2586 if (IS_ERR(handle))
2587 return PTR_ERR(handle);
2588 ret = dquot_acquire(dquot);
2589 err = ext4_journal_stop(handle);
2590 if (!ret)
2591 ret = err;
2592 return ret;
2593}
2594
2595static int ext4_release_dquot(struct dquot *dquot)
2596{
2597 int ret, err;
2598 handle_t *handle;
2599
2600 handle = ext4_journal_start(dquot_to_inode(dquot),
2601 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2602 if (IS_ERR(handle))
2603 return PTR_ERR(handle);
2604 ret = dquot_release(dquot);
2605 err = ext4_journal_stop(handle);
2606 if (!ret)
2607 ret = err;
2608 return ret;
2609}
2610
2611static int ext4_mark_dquot_dirty(struct dquot *dquot)
2612{
2613 /* Are we journalling quotas? */
2614 if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2615 EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2616 dquot_mark_dquot_dirty(dquot);
2617 return ext4_write_dquot(dquot);
2618 } else {
2619 return dquot_mark_dquot_dirty(dquot);
2620 }
2621}
2622
2623static int ext4_write_info(struct super_block *sb, int type)
2624{
2625 int ret, err;
2626 handle_t *handle;
2627
2628 /* Data block + inode block */
2629 handle = ext4_journal_start(sb->s_root->d_inode, 2);
2630 if (IS_ERR(handle))
2631 return PTR_ERR(handle);
2632 ret = dquot_commit_info(sb, type);
2633 err = ext4_journal_stop(handle);
2634 if (!ret)
2635 ret = err;
2636 return ret;
2637}
2638
2639/*
2640 * Turn on quotas during mount time - we need to find
2641 * the quota file and such...
2642 */
2643static int ext4_quota_on_mount(struct super_block *sb, int type)
2644{
2645 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
2646 EXT4_SB(sb)->s_jquota_fmt, type);
2647}
2648
2649/*
2650 * Standard function to be called on quota_on
2651 */
2652static int ext4_quota_on(struct super_block *sb, int type, int format_id,
2653 char *path)
2654{
2655 int err;
2656 struct nameidata nd;
2657
2658 if (!test_opt(sb, QUOTA))
2659 return -EINVAL;
2660 /* Not journalling quota? */
2661 if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
2662 !EXT4_SB(sb)->s_qf_names[GRPQUOTA])
2663 return vfs_quota_on(sb, type, format_id, path);
2664 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
2665 if (err)
2666 return err;
2667 /* Quotafile not on the same filesystem? */
2668 if (nd.mnt->mnt_sb != sb) {
2669 path_release(&nd);
2670 return -EXDEV;
2671 }
2672 /* Quotafile not of fs root? */
2673 if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
2674 printk(KERN_WARNING
2675 "EXT4-fs: Quota file not on filesystem root. "
2676 "Journalled quota will not work.\n");
2677 path_release(&nd);
2678 return vfs_quota_on(sb, type, format_id, path);
2679}
2680
2681/* Read data from quotafile - avoid pagecache and such because we cannot afford
2682 * acquiring the locks... As quota files are never truncated and quota code
2683 * itself serializes the operations (and no one else should touch the files),
2684 * we don't have to be afraid of races */
2685static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
2686 size_t len, loff_t off)
2687{
2688 struct inode *inode = sb_dqopt(sb)->files[type];
2689 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2690 int err = 0;
2691 int offset = off & (sb->s_blocksize - 1);
2692 int tocopy;
2693 size_t toread;
2694 struct buffer_head *bh;
2695 loff_t i_size = i_size_read(inode);
2696
2697 if (off > i_size)
2698 return 0;
2699 if (off+len > i_size)
2700 len = i_size-off;
2701 toread = len;
2702 while (toread > 0) {
2703 tocopy = sb->s_blocksize - offset < toread ?
2704 sb->s_blocksize - offset : toread;
2705 bh = ext4_bread(NULL, inode, blk, 0, &err);
2706 if (err)
2707 return err;
2708 if (!bh) /* A hole? */
2709 memset(data, 0, tocopy);
2710 else
2711 memcpy(data, bh->b_data+offset, tocopy);
2712 brelse(bh);
2713 offset = 0;
2714 toread -= tocopy;
2715 data += tocopy;
2716 blk++;
2717 }
2718 return len;
2719}
2720
2721/* Write to quotafile (we know the transaction is already started and has
2722 * enough credits) */
2723static ssize_t ext4_quota_write(struct super_block *sb, int type,
2724 const char *data, size_t len, loff_t off)
2725{
2726 struct inode *inode = sb_dqopt(sb)->files[type];
2727 sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
2728 int err = 0;
2729 int offset = off & (sb->s_blocksize - 1);
2730 int tocopy;
2731 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
2732 size_t towrite = len;
2733 struct buffer_head *bh;
2734 handle_t *handle = journal_current_handle();
2735
2736 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2737 while (towrite > 0) {
2738 tocopy = sb->s_blocksize - offset < towrite ?
2739 sb->s_blocksize - offset : towrite;
2740 bh = ext4_bread(handle, inode, blk, 1, &err);
2741 if (!bh)
2742 goto out;
2743 if (journal_quota) {
2744 err = ext4_journal_get_write_access(handle, bh);
2745 if (err) {
2746 brelse(bh);
2747 goto out;
2748 }
2749 }
2750 lock_buffer(bh);
2751 memcpy(bh->b_data+offset, data, tocopy);
2752 flush_dcache_page(bh->b_page);
2753 unlock_buffer(bh);
2754 if (journal_quota)
2755 err = ext4_journal_dirty_metadata(handle, bh);
2756 else {
2757 /* Always do at least ordered writes for quotas */
2758 err = ext4_journal_dirty_data(handle, bh);
2759 mark_buffer_dirty(bh);
2760 }
2761 brelse(bh);
2762 if (err)
2763 goto out;
2764 offset = 0;
2765 towrite -= tocopy;
2766 data += tocopy;
2767 blk++;
2768 }
2769out:
2770 if (len == towrite)
2771 return err;
2772 if (inode->i_size < off+len-towrite) {
2773 i_size_write(inode, off+len-towrite);
2774 EXT4_I(inode)->i_disksize = inode->i_size;
2775 }
2776 inode->i_version++;
2777 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2778 ext4_mark_inode_dirty(handle, inode);
2779 mutex_unlock(&inode->i_mutex);
2780 return len - towrite;
2781}
2782
2783#endif
2784
2785static int ext4_get_sb(struct file_system_type *fs_type,
2786 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2787{
2788 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
2789}
2790
2791static struct file_system_type ext4dev_fs_type = {
2792 .owner = THIS_MODULE,
2793 .name = "ext4dev",
2794 .get_sb = ext4_get_sb,
2795 .kill_sb = kill_block_super,
2796 .fs_flags = FS_REQUIRES_DEV,
2797};
2798
2799static int __init init_ext4_fs(void)
2800{
2801 int err = init_ext4_xattr();
2802 if (err)
2803 return err;
2804 err = init_inodecache();
2805 if (err)
2806 goto out1;
2807 err = register_filesystem(&ext4dev_fs_type);
2808 if (err)
2809 goto out;
2810 return 0;
2811out:
2812 destroy_inodecache();
2813out1:
2814 exit_ext4_xattr();
2815 return err;
2816}
2817
2818static void __exit exit_ext4_fs(void)
2819{
2820 unregister_filesystem(&ext4dev_fs_type);
2821 destroy_inodecache();
2822 exit_ext4_xattr();
2823}
2824
2825MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
2826MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
2827MODULE_LICENSE("GPL");
2828module_init(init_ext4_fs)
2829module_exit(exit_ext4_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 000000000000..fcf527286d75
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,54 @@
1/*
2 * linux/fs/ext4/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext4 symlink handling code
18 */
19
20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/ext4_fs.h>
23#include <linux/namei.h>
24#include "xattr.h"
25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data);
30 return NULL;
31}
32
33struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR
38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr,
41 .removexattr = generic_removexattr,
42#endif
43};
44
45struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR
49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr,
52 .removexattr = generic_removexattr,
53#endif
54};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 000000000000..63233cd946a7
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1317 @@
1/*
2 * linux/fs/ext4/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * The attributes in inodes and on blocks have a different header; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
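A simplified userspace sketch of walking the entry table laid out as described above. The struct and padding rule here are reduced stand-ins for the real ext4_xattr_entry and EXT4_XATTR_NEXT() in fs/ext4/xattr.h, shown only to illustrate the descriptors-down, values-up layout:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Reduced stand-in for struct ext4_xattr_entry (see fs/ext4/xattr.h). */
struct xentry {
	uint8_t  e_name_len;
	uint8_t  e_name_index;
	uint16_t e_value_offs;		/* value offset from the block start */
	uint32_t e_value_block;
	uint32_t e_value_size;
	uint32_t e_hash;
	char     e_name[];
};

/* Entries are padded to 4 bytes; the list ends at four zero bytes. */
#define XENTRY_NEXT(e) \
	((struct xentry *)((char *)(e) + \
	 ((sizeof(struct xentry) + (e)->e_name_len + 3) & ~3UL)))
#define XENTRY_IS_LAST(e)	(*(uint32_t *)(e) == 0)

static void list_names(void *block, size_t header_size)
{
	struct xentry *e = (struct xentry *)((char *)block + header_size);

	for (; !XENTRY_IS_LAST(e); e = XENTRY_NEXT(e))
		printf("%.*s (value %u bytes at offset %u)\n",
		       e->e_name_len, e->e_name,
		       e->e_value_size, (unsigned)e->e_value_offs);
}

int main(void)
{
	uint32_t block[16] = { 0 };	/* one 64-byte toy "block", zeroed */
	struct xentry *e = (struct xentry *)block;

	e->e_name_index = 1;		/* EXT4_XATTR_INDEX_USER */
	e->e_name_len = 4;
	e->e_value_offs = 48;		/* value stored near the end of the block */
	e->e_value_size = 5;
	memcpy(e->e_name, "test", 4);
	memcpy((char *)block + 48, "hello", 5);
	/* The padded entry ends at byte 20; bytes 20-23 stay zero: end marker. */
	list_names(block, 0);
	return 0;
}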
52
53#include <linux/init.h>
54#include <linux/fs.h>
55#include <linux/slab.h>
56#include <linux/ext4_jbd2.h>
57#include <linux/ext4_fs.h>
58#include <linux/mbcache.h>
59#include <linux/quotaops.h>
60#include <linux/rwsem.h>
61#include "xattr.h"
62#include "acl.h"
63
64#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
69#define IHDR(inode, raw_inode) \
70 ((struct ext4_xattr_ibody_header *) \
71 ((void *)raw_inode + \
72 EXT4_GOOD_OLD_INODE_SIZE + \
73 EXT4_I(inode)->i_extra_isize))
74#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
75
76#ifdef EXT4_XATTR_DEBUG
77# define ea_idebug(inode, f...) do { \
78 printk(KERN_DEBUG "inode %s:%lu: ", \
79 inode->i_sb->s_id, inode->i_ino); \
80 printk(f); \
81 printk("\n"); \
82 } while (0)
83# define ea_bdebug(bh, f...) do { \
84 char b[BDEVNAME_SIZE]; \
85 printk(KERN_DEBUG "block %s:%lu: ", \
86 bdevname(bh->b_bdev, b), \
87 (unsigned long) bh->b_blocknr); \
88 printk(f); \
89 printk("\n"); \
90 } while (0)
91#else
92# define ea_idebug(f...)
93# define ea_bdebug(f...)
94#endif
95
96static void ext4_xattr_cache_insert(struct buffer_head *);
97static struct buffer_head *ext4_xattr_cache_find(struct inode *,
98 struct ext4_xattr_header *,
99 struct mb_cache_entry **);
100static void ext4_xattr_rehash(struct ext4_xattr_header *,
101 struct ext4_xattr_entry *);
102
103static struct mb_cache *ext4_xattr_cache;
104
105static struct xattr_handler *ext4_xattr_handler_map[] = {
106 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
107#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
108 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
109 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
110#endif
111 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
112#ifdef CONFIG_EXT4DEV_FS_SECURITY
113 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
114#endif
115};
116
117struct xattr_handler *ext4_xattr_handlers[] = {
118 &ext4_xattr_user_handler,
119 &ext4_xattr_trusted_handler,
120#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
121 &ext4_xattr_acl_access_handler,
122 &ext4_xattr_acl_default_handler,
123#endif
124#ifdef CONFIG_EXT4DEV_FS_SECURITY
125 &ext4_xattr_security_handler,
126#endif
127 NULL
128};
129
130static inline struct xattr_handler *
131ext4_xattr_handler(int name_index)
132{
133 struct xattr_handler *handler = NULL;
134
135 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
136 handler = ext4_xattr_handler_map[name_index];
137 return handler;
138}
139
140/*
141 * Inode operation listxattr()
142 *
143 * dentry->d_inode->i_mutex: don't care
144 */
145ssize_t
146ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 return ext4_xattr_list(dentry->d_inode, buffer, size);
149}
150
151static int
152ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
153{
154 while (!IS_LAST_ENTRY(entry)) {
155 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
156 if ((void *)next >= end)
157 return -EIO;
158 entry = next;
159 }
160 return 0;
161}
162
163static inline int
164ext4_xattr_check_block(struct buffer_head *bh)
165{
166 int error;
167
168 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
169 BHDR(bh)->h_blocks != cpu_to_le32(1))
170 return -EIO;
171 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
172 return error;
173}
174
175static inline int
176ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
177{
178 size_t value_size = le32_to_cpu(entry->e_value_size);
179
180 if (entry->e_value_block != 0 || value_size > size ||
181 le16_to_cpu(entry->e_value_offs) + value_size > size)
182 return -EIO;
183 return 0;
184}
185
186static int
187ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
188 const char *name, size_t size, int sorted)
189{
190 struct ext4_xattr_entry *entry;
191 size_t name_len;
192 int cmp = 1;
193
194 if (name == NULL)
195 return -EINVAL;
196 name_len = strlen(name);
197 entry = *pentry;
198 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
199 cmp = name_index - entry->e_name_index;
200 if (!cmp)
201 cmp = name_len - entry->e_name_len;
202 if (!cmp)
203 cmp = memcmp(name, entry->e_name, name_len);
204 if (cmp <= 0 && (sorted || cmp == 0))
205 break;
206 }
207 *pentry = entry;
208 if (!cmp && ext4_xattr_check_entry(entry, size))
209 return -EIO;
210 return cmp ? -ENODATA : 0;
211}
212
213static int
214ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
215 void *buffer, size_t buffer_size)
216{
217 struct buffer_head *bh = NULL;
218 struct ext4_xattr_entry *entry;
219 size_t size;
220 int error;
221
222 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
223 name_index, name, buffer, (long)buffer_size);
224
225 error = -ENODATA;
226 if (!EXT4_I(inode)->i_file_acl)
227 goto cleanup;
228 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
230 if (!bh)
231 goto cleanup;
232 ea_bdebug(bh, "b_count=%d, refcount=%d",
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext4_xattr_check_block(bh)) {
235bad_block: ext4_error(inode->i_sb, __FUNCTION__,
236 "inode %lu: bad block %llu", inode->i_ino,
237 EXT4_I(inode)->i_file_acl);
238 error = -EIO;
239 goto cleanup;
240 }
241 ext4_xattr_cache_insert(bh);
242 entry = BFIRST(bh);
243 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
244 if (error == -EIO)
245 goto bad_block;
246 if (error)
247 goto cleanup;
248 size = le32_to_cpu(entry->e_value_size);
249 if (buffer) {
250 error = -ERANGE;
251 if (size > buffer_size)
252 goto cleanup;
253 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
254 size);
255 }
256 error = size;
257
258cleanup:
259 brelse(bh);
260 return error;
261}
262
263static int
264ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
265 void *buffer, size_t buffer_size)
266{
267 struct ext4_xattr_ibody_header *header;
268 struct ext4_xattr_entry *entry;
269 struct ext4_inode *raw_inode;
270 struct ext4_iloc iloc;
271 size_t size;
272 void *end;
273 int error;
274
275 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
276 return -ENODATA;
277 error = ext4_get_inode_loc(inode, &iloc);
278 if (error)
279 return error;
280 raw_inode = ext4_raw_inode(&iloc);
281 header = IHDR(inode, raw_inode);
282 entry = IFIRST(header);
283 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
284 error = ext4_xattr_check_names(entry, end);
285 if (error)
286 goto cleanup;
287 error = ext4_xattr_find_entry(&entry, name_index, name,
288 end - (void *)entry, 0);
289 if (error)
290 goto cleanup;
291 size = le32_to_cpu(entry->e_value_size);
292 if (buffer) {
293 error = -ERANGE;
294 if (size > buffer_size)
295 goto cleanup;
296 memcpy(buffer, (void *)IFIRST(header) +
297 le16_to_cpu(entry->e_value_offs), size);
298 }
299 error = size;
300
301cleanup:
302 brelse(iloc.bh);
303 return error;
304}
305
306/*
307 * ext4_xattr_get()
308 *
309 * Copy an extended attribute into the buffer
310 * provided, or compute the buffer size required.
311 * Buffer is NULL to compute the size of the buffer required.
312 *
313 * Returns a negative error number on failure, or the number of bytes
314 * used / required on success.
315 */
316int
317ext4_xattr_get(struct inode *inode, int name_index, const char *name,
318 void *buffer, size_t buffer_size)
319{
320 int error;
321
322 down_read(&EXT4_I(inode)->xattr_sem);
323 error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
324 buffer_size);
325 if (error == -ENODATA)
326 error = ext4_xattr_block_get(inode, name_index, name, buffer,
327 buffer_size);
328 up_read(&EXT4_I(inode)->xattr_sem);
329 return error;
330}
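A sketch of the size-then-copy convention this implies, written as a hypothetical in-kernel caller (the attribute name and the helper function are illustrative, not part of this patch):

static int example_read_user_xattr(struct inode *inode)
{
	char *buf;
	int len;

	/* First call with a NULL buffer to learn the value size. */
	len = ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, "comment", NULL, 0);
	if (len < 0)
		return len;		/* e.g. -ENODATA if the attribute is absent */

	buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* Second call copies the value; -ERANGE if it no longer fits. */
	len = ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, "comment", buf, len);
	kfree(buf);
	return len < 0 ? len : 0;
}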
331
332static int
333ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
334 char *buffer, size_t buffer_size)
335{
336 size_t rest = buffer_size;
337
338 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
339 struct xattr_handler *handler =
340 ext4_xattr_handler(entry->e_name_index);
341
342 if (handler) {
343 size_t size = handler->list(inode, buffer, rest,
344 entry->e_name,
345 entry->e_name_len);
346 if (buffer) {
347 if (size > rest)
348 return -ERANGE;
349 buffer += size;
350 }
351 rest -= size;
352 }
353 }
354 return buffer_size - rest;
355}
356
357static int
358ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
359{
360 struct buffer_head *bh = NULL;
361 int error;
362
363 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
364 buffer, (long)buffer_size);
365
366 error = 0;
367 if (!EXT4_I(inode)->i_file_acl)
368 goto cleanup;
369 ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
371 error = -EIO;
372 if (!bh)
373 goto cleanup;
374 ea_bdebug(bh, "b_count=%d, refcount=%d",
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext4_xattr_check_block(bh)) {
377 ext4_error(inode->i_sb, __FUNCTION__,
378 "inode %lu: bad block %llu", inode->i_ino,
379 EXT4_I(inode)->i_file_acl);
380 error = -EIO;
381 goto cleanup;
382 }
383 ext4_xattr_cache_insert(bh);
384 error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
385
386cleanup:
387 brelse(bh);
388
389 return error;
390}
391
392static int
393ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
394{
395 struct ext4_xattr_ibody_header *header;
396 struct ext4_inode *raw_inode;
397 struct ext4_iloc iloc;
398 void *end;
399 int error;
400
401 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
402 return 0;
403 error = ext4_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext4_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
409 error = ext4_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext4_xattr_list_entries(inode, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext4_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer
424 * provided, or compute the buffer size required.
425 * Buffer is NULL to compute the size of the buffer required.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430int
431ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
435 down_read(&EXT4_I(inode)->xattr_sem);
436 i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT4_I(inode)->xattr_sem);
449 return i_error + b_error;
450}
451
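ext4_xattr_list() follows the same convention and fills the buffer with prefixed attribute names packed back to back, each terminated by a NUL byte. A minimal sketch of walking the result (the function name is illustrative only; kernel context assumed):

static void example_print_xattr_names(struct inode *inode)
{
	char *names, *p;
	int len;

	len = ext4_xattr_list(inode, NULL, 0);		/* sizing pass */
	if (len <= 0)
		return;
	names = kmalloc(len, GFP_KERNEL);
	if (!names)
		return;
	len = ext4_xattr_list(inode, names, len);	/* copy pass */
	for (p = names; len > 0 && p < names + len; p += strlen(p) + 1)
		printk(KERN_DEBUG "xattr name: %s\n", p);
	kfree(names);
}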
452/*
453 * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext4_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 lock_super(sb);
463 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
464 EXT4_SB(sb)->s_es->s_feature_compat |=
465 cpu_to_le32(EXT4_FEATURE_COMPAT_EXT_ATTR);
466 sb->s_dirt = 1;
467 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
468 }
469 unlock_super(sb);
470}
471
472/*
473 * Release the xattr block BH: If the reference count is > 1, decrement
474 * it; otherwise free the block.
475 */
476static void
477ext4_xattr_release_block(handle_t *handle, struct inode *inode,
478 struct buffer_head *bh)
479{
480 struct mb_cache_entry *ce = NULL;
481
482 ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
483 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
484 ea_bdebug(bh, "refcount now=0; freeing");
485 if (ce)
486 mb_cache_entry_free(ce);
487 ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
488 get_bh(bh);
489 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
490 } else {
491 if (ext4_journal_get_write_access(handle, bh) == 0) {
492 lock_buffer(bh);
493 BHDR(bh)->h_refcount = cpu_to_le32(
494 le32_to_cpu(BHDR(bh)->h_refcount) - 1);
495 ext4_journal_dirty_metadata(handle, bh);
496 if (IS_SYNC(inode))
497 handle->h_sync = 1;
498 DQUOT_FREE_BLOCK(inode, 1);
499 unlock_buffer(bh);
500 ea_bdebug(bh, "refcount now=%d; releasing",
501 le32_to_cpu(BHDR(bh)->h_refcount));
502 }
503 if (ce)
504 mb_cache_entry_release(ce);
505 }
506}
507
508struct ext4_xattr_info {
509 int name_index;
510 const char *name;
511 const void *value;
512 size_t value_len;
513};
514
515struct ext4_xattr_search {
516 struct ext4_xattr_entry *first;
517 void *base;
518 void *end;
519 struct ext4_xattr_entry *here;
520 int not_found;
521};
522
523static int
524ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
525{
526 struct ext4_xattr_entry *last;
527 size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
528
529 /* Compute min_offs and last. */
530 last = s->first;
531 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
532 if (!last->e_value_block && last->e_value_size) {
533 size_t offs = le16_to_cpu(last->e_value_offs);
534 if (offs < min_offs)
535 min_offs = offs;
536 }
537 }
538 free = min_offs - ((void *)last - s->base) - sizeof(__u32);
539 if (!s->not_found) {
540 if (!s->here->e_value_block && s->here->e_value_size) {
541 size_t size = le32_to_cpu(s->here->e_value_size);
542 free += EXT4_XATTR_SIZE(size);
543 }
544 free += EXT4_XATTR_LEN(name_len);
545 }
546 if (i->value) {
547 if (free < EXT4_XATTR_SIZE(i->value_len) ||
548 free < EXT4_XATTR_LEN(name_len) +
549 EXT4_XATTR_SIZE(i->value_len))
550 return -ENOSPC;
551 }
552
553 if (i->value && s->not_found) {
554 /* Insert the new name. */
555 size_t size = EXT4_XATTR_LEN(name_len);
556 size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
557 memmove((void *)s->here + size, s->here, rest);
558 memset(s->here, 0, size);
559 s->here->e_name_index = i->name_index;
560 s->here->e_name_len = name_len;
561 memcpy(s->here->e_name, i->name, name_len);
562 } else {
563 if (!s->here->e_value_block && s->here->e_value_size) {
564 void *first_val = s->base + min_offs;
565 size_t offs = le16_to_cpu(s->here->e_value_offs);
566 void *val = s->base + offs;
567 size_t size = EXT4_XATTR_SIZE(
568 le32_to_cpu(s->here->e_value_size));
569
570 if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
571 /* The old and the new value have the same
572 size. Just replace. */
573 s->here->e_value_size =
574 cpu_to_le32(i->value_len);
575 memset(val + size - EXT4_XATTR_PAD, 0,
576 EXT4_XATTR_PAD); /* Clear pad bytes. */
577 memcpy(val, i->value, i->value_len);
578 return 0;
579 }
580
581 /* Remove the old value. */
582 memmove(first_val + size, first_val, val - first_val);
583 memset(first_val, 0, size);
584 s->here->e_value_size = 0;
585 s->here->e_value_offs = 0;
586 min_offs += size;
587
588 /* Adjust all value offsets. */
589 last = s->first;
590 while (!IS_LAST_ENTRY(last)) {
591 size_t o = le16_to_cpu(last->e_value_offs);
592 if (!last->e_value_block &&
593 last->e_value_size && o < offs)
594 last->e_value_offs =
595 cpu_to_le16(o + size);
596 last = EXT4_XATTR_NEXT(last);
597 }
598 }
599 if (!i->value) {
600 /* Remove the old name. */
601 size_t size = EXT4_XATTR_LEN(name_len);
602 last = ENTRY((void *)last - size);
603 memmove(s->here, (void *)s->here + size,
604 (void *)last - (void *)s->here + sizeof(__u32));
605 memset(last, 0, size);
606 }
607 }
608
609 if (i->value) {
610 /* Insert the new value. */
611 s->here->e_value_size = cpu_to_le32(i->value_len);
612 if (i->value_len) {
613 size_t size = EXT4_XATTR_SIZE(i->value_len);
614 void *val = s->base + min_offs - size;
615 s->here->e_value_offs = cpu_to_le16(min_offs - size);
616 memset(val + size - EXT4_XATTR_PAD, 0,
617 EXT4_XATTR_PAD); /* Clear the pad bytes. */
618 memcpy(val, i->value, i->value_len);
619 }
620 }
621 return 0;
622}
623
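To put numbers on the free-space check above, assuming a 4 KiB attribute block: s->end - s->base is 4096, the 32-byte ext4_xattr_header leaves last at offset 32 when the block is empty, so free = 4096 - 32 - 4 = 4060 bytes (the final 4 bytes are the zero terminator after the last entry). Storing a 3-byte name with a 10-byte value then needs EXT4_XATTR_LEN(3) = 20 bytes of entry space just after the header plus EXT4_XATTR_SIZE(10) = 12 bytes of value space taken from the end of the block, comfortably below the 4060-byte limit; the -ENOSPC test rejects anything that does not fit both ways.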
624struct ext4_xattr_block_find {
625 struct ext4_xattr_search s;
626 struct buffer_head *bh;
627};
628
629static int
630ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
631 struct ext4_xattr_block_find *bs)
632{
633 struct super_block *sb = inode->i_sb;
634 int error;
635
636 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
637 i->name_index, i->name, i->value, (long)i->value_len);
638
639 if (EXT4_I(inode)->i_file_acl) {
640 /* The inode already has an extended attribute block. */
641 bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
642 error = -EIO;
643 if (!bs->bh)
644 goto cleanup;
645 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
646 atomic_read(&(bs->bh->b_count)),
647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext4_xattr_check_block(bs->bh)) {
649 ext4_error(sb, __FUNCTION__,
650 "inode %lu: bad block %llu", inode->i_ino,
651 EXT4_I(inode)->i_file_acl);
652 error = -EIO;
653 goto cleanup;
654 }
655 /* Find the named attribute. */
656 bs->s.base = BHDR(bs->bh);
657 bs->s.first = BFIRST(bs->bh);
658 bs->s.end = bs->bh->b_data + bs->bh->b_size;
659 bs->s.here = bs->s.first;
660 error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
661 i->name, bs->bh->b_size, 1);
662 if (error && error != -ENODATA)
663 goto cleanup;
664 bs->s.not_found = error;
665 }
666 error = 0;
667
668cleanup:
669 return error;
670}
671
672static int
673ext4_xattr_block_set(handle_t *handle, struct inode *inode,
674 struct ext4_xattr_info *i,
675 struct ext4_xattr_block_find *bs)
676{
677 struct super_block *sb = inode->i_sb;
678 struct buffer_head *new_bh = NULL;
679 struct ext4_xattr_search *s = &bs->s;
680 struct mb_cache_entry *ce = NULL;
681 int error;
682
683#define header(x) ((struct ext4_xattr_header *)(x))
684
685 if (i->value && i->value_len > sb->s_blocksize)
686 return -ENOSPC;
687 if (s->base) {
688 ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
689 bs->bh->b_blocknr);
690 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
691 if (ce) {
692 mb_cache_entry_free(ce);
693 ce = NULL;
694 }
695 ea_bdebug(bs->bh, "modifying in-place");
696 error = ext4_journal_get_write_access(handle, bs->bh);
697 if (error)
698 goto cleanup;
699 lock_buffer(bs->bh);
700 error = ext4_xattr_set_entry(i, s);
701 if (!error) {
702 if (!IS_LAST_ENTRY(s->first))
703 ext4_xattr_rehash(header(s->base),
704 s->here);
705 ext4_xattr_cache_insert(bs->bh);
706 }
707 unlock_buffer(bs->bh);
708 if (error == -EIO)
709 goto bad_block;
710 if (!error)
711 error = ext4_journal_dirty_metadata(handle,
712 bs->bh);
713 if (error)
714 goto cleanup;
715 goto inserted;
716 } else {
717 int offset = (char *)s->here - bs->bh->b_data;
718
719 if (ce) {
720 mb_cache_entry_release(ce);
721 ce = NULL;
722 }
723 ea_bdebug(bs->bh, "cloning");
724 s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
725 error = -ENOMEM;
726 if (s->base == NULL)
727 goto cleanup;
728 memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
729 s->first = ENTRY(header(s->base)+1);
730 header(s->base)->h_refcount = cpu_to_le32(1);
731 s->here = ENTRY(s->base + offset);
732 s->end = s->base + bs->bh->b_size;
733 }
734 } else {
735 /* Allocate a buffer where we construct the new block. */
736 s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
737 /* assert(header == s->base) */
738 error = -ENOMEM;
739 if (s->base == NULL)
740 goto cleanup;
741 memset(s->base, 0, sb->s_blocksize);
742 header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
743 header(s->base)->h_blocks = cpu_to_le32(1);
744 header(s->base)->h_refcount = cpu_to_le32(1);
745 s->first = ENTRY(header(s->base)+1);
746 s->here = ENTRY(header(s->base)+1);
747 s->end = s->base + sb->s_blocksize;
748 }
749
750 error = ext4_xattr_set_entry(i, s);
751 if (error == -EIO)
752 goto bad_block;
753 if (error)
754 goto cleanup;
755 if (!IS_LAST_ENTRY(s->first))
756 ext4_xattr_rehash(header(s->base), s->here);
757
758inserted:
759 if (!IS_LAST_ENTRY(s->first)) {
760 new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
761 if (new_bh) {
762 /* We found an identical block in the cache. */
763 if (new_bh == bs->bh)
764 ea_bdebug(new_bh, "keeping");
765 else {
766 /* The old block is released after updating
767 the inode. */
768 error = -EDQUOT;
769 if (DQUOT_ALLOC_BLOCK(inode, 1))
770 goto cleanup;
771 error = ext4_journal_get_write_access(handle,
772 new_bh);
773 if (error)
774 goto cleanup_dquot;
775 lock_buffer(new_bh);
776 BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
777 le32_to_cpu(BHDR(new_bh)->h_refcount));
778 ea_bdebug(new_bh, "reusing; refcount now=%d",
779 le32_to_cpu(BHDR(new_bh)->h_refcount));
780 unlock_buffer(new_bh);
781 error = ext4_journal_dirty_metadata(handle,
782 new_bh);
783 if (error)
784 goto cleanup_dquot;
785 }
786 mb_cache_entry_release(ce);
787 ce = NULL;
788 } else if (bs->bh && s->base == bs->bh->b_data) {
789 /* We were modifying this block in-place. */
790 ea_bdebug(bs->bh, "keeping this block");
791 new_bh = bs->bh;
792 get_bh(new_bh);
793 } else {
794 /* We need to allocate a new block */
795 ext4_fsblk_t goal = le32_to_cpu(
796 EXT4_SB(sb)->s_es->s_first_data_block) +
797 (ext4_fsblk_t)EXT4_I(inode)->i_block_group *
798 EXT4_BLOCKS_PER_GROUP(sb);
799 ext4_fsblk_t block = ext4_new_block(handle, inode,
800 goal, &error);
801 if (error)
802 goto cleanup;
803 ea_idebug(inode, "creating block %llu", (unsigned long long)block);

804
805 new_bh = sb_getblk(sb, block);
806 if (!new_bh) {
807getblk_failed:
808 ext4_free_blocks(handle, inode, block, 1);
809 error = -EIO;
810 goto cleanup;
811 }
812 lock_buffer(new_bh);
813 error = ext4_journal_get_create_access(handle, new_bh);
814 if (error) {
815 unlock_buffer(new_bh);
816 goto getblk_failed;
817 }
818 memcpy(new_bh->b_data, s->base, new_bh->b_size);
819 set_buffer_uptodate(new_bh);
820 unlock_buffer(new_bh);
821 ext4_xattr_cache_insert(new_bh);
822 error = ext4_journal_dirty_metadata(handle, new_bh);
823 if (error)
824 goto cleanup;
825 }
826 }
827
828 /* Update the inode. */
829 EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
830
831 /* Drop the previous xattr block. */
832 if (bs->bh && bs->bh != new_bh)
833 ext4_xattr_release_block(handle, inode, bs->bh);
834 error = 0;
835
836cleanup:
837 if (ce)
838 mb_cache_entry_release(ce);
839 brelse(new_bh);
840 if (!(bs->bh && s->base == bs->bh->b_data))
841 kfree(s->base);
842
843 return error;
844
845cleanup_dquot:
846 DQUOT_FREE_BLOCK(inode, 1);
847 goto cleanup;
848
849bad_block:
850 ext4_error(inode->i_sb, __FUNCTION__,
851 "inode %lu: bad block %llu", inode->i_ino,
852 EXT4_I(inode)->i_file_acl);
853 goto cleanup;
854
855#undef header
856}
857
858struct ext4_xattr_ibody_find {
859 struct ext4_xattr_search s;
860 struct ext4_iloc iloc;
861};
862
863static int
864ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
865 struct ext4_xattr_ibody_find *is)
866{
867 struct ext4_xattr_ibody_header *header;
868 struct ext4_inode *raw_inode;
869 int error;
870
871 if (EXT4_I(inode)->i_extra_isize == 0)
872 return 0;
873 raw_inode = ext4_raw_inode(&is->iloc);
874 header = IHDR(inode, raw_inode);
875 is->s.base = is->s.first = IFIRST(header);
876 is->s.here = is->s.first;
877 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
878 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
879 error = ext4_xattr_check_names(IFIRST(header), is->s.end);
880 if (error)
881 return error;
882 /* Find the named attribute. */
883 error = ext4_xattr_find_entry(&is->s.here, i->name_index,
884 i->name, is->s.end -
885 (void *)is->s.base, 0);
886 if (error && error != -ENODATA)
887 return error;
888 is->s.not_found = error;
889 }
890 return 0;
891}
892
893static int
894ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
895 struct ext4_xattr_info *i,
896 struct ext4_xattr_ibody_find *is)
897{
898 struct ext4_xattr_ibody_header *header;
899 struct ext4_xattr_search *s = &is->s;
900 int error;
901
902 if (EXT4_I(inode)->i_extra_isize == 0)
903 return -ENOSPC;
904 error = ext4_xattr_set_entry(i, s);
905 if (error)
906 return error;
907 header = IHDR(inode, ext4_raw_inode(&is->iloc));
908 if (!IS_LAST_ENTRY(s->first)) {
909 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
910 EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
911 } else {
912 header->h_magic = cpu_to_le32(0);
913 EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
914 }
915 return 0;
916}
917
918/*
919 * ext4_xattr_set_handle()
920 *
921 * Create, replace or remove an extended attribute for this inode. Value
922 * is NULL to remove an existing extended attribute, and non-NULL to
923 * either replace an existing extended attribute, or create a new extended
924 * attribute. The flags XATTR_REPLACE and XATTR_CREATE require,
925 * respectively, that the extended attribute must already exist or must
926 * not yet exist before the call.
927 *
928 * Returns 0, or a negative error number on failure.
929 */
930int
931ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
932 const char *name, const void *value, size_t value_len,
933 int flags)
934{
935 struct ext4_xattr_info i = {
936 .name_index = name_index,
937 .name = name,
938 .value = value,
939 .value_len = value_len,
940
941 };
942 struct ext4_xattr_ibody_find is = {
943 .s = { .not_found = -ENODATA, },
944 };
945 struct ext4_xattr_block_find bs = {
946 .s = { .not_found = -ENODATA, },
947 };
948 int error;
949
950 if (!name)
951 return -EINVAL;
952 if (strlen(name) > 255)
953 return -ERANGE;
954 down_write(&EXT4_I(inode)->xattr_sem);
955 error = ext4_get_inode_loc(inode, &is.iloc);
956 if (error)
957 goto cleanup;
958
959 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
960 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
961 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
962 EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
963 }
964
965 error = ext4_xattr_ibody_find(inode, &i, &is);
966 if (error)
967 goto cleanup;
968 if (is.s.not_found)
969 error = ext4_xattr_block_find(inode, &i, &bs);
970 if (error)
971 goto cleanup;
972 if (is.s.not_found && bs.s.not_found) {
973 error = -ENODATA;
974 if (flags & XATTR_REPLACE)
975 goto cleanup;
976 error = 0;
977 if (!value)
978 goto cleanup;
979 } else {
980 error = -EEXIST;
981 if (flags & XATTR_CREATE)
982 goto cleanup;
983 }
984 error = ext4_journal_get_write_access(handle, is.iloc.bh);
985 if (error)
986 goto cleanup;
987 if (!value) {
988 if (!is.s.not_found)
989 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
990 else if (!bs.s.not_found)
991 error = ext4_xattr_block_set(handle, inode, &i, &bs);
992 } else {
993 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
994 if (!error && !bs.s.not_found) {
995 i.value = NULL;
996 error = ext4_xattr_block_set(handle, inode, &i, &bs);
997 } else if (error == -ENOSPC) {
998 error = ext4_xattr_block_set(handle, inode, &i, &bs);
999 if (error)
1000 goto cleanup;
1001 if (!is.s.not_found) {
1002 i.value = NULL;
1003 error = ext4_xattr_ibody_set(handle, inode, &i,
1004 &is);
1005 }
1006 }
1007 }
1008 if (!error) {
1009 ext4_xattr_update_super_block(handle, inode->i_sb);
1010 inode->i_ctime = CURRENT_TIME_SEC;
1011 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
1012 /*
1013 * The bh is consumed by ext4_mark_iloc_dirty, even with
1014 * error != 0.
1015 */
1016 is.iloc.bh = NULL;
1017 if (IS_SYNC(inode))
1018 handle->h_sync = 1;
1019 }
1020
1021cleanup:
1022 brelse(is.iloc.bh);
1023 brelse(bs.bh);
1024 up_write(&EXT4_I(inode)->xattr_sem);
1025 return error;
1026}
1027
1028/*
1029 * ext4_xattr_set()
1030 *
1031 * Like ext4_xattr_set_handle, but start from an inode. This extended
1032 * attribute modification is a filesystem transaction by itself.
1033 *
1034 * Returns 0, or a negative error number on failure.
1035 */
1036int
1037ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1038 const void *value, size_t value_len, int flags)
1039{
1040 handle_t *handle;
1041 int error, retries = 0;
1042
1043retry:
1044 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
1045 if (IS_ERR(handle)) {
1046 error = PTR_ERR(handle);
1047 } else {
1048 int error2;
1049
1050 error = ext4_xattr_set_handle(handle, inode, name_index, name,
1051 value, value_len, flags);
1052 error2 = ext4_journal_stop(handle);
1053 if (error == -ENOSPC &&
1054 ext4_should_retry_alloc(inode->i_sb, &retries))
1055 goto retry;
1056 if (error == 0)
1057 error = error2;
1058 }
1059
1060 return error;
1061}
1062
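The XATTR_CREATE / XATTR_REPLACE and NULL-value semantics described for ext4_xattr_set_handle() carry through this wrapper unchanged; each call below runs as its own transaction. A minimal usage sketch against a hypothetical "user.example" attribute (the name, values and helper are illustrative, not part of this patch; the "user." prefix is stripped by the handler, so only the remainder is passed here):

static int example_xattr_flags(struct inode *inode)
{
	int err;

	/* Create only: fails with -EEXIST if the attribute already exists. */
	err = ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "example",
			     "v1", 2, XATTR_CREATE);
	if (err)
		return err;

	/* Replace only: fails with -ENODATA if the attribute is missing. */
	err = ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "example",
			     "v2", 2, XATTR_REPLACE);
	if (err)
		return err;

	/* A NULL value removes the attribute. */
	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "example",
			      NULL, 0, 0);
}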
1063/*
1064 * ext4_xattr_delete_inode()
1065 *
1066 * Free extended attribute resources associated with this inode. This
1067 * is called immediately before an inode is freed. We have exclusive
1068 * access to the inode.
1069 */
1070void
1071ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1072{
1073 struct buffer_head *bh = NULL;
1074
1075 if (!EXT4_I(inode)->i_file_acl)
1076 goto cleanup;
1077 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1078 if (!bh) {
1079 ext4_error(inode->i_sb, __FUNCTION__,
1080 "inode %lu: block %llu read error", inode->i_ino,
1081 EXT4_I(inode)->i_file_acl);
1082 goto cleanup;
1083 }
1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1086 ext4_error(inode->i_sb, __FUNCTION__,
1087 "inode %lu: bad block %llu", inode->i_ino,
1088 EXT4_I(inode)->i_file_acl);
1089 goto cleanup;
1090 }
1091 ext4_xattr_release_block(handle, inode, bh);
1092 EXT4_I(inode)->i_file_acl = 0;
1093
1094cleanup:
1095 brelse(bh);
1096}
1097
1098/*
1099 * ext4_xattr_put_super()
1100 *
1101 * This is called when a file system is unmounted.
1102 */
1103void
1104ext4_xattr_put_super(struct super_block *sb)
1105{
1106 mb_cache_shrink(sb->s_bdev);
1107}
1108
1109/*
1110 * ext4_xattr_cache_insert()
1111 *
1112 * Create a new entry in the extended attribute cache, and insert
1113 * it unless such an entry is already in the cache.
1114 *
1115 * Insertion is best-effort; failures are only reported via ea_bdebug().
1116 */
1117static void
1118ext4_xattr_cache_insert(struct buffer_head *bh)
1119{
1120 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1121 struct mb_cache_entry *ce;
1122 int error;
1123
1124 ce = mb_cache_entry_alloc(ext4_xattr_cache);
1125 if (!ce) {
1126 ea_bdebug(bh, "out of memory");
1127 return;
1128 }
1129 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
1130 if (error) {
1131 mb_cache_entry_free(ce);
1132 if (error == -EBUSY) {
1133 ea_bdebug(bh, "already in cache");
1134 error = 0;
1135 }
1136 } else {
1137 ea_bdebug(bh, "inserting [%x]", (int)hash);
1138 mb_cache_entry_release(ce);
1139 }
1140}
1141
1142/*
1143 * ext4_xattr_cmp()
1144 *
1145 * Compare two extended attribute blocks for equality.
1146 *
1147 * Returns 0 if the blocks are equal, 1 if they differ, and
1148 * a negative error number on errors.
1149 */
1150static int
1151ext4_xattr_cmp(struct ext4_xattr_header *header1,
1152 struct ext4_xattr_header *header2)
1153{
1154 struct ext4_xattr_entry *entry1, *entry2;
1155
1156 entry1 = ENTRY(header1+1);
1157 entry2 = ENTRY(header2+1);
1158 while (!IS_LAST_ENTRY(entry1)) {
1159 if (IS_LAST_ENTRY(entry2))
1160 return 1;
1161 if (entry1->e_hash != entry2->e_hash ||
1162 entry1->e_name_index != entry2->e_name_index ||
1163 entry1->e_name_len != entry2->e_name_len ||
1164 entry1->e_value_size != entry2->e_value_size ||
1165 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1166 return 1;
1167 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1168 return -EIO;
1169 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1170 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1171 le32_to_cpu(entry1->e_value_size)))
1172 return 1;
1173
1174 entry1 = EXT4_XATTR_NEXT(entry1);
1175 entry2 = EXT4_XATTR_NEXT(entry2);
1176 }
1177 if (!IS_LAST_ENTRY(entry2))
1178 return 1;
1179 return 0;
1180}
1181
1182/*
1183 * ext4_xattr_cache_find()
1184 *
1185 * Find an identical extended attribute block.
1186 *
1187 * Returns a pointer to the block found, or NULL if such a block was
1188 * not found or an error occurred.
1189 */
1190static struct buffer_head *
1191ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1192 struct mb_cache_entry **pce)
1193{
1194 __u32 hash = le32_to_cpu(header->h_hash);
1195 struct mb_cache_entry *ce;
1196
1197 if (!header->h_hash)
1198 return NULL; /* never share */
1199 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1200again:
1201 ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
1202 inode->i_sb->s_bdev, hash);
1203 while (ce) {
1204 struct buffer_head *bh;
1205
1206 if (IS_ERR(ce)) {
1207 if (PTR_ERR(ce) == -EAGAIN)
1208 goto again;
1209 break;
1210 }
1211 bh = sb_bread(inode->i_sb, ce->e_block);
1212 if (!bh) {
1213 ext4_error(inode->i_sb, __FUNCTION__,
1214 "inode %lu: block %lu read error",
1215 inode->i_ino, (unsigned long) ce->e_block);
1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1217 EXT4_XATTR_REFCOUNT_MAX) {
1218 ea_idebug(inode, "block %lu refcount %d>=%d",
1219 (unsigned long) ce->e_block,
1220 le32_to_cpu(BHDR(bh)->h_refcount),
1221 EXT4_XATTR_REFCOUNT_MAX);
1222 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
1223 *pce = ce;
1224 return bh;
1225 }
1226 brelse(bh);
1227 ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
1228 }
1229 return NULL;
1230}
1231
1232#define NAME_HASH_SHIFT 5
1233#define VALUE_HASH_SHIFT 16
1234
1235/*
1236 * ext4_xattr_hash_entry()
1237 *
1238 * Compute the hash of an extended attribute.
1239 */
1240static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
1241 struct ext4_xattr_entry *entry)
1242{
1243 __u32 hash = 0;
1244 char *name = entry->e_name;
1245 int n;
1246
1247 for (n=0; n < entry->e_name_len; n++) {
1248 hash = (hash << NAME_HASH_SHIFT) ^
1249 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
1250 *name++;
1251 }
1252
1253 if (entry->e_value_block == 0 && entry->e_value_size != 0) {
1254 __le32 *value = (__le32 *)((char *)header +
1255 le16_to_cpu(entry->e_value_offs));
1256 for (n = (le32_to_cpu(entry->e_value_size) +
1257 EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
1258 hash = (hash << VALUE_HASH_SHIFT) ^
1259 (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
1260 le32_to_cpu(*value++);
1261 }
1262 }
1263 entry->e_hash = cpu_to_le32(hash);
1264}
1265
1266#undef NAME_HASH_SHIFT
1267#undef VALUE_HASH_SHIFT
1268
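The per-entry hash above is a rotate-and-xor: a 5-bit left rotate per name byte, then a 16-bit rotate per little-endian 32-bit word of the value. A stand-alone user-space re-implementation of the name part only, for illustration (not the kernel function itself; attribute names are ASCII in practice, so char signedness does not matter here):

#include <stdint.h>
#include <string.h>

#define NAME_HASH_SHIFT 5

static uint32_t example_name_hash(const char *name)
{
	uint32_t hash = 0;
	size_t n, len = strlen(name);

	for (n = 0; n < len; n++)
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
		       (unsigned char)name[n];	/* rotl32(hash, 5) ^ byte */
	return hash;
}

ext4_xattr_rehash() below folds the per-entry hashes into the block hash with the same construction using a 16-bit rotate, and clears h_hash as soon as any entry hash is zero, so such a block is never shared through the mbcache.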
1269#define BLOCK_HASH_SHIFT 16
1270
1271/*
1272 * ext4_xattr_rehash()
1273 *
1274 * Re-compute the extended attribute hash value after an entry has changed.
1275 */
1276static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1277 struct ext4_xattr_entry *entry)
1278{
1279 struct ext4_xattr_entry *here;
1280 __u32 hash = 0;
1281
1282 ext4_xattr_hash_entry(header, entry);
1283 here = ENTRY(header+1);
1284 while (!IS_LAST_ENTRY(here)) {
1285 if (!here->e_hash) {
1286 /* Block is not shared if an entry's hash value == 0 */
1287 hash = 0;
1288 break;
1289 }
1290 hash = (hash << BLOCK_HASH_SHIFT) ^
1291 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1292 le32_to_cpu(here->e_hash);
1293 here = EXT4_XATTR_NEXT(here);
1294 }
1295 header->h_hash = cpu_to_le32(hash);
1296}
1297
1298#undef BLOCK_HASH_SHIFT
1299
1300int __init
1301init_ext4_xattr(void)
1302{
1303 ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
1304 sizeof(struct mb_cache_entry) +
1305 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1306 if (!ext4_xattr_cache)
1307 return -ENOMEM;
1308 return 0;
1309}
1310
1311void
1312exit_ext4_xattr(void)
1313{
1314 if (ext4_xattr_cache)
1315 mb_cache_destroy(ext4_xattr_cache);
1316 ext4_xattr_cache = NULL;
1317}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 000000000000..79432b35398f
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,145 @@
1/*
2 File: fs/ext4/xattr.h
3
4 On-disk format of extended attributes for the ext4 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/xattr.h>
10
11/* Magic value in attribute blocks */
12#define EXT4_XATTR_MAGIC 0xEA020000
13
14/* Maximum number of references to one attribute block */
15#define EXT4_XATTR_REFCOUNT_MAX 1024
16
17/* Name indexes */
18#define EXT4_XATTR_INDEX_USER 1
19#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2
20#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3
21#define EXT4_XATTR_INDEX_TRUSTED 4
22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6
24
25struct ext4_xattr_header {
26 __le32 h_magic; /* magic number for identification */
27 __le32 h_refcount; /* reference count */
28 __le32 h_blocks; /* number of disk blocks used */
29 __le32 h_hash; /* hash value of all attributes */
30 __u32 h_reserved[4]; /* zero right now */
31};
32
33struct ext4_xattr_ibody_header {
34 __le32 h_magic; /* magic number for identification */
35};
36
37struct ext4_xattr_entry {
38 __u8 e_name_len; /* length of name */
39 __u8 e_name_index; /* attribute name index */
40 __le16 e_value_offs; /* offset in disk block of value */
41 __le32 e_value_block; /* disk block attribute is stored on (n/i) */
42 __le32 e_value_size; /* size of attribute value */
43 __le32 e_hash; /* hash value of name and value */
44 char e_name[0]; /* attribute name */
45};
46
47#define EXT4_XATTR_PAD_BITS 2
48#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS)
49#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1)
50#define EXT4_XATTR_LEN(name_len) \
51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58
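Both macros round to EXT4_XATTR_PAD (4-byte) boundaries: EXT4_XATTR_LEN() covers the 16-byte entry descriptor plus the padded name, EXT4_XATTR_SIZE() only the padded value. A small stand-alone sketch of the arithmetic (the EX_* names mirror the macros above and are not part of this header):

#include <stdio.h>

#define EX_PAD		4
#define EX_ROUND	(EX_PAD - 1)
#define EX_ENTRY_HDR	16	/* sizeof(struct ext4_xattr_entry) without the name */
#define EX_LEN(name_len)	(((name_len) + EX_ROUND + EX_ENTRY_HDR) & ~EX_ROUND)
#define EX_SIZE(size)		(((size) + EX_ROUND) & ~EX_ROUND)

int main(void)
{
	/* A 3-byte name ("foo" as stored by the user handler) with a
	 * 10-byte value: */
	printf("entry bytes: %d\n", EX_LEN(3));		/* 20 */
	printf("value bytes: %d\n", EX_SIZE(10));	/* 12 */
	return 0;
}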
59# ifdef CONFIG_EXT4DEV_FS_XATTR
60
61extern struct xattr_handler ext4_xattr_user_handler;
62extern struct xattr_handler ext4_xattr_trusted_handler;
63extern struct xattr_handler ext4_xattr_acl_access_handler;
64extern struct xattr_handler ext4_xattr_acl_default_handler;
65extern struct xattr_handler ext4_xattr_security_handler;
66
67extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
68
69extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
70extern int ext4_xattr_list(struct inode *, char *, size_t);
71extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
72extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
73
74extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
75extern void ext4_xattr_put_super(struct super_block *);
76
77extern int init_ext4_xattr(void);
78extern void exit_ext4_xattr(void);
79
80extern struct xattr_handler *ext4_xattr_handlers[];
81
82# else /* CONFIG_EXT4DEV_FS_XATTR */
83
84static inline int
85ext4_xattr_get(struct inode *inode, int name_index, const char *name,
86 void *buffer, size_t size)
87{
88 return -EOPNOTSUPP;
89}
90
91static inline int
92ext4_xattr_list(struct inode *inode, void *buffer, size_t size)
93{
94 return -EOPNOTSUPP;
95}
96
97static inline int
98ext4_xattr_set(struct inode *inode, int name_index, const char *name,
99 const void *value, size_t size, int flags)
100{
101 return -EOPNOTSUPP;
102}
103
104static inline int
105ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
106 const char *name, const void *value, size_t size, int flags)
107{
108 return -EOPNOTSUPP;
109}
110
111static inline void
112ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
113{
114}
115
116static inline void
117ext4_xattr_put_super(struct super_block *sb)
118{
119}
120
121static inline int
122init_ext4_xattr(void)
123{
124 return 0;
125}
126
127static inline void
128exit_ext4_xattr(void)
129{
130}
131
132#define ext4_xattr_handlers NULL
133
134# endif /* CONFIG_EXT4DEV_FS_XATTR */
135
136#ifdef CONFIG_EXT4DEV_FS_SECURITY
137extern int ext4_init_security(handle_t *handle, struct inode *inode,
138 struct inode *dir);
139#else
140static inline int ext4_init_security(handle_t *handle, struct inode *inode,
141 struct inode *dir)
142{
143 return 0;
144}
145#endif
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 000000000000..b6a6861951f9
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,77 @@
1/*
2 * linux/fs/ext4/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/module.h>
7#include <linux/string.h>
8#include <linux/fs.h>
9#include <linux/smp_lock.h>
10#include <linux/ext4_jbd2.h>
11#include <linux/ext4_fs.h>
12#include <linux/security.h>
13#include "xattr.h"
14
15static size_t
16ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
17 const char *name, size_t name_len)
18{
19 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
20 const size_t total_len = prefix_len + name_len + 1;
21
22
23 if (list && total_len <= list_size) {
24 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
25 memcpy(list+prefix_len, name, name_len);
26 list[prefix_len + name_len] = '\0';
27 }
28 return total_len;
29}
30
31static int
32ext4_xattr_security_get(struct inode *inode, const char *name,
33 void *buffer, size_t size)
34{
35 if (strcmp(name, "") == 0)
36 return -EINVAL;
37 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
38 buffer, size);
39}
40
41static int
42ext4_xattr_security_set(struct inode *inode, const char *name,
43 const void *value, size_t size, int flags)
44{
45 if (strcmp(name, "") == 0)
46 return -EINVAL;
47 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
48 value, size, flags);
49}
50
51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
53{
54 int err;
55 size_t len;
56 void *value;
57 char *name;
58
59 err = security_inode_init_security(inode, dir, &name, &value, &len);
60 if (err) {
61 if (err == -EOPNOTSUPP)
62 return 0;
63 return err;
64 }
65 err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
66 name, value, len, 0);
67 kfree(name);
68 kfree(value);
69 return err;
70}
71
72struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get,
76 .set = ext4_xattr_security_set,
77};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000000..b76f2dbc82da
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,62 @@
1/*
2 * linux/fs/ext4/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/smp_lock.h>
13#include <linux/ext4_jbd2.h>
14#include <linux/ext4_fs.h>
15#include "xattr.h"
16
17#define XATTR_TRUSTED_PREFIX "trusted."
18
19static size_t
20ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
21 const char *name, size_t name_len)
22{
23 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
24 const size_t total_len = prefix_len + name_len + 1;
25
26 if (!capable(CAP_SYS_ADMIN))
27 return 0;
28
29 if (list && total_len <= list_size) {
30 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
31 memcpy(list+prefix_len, name, name_len);
32 list[prefix_len + name_len] = '\0';
33 }
34 return total_len;
35}
36
37static int
38ext4_xattr_trusted_get(struct inode *inode, const char *name,
39 void *buffer, size_t size)
40{
41 if (strcmp(name, "") == 0)
42 return -EINVAL;
43 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
44 buffer, size);
45}
46
47static int
48ext4_xattr_trusted_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
54 value, size, flags);
55}
56
57struct xattr_handler ext4_xattr_trusted_handler = {
58 .prefix = XATTR_TRUSTED_PREFIX,
59 .list = ext4_xattr_trusted_list,
60 .get = ext4_xattr_trusted_get,
61 .set = ext4_xattr_trusted_set,
62};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 000000000000..c53cded0761a
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,64 @@
1/*
2 * linux/fs/ext4/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/ext4_jbd2.h>
13#include <linux/ext4_fs.h>
14#include "xattr.h"
15
16#define XATTR_USER_PREFIX "user."
17
18static size_t
19ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len)
21{
22 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
23 const size_t total_len = prefix_len + name_len + 1;
24
25 if (!test_opt(inode->i_sb, XATTR_USER))
26 return 0;
27
28 if (list && total_len <= list_size) {
29 memcpy(list, XATTR_USER_PREFIX, prefix_len);
30 memcpy(list+prefix_len, name, name_len);
31 list[prefix_len + name_len] = '\0';
32 }
33 return total_len;
34}
35
36static int
37ext4_xattr_user_get(struct inode *inode, const char *name,
38 void *buffer, size_t size)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 if (!test_opt(inode->i_sb, XATTR_USER))
43 return -EOPNOTSUPP;
44 return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
45}
46
47static int
48ext4_xattr_user_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 if (!test_opt(inode->i_sb, XATTR_USER))
54 return -EOPNOTSUPP;
55 return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
56 value, size, flags);
57}
58
59struct xattr_handler ext4_xattr_user_handler = {
60 .prefix = XATTR_USER_PREFIX,
61 .list = ext4_xattr_user_list,
62 .get = ext4_xattr_user_get,
63 .set = ext4_xattr_user_set,
64};