aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Kleikamp <shaggy@austin.ibm.com>2006-10-11 04:20:50 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-10-11 14:14:15 -0400
commitac27a0ec112a089f1a5102bc8dffc79c8c815571 (patch)
treebcbcc0a5a88bf99b35119d9d9d660a37c503d787 /fs
parent502717f4e112b18d9c37753a32f675bec9f2838b (diff)
[PATCH] ext4: initial copy of files from ext3
Start of the ext4 patch series. See Documentation/filesystems/ext4.txt for details. This is a simple copy of the files in fs/ext3 to fs/ext4 and /usr/include/linux/ext3* to /usr/include/ext4*. Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/ext4/Makefile12
-rw-r--r--fs/ext4/acl.c551
-rw-r--r--fs/ext4/acl.h81
-rw-r--r--fs/ext4/balloc.c1818
-rw-r--r--fs/ext4/bitmap.c32
-rw-r--r--fs/ext4/dir.c518
-rw-r--r--fs/ext4/file.c139
-rw-r--r--fs/ext4/fsync.c88
-rw-r--r--fs/ext4/hash.c152
-rw-r--r--fs/ext4/ialloc.c758
-rw-r--r--fs/ext4/inode.c3219
-rw-r--r--fs/ext4/ioctl.c307
-rw-r--r--fs/ext4/namei.c2397
-rw-r--r--fs/ext4/namei.h8
-rw-r--r--fs/ext4/resize.c1042
-rw-r--r--fs/ext4/super.c2754
-rw-r--r--fs/ext4/symlink.c54
-rw-r--r--fs/ext4/xattr.c1317
-rw-r--r--fs/ext4/xattr.h145
-rw-r--r--fs/ext4/xattr_security.c77
-rw-r--r--fs/ext4/xattr_trusted.c62
-rw-r--r--fs/ext4/xattr_user.c64
22 files changed, 15595 insertions, 0 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 000000000000..704cd44a40c2
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,12 @@
1#
2# Makefile for the Linux ext3 filesystem routines (verbatim copy placed in fs/ext4; it still builds ext3.o under CONFIG_EXT3_FS).
3#
4
5obj-$(CONFIG_EXT3_FS) += ext3.o
6
7ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o
9
10ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
11ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
12ext3-$(CONFIG_EXT3_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 000000000000..1e5038d9a01b
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,551 @@
1/*
2 * linux/fs/ext3/acl.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */
6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/ext3_jbd.h>
13#include <linux/ext3_fs.h>
14#include "xattr.h"
15#include "acl.h"
16
17/*
18 * Convert from filesystem to in-memory representation.
19 */
20static struct posix_acl *
21ext3_acl_from_disk(const void *value, size_t size)
22{
23 const char *end = (char *)value + size;
24 int n, count;
25 struct posix_acl *acl;
26
27 if (!value)
28 return NULL;
29 if (size < sizeof(ext3_acl_header))
30 return ERR_PTR(-EINVAL);
31 if (((ext3_acl_header *)value)->a_version !=
32 cpu_to_le32(EXT3_ACL_VERSION))
33 return ERR_PTR(-EINVAL);
34 value = (char *)value + sizeof(ext3_acl_header);
35 count = ext3_acl_count(size);
36 if (count < 0)
37 return ERR_PTR(-EINVAL);
38 if (count == 0)
39 return NULL;
40 acl = posix_acl_alloc(count, GFP_KERNEL);
41 if (!acl)
42 return ERR_PTR(-ENOMEM);
43 for (n=0; n < count; n++) {
44 ext3_acl_entry *entry =
45 (ext3_acl_entry *)value;
46 if ((char *)value + sizeof(ext3_acl_entry_short) > end)
47 goto fail;
48 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
49 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
50 switch(acl->a_entries[n].e_tag) {
51 case ACL_USER_OBJ:
52 case ACL_GROUP_OBJ:
53 case ACL_MASK:
54 case ACL_OTHER:
55 value = (char *)value +
56 sizeof(ext3_acl_entry_short);
57 acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
58 break;
59
60 case ACL_USER:
61 case ACL_GROUP:
62 value = (char *)value + sizeof(ext3_acl_entry);
63 if ((char *)value > end)
64 goto fail;
65 acl->a_entries[n].e_id =
66 le32_to_cpu(entry->e_id);
67 break;
68
69 default:
70 goto fail;
71 }
72 }
73 if (value != end)
74 goto fail;
75 return acl;
76
77fail:
78 posix_acl_release(acl);
79 return ERR_PTR(-EINVAL);
80}
81
82/*
83 * Convert from in-memory to filesystem representation.
84 */
85static void *
86ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
87{
88 ext3_acl_header *ext_acl;
89 char *e;
90 size_t n;
91
92 *size = ext3_acl_size(acl->a_count);
93 ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
94 sizeof(ext3_acl_entry), GFP_KERNEL);
95 if (!ext_acl)
96 return ERR_PTR(-ENOMEM);
97 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
98 e = (char *)ext_acl + sizeof(ext3_acl_header);
99 for (n=0; n < acl->a_count; n++) {
100 ext3_acl_entry *entry = (ext3_acl_entry *)e;
101 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
102 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
103 switch(acl->a_entries[n].e_tag) {
104 case ACL_USER:
105 case ACL_GROUP:
106 entry->e_id =
107 cpu_to_le32(acl->a_entries[n].e_id);
108 e += sizeof(ext3_acl_entry);
109 break;
110
111 case ACL_USER_OBJ:
112 case ACL_GROUP_OBJ:
113 case ACL_MASK:
114 case ACL_OTHER:
115 e += sizeof(ext3_acl_entry_short);
116 break;
117
118 default:
119 goto fail;
120 }
121 }
122 return (char *)ext_acl;
123
124fail:
125 kfree(ext_acl);
126 return ERR_PTR(-EINVAL);
127}
128
129static inline struct posix_acl *
130ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
131{
132 struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
133
134 spin_lock(&inode->i_lock);
135 if (*i_acl != EXT3_ACL_NOT_CACHED)
136 acl = posix_acl_dup(*i_acl);
137 spin_unlock(&inode->i_lock);
138
139 return acl;
140}
141
142static inline void
143ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
144 struct posix_acl *acl)
145{
146 spin_lock(&inode->i_lock);
147 if (*i_acl != EXT3_ACL_NOT_CACHED)
148 posix_acl_release(*i_acl);
149 *i_acl = posix_acl_dup(acl);
150 spin_unlock(&inode->i_lock);
151}
152
153/*
154 * Inode operation get_posix_acl().
155 *
156 * inode->i_mutex: don't care
157 */
158static struct posix_acl *
159ext3_get_acl(struct inode *inode, int type)
160{
161 struct ext3_inode_info *ei = EXT3_I(inode);
162 int name_index;
163 char *value = NULL;
164 struct posix_acl *acl;
165 int retval;
166
167 if (!test_opt(inode->i_sb, POSIX_ACL))
168 return NULL;
169
170 switch(type) {
171 case ACL_TYPE_ACCESS:
172 acl = ext3_iget_acl(inode, &ei->i_acl);
173 if (acl != EXT3_ACL_NOT_CACHED)
174 return acl;
175 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
176 break;
177
178 case ACL_TYPE_DEFAULT:
179 acl = ext3_iget_acl(inode, &ei->i_default_acl);
180 if (acl != EXT3_ACL_NOT_CACHED)
181 return acl;
182 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
183 break;
184
185 default:
186 return ERR_PTR(-EINVAL);
187 }
188 retval = ext3_xattr_get(inode, name_index, "", NULL, 0);
189 if (retval > 0) {
190 value = kmalloc(retval, GFP_KERNEL);
191 if (!value)
192 return ERR_PTR(-ENOMEM);
193 retval = ext3_xattr_get(inode, name_index, "", value, retval);
194 }
195 if (retval > 0)
196 acl = ext3_acl_from_disk(value, retval);
197 else if (retval == -ENODATA || retval == -ENOSYS)
198 acl = NULL;
199 else
200 acl = ERR_PTR(retval);
201 kfree(value);
202
203 if (!IS_ERR(acl)) {
204 switch(type) {
205 case ACL_TYPE_ACCESS:
206 ext3_iset_acl(inode, &ei->i_acl, acl);
207 break;
208
209 case ACL_TYPE_DEFAULT:
210 ext3_iset_acl(inode, &ei->i_default_acl, acl);
211 break;
212 }
213 }
214 return acl;
215}
216
217/*
218 * Set the access or default ACL of an inode.
219 *
220 * inode->i_mutex: down unless called from ext3_new_inode
221 */
222static int
223ext3_set_acl(handle_t *handle, struct inode *inode, int type,
224 struct posix_acl *acl)
225{
226 struct ext3_inode_info *ei = EXT3_I(inode);
227 int name_index;
228 void *value = NULL;
229 size_t size = 0;
230 int error;
231
232 if (S_ISLNK(inode->i_mode))
233 return -EOPNOTSUPP;
234
235 switch(type) {
236 case ACL_TYPE_ACCESS:
237 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
238 if (acl) {
239 mode_t mode = inode->i_mode;
240 error = posix_acl_equiv_mode(acl, &mode);
241 if (error < 0)
242 return error;
243 else {
244 inode->i_mode = mode;
245 ext3_mark_inode_dirty(handle, inode);
246 if (error == 0)
247 acl = NULL;
248 }
249 }
250 break;
251
252 case ACL_TYPE_DEFAULT:
253 name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
254 if (!S_ISDIR(inode->i_mode))
255 return acl ? -EACCES : 0;
256 break;
257
258 default:
259 return -EINVAL;
260 }
261 if (acl) {
262 value = ext3_acl_to_disk(acl, &size);
263 if (IS_ERR(value))
264 return (int)PTR_ERR(value);
265 }
266
267 error = ext3_xattr_set_handle(handle, inode, name_index, "",
268 value, size, 0);
269
270 kfree(value);
271 if (!error) {
272 switch(type) {
273 case ACL_TYPE_ACCESS:
274 ext3_iset_acl(inode, &ei->i_acl, acl);
275 break;
276
277 case ACL_TYPE_DEFAULT:
278 ext3_iset_acl(inode, &ei->i_default_acl, acl);
279 break;
280 }
281 }
282 return error;
283}
284
285static int
286ext3_check_acl(struct inode *inode, int mask)
287{
288 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
289
290 if (IS_ERR(acl))
291 return PTR_ERR(acl);
292 if (acl) {
293 int error = posix_acl_permission(inode, acl, mask);
294 posix_acl_release(acl);
295 return error;
296 }
297
298 return -EAGAIN;
299}
300
301int
302ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
303{
304 return generic_permission(inode, mask, ext3_check_acl);
305}
306
307/*
308 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
309 *
310 * dir->i_mutex: down
311 * inode->i_mutex: up (access to inode is still exclusive)
312 */
313int
314ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
315{
316 struct posix_acl *acl = NULL;
317 int error = 0;
318
319 if (!S_ISLNK(inode->i_mode)) {
320 if (test_opt(dir->i_sb, POSIX_ACL)) {
321 acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT);
322 if (IS_ERR(acl))
323 return PTR_ERR(acl);
324 }
325 if (!acl)
326 inode->i_mode &= ~current->fs->umask;
327 }
328 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
329 struct posix_acl *clone;
330 mode_t mode;
331
332 if (S_ISDIR(inode->i_mode)) {
333 error = ext3_set_acl(handle, inode,
334 ACL_TYPE_DEFAULT, acl);
335 if (error)
336 goto cleanup;
337 }
338 clone = posix_acl_clone(acl, GFP_KERNEL);
339 error = -ENOMEM;
340 if (!clone)
341 goto cleanup;
342
343 mode = inode->i_mode;
344 error = posix_acl_create_masq(clone, &mode);
345 if (error >= 0) {
346 inode->i_mode = mode;
347 if (error > 0) {
348 /* This is an extended ACL */
349 error = ext3_set_acl(handle, inode,
350 ACL_TYPE_ACCESS, clone);
351 }
352 }
353 posix_acl_release(clone);
354 }
355cleanup:
356 posix_acl_release(acl);
357 return error;
358}
359
360/*
361 * Does chmod for an inode that may have an Access Control List. The
362 * inode->i_mode field must be updated to the desired value by the caller
363 * before calling this function.
364 * Returns 0 on success, or a negative error number.
365 *
366 * We change the ACL rather than storing some ACL entries in the file
367 * mode permission bits (which would be more efficient), because that
368 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
369 * for directories) are added. There are no more bits available in the
370 * file mode.
371 *
372 * inode->i_mutex: down
373 */
374int
375ext3_acl_chmod(struct inode *inode)
376{
377 struct posix_acl *acl, *clone;
378 int error;
379
380 if (S_ISLNK(inode->i_mode))
381 return -EOPNOTSUPP;
382 if (!test_opt(inode->i_sb, POSIX_ACL))
383 return 0;
384 acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
385 if (IS_ERR(acl) || !acl)
386 return PTR_ERR(acl);
387 clone = posix_acl_clone(acl, GFP_KERNEL);
388 posix_acl_release(acl);
389 if (!clone)
390 return -ENOMEM;
391 error = posix_acl_chmod_masq(clone, inode->i_mode);
392 if (!error) {
393 handle_t *handle;
394 int retries = 0;
395
396 retry:
397 handle = ext3_journal_start(inode,
398 EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
399 if (IS_ERR(handle)) {
400 error = PTR_ERR(handle);
401 ext3_std_error(inode->i_sb, error);
402 goto out;
403 }
404 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
405 ext3_journal_stop(handle);
406 if (error == -ENOSPC &&
407 ext3_should_retry_alloc(inode->i_sb, &retries))
408 goto retry;
409 }
410out:
411 posix_acl_release(clone);
412 return error;
413}
414
415/*
416 * Extended attribute handlers
417 */
418static size_t
419ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
420 const char *name, size_t name_len)
421{
422 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
423
424 if (!test_opt(inode->i_sb, POSIX_ACL))
425 return 0;
426 if (list && size <= list_len)
427 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
428 return size;
429}
430
431static size_t
432ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
433 const char *name, size_t name_len)
434{
435 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
436
437 if (!test_opt(inode->i_sb, POSIX_ACL))
438 return 0;
439 if (list && size <= list_len)
440 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
441 return size;
442}
443
444static int
445ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
446{
447 struct posix_acl *acl;
448 int error;
449
450 if (!test_opt(inode->i_sb, POSIX_ACL))
451 return -EOPNOTSUPP;
452
453 acl = ext3_get_acl(inode, type);
454 if (IS_ERR(acl))
455 return PTR_ERR(acl);
456 if (acl == NULL)
457 return -ENODATA;
458 error = posix_acl_to_xattr(acl, buffer, size);
459 posix_acl_release(acl);
460
461 return error;
462}
463
464static int
465ext3_xattr_get_acl_access(struct inode *inode, const char *name,
466 void *buffer, size_t size)
467{
468 if (strcmp(name, "") != 0)
469 return -EINVAL;
470 return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
471}
472
473static int
474ext3_xattr_get_acl_default(struct inode *inode, const char *name,
475 void *buffer, size_t size)
476{
477 if (strcmp(name, "") != 0)
478 return -EINVAL;
479 return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
480}
481
482static int
483ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
484 size_t size)
485{
486 handle_t *handle;
487 struct posix_acl *acl;
488 int error, retries = 0;
489
490 if (!test_opt(inode->i_sb, POSIX_ACL))
491 return -EOPNOTSUPP;
492 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
493 return -EPERM;
494
495 if (value) {
496 acl = posix_acl_from_xattr(value, size);
497 if (IS_ERR(acl))
498 return PTR_ERR(acl);
499 else if (acl) {
500 error = posix_acl_valid(acl);
501 if (error)
502 goto release_and_out;
503 }
504 } else
505 acl = NULL;
506
507retry:
508 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
509 if (IS_ERR(handle))
510 return PTR_ERR(handle);
511 error = ext3_set_acl(handle, inode, type, acl);
512 ext3_journal_stop(handle);
513 if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
514 goto retry;
515
516release_and_out:
517 posix_acl_release(acl);
518 return error;
519}
520
521static int
522ext3_xattr_set_acl_access(struct inode *inode, const char *name,
523 const void *value, size_t size, int flags)
524{
525 if (strcmp(name, "") != 0)
526 return -EINVAL;
527 return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
528}
529
530static int
531ext3_xattr_set_acl_default(struct inode *inode, const char *name,
532 const void *value, size_t size, int flags)
533{
534 if (strcmp(name, "") != 0)
535 return -EINVAL;
536 return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
537}
538
539struct xattr_handler ext3_xattr_acl_access_handler = {
540 .prefix = POSIX_ACL_XATTR_ACCESS,
541 .list = ext3_xattr_list_acl_access,
542 .get = ext3_xattr_get_acl_access,
543 .set = ext3_xattr_set_acl_access,
544};
545
546struct xattr_handler ext3_xattr_acl_default_handler = {
547 .prefix = POSIX_ACL_XATTR_DEFAULT,
548 .list = ext3_xattr_list_acl_default,
549 .get = ext3_xattr_get_acl_default,
550 .set = ext3_xattr_set_acl_default,
551};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 000000000000..0d1e6279cbfd
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,81 @@
1/*
2 File: fs/ext3/acl.h
3
4 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
5*/
6
7#include <linux/posix_acl_xattr.h>
8
9#define EXT3_ACL_VERSION 0x0001
10
11typedef struct {
12 __le16 e_tag;
13 __le16 e_perm;
14 __le32 e_id;
15} ext3_acl_entry;
16
17typedef struct {
18 __le16 e_tag;
19 __le16 e_perm;
20} ext3_acl_entry_short;
21
22typedef struct {
23 __le32 a_version;
24} ext3_acl_header;
25
26static inline size_t ext3_acl_size(int count)
27{
28 if (count <= 4) {
29 return sizeof(ext3_acl_header) +
30 count * sizeof(ext3_acl_entry_short);
31 } else {
32 return sizeof(ext3_acl_header) +
33 4 * sizeof(ext3_acl_entry_short) +
34 (count - 4) * sizeof(ext3_acl_entry);
35 }
36}
37
38static inline int ext3_acl_count(size_t size)
39{
40 ssize_t s;
41 size -= sizeof(ext3_acl_header);
42 s = size - 4 * sizeof(ext3_acl_entry_short);
43 if (s < 0) {
44 if (size % sizeof(ext3_acl_entry_short))
45 return -1;
46 return size / sizeof(ext3_acl_entry_short);
47 } else {
48 if (s % sizeof(ext3_acl_entry))
49 return -1;
50 return s / sizeof(ext3_acl_entry) + 4;
51 }
52}
53
54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55
56/* Value for inode->u.ext3_i.i_acl and inode->u.ext3_i.i_default_acl
57 if the ACL has not been cached */
58#define EXT3_ACL_NOT_CACHED ((void *)-1)
59
60/* acl.c */
61extern int ext3_permission (struct inode *, int, struct nameidata *);
62extern int ext3_acl_chmod (struct inode *);
63extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
64
65#else /* CONFIG_EXT3_FS_POSIX_ACL */
66#include <linux/sched.h>
67#define ext3_permission NULL
68
/* Stub used when CONFIG_EXT3_FS_POSIX_ACL is not set: nothing to do. */
static inline int
ext3_acl_chmod(struct inode *inode)
{
	const int nothing_to_do = 0;

	return nothing_to_do;
}
74
75static inline int
76ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
77{
78 return 0;
79}
80#endif /* CONFIG_EXT3_FS_POSIX_ACL */
81
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 000000000000..b41a7d7e20f0
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,1818 @@
1/*
2 * linux/fs/ext3/balloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
10 * Big-endian to little-endian byte-swapping/bitmaps by
11 * David S. Miller (davem@caip.rutgers.edu), 1995
12 */
13
14#include <linux/time.h>
15#include <linux/capability.h>
16#include <linux/fs.h>
17#include <linux/jbd.h>
18#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h>
20#include <linux/quotaops.h>
21#include <linux/buffer_head.h>
22
23/*
24 * balloc.c contains the blocks allocation and deallocation routines
25 */
26
27/*
28 * The free blocks are managed by bitmaps. A file system contains several
29 * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
30 * block for inodes, N blocks for the inode table and data blocks.
31 *
32 * The file system contains group descriptors which are located after the
33 * super block. Each descriptor contains the number of the bitmap block and
34 * the free blocks count in the block. The descriptors are loaded in memory
35 * when a file system is mounted (see ext3_read_super).
36 */
37
38
/*
 * True when @b lies in the closed interval [@first, @first + @len - 1].
 * Arguments may be evaluated more than once.
 */
#define in_range(b, first, len) \
	((first) <= (b) && (b) <= (first) + (len) - 1)
40
41/**
42 * ext3_get_group_desc() -- load group descriptor from disk
43 * @sb: super block
44 * @block_group: given block group
45 * @bh: pointer to the buffer head to store the block
46 * group descriptor
47 */
48struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
49 unsigned int block_group,
50 struct buffer_head ** bh)
51{
52 unsigned long group_desc;
53 unsigned long offset;
54 struct ext3_group_desc * desc;
55 struct ext3_sb_info *sbi = EXT3_SB(sb);
56
57 if (block_group >= sbi->s_groups_count) {
58 ext3_error (sb, "ext3_get_group_desc",
59 "block_group >= groups_count - "
60 "block_group = %d, groups_count = %lu",
61 block_group, sbi->s_groups_count);
62
63 return NULL;
64 }
65 smp_rmb();
66
67 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
68 offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
69 if (!sbi->s_group_desc[group_desc]) {
70 ext3_error (sb, "ext3_get_group_desc",
71 "Group descriptor not loaded - "
72 "block_group = %d, group_desc = %lu, desc = %lu",
73 block_group, group_desc, offset);
74 return NULL;
75 }
76
77 desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
78 if (bh)
79 *bh = sbi->s_group_desc[group_desc];
80 return desc + offset;
81}
82
83/**
84 * read_block_bitmap()
85 * @sb: super block
86 * @block_group: given block group
87 *
88 * Read the bitmap for a given block_group, reading into the specified
89 * slot in the superblock's bitmap cache.
90 *
91 * Return buffer_head on success or NULL in case of failure.
92 */
93static struct buffer_head *
94read_block_bitmap(struct super_block *sb, unsigned int block_group)
95{
96 struct ext3_group_desc * desc;
97 struct buffer_head * bh = NULL;
98
99 desc = ext3_get_group_desc (sb, block_group, NULL);
100 if (!desc)
101 goto error_out;
102 bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
103 if (!bh)
104 ext3_error (sb, "read_block_bitmap",
105 "Cannot read block bitmap - "
106 "block_group = %d, block_bitmap = %u",
107 block_group, le32_to_cpu(desc->bg_block_bitmap));
108error_out:
109 return bh;
110}
111/*
112 * The reservation window structure operations
113 * --------------------------------------------
114 * Operations include:
115 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
116 *
117 * We use a red-black tree to represent per-filesystem reservation
118 * windows.
119 *
120 */
121
122/**
123 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
124 * @rb_root: root of per-filesystem reservation rb tree
125 * @verbose: verbose mode
126 * @fn: function which wishes to dump the reservation map
127 *
128 * If verbose is turned on, it will print the whole block reservation
129 * windows(start, end). Otherwise, it will only print out the "bad" windows,
130 * those windows that overlap with their immediate neighbors.
131 */
132#if 1
133static void __rsv_window_dump(struct rb_root *root, int verbose,
134 const char *fn)
135{
136 struct rb_node *n;
137 struct ext3_reserve_window_node *rsv, *prev;
138 int bad;
139
140restart:
141 n = rb_first(root);
142 bad = 0;
143 prev = NULL;
144
145 printk("Block Allocation Reservation Windows Map (%s):\n", fn);
146 while (n) {
147 rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node);
148 if (verbose)
149 printk("reservation window 0x%p "
150 "start: %lu, end: %lu\n",
151 rsv, rsv->rsv_start, rsv->rsv_end);
152 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
153 printk("Bad reservation %p (start >= end)\n",
154 rsv);
155 bad = 1;
156 }
157 if (prev && prev->rsv_end >= rsv->rsv_start) {
158 printk("Bad reservation %p (prev->end >= start)\n",
159 rsv);
160 bad = 1;
161 }
162 if (bad) {
163 if (!verbose) {
164 printk("Restarting reservation walk in verbose mode\n");
165 verbose = 1;
166 goto restart;
167 }
168 }
169 n = rb_next(n);
170 prev = rsv;
171 }
172 printk("Window map complete.\n");
173 if (bad)
174 BUG();
175}
176#define rsv_window_dump(root, verbose) \
177 __rsv_window_dump((root), (verbose), __FUNCTION__)
178#else
179#define rsv_window_dump(root, verbose) do {} while (0)
180#endif
181
182/**
183 * goal_in_my_reservation()
184 * @rsv: inode's reservation window
185 * @grp_goal: given goal block relative to the allocation block group
186 * @group: the current allocation block group
187 * @sb: filesystem super block
188 *
189 * Test if the given goal block (group relative) is within the file's
190 * own block reservation window range.
191 *
192 * If the reservation window is outside the goal allocation group, return 0;
193 * grp_goal (given goal block) could be -1, which means no specific
194 * goal block. In this case, always return 1.
195 * If the goal block is within the reservation window, return 1;
196 * otherwise, return 0;
197 */
198static int
199goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
200 unsigned int group, struct super_block * sb)
201{
202 ext3_fsblk_t group_first_block, group_last_block;
203
204 group_first_block = ext3_group_first_block_no(sb, group);
205 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
206
207 if ((rsv->_rsv_start > group_last_block) ||
208 (rsv->_rsv_end < group_first_block))
209 return 0;
210 if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
211 || (grp_goal + group_first_block > rsv->_rsv_end)))
212 return 0;
213 return 1;
214}
215
216/**
217 * search_reserve_window()
218 * @rb_root: root of reservation tree
219 * @goal: target allocation block
220 *
221 * Find the reserved window which includes the goal, or the previous one
222 * if the goal is not in any window.
223 * Returns NULL if there are no windows or if all windows start after the goal.
224 */
225static struct ext3_reserve_window_node *
226search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
227{
228 struct rb_node *n = root->rb_node;
229 struct ext3_reserve_window_node *rsv;
230
231 if (!n)
232 return NULL;
233
234 do {
235 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
236
237 if (goal < rsv->rsv_start)
238 n = n->rb_left;
239 else if (goal > rsv->rsv_end)
240 n = n->rb_right;
241 else
242 return rsv;
243 } while (n);
244 /*
245 * We've fallen off the end of the tree: the goal wasn't inside
246 * any particular node. OK, the previous node must be to one
247 * side of the interval containing the goal. If it's the RHS,
248 * we need to back up one.
249 */
250 if (rsv->rsv_start > goal) {
251 n = rb_prev(&rsv->rsv_node);
252 rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
253 }
254 return rsv;
255}
256
257/**
258 * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
259 * @sb: super block
260 * @rsv: reservation window to add
261 *
262 * Must be called with rsv_lock hold.
263 */
264void ext3_rsv_window_add(struct super_block *sb,
265 struct ext3_reserve_window_node *rsv)
266{
267 struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
268 struct rb_node *node = &rsv->rsv_node;
269 ext3_fsblk_t start = rsv->rsv_start;
270
271 struct rb_node ** p = &root->rb_node;
272 struct rb_node * parent = NULL;
273 struct ext3_reserve_window_node *this;
274
275 while (*p)
276 {
277 parent = *p;
278 this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
279
280 if (start < this->rsv_start)
281 p = &(*p)->rb_left;
282 else if (start > this->rsv_end)
283 p = &(*p)->rb_right;
284 else {
285 rsv_window_dump(root, 1);
286 BUG();
287 }
288 }
289
290 rb_link_node(node, parent, p);
291 rb_insert_color(node, root);
292}
293
294/**
295 * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
296 * @sb: super block
297 * @rsv: reservation window to remove
298 *
299 * Mark the block reservation window as not allocated, and unlink it
300 * from the filesystem reservation window rb tree. Must be called with
301 * rsv_lock hold.
302 */
303static void rsv_window_remove(struct super_block *sb,
304 struct ext3_reserve_window_node *rsv)
305{
306 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
307 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
308 rsv->rsv_alloc_hit = 0;
309 rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
310}
311
312/*
313 * rsv_is_empty() -- Check if the reservation window is allocated.
314 * @rsv: given reservation window to check
315 *
316 * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
317 */
318static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
319{
320 /* a valid reservation end block could not be 0 */
321 return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
322}
323
324/**
325 * ext3_init_block_alloc_info()
326 * @inode: file inode structure
327 *
328 * Allocate and initialize the reservation window structure, and
329 * link the window to the ext3 inode structure at last
330 *
331 * The reservation window structure is only dynamically allocated
332 * and linked to ext3 inode the first time the open file
333 * needs a new block. So, before every ext3_new_block(s) call, for
334 * regular files, we should check whether the reservation window
335 * structure exists or not. In the latter case, this function is called.
336 * Fail to do so will result in block reservation being turned off for that
337 * open file.
338 *
339 * This function is called from ext3_get_blocks_handle(), also called
340 * when setting the reservation window size through ioctl before the file
341 * is open for write (needs block allocation).
342 *
343 * Needs truncate_mutex protection prior to call this function.
344 */
345void ext3_init_block_alloc_info(struct inode *inode)
346{
347 struct ext3_inode_info *ei = EXT3_I(inode);
348 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
349 struct super_block *sb = inode->i_sb;
350
351 block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
352 if (block_i) {
353 struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
354
355 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
356 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
357
358 /*
359 * if filesystem is mounted with NORESERVATION, the goal
360 * reservation window size is set to zero to indicate
361 * block reservation is off
362 */
363 if (!test_opt(sb, RESERVATION))
364 rsv->rsv_goal_size = 0;
365 else
366 rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
367 rsv->rsv_alloc_hit = 0;
368 block_i->last_alloc_logical_block = 0;
369 block_i->last_alloc_physical_block = 0;
370 }
371 ei->i_block_alloc_info = block_i;
372}
373
374/**
375 * ext3_discard_reservation()
376 * @inode: inode
377 *
378 * Discard(free) block reservation window on last file close, or truncate
379 * or at last iput().
380 *
381 * It is being called in three cases:
382 * ext3_release_file(): last writer close the file
383 * ext3_clear_inode(): last iput(), when nobody link to this file.
384 * ext3_truncate(): when the block indirect map is about to change.
385 *
386 */
387void ext3_discard_reservation(struct inode *inode)
388{
389 struct ext3_inode_info *ei = EXT3_I(inode);
390 struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
391 struct ext3_reserve_window_node *rsv;
392 spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
393
394 if (!block_i)
395 return;
396
397 rsv = &block_i->rsv_window_node;
398 if (!rsv_is_empty(&rsv->rsv_window)) {
399 spin_lock(rsv_lock);
400 if (!rsv_is_empty(&rsv->rsv_window))
401 rsv_window_remove(inode->i_sb, rsv);
402 spin_unlock(rsv_lock);
403 }
404}
405
406/**
407 * ext3_free_blocks_sb() -- Free given blocks and update quota
408 * @handle: handle to this transaction
409 * @sb: super block
410 * @block: start physcial block to free
411 * @count: number of blocks to free
412 * @pdquot_freed_blocks: pointer to quota
413 */
414void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
415 ext3_fsblk_t block, unsigned long count,
416 unsigned long *pdquot_freed_blocks)
417{
418 struct buffer_head *bitmap_bh = NULL;
419 struct buffer_head *gd_bh;
420 unsigned long block_group;
421 ext3_grpblk_t bit;
422 unsigned long i;
423 unsigned long overflow;
424 struct ext3_group_desc * desc;
425 struct ext3_super_block * es;
426 struct ext3_sb_info *sbi;
427 int err = 0, ret;
428 ext3_grpblk_t group_freed;
429
430 *pdquot_freed_blocks = 0;
431 sbi = EXT3_SB(sb);
432 es = sbi->s_es;
433 if (block < le32_to_cpu(es->s_first_data_block) ||
434 block + count < block ||
435 block + count > le32_to_cpu(es->s_blocks_count)) {
436 ext3_error (sb, "ext3_free_blocks",
437 "Freeing blocks not in datazone - "
438 "block = "E3FSBLK", count = %lu", block, count);
439 goto error_return;
440 }
441
442 ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
443
444do_more:
445 overflow = 0;
446 block_group = (block - le32_to_cpu(es->s_first_data_block)) /
447 EXT3_BLOCKS_PER_GROUP(sb);
448 bit = (block - le32_to_cpu(es->s_first_data_block)) %
449 EXT3_BLOCKS_PER_GROUP(sb);
450 /*
451 * Check to see if we are freeing blocks across a group
452 * boundary.
453 */
454 if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
455 overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
456 count -= overflow;
457 }
458 brelse(bitmap_bh);
459 bitmap_bh = read_block_bitmap(sb, block_group);
460 if (!bitmap_bh)
461 goto error_return;
462 desc = ext3_get_group_desc (sb, block_group, &gd_bh);
463 if (!desc)
464 goto error_return;
465
466 if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
467 in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
468 in_range (block, le32_to_cpu(desc->bg_inode_table),
469 sbi->s_itb_per_group) ||
470 in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
471 sbi->s_itb_per_group))
472 ext3_error (sb, "ext3_free_blocks",
473 "Freeing blocks in system zones - "
474 "Block = "E3FSBLK", count = %lu",
475 block, count);
476
477 /*
478 * We are about to start releasing blocks in the bitmap,
479 * so we need undo access.
480 */
481 /* @@@ check errors */
482 BUFFER_TRACE(bitmap_bh, "getting undo access");
483 err = ext3_journal_get_undo_access(handle, bitmap_bh);
484 if (err)
485 goto error_return;
486
487 /*
488 * We are about to modify some metadata. Call the journal APIs
489 * to unshare ->b_data if a currently-committing transaction is
490 * using it
491 */
492 BUFFER_TRACE(gd_bh, "get_write_access");
493 err = ext3_journal_get_write_access(handle, gd_bh);
494 if (err)
495 goto error_return;
496
497 jbd_lock_bh_state(bitmap_bh);
498
499 for (i = 0, group_freed = 0; i < count; i++) {
500 /*
501 * An HJ special. This is expensive...
502 */
503#ifdef CONFIG_JBD_DEBUG
504 jbd_unlock_bh_state(bitmap_bh);
505 {
506 struct buffer_head *debug_bh;
507 debug_bh = sb_find_get_block(sb, block + i);
508 if (debug_bh) {
509 BUFFER_TRACE(debug_bh, "Deleted!");
510 if (!bh2jh(bitmap_bh)->b_committed_data)
511 BUFFER_TRACE(debug_bh,
512 "No commited data in bitmap");
513 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
514 __brelse(debug_bh);
515 }
516 }
517 jbd_lock_bh_state(bitmap_bh);
518#endif
519 if (need_resched()) {
520 jbd_unlock_bh_state(bitmap_bh);
521 cond_resched();
522 jbd_lock_bh_state(bitmap_bh);
523 }
524 /* @@@ This prevents newly-allocated data from being
525 * freed and then reallocated within the same
526 * transaction.
527 *
528 * Ideally we would want to allow that to happen, but to
529 * do so requires making journal_forget() capable of
530 * revoking the queued write of a data block, which
531 * implies blocking on the journal lock. *forget()
532 * cannot block due to truncate races.
533 *
534 * Eventually we can fix this by making journal_forget()
535 * return a status indicating whether or not it was able
536 * to revoke the buffer. On successful revoke, it is
537 * safe not to set the allocation bit in the committed
538 * bitmap, because we know that there is no outstanding
539 * activity on the buffer any more and so it is safe to
540 * reallocate it.
541 */
542 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
543 J_ASSERT_BH(bitmap_bh,
544 bh2jh(bitmap_bh)->b_committed_data != NULL);
545 ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
546 bh2jh(bitmap_bh)->b_committed_data);
547
548 /*
549 * We clear the bit in the bitmap after setting the committed
550 * data bit, because this is the reverse order to that which
551 * the allocator uses.
552 */
553 BUFFER_TRACE(bitmap_bh, "clear bit");
554 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
555 bit + i, bitmap_bh->b_data)) {
556 jbd_unlock_bh_state(bitmap_bh);
557 ext3_error(sb, __FUNCTION__,
558 "bit already cleared for block "E3FSBLK,
559 block + i);
560 jbd_lock_bh_state(bitmap_bh);
561 BUFFER_TRACE(bitmap_bh, "bit already cleared");
562 } else {
563 group_freed++;
564 }
565 }
566 jbd_unlock_bh_state(bitmap_bh);
567
568 spin_lock(sb_bgl_lock(sbi, block_group));
569 desc->bg_free_blocks_count =
570 cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
571 group_freed);
572 spin_unlock(sb_bgl_lock(sbi, block_group));
573 percpu_counter_mod(&sbi->s_freeblocks_counter, count);
574
575 /* We dirtied the bitmap block */
576 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
577 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
578
579 /* And the group descriptor block */
580 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
581 ret = ext3_journal_dirty_metadata(handle, gd_bh);
582 if (!err) err = ret;
583 *pdquot_freed_blocks += group_freed;
584
585 if (overflow && !err) {
586 block += count;
587 count = overflow;
588 goto do_more;
589 }
590 sb->s_dirt = 1;
591error_return:
592 brelse(bitmap_bh);
593 ext3_std_error(sb, err);
594 return;
595}
596
597/**
598 * ext3_free_blocks() -- Free given blocks and update quota
599 * @handle: handle for this transaction
600 * @inode: inode
601 * @block: start physical block to free
602 * @count: number of blocks to count
603 */
604void ext3_free_blocks(handle_t *handle, struct inode *inode,
605 ext3_fsblk_t block, unsigned long count)
606{
607 struct super_block * sb;
608 unsigned long dquot_freed_blocks;
609
610 sb = inode->i_sb;
611 if (!sb) {
612 printk ("ext3_free_blocks: nonexistent device");
613 return;
614 }
615 ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
616 if (dquot_freed_blocks)
617 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
618 return;
619}
620
621/**
622 * ext3_test_allocatable()
623 * @nr: given allocation block group
624 * @bh: bufferhead contains the bitmap of the given block group
625 *
626 * For ext3 allocations, we must not reuse any blocks which are
627 * allocated in the bitmap buffer's "last committed data" copy. This
628 * prevents deletes from freeing up the page for reuse until we have
629 * committed the delete transaction.
630 *
631 * If we didn't do this, then deleting something and reallocating it as
632 * data would allow the old block to be overwritten before the
633 * transaction committed (because we force data to disk before commit).
634 * This would lead to corruption if we crashed between overwriting the
635 * data and committing the delete.
636 *
637 * @@@ We may want to make this allocation behaviour conditional on
638 * data-writes at some point, and disable it for metadata allocations or
639 * sync-data inodes.
640 */
641static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
642{
643 int ret;
644 struct journal_head *jh = bh2jh(bh);
645
646 if (ext3_test_bit(nr, bh->b_data))
647 return 0;
648
649 jbd_lock_bh_state(bh);
650 if (!jh->b_committed_data)
651 ret = 1;
652 else
653 ret = !ext3_test_bit(nr, jh->b_committed_data);
654 jbd_unlock_bh_state(bh);
655 return ret;
656}
657
658/**
659 * bitmap_search_next_usable_block()
660 * @start: the starting block (group relative) of the search
661 * @bh: bufferhead contains the block group bitmap
662 * @maxblocks: the ending block (group relative) of the reservation
663 *
664 * The bitmap search --- search forward alternately through the actual
665 * bitmap on disk and the last-committed copy in journal, until we find a
666 * bit free in both bitmaps.
667 */
668static ext3_grpblk_t
669bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
670 ext3_grpblk_t maxblocks)
671{
672 ext3_grpblk_t next;
673 struct journal_head *jh = bh2jh(bh);
674
675 while (start < maxblocks) {
676 next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
677 if (next >= maxblocks)
678 return -1;
679 if (ext3_test_allocatable(next, bh))
680 return next;
681 jbd_lock_bh_state(bh);
682 if (jh->b_committed_data)
683 start = ext3_find_next_zero_bit(jh->b_committed_data,
684 maxblocks, next);
685 jbd_unlock_bh_state(bh);
686 }
687 return -1;
688}
689
690/**
691 * find_next_usable_block()
692 * @start: the starting block (group relative) to find next
693 * allocatable block in bitmap.
694 * @bh: bufferhead contains the block group bitmap
695 * @maxblocks: the ending block (group relative) for the search
696 *
697 * Find an allocatable block in a bitmap. We honor both the bitmap and
698 * its last-committed copy (if that exists), and perform the "most
699 * appropriate allocation" algorithm of looking for a free block near
700 * the initial goal; then for a free byte somewhere in the bitmap; then
701 * for any free bit in the bitmap.
702 */
703static ext3_grpblk_t
704find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
705 ext3_grpblk_t maxblocks)
706{
707 ext3_grpblk_t here, next;
708 char *p, *r;
709
710 if (start > 0) {
711 /*
712 * The goal was occupied; search forward for a free
713 * block within the next XX blocks.
714 *
715 * end_goal is more or less random, but it has to be
716 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
717 * next 64-bit boundary is simple..
718 */
719 ext3_grpblk_t end_goal = (start + 63) & ~63;
720 if (end_goal > maxblocks)
721 end_goal = maxblocks;
722 here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
723 if (here < end_goal && ext3_test_allocatable(here, bh))
724 return here;
725 ext3_debug("Bit not found near goal\n");
726 }
727
728 here = start;
729 if (here < 0)
730 here = 0;
731
732 p = ((char *)bh->b_data) + (here >> 3);
733 r = memscan(p, 0, (maxblocks - here + 7) >> 3);
734 next = (r - ((char *)bh->b_data)) << 3;
735
736 if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
737 return next;
738
739 /*
740 * The bitmap search --- search forward alternately through the actual
741 * bitmap and the last-committed copy until we find a bit free in
742 * both
743 */
744 here = bitmap_search_next_usable_block(here, bh, maxblocks);
745 return here;
746}
747
748/**
749 * claim_block()
750 * @block: the free block (group relative) to allocate
751 * @bh: the bufferhead containts the block group bitmap
752 *
753 * We think we can allocate this block in this bitmap. Try to set the bit.
754 * If that succeeds then check that nobody has allocated and then freed the
755 * block since we saw that is was not marked in b_committed_data. If it _was_
756 * allocated and freed then clear the bit in the bitmap again and return
757 * zero (failure).
758 */
759static inline int
760claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
761{
762 struct journal_head *jh = bh2jh(bh);
763 int ret;
764
765 if (ext3_set_bit_atomic(lock, block, bh->b_data))
766 return 0;
767 jbd_lock_bh_state(bh);
768 if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
769 ext3_clear_bit_atomic(lock, block, bh->b_data);
770 ret = 0;
771 } else {
772 ret = 1;
773 }
774 jbd_unlock_bh_state(bh);
775 return ret;
776}
777
778/**
779 * ext3_try_to_allocate()
780 * @sb: superblock
781 * @handle: handle to this transaction
782 * @group: given allocation block group
783 * @bitmap_bh: bufferhead holds the block bitmap
784 * @grp_goal: given target block within the group
785 * @count: target number of blocks to allocate
786 * @my_rsv: reservation window
787 *
788 * Attempt to allocate blocks within a give range. Set the range of allocation
789 * first, then find the first free bit(s) from the bitmap (within the range),
790 * and at last, allocate the blocks by claiming the found free bit as allocated.
791 *
792 * To set the range of this allocation:
793 * if there is a reservation window, only try to allocate block(s) from the
794 * file's own reservation window;
795 * Otherwise, the allocation range starts from the give goal block, ends at
796 * the block group's last block.
797 *
798 * If we failed to allocate the desired block then we may end up crossing to a
799 * new bitmap. In that case we must release write access to the old one via
800 * ext3_journal_release_buffer(), else we'll run out of credits.
801 */
802static ext3_grpblk_t
803ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
804 struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
805 unsigned long *count, struct ext3_reserve_window *my_rsv)
806{
807 ext3_fsblk_t group_first_block;
808 ext3_grpblk_t start, end;
809 unsigned long num = 0;
810
811 /* we do allocation within the reservation window if we have a window */
812 if (my_rsv) {
813 group_first_block = ext3_group_first_block_no(sb, group);
814 if (my_rsv->_rsv_start >= group_first_block)
815 start = my_rsv->_rsv_start - group_first_block;
816 else
817 /* reservation window cross group boundary */
818 start = 0;
819 end = my_rsv->_rsv_end - group_first_block + 1;
820 if (end > EXT3_BLOCKS_PER_GROUP(sb))
821 /* reservation window crosses group boundary */
822 end = EXT3_BLOCKS_PER_GROUP(sb);
823 if ((start <= grp_goal) && (grp_goal < end))
824 start = grp_goal;
825 else
826 grp_goal = -1;
827 } else {
828 if (grp_goal > 0)
829 start = grp_goal;
830 else
831 start = 0;
832 end = EXT3_BLOCKS_PER_GROUP(sb);
833 }
834
835 BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
836
837repeat:
838 if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
839 grp_goal = find_next_usable_block(start, bitmap_bh, end);
840 if (grp_goal < 0)
841 goto fail_access;
842 if (!my_rsv) {
843 int i;
844
845 for (i = 0; i < 7 && grp_goal > start &&
846 ext3_test_allocatable(grp_goal - 1,
847 bitmap_bh);
848 i++, grp_goal--)
849 ;
850 }
851 }
852 start = grp_goal;
853
854 if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
855 grp_goal, bitmap_bh)) {
856 /*
857 * The block was allocated by another thread, or it was
858 * allocated and then freed by another thread
859 */
860 start++;
861 grp_goal++;
862 if (start >= end)
863 goto fail_access;
864 goto repeat;
865 }
866 num++;
867 grp_goal++;
868 while (num < *count && grp_goal < end
869 && ext3_test_allocatable(grp_goal, bitmap_bh)
870 && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
871 grp_goal, bitmap_bh)) {
872 num++;
873 grp_goal++;
874 }
875 *count = num;
876 return grp_goal - num;
877fail_access:
878 *count = num;
879 return -1;
880}
881
882/**
883 * find_next_reservable_window():
884 * find a reservable space within the given range.
885 * It does not allocate the reservation window for now:
886 * alloc_new_reservation() will do the work later.
887 *
888 * @search_head: the head of the searching list;
889 * This is not necessarily the list head of the whole filesystem
890 *
891 * We have both head and start_block to assist the search
892 * for the reservable space. The list starts from head,
893 * but we will shift to the place where start_block is,
894 * then start from there, when looking for a reservable space.
895 *
896 * @size: the target new reservation window size
897 *
898 * @group_first_block: the first block we consider to start
899 * the real search from
900 *
901 * @last_block:
902 * the maximum block number that our goal reservable space
903 * could start from. This is normally the last block in this
904 * group. The search will end when we found the start of next
905 * possible reservable space is out of this boundary.
906 * This could handle the cross boundary reservation window
907 * request.
908 *
909 * basically we search from the given range, rather than the whole
910 * reservation double linked list, (start_block, last_block)
911 * to find a free region that is of my size and has not
912 * been reserved.
913 *
914 */
915static int find_next_reservable_window(
916 struct ext3_reserve_window_node *search_head,
917 struct ext3_reserve_window_node *my_rsv,
918 struct super_block * sb,
919 ext3_fsblk_t start_block,
920 ext3_fsblk_t last_block)
921{
922 struct rb_node *next;
923 struct ext3_reserve_window_node *rsv, *prev;
924 ext3_fsblk_t cur;
925 int size = my_rsv->rsv_goal_size;
926
927 /* TODO: make the start of the reservation window byte-aligned */
928 /* cur = *start_block & ~7;*/
929 cur = start_block;
930 rsv = search_head;
931 if (!rsv)
932 return -1;
933
934 while (1) {
935 if (cur <= rsv->rsv_end)
936 cur = rsv->rsv_end + 1;
937
938 /* TODO?
939 * in the case we could not find a reservable space
940 * that is what is expected, during the re-search, we could
941 * remember what's the largest reservable space we could have
942 * and return that one.
943 *
944 * For now it will fail if we could not find the reservable
945 * space with expected-size (or more)...
946 */
947 if (cur > last_block)
948 return -1; /* fail */
949
950 prev = rsv;
951 next = rb_next(&rsv->rsv_node);
952 rsv = list_entry(next,struct ext3_reserve_window_node,rsv_node);
953
954 /*
955 * Reached the last reservation, we can just append to the
956 * previous one.
957 */
958 if (!next)
959 break;
960
961 if (cur + size <= rsv->rsv_start) {
962 /*
963 * Found a reserveable space big enough. We could
964 * have a reservation across the group boundary here
965 */
966 break;
967 }
968 }
969 /*
970 * we come here either :
971 * when we reach the end of the whole list,
972 * and there is empty reservable space after last entry in the list.
973 * append it to the end of the list.
974 *
975 * or we found one reservable space in the middle of the list,
976 * return the reservation window that we could append to.
977 * succeed.
978 */
979
980 if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
981 rsv_window_remove(sb, my_rsv);
982
983 /*
984 * Let's book the whole avaliable window for now. We will check the
985 * disk bitmap later and then, if there are free blocks then we adjust
986 * the window size if it's larger than requested.
987 * Otherwise, we will remove this node from the tree next time
988 * call find_next_reservable_window.
989 */
990 my_rsv->rsv_start = cur;
991 my_rsv->rsv_end = cur + size - 1;
992 my_rsv->rsv_alloc_hit = 0;
993
994 if (prev != my_rsv)
995 ext3_rsv_window_add(sb, my_rsv);
996
997 return 0;
998}
999
1000/**
1001 * alloc_new_reservation()--allocate a new reservation window
1002 *
1003 * To make a new reservation, we search part of the filesystem
1004 * reservation list (the list that inside the group). We try to
1005 * allocate a new reservation window near the allocation goal,
1006 * or the beginning of the group, if there is no goal.
1007 *
1008 * We first find a reservable space after the goal, then from
1009 * there, we check the bitmap for the first free block after
1010 * it. If there is no free block until the end of group, then the
1011 * whole group is full, we failed. Otherwise, check if the free
1012 * block is inside the expected reservable space, if so, we
1013 * succeed.
1014 * If the first free block is outside the reservable space, then
1015 * start from the first free block, we search for next available
1016 * space, and go on.
1017 *
1018 * on succeed, a new reservation will be found and inserted into the list
1019 * It contains at least one free block, and it does not overlap with other
1020 * reservation windows.
1021 *
1022 * failed: we failed to find a reservation window in this group
1023 *
1024 * @rsv: the reservation
1025 *
1026 * @grp_goal: The goal (group-relative). It is where the search for a
1027 * free reservable space should start from.
1028 * if we have a grp_goal(grp_goal >0 ), then start from there,
1029 * no grp_goal(grp_goal = -1), we start from the first block
1030 * of the group.
1031 *
1032 * @sb: the super block
1033 * @group: the group we are trying to allocate in
1034 * @bitmap_bh: the block group block bitmap
1035 *
1036 */
1037static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
1038 ext3_grpblk_t grp_goal, struct super_block *sb,
1039 unsigned int group, struct buffer_head *bitmap_bh)
1040{
1041 struct ext3_reserve_window_node *search_head;
1042 ext3_fsblk_t group_first_block, group_end_block, start_block;
1043 ext3_grpblk_t first_free_block;
1044 struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
1045 unsigned long size;
1046 int ret;
1047 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1048
1049 group_first_block = ext3_group_first_block_no(sb, group);
1050 group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1051
1052 if (grp_goal < 0)
1053 start_block = group_first_block;
1054 else
1055 start_block = grp_goal + group_first_block;
1056
1057 size = my_rsv->rsv_goal_size;
1058
1059 if (!rsv_is_empty(&my_rsv->rsv_window)) {
1060 /*
1061 * if the old reservation is cross group boundary
1062 * and if the goal is inside the old reservation window,
1063 * we will come here when we just failed to allocate from
1064 * the first part of the window. We still have another part
1065 * that belongs to the next group. In this case, there is no
1066 * point to discard our window and try to allocate a new one
1067 * in this group(which will fail). we should
1068 * keep the reservation window, just simply move on.
1069 *
1070 * Maybe we could shift the start block of the reservation
1071 * window to the first block of next group.
1072 */
1073
1074 if ((my_rsv->rsv_start <= group_end_block) &&
1075 (my_rsv->rsv_end > group_end_block) &&
1076 (start_block >= my_rsv->rsv_start))
1077 return -1;
1078
1079 if ((my_rsv->rsv_alloc_hit >
1080 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1081 /*
1082 * if the previously allocation hit ratio is
1083 * greater than 1/2, then we double the size of
1084 * the reservation window the next time,
1085 * otherwise we keep the same size window
1086 */
1087 size = size * 2;
1088 if (size > EXT3_MAX_RESERVE_BLOCKS)
1089 size = EXT3_MAX_RESERVE_BLOCKS;
1090 my_rsv->rsv_goal_size= size;
1091 }
1092 }
1093
1094 spin_lock(rsv_lock);
1095 /*
1096 * shift the search start to the window near the goal block
1097 */
1098 search_head = search_reserve_window(fs_rsv_root, start_block);
1099
1100 /*
1101 * find_next_reservable_window() simply finds a reservable window
1102 * inside the given range(start_block, group_end_block).
1103 *
1104 * To make sure the reservation window has a free bit inside it, we
1105 * need to check the bitmap after we found a reservable window.
1106 */
1107retry:
1108 ret = find_next_reservable_window(search_head, my_rsv, sb,
1109 start_block, group_end_block);
1110
1111 if (ret == -1) {
1112 if (!rsv_is_empty(&my_rsv->rsv_window))
1113 rsv_window_remove(sb, my_rsv);
1114 spin_unlock(rsv_lock);
1115 return -1;
1116 }
1117
1118 /*
1119 * On success, find_next_reservable_window() returns the
1120 * reservation window where there is a reservable space after it.
1121 * Before we reserve this reservable space, we need
1122 * to make sure there is at least a free block inside this region.
1123 *
1124 * searching the first free bit on the block bitmap and copy of
1125 * last committed bitmap alternatively, until we found a allocatable
1126 * block. Search start from the start block of the reservable space
1127 * we just found.
1128 */
1129 spin_unlock(rsv_lock);
1130 first_free_block = bitmap_search_next_usable_block(
1131 my_rsv->rsv_start - group_first_block,
1132 bitmap_bh, group_end_block - group_first_block + 1);
1133
1134 if (first_free_block < 0) {
1135 /*
1136 * no free block left on the bitmap, no point
1137 * to reserve the space. return failed.
1138 */
1139 spin_lock(rsv_lock);
1140 if (!rsv_is_empty(&my_rsv->rsv_window))
1141 rsv_window_remove(sb, my_rsv);
1142 spin_unlock(rsv_lock);
1143 return -1; /* failed */
1144 }
1145
1146 start_block = first_free_block + group_first_block;
1147 /*
1148 * check if the first free block is within the
1149 * free space we just reserved
1150 */
1151 if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
1152 return 0; /* success */
1153 /*
1154 * if the first free bit we found is out of the reservable space
1155 * continue search for next reservable space,
1156 * start from where the free block is,
1157 * we also shift the list head to where we stopped last time
1158 */
1159 search_head = my_rsv;
1160 spin_lock(rsv_lock);
1161 goto retry;
1162}
1163
1164/**
1165 * try_to_extend_reservation()
1166 * @my_rsv: given reservation window
1167 * @sb: super block
1168 * @size: the delta to extend
1169 *
1170 * Attempt to expand the reservation window large enough to have
1171 * required number of free blocks
1172 *
1173 * Since ext3_try_to_allocate() will always allocate blocks within
1174 * the reservation window range, if the window size is too small,
1175 * multiple blocks allocation has to stop at the end of the reservation
1176 * window. To make this more efficient, given the total number of
1177 * blocks needed and the current size of the window, we try to
1178 * expand the reservation window size if necessary on a best-effort
1179 * basis before ext3_new_blocks() tries to allocate blocks,
1180 */
1181static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1182 struct super_block *sb, int size)
1183{
1184 struct ext3_reserve_window_node *next_rsv;
1185 struct rb_node *next;
1186 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1187
1188 if (!spin_trylock(rsv_lock))
1189 return;
1190
1191 next = rb_next(&my_rsv->rsv_node);
1192
1193 if (!next)
1194 my_rsv->rsv_end += size;
1195 else {
1196 next_rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node);
1197
1198 if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1199 my_rsv->rsv_end += size;
1200 else
1201 my_rsv->rsv_end = next_rsv->rsv_start - 1;
1202 }
1203 spin_unlock(rsv_lock);
1204}
1205
1206/**
1207 * ext3_try_to_allocate_with_rsv()
1208 * @sb: superblock
1209 * @handle: handle to this transaction
1210 * @group: given allocation block group
1211 * @bitmap_bh: bufferhead holds the block bitmap
1212 * @grp_goal: given target block within the group
1213 * @count: target number of blocks to allocate
1214 * @my_rsv: reservation window
1215 * @errp: pointer to store the error code
1216 *
1217 * This is the main function used to allocate a new block and its reservation
1218 * window.
1219 *
1220 * Each time when a new block allocation is need, first try to allocate from
1221 * its own reservation. If it does not have a reservation window, instead of
1222 * looking for a free bit on bitmap first, then look up the reservation list to
1223 * see if it is inside somebody else's reservation window, we try to allocate a
1224 * reservation window for it starting from the goal first. Then do the block
1225 * allocation within the reservation window.
1226 *
1227 * This will avoid keeping on searching the reservation list again and
1228 * again when somebody is looking for a free block (without
1229 * reservation), and there are lots of free blocks, but they are all
1230 * being reserved.
1231 *
1232 * We use a red-black tree for the per-filesystem reservation list.
1233 *
1234 */
1235static ext3_grpblk_t
1236ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1237 unsigned int group, struct buffer_head *bitmap_bh,
1238 ext3_grpblk_t grp_goal,
1239 struct ext3_reserve_window_node * my_rsv,
1240 unsigned long *count, int *errp)
1241{
1242 ext3_fsblk_t group_first_block, group_last_block;
1243 ext3_grpblk_t ret = 0;
1244 int fatal;
1245 unsigned long num = *count;
1246
1247 *errp = 0;
1248
1249 /*
1250 * Make sure we use undo access for the bitmap, because it is critical
1251 * that we do the frozen_data COW on bitmap buffers in all cases even
1252 * if the buffer is in BJ_Forget state in the committing transaction.
1253 */
1254 BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1255 fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
1256 if (fatal) {
1257 *errp = fatal;
1258 return -1;
1259 }
1260
1261 /*
1262 * we don't deal with reservation when
1263 * filesystem is mounted without reservation
1264 * or the file is not a regular file
1265 * or last attempt to allocate a block with reservation turned on failed
1266 */
1267 if (my_rsv == NULL ) {
1268 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1269 grp_goal, count, NULL);
1270 goto out;
1271 }
1272 /*
1273 * grp_goal is a group relative block number (if there is a goal)
1274 * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
1275 * first block is a filesystem wide block number
1276 * first block is the block number of the first block in this group
1277 */
1278 group_first_block = ext3_group_first_block_no(sb, group);
1279 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1280
1281 /*
1282 * Basically we will allocate a new block from inode's reservation
1283 * window.
1284 *
1285 * We need to allocate a new reservation window, if:
1286 * a) inode does not have a reservation window; or
1287 * b) last attempt to allocate a block from existing reservation
1288 * failed; or
1289 * c) we come here with a goal and with a reservation window
1290 *
1291 * We do not need to allocate a new reservation window if we come here
1292 * at the beginning with a goal and the goal is inside the window, or
1293 * we don't have a goal but already have a reservation window.
1294 * then we could go to allocate from the reservation window directly.
1295 */
1296 while (1) {
1297 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1298 !goal_in_my_reservation(&my_rsv->rsv_window,
1299 grp_goal, group, sb)) {
1300 if (my_rsv->rsv_goal_size < *count)
1301 my_rsv->rsv_goal_size = *count;
1302 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1303 group, bitmap_bh);
1304 if (ret < 0)
1305 break; /* failed */
1306
1307 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1308 grp_goal, group, sb))
1309 grp_goal = -1;
1310 } else if (grp_goal > 0 &&
1311 (my_rsv->rsv_end-grp_goal+1) < *count)
1312 try_to_extend_reservation(my_rsv, sb,
1313 *count-my_rsv->rsv_end + grp_goal - 1);
1314
1315 if ((my_rsv->rsv_start > group_last_block) ||
1316 (my_rsv->rsv_end < group_first_block)) {
1317 rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
1318 BUG();
1319 }
1320 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1321 grp_goal, &num, &my_rsv->rsv_window);
1322 if (ret >= 0) {
1323 my_rsv->rsv_alloc_hit += num;
1324 *count = num;
1325 break; /* succeed */
1326 }
1327 num = *count;
1328 }
1329out:
1330 if (ret >= 0) {
1331 BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1332 "bitmap block");
1333 fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
1334 if (fatal) {
1335 *errp = fatal;
1336 return -1;
1337 }
1338 return ret;
1339 }
1340
1341 BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1342 ext3_journal_release_buffer(handle, bitmap_bh);
1343 return ret;
1344}
1345
1346/**
1347 * ext3_has_free_blocks()
1348 * @sbi: in-core super block structure.
1349 *
1350 * Check if filesystem has at least 1 free block available for allocation.
1351 */
1352static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1353{
1354 ext3_fsblk_t free_blocks, root_blocks;
1355
1356 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1357 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1358 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1359 sbi->s_resuid != current->fsuid &&
1360 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1361 return 0;
1362 }
1363 return 1;
1364}
1365
1366/**
1367 * ext3_should_retry_alloc()
1368 * @sb: super block
1369 * @retries number of attemps has been made
1370 *
1371 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1372 * it is profitable to retry the operation, this function will wait
1373 * for the current or commiting transaction to complete, and then
1374 * return TRUE.
1375 *
1376 * if the total number of retries exceed three times, return FALSE.
1377 */
1378int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1379{
1380 if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
1381 return 0;
1382
1383 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1384
1385 return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
1386}
1387
1388/**
1389 * ext3_new_blocks() -- core block(s) allocation function
1390 * @handle: handle to this transaction
1391 * @inode: file inode
1392 * @goal: given target block(filesystem wide)
1393 * @count: target number of blocks to allocate
1394 * @errp: error code
1395 *
1396 * ext3_new_blocks uses a goal block to assist allocation. It tries to
1397 * allocate block(s) from the block group contains the goal block first. If that
1398 * fails, it will try to allocate block(s) from other block groups without
1399 * any specific goal block.
1400 *
1401 */
1402ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1403 ext3_fsblk_t goal, unsigned long *count, int *errp)
1404{
1405 struct buffer_head *bitmap_bh = NULL;
1406 struct buffer_head *gdp_bh;
1407 int group_no;
1408 int goal_group;
1409 ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1410 ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1411 ext3_fsblk_t ret_block; /* filesyetem-wide allocated block */
1412 int bgi; /* blockgroup iteration index */
1413 int fatal = 0, err;
1414 int performed_allocation = 0;
1415 ext3_grpblk_t free_blocks; /* number of free blocks in a group */
1416 struct super_block *sb;
1417 struct ext3_group_desc *gdp;
1418 struct ext3_super_block *es;
1419 struct ext3_sb_info *sbi;
1420 struct ext3_reserve_window_node *my_rsv = NULL;
1421 struct ext3_block_alloc_info *block_i;
1422 unsigned short windowsz = 0;
1423#ifdef EXT3FS_DEBUG
1424 static int goal_hits, goal_attempts;
1425#endif
1426 unsigned long ngroups;
1427 unsigned long num = *count;
1428
1429 *errp = -ENOSPC;
1430 sb = inode->i_sb;
1431 if (!sb) {
1432 printk("ext3_new_block: nonexistent device");
1433 return 0;
1434 }
1435
1436 /*
1437 * Check quota for allocation of this block.
1438 */
1439 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1440 *errp = -EDQUOT;
1441 return 0;
1442 }
1443
1444 sbi = EXT3_SB(sb);
1445 es = EXT3_SB(sb)->s_es;
1446 ext3_debug("goal=%lu.\n", goal);
1447 /*
1448 * Allocate a block from reservation only when
1449 * filesystem is mounted with reservation(default,-o reservation), and
1450 * it's a regular file, and
1451 * the desired window size is greater than 0 (One could use ioctl
1452 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
1453 * reservation on that particular file)
1454 */
1455 block_i = EXT3_I(inode)->i_block_alloc_info;
1456 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1457 my_rsv = &block_i->rsv_window_node;
1458
1459 if (!ext3_has_free_blocks(sbi)) {
1460 *errp = -ENOSPC;
1461 goto out;
1462 }
1463
1464 /*
1465 * First, test whether the goal block is free.
1466 */
1467 if (goal < le32_to_cpu(es->s_first_data_block) ||
1468 goal >= le32_to_cpu(es->s_blocks_count))
1469 goal = le32_to_cpu(es->s_first_data_block);
1470 group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
1471 EXT3_BLOCKS_PER_GROUP(sb);
1472 goal_group = group_no;
1473retry_alloc:
1474 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1475 if (!gdp)
1476 goto io_error;
1477
1478 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1479 /*
1480 * if there is not enough free blocks to make a new resevation
1481 * turn off reservation for this allocation
1482 */
1483 if (my_rsv && (free_blocks < windowsz)
1484 && (rsv_is_empty(&my_rsv->rsv_window)))
1485 my_rsv = NULL;
1486
1487 if (free_blocks > 0) {
1488 grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
1489 EXT3_BLOCKS_PER_GROUP(sb));
1490 bitmap_bh = read_block_bitmap(sb, group_no);
1491 if (!bitmap_bh)
1492 goto io_error;
1493 grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1494 group_no, bitmap_bh, grp_target_blk,
1495 my_rsv, &num, &fatal);
1496 if (fatal)
1497 goto out;
1498 if (grp_alloc_blk >= 0)
1499 goto allocated;
1500 }
1501
1502 ngroups = EXT3_SB(sb)->s_groups_count;
1503 smp_rmb();
1504
1505 /*
1506 * Now search the rest of the groups. We assume that
1507 * i and gdp correctly point to the last group visited.
1508 */
1509 for (bgi = 0; bgi < ngroups; bgi++) {
1510 group_no++;
1511 if (group_no >= ngroups)
1512 group_no = 0;
1513 gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1514 if (!gdp) {
1515 *errp = -EIO;
1516 goto out;
1517 }
1518 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1519 /*
1520 * skip this group if the number of
1521 * free blocks is less than half of the reservation
1522 * window size.
1523 */
1524 if (free_blocks <= (windowsz/2))
1525 continue;
1526
1527 brelse(bitmap_bh);
1528 bitmap_bh = read_block_bitmap(sb, group_no);
1529 if (!bitmap_bh)
1530 goto io_error;
1531 /*
1532 * try to allocate block(s) from this group, without a goal(-1).
1533 */
1534 grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1535 group_no, bitmap_bh, -1, my_rsv,
1536 &num, &fatal);
1537 if (fatal)
1538 goto out;
1539 if (grp_alloc_blk >= 0)
1540 goto allocated;
1541 }
1542 /*
1543 * We may end up a bogus ealier ENOSPC error due to
1544 * filesystem is "full" of reservations, but
1545 * there maybe indeed free blocks avaliable on disk
1546 * In this case, we just forget about the reservations
1547 * just do block allocation as without reservations.
1548 */
1549 if (my_rsv) {
1550 my_rsv = NULL;
1551 group_no = goal_group;
1552 goto retry_alloc;
1553 }
1554 /* No space left on the device */
1555 *errp = -ENOSPC;
1556 goto out;
1557
1558allocated:
1559
1560 ext3_debug("using block group %d(%d)\n",
1561 group_no, gdp->bg_free_blocks_count);
1562
1563 BUFFER_TRACE(gdp_bh, "get_write_access");
1564 fatal = ext3_journal_get_write_access(handle, gdp_bh);
1565 if (fatal)
1566 goto out;
1567
1568 ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
1569
1570 if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
1571 in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
1572 in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
1573 EXT3_SB(sb)->s_itb_per_group) ||
1574 in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
1575 EXT3_SB(sb)->s_itb_per_group))
1576 ext3_error(sb, "ext3_new_block",
1577 "Allocating block in system zone - "
1578 "blocks from "E3FSBLK", length %lu",
1579 ret_block, num);
1580
1581 performed_allocation = 1;
1582
1583#ifdef CONFIG_JBD_DEBUG
1584 {
1585 struct buffer_head *debug_bh;
1586
1587 /* Record bitmap buffer state in the newly allocated block */
1588 debug_bh = sb_find_get_block(sb, ret_block);
1589 if (debug_bh) {
1590 BUFFER_TRACE(debug_bh, "state when allocated");
1591 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1592 brelse(debug_bh);
1593 }
1594 }
1595 jbd_lock_bh_state(bitmap_bh);
1596 spin_lock(sb_bgl_lock(sbi, group_no));
1597 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1598 int i;
1599
1600 for (i = 0; i < num; i++) {
1601 if (ext3_test_bit(grp_alloc_blk+i,
1602 bh2jh(bitmap_bh)->b_committed_data)) {
1603 printk("%s: block was unexpectedly set in "
1604 "b_committed_data\n", __FUNCTION__);
1605 }
1606 }
1607 }
1608 ext3_debug("found bit %d\n", grp_alloc_blk);
1609 spin_unlock(sb_bgl_lock(sbi, group_no));
1610 jbd_unlock_bh_state(bitmap_bh);
1611#endif
1612
1613 if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
1614 ext3_error(sb, "ext3_new_block",
1615 "block("E3FSBLK") >= blocks count(%d) - "
1616 "block_group = %d, es == %p ", ret_block,
1617 le32_to_cpu(es->s_blocks_count), group_no, es);
1618 goto out;
1619 }
1620
1621 /*
1622 * It is up to the caller to add the new buffer to a journal
1623 * list of some description. We don't know in advance whether
1624 * the caller wants to use it as metadata or data.
1625 */
1626 ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
1627 ret_block, goal_hits, goal_attempts);
1628
1629 spin_lock(sb_bgl_lock(sbi, group_no));
1630 gdp->bg_free_blocks_count =
1631 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1632 spin_unlock(sb_bgl_lock(sbi, group_no));
1633 percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
1634
1635 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1636 err = ext3_journal_dirty_metadata(handle, gdp_bh);
1637 if (!fatal)
1638 fatal = err;
1639
1640 sb->s_dirt = 1;
1641 if (fatal)
1642 goto out;
1643
1644 *errp = 0;
1645 brelse(bitmap_bh);
1646 DQUOT_FREE_BLOCK(inode, *count-num);
1647 *count = num;
1648 return ret_block;
1649
1650io_error:
1651 *errp = -EIO;
1652out:
1653 if (fatal) {
1654 *errp = fatal;
1655 ext3_std_error(sb, fatal);
1656 }
1657 /*
1658 * Undo the block allocation
1659 */
1660 if (!performed_allocation)
1661 DQUOT_FREE_BLOCK(inode, *count);
1662 brelse(bitmap_bh);
1663 return 0;
1664}
1665
1666ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
1667 ext3_fsblk_t goal, int *errp)
1668{
1669 unsigned long count = 1;
1670
1671 return ext3_new_blocks(handle, inode, goal, &count, errp);
1672}
1673
1674/**
1675 * ext3_count_free_blocks() -- count filesystem free blocks
1676 * @sb: superblock
1677 *
1678 * Adds up the number of free blocks from each block group.
1679 */
1680ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
1681{
1682 ext3_fsblk_t desc_count;
1683 struct ext3_group_desc *gdp;
1684 int i;
1685 unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
1686#ifdef EXT3FS_DEBUG
1687 struct ext3_super_block *es;
1688 ext3_fsblk_t bitmap_count;
1689 unsigned long x;
1690 struct buffer_head *bitmap_bh = NULL;
1691
1692 es = EXT3_SB(sb)->s_es;
1693 desc_count = 0;
1694 bitmap_count = 0;
1695 gdp = NULL;
1696
1697 smp_rmb();
1698 for (i = 0; i < ngroups; i++) {
1699 gdp = ext3_get_group_desc(sb, i, NULL);
1700 if (!gdp)
1701 continue;
1702 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1703 brelse(bitmap_bh);
1704 bitmap_bh = read_block_bitmap(sb, i);
1705 if (bitmap_bh == NULL)
1706 continue;
1707
1708 x = ext3_count_free(bitmap_bh, sb->s_blocksize);
1709 printk("group %d: stored = %d, counted = %lu\n",
1710 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1711 bitmap_count += x;
1712 }
1713 brelse(bitmap_bh);
1714 printk("ext3_count_free_blocks: stored = "E3FSBLK
1715 ", computed = "E3FSBLK", "E3FSBLK"\n",
1716 le32_to_cpu(es->s_free_blocks_count),
1717 desc_count, bitmap_count);
1718 return bitmap_count;
1719#else
1720 desc_count = 0;
1721 smp_rmb();
1722 for (i = 0; i < ngroups; i++) {
1723 gdp = ext3_get_group_desc(sb, i, NULL);
1724 if (!gdp)
1725 continue;
1726 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1727 }
1728
1729 return desc_count;
1730#endif
1731}
1732
1733static inline int
1734block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
1735{
1736 return ext3_test_bit ((block -
1737 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
1738 EXT3_BLOCKS_PER_GROUP(sb), map);
1739}
1740
/*
 * test_root() - is @a an integral power (exponent >= 1) of @b?
 *
 * Returns 1 when a == b^k for some k >= 1, otherwise 0.
 *
 * The check divides @a down instead of multiplying a running power up:
 * multiplying overflows a signed int (undefined behaviour) when @a lies
 * above the largest power of @b that fits in an int.
 */
static inline int test_root(int a, int b)
{
	/* Degenerate bases cannot be divided down; only a == b matches. */
	if (b < 2)
		return a == b;

	while (1) {
		if (a < b)
			return 0;
		if (a == b)
			return 1;
		if (a % b != 0)
			return 0;
		a /= b;
	}
}
1749
/*
 * Returns 1 for groups 0 and 1 and for odd groups that are a power of
 * 3, 5 or 7 (as decided by test_root()); returns 0 for every other group.
 */
static int ext3_group_sparse(int group)
{
	if (group <= 1)
		return 1;

	/* Even groups above 1 can never be a power of 3, 5 or 7. */
	if ((group & 1) == 0)
		return 0;

	if (test_root(group, 7))
		return 1;
	if (test_root(group, 5))
		return 1;
	return test_root(group, 3) ? 1 : 0;
}
1759
1760/**
1761 * ext3_bg_has_super - number of blocks used by the superblock in group
1762 * @sb: superblock for filesystem
1763 * @group: group number to check
1764 *
1765 * Return the number of blocks used by the superblock (primary or backup)
1766 * in this group. Currently this will be only 0 or 1.
1767 */
1768int ext3_bg_has_super(struct super_block *sb, int group)
1769{
1770 if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
1771 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1772 !ext3_group_sparse(group))
1773 return 0;
1774 return 1;
1775}
1776
/*
 * Returns 1 when @group is the first, second or last group of the run of
 * EXT3_DESC_PER_BLOCK(sb) consecutive groups it falls in, otherwise 0.
 */
static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
{
	unsigned long meta_idx = group / EXT3_DESC_PER_BLOCK(sb);
	unsigned long first_grp = meta_idx * EXT3_DESC_PER_BLOCK(sb);
	unsigned long last_grp = first_grp + EXT3_DESC_PER_BLOCK(sb) - 1;

	return (group == first_grp || group == first_grp + 1 ||
		group == last_grp) ? 1 : 0;
}
1787
1788static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
1789{
1790 if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
1791 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1792 !ext3_group_sparse(group))
1793 return 0;
1794 return EXT3_SB(sb)->s_gdb_count;
1795}
1796
1797/**
1798 * ext3_bg_num_gdb - number of blocks used by the group table in group
1799 * @sb: superblock for filesystem
1800 * @group: group number to check
1801 *
1802 * Return the number of blocks used by the group descriptor table
1803 * (primary or backup) in this group. In the future there may be a
1804 * different number of descriptor blocks in each group.
1805 */
1806unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1807{
1808 unsigned long first_meta_bg =
1809 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
1810 unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1811
1812 if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
1813 metagroup < first_meta_bg)
1814 return ext3_bg_num_gdb_nometa(sb,group);
1815
1816 return ext3_bg_num_gdb_meta(sb,group);
1817
1818}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 000000000000..b9176eed98d1
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,32 @@
1/*
2 * linux/fs/ext3/bitmap.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/buffer_head.h>
11#include <linux/jbd.h>
12#include <linux/ext3_fs.h>
13
14#ifdef EXT3FS_DEBUG
15
16static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17
18unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
19{
20 unsigned int i;
21 unsigned long sum = 0;
22
23 if (!map)
24 return (0);
25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum);
29}
30
31#endif /* EXT3FS_DEBUG */
32
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 000000000000..d0b54f30b914
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,518 @@
1/*
2 * linux/fs/ext3/dir.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/dir.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 directory handling functions
16 *
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 *
20 * Hash Tree Directory indexing (c) 2001 Daniel Phillips
21 *
22 */
23
24#include <linux/fs.h>
25#include <linux/jbd.h>
26#include <linux/ext3_fs.h>
27#include <linux/buffer_head.h>
28#include <linux/smp_lock.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
/*
 * DT_* values handed to filldir, indexed by the file_type value that
 * get_dtype() receives (it only indexes here when that value is below
 * EXT3_FT_MAX).
 */
32static unsigned char ext3_filetype_table[] = {
33 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
34};
35
36static int ext3_readdir(struct file *, void *, filldir_t);
37static int ext3_dx_readdir(struct file * filp,
38 void * dirent, filldir_t filldir);
39static int ext3_release_dir (struct inode * inode,
40 struct file * filp);
41
/*
 * File operations table for ext3 directories.  The compat ioctl entry is
 * present only with CONFIG_COMPAT, and the release hook (which frees the
 * per-open htree readdir state) only with CONFIG_EXT3_INDEX.
 */
42const struct file_operations ext3_dir_operations = {
43 .llseek = generic_file_llseek,
44 .read = generic_read_dir,
45 .readdir = ext3_readdir, /* we take BKL. needed?*/
46 .ioctl = ext3_ioctl, /* BKL held */
47#ifdef CONFIG_COMPAT
48 .compat_ioctl = ext3_compat_ioctl,
49#endif
50 .fsync = ext3_sync_file, /* BKL held */
51#ifdef CONFIG_EXT3_INDEX
52 .release = ext3_release_dir,
53#endif
54};
55
56
57static unsigned char get_dtype(struct super_block *sb, int filetype)
58{
59 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
60 (filetype >= EXT3_FT_MAX))
61 return DT_UNKNOWN;
62
63 return (ext3_filetype_table[filetype]);
64}
65
66
67int ext3_check_dir_entry (const char * function, struct inode * dir,
68 struct ext3_dir_entry_2 * de,
69 struct buffer_head * bh,
70 unsigned long offset)
71{
72 const char * error_msg = NULL;
73 const int rlen = le16_to_cpu(de->rec_len);
74
75 if (rlen < EXT3_DIR_REC_LEN(1))
76 error_msg = "rec_len is smaller than minimal";
77 else if (rlen % 4 != 0)
78 error_msg = "rec_len % 4 != 0";
79 else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
80 error_msg = "rec_len is too small for name_len";
81 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
82 error_msg = "directory entry across blocks";
83 else if (le32_to_cpu(de->inode) >
84 le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
85 error_msg = "inode out of bounds";
86
87 if (error_msg != NULL)
88 ext3_error (dir->i_sb, function,
89 "bad entry in directory #%lu: %s - "
90 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
91 dir->i_ino, error_msg, offset,
92 (unsigned long) le32_to_cpu(de->inode),
93 rlen, de->name_len);
94 return error_msg == NULL ? 1 : 0;
95}
96
/*
 * ext3_readdir() - readdir for ext3 directories.
 *
 * With CONFIG_EXT3_INDEX, directories on a DIR_INDEX filesystem that are
 * flagged EXT3_INDEX_FL (or are exactly one block long) are first handed
 * to ext3_dx_readdir(); only when that reports ERR_BAD_DX_DIR does the
 * function clear the in-core index flag and fall back to the linear scan
 * below.
 *
 * The linear scan walks the directory one block at a time starting at
 * filp->f_pos, passing every entry with a non-zero inode number to
 * @filldir and advancing f_pos by each entry's rec_len.  The outer loop
 * stops after the first block from which at least one entry was stored.
 *
 * Returns the result of ext3_dx_readdir() on the indexed path; on the
 * linear path returns 0, except that hitting a corrupt entry returns the
 * number of entries stored so far.  An error from @filldir ends the scan
 * but is not returned.
 */
97static int ext3_readdir(struct file * filp,
98 void * dirent, filldir_t filldir)
99{
100 int error = 0;
101 unsigned long offset;
102 int i, stored;
103 struct ext3_dir_entry_2 *de;
104 struct super_block *sb;
105 int err;
106 struct inode *inode = filp->f_dentry->d_inode;
107 int ret = 0;
108
109 sb = inode->i_sb;
110
111#ifdef CONFIG_EXT3_INDEX
112 if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT3_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext3_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) {
118 ret = err;
119 goto out;
120 }
121 /*
122 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk.
124 */
125 EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
126 }
127#endif
128 stored = 0;
/* Byte offset of f_pos within its directory block. */
129 offset = filp->f_pos & (sb->s_blocksize - 1);
130
131 while (!error && !stored && filp->f_pos < inode->i_size) {
132 unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
133 struct buffer_head map_bh;
134 struct buffer_head *bh = NULL;
135
136 map_bh.b_state = 0;
137 err = ext3_get_blocks_handle(NULL, inode, blk, 1,
138 &map_bh, 0, 0);
/* Block is mapped: start readahead, then read it. */
139 if (err > 0) {
140 page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
141 &filp->f_ra,
142 filp,
143 map_bh.b_blocknr >>
144 (PAGE_CACHE_SHIFT - inode->i_blkbits),
145 1);
146 bh = ext3_bread(NULL, inode, blk, 0, &err);
147 }
148
149 /*
150 * We ignore I/O errors on directories so users have a chance
151 * of recovering data when there's a bad sector
152 */
/* No buffer: report it and skip f_pos to the start of the next block. */
153 if (!bh) {
154 ext3_error (sb, "ext3_readdir",
155 "directory #%lu contains a hole at offset %lu",
156 inode->i_ino, (unsigned long)filp->f_pos);
157 filp->f_pos += sb->s_blocksize - offset;
158 continue;
159 }
160
161revalidate:
162 /* If the dir block has changed since the last call to
163 * readdir(2), then we might be pointing to an invalid
164 * dirent right now. Scan from the start of the block
165 * to make sure. */
166 if (filp->f_version != inode->i_version) {
167 for (i = 0; i < sb->s_blocksize && i < offset; ) {
168 de = (struct ext3_dir_entry_2 *)
169 (bh->b_data + i);
170 /* It's too expensive to do a full
171 * dirent test each time round this
172 * loop, but we do have to test at
173 * least that it is non-zero. A
174 * failure will be detected in the
175 * dirent test below. */
176 if (le16_to_cpu(de->rec_len) <
177 EXT3_DIR_REC_LEN(1))
178 break;
179 i += le16_to_cpu(de->rec_len);
180 }
/* Resume at the entry boundary the scan stopped on. */
181 offset = i;
182 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
183 | offset;
184 filp->f_version = inode->i_version;
185 }
186
187 while (!error && filp->f_pos < inode->i_size
188 && offset < sb->s_blocksize) {
189 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
190 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
191 bh, offset)) {
192 /* On error, skip the f_pos to the
193 next block. */
194 filp->f_pos = (filp->f_pos |
195 (sb->s_blocksize - 1)) + 1;
196 brelse (bh);
197 ret = stored;
198 goto out;
199 }
200 offset += le16_to_cpu(de->rec_len);
/* Entries with inode 0 are skipped but f_pos still advances past them. */
201 if (le32_to_cpu(de->inode)) {
202 /* We might block in the next section
203 * if the data destination is
204 * currently swapped out. So, use a
205 * version stamp to detect whether or
206 * not the directory has been modified
207 * during the copy operation.
208 */
209 unsigned long version = filp->f_version;
210
211 error = filldir(dirent, de->name,
212 de->name_len,
213 filp->f_pos,
214 le32_to_cpu(de->inode),
215 get_dtype(sb, de->file_type));
216 if (error)
217 break;
218 if (version != filp->f_version)
219 goto revalidate;
220 stored ++;
221 }
222 filp->f_pos += le16_to_cpu(de->rec_len);
223 }
/* Later blocks are scanned from their beginning. */
224 offset = 0;
225 brelse (bh);
226 }
227out:
228 return ret;
229}
230
231#ifdef CONFIG_EXT3_INDEX
/*
 * These macros convert between the major/minor hash and an f_pos
 * value.
 *
 * Currently we only use the major hash number.  This is unfortunate, but
 * on 32-bit machines, the same VFS interface is used for lseek and
 * llseek, so if we use the 64 bit offset, then the 32-bit versions of
 * lseek/telldir/seekdir will blow out spectacularly, and from within
 * the ext2 low-level routine, we don't know if we're being called by
 * a 64-bit version of the system call or the 32-bit version of the
 * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
 * cookie.  Sigh.
 *
 * Every macro argument is parenthesized so that an argument containing
 * a lower-precedence operator (e.g. "a | b") is evaluated as a unit.
 * The minor hash is ignored by hash2pos() and always reads back as 0.
 */
#define hash2pos(major, minor)	((major) >> 1)
#define pos2maj_hash(pos)	(((pos) << 1) & 0xffffffff)
#define pos2min_hash(pos)	(0)
248
249/*
250 * This structure holds the nodes of the red-black tree used to store
251 * the directory entries in hash order.  Entries whose hash and
   * minor_hash are both equal share one tree node and are chained
   * through ->next (see ext3_htree_store_dirent()).
252 */
253struct fname {
254 __u32 hash; /* primary tree sort key */
255 __u32 minor_hash; /* secondary tree sort key */
256 struct rb_node rb_hash; /* linkage into the per-open rb tree */
257 struct fname *next; /* next entry with identical hash pair */
258 __u32 inode; /* inode number, stored in CPU byte order */
259 __u8 name_len; /* length of name[], excluding the NUL */
260 __u8 file_type; /* file_type byte copied from the dirent */
261 char name[0]; /* NUL-terminated copy of the entry name */
262};
263
264/*
265 * This functoin implements a non-recursive way of freeing all of the
266 * nodes in the red-black tree.
267 */
268static void free_rb_tree_fname(struct rb_root *root)
269{
270 struct rb_node *n = root->rb_node;
271 struct rb_node *parent;
272 struct fname *fname;
273
274 while (n) {
275 /* Do the node's children first */
276 if ((n)->rb_left) {
277 n = n->rb_left;
278 continue;
279 }
280 if (n->rb_right) {
281 n = n->rb_right;
282 continue;
283 }
284 /*
285 * The node has no children; free it, and then zero
286 * out parent's link to it. Finally go to the
287 * beginning of the loop and try to free the parent
288 * node.
289 */
290 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) {
293 struct fname * old = fname;
294 fname = fname->next;
295 kfree (old);
296 }
297 if (!parent)
298 root->rb_node = NULL;
299 else if (parent->rb_left == n)
300 parent->rb_left = NULL;
301 else if (parent->rb_right == n)
302 parent->rb_right = NULL;
303 n = parent;
304 }
305 root->rb_node = NULL;
306}
307
308
309static struct dir_private_info *create_dir_info(loff_t pos)
310{
311 struct dir_private_info *p;
312
313 p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
314 if (!p)
315 return NULL;
316 p->root.rb_node = NULL;
317 p->curr_node = NULL;
318 p->extra_fname = NULL;
319 p->last_pos = 0;
320 p->curr_hash = pos2maj_hash(pos);
321 p->curr_minor_hash = pos2min_hash(pos);
322 p->next_hash = 0;
323 return p;
324}
325
326void ext3_htree_free_dir_info(struct dir_private_info *p)
327{
328 free_rb_tree_fname(&p->root);
329 kfree(p);
330}
331
332/*
333 * Given a directory entry, enter it into the fname rb tree.
334 */
335int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
336 __u32 minor_hash,
337 struct ext3_dir_entry_2 *dirent)
338{
339 struct rb_node **p, *parent = NULL;
340 struct fname * fname, *new_fn;
341 struct dir_private_info *info;
342 int len;
343
344 info = (struct dir_private_info *) dir_file->private_data;
345 p = &info->root.rb_node;
346
347 /* Create and allocate the fname structure */
348 len = sizeof(struct fname) + dirent->name_len + 1;
349 new_fn = kzalloc(len, GFP_KERNEL);
350 if (!new_fn)
351 return -ENOMEM;
352 new_fn->hash = hash;
353 new_fn->minor_hash = minor_hash;
354 new_fn->inode = le32_to_cpu(dirent->inode);
355 new_fn->name_len = dirent->name_len;
356 new_fn->file_type = dirent->file_type;
357 memcpy(new_fn->name, dirent->name, dirent->name_len);
358 new_fn->name[dirent->name_len] = 0;
359
360 while (*p) {
361 parent = *p;
362 fname = rb_entry(parent, struct fname, rb_hash);
363
364 /*
365 * If the hash and minor hash match up, then we put
366 * them on a linked list. This rarely happens...
367 */
368 if ((new_fn->hash == fname->hash) &&
369 (new_fn->minor_hash == fname->minor_hash)) {
370 new_fn->next = fname->next;
371 fname->next = new_fn;
372 return 0;
373 }
374
375 if (new_fn->hash < fname->hash)
376 p = &(*p)->rb_left;
377 else if (new_fn->hash > fname->hash)
378 p = &(*p)->rb_right;
379 else if (new_fn->minor_hash < fname->minor_hash)
380 p = &(*p)->rb_left;
381 else /* if (new_fn->minor_hash > fname->minor_hash) */
382 p = &(*p)->rb_right;
383 }
384
385 rb_link_node(&new_fn->rb_hash, parent, p);
386 rb_insert_color(&new_fn->rb_hash, &info->root);
387 return 0;
388}
389
390
391
392/*
393 * This is a helper function for ext3_dx_readdir. It calls filldir
394 * for all entres on the fname linked list. (Normally there is only
395 * one entry on the linked list, unless there are 62 bit hash collisions.)
396 */
397static int call_filldir(struct file * filp, void * dirent,
398 filldir_t filldir, struct fname *fname)
399{
400 struct dir_private_info *info = filp->private_data;
401 loff_t curr_pos;
402 struct inode *inode = filp->f_dentry->d_inode;
403 struct super_block * sb;
404 int error;
405
406 sb = inode->i_sb;
407
408 if (!fname) {
409 printk("call_filldir: called with null fname?!?\n");
410 return 0;
411 }
412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
413 while (fname) {
414 error = filldir(dirent, fname->name,
415 fname->name_len, curr_pos,
416 fname->inode,
417 get_dtype(sb, fname->file_type));
418 if (error) {
419 filp->f_pos = curr_pos;
420 info->extra_fname = fname->next;
421 return error;
422 }
423 fname = fname->next;
424 }
425 return 0;
426}
427
428static int ext3_dx_readdir(struct file * filp,
429 void * dirent, filldir_t filldir)
430{
431 struct dir_private_info *info = filp->private_data;
432 struct inode *inode = filp->f_dentry->d_inode;
433 struct fname *fname;
434 int ret;
435
436 if (!info) {
437 info = create_dir_info(filp->f_pos);
438 if (!info)
439 return -ENOMEM;
440 filp->private_data = info;
441 }
442
443 if (filp->f_pos == EXT3_HTREE_EOF)
444 return 0; /* EOF */
445
446 /* Some one has messed with f_pos; reset the world */
447 if (info->last_pos != filp->f_pos) {
448 free_rb_tree_fname(&info->root);
449 info->curr_node = NULL;
450 info->extra_fname = NULL;
451 info->curr_hash = pos2maj_hash(filp->f_pos);
452 info->curr_minor_hash = pos2min_hash(filp->f_pos);
453 }
454
455 /*
456 * If there are any leftover names on the hash collision
457 * chain, return them first.
458 */
459 if (info->extra_fname &&
460 call_filldir(filp, dirent, filldir, info->extra_fname))
461 goto finished;
462
463 if (!info->curr_node)
464 info->curr_node = rb_first(&info->root);
465
466 while (1) {
467 /*
468 * Fill the rbtree if we have no more entries,
469 * or the inode has changed since we last read in the
470 * cached entries.
471 */
472 if ((!info->curr_node) ||
473 (filp->f_version != inode->i_version)) {
474 info->curr_node = NULL;
475 free_rb_tree_fname(&info->root);
476 filp->f_version = inode->i_version;
477 ret = ext3_htree_fill_tree(filp, info->curr_hash,
478 info->curr_minor_hash,
479 &info->next_hash);
480 if (ret < 0)
481 return ret;
482 if (ret == 0) {
483 filp->f_pos = EXT3_HTREE_EOF;
484 break;
485 }
486 info->curr_node = rb_first(&info->root);
487 }
488
489 fname = rb_entry(info->curr_node, struct fname, rb_hash);
490 info->curr_hash = fname->hash;
491 info->curr_minor_hash = fname->minor_hash;
492 if (call_filldir(filp, dirent, filldir, fname))
493 break;
494
495 info->curr_node = rb_next(info->curr_node);
496 if (!info->curr_node) {
497 if (info->next_hash == ~0) {
498 filp->f_pos = EXT3_HTREE_EOF;
499 break;
500 }
501 info->curr_hash = info->next_hash;
502 info->curr_minor_hash = 0;
503 }
504 }
505finished:
506 info->last_pos = filp->f_pos;
507 return 0;
508}
509
510static int ext3_release_dir (struct inode * inode, struct file * filp)
511{
512 if (filp->private_data)
513 ext3_htree_free_dir_info(filp->private_data);
514
515 return 0;
516}
517
518#endif
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 000000000000..e96c388047e0
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,139 @@
1/*
2 * linux/fs/ext3/file.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/file.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * ext3 fs regular file handling primitives
16 *
17 * 64-bit file support on 64-bit platforms by Jakub Jelinek
18 * (jj@sunsite.ms.mff.cuni.cz)
19 */
20
21#include <linux/time.h>
22#include <linux/fs.h>
23#include <linux/jbd.h>
24#include <linux/ext3_fs.h>
25#include <linux/ext3_jbd.h>
26#include "xattr.h"
27#include "acl.h"
28
29/*
30 * Called when an inode is released. Note that this is different
31 * from ext3_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed.
33 */
34static int ext3_release_file (struct inode * inode, struct file * filp)
35{
36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1))
39 {
40 mutex_lock(&EXT3_I(inode)->truncate_mutex);
41 ext3_discard_reservation(inode);
42 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
43 }
44 if (is_dx(inode) && filp->private_data)
45 ext3_htree_free_dir_info(filp->private_data);
46
47 return 0;
48}
49
50static ssize_t
51ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
52 unsigned long nr_segs, loff_t pos)
53{
54 struct file *file = iocb->ki_filp;
55 struct inode *inode = file->f_dentry->d_inode;
56 ssize_t ret;
57 int err;
58
59 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
60
61 /*
62 * Skip flushing if there was an error, or if nothing was written.
63 */
64 if (ret <= 0)
65 return ret;
66
67 /*
68 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
69 * journalling then we need to make sure that we force the transaction
70 * to disk to keep all metadata uptodate synchronously.
71 */
72 if (file->f_flags & O_SYNC) {
73 /*
74 * If we are non-data-journaled, then the dirty data has
75 * already been flushed to backing store by generic_osync_inode,
76 * and the inode has been flushed too if there have been any
77 * modifications other than mere timestamp updates.
78 *
79 * Open question --- do we care about flushing timestamps too
80 * if the inode is IS_SYNC?
81 */
82 if (!ext3_should_journal_data(inode))
83 return ret;
84
85 goto force_commit;
86 }
87
88 /*
89 * So we know that there has been no forced data flush. If the inode
90 * is marked IS_SYNC, we need to force one ourselves.
91 */
92 if (!IS_SYNC(inode))
93 return ret;
94
95 /*
96 * Open question #2 --- should we force data to disk here too? If we
97 * don't, the only impact is that data=writeback filesystems won't
98 * flush data to disk automatically on IS_SYNC, only metadata (but
99 * historically, that is what ext2 has done.)
100 */
101
102force_commit:
103 err = ext3_force_commit(inode->i_sb);
104 if (err)
105 return err;
106 return ret;
107}
108
/*
 * File operations table for ext3 regular files.  Only aio_write, ioctl,
 * compat_ioctl, release and fsync point at ext3 functions; every other
 * entry uses a generic_* or do_sync_* helper.
 */
109const struct file_operations ext3_file_operations = {
110 .llseek = generic_file_llseek,
111 .read = do_sync_read,
112 .write = do_sync_write,
113 .aio_read = generic_file_aio_read,
114 .aio_write = ext3_file_write,
115 .ioctl = ext3_ioctl,
116#ifdef CONFIG_COMPAT
117 .compat_ioctl = ext3_compat_ioctl,
118#endif
119 .mmap = generic_file_mmap,
120 .open = generic_file_open,
121 .release = ext3_release_file,
122 .fsync = ext3_sync_file,
123 .sendfile = generic_file_sendfile,
124 .splice_read = generic_file_splice_read,
125 .splice_write = generic_file_splice_write,
126};
127
/*
 * Inode operations table for ext3 regular files.  The extended-attribute
 * entries are compiled in only with CONFIG_EXT3_FS_XATTR.
 */
128struct inode_operations ext3_file_inode_operations = {
129 .truncate = ext3_truncate,
130 .setattr = ext3_setattr,
131#ifdef CONFIG_EXT3_FS_XATTR
132 .setxattr = generic_setxattr,
133 .getxattr = generic_getxattr,
134 .listxattr = ext3_listxattr,
135 .removexattr = generic_removexattr,
136#endif
137 .permission = ext3_permission,
138};
139
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 000000000000..dd1fd3c0fc05
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,88 @@
1/*
2 * linux/fs/ext3/fsync.c
3 *
4 * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
5 * from
6 * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
7 * Laboratoire MASI - Institut Blaise Pascal
8 * Universite Pierre et Marie Curie (Paris VI)
9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 *
12 * ext3fs fsync primitive
13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 *
17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s.
19 * Andi Kleen, 1997
20 *
21 * Major simplifications and cleanup - we only need to do the metadata, because
22 * we can depend on generic_block_fdatasync() to sync the data blocks.
23 */
24
25#include <linux/time.h>
26#include <linux/fs.h>
27#include <linux/sched.h>
28#include <linux/writeback.h>
29#include <linux/jbd.h>
30#include <linux/ext3_fs.h>
31#include <linux/ext3_jbd.h>
32
33/*
34 * akpm: A new design for ext3_sync_file().
35 *
36 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
37 * There cannot be a transaction open by this task.
38 * Another task could have dirtied this inode. Its data can be in any
39 * state in the journalling system.
40 *
41 * What we do is just kick off a commit and wait on it. This will snapshot the
42 * inode to disk.
43 */
44
45int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
46{
47 struct inode *inode = dentry->d_inode;
48 int ret = 0;
49
50 J_ASSERT(ext3_journal_current_handle() == 0);
51
52 /*
53 * data=writeback:
54 * The caller's filemap_fdatawrite()/wait will sync the data.
55 * sync_inode() will sync the metadata
56 *
57 * data=ordered:
58 * The caller's filemap_fdatawrite() will write the data and
59 * sync_inode() will write the inode if it is dirty. Then the caller's
60 * filemap_fdatawait() will wait on the pages.
61 *
62 * data=journal:
63 * filemap_fdatawrite won't do anything (the buffers are clean).
64 * ext3_force_commit will write the file data into the journal and
65 * will wait on that.
66 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
67 * (they were dirtied by commit). But that's OK - the blocks are
68 * safe in-journal, which is all fsync() needs to ensure.
69 */
70 if (ext3_should_journal_data(inode)) {
71 ret = ext3_force_commit(inode->i_sb);
72 goto out;
73 }
74
75 /*
76 * The VFS has written the file data. If the inode is unaltered
77 * then we need not start a commit.
78 */
79 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
80 struct writeback_control wbc = {
81 .sync_mode = WB_SYNC_ALL,
82 .nr_to_write = 0, /* sys_fsync did this */
83 };
84 ret = sync_inode(inode, &wbc);
85 }
86out:
87 return ret;
88}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 000000000000..deeb27b5ba83
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,152 @@
1/*
2 * linux/fs/ext3/hash.c
3 *
4 * Copyright (C) 2002 by Theodore Ts'o
5 *
6 * This file is released under the GPL v2.
7 *
8 * This file may be redistributed under the terms of the GNU Public
9 * License.
10 */
11
12#include <linux/fs.h>
13#include <linux/jbd.h>
14#include <linux/sched.h>
15#include <linux/ext3_fs.h>
16#include <linux/cryptohash.h>
17
#define DELTA 0x9E3779B9

/*
 * Mix four 32-bit input words into the first two words of the hash state.
 *
 * buf: hash state; only buf[0] and buf[1] are read and updated.
 * in:  four key words, in[0]..in[3].
 *
 * Sixteen rounds are run on private copies of buf[0]/buf[1]; the results
 * are then added back into the state.
 */
static void TEA_transform(__u32 buf[4], __u32 const in[])
{
	__u32 k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
	__u32 x = buf[0], y = buf[1];
	__u32 sum = 0;
	int round;

	for (round = 0; round < 16; round++) {
		sum += DELTA;
		x += ((y << 4) + k0) ^ (y + sum) ^ ((y >> 5) + k1);
		y += ((x << 4) + k2) ^ (x + sum) ^ ((x >> 5) + k3);
	}

	buf[0] += x;
	buf[1] += y;
}
36
37
/*
 * The old legacy hash.
 *
 * Folds every byte of name[0..len) into a running pair of 32-bit values and
 * returns the latest value shifted left by one, so bit 0 of the result is
 * always clear.
 */
static __u32 dx_hack_hash (const char *name, int len)
{
	__u32 prev = 0x37abe8f9;
	__u32 cur = 0x12a3fe2d;

	while (len--) {
		__u32 mixed = cur ^ (*name++ * 7152373);
		__u32 next = prev + mixed;

		/* fold the top bit back so the final shift cannot lose it */
		if (next & 0x80000000)
			next -= 0x7fffffff;

		prev = cur;
		cur = next;
	}
	return cur << 1;
}
51
/*
 * Pack up to num*4 bytes of msg into num 32-bit words of buf.
 *
 * Each output word starts from a padding word built from the low byte of
 * the original len repeated four times, and has up to four message bytes
 * shifted in.  A trailing partial word, and any words beyond the end of the
 * message, are filled from that padding, so all num words are written.
 */
static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
{
	__u32 fill, word;
	int pos;

	/* the padding uses the untruncated length */
	fill = (__u32)len | ((__u32)len << 8);
	fill |= fill << 16;

	if (len > num * 4)
		len = num * 4;

	word = fill;
	for (pos = 0; pos < len; pos++) {
		if ((pos & 3) == 0)
			word = fill;
		word = msg[pos] + (word << 8);
		if ((pos & 3) == 3) {
			/* a full word has been assembled */
			*buf++ = word;
			word = fill;
			num--;
		}
	}

	/* flush the partial (or pure padding) word, then pad the rest */
	if (--num >= 0)
		*buf++ = word;
	while (--num >= 0)
		*buf++ = fill;
}
78
79/*
80 * Returns the hash of a filename. If len is 0 and name is NULL, then
81 * this function can be used to test whether or not a hash version is
82 * supported.
83 *
84 * The seed is an 4 longword (32 bits) "secret" which can be used to
85 * uniquify a hash. If the seed is all zero's, then some default seed
86 * may be used.
87 *
88 * A particular hash version specifies whether or not the seed is
89 * represented, and whether or not the returned hash is 32 bits or 64
90 * bits. 32 bit hashes will return 0 for the minor hash.
91 */
92int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
93{
94 __u32 hash;
95 __u32 minor_hash = 0;
96 const char *p;
97 int i;
98 __u32 in[8], buf[4];
99
100 /* Initialize the default seed for the hash checksum functions */
101 buf[0] = 0x67452301;
102 buf[1] = 0xefcdab89;
103 buf[2] = 0x98badcfe;
104 buf[3] = 0x10325476;
105
106 /* Check to see if the seed is all zero's */
107 if (hinfo->seed) {
108 for (i=0; i < 4; i++) {
109 if (hinfo->seed[i])
110 break;
111 }
112 if (i < 4)
113 memcpy(buf, hinfo->seed, sizeof(buf));
114 }
115
116 switch (hinfo->hash_version) {
117 case DX_HASH_LEGACY:
118 hash = dx_hack_hash(name, len);
119 break;
120 case DX_HASH_HALF_MD4:
121 p = name;
122 while (len > 0) {
123 str2hashbuf(p, len, in, 8);
124 half_md4_transform(buf, in);
125 len -= 32;
126 p += 32;
127 }
128 minor_hash = buf[2];
129 hash = buf[1];
130 break;
131 case DX_HASH_TEA:
132 p = name;
133 while (len > 0) {
134 str2hashbuf(p, len, in, 4);
135 TEA_transform(buf, in);
136 len -= 16;
137 p += 16;
138 }
139 hash = buf[0];
140 minor_hash = buf[1];
141 break;
142 default:
143 hinfo->hash = 0;
144 return -1;
145 }
146 hash = hash & ~1;
147 if (hash == (EXT3_HTREE_EOF << 1))
148 hash = (EXT3_HTREE_EOF-1) << 1;
149 hinfo->hash = hash;
150 hinfo->minor_hash = minor_hash;
151 return 0;
152}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 000000000000..e45dbd651736
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,758 @@
1/*
2 * linux/fs/ext3/ialloc.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * BSD ufs-inspired inode and directory allocation by
10 * Stephen Tweedie (sct@redhat.com), 1993
11 * Big-endian to little-endian byte-swapping/bitmaps by
12 * David S. Miller (davem@caip.rutgers.edu), 1995
13 */
14
15#include <linux/time.h>
16#include <linux/fs.h>
17#include <linux/jbd.h>
18#include <linux/ext3_fs.h>
19#include <linux/ext3_jbd.h>
20#include <linux/stat.h>
21#include <linux/string.h>
22#include <linux/quotaops.h>
23#include <linux/buffer_head.h>
24#include <linux/random.h>
25#include <linux/bitops.h>
26
27#include <asm/byteorder.h>
28
29#include "xattr.h"
30#include "acl.h"
31
32/*
33 * ialloc.c contains the inodes allocation and deallocation routines
34 */
35
36/*
37 * The free inodes are managed by bitmaps. A file system contains several
38 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
39 * block for inodes, N blocks for the inode table and data blocks.
40 *
41 * The file system contains group descriptors which are located after the
42 * super block. Each descriptor contains the number of the bitmap block and
43 * the free blocks count in the block.
44 */
45
46
47/*
48 * Read the inode allocation bitmap for a given block_group, reading
49 * into the specified slot in the superblock's bitmap cache.
50 *
51 * Return buffer_head of bitmap on success or NULL.
52 */
53static struct buffer_head *
54read_inode_bitmap(struct super_block * sb, unsigned long block_group)
55{
56 struct ext3_group_desc *desc;
57 struct buffer_head *bh = NULL;
58
59 desc = ext3_get_group_desc(sb, block_group, NULL);
60 if (!desc)
61 goto error_out;
62
63 bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
64 if (!bh)
65 ext3_error(sb, "read_inode_bitmap",
66 "Cannot read inode bitmap - "
67 "block_group = %lu, inode_bitmap = %u",
68 block_group, le32_to_cpu(desc->bg_inode_bitmap));
69error_out:
70 return bh;
71}
72
73/*
74 * NOTE! When we get the inode, we're the only people
75 * that have access to it, and as such there are no
76 * race conditions we have to worry about. The inode
77 * is not on the hash-lists, and it cannot be reached
78 * through the filesystem because the directory entry
79 * has been deleted earlier.
80 *
81 * HOWEVER: we must make sure that we get no aliases,
82 * which means that we have to call "clear_inode()"
83 * _before_ we mark the inode not in use in the inode
84 * bitmaps. Otherwise a newly created file might use
85 * the same inode number (not actually the same pointer
86 * though), and then we'd have two inodes sharing the
87 * same inode number and space on the harddisk.
88 */
89void ext3_free_inode (handle_t *handle, struct inode * inode)
90{
91 struct super_block * sb = inode->i_sb;
92 int is_directory;
93 unsigned long ino;
94 struct buffer_head *bitmap_bh = NULL;
95 struct buffer_head *bh2;
96 unsigned long block_group;
97 unsigned long bit;
98 struct ext3_group_desc * gdp;
99 struct ext3_super_block * es;
100 struct ext3_sb_info *sbi;
101 int fatal = 0, err;
102
103 if (atomic_read(&inode->i_count) > 1) {
104 printk ("ext3_free_inode: inode has count=%d\n",
105 atomic_read(&inode->i_count));
106 return;
107 }
108 if (inode->i_nlink) {
109 printk ("ext3_free_inode: inode has nlink=%d\n",
110 inode->i_nlink);
111 return;
112 }
113 if (!sb) {
114 printk("ext3_free_inode: inode on nonexistent device\n");
115 return;
116 }
117 sbi = EXT3_SB(sb);
118
119 ino = inode->i_ino;
120 ext3_debug ("freeing inode %lu\n", ino);
121
122 /*
123 * Note: we must free any quota before locking the superblock,
124 * as writing the quota to disk may need the lock as well.
125 */
126 DQUOT_INIT(inode);
127 ext3_xattr_delete_inode(handle, inode);
128 DQUOT_FREE_INODE(inode);
129 DQUOT_DROP(inode);
130
131 is_directory = S_ISDIR(inode->i_mode);
132
133 /* Do this BEFORE marking the inode not in use or returning an error */
134 clear_inode (inode);
135
136 es = EXT3_SB(sb)->s_es;
137 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
138 ext3_error (sb, "ext3_free_inode",
139 "reserved or nonexistent inode %lu", ino);
140 goto error_return;
141 }
142 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
143 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
144 bitmap_bh = read_inode_bitmap(sb, block_group);
145 if (!bitmap_bh)
146 goto error_return;
147
148 BUFFER_TRACE(bitmap_bh, "get_write_access");
149 fatal = ext3_journal_get_write_access(handle, bitmap_bh);
150 if (fatal)
151 goto error_return;
152
153 /* Ok, now we can actually update the inode bitmaps.. */
154 if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
155 bit, bitmap_bh->b_data))
156 ext3_error (sb, "ext3_free_inode",
157 "bit already cleared for inode %lu", ino);
158 else {
159 gdp = ext3_get_group_desc (sb, block_group, &bh2);
160
161 BUFFER_TRACE(bh2, "get_write_access");
162 fatal = ext3_journal_get_write_access(handle, bh2);
163 if (fatal) goto error_return;
164
165 if (gdp) {
166 spin_lock(sb_bgl_lock(sbi, block_group));
167 gdp->bg_free_inodes_count = cpu_to_le16(
168 le16_to_cpu(gdp->bg_free_inodes_count) + 1);
169 if (is_directory)
170 gdp->bg_used_dirs_count = cpu_to_le16(
171 le16_to_cpu(gdp->bg_used_dirs_count) - 1);
172 spin_unlock(sb_bgl_lock(sbi, block_group));
173 percpu_counter_inc(&sbi->s_freeinodes_counter);
174 if (is_directory)
175 percpu_counter_dec(&sbi->s_dirs_counter);
176
177 }
178 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
179 err = ext3_journal_dirty_metadata(handle, bh2);
180 if (!fatal) fatal = err;
181 }
182 BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
183 err = ext3_journal_dirty_metadata(handle, bitmap_bh);
184 if (!fatal)
185 fatal = err;
186 sb->s_dirt = 1;
187error_return:
188 brelse(bitmap_bh);
189 ext3_std_error(sb, fatal);
190}
191
192/*
193 * There are two policies for allocating an inode. If the new inode is
194 * a directory, then a forward search is made for a block group with both
195 * free space and a low directory-to-inode ratio; if that fails, then of
196 * the groups with above-average free space, that group with the fewest
197 * directories already is chosen.
198 *
199 * For other inodes, search forward from the parent directory\'s block
200 * group to find a free inode.
201 */
202static int find_group_dir(struct super_block *sb, struct inode *parent)
203{
204 int ngroups = EXT3_SB(sb)->s_groups_count;
205 unsigned int freei, avefreei;
206 struct ext3_group_desc *desc, *best_desc = NULL;
207 struct buffer_head *bh;
208 int group, best_group = -1;
209
210 freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
211 avefreei = freei / ngroups;
212
213 for (group = 0; group < ngroups; group++) {
214 desc = ext3_get_group_desc (sb, group, &bh);
215 if (!desc || !desc->bg_free_inodes_count)
216 continue;
217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
218 continue;
219 if (!best_desc ||
220 (le16_to_cpu(desc->bg_free_blocks_count) >
221 le16_to_cpu(best_desc->bg_free_blocks_count))) {
222 best_group = group;
223 best_desc = desc;
224 }
225 }
226 return best_group;
227}
228
229/*
230 * Orlov's allocator for directories.
231 *
232 * We always try to spread first-level directories.
233 *
234 * If there are blockgroups with both free inodes and free blocks counts
235 * not worse than average we return one with smallest directory count.
236 * Otherwise we simply return a random group.
237 *
238 * For the rest rules look so:
239 *
240 * It's OK to put directory into a group unless
241 * it has too many directories already (max_dirs) or
242 * it has too few free inodes left (min_inodes) or
243 * it has too few free blocks left (min_blocks) or
244 * it's already running too large debt (max_debt).
245 * Parent's group is prefered, if it doesn't satisfy these
246 * conditions we search cyclically through the rest. If none
247 * of the groups look good we just look for a group with more
248 * free inodes than average (starting at parent's group).
249 *
250 * Debt is incremented each time we allocate a directory and decremented
251 * when we allocate an inode, within 0--255.
252 */
253
254#define INODE_COST 64
255#define BLOCK_COST 256
256
257static int find_group_orlov(struct super_block *sb, struct inode *parent)
258{
259 int parent_group = EXT3_I(parent)->i_block_group;
260 struct ext3_sb_info *sbi = EXT3_SB(sb);
261 struct ext3_super_block *es = sbi->s_es;
262 int ngroups = sbi->s_groups_count;
263 int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
264 unsigned int freei, avefreei;
265 ext3_fsblk_t freeb, avefreeb;
266 ext3_fsblk_t blocks_per_dir;
267 unsigned int ndirs;
268 int max_debt, max_dirs, min_inodes;
269 ext3_grpblk_t min_blocks;
270 int group = -1, i;
271 struct ext3_group_desc *desc;
272 struct buffer_head *bh;
273
274 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
275 avefreei = freei / ngroups;
276 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
277 avefreeb = freeb / ngroups;
278 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
279
280 if ((parent == sb->s_root->d_inode) ||
281 (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
282 int best_ndir = inodes_per_group;
283 int best_group = -1;
284
285 get_random_bytes(&group, sizeof(group));
286 parent_group = (unsigned)group % ngroups;
287 for (i = 0; i < ngroups; i++) {
288 group = (parent_group + i) % ngroups;
289 desc = ext3_get_group_desc (sb, group, &bh);
290 if (!desc || !desc->bg_free_inodes_count)
291 continue;
292 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
293 continue;
294 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
295 continue;
296 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
297 continue;
298 best_group = group;
299 best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
300 }
301 if (best_group >= 0)
302 return best_group;
303 goto fallback;
304 }
305
306 blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs;
307
308 max_dirs = ndirs / ngroups + inodes_per_group / 16;
309 min_inodes = avefreei - inodes_per_group / 4;
310 min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
311
312 max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
313 if (max_debt * INODE_COST > inodes_per_group)
314 max_debt = inodes_per_group / INODE_COST;
315 if (max_debt > 255)
316 max_debt = 255;
317 if (max_debt == 0)
318 max_debt = 1;
319
320 for (i = 0; i < ngroups; i++) {
321 group = (parent_group + i) % ngroups;
322 desc = ext3_get_group_desc (sb, group, &bh);
323 if (!desc || !desc->bg_free_inodes_count)
324 continue;
325 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
326 continue;
327 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
328 continue;
329 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
330 continue;
331 return group;
332 }
333
334fallback:
335 for (i = 0; i < ngroups; i++) {
336 group = (parent_group + i) % ngroups;
337 desc = ext3_get_group_desc (sb, group, &bh);
338 if (!desc || !desc->bg_free_inodes_count)
339 continue;
340 if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
341 return group;
342 }
343
344 if (avefreei) {
345 /*
346 * The free-inodes counter is approximate, and for really small
347 * filesystems the above test can fail to find any blockgroups
348 */
349 avefreei = 0;
350 goto fallback;
351 }
352
353 return -1;
354}
355
356static int find_group_other(struct super_block *sb, struct inode *parent)
357{
358 int parent_group = EXT3_I(parent)->i_block_group;
359 int ngroups = EXT3_SB(sb)->s_groups_count;
360 struct ext3_group_desc *desc;
361 struct buffer_head *bh;
362 int group, i;
363
364 /*
365 * Try to place the inode in its parent directory
366 */
367 group = parent_group;
368 desc = ext3_get_group_desc (sb, group, &bh);
369 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
370 le16_to_cpu(desc->bg_free_blocks_count))
371 return group;
372
373 /*
374 * We're going to place this inode in a different blockgroup from its
375 * parent. We want to cause files in a common directory to all land in
376 * the same blockgroup. But we want files which are in a different
377 * directory which shares a blockgroup with our parent to land in a
378 * different blockgroup.
379 *
380 * So add our directory's i_ino into the starting point for the hash.
381 */
382 group = (group + parent->i_ino) % ngroups;
383
384 /*
385 * Use a quadratic hash to find a group with a free inode and some free
386 * blocks.
387 */
388 for (i = 1; i < ngroups; i <<= 1) {
389 group += i;
390 if (group >= ngroups)
391 group -= ngroups;
392 desc = ext3_get_group_desc (sb, group, &bh);
393 if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
394 le16_to_cpu(desc->bg_free_blocks_count))
395 return group;
396 }
397
398 /*
399 * That failed: try linear search for a free inode, even if that group
400 * has no free blocks.
401 */
402 group = parent_group;
403 for (i = 0; i < ngroups; i++) {
404 if (++group >= ngroups)
405 group = 0;
406 desc = ext3_get_group_desc (sb, group, &bh);
407 if (desc && le16_to_cpu(desc->bg_free_inodes_count))
408 return group;
409 }
410
411 return -1;
412}
413
414/*
415 * There are two policies for allocating an inode. If the new inode is
416 * a directory, then a forward search is made for a block group with both
417 * free space and a low directory-to-inode ratio; if that fails, then of
418 * the groups with above-average free space, that group with the fewest
419 * directories already is chosen.
420 *
421 * For other inodes, search forward from the parent directory's block
422 * group to find a free inode.
423 */
424struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
425{
426 struct super_block *sb;
427 struct buffer_head *bitmap_bh = NULL;
428 struct buffer_head *bh2;
429 int group;
430 unsigned long ino = 0;
431 struct inode * inode;
432 struct ext3_group_desc * gdp = NULL;
433 struct ext3_super_block * es;
434 struct ext3_inode_info *ei;
435 struct ext3_sb_info *sbi;
436 int err = 0;
437 struct inode *ret;
438 int i;
439
440 /* Cannot create files in a deleted directory */
441 if (!dir || !dir->i_nlink)
442 return ERR_PTR(-EPERM);
443
444 sb = dir->i_sb;
445 inode = new_inode(sb);
446 if (!inode)
447 return ERR_PTR(-ENOMEM);
448 ei = EXT3_I(inode);
449
450 sbi = EXT3_SB(sb);
451 es = sbi->s_es;
452 if (S_ISDIR(mode)) {
453 if (test_opt (sb, OLDALLOC))
454 group = find_group_dir(sb, dir);
455 else
456 group = find_group_orlov(sb, dir);
457 } else
458 group = find_group_other(sb, dir);
459
460 err = -ENOSPC;
461 if (group == -1)
462 goto out;
463
464 for (i = 0; i < sbi->s_groups_count; i++) {
465 err = -EIO;
466
467 gdp = ext3_get_group_desc(sb, group, &bh2);
468 if (!gdp)
469 goto fail;
470
471 brelse(bitmap_bh);
472 bitmap_bh = read_inode_bitmap(sb, group);
473 if (!bitmap_bh)
474 goto fail;
475
476 ino = 0;
477
478repeat_in_this_group:
479 ino = ext3_find_next_zero_bit((unsigned long *)
480 bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb), ino);
481 if (ino < EXT3_INODES_PER_GROUP(sb)) {
482
483 BUFFER_TRACE(bitmap_bh, "get_write_access");
484 err = ext3_journal_get_write_access(handle, bitmap_bh);
485 if (err)
486 goto fail;
487
488 if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
489 ino, bitmap_bh->b_data)) {
490 /* we won it */
491 BUFFER_TRACE(bitmap_bh,
492 "call ext3_journal_dirty_metadata");
493 err = ext3_journal_dirty_metadata(handle,
494 bitmap_bh);
495 if (err)
496 goto fail;
497 goto got;
498 }
499 /* we lost it */
500 journal_release_buffer(handle, bitmap_bh);
501
502 if (++ino < EXT3_INODES_PER_GROUP(sb))
503 goto repeat_in_this_group;
504 }
505
506 /*
507 * This case is possible in concurrent environment. It is very
508 * rare. We cannot repeat the find_group_xxx() call because
509 * that will simply return the same blockgroup, because the
510 * group descriptor metadata has not yet been updated.
511 * So we just go onto the next blockgroup.
512 */
513 if (++group == sbi->s_groups_count)
514 group = 0;
515 }
516 err = -ENOSPC;
517 goto out;
518
519got:
520 ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
521 if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
522 ext3_error (sb, "ext3_new_inode",
523 "reserved inode or inode > inodes count - "
524 "block_group = %d, inode=%lu", group, ino);
525 err = -EIO;
526 goto fail;
527 }
528
529 BUFFER_TRACE(bh2, "get_write_access");
530 err = ext3_journal_get_write_access(handle, bh2);
531 if (err) goto fail;
532 spin_lock(sb_bgl_lock(sbi, group));
533 gdp->bg_free_inodes_count =
534 cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
535 if (S_ISDIR(mode)) {
536 gdp->bg_used_dirs_count =
537 cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
538 }
539 spin_unlock(sb_bgl_lock(sbi, group));
540 BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
541 err = ext3_journal_dirty_metadata(handle, bh2);
542 if (err) goto fail;
543
544 percpu_counter_dec(&sbi->s_freeinodes_counter);
545 if (S_ISDIR(mode))
546 percpu_counter_inc(&sbi->s_dirs_counter);
547 sb->s_dirt = 1;
548
549 inode->i_uid = current->fsuid;
550 if (test_opt (sb, GRPID))
551 inode->i_gid = dir->i_gid;
552 else if (dir->i_mode & S_ISGID) {
553 inode->i_gid = dir->i_gid;
554 if (S_ISDIR(mode))
555 mode |= S_ISGID;
556 } else
557 inode->i_gid = current->fsgid;
558 inode->i_mode = mode;
559
560 inode->i_ino = ino;
561 /* This is the optimal IO size (for stat), not the fs block size */
562 inode->i_blocks = 0;
563 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
564
565 memset(ei->i_data, 0, sizeof(ei->i_data));
566 ei->i_dir_start_lookup = 0;
567 ei->i_disksize = 0;
568
569 ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
570 if (S_ISLNK(mode))
571 ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
572 /* dirsync only applies to directories */
573 if (!S_ISDIR(mode))
574 ei->i_flags &= ~EXT3_DIRSYNC_FL;
575#ifdef EXT3_FRAGMENTS
576 ei->i_faddr = 0;
577 ei->i_frag_no = 0;
578 ei->i_frag_size = 0;
579#endif
580 ei->i_file_acl = 0;
581 ei->i_dir_acl = 0;
582 ei->i_dtime = 0;
583 ei->i_block_alloc_info = NULL;
584 ei->i_block_group = group;
585
586 ext3_set_inode_flags(inode);
587 if (IS_DIRSYNC(inode))
588 handle->h_sync = 1;
589 insert_inode_hash(inode);
590 spin_lock(&sbi->s_next_gen_lock);
591 inode->i_generation = sbi->s_next_generation++;
592 spin_unlock(&sbi->s_next_gen_lock);
593
594 ei->i_state = EXT3_STATE_NEW;
595 ei->i_extra_isize =
596 (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
597 sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
598
599 ret = inode;
600 if(DQUOT_ALLOC_INODE(inode)) {
601 err = -EDQUOT;
602 goto fail_drop;
603 }
604
605 err = ext3_init_acl(handle, inode, dir);
606 if (err)
607 goto fail_free_drop;
608
609 err = ext3_init_security(handle,inode, dir);
610 if (err)
611 goto fail_free_drop;
612
613 err = ext3_mark_inode_dirty(handle, inode);
614 if (err) {
615 ext3_std_error(sb, err);
616 goto fail_free_drop;
617 }
618
619 ext3_debug("allocating inode %lu\n", inode->i_ino);
620 goto really_out;
621fail:
622 ext3_std_error(sb, err);
623out:
624 iput(inode);
625 ret = ERR_PTR(err);
626really_out:
627 brelse(bitmap_bh);
628 return ret;
629
630fail_free_drop:
631 DQUOT_FREE_INODE(inode);
632
633fail_drop:
634 DQUOT_DROP(inode);
635 inode->i_flags |= S_NOQUOTA;
636 inode->i_nlink = 0;
637 iput(inode);
638 brelse(bitmap_bh);
639 return ERR_PTR(err);
640}
641
642/* Verify that we are loading a valid orphan from disk */
643struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
644{
645 unsigned long max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
646 unsigned long block_group;
647 int bit;
648 struct buffer_head *bitmap_bh = NULL;
649 struct inode *inode = NULL;
650
651 /* Error cases - e2fsck has already cleaned up for us */
652 if (ino > max_ino) {
653 ext3_warning(sb, __FUNCTION__,
654 "bad orphan ino %lu! e2fsck was run?", ino);
655 goto out;
656 }
657
658 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
659 bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
660 bitmap_bh = read_inode_bitmap(sb, block_group);
661 if (!bitmap_bh) {
662 ext3_warning(sb, __FUNCTION__,
663 "inode bitmap error for orphan %lu", ino);
664 goto out;
665 }
666
667 /* Having the inode bit set should be a 100% indicator that this
668 * is a valid orphan (no e2fsck run on fs). Orphans also include
669 * inodes that were being truncated, so we can't check i_nlink==0.
670 */
671 if (!ext3_test_bit(bit, bitmap_bh->b_data) ||
672 !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
673 NEXT_ORPHAN(inode) > max_ino) {
674 ext3_warning(sb, __FUNCTION__,
675 "bad orphan inode %lu! e2fsck was run?", ino);
676 printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n",
677 bit, (unsigned long long)bitmap_bh->b_blocknr,
678 ext3_test_bit(bit, bitmap_bh->b_data));
679 printk(KERN_NOTICE "inode=%p\n", inode);
680 if (inode) {
681 printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
682 is_bad_inode(inode));
683 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
684 NEXT_ORPHAN(inode));
685 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
686 }
687 /* Avoid freeing blocks if we got a bad deleted inode */
688 if (inode && inode->i_nlink == 0)
689 inode->i_blocks = 0;
690 iput(inode);
691 inode = NULL;
692 }
693out:
694 brelse(bitmap_bh);
695 return inode;
696}
697
698unsigned long ext3_count_free_inodes (struct super_block * sb)
699{
700 unsigned long desc_count;
701 struct ext3_group_desc *gdp;
702 int i;
703#ifdef EXT3FS_DEBUG
704 struct ext3_super_block *es;
705 unsigned long bitmap_count, x;
706 struct buffer_head *bitmap_bh = NULL;
707
708 es = EXT3_SB(sb)->s_es;
709 desc_count = 0;
710 bitmap_count = 0;
711 gdp = NULL;
712 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
713 gdp = ext3_get_group_desc (sb, i, NULL);
714 if (!gdp)
715 continue;
716 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
717 brelse(bitmap_bh);
718 bitmap_bh = read_inode_bitmap(sb, i);
719 if (!bitmap_bh)
720 continue;
721
722 x = ext3_count_free(bitmap_bh, EXT3_INODES_PER_GROUP(sb) / 8);
723 printk("group %d: stored = %d, counted = %lu\n",
724 i, le16_to_cpu(gdp->bg_free_inodes_count), x);
725 bitmap_count += x;
726 }
727 brelse(bitmap_bh);
728 printk("ext3_count_free_inodes: stored = %u, computed = %lu, %lu\n",
729 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
730 return desc_count;
731#else
732 desc_count = 0;
733 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
734 gdp = ext3_get_group_desc (sb, i, NULL);
735 if (!gdp)
736 continue;
737 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
738 cond_resched();
739 }
740 return desc_count;
741#endif
742}
743
744/* Called at mount-time, super-block is locked */
745unsigned long ext3_count_dirs (struct super_block * sb)
746{
747 unsigned long count = 0;
748 int i;
749
750 for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
751 struct ext3_group_desc *gdp = ext3_get_group_desc (sb, i, NULL);
752 if (!gdp)
753 continue;
754 count += le16_to_cpu(gdp->bg_used_dirs_count);
755 }
756 return count;
757}
758
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 000000000000..03ba5bcab186
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,3219 @@
1/*
2 * linux/fs/ext3/inode.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz)
21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext3_jbd.h>
29#include <linux/jbd.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include <linux/bio.h>
40#include "xattr.h"
41#include "acl.h"
42
43static int ext3_writepage_trans_blocks(struct inode *inode);
44
45/*
46 * Test whether an inode is a fast symlink.
47 */
48static int ext3_inode_is_fast_symlink(struct inode *inode)
49{
50 int ea_blocks = EXT3_I(inode)->i_file_acl ?
51 (inode->i_sb->s_blocksize >> 9) : 0;
52
53 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
54}
55
56/*
57 * The ext3 forget function must perform a revoke if we are freeing data
58 * which has been journaled. Metadata (eg. indirect blocks) must be
59 * revoked in all cases.
60 *
61 * "bh" may be NULL: a metadata block may have been freed from memory
62 * but there may still be a record of it in the journal, and that record
63 * still needs to be revoked.
64 */
65int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
66 struct buffer_head *bh, ext3_fsblk_t blocknr)
67{
68 int err;
69
70 might_sleep();
71
72 BUFFER_TRACE(bh, "enter");
73
74 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
75 "data mode %lx\n",
76 bh, is_metadata, inode->i_mode,
77 test_opt(inode->i_sb, DATA_FLAGS));
78
79 /* Never use the revoke function if we are doing full data
80 * journaling: there is no need to, and a V1 superblock won't
81 * support it. Otherwise, only skip the revoke on un-journaled
82 * data blocks. */
83
84 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
85 (!is_metadata && !ext3_should_journal_data(inode))) {
86 if (bh) {
87 BUFFER_TRACE(bh, "call journal_forget");
88 return ext3_journal_forget(handle, bh);
89 }
90 return 0;
91 }
92
93 /*
94 * data!=journal && (is_metadata || should_journal_data(inode))
95 */
96 BUFFER_TRACE(bh, "call ext3_journal_revoke");
97 err = ext3_journal_revoke(handle, blocknr, bh);
98 if (err)
99 ext3_abort(inode->i_sb, __FUNCTION__,
100 "error %d when attempting revoke", err);
101 BUFFER_TRACE(bh, "exit");
102 return err;
103}
104
105/*
106 * Work out how many blocks we need to proceed with the next chunk of a
107 * truncate transaction.
108 */
109static unsigned long blocks_for_truncate(struct inode *inode)
110{
111 unsigned long needed;
112
113 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
114
115 /* Give ourselves just enough room to cope with inodes in which
116 * i_blocks is corrupt: we've seen disk corruptions in the past
117 * which resulted in random data in an inode which looked enough
118 * like a regular file for ext3 to try to delete it. Things
119 * will go a bit crazy if that happens, but at least we should
120 * try not to panic the whole kernel. */
121 if (needed < 2)
122 needed = 2;
123
124 /* But we need to bound the transaction so we don't overflow the
125 * journal. */
126 if (needed > EXT3_MAX_TRANS_DATA)
127 needed = EXT3_MAX_TRANS_DATA;
128
129 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
130}
131
132/*
133 * Truncate transactions can be complex and absolutely huge. So we need to
134 * be able to restart the transaction at a conventient checkpoint to make
135 * sure we don't overflow the journal.
136 *
137 * start_transaction gets us a new handle for a truncate transaction,
138 * and extend_transaction tries to extend the existing one a bit. If
139 * extend fails, we need to propagate the failure up and restart the
140 * transaction in the top-level truncate loop. --sct
141 */
142static handle_t *start_transaction(struct inode *inode)
143{
144 handle_t *result;
145
146 result = ext3_journal_start(inode, blocks_for_truncate(inode));
147 if (!IS_ERR(result))
148 return result;
149
150 ext3_std_error(inode->i_sb, PTR_ERR(result));
151 return result;
152}
153
154/*
155 * Try to extend this transaction for the purposes of truncation.
156 *
157 * Returns 0 if we managed to create more room. If we can't create more
158 * room, and the transaction must be restarted we return 1.
159 */
160static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
161{
162 if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
163 return 0;
164 if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
165 return 0;
166 return 1;
167}
168
169/*
170 * Restart the transaction associated with *handle. This does a commit,
171 * so before we call here everything must be consistently dirtied against
172 * this transaction.
173 */
174static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
175{
176 jbd_debug(2, "restarting handle %p\n", handle);
177 return ext3_journal_restart(handle, blocks_for_truncate(inode));
178}
179
180/*
181 * Called at the last iput() if i_nlink is zero.
182 */
183void ext3_delete_inode (struct inode * inode)
184{
185 handle_t *handle;
186
187 truncate_inode_pages(&inode->i_data, 0);
188
189 if (is_bad_inode(inode))
190 goto no_delete;
191
192 handle = start_transaction(inode);
193 if (IS_ERR(handle)) {
194 /*
195 * If we're going to skip the normal cleanup, we still need to
196 * make sure that the in-core orphan linked list is properly
197 * cleaned up.
198 */
199 ext3_orphan_del(NULL, inode);
200 goto no_delete;
201 }
202
203 if (IS_SYNC(inode))
204 handle->h_sync = 1;
205 inode->i_size = 0;
206 if (inode->i_blocks)
207 ext3_truncate(inode);
208 /*
209 * Kill off the orphan record which ext3_truncate created.
210 * AKPM: I think this can be inside the above `if'.
211 * Note that ext3_orphan_del() has to be able to cope with the
212 * deletion of a non-existent orphan - this is because we don't
213 * know if ext3_truncate() actually created an orphan record.
214 * (Well, we could do this if we need to, but heck - it works)
215 */
216 ext3_orphan_del(handle, inode);
217 EXT3_I(inode)->i_dtime = get_seconds();
218
219 /*
220 * One subtle ordering requirement: if anything has gone wrong
221 * (transaction abort, IO errors, whatever), then we can still
222 * do these next steps (the fs will already have been marked as
223 * having errors), but we can't free the inode if the mark_dirty
224 * fails.
225 */
226 if (ext3_mark_inode_dirty(handle, inode))
227 /* If that failed, just do the required in-core inode clear. */
228 clear_inode(inode);
229 else
230 ext3_free_inode(handle, inode);
231 ext3_journal_stop(handle);
232 return;
233no_delete:
234 clear_inode(inode); /* We must guarantee clearing of inode... */
235}
236
237typedef struct {
238 __le32 *p;
239 __le32 key;
240 struct buffer_head *bh;
241} Indirect;
242
243static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
244{
245 p->key = *(p->p = v);
246 p->bh = bh;
247}
248
249static int verify_chain(Indirect *from, Indirect *to)
250{
251 while (from <= to && from->key == *from->p)
252 from++;
253 return (from > to);
254}
255
256/**
257 * ext3_block_to_path - parse the block number into array of offsets
258 * @inode: inode in question (we are only interested in its superblock)
259 * @i_block: block number to be parsed
260 * @offsets: array to store the offsets in
261 * @boundary: set this non-zero if the referred-to block is likely to be
262 * followed (on disk) by an indirect block.
263 *
264 * To store the locations of file's data ext3 uses a data structure common
265 * for UNIX filesystems - tree of pointers anchored in the inode, with
266 * data blocks at leaves and indirect blocks in intermediate nodes.
267 * This function translates the block number into path in that tree -
268 * return value is the path length and @offsets[n] is the offset of
269 * pointer to (n+1)th node in the nth one. If @block is out of range
270 * (negative or too large) warning is printed and zero returned.
271 *
272 * Note: function doesn't find node addresses, so no IO is needed. All
273 * we need to know is the capacity of indirect blocks (taken from the
274 * inode->i_sb).
275 */
276
277/*
278 * Portability note: the last comparison (check that we fit into triple
279 * indirect block) is spelled differently, because otherwise on an
280 * architecture with 32-bit longs and 8Kb pages we might get into trouble
281 * if our filesystem had 8Kb blocks. We might use long long, but that would
282 * kill us on x86. Oh, well, at least the sign propagation does not matter -
283 * i_block would have to be negative in the very beginning, so we would not
284 * get there at all.
285 */
286
287static int ext3_block_to_path(struct inode *inode,
288 long i_block, int offsets[4], int *boundary)
289{
290 int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
291 int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
292 const long direct_blocks = EXT3_NDIR_BLOCKS,
293 indirect_blocks = ptrs,
294 double_blocks = (1 << (ptrs_bits * 2));
295 int n = 0;
296 int final = 0;
297
298 if (i_block < 0) {
299 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
300 } else if (i_block < direct_blocks) {
301 offsets[n++] = i_block;
302 final = direct_blocks;
303 } else if ( (i_block -= direct_blocks) < indirect_blocks) {
304 offsets[n++] = EXT3_IND_BLOCK;
305 offsets[n++] = i_block;
306 final = ptrs;
307 } else if ((i_block -= indirect_blocks) < double_blocks) {
308 offsets[n++] = EXT3_DIND_BLOCK;
309 offsets[n++] = i_block >> ptrs_bits;
310 offsets[n++] = i_block & (ptrs - 1);
311 final = ptrs;
312 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
313 offsets[n++] = EXT3_TIND_BLOCK;
314 offsets[n++] = i_block >> (ptrs_bits * 2);
315 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
316 offsets[n++] = i_block & (ptrs - 1);
317 final = ptrs;
318 } else {
319 ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
320 }
321 if (boundary)
322 *boundary = final - 1 - (i_block & (ptrs - 1));
323 return n;
324}
325
326/**
327 * ext3_get_branch - read the chain of indirect blocks leading to data
328 * @inode: inode in question
329 * @depth: depth of the chain (1 - direct pointer, etc.)
330 * @offsets: offsets of pointers in inode/indirect blocks
331 * @chain: place to store the result
332 * @err: here we store the error value
333 *
334 * Function fills the array of triples <key, p, bh> and returns %NULL
335 * if everything went OK or the pointer to the last filled triple
336 * (incomplete one) otherwise. Upon the return chain[i].key contains
337 * the number of (i+1)-th block in the chain (as it is stored in memory,
338 * i.e. little-endian 32-bit), chain[i].p contains the address of that
339 * number (it points into struct inode for i==0 and into the bh->b_data
340 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
341 * block for i>0 and NULL for i==0. In other words, it holds the block
342 * numbers of the chain, addresses they were taken from (and where we can
343 * verify that chain did not change) and buffer_heads hosting these
344 * numbers.
345 *
346 * Function stops when it stumbles upon zero pointer (absent block)
347 * (pointer to last triple returned, *@err == 0)
348 * or when it gets an IO error reading an indirect block
349 * (ditto, *@err == -EIO)
350 * or when it notices that chain had been changed while it was reading
351 * (ditto, *@err == -EAGAIN)
352 * or when it reads all @depth-1 indirect blocks successfully and finds
353 * the whole chain, all way to the data (returns %NULL, *err == 0).
354 */
355static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
356 Indirect chain[4], int *err)
357{
358 struct super_block *sb = inode->i_sb;
359 Indirect *p = chain;
360 struct buffer_head *bh;
361
362 *err = 0;
363 /* i_data is not going away, no lock needed */
364 add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
365 if (!p->key)
366 goto no_block;
367 while (--depth) {
368 bh = sb_bread(sb, le32_to_cpu(p->key));
369 if (!bh)
370 goto failure;
371 /* Reader: pointers */
372 if (!verify_chain(chain, p))
373 goto changed;
374 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
375 /* Reader: end */
376 if (!p->key)
377 goto no_block;
378 }
379 return NULL;
380
381changed:
382 brelse(bh);
383 *err = -EAGAIN;
384 goto no_block;
385failure:
386 *err = -EIO;
387no_block:
388 return p;
389}
390
391/**
392 * ext3_find_near - find a place for allocation with sufficient locality
393 * @inode: owner
394 * @ind: descriptor of indirect block.
395 *
396 * This function returns the prefered place for block allocation.
397 * It is used when heuristic for sequential allocation fails.
398 * Rules are:
399 * + if there is a block to the left of our position - allocate near it.
400 * + if pointer will live in indirect block - allocate near that block.
401 * + if pointer will live in inode - allocate in the same
402 * cylinder group.
403 *
404 * In the latter case we colour the starting block by the callers PID to
405 * prevent it from clashing with concurrent allocations for a different inode
406 * in the same block group. The PID is used here so that functionally related
407 * files will be close-by on-disk.
408 *
409 * Caller must make sure that @ind is valid and will stay that way.
410 */
411static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
412{
413 struct ext3_inode_info *ei = EXT3_I(inode);
414 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
415 __le32 *p;
416 ext3_fsblk_t bg_start;
417 ext3_grpblk_t colour;
418
419 /* Try to find previous block */
420 for (p = ind->p - 1; p >= start; p--) {
421 if (*p)
422 return le32_to_cpu(*p);
423 }
424
425 /* No such thing, so let's try location of indirect block */
426 if (ind->bh)
427 return ind->bh->b_blocknr;
428
429 /*
430 * It is going to be referred to from the inode itself? OK, just put it
431 * into the same cylinder group then.
432 */
433 bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
434 colour = (current->pid % 16) *
435 (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
436 return bg_start + colour;
437}
438
439/**
440 * ext3_find_goal - find a prefered place for allocation.
441 * @inode: owner
442 * @block: block we want
443 * @chain: chain of indirect blocks
444 * @partial: pointer to the last triple within a chain
445 * @goal: place to store the result.
446 *
447 * Normally this function find the prefered place for block allocation,
448 * stores it in *@goal and returns zero.
449 */
450
451static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
452 Indirect chain[4], Indirect *partial)
453{
454 struct ext3_block_alloc_info *block_i;
455
456 block_i = EXT3_I(inode)->i_block_alloc_info;
457
458 /*
459 * try the heuristic for sequential allocation,
460 * failing that at least try to get decent locality.
461 */
462 if (block_i && (block == block_i->last_alloc_logical_block + 1)
463 && (block_i->last_alloc_physical_block != 0)) {
464 return block_i->last_alloc_physical_block + 1;
465 }
466
467 return ext3_find_near(inode, partial);
468}
469
470/**
471 * ext3_blks_to_allocate: Look up the block map and count the number
472 * of direct blocks need to be allocated for the given branch.
473 *
474 * @branch: chain of indirect blocks
475 * @k: number of blocks need for indirect blocks
476 * @blks: number of data blocks to be mapped.
477 * @blocks_to_boundary: the offset in the indirect block
478 *
479 * return the total number of blocks to be allocate, including the
480 * direct and indirect blocks.
481 */
482static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
483 int blocks_to_boundary)
484{
485 unsigned long count = 0;
486
487 /*
488 * Simple case, [t,d]Indirect block(s) has not allocated yet
489 * then it's clear blocks on that path have not allocated
490 */
491 if (k > 0) {
492 /* right now we don't handle cross boundary allocation */
493 if (blks < blocks_to_boundary + 1)
494 count += blks;
495 else
496 count += blocks_to_boundary + 1;
497 return count;
498 }
499
500 count++;
501 while (count < blks && count <= blocks_to_boundary &&
502 le32_to_cpu(*(branch[0].p + count)) == 0) {
503 count++;
504 }
505 return count;
506}
507
508/**
509 * ext3_alloc_blocks: multiple allocate blocks needed for a branch
510 * @indirect_blks: the number of blocks need to allocate for indirect
511 * blocks
512 *
513 * @new_blocks: on return it will store the new block numbers for
514 * the indirect blocks(if needed) and the first direct block,
515 * @blks: on return it will store the total number of allocated
516 * direct blocks
517 */
518static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
519 ext3_fsblk_t goal, int indirect_blks, int blks,
520 ext3_fsblk_t new_blocks[4], int *err)
521{
522 int target, i;
523 unsigned long count = 0;
524 int index = 0;
525 ext3_fsblk_t current_block = 0;
526 int ret = 0;
527
528 /*
529 * Here we try to allocate the requested multiple blocks at once,
530 * on a best-effort basis.
531 * To build a branch, we should allocate blocks for
532 * the indirect blocks(if not allocated yet), and at least
533 * the first direct block of this branch. That's the
534 * minimum number of blocks need to allocate(required)
535 */
536 target = blks + indirect_blks;
537
538 while (1) {
539 count = target;
540 /* allocating blocks for indirect blocks and direct blocks */
541 current_block = ext3_new_blocks(handle,inode,goal,&count,err);
542 if (*err)
543 goto failed_out;
544
545 target -= count;
546 /* allocate blocks for indirect blocks */
547 while (index < indirect_blks && count) {
548 new_blocks[index++] = current_block++;
549 count--;
550 }
551
552 if (count > 0)
553 break;
554 }
555
556 /* save the new block number for the first direct block */
557 new_blocks[index] = current_block;
558
559 /* total number of blocks allocated for direct blocks */
560 ret = count;
561 *err = 0;
562 return ret;
563failed_out:
564 for (i = 0; i <index; i++)
565 ext3_free_blocks(handle, inode, new_blocks[i], 1);
566 return ret;
567}
568
569/**
570 * ext3_alloc_branch - allocate and set up a chain of blocks.
571 * @inode: owner
572 * @indirect_blks: number of allocated indirect blocks
573 * @blks: number of allocated direct blocks
574 * @offsets: offsets (in the blocks) to store the pointers to next.
575 * @branch: place to store the chain in.
576 *
577 * This function allocates blocks, zeroes out all but the last one,
578 * links them into chain and (if we are synchronous) writes them to disk.
579 * In other words, it prepares a branch that can be spliced onto the
580 * inode. It stores the information about that chain in the branch[], in
581 * the same format as ext3_get_branch() would do. We are calling it after
582 * we had read the existing part of chain and partial points to the last
583 * triple of that (one with zero ->key). Upon the exit we have the same
584 * picture as after the successful ext3_get_block(), except that in one
585 * place chain is disconnected - *branch->p is still zero (we did not
586 * set the last link), but branch->key contains the number that should
587 * be placed into *branch->p to fill that gap.
588 *
589 * If allocation fails we free all blocks we've allocated (and forget
590 * their buffer_heads) and return the error value the from failed
591 * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
592 * as described above and return 0.
593 */
594static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
595 int indirect_blks, int *blks, ext3_fsblk_t goal,
596 int *offsets, Indirect *branch)
597{
598 int blocksize = inode->i_sb->s_blocksize;
599 int i, n = 0;
600 int err = 0;
601 struct buffer_head *bh;
602 int num;
603 ext3_fsblk_t new_blocks[4];
604 ext3_fsblk_t current_block;
605
606 num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
607 *blks, new_blocks, &err);
608 if (err)
609 return err;
610
611 branch[0].key = cpu_to_le32(new_blocks[0]);
612 /*
613 * metadata blocks and data blocks are allocated.
614 */
615 for (n = 1; n <= indirect_blks; n++) {
616 /*
617 * Get buffer_head for parent block, zero it out
618 * and set the pointer to new one, then send
619 * parent to disk.
620 */
621 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
622 branch[n].bh = bh;
623 lock_buffer(bh);
624 BUFFER_TRACE(bh, "call get_create_access");
625 err = ext3_journal_get_create_access(handle, bh);
626 if (err) {
627 unlock_buffer(bh);
628 brelse(bh);
629 goto failed;
630 }
631
632 memset(bh->b_data, 0, blocksize);
633 branch[n].p = (__le32 *) bh->b_data + offsets[n];
634 branch[n].key = cpu_to_le32(new_blocks[n]);
635 *branch[n].p = branch[n].key;
636 if ( n == indirect_blks) {
637 current_block = new_blocks[n];
638 /*
639 * End of chain, update the last new metablock of
640 * the chain to point to the new allocated
641 * data blocks numbers
642 */
643 for (i=1; i < num; i++)
644 *(branch[n].p + i) = cpu_to_le32(++current_block);
645 }
646 BUFFER_TRACE(bh, "marking uptodate");
647 set_buffer_uptodate(bh);
648 unlock_buffer(bh);
649
650 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
651 err = ext3_journal_dirty_metadata(handle, bh);
652 if (err)
653 goto failed;
654 }
655 *blks = num;
656 return err;
657failed:
658 /* Allocation failed, free what we already allocated */
659 for (i = 1; i <= n ; i++) {
660 BUFFER_TRACE(branch[i].bh, "call journal_forget");
661 ext3_journal_forget(handle, branch[i].bh);
662 }
663 for (i = 0; i <indirect_blks; i++)
664 ext3_free_blocks(handle, inode, new_blocks[i], 1);
665
666 ext3_free_blocks(handle, inode, new_blocks[i], num);
667
668 return err;
669}
670
671/**
672 * ext3_splice_branch - splice the allocated branch onto inode.
673 * @inode: owner
674 * @block: (logical) number of block we are adding
675 * @chain: chain of indirect blocks (with a missing link - see
676 * ext3_alloc_branch)
677 * @where: location of missing link
678 * @num: number of indirect blocks we are adding
679 * @blks: number of direct blocks we are adding
680 *
681 * This function fills the missing link and does all housekeeping needed in
682 * inode (->i_blocks, etc.). In case of success we end up with the full
683 * chain to new block and return 0.
684 */
685static int ext3_splice_branch(handle_t *handle, struct inode *inode,
686 long block, Indirect *where, int num, int blks)
687{
688 int i;
689 int err = 0;
690 struct ext3_block_alloc_info *block_i;
691 ext3_fsblk_t current_block;
692
693 block_i = EXT3_I(inode)->i_block_alloc_info;
694 /*
695 * If we're splicing into a [td]indirect block (as opposed to the
696 * inode) then we need to get write access to the [td]indirect block
697 * before the splice.
698 */
699 if (where->bh) {
700 BUFFER_TRACE(where->bh, "get_write_access");
701 err = ext3_journal_get_write_access(handle, where->bh);
702 if (err)
703 goto err_out;
704 }
705 /* That's it */
706
707 *where->p = where->key;
708
709 /*
710 * Update the host buffer_head or inode to point to more just allocated
711 * direct blocks blocks
712 */
713 if (num == 0 && blks > 1) {
714 current_block = le32_to_cpu(where->key) + 1;
715 for (i = 1; i < blks; i++)
716 *(where->p + i ) = cpu_to_le32(current_block++);
717 }
718
719 /*
720 * update the most recently allocated logical & physical block
721 * in i_block_alloc_info, to assist find the proper goal block for next
722 * allocation
723 */
724 if (block_i) {
725 block_i->last_alloc_logical_block = block + blks - 1;
726 block_i->last_alloc_physical_block =
727 le32_to_cpu(where[num].key) + blks - 1;
728 }
729
730 /* We are done with atomic stuff, now do the rest of housekeeping */
731
732 inode->i_ctime = CURRENT_TIME_SEC;
733 ext3_mark_inode_dirty(handle, inode);
734
735 /* had we spliced it onto indirect block? */
736 if (where->bh) {
737 /*
738 * If we spliced it onto an indirect block, we haven't
739 * altered the inode. Note however that if it is being spliced
740 * onto an indirect block at the very end of the file (the
741 * file is growing) then we *will* alter the inode to reflect
742 * the new i_size. But that is not done here - it is done in
743 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
744 */
745 jbd_debug(5, "splicing indirect only\n");
746 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
747 err = ext3_journal_dirty_metadata(handle, where->bh);
748 if (err)
749 goto err_out;
750 } else {
751 /*
752 * OK, we spliced it into the inode itself on a direct block.
753 * Inode was dirtied above.
754 */
755 jbd_debug(5, "splicing direct\n");
756 }
757 return err;
758
759err_out:
760 for (i = 1; i <= num; i++) {
761 BUFFER_TRACE(where[i].bh, "call journal_forget");
762 ext3_journal_forget(handle, where[i].bh);
763 ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
764 }
765 ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
766
767 return err;
768}
769
770/*
771 * Allocation strategy is simple: if we have to allocate something, we will
772 * have to go the whole way to leaf. So let's do it before attaching anything
773 * to tree, set linkage between the newborn blocks, write them if sync is
774 * required, recheck the path, free and repeat if check fails, otherwise
775 * set the last missing link (that will protect us from any truncate-generated
776 * removals - all blocks on the path are immune now) and possibly force the
777 * write on the parent block.
778 * That has a nice additional property: no special recovery from the failed
779 * allocations is needed - we simply release blocks and do not touch anything
780 * reachable from inode.
781 *
782 * `handle' can be NULL if create == 0.
783 *
784 * The BKL may not be held on entry here. Be sure to take it early.
785 * return > 0, # of blocks mapped or allocated.
786 * return = 0, if plain lookup failed.
787 * return < 0, error case.
788 */
789int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
790 sector_t iblock, unsigned long maxblocks,
791 struct buffer_head *bh_result,
792 int create, int extend_disksize)
793{
794 int err = -EIO;
795 int offsets[4];
796 Indirect chain[4];
797 Indirect *partial;
798 ext3_fsblk_t goal;
799 int indirect_blks;
800 int blocks_to_boundary = 0;
801 int depth;
802 struct ext3_inode_info *ei = EXT3_I(inode);
803 int count = 0;
804 ext3_fsblk_t first_block = 0;
805
806
807 J_ASSERT(handle != NULL || create == 0);
808 depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
809
810 if (depth == 0)
811 goto out;
812
813 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
814
815 /* Simplest case - block found, no allocation needed */
816 if (!partial) {
817 first_block = le32_to_cpu(chain[depth - 1].key);
818 clear_buffer_new(bh_result);
819 count++;
820 /*map more blocks*/
821 while (count < maxblocks && count <= blocks_to_boundary) {
822 ext3_fsblk_t blk;
823
824 if (!verify_chain(chain, partial)) {
825 /*
826 * Indirect block might be removed by
827 * truncate while we were reading it.
828 * Handling of that case: forget what we've
829 * got now. Flag the err as EAGAIN, so it
830 * will reread.
831 */
832 err = -EAGAIN;
833 count = 0;
834 break;
835 }
836 blk = le32_to_cpu(*(chain[depth-1].p + count));
837
838 if (blk == first_block + count)
839 count++;
840 else
841 break;
842 }
843 if (err != -EAGAIN)
844 goto got_it;
845 }
846
847 /* Next simple case - plain lookup or failed read of indirect block */
848 if (!create || err == -EIO)
849 goto cleanup;
850
851 mutex_lock(&ei->truncate_mutex);
852
853 /*
854 * If the indirect block is missing while we are reading
855 * the chain(ext3_get_branch() returns -EAGAIN err), or
856 * if the chain has been changed after we grab the semaphore,
857 * (either because another process truncated this branch, or
858 * another get_block allocated this branch) re-grab the chain to see if
859 * the request block has been allocated or not.
860 *
861 * Since we already block the truncate/other get_block
862 * at this point, we will have the current copy of the chain when we
863 * splice the branch into the tree.
864 */
865 if (err == -EAGAIN || !verify_chain(chain, partial)) {
866 while (partial > chain) {
867 brelse(partial->bh);
868 partial--;
869 }
870 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
871 if (!partial) {
872 count++;
873 mutex_unlock(&ei->truncate_mutex);
874 if (err)
875 goto cleanup;
876 clear_buffer_new(bh_result);
877 goto got_it;
878 }
879 }
880
881 /*
882 * Okay, we need to do block allocation. Lazily initialize the block
883 * allocation info here if necessary
884 */
885 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
886 ext3_init_block_alloc_info(inode);
887
888 goal = ext3_find_goal(inode, iblock, chain, partial);
889
890 /* the number of blocks need to allocate for [d,t]indirect blocks */
891 indirect_blks = (chain + depth) - partial - 1;
892
893 /*
894 * Next look up the indirect map to count the totoal number of
895 * direct blocks to allocate for this branch.
896 */
897 count = ext3_blks_to_allocate(partial, indirect_blks,
898 maxblocks, blocks_to_boundary);
899 /*
900 * Block out ext3_truncate while we alter the tree
901 */
902 err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
903 offsets + (partial - chain), partial);
904
905 /*
906 * The ext3_splice_branch call will free and forget any buffers
907 * on the new chain if there is a failure, but that risks using
908 * up transaction credits, especially for bitmaps where the
909 * credits cannot be returned. Can we handle this somehow? We
910 * may need to return -EAGAIN upwards in the worst case. --sct
911 */
912 if (!err)
913 err = ext3_splice_branch(handle, inode, iblock,
914 partial, indirect_blks, count);
915 /*
916 * i_disksize growing is protected by truncate_mutex. Don't forget to
917 * protect it if you're about to implement concurrent
918 * ext3_get_block() -bzzz
919 */
920 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
921 ei->i_disksize = inode->i_size;
922 mutex_unlock(&ei->truncate_mutex);
923 if (err)
924 goto cleanup;
925
926 set_buffer_new(bh_result);
927got_it:
928 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
929 if (count > blocks_to_boundary)
930 set_buffer_boundary(bh_result);
931 err = count;
932 /* Clean up and exit */
933 partial = chain + depth - 1; /* the whole chain */
934cleanup:
935 while (partial > chain) {
936 BUFFER_TRACE(partial->bh, "call brelse");
937 brelse(partial->bh);
938 partial--;
939 }
940 BUFFER_TRACE(bh_result, "returned");
941out:
942 return err;
943}
944
945#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
946
947static int ext3_get_block(struct inode *inode, sector_t iblock,
948 struct buffer_head *bh_result, int create)
949{
950 handle_t *handle = journal_current_handle();
951 int ret = 0;
952 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
953
954 if (!create)
955 goto get_block; /* A read */
956
957 if (max_blocks == 1)
958 goto get_block; /* A single block get */
959
960 if (handle->h_transaction->t_state == T_LOCKED) {
961 /*
962 * Huge direct-io writes can hold off commits for long
963 * periods of time. Let this commit run.
964 */
965 ext3_journal_stop(handle);
966 handle = ext3_journal_start(inode, DIO_CREDITS);
967 if (IS_ERR(handle))
968 ret = PTR_ERR(handle);
969 goto get_block;
970 }
971
972 if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
973 /*
974 * Getting low on buffer credits...
975 */
976 ret = ext3_journal_extend(handle, DIO_CREDITS);
977 if (ret > 0) {
978 /*
979 * Couldn't extend the transaction. Start a new one.
980 */
981 ret = ext3_journal_restart(handle, DIO_CREDITS);
982 }
983 }
984
985get_block:
986 if (ret == 0) {
987 ret = ext3_get_blocks_handle(handle, inode, iblock,
988 max_blocks, bh_result, create, 0);
989 if (ret > 0) {
990 bh_result->b_size = (ret << inode->i_blkbits);
991 ret = 0;
992 }
993 }
994 return ret;
995}
996
997/*
998 * `handle' can be NULL if create is zero
999 */
1000struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1001 long block, int create, int *errp)
1002{
1003 struct buffer_head dummy;
1004 int fatal = 0, err;
1005
1006 J_ASSERT(handle != NULL || create == 0);
1007
1008 dummy.b_state = 0;
1009 dummy.b_blocknr = -1000;
1010 buffer_trace_init(&dummy.b_history);
1011 err = ext3_get_blocks_handle(handle, inode, block, 1,
1012 &dummy, create, 1);
1013 /*
1014 * ext3_get_blocks_handle() returns number of blocks
1015 * mapped. 0 in case of a HOLE.
1016 */
1017 if (err > 0) {
1018 if (err > 1)
1019 WARN_ON(1);
1020 err = 0;
1021 }
1022 *errp = err;
1023 if (!err && buffer_mapped(&dummy)) {
1024 struct buffer_head *bh;
1025 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1026 if (!bh) {
1027 *errp = -EIO;
1028 goto err;
1029 }
1030 if (buffer_new(&dummy)) {
1031 J_ASSERT(create != 0);
1032 J_ASSERT(handle != 0);
1033
1034 /*
1035 * Now that we do not always journal data, we should
1036 * keep in mind whether this should always journal the
1037 * new buffer as metadata. For now, regular file
1038 * writes use ext3_get_block instead, so it's not a
1039 * problem.
1040 */
1041 lock_buffer(bh);
1042 BUFFER_TRACE(bh, "call get_create_access");
1043 fatal = ext3_journal_get_create_access(handle, bh);
1044 if (!fatal && !buffer_uptodate(bh)) {
1045 memset(bh->b_data,0,inode->i_sb->s_blocksize);
1046 set_buffer_uptodate(bh);
1047 }
1048 unlock_buffer(bh);
1049 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1050 err = ext3_journal_dirty_metadata(handle, bh);
1051 if (!fatal)
1052 fatal = err;
1053 } else {
1054 BUFFER_TRACE(bh, "not a new buffer");
1055 }
1056 if (fatal) {
1057 *errp = fatal;
1058 brelse(bh);
1059 bh = NULL;
1060 }
1061 return bh;
1062 }
1063err:
1064 return NULL;
1065}
1066
1067struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1068 int block, int create, int *err)
1069{
1070 struct buffer_head * bh;
1071
1072 bh = ext3_getblk(handle, inode, block, create, err);
1073 if (!bh)
1074 return bh;
1075 if (buffer_uptodate(bh))
1076 return bh;
1077 ll_rw_block(READ_META, 1, &bh);
1078 wait_on_buffer(bh);
1079 if (buffer_uptodate(bh))
1080 return bh;
1081 put_bh(bh);
1082 *err = -EIO;
1083 return NULL;
1084}
1085
1086static int walk_page_buffers( handle_t *handle,
1087 struct buffer_head *head,
1088 unsigned from,
1089 unsigned to,
1090 int *partial,
1091 int (*fn)( handle_t *handle,
1092 struct buffer_head *bh))
1093{
1094 struct buffer_head *bh;
1095 unsigned block_start, block_end;
1096 unsigned blocksize = head->b_size;
1097 int err, ret = 0;
1098 struct buffer_head *next;
1099
1100 for ( bh = head, block_start = 0;
1101 ret == 0 && (bh != head || !block_start);
1102 block_start = block_end, bh = next)
1103 {
1104 next = bh->b_this_page;
1105 block_end = block_start + blocksize;
1106 if (block_end <= from || block_start >= to) {
1107 if (partial && !buffer_uptodate(bh))
1108 *partial = 1;
1109 continue;
1110 }
1111 err = (*fn)(handle, bh);
1112 if (!ret)
1113 ret = err;
1114 }
1115 return ret;
1116}
1117
1118/*
1119 * To preserve ordering, it is essential that the hole instantiation and
1120 * the data write be encapsulated in a single transaction. We cannot
1121 * close off a transaction and start a new one between the ext3_get_block()
1122 * and the commit_write(). So doing the journal_start at the start of
1123 * prepare_write() is the right place.
1124 *
1125 * Also, this function can nest inside ext3_writepage() ->
1126 * block_write_full_page(). In that case, we *know* that ext3_writepage()
1127 * has generated enough buffer credits to do the whole page. So we won't
1128 * block on the journal in that case, which is good, because the caller may
1129 * be PF_MEMALLOC.
1130 *
1131 * By accident, ext3 can be reentered when a transaction is open via
1132 * quota file writes. If we were to commit the transaction while thus
1133 * reentered, there can be a deadlock - we would be holding a quota
1134 * lock, and the commit would never complete if another thread had a
1135 * transaction open and was blocking on the quota lock - a ranking
1136 * violation.
1137 *
1138 * So what we do is to rely on the fact that journal_stop/journal_start
1139 * will _not_ run commit under these circumstances because handle->h_ref
1140 * is elevated. We'll still have enough credits for the tiny quotafile
1141 * write.
1142 */
1143static int do_journal_get_write_access(handle_t *handle,
1144 struct buffer_head *bh)
1145{
1146 if (!buffer_mapped(bh) || buffer_freed(bh))
1147 return 0;
1148 return ext3_journal_get_write_access(handle, bh);
1149}
1150
1151static int ext3_prepare_write(struct file *file, struct page *page,
1152 unsigned from, unsigned to)
1153{
1154 struct inode *inode = page->mapping->host;
1155 int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
1156 handle_t *handle;
1157 int retries = 0;
1158
1159retry:
1160 handle = ext3_journal_start(inode, needed_blocks);
1161 if (IS_ERR(handle)) {
1162 ret = PTR_ERR(handle);
1163 goto out;
1164 }
1165 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
1166 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1167 else
1168 ret = block_prepare_write(page, from, to, ext3_get_block);
1169 if (ret)
1170 goto prepare_write_failed;
1171
1172 if (ext3_should_journal_data(inode)) {
1173 ret = walk_page_buffers(handle, page_buffers(page),
1174 from, to, NULL, do_journal_get_write_access);
1175 }
1176prepare_write_failed:
1177 if (ret)
1178 ext3_journal_stop(handle);
1179 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1180 goto retry;
1181out:
1182 return ret;
1183}
1184
1185int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1186{
1187 int err = journal_dirty_data(handle, bh);
1188 if (err)
1189 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1190 bh, handle,err);
1191 return err;
1192}
1193
1194/* For commit_write() in data=journal mode */
1195static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1196{
1197 if (!buffer_mapped(bh) || buffer_freed(bh))
1198 return 0;
1199 set_buffer_uptodate(bh);
1200 return ext3_journal_dirty_metadata(handle, bh);
1201}
1202
1203/*
1204 * We need to pick up the new inode size which generic_commit_write gave us
1205 * `file' can be NULL - eg, when called from page_symlink().
1206 *
1207 * ext3 never places buffers on inode->i_mapping->private_list. metadata
1208 * buffers are managed internally.
1209 */
1210static int ext3_ordered_commit_write(struct file *file, struct page *page,
1211 unsigned from, unsigned to)
1212{
1213 handle_t *handle = ext3_journal_current_handle();
1214 struct inode *inode = page->mapping->host;
1215 int ret = 0, ret2;
1216
1217 ret = walk_page_buffers(handle, page_buffers(page),
1218 from, to, NULL, ext3_journal_dirty_data);
1219
1220 if (ret == 0) {
1221 /*
1222 * generic_commit_write() will run mark_inode_dirty() if i_size
1223 * changes. So let's piggyback the i_disksize mark_inode_dirty
1224 * into that.
1225 */
1226 loff_t new_i_size;
1227
1228 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1229 if (new_i_size > EXT3_I(inode)->i_disksize)
1230 EXT3_I(inode)->i_disksize = new_i_size;
1231 ret = generic_commit_write(file, page, from, to);
1232 }
1233 ret2 = ext3_journal_stop(handle);
1234 if (!ret)
1235 ret = ret2;
1236 return ret;
1237}
1238
1239static int ext3_writeback_commit_write(struct file *file, struct page *page,
1240 unsigned from, unsigned to)
1241{
1242 handle_t *handle = ext3_journal_current_handle();
1243 struct inode *inode = page->mapping->host;
1244 int ret = 0, ret2;
1245 loff_t new_i_size;
1246
1247 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1248 if (new_i_size > EXT3_I(inode)->i_disksize)
1249 EXT3_I(inode)->i_disksize = new_i_size;
1250
1251 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
1252 ret = nobh_commit_write(file, page, from, to);
1253 else
1254 ret = generic_commit_write(file, page, from, to);
1255
1256 ret2 = ext3_journal_stop(handle);
1257 if (!ret)
1258 ret = ret2;
1259 return ret;
1260}
1261
1262static int ext3_journalled_commit_write(struct file *file,
1263 struct page *page, unsigned from, unsigned to)
1264{
1265 handle_t *handle = ext3_journal_current_handle();
1266 struct inode *inode = page->mapping->host;
1267 int ret = 0, ret2;
1268 int partial = 0;
1269 loff_t pos;
1270
1271 /*
1272 * Here we duplicate the generic_commit_write() functionality
1273 */
1274 pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1275
1276 ret = walk_page_buffers(handle, page_buffers(page), from,
1277 to, &partial, commit_write_fn);
1278 if (!partial)
1279 SetPageUptodate(page);
1280 if (pos > inode->i_size)
1281 i_size_write(inode, pos);
1282 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1283 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1284 EXT3_I(inode)->i_disksize = inode->i_size;
1285 ret2 = ext3_mark_inode_dirty(handle, inode);
1286 if (!ret)
1287 ret = ret2;
1288 }
1289 ret2 = ext3_journal_stop(handle);
1290 if (!ret)
1291 ret = ret2;
1292 return ret;
1293}
1294
1295/*
1296 * bmap() is special. It gets used by applications such as lilo and by
1297 * the swapper to find the on-disk block of a specific piece of data.
1298 *
1299 * Naturally, this is dangerous if the block concerned is still in the
1300 * journal. If somebody makes a swapfile on an ext3 data-journaling
1301 * filesystem and enables swap, then they may get a nasty shock when the
1302 * data getting swapped to that swapfile suddenly gets overwritten by
1303 * the original zero's written out previously to the journal and
1304 * awaiting writeback in the kernel's buffer cache.
1305 *
1306 * So, if we see any bmap calls here on a modified, data-journaled file,
1307 * take extra steps to flush any blocks which might be in the cache.
1308 */
1309static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1310{
1311 struct inode *inode = mapping->host;
1312 journal_t *journal;
1313 int err;
1314
1315 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1316 /*
1317 * This is a REALLY heavyweight approach, but the use of
1318 * bmap on dirty files is expected to be extremely rare:
1319 * only if we run lilo or swapon on a freshly made file
1320 * do we expect this to happen.
1321 *
1322 * (bmap requires CAP_SYS_RAWIO so this does not
1323 * represent an unprivileged user DOS attack --- we'd be
1324 * in trouble if mortal users could trigger this path at
1325 * will.)
1326 *
1327 * NB. EXT3_STATE_JDATA is not set on files other than
1328 * regular files. If somebody wants to bmap a directory
1329 * or symlink and gets confused because the buffer
1330 * hasn't yet been flushed to disk, they deserve
1331 * everything they get.
1332 */
1333
1334 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1335 journal = EXT3_JOURNAL(inode);
1336 journal_lock_updates(journal);
1337 err = journal_flush(journal);
1338 journal_unlock_updates(journal);
1339
1340 if (err)
1341 return 0;
1342 }
1343
1344 return generic_block_bmap(mapping,block,ext3_get_block);
1345}
1346
1347static int bget_one(handle_t *handle, struct buffer_head *bh)
1348{
1349 get_bh(bh);
1350 return 0;
1351}
1352
1353static int bput_one(handle_t *handle, struct buffer_head *bh)
1354{
1355 put_bh(bh);
1356 return 0;
1357}
1358
1359static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1360{
1361 if (buffer_mapped(bh))
1362 return ext3_journal_dirty_data(handle, bh);
1363 return 0;
1364}
1365
1366/*
1367 * Note that we always start a transaction even if we're not journalling
1368 * data. This is to preserve ordering: any hole instantiation within
1369 * __block_write_full_page -> ext3_get_block() should be journalled
1370 * along with the data so we don't crash and then get metadata which
1371 * refers to old data.
1372 *
1373 * In all journalling modes block_write_full_page() will start the I/O.
1374 *
1375 * Problem:
1376 *
1377 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1378 * ext3_writepage()
1379 *
1380 * Similar for:
1381 *
1382 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1383 *
1384 * Same applies to ext3_get_block(). We will deadlock on various things like
1385 * lock_journal and i_truncate_mutex.
1386 *
1387 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1388 * allocations fail.
1389 *
1390 * 16May01: If we're reentered then journal_current_handle() will be
1391 * non-zero. We simply *return*.
1392 *
1393 * 1 July 2001: @@@ FIXME:
1394 * In journalled data mode, a data buffer may be metadata against the
1395 * current transaction. But the same file is part of a shared mapping
1396 * and someone does a writepage() on it.
1397 *
1398 * We will move the buffer onto the async_data list, but *after* it has
1399 * been dirtied. So there's a small window where we have dirty data on
1400 * BJ_Metadata.
1401 *
1402 * Note that this only applies to the last partial page in the file. The
1403 * bit which block_write_full_page() uses prepare/commit for. (That's
1404 * broken code anyway: it's wrong for msync()).
1405 *
1406 * It's a rare case: affects the final partial page, for journalled data
1407 * where the file is subject to both write() and writepage() in the same
1408 * transaction. To fix it we'll need a custom block_write_full_page().
1409 * We'll probably need that anyway for journalling writepage() output.
1410 *
1411 * We don't honour synchronous mounts for writepage(). That would be
1412 * disastrous. Any write() or metadata operation will sync the fs for
1413 * us.
1414 *
1415 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1416 * we don't need to open a transaction here.
1417 */
1418static int ext3_ordered_writepage(struct page *page,
1419 struct writeback_control *wbc)
1420{
1421 struct inode *inode = page->mapping->host;
1422 struct buffer_head *page_bufs;
1423 handle_t *handle = NULL;
1424 int ret = 0;
1425 int err;
1426
1427 J_ASSERT(PageLocked(page));
1428
1429 /*
1430 * We give up here if we're reentered, because it might be for a
1431 * different filesystem.
1432 */
1433 if (ext3_journal_current_handle())
1434 goto out_fail;
1435
1436 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1437
1438 if (IS_ERR(handle)) {
1439 ret = PTR_ERR(handle);
1440 goto out_fail;
1441 }
1442
1443 if (!page_has_buffers(page)) {
1444 create_empty_buffers(page, inode->i_sb->s_blocksize,
1445 (1 << BH_Dirty)|(1 << BH_Uptodate));
1446 }
1447 page_bufs = page_buffers(page);
1448 walk_page_buffers(handle, page_bufs, 0,
1449 PAGE_CACHE_SIZE, NULL, bget_one);
1450
1451 ret = block_write_full_page(page, ext3_get_block, wbc);
1452
1453 /*
1454 * The page can become unlocked at any point now, and
1455 * truncate can then come in and change things. So we
1456 * can't touch *page from now on. But *page_bufs is
1457 * safe due to elevated refcount.
1458 */
1459
1460 /*
1461 * And attach them to the current transaction. But only if
1462 * block_write_full_page() succeeded. Otherwise they are unmapped,
1463 * and generally junk.
1464 */
1465 if (ret == 0) {
1466 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1467 NULL, journal_dirty_data_fn);
1468 if (!ret)
1469 ret = err;
1470 }
1471 walk_page_buffers(handle, page_bufs, 0,
1472 PAGE_CACHE_SIZE, NULL, bput_one);
1473 err = ext3_journal_stop(handle);
1474 if (!ret)
1475 ret = err;
1476 return ret;
1477
1478out_fail:
1479 redirty_page_for_writepage(wbc, page);
1480 unlock_page(page);
1481 return ret;
1482}
1483
1484static int ext3_writeback_writepage(struct page *page,
1485 struct writeback_control *wbc)
1486{
1487 struct inode *inode = page->mapping->host;
1488 handle_t *handle = NULL;
1489 int ret = 0;
1490 int err;
1491
1492 if (ext3_journal_current_handle())
1493 goto out_fail;
1494
1495 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1496 if (IS_ERR(handle)) {
1497 ret = PTR_ERR(handle);
1498 goto out_fail;
1499 }
1500
1501 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
1502 ret = nobh_writepage(page, ext3_get_block, wbc);
1503 else
1504 ret = block_write_full_page(page, ext3_get_block, wbc);
1505
1506 err = ext3_journal_stop(handle);
1507 if (!ret)
1508 ret = err;
1509 return ret;
1510
1511out_fail:
1512 redirty_page_for_writepage(wbc, page);
1513 unlock_page(page);
1514 return ret;
1515}
1516
1517static int ext3_journalled_writepage(struct page *page,
1518 struct writeback_control *wbc)
1519{
1520 struct inode *inode = page->mapping->host;
1521 handle_t *handle = NULL;
1522 int ret = 0;
1523 int err;
1524
1525 if (ext3_journal_current_handle())
1526 goto no_write;
1527
1528 handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1529 if (IS_ERR(handle)) {
1530 ret = PTR_ERR(handle);
1531 goto no_write;
1532 }
1533
1534 if (!page_has_buffers(page) || PageChecked(page)) {
1535 /*
1536 * It's mmapped pagecache. Add buffers and journal it. There
1537 * doesn't seem much point in redirtying the page here.
1538 */
1539 ClearPageChecked(page);
1540 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1541 ext3_get_block);
1542 if (ret != 0) {
1543 ext3_journal_stop(handle);
1544 goto out_unlock;
1545 }
1546 ret = walk_page_buffers(handle, page_buffers(page), 0,
1547 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1548
1549 err = walk_page_buffers(handle, page_buffers(page), 0,
1550 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1551 if (ret == 0)
1552 ret = err;
1553 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1554 unlock_page(page);
1555 } else {
1556 /*
1557 * It may be a page full of checkpoint-mode buffers. We don't
1558 * really know unless we go poke around in the buffer_heads.
1559 * But block_write_full_page will do the right thing.
1560 */
1561 ret = block_write_full_page(page, ext3_get_block, wbc);
1562 }
1563 err = ext3_journal_stop(handle);
1564 if (!ret)
1565 ret = err;
1566out:
1567 return ret;
1568
1569no_write:
1570 redirty_page_for_writepage(wbc, page);
1571out_unlock:
1572 unlock_page(page);
1573 goto out;
1574}
1575
1576static int ext3_readpage(struct file *file, struct page *page)
1577{
1578 return mpage_readpage(page, ext3_get_block);
1579}
1580
1581static int
1582ext3_readpages(struct file *file, struct address_space *mapping,
1583 struct list_head *pages, unsigned nr_pages)
1584{
1585 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1586}
1587
1588static void ext3_invalidatepage(struct page *page, unsigned long offset)
1589{
1590 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1591
1592 /*
1593 * If it's a full truncate we just forget about the pending dirtying
1594 */
1595 if (offset == 0)
1596 ClearPageChecked(page);
1597
1598 journal_invalidatepage(journal, page, offset);
1599}
1600
1601static int ext3_releasepage(struct page *page, gfp_t wait)
1602{
1603 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1604
1605 WARN_ON(PageChecked(page));
1606 if (!page_has_buffers(page))
1607 return 0;
1608 return journal_try_to_free_buffers(journal, page, wait);
1609}
1610
1611/*
1612 * If the O_DIRECT write will extend the file then add this inode to the
1613 * orphan list. So recovery will truncate it back to the original size
1614 * if the machine crashes during the write.
1615 *
1616 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1617 * crashes then stale disk data _may_ be exposed inside the file.
1618 */
1619static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1620 const struct iovec *iov, loff_t offset,
1621 unsigned long nr_segs)
1622{
1623 struct file *file = iocb->ki_filp;
1624 struct inode *inode = file->f_mapping->host;
1625 struct ext3_inode_info *ei = EXT3_I(inode);
1626 handle_t *handle = NULL;
1627 ssize_t ret;
1628 int orphan = 0;
1629 size_t count = iov_length(iov, nr_segs);
1630
1631 if (rw == WRITE) {
1632 loff_t final_size = offset + count;
1633
1634 handle = ext3_journal_start(inode, DIO_CREDITS);
1635 if (IS_ERR(handle)) {
1636 ret = PTR_ERR(handle);
1637 goto out;
1638 }
1639 if (final_size > inode->i_size) {
1640 ret = ext3_orphan_add(handle, inode);
1641 if (ret)
1642 goto out_stop;
1643 orphan = 1;
1644 ei->i_disksize = inode->i_size;
1645 }
1646 }
1647
1648 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1649 offset, nr_segs,
1650 ext3_get_block, NULL);
1651
1652 /*
1653 * Reacquire the handle: ext3_get_block() can restart the transaction
1654 */
1655 handle = journal_current_handle();
1656
1657out_stop:
1658 if (handle) {
1659 int err;
1660
1661 if (orphan && inode->i_nlink)
1662 ext3_orphan_del(handle, inode);
1663 if (orphan && ret > 0) {
1664 loff_t end = offset + ret;
1665 if (end > inode->i_size) {
1666 ei->i_disksize = end;
1667 i_size_write(inode, end);
1668 /*
1669 * We're going to return a positive `ret'
1670 * here due to non-zero-length I/O, so there's
1671 * no way of reporting error returns from
1672 * ext3_mark_inode_dirty() to userspace. So
1673 * ignore it.
1674 */
1675 ext3_mark_inode_dirty(handle, inode);
1676 }
1677 }
1678 err = ext3_journal_stop(handle);
1679 if (ret == 0)
1680 ret = err;
1681 }
1682out:
1683 return ret;
1684}
1685
1686/*
1687 * Pages can be marked dirty completely asynchronously from ext3's journalling
1688 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1689 * much here because ->set_page_dirty is called under VFS locks. The page is
1690 * not necessarily locked.
1691 *
1692 * We cannot just dirty the page and leave attached buffers clean, because the
1693 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1694 * or jbddirty because all the journalling code will explode.
1695 *
1696 * So what we do is to mark the page "pending dirty" and next time writepage
1697 * is called, propagate that into the buffers appropriately.
1698 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	/*
	 * Flag the page as "pending dirty"; ext3_journalled_writepage()
	 * tests PageChecked() to decide how to write it.
	 */
	SetPageChecked(page);

	return __set_page_dirty_nobuffers(page);
}
1704
1705static const struct address_space_operations ext3_ordered_aops = {
1706 .readpage = ext3_readpage,
1707 .readpages = ext3_readpages,
1708 .writepage = ext3_ordered_writepage,
1709 .sync_page = block_sync_page,
1710 .prepare_write = ext3_prepare_write,
1711 .commit_write = ext3_ordered_commit_write,
1712 .bmap = ext3_bmap,
1713 .invalidatepage = ext3_invalidatepage,
1714 .releasepage = ext3_releasepage,
1715 .direct_IO = ext3_direct_IO,
1716 .migratepage = buffer_migrate_page,
1717};
1718
1719static const struct address_space_operations ext3_writeback_aops = {
1720 .readpage = ext3_readpage,
1721 .readpages = ext3_readpages,
1722 .writepage = ext3_writeback_writepage,
1723 .sync_page = block_sync_page,
1724 .prepare_write = ext3_prepare_write,
1725 .commit_write = ext3_writeback_commit_write,
1726 .bmap = ext3_bmap,
1727 .invalidatepage = ext3_invalidatepage,
1728 .releasepage = ext3_releasepage,
1729 .direct_IO = ext3_direct_IO,
1730 .migratepage = buffer_migrate_page,
1731};
1732
1733static const struct address_space_operations ext3_journalled_aops = {
1734 .readpage = ext3_readpage,
1735 .readpages = ext3_readpages,
1736 .writepage = ext3_journalled_writepage,
1737 .sync_page = block_sync_page,
1738 .prepare_write = ext3_prepare_write,
1739 .commit_write = ext3_journalled_commit_write,
1740 .set_page_dirty = ext3_journalled_set_page_dirty,
1741 .bmap = ext3_bmap,
1742 .invalidatepage = ext3_invalidatepage,
1743 .releasepage = ext3_releasepage,
1744};
1745
1746void ext3_set_aops(struct inode *inode)
1747{
1748 if (ext3_should_order_data(inode))
1749 inode->i_mapping->a_ops = &ext3_ordered_aops;
1750 else if (ext3_should_writeback_data(inode))
1751 inode->i_mapping->a_ops = &ext3_writeback_aops;
1752 else
1753 inode->i_mapping->a_ops = &ext3_journalled_aops;
1754}
1755
1756/*
1757 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1758 * up to the end of the block which corresponds to `from'.
1759 * This required during truncate. We need to physically zero the tail end
1760 * of that block so it doesn't yield old data if the file is later grown.
1761 */
1762static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1763 struct address_space *mapping, loff_t from)
1764{
1765 ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
1766 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1767 unsigned blocksize, iblock, length, pos;
1768 struct inode *inode = mapping->host;
1769 struct buffer_head *bh;
1770 int err = 0;
1771 void *kaddr;
1772
1773 blocksize = inode->i_sb->s_blocksize;
1774 length = blocksize - (offset & (blocksize - 1));
1775 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1776
1777 /*
1778 * For "nobh" option, we can only work if we don't need to
1779 * read-in the page - otherwise we create buffers to do the IO.
1780 */
1781 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1782 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1783 kaddr = kmap_atomic(page, KM_USER0);
1784 memset(kaddr + offset, 0, length);
1785 flush_dcache_page(page);
1786 kunmap_atomic(kaddr, KM_USER0);
1787 set_page_dirty(page);
1788 goto unlock;
1789 }
1790
1791 if (!page_has_buffers(page))
1792 create_empty_buffers(page, blocksize, 0);
1793
1794 /* Find the buffer that contains "offset" */
1795 bh = page_buffers(page);
1796 pos = blocksize;
1797 while (offset >= pos) {
1798 bh = bh->b_this_page;
1799 iblock++;
1800 pos += blocksize;
1801 }
1802
1803 err = 0;
1804 if (buffer_freed(bh)) {
1805 BUFFER_TRACE(bh, "freed: skip");
1806 goto unlock;
1807 }
1808
1809 if (!buffer_mapped(bh)) {
1810 BUFFER_TRACE(bh, "unmapped");
1811 ext3_get_block(inode, iblock, bh, 0);
1812 /* unmapped? It's a hole - nothing to do */
1813 if (!buffer_mapped(bh)) {
1814 BUFFER_TRACE(bh, "still unmapped");
1815 goto unlock;
1816 }
1817 }
1818
1819 /* Ok, it's mapped. Make sure it's up-to-date */
1820 if (PageUptodate(page))
1821 set_buffer_uptodate(bh);
1822
1823 if (!buffer_uptodate(bh)) {
1824 err = -EIO;
1825 ll_rw_block(READ, 1, &bh);
1826 wait_on_buffer(bh);
1827 /* Uhhuh. Read error. Complain and punt. */
1828 if (!buffer_uptodate(bh))
1829 goto unlock;
1830 }
1831
1832 if (ext3_should_journal_data(inode)) {
1833 BUFFER_TRACE(bh, "get write access");
1834 err = ext3_journal_get_write_access(handle, bh);
1835 if (err)
1836 goto unlock;
1837 }
1838
1839 kaddr = kmap_atomic(page, KM_USER0);
1840 memset(kaddr + offset, 0, length);
1841 flush_dcache_page(page);
1842 kunmap_atomic(kaddr, KM_USER0);
1843
1844 BUFFER_TRACE(bh, "zeroed end of block");
1845
1846 err = 0;
1847 if (ext3_should_journal_data(inode)) {
1848 err = ext3_journal_dirty_metadata(handle, bh);
1849 } else {
1850 if (ext3_should_order_data(inode))
1851 err = ext3_journal_dirty_data(handle, bh);
1852 mark_buffer_dirty(bh);
1853 }
1854
1855unlock:
1856 unlock_page(page);
1857 page_cache_release(page);
1858 return err;
1859}
1860
1861/*
1862 * Probably it should be a library function... search for first non-zero word
1863 * or memcmp with zero_page, whatever is better for particular architecture.
1864 * Linus?
1865 */
1866static inline int all_zeroes(__le32 *p, __le32 *q)
1867{
1868 while (p < q)
1869 if (*p++)
1870 return 0;
1871 return 1;
1872}
1873
1874/**
1875 * ext3_find_shared - find the indirect blocks for partial truncation.
1876 * @inode: inode in question
1877 * @depth: depth of the affected branch
1878 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1879 * @chain: place to store the pointers to partial indirect blocks
1880 * @top: place to store the (detached) top of branch
1881 *
1882 * This is a helper function used by ext3_truncate().
1883 *
1884 * When we do truncate() we may have to clean the ends of several
1885 * indirect blocks but leave the blocks themselves alive. Block is
1886 * partially truncated if some data below the new i_size is referred
1887 * from it (and it is on the path to the first completely truncated
1888 * data block, indeed). We have to free the top of that path along
1889 * with everything to the right of the path. Since no allocation
1890 * past the truncation point is possible until ext3_truncate()
1891 * finishes, we may safely do the latter, but top of branch may
1892 * require special attention - pageout below the truncation point
1893 * might try to populate it.
1894 *
1895 * We atomically detach the top of branch from the tree, store the
1896 * block number of its root in *@top, pointers to buffer_heads of
1897 * partially truncated blocks - in @chain[].bh and pointers to
1898 * their last elements that should not be removed - in
1899 * @chain[].p. Return value is the pointer to last filled element
1900 * of @chain.
1901 *
1902 * The work left to caller to do the actual freeing of subtrees:
1903 * a) free the subtree starting from *@top
1904 * b) free the subtrees whose roots are stored in
1905 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1906 * c) free the subtrees growing from the inode past the @chain[0].
1907 * (no partially truncated stuff there). */
1908
1909static Indirect *ext3_find_shared(struct inode *inode, int depth,
1910 int offsets[4], Indirect chain[4], __le32 *top)
1911{
1912 Indirect *partial, *p;
1913 int k, err;
1914
1915 *top = 0;
1916 /* Make k index the deepest non-null offest + 1 */
1917 for (k = depth; k > 1 && !offsets[k-1]; k--)
1918 ;
1919 partial = ext3_get_branch(inode, k, offsets, chain, &err);
1920 /* Writer: pointers */
1921 if (!partial)
1922 partial = chain + k-1;
1923 /*
1924 * If the branch acquired continuation since we've looked at it -
1925 * fine, it should all survive and (new) top doesn't belong to us.
1926 */
1927 if (!partial->key && *partial->p)
1928 /* Writer: end */
1929 goto no_top;
1930 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1931 ;
1932 /*
1933 * OK, we've found the last block that must survive. The rest of our
1934 * branch should be detached before unlocking. However, if that rest
1935 * of branch is all ours and does not grow immediately from the inode
1936 * it's easier to cheat and just decrement partial->p.
1937 */
1938 if (p == chain + k - 1 && p > chain) {
1939 p->p--;
1940 } else {
1941 *top = *p->p;
1942 /* Nope, don't do this in ext3. Must leave the tree intact */
1943#if 0
1944 *p->p = 0;
1945#endif
1946 }
1947 /* Writer: end */
1948
1949 while(partial > p) {
1950 brelse(partial->bh);
1951 partial--;
1952 }
1953no_top:
1954 return partial;
1955}
1956
1957/*
1958 * Zero a number of block pointers in either an inode or an indirect block.
1959 * If we restart the transaction we must again get write access to the
1960 * indirect block for further modification.
1961 *
1962 * We release `count' blocks on disk, but (last - first) may be greater
1963 * than `count' because there can be holes in there.
1964 */
1965static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
1966 struct buffer_head *bh, ext3_fsblk_t block_to_free,
1967 unsigned long count, __le32 *first, __le32 *last)
1968{
1969 __le32 *p;
1970 if (try_to_extend_transaction(handle, inode)) {
1971 if (bh) {
1972 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1973 ext3_journal_dirty_metadata(handle, bh);
1974 }
1975 ext3_mark_inode_dirty(handle, inode);
1976 ext3_journal_test_restart(handle, inode);
1977 if (bh) {
1978 BUFFER_TRACE(bh, "retaking write access");
1979 ext3_journal_get_write_access(handle, bh);
1980 }
1981 }
1982
1983 /*
1984 * Any buffers which are on the journal will be in memory. We find
1985 * them on the hash table so journal_revoke() will run journal_forget()
1986 * on them. We've already detached each block from the file, so
1987 * bforget() in journal_forget() should be safe.
1988 *
1989 * AKPM: turn on bforget in journal_forget()!!!
1990 */
1991 for (p = first; p < last; p++) {
1992 u32 nr = le32_to_cpu(*p);
1993 if (nr) {
1994 struct buffer_head *bh;
1995
1996 *p = 0;
1997 bh = sb_find_get_block(inode->i_sb, nr);
1998 ext3_forget(handle, 0, inode, bh, nr);
1999 }
2000 }
2001
2002 ext3_free_blocks(handle, inode, block_to_free, count);
2003}
2004
2005/**
2006 * ext3_free_data - free a list of data blocks
2007 * @handle: handle for this transaction
2008 * @inode: inode we are dealing with
2009 * @this_bh: indirect buffer_head which contains *@first and *@last
2010 * @first: array of block numbers
2011 * @last: points immediately past the end of array
2012 *
2013 * We are freeing all blocks refered from that array (numbers are stored as
2014 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2015 *
2016 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2017 * blocks are contiguous then releasing them at one time will only affect one
2018 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2019 * actually use a lot of journal space.
2020 *
2021 * @this_bh will be %NULL if @first and @last point into the inode's direct
2022 * block pointers.
2023 */
2024static void ext3_free_data(handle_t *handle, struct inode *inode,
2025 struct buffer_head *this_bh,
2026 __le32 *first, __le32 *last)
2027{
2028 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
2029 unsigned long count = 0; /* Number of blocks in the run */
2030 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2031 corresponding to
2032 block_to_free */
2033 ext3_fsblk_t nr; /* Current block # */
2034 __le32 *p; /* Pointer into inode/ind
2035 for current block */
2036 int err;
2037
2038 if (this_bh) { /* For indirect block */
2039 BUFFER_TRACE(this_bh, "get_write_access");
2040 err = ext3_journal_get_write_access(handle, this_bh);
2041 /* Important: if we can't update the indirect pointers
2042 * to the blocks, we can't free them. */
2043 if (err)
2044 return;
2045 }
2046
2047 for (p = first; p < last; p++) {
2048 nr = le32_to_cpu(*p);
2049 if (nr) {
2050 /* accumulate blocks to free if they're contiguous */
2051 if (count == 0) {
2052 block_to_free = nr;
2053 block_to_free_p = p;
2054 count = 1;
2055 } else if (nr == block_to_free + count) {
2056 count++;
2057 } else {
2058 ext3_clear_blocks(handle, inode, this_bh,
2059 block_to_free,
2060 count, block_to_free_p, p);
2061 block_to_free = nr;
2062 block_to_free_p = p;
2063 count = 1;
2064 }
2065 }
2066 }
2067
2068 if (count > 0)
2069 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
2070 count, block_to_free_p, p);
2071
2072 if (this_bh) {
2073 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
2074 ext3_journal_dirty_metadata(handle, this_bh);
2075 }
2076}
2077
/**
 * ext3_free_branches - free an array of branches
 * @handle:	JBD handle for this transaction
 * @inode:	inode we are dealing with
 * @parent_bh:	the buffer_head which contains *@first and *@last
 *		(may be NULL; the parent pointer is then not zeroed here)
 * @first:	array of block numbers
 * @last:	pointer immediately past the end of array
 * @depth:	depth of the branches to free; 0 means the array holds
 *		data block numbers and is handed to ext3_free_data()
 *
 * We are freeing all blocks referred to from these branches (numbers are
 * stored as little-endian 32-bit) and updating @inode->i_blocks
 * appropriately.
 *
 * Returns early, without freeing anything further, as soon as the handle
 * is found to be aborted.
 */
static void ext3_free_branches(handle_t *handle, struct inode *inode,
			       struct buffer_head *parent_bh,
			       __le32 *first, __le32 *last, int depth)
{
	ext3_fsblk_t nr;
	__le32 *p;

	if (is_handle_aborted(handle))
		return;

	/* Non-zero depth: the array holds pointers to indirect blocks. */
	if (depth--) {
		struct buffer_head *bh;
		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
		/* Walk the array right to left. */
		p = last;
		while (--p >= first) {
			nr = le32_to_cpu(*p);
			if (!nr)
				continue;		/* A hole */

			/* Go read the buffer for the next level down */
			bh = sb_bread(inode->i_sb, nr);

			/*
			 * A read failure? Report error and skip this slot
			 * (should be rare).
			 *
			 * NOTE(review): the slot is not actually cleared on
			 * this path; the pointer is left in place.
			 */
			if (!bh) {
				ext3_error(inode->i_sb, "ext3_free_branches",
					   "Read failure, inode=%lu, block="E3FSBLK,
					   inode->i_ino, nr);
				continue;
			}

			/* This zaps the entire block.  Bottom up. */
			BUFFER_TRACE(bh, "free child branches");
			ext3_free_branches(handle, inode, bh,
					   (__le32*)bh->b_data,
					   (__le32*)bh->b_data + addr_per_block,
					   depth);

			/*
			 * We've probably journalled the indirect block several
			 * times during the truncate.  But it's no longer
			 * needed and we now drop it from the transaction via
			 * journal_revoke().
			 *
			 * That's easy if it's exclusively part of this
			 * transaction.  But if it's part of the committing
			 * transaction then journal_forget() will simply
			 * brelse() it.  That means that if the underlying
			 * block is reallocated in ext3_get_block(),
			 * unmap_underlying_metadata() will find this block
			 * and will try to get rid of it.  damn, damn.
			 *
			 * If this block has already been committed to the
			 * journal, a revoke record will be written.  And
			 * revoke records must be emitted *before* clearing
			 * this block's bit in the bitmaps.
			 */
			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);

			/*
			 * Everything below this pointer has been
			 * released.  Now let this top-of-subtree go.
			 *
			 * We want the freeing of this indirect block to be
			 * atomic in the journal with the updating of the
			 * bitmap block which owns it.  So make some room in
			 * the journal.
			 *
			 * We zero the parent pointer *after* freeing its
			 * pointee in the bitmaps, so if extend_transaction()
			 * for some reason fails to put the bitmap changes and
			 * the release into the same transaction, recovery
			 * will merely complain about releasing a free block,
			 * rather than leaking blocks.
			 */
			if (is_handle_aborted(handle))
				return;
			if (try_to_extend_transaction(handle, inode)) {
				ext3_mark_inode_dirty(handle, inode);
				ext3_journal_test_restart(handle, inode);
			}

			ext3_free_blocks(handle, inode, nr, 1);

			if (parent_bh) {
				/*
				 * The block which we have just freed is
				 * pointed to by an indirect block: journal it
				 */
				BUFFER_TRACE(parent_bh, "get_write_access");
				if (!ext3_journal_get_write_access(handle,
								   parent_bh)){
					*p = 0;
					BUFFER_TRACE(parent_bh,
					"call ext3_journal_dirty_metadata");
					ext3_journal_dirty_metadata(handle,
								    parent_bh);
				}
			}
		}
	} else {
		/* We have reached the bottom of the tree. */
		BUFFER_TRACE(parent_bh, "free data blocks");
		ext3_free_data(handle, inode, parent_bh, first, last);
	}
}
2199
/*
 * ext3_truncate()
 *
 * Release every block of @inode that lies beyond the current inode->i_size.
 * Does nothing for inodes that are not regular files, directories or
 * (slow) symlinks, nor for append-only or immutable inodes.
 *
 * We block out ext3_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext3_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext3 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext3_truncate() run will find them and release them.
 */
void ext3_truncate(struct inode *inode)
{
	handle_t *handle;
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *i_data = ei->i_data;
	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	__le32 nr = 0;
	int n;
	long last_block;
	unsigned blocksize = inode->i_sb->s_blocksize;
	struct page *page;

	/* Only regular files, directories and symlinks own a block tree. */
	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
	    S_ISLNK(inode->i_mode)))
		return;
	if (ext3_inode_is_fast_symlink(inode))
		return;
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return;

	/*
	 * We have to lock the EOF page here, because lock_page() nests
	 * outside journal_start().
	 */
	if ((inode->i_size & (blocksize - 1)) == 0) {
		/* Block boundary? Nothing to do */
		page = NULL;
	} else {
		page = grab_cache_page(mapping,
				inode->i_size >> PAGE_CACHE_SHIFT);
		if (!page)
			return;
	}

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/*
		 * No transaction: zero the whole EOF page and drop it
		 * before giving up.
		 */
		if (page) {
			clear_highpage(page);
			flush_dcache_page(page);
			unlock_page(page);
			page_cache_release(page);
		}
		return;		/* AKPM: return what? */
	}

	/* First block index that lies entirely beyond the new size. */
	last_block = (inode->i_size + blocksize-1)
					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);

	if (page)
		ext3_block_truncate_page(handle, page, mapping, inode->i_size);

	/* n is the depth of the path to last_block; 0 signals an error. */
	n = ext3_block_to_path(inode, last_block, offsets, NULL);
	if (n == 0)
		goto out_stop;	/* error */

	/*
	 * OK.  This truncate is going to happen.  We add the inode to the
	 * orphan list, so that if this truncate spans multiple transactions,
	 * and we crash, we will resume the truncate when the filesystem
	 * recovers.  It also marks the inode dirty, to catch the new size.
	 *
	 * Implication: the file must always be in a sane, consistent
	 * truncatable state while each transaction commits.
	 */
	if (ext3_orphan_add(handle, inode))
		goto out_stop;

	/*
	 * The orphan list entry will now protect us from any crash which
	 * occurs before the truncate completes, so it is now safe to propagate
	 * the new, shorter inode size (held for now in i_size) into the
	 * on-disk inode. We do this via i_disksize, which is the value which
	 * ext3 *really* writes onto the disk inode.
	 */
	ei->i_disksize = inode->i_size;

	/*
	 * From here we block out all ext3_get_block() callers who want to
	 * modify the block allocation tree.
	 */
	mutex_lock(&ei->truncate_mutex);

	if (n == 1) {		/* direct blocks */
		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
			       i_data + EXT3_NDIR_BLOCKS);
		goto do_indirects;
	}

	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
	/* Kill the top of shared branch (not detached) */
	if (nr) {
		if (partial == chain) {
			/* Shared branch grows from the inode */
			ext3_free_branches(handle, inode, NULL,
					   &nr, &nr+1, (chain+n-1) - partial);
			*partial->p = 0;
			/*
			 * We mark the inode dirty prior to restart,
			 * and prior to stop.  No need for it here.
			 */
		} else {
			/* Shared branch grows from an indirect block */
			BUFFER_TRACE(partial->bh, "get_write_access");
			ext3_free_branches(handle, inode, partial->bh,
					partial->p,
					partial->p+1, (chain+n-1) - partial);
		}
	}
	/* Clear the ends of indirect blocks on the shared branch */
	while (partial > chain) {
		ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
				   (__le32*)partial->bh->b_data+addr_per_block,
				   (chain+n-1) - partial);
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse (partial->bh);
		partial--;
	}
do_indirects:
	/*
	 * Kill the remaining (whole) subtrees.  The cases deliberately fall
	 * through: starting from the subtree in which the truncation point
	 * lies, every higher-level subtree is freed as well.
	 */
	switch (offsets[0]) {
	default:
		nr = i_data[EXT3_IND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
			i_data[EXT3_IND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_IND_BLOCK:
		nr = i_data[EXT3_DIND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
			i_data[EXT3_DIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_DIND_BLOCK:
		nr = i_data[EXT3_TIND_BLOCK];
		if (nr) {
			ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
			i_data[EXT3_TIND_BLOCK] = 0;
		}
		/* fall through */
	case EXT3_TIND_BLOCK:
		;
	}

	ext3_discard_reservation(inode);

	mutex_unlock(&ei->truncate_mutex);
	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);

	/*
	 * In a multi-transaction truncate, we only make the final transaction
	 * synchronous
	 */
	if (IS_SYNC(inode))
		handle->h_sync = 1;
out_stop:
	/*
	 * If this was a simple ftruncate(), and the file will remain alive
	 * then we need to clear up the orphan record which we created above.
	 * However, if this was a real unlink then we were called by
	 * ext3_delete_inode(), and we allow that function to clean up the
	 * orphan info for us.
	 */
	if (inode->i_nlink)
		ext3_orphan_del(handle, inode);

	ext3_journal_stop(handle);
}
2399
2400static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
2401 unsigned long ino, struct ext3_iloc *iloc)
2402{
2403 unsigned long desc, group_desc, block_group;
2404 unsigned long offset;
2405 ext3_fsblk_t block;
2406 struct buffer_head *bh;
2407 struct ext3_group_desc * gdp;
2408
2409 if (!ext3_valid_inum(sb, ino)) {
2410 /*
2411 * This error is already checked for in namei.c unless we are
2412 * looking at an NFS filehandle, in which case no error
2413 * report is needed
2414 */
2415 return 0;
2416 }
2417
2418 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2419 if (block_group >= EXT3_SB(sb)->s_groups_count) {
2420 ext3_error(sb,"ext3_get_inode_block","group >= groups count");
2421 return 0;
2422 }
2423 smp_rmb();
2424 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2425 desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2426 bh = EXT3_SB(sb)->s_group_desc[group_desc];
2427 if (!bh) {
2428 ext3_error (sb, "ext3_get_inode_block",
2429 "Descriptor not loaded");
2430 return 0;
2431 }
2432
2433 gdp = (struct ext3_group_desc *)bh->b_data;
2434 /*
2435 * Figure out the offset within the block group inode table
2436 */
2437 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2438 EXT3_INODE_SIZE(sb);
2439 block = le32_to_cpu(gdp[desc].bg_inode_table) +
2440 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2441
2442 iloc->block_group = block_group;
2443 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2444 return block;
2445}
2446
/*
 * __ext3_get_inode_loc - find and, if necessary, read an inode's table block
 *
 * Returns 0 with iloc->bh set to the buffer holding the on-disk inode, with
 * an extra refcount held against that buffer_head which the caller must
 * drop.  Returns -EIO if the block cannot be located or read.
 *
 * If 'in_mem' is true, we have all data in memory that is needed to recreate
 * the on-disk version of this inode.  In that case, when the (cached) inode
 * bitmap shows that no other inode in the same block is in use, the disk read
 * is skipped and the buffer is simply zero-filled and marked uptodate.
 */
static int __ext3_get_inode_loc(struct inode *inode,
				struct ext3_iloc *iloc, int in_mem)
{
	ext3_fsblk_t block;
	struct buffer_head *bh;

	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
	if (!block)
		return -EIO;

	bh = sb_getblk(inode->i_sb, block);
	if (!bh) {
		ext3_error (inode->i_sb, "ext3_get_inode_loc",
				"unable to read inode block - "
				"inode=%lu, block="E3FSBLK,
				 inode->i_ino, block);
		return -EIO;
	}
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (buffer_uptodate(bh)) {
			/* someone brought it uptodate while we waited */
			unlock_buffer(bh);
			goto has_buffer;
		}

		/*
		 * If we have all information of the inode in memory and this
		 * is the only valid inode in the block, we need not read the
		 * block.
		 */
		if (in_mem) {
			struct buffer_head *bitmap_bh;
			struct ext3_group_desc *desc;
			int inodes_per_buffer;
			int inode_offset, i;
			int block_group;
			int start;

			block_group = (inode->i_ino - 1) /
					EXT3_INODES_PER_GROUP(inode->i_sb);
			inodes_per_buffer = bh->b_size /
				EXT3_INODE_SIZE(inode->i_sb);
			inode_offset = ((inode->i_ino - 1) %
					EXT3_INODES_PER_GROUP(inode->i_sb));
			/*
			 * Index of the first inode sharing this buffer; the
			 * mask assumes inodes_per_buffer is a power of two.
			 */
			start = inode_offset & ~(inodes_per_buffer - 1);

			/* Is the inode bitmap in cache? */
			desc = ext3_get_group_desc(inode->i_sb,
						block_group, NULL);
			if (!desc)
				goto make_io;

			bitmap_bh = sb_getblk(inode->i_sb,
					le32_to_cpu(desc->bg_inode_bitmap));
			if (!bitmap_bh)
				goto make_io;

			/*
			 * If the inode bitmap isn't in cache then the
			 * optimisation may end up performing two reads instead
			 * of one, so skip it.
			 */
			if (!buffer_uptodate(bitmap_bh)) {
				brelse(bitmap_bh);
				goto make_io;
			}
			/* Look for any other in-use inode in this buffer. */
			for (i = start; i < start + inodes_per_buffer; i++) {
				if (i == inode_offset)
					continue;
				if (ext3_test_bit(i, bitmap_bh->b_data))
					break;
			}
			brelse(bitmap_bh);
			if (i == start + inodes_per_buffer) {
				/* all other inodes are free, so skip I/O */
				memset(bh->b_data, 0, bh->b_size);
				set_buffer_uptodate(bh);
				unlock_buffer(bh);
				goto has_buffer;
			}
		}

make_io:
		/*
		 * There are other valid inodes in the buffer, this inode
		 * has in-inode xattrs, or we don't have this inode in memory.
		 * Read the block from disk.  The buffer is still locked here.
		 */
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ_META, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			ext3_error(inode->i_sb, "ext3_get_inode_loc",
					"unable to read inode block - "
					"inode=%lu, block="E3FSBLK,
					inode->i_ino, block);
			brelse(bh);
			return -EIO;
		}
	}
has_buffer:
	iloc->bh = bh;
	return 0;
}
2559
2560int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2561{
2562 /* We have all inode data except xattrs in memory here. */
2563 return __ext3_get_inode_loc(inode, iloc,
2564 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2565}
2566
2567void ext3_set_inode_flags(struct inode *inode)
2568{
2569 unsigned int flags = EXT3_I(inode)->i_flags;
2570
2571 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2572 if (flags & EXT3_SYNC_FL)
2573 inode->i_flags |= S_SYNC;
2574 if (flags & EXT3_APPEND_FL)
2575 inode->i_flags |= S_APPEND;
2576 if (flags & EXT3_IMMUTABLE_FL)
2577 inode->i_flags |= S_IMMUTABLE;
2578 if (flags & EXT3_NOATIME_FL)
2579 inode->i_flags |= S_NOATIME;
2580 if (flags & EXT3_DIRSYNC_FL)
2581 inode->i_flags |= S_DIRSYNC;
2582}
2583
2584void ext3_read_inode(struct inode * inode)
2585{
2586 struct ext3_iloc iloc;
2587 struct ext3_inode *raw_inode;
2588 struct ext3_inode_info *ei = EXT3_I(inode);
2589 struct buffer_head *bh;
2590 int block;
2591
2592#ifdef CONFIG_EXT3_FS_POSIX_ACL
2593 ei->i_acl = EXT3_ACL_NOT_CACHED;
2594 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2595#endif
2596 ei->i_block_alloc_info = NULL;
2597
2598 if (__ext3_get_inode_loc(inode, &iloc, 0))
2599 goto bad_inode;
2600 bh = iloc.bh;
2601 raw_inode = ext3_raw_inode(&iloc);
2602 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2603 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2604 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2605 if(!(test_opt (inode->i_sb, NO_UID32))) {
2606 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2607 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2608 }
2609 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2610 inode->i_size = le32_to_cpu(raw_inode->i_size);
2611 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2612 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2613 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2614 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2615
2616 ei->i_state = 0;
2617 ei->i_dir_start_lookup = 0;
2618 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2619 /* We now have enough fields to check if the inode was active or not.
2620 * This is needed because nfsd might try to access dead inodes
2621 * the test is that same one that e2fsck uses
2622 * NeilBrown 1999oct15
2623 */
2624 if (inode->i_nlink == 0) {
2625 if (inode->i_mode == 0 ||
2626 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2627 /* this inode is deleted */
2628 brelse (bh);
2629 goto bad_inode;
2630 }
2631 /* The only unlinked inodes we let through here have
2632 * valid i_mode and are being read by the orphan
2633 * recovery code: that's fine, we're about to complete
2634 * the process of deleting those. */
2635 }
2636 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2637 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2638#ifdef EXT3_FRAGMENTS
2639 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2640 ei->i_frag_no = raw_inode->i_frag;
2641 ei->i_frag_size = raw_inode->i_fsize;
2642#endif
2643 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2644 if (!S_ISREG(inode->i_mode)) {
2645 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2646 } else {
2647 inode->i_size |=
2648 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2649 }
2650 ei->i_disksize = inode->i_size;
2651 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2652 ei->i_block_group = iloc.block_group;
2653 /*
2654 * NOTE! The in-memory inode i_data array is in little-endian order
2655 * even on big-endian machines: we do NOT byteswap the block numbers!
2656 */
2657 for (block = 0; block < EXT3_N_BLOCKS; block++)
2658 ei->i_data[block] = raw_inode->i_block[block];
2659 INIT_LIST_HEAD(&ei->i_orphan);
2660
2661 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2662 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2663 /*
2664 * When mke2fs creates big inodes it does not zero out
2665 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2666 * so ignore those first few inodes.
2667 */
2668 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2669 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2670 EXT3_INODE_SIZE(inode->i_sb))
2671 goto bad_inode;
2672 if (ei->i_extra_isize == 0) {
2673 /* The extra space is currently unused. Use it. */
2674 ei->i_extra_isize = sizeof(struct ext3_inode) -
2675 EXT3_GOOD_OLD_INODE_SIZE;
2676 } else {
2677 __le32 *magic = (void *)raw_inode +
2678 EXT3_GOOD_OLD_INODE_SIZE +
2679 ei->i_extra_isize;
2680 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2681 ei->i_state |= EXT3_STATE_XATTR;
2682 }
2683 } else
2684 ei->i_extra_isize = 0;
2685
2686 if (S_ISREG(inode->i_mode)) {
2687 inode->i_op = &ext3_file_inode_operations;
2688 inode->i_fop = &ext3_file_operations;
2689 ext3_set_aops(inode);
2690 } else if (S_ISDIR(inode->i_mode)) {
2691 inode->i_op = &ext3_dir_inode_operations;
2692 inode->i_fop = &ext3_dir_operations;
2693 } else if (S_ISLNK(inode->i_mode)) {
2694 if (ext3_inode_is_fast_symlink(inode))
2695 inode->i_op = &ext3_fast_symlink_inode_operations;
2696 else {
2697 inode->i_op = &ext3_symlink_inode_operations;
2698 ext3_set_aops(inode);
2699 }
2700 } else {
2701 inode->i_op = &ext3_special_inode_operations;
2702 if (raw_inode->i_block[0])
2703 init_special_inode(inode, inode->i_mode,
2704 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2705 else
2706 init_special_inode(inode, inode->i_mode,
2707 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2708 }
2709 brelse (iloc.bh);
2710 ext3_set_inode_flags(inode);
2711 return;
2712
2713bad_inode:
2714 make_bad_inode(inode);
2715 return;
2716}
2717
2718/*
2719 * Post the struct inode info into an on-disk inode location in the
2720 * buffer-cache. This gobbles the caller's reference to the
2721 * buffer_head in the inode location struct.
2722 *
2723 * The caller must have write access to iloc->bh.
2724 */
2725static int ext3_do_update_inode(handle_t *handle,
2726 struct inode *inode,
2727 struct ext3_iloc *iloc)
2728{
2729 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2730 struct ext3_inode_info *ei = EXT3_I(inode);
2731 struct buffer_head *bh = iloc->bh;
2732 int err = 0, rc, block;
2733
2734 /* For fields not not tracking in the in-memory inode,
2735 * initialise them to zero for new inodes. */
2736 if (ei->i_state & EXT3_STATE_NEW)
2737 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2738
2739 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2740 if(!(test_opt(inode->i_sb, NO_UID32))) {
2741 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2742 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2743/*
2744 * Fix up interoperability with old kernels. Otherwise, old inodes get
2745 * re-used with the upper 16 bits of the uid/gid intact
2746 */
2747 if(!ei->i_dtime) {
2748 raw_inode->i_uid_high =
2749 cpu_to_le16(high_16_bits(inode->i_uid));
2750 raw_inode->i_gid_high =
2751 cpu_to_le16(high_16_bits(inode->i_gid));
2752 } else {
2753 raw_inode->i_uid_high = 0;
2754 raw_inode->i_gid_high = 0;
2755 }
2756 } else {
2757 raw_inode->i_uid_low =
2758 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2759 raw_inode->i_gid_low =
2760 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2761 raw_inode->i_uid_high = 0;
2762 raw_inode->i_gid_high = 0;
2763 }
2764 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2765 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2766 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2767 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2768 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2769 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2770 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2771 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2772#ifdef EXT3_FRAGMENTS
2773 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2774 raw_inode->i_frag = ei->i_frag_no;
2775 raw_inode->i_fsize = ei->i_frag_size;
2776#endif
2777 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2778 if (!S_ISREG(inode->i_mode)) {
2779 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2780 } else {
2781 raw_inode->i_size_high =
2782 cpu_to_le32(ei->i_disksize >> 32);
2783 if (ei->i_disksize > 0x7fffffffULL) {
2784 struct super_block *sb = inode->i_sb;
2785 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2786 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2787 EXT3_SB(sb)->s_es->s_rev_level ==
2788 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2789 /* If this is the first large file
2790 * created, add a flag to the superblock.
2791 */
2792 err = ext3_journal_get_write_access(handle,
2793 EXT3_SB(sb)->s_sbh);
2794 if (err)
2795 goto out_brelse;
2796 ext3_update_dynamic_rev(sb);
2797 EXT3_SET_RO_COMPAT_FEATURE(sb,
2798 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2799 sb->s_dirt = 1;
2800 handle->h_sync = 1;
2801 err = ext3_journal_dirty_metadata(handle,
2802 EXT3_SB(sb)->s_sbh);
2803 }
2804 }
2805 }
2806 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2807 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2808 if (old_valid_dev(inode->i_rdev)) {
2809 raw_inode->i_block[0] =
2810 cpu_to_le32(old_encode_dev(inode->i_rdev));
2811 raw_inode->i_block[1] = 0;
2812 } else {
2813 raw_inode->i_block[0] = 0;
2814 raw_inode->i_block[1] =
2815 cpu_to_le32(new_encode_dev(inode->i_rdev));
2816 raw_inode->i_block[2] = 0;
2817 }
2818 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2819 raw_inode->i_block[block] = ei->i_data[block];
2820
2821 if (ei->i_extra_isize)
2822 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2823
2824 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2825 rc = ext3_journal_dirty_metadata(handle, bh);
2826 if (!err)
2827 err = rc;
2828 ei->i_state &= ~EXT3_STATE_NEW;
2829
2830out_brelse:
2831 brelse (bh);
2832 ext3_std_error(inode->i_sb, err);
2833 return err;
2834}
2835
2836/*
2837 * ext3_write_inode()
2838 *
2839 * We are called from a few places:
2840 *
2841 * - Within generic_file_write() for O_SYNC files.
2842 * Here, there will be no transaction running. We wait for any running
2843 * trasnaction to commit.
2844 *
2845 * - Within sys_sync(), kupdate and such.
2846 * We wait on commit, if tol to.
2847 *
2848 * - Within prune_icache() (PF_MEMALLOC == true)
2849 * Here we simply return. We can't afford to block kswapd on the
2850 * journal commit.
2851 *
2852 * In all cases it is actually safe for us to return without doing anything,
2853 * because the inode has been copied into a raw inode buffer in
2854 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2855 * knfsd.
2856 *
2857 * Note that we are absolutely dependent upon all inode dirtiers doing the
2858 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2859 * which we are interested.
2860 *
2861 * It would be a bug for them to not do this. The code:
2862 *
2863 * mark_inode_dirty(inode)
2864 * stuff();
2865 * inode->i_size = expr;
2866 *
2867 * is in error because a kswapd-driven write_inode() could occur while
2868 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2869 * will no longer be on the superblock's dirty inode list.
2870 */
2871int ext3_write_inode(struct inode *inode, int wait)
2872{
2873 if (current->flags & PF_MEMALLOC)
2874 return 0;
2875
2876 if (ext3_journal_current_handle()) {
2877 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2878 dump_stack();
2879 return -EIO;
2880 }
2881
2882 if (!wait)
2883 return 0;
2884
2885 return ext3_force_commit(inode->i_sb);
2886}
2887
/*
 * ext3_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible.  In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk.  (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
 * Called with inode->sem down.
 *
 * Returns 0 on success or a negative errno.  An error from the journalled
 * steps takes precedence over one from inode_setattr()/ext3_acl_chmod().
 */
int ext3_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error, rc = 0;
	const unsigned int ia_valid = attr->ia_valid;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/* Ownership change: transfer quota and update ids in one handle. */
	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		handle_t *handle;

		/* (user+group)*(old+new) structure, inode write (sb,
		 * inode block, ? - but truncate inode update has it) */
		handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
					EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}
		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
		if (error) {
			/*
			 * Quota failure: return directly, bypassing
			 * ext3_std_error() at err_out.
			 */
			ext3_journal_stop(handle);
			return error;
		}
		/* Update corresponding info in inode so that everything is in
		 * one transaction */
		if (attr->ia_valid & ATTR_UID)
			inode->i_uid = attr->ia_uid;
		if (attr->ia_valid & ATTR_GID)
			inode->i_gid = attr->ia_gid;
		error = ext3_mark_inode_dirty(handle, inode);
		ext3_journal_stop(handle);
	}

	/* Shrinking a regular file: orphan it and commit the new disksize. */
	if (S_ISREG(inode->i_mode) &&
	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
		handle_t *handle;

		handle = ext3_journal_start(inode, 3);
		if (IS_ERR(handle)) {
			error = PTR_ERR(handle);
			goto err_out;
		}

		/*
		 * NOTE(review): this assignment overwrites any error left in
		 * `error' by ext3_mark_inode_dirty() in the ownership branch.
		 */
		error = ext3_orphan_add(handle, inode);
		EXT3_I(inode)->i_disksize = attr->ia_size;
		rc = ext3_mark_inode_dirty(handle, inode);
		if (!error)
			error = rc;
		ext3_journal_stop(handle);
	}

	rc = inode_setattr(inode, attr);

	/* If inode_setattr's call to ext3_truncate failed to get a
	 * transaction handle at all, we need to clean up the in-core
	 * orphan list manually. */
	if (inode->i_nlink)
		ext3_orphan_del(NULL, inode);

	if (!rc && (ia_valid & ATTR_MODE))
		rc = ext3_acl_chmod(inode);

err_out:
	ext3_std_error(inode->i_sb, error);
	/* Report the earlier journalling error first, else rc. */
	if (!error)
		error = rc;
	return error;
}
2977
2978
2979/*
2980 * How many blocks doth make a writepage()?
2981 *
2982 * With N blocks per page, it may be:
2983 * N data blocks
2984 * 2 indirect block
2985 * 2 dindirect
2986 * 1 tindirect
2987 * N+5 bitmap blocks (from the above)
2988 * N+5 group descriptor summary blocks
2989 * 1 inode block
2990 * 1 superblock.
2991 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2992 *
2993 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2994 *
2995 * With ordered or writeback data it's the same, less the N data blocks.
2996 *
2997 * If the inode's direct blocks can hold an integral number of pages then a
2998 * page cannot straddle two indirect blocks, and we can only touch one indirect
2999 * and dindirect block, and the "5" above becomes "3".
3000 *
3001 * This still overestimates under most circumstances. If we were to pass the
3002 * start and end offsets in here as well we could do block_to_path() on each
3003 * block and work out the exact number of indirects which are touched. Pah.
3004 */
3005
3006static int ext3_writepage_trans_blocks(struct inode *inode)
3007{
3008 int bpp = ext3_journal_blocks_per_page(inode);
3009 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3010 int ret;
3011
3012 if (ext3_should_journal_data(inode))
3013 ret = 3 * (bpp + indirects) + 2;
3014 else
3015 ret = 2 * (bpp + indirects) + 2;
3016
3017#ifdef CONFIG_QUOTA
3018 /* We know that structure was already allocated during DQUOT_INIT so
3019 * we will be updating only the data blocks + inodes */
3020 ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
3021#endif
3022
3023 return ret;
3024}
3025
3026/*
3027 * The caller must have previously called ext3_reserve_inode_write().
3028 * Given this, we know that the caller already has write access to iloc->bh.
3029 */
3030int ext3_mark_iloc_dirty(handle_t *handle,
3031 struct inode *inode, struct ext3_iloc *iloc)
3032{
3033 int err = 0;
3034
3035 /* the do_update_inode consumes one bh->b_count */
3036 get_bh(iloc->bh);
3037
3038 /* ext3_do_update_inode() does journal_dirty_metadata */
3039 err = ext3_do_update_inode(handle, inode, iloc);
3040 put_bh(iloc->bh);
3041 return err;
3042}
3043
3044/*
3045 * On success, We end up with an outstanding reference count against
3046 * iloc->bh. This _must_ be cleaned up later.
3047 */
3048
3049int
3050ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3051 struct ext3_iloc *iloc)
3052{
3053 int err = 0;
3054 if (handle) {
3055 err = ext3_get_inode_loc(inode, iloc);
3056 if (!err) {
3057 BUFFER_TRACE(iloc->bh, "get_write_access");
3058 err = ext3_journal_get_write_access(handle, iloc->bh);
3059 if (err) {
3060 brelse(iloc->bh);
3061 iloc->bh = NULL;
3062 }
3063 }
3064 }
3065 ext3_std_error(inode->i_sb, err);
3066 return err;
3067}
3068
3069/*
3070 * What we do here is to mark the in-core inode as clean with respect to inode
3071 * dirtiness (it may still be data-dirty).
3072 * This means that the in-core inode may be reaped by prune_icache
3073 * without having to perform any I/O. This is a very good thing,
3074 * because *any* task may call prune_icache - even ones which
3075 * have a transaction open against a different journal.
3076 *
3077 * Is this cheating? Not really. Sure, we haven't written the
3078 * inode out, but prune_icache isn't a user-visible syncing function.
3079 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3080 * we start and wait on commits.
3081 *
3082 * Is this efficient/effective? Well, we're being nice to the system
3083 * by cleaning up our inodes proactively so they can be reaped
3084 * without I/O. But we are potentially leaving up to five seconds'
3085 * worth of inodes floating about which prune_icache wants us to
3086 * write out. One way to fix that would be to get prune_icache()
3087 * to do a write_super() to free up some memory. It has the desired
3088 * effect.
3089 */
3090int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3091{
3092 struct ext3_iloc iloc;
3093 int err;
3094
3095 might_sleep();
3096 err = ext3_reserve_inode_write(handle, inode, &iloc);
3097 if (!err)
3098 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3099 return err;
3100}
3101
3102/*
3103 * ext3_dirty_inode() is called from __mark_inode_dirty()
3104 *
3105 * We're really interested in the case where a file is being extended.
3106 * i_size has been changed by generic_commit_write() and we thus need
3107 * to include the updated inode in the current transaction.
3108 *
3109 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3110 * are allocated to the file.
3111 *
3112 * If the inode is marked synchronous, we don't honour that here - doing
3113 * so would cause a commit on atime updates, which we don't bother doing.
3114 * We handle synchronous inodes at the highest possible level.
3115 */
3116void ext3_dirty_inode(struct inode *inode)
3117{
3118 handle_t *current_handle = ext3_journal_current_handle();
3119 handle_t *handle;
3120
3121 handle = ext3_journal_start(inode, 2);
3122 if (IS_ERR(handle))
3123 goto out;
3124 if (current_handle &&
3125 current_handle->h_transaction != handle->h_transaction) {
3126 /* This task has a transaction open against a different fs */
3127 printk(KERN_EMERG "%s: transactions do not match!\n",
3128 __FUNCTION__);
3129 } else {
3130 jbd_debug(5, "marking dirty. outer handle=%p\n",
3131 current_handle);
3132 ext3_mark_inode_dirty(handle, inode);
3133 }
3134 ext3_journal_stop(handle);
3135out:
3136 return;
3137}
3138
#if 0
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 */
static int ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;
	int err = 0;

	if (!handle)
		goto out;

	err = ext3_get_inode_loc(inode, &iloc);
	if (err)
		goto out;

	BUFFER_TRACE(iloc.bh, "get_write_access");
	err = journal_get_write_access(handle, iloc.bh);
	if (!err)
		err = ext3_journal_dirty_metadata(handle,
						  iloc.bh);
	/* The reference from the lookup is dropped in every case. */
	brelse(iloc.bh);
out:
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
3167
3168int ext3_change_inode_journal_flag(struct inode *inode, int val)
3169{
3170 journal_t *journal;
3171 handle_t *handle;
3172 int err;
3173
3174 /*
3175 * We have to be very careful here: changing a data block's
3176 * journaling status dynamically is dangerous. If we write a
3177 * data block to the journal, change the status and then delete
3178 * that block, we risk forgetting to revoke the old log record
3179 * from the journal and so a subsequent replay can corrupt data.
3180 * So, first we make sure that the journal is empty and that
3181 * nobody is changing anything.
3182 */
3183
3184 journal = EXT3_JOURNAL(inode);
3185 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3186 return -EROFS;
3187
3188 journal_lock_updates(journal);
3189 journal_flush(journal);
3190
3191 /*
3192 * OK, there are no updates running now, and all cached data is
3193 * synced to disk. We are now in a completely consistent state
3194 * which doesn't have anything in the journal, and we know that
3195 * no filesystem updates are running, so it is safe to modify
3196 * the inode's in-core data-journaling state flag now.
3197 */
3198
3199 if (val)
3200 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3201 else
3202 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3203 ext3_set_aops(inode);
3204
3205 journal_unlock_updates(journal);
3206
3207 /* Finally we can mark the inode as dirty. */
3208
3209 handle = ext3_journal_start(inode, 1);
3210 if (IS_ERR(handle))
3211 return PTR_ERR(handle);
3212
3213 err = ext3_mark_inode_dirty(handle, inode);
3214 handle->h_sync = 1;
3215 ext3_journal_stop(handle);
3216 ext3_std_error(inode->i_sb, err);
3217
3218 return err;
3219}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 000000000000..12daa6869572
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,307 @@
1/*
2 * linux/fs/ext3/ioctl.c
3 *
4 * Copyright (C) 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 */
9
10#include <linux/fs.h>
11#include <linux/jbd.h>
12#include <linux/capability.h>
13#include <linux/ext3_fs.h>
14#include <linux/ext3_jbd.h>
15#include <linux/time.h>
16#include <linux/compat.h>
17#include <linux/smp_lock.h>
18#include <asm/uaccess.h>
19
20int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
21 unsigned long arg)
22{
23 struct ext3_inode_info *ei = EXT3_I(inode);
24 unsigned int flags;
25 unsigned short rsv_window_size;
26
27 ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
28
29 switch (cmd) {
30 case EXT3_IOC_GETFLAGS:
31 flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
32 return put_user(flags, (int __user *) arg);
33 case EXT3_IOC_SETFLAGS: {
34 handle_t *handle = NULL;
35 int err;
36 struct ext3_iloc iloc;
37 unsigned int oldflags;
38 unsigned int jflag;
39
40 if (IS_RDONLY(inode))
41 return -EROFS;
42
43 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
44 return -EACCES;
45
46 if (get_user(flags, (int __user *) arg))
47 return -EFAULT;
48
49 if (!S_ISDIR(inode->i_mode))
50 flags &= ~EXT3_DIRSYNC_FL;
51
52 mutex_lock(&inode->i_mutex);
53 oldflags = ei->i_flags;
54
55 /* The JOURNAL_DATA flag is modifiable only by root */
56 jflag = flags & EXT3_JOURNAL_DATA_FL;
57
58 /*
59 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
60 * the relevant capability.
61 *
62 * This test looks nicer. Thanks to Pauline Middelink
63 */
64 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
65 if (!capable(CAP_LINUX_IMMUTABLE)) {
66 mutex_unlock(&inode->i_mutex);
67 return -EPERM;
68 }
69 }
70
71 /*
72 * The JOURNAL_DATA flag can only be changed by
73 * the relevant capability.
74 */
75 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
76 if (!capable(CAP_SYS_RESOURCE)) {
77 mutex_unlock(&inode->i_mutex);
78 return -EPERM;
79 }
80 }
81
82
83 handle = ext3_journal_start(inode, 1);
84 if (IS_ERR(handle)) {
85 mutex_unlock(&inode->i_mutex);
86 return PTR_ERR(handle);
87 }
88 if (IS_SYNC(inode))
89 handle->h_sync = 1;
90 err = ext3_reserve_inode_write(handle, inode, &iloc);
91 if (err)
92 goto flags_err;
93
94 flags = flags & EXT3_FL_USER_MODIFIABLE;
95 flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
96 ei->i_flags = flags;
97
98 ext3_set_inode_flags(inode);
99 inode->i_ctime = CURRENT_TIME_SEC;
100
101 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
102flags_err:
103 ext3_journal_stop(handle);
104 if (err) {
105 mutex_unlock(&inode->i_mutex);
106 return err;
107 }
108
109 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
110 err = ext3_change_inode_journal_flag(inode, jflag);
111 mutex_unlock(&inode->i_mutex);
112 return err;
113 }
114 case EXT3_IOC_GETVERSION:
115 case EXT3_IOC_GETVERSION_OLD:
116 return put_user(inode->i_generation, (int __user *) arg);
117 case EXT3_IOC_SETVERSION:
118 case EXT3_IOC_SETVERSION_OLD: {
119 handle_t *handle;
120 struct ext3_iloc iloc;
121 __u32 generation;
122 int err;
123
124 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
125 return -EPERM;
126 if (IS_RDONLY(inode))
127 return -EROFS;
128 if (get_user(generation, (int __user *) arg))
129 return -EFAULT;
130
131 handle = ext3_journal_start(inode, 1);
132 if (IS_ERR(handle))
133 return PTR_ERR(handle);
134 err = ext3_reserve_inode_write(handle, inode, &iloc);
135 if (err == 0) {
136 inode->i_ctime = CURRENT_TIME_SEC;
137 inode->i_generation = generation;
138 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
139 }
140 ext3_journal_stop(handle);
141 return err;
142 }
143#ifdef CONFIG_JBD_DEBUG
144 case EXT3_IOC_WAIT_FOR_READONLY:
145 /*
146 * This is racy - by the time we're woken up and running,
147 * the superblock could be released. And the module could
148 * have been unloaded. So sue me.
149 *
150 * Returns 1 if it slept, else zero.
151 */
152 {
153 struct super_block *sb = inode->i_sb;
154 DECLARE_WAITQUEUE(wait, current);
155 int ret = 0;
156
157 set_current_state(TASK_INTERRUPTIBLE);
158 add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
159 if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
160 schedule();
161 ret = 1;
162 }
163 remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
164 return ret;
165 }
166#endif
167 case EXT3_IOC_GETRSVSZ:
168 if (test_opt(inode->i_sb, RESERVATION)
169 && S_ISREG(inode->i_mode)
170 && ei->i_block_alloc_info) {
171 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
172 return put_user(rsv_window_size, (int __user *)arg);
173 }
174 return -ENOTTY;
175 case EXT3_IOC_SETRSVSZ: {
176
177 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
178 return -ENOTTY;
179
180 if (IS_RDONLY(inode))
181 return -EROFS;
182
183 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
184 return -EACCES;
185
186 if (get_user(rsv_window_size, (int __user *)arg))
187 return -EFAULT;
188
189 if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
190 rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
191
192 /*
193 * need to allocate reservation structure for this inode
194 * before set the window size
195 */
196 mutex_lock(&ei->truncate_mutex);
197 if (!ei->i_block_alloc_info)
198 ext3_init_block_alloc_info(inode);
199
200 if (ei->i_block_alloc_info){
201 struct ext3_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
202 rsv->rsv_goal_size = rsv_window_size;
203 }
204 mutex_unlock(&ei->truncate_mutex);
205 return 0;
206 }
207 case EXT3_IOC_GROUP_EXTEND: {
208 ext3_fsblk_t n_blocks_count;
209 struct super_block *sb = inode->i_sb;
210 int err;
211
212 if (!capable(CAP_SYS_RESOURCE))
213 return -EPERM;
214
215 if (IS_RDONLY(inode))
216 return -EROFS;
217
218 if (get_user(n_blocks_count, (__u32 __user *)arg))
219 return -EFAULT;
220
221 err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
222 journal_lock_updates(EXT3_SB(sb)->s_journal);
223 journal_flush(EXT3_SB(sb)->s_journal);
224 journal_unlock_updates(EXT3_SB(sb)->s_journal);
225
226 return err;
227 }
228 case EXT3_IOC_GROUP_ADD: {
229 struct ext3_new_group_data input;
230 struct super_block *sb = inode->i_sb;
231 int err;
232
233 if (!capable(CAP_SYS_RESOURCE))
234 return -EPERM;
235
236 if (IS_RDONLY(inode))
237 return -EROFS;
238
239 if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
240 sizeof(input)))
241 return -EFAULT;
242
243 err = ext3_group_add(sb, &input);
244 journal_lock_updates(EXT3_SB(sb)->s_journal);
245 journal_flush(EXT3_SB(sb)->s_journal);
246 journal_unlock_updates(EXT3_SB(sb)->s_journal);
247
248 return err;
249 }
250
251
252 default:
253 return -ENOTTY;
254 }
255}
256
257#ifdef CONFIG_COMPAT
258long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
259{
260 struct inode *inode = file->f_dentry->d_inode;
261 int ret;
262
263 /* These are just misnamed, they actually get/put from/to user an int */
264 switch (cmd) {
265 case EXT3_IOC32_GETFLAGS:
266 cmd = EXT3_IOC_GETFLAGS;
267 break;
268 case EXT3_IOC32_SETFLAGS:
269 cmd = EXT3_IOC_SETFLAGS;
270 break;
271 case EXT3_IOC32_GETVERSION:
272 cmd = EXT3_IOC_GETVERSION;
273 break;
274 case EXT3_IOC32_SETVERSION:
275 cmd = EXT3_IOC_SETVERSION;
276 break;
277 case EXT3_IOC32_GROUP_EXTEND:
278 cmd = EXT3_IOC_GROUP_EXTEND;
279 break;
280 case EXT3_IOC32_GETVERSION_OLD:
281 cmd = EXT3_IOC_GETVERSION_OLD;
282 break;
283 case EXT3_IOC32_SETVERSION_OLD:
284 cmd = EXT3_IOC_SETVERSION_OLD;
285 break;
286#ifdef CONFIG_JBD_DEBUG
287 case EXT3_IOC32_WAIT_FOR_READONLY:
288 cmd = EXT3_IOC_WAIT_FOR_READONLY;
289 break;
290#endif
291 case EXT3_IOC32_GETRSVSZ:
292 cmd = EXT3_IOC_GETRSVSZ;
293 break;
294 case EXT3_IOC32_SETRSVSZ:
295 cmd = EXT3_IOC_SETRSVSZ;
296 break;
297 case EXT3_IOC_GROUP_ADD:
298 break;
299 default:
300 return -ENOIOCTLCMD;
301 }
302 lock_kernel();
303 ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
304 unlock_kernel();
305 return ret;
306}
307#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 000000000000..906731a20f1a
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2397 @@
1/*
2 * linux/fs/ext3/namei.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/namei.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002
25 */
26
27#include <linux/fs.h>
28#include <linux/pagemap.h>
29#include <linux/jbd.h>
30#include <linux/time.h>
31#include <linux/ext3_fs.h>
32#include <linux/ext3_jbd.h>
33#include <linux/fcntl.h>
34#include <linux/stat.h>
35#include <linux/string.h>
36#include <linux/quotaops.h>
37#include <linux/buffer_head.h>
38#include <linux/bio.h>
39#include <linux/smp_lock.h>
40
41#include "namei.h"
42#include "xattr.h"
43#include "acl.h"
44
45/*
46 * define how far ahead to read directories while searching them.
47 */
48#define NAMEI_RA_CHUNKS 2
49#define NAMEI_RA_BLOCKS 4
50#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
51#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
52
53static struct buffer_head *ext3_append(handle_t *handle,
54 struct inode *inode,
55 u32 *block, int *err)
56{
57 struct buffer_head *bh;
58
59 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
60
61 if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
62 inode->i_size += inode->i_sb->s_blocksize;
63 EXT3_I(inode)->i_disksize = inode->i_size;
64 ext3_journal_get_write_access(handle,bh);
65 }
66 return bh;
67}
68
/* Map assert() onto the journalling layer's assertion if nobody defined it. */
#ifndef assert
#define assert(test) J_ASSERT(test)
#endif

/*
 * Exchange two lvalues of the same type.  The temporary has a name that
 * cannot collide with an argument (the old "z" broke swap(z, w)), and
 * the arguments are parenthesised so that expressions such as *(p + 1)
 * expand safely.
 */
#ifndef swap
#define swap(x, y) \
	do { __typeof__(x) __swap_tmp = (x); (x) = (y); (y) = __swap_tmp; } while (0)
#endif

/* dxtrace(cmd) expands to cmd only when DX_DEBUG is defined. */
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif
82
/*
 * Fixed-size directory-entry header without a name.  dx_root and dx_node
 * embed it in front of their index entries.
 * NOTE(review): presumably this mirrors the leading fields of
 * struct ext3_dir_entry_2 so that index blocks look like ordinary
 * directory blocks - confirm against ext3_fs.h.
 */
struct fake_dirent
{
	__le32 inode;		/* little-endian inode number */
	__le16 rec_len;		/* little-endian record length */
	u8 name_len;
	u8 file_type;
};
90
/*
 * Header overlaid on the first dx_entry slot of an index block: the
 * dx_get/set_count and dx_get/set_limit helpers cast the entries pointer
 * to this type.  Both fields are little-endian on disk.
 */
struct dx_countlimit
{
	__le16 limit;	/* value returned by dx_get_limit() */
	__le16 count;	/* value returned by dx_get_count() */
};
96
/*
 * One index entry: a hash value and the directory block it refers to.
 * Both fields are little-endian; use the dx_get_ and dx_set_ helpers to
 * access them.
 */
struct dx_entry
{
	__le32 hash;
	__le32 block;	/* dx_get_block() masks off the high bits */
};
102
103/*
104 * dx_root_info is laid out so that if it should somehow get overlaid by a
105 * dirent the two low bits of the hash version will be zero. Therefore, the
106 * hash version mod 4 should never be 0. Sincerely, the paranoia department.
107 */
108
/*
 * Layout of directory block 0 of an indexed directory as read by
 * dx_probe(): two name-less dirent headers with their 4-byte name
 * fields, the index info header, then the index entries.
 */
struct dx_root
{
	struct fake_dirent dot;
	char dot_name[4];
	struct fake_dirent dotdot;
	char dotdot_name[4];
	struct dx_root_info
	{
		__le32 reserved_zero;
		u8 hash_version;	/* checked against DX_HASH_* in dx_probe() */
		u8 info_length; /* 8 */
		u8 indirect_levels;	/* dx_probe() rejects values above 1 */
		u8 unused_flags;	/* dx_probe() rejects the block if bit 0 is set */
	}
	info;
	/* dx_probe() locates the entries at &info + info.info_length */
	struct dx_entry	entries[0];
};
126
/*
 * Layout of a non-root index block: one name-less dirent header
 * followed directly by the index entries.
 */
struct dx_node
{
	struct fake_dirent fake;
	struct dx_entry	entries[0];
};
132
133
/*
 * In-core cursor for one level of the index, filled in by dx_probe()
 * and advanced by ext3_htree_next_block().
 */
struct dx_frame
{
	struct buffer_head *bh;		/* buffer holding this index block */
	struct dx_entry *entries;	/* first entry in that block */
	struct dx_entry *at;		/* entry currently selected */
};
140
/*
 * Element of the map handled by dx_make_map(), dx_sort_map() and
 * dx_move_dirents().
 * NOTE(review): presumably "offs" is the offset of a directory entry
 * within its block and "hash" the hash of its name - confirm against
 * dx_make_map().
 */
struct dx_map_entry
{
	u32 hash;
	u32 offs;
};
146
147#ifdef CONFIG_EXT3_INDEX
148static inline unsigned dx_get_block (struct dx_entry *entry);
149static void dx_set_block (struct dx_entry *entry, unsigned value);
150static inline unsigned dx_get_hash (struct dx_entry *entry);
151static void dx_set_hash (struct dx_entry *entry, unsigned value);
152static unsigned dx_get_count (struct dx_entry *entries);
153static unsigned dx_get_limit (struct dx_entry *entries);
154static void dx_set_count (struct dx_entry *entries, unsigned value);
155static void dx_set_limit (struct dx_entry *entries, unsigned value);
156static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
157static unsigned dx_node_limit (struct inode *dir);
158static struct dx_frame *dx_probe(struct dentry *dentry,
159 struct inode *dir,
160 struct dx_hash_info *hinfo,
161 struct dx_frame *frame,
162 int *err);
163static void dx_release (struct dx_frame *frames);
164static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
165 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
166static void dx_sort_map(struct dx_map_entry *map, unsigned count);
167static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
168 struct dx_map_entry *offsets, int count);
169static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
170static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
171static int ext3_htree_next_block(struct inode *dir, __u32 hash,
172 struct dx_frame *frame,
173 struct dx_frame *frames,
174 __u32 *start_hash);
175static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
176 struct ext3_dir_entry_2 **res_dir, int *err);
177static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
178 struct inode *inode);
179
180/*
181 * Future: use high four bits of block for coalesce-on-delete flags
182 * Mask them off for now.
183 */
184
185static inline unsigned dx_get_block (struct dx_entry *entry)
186{
187 return le32_to_cpu(entry->block) & 0x00ffffff;
188}
189
190static inline void dx_set_block (struct dx_entry *entry, unsigned value)
191{
192 entry->block = cpu_to_le32(value);
193}
194
195static inline unsigned dx_get_hash (struct dx_entry *entry)
196{
197 return le32_to_cpu(entry->hash);
198}
199
200static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
201{
202 entry->hash = cpu_to_le32(value);
203}
204
205static inline unsigned dx_get_count (struct dx_entry *entries)
206{
207 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
208}
209
210static inline unsigned dx_get_limit (struct dx_entry *entries)
211{
212 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
213}
214
215static inline void dx_set_count (struct dx_entry *entries, unsigned value)
216{
217 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
218}
219
220static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
221{
222 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
223}
224
225static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
226{
227 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
228 EXT3_DIR_REC_LEN(2) - infosize;
229 return 0? 20: entry_space / sizeof(struct dx_entry);
230}
231
232static inline unsigned dx_node_limit (struct inode *dir)
233{
234 unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
235 return 0? 22: entry_space / sizeof(struct dx_entry);
236}
237
238/*
239 * Debug
240 */
241#ifdef DX_DEBUG
242static void dx_show_index (char * label, struct dx_entry *entries)
243{
244 int i, n = dx_get_count (entries);
245 printk("%s index ", label);
246 for (i = 0; i < n; i++)
247 {
248 printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
249 }
250 printk("\n");
251}
252
/* Totals accumulated by the DX_DEBUG dump helpers. */
struct stats
{
	unsigned names;		/* directory entries with a non-zero inode */
	unsigned space;		/* sum of EXT3_DIR_REC_LEN(name_len) over those entries */
	unsigned bcount;	/* number of leaf blocks visited */
};
259
260static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
261 int size, int show_names)
262{
263 unsigned names = 0, space = 0;
264 char *base = (char *) de;
265 struct dx_hash_info h = *hinfo;
266
267 printk("names: ");
268 while ((char *) de < base + size)
269 {
270 if (de->inode)
271 {
272 if (show_names)
273 {
274 int len = de->name_len;
275 char *name = de->name;
276 while (len--) printk("%c", *name++);
277 ext3fs_dirhash(de->name, de->name_len, &h);
278 printk(":%x.%u ", h.hash,
279 ((char *) de - base));
280 }
281 space += EXT3_DIR_REC_LEN(de->name_len);
282 names++;
283 }
284 de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
285 }
286 printk("(%i)\n", names);
287 return (struct stats) { names, space, 1 };
288}
289
290struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
291 struct dx_entry *entries, int levels)
292{
293 unsigned blocksize = dir->i_sb->s_blocksize;
294 unsigned count = dx_get_count (entries), names = 0, space = 0, i;
295 unsigned bcount = 0;
296 struct buffer_head *bh;
297 int err;
298 printk("%i indexed blocks...\n", count);
299 for (i = 0; i < count; i++, entries++)
300 {
301 u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
302 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
303 struct stats stats;
304 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
305 if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
306 stats = levels?
307 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
308 dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
309 names += stats.names;
310 space += stats.space;
311 bcount += stats.bcount;
312 brelse (bh);
313 }
314 if (bcount)
315 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
316 names, space/bcount,(space/bcount)*100/blocksize);
317 return (struct stats) { names, space, bcount};
318}
319#endif /* DX_DEBUG */
320
321/*
322 * Probe for a directory leaf block to search.
323 *
324 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
325 * error in the directory index, and the caller should fall back to
326 * searching the directory normally. The callers of dx_probe **MUST**
327 * check for this error code, and make sure it never gets reflected
328 * back to userspace.
329 */
330static struct dx_frame *
331dx_probe(struct dentry *dentry, struct inode *dir,
332 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
333{
334 unsigned count, indirect;
335 struct dx_entry *at, *entries, *p, *q, *m;
336 struct dx_root *root;
337 struct buffer_head *bh;
338 struct dx_frame *frame = frame_in;
339 u32 hash;
340
341 frame->bh = NULL;
342 if (dentry)
343 dir = dentry->d_parent->d_inode;
344 if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
345 goto fail;
346 root = (struct dx_root *) bh->b_data;
347 if (root->info.hash_version != DX_HASH_TEA &&
348 root->info.hash_version != DX_HASH_HALF_MD4 &&
349 root->info.hash_version != DX_HASH_LEGACY) {
350 ext3_warning(dir->i_sb, __FUNCTION__,
351 "Unrecognised inode hash code %d",
352 root->info.hash_version);
353 brelse(bh);
354 *err = ERR_BAD_DX_DIR;
355 goto fail;
356 }
357 hinfo->hash_version = root->info.hash_version;
358 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
359 if (dentry)
360 ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
361 hash = hinfo->hash;
362
363 if (root->info.unused_flags & 1) {
364 ext3_warning(dir->i_sb, __FUNCTION__,
365 "Unimplemented inode hash flags: %#06x",
366 root->info.unused_flags);
367 brelse(bh);
368 *err = ERR_BAD_DX_DIR;
369 goto fail;
370 }
371
372 if ((indirect = root->info.indirect_levels) > 1) {
373 ext3_warning(dir->i_sb, __FUNCTION__,
374 "Unimplemented inode hash depth: %#06x",
375 root->info.indirect_levels);
376 brelse(bh);
377 *err = ERR_BAD_DX_DIR;
378 goto fail;
379 }
380
381 entries = (struct dx_entry *) (((char *)&root->info) +
382 root->info.info_length);
383 assert(dx_get_limit(entries) == dx_root_limit(dir,
384 root->info.info_length));
385 dxtrace (printk("Look up %x", hash));
386 while (1)
387 {
388 count = dx_get_count(entries);
389 assert (count && count <= dx_get_limit(entries));
390 p = entries + 1;
391 q = entries + count - 1;
392 while (p <= q)
393 {
394 m = p + (q - p)/2;
395 dxtrace(printk("."));
396 if (dx_get_hash(m) > hash)
397 q = m - 1;
398 else
399 p = m + 1;
400 }
401
402 if (0) // linear search cross check
403 {
404 unsigned n = count - 1;
405 at = entries;
406 while (n--)
407 {
408 dxtrace(printk(","));
409 if (dx_get_hash(++at) > hash)
410 {
411 at--;
412 break;
413 }
414 }
415 assert (at == p - 1);
416 }
417
418 at = p - 1;
419 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
420 frame->bh = bh;
421 frame->entries = entries;
422 frame->at = at;
423 if (!indirect--) return frame;
424 if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
425 goto fail2;
426 at = entries = ((struct dx_node *) bh->b_data)->entries;
427 assert (dx_get_limit(entries) == dx_node_limit (dir));
428 frame++;
429 }
430fail2:
431 while (frame >= frame_in) {
432 brelse(frame->bh);
433 frame--;
434 }
435fail:
436 return NULL;
437}
438
439static void dx_release (struct dx_frame *frames)
440{
441 if (frames[0].bh == NULL)
442 return;
443
444 if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
445 brelse(frames[1].bh);
446 brelse(frames[0].bh);
447}
448
449/*
450 * This function increments the frame pointer to search the next leaf
451 * block, and reads in the necessary intervening nodes if the search
452 * should be necessary. Whether or not the search is necessary is
453 * controlled by the hash parameter. If the hash value is even, then
454 * the search is only continued if the next block starts with that
455 * hash value. This is used if we are searching for a specific file.
456 *
457 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
458 *
459 * This function returns 1 if the caller should continue to search,
460 * or 0 if it should not. If there is an error reading one of the
461 * index blocks, it will return a negative error code.
462 *
463 * If start_hash is non-null, it will be filled in with the starting
464 * hash of the next page.
465 */
466static int ext3_htree_next_block(struct inode *dir, __u32 hash,
467 struct dx_frame *frame,
468 struct dx_frame *frames,
469 __u32 *start_hash)
470{
471 struct dx_frame *p;
472 struct buffer_head *bh;
473 int err, num_frames = 0;
474 __u32 bhash;
475
476 p = frame;
477 /*
478 * Find the next leaf page by incrementing the frame pointer.
479 * If we run out of entries in the interior node, loop around and
480 * increment pointer in the parent node. When we break out of
481 * this loop, num_frames indicates the number of interior
482 * nodes need to be read.
483 */
484 while (1) {
485 if (++(p->at) < p->entries + dx_get_count(p->entries))
486 break;
487 if (p == frames)
488 return 0;
489 num_frames++;
490 p--;
491 }
492
493 /*
494 * If the hash is 1, then continue only if the next page has a
495 * continuation hash of any value. This is used for readdir
496 * handling. Otherwise, check to see if the hash matches the
497 * desired contiuation hash. If it doesn't, return since
498 * there's no point to read in the successive index pages.
499 */
500 bhash = dx_get_hash(p->at);
501 if (start_hash)
502 *start_hash = bhash;
503 if ((hash & 1) == 0) {
504 if ((bhash & ~1) != hash)
505 return 0;
506 }
507 /*
508 * If the hash is HASH_NB_ALWAYS, we always go to the next
509 * block so no check is necessary
510 */
511 while (num_frames--) {
512 if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
513 0, &err)))
514 return err; /* Failure */
515 p++;
516 brelse (p->bh);
517 p->bh = bh;
518 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
519 }
520 return 1;
521}
522
523
524/*
525 * p is at least 6 bytes before the end of page
526 */
527static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
528{
529 return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
530}
531
532/*
533 * This function fills a red-black tree with information from a
534 * directory block. It returns the number directory entries loaded
535 * into the tree. If there is an error it is returned in err.
536 */
537static int htree_dirblock_to_tree(struct file *dir_file,
538 struct inode *dir, int block,
539 struct dx_hash_info *hinfo,
540 __u32 start_hash, __u32 start_minor_hash)
541{
542 struct buffer_head *bh;
543 struct ext3_dir_entry_2 *de, *top;
544 int err, count = 0;
545
546 dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
547 if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
548 return err;
549
550 de = (struct ext3_dir_entry_2 *) bh->b_data;
551 top = (struct ext3_dir_entry_2 *) ((char *) de +
552 dir->i_sb->s_blocksize -
553 EXT3_DIR_REC_LEN(0));
554 for (; de < top; de = ext3_next_entry(de)) {
555 ext3fs_dirhash(de->name, de->name_len, hinfo);
556 if ((hinfo->hash < start_hash) ||
557 ((hinfo->hash == start_hash) &&
558 (hinfo->minor_hash < start_minor_hash)))
559 continue;
560 if (de->inode == 0)
561 continue;
562 if ((err = ext3_htree_store_dirent(dir_file,
563 hinfo->hash, hinfo->minor_hash, de)) != 0) {
564 brelse(bh);
565 return err;
566 }
567 count++;
568 }
569 brelse(bh);
570 return count;
571}
572
573
574/*
575 * This function fills a red-black tree with information from a
576 * directory. We start scanning the directory in hash order, starting
577 * at start_hash and start_minor_hash.
578 *
579 * This function returns the number of entries inserted into the tree,
580 * or a negative error code.
581 */
582int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
583 __u32 start_minor_hash, __u32 *next_hash)
584{
585 struct dx_hash_info hinfo;
586 struct ext3_dir_entry_2 *de;
587 struct dx_frame frames[2], *frame;
588 struct inode *dir;
589 int block, err;
590 int count = 0;
591 int ret;
592 __u32 hashval;
593
594 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
595 start_minor_hash));
596 dir = dir_file->f_dentry->d_inode;
597 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
598 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
599 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
600 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
601 start_hash, start_minor_hash);
602 *next_hash = ~0;
603 return count;
604 }
605 hinfo.hash = start_hash;
606 hinfo.minor_hash = 0;
607 frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
608 if (!frame)
609 return err;
610
611 /* Add '.' and '..' from the htree header */
612 if (!start_hash && !start_minor_hash) {
613 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
614 if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
615 goto errout;
616 count++;
617 }
618 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
619 de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
620 de = ext3_next_entry(de);
621 if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
622 goto errout;
623 count++;
624 }
625
626 while (1) {
627 block = dx_get_block(frame->at);
628 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
629 start_hash, start_minor_hash);
630 if (ret < 0) {
631 err = ret;
632 goto errout;
633 }
634 count += ret;
635 hashval = ~0;
636 ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
637 frame, frames, &hashval);
638 *next_hash = hashval;
639 if (ret < 0) {
640 err = ret;
641 goto errout;
642 }
643 /*
644 * Stop if: (a) there are no more entries, or
645 * (b) we have inserted at least one entry and the
646 * next hash value is not a continuation
647 */
648 if ((ret == 0) ||
649 (count && ((hashval & 1) == 0)))
650 break;
651 }
652 dx_release(frames);
653 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
654 count, *next_hash));
655 return count;
656errout:
657 dx_release(frames);
658 return (err);
659}
660
661
662/*
663 * Directory block splitting, compacting
664 */
665
666static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
667 struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
668{
669 int count = 0;
670 char *base = (char *) de;
671 struct dx_hash_info h = *hinfo;
672
673 while ((char *) de < base + size)
674 {
675 if (de->name_len && de->inode) {
676 ext3fs_dirhash(de->name, de->name_len, &h);
677 map_tail--;
678 map_tail->hash = h.hash;
679 map_tail->offs = (u32) ((char *) de - base);
680 count++;
681 cond_resched();
682 }
683 /* XXX: do we need to check rec_len == 0 case? -Chris */
684 de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
685 }
686 return count;
687}
688
689static void dx_sort_map (struct dx_map_entry *map, unsigned count)
690{
691 struct dx_map_entry *p, *q, *top = map + count - 1;
692 int more;
693 /* Combsort until bubble sort doesn't suck */
694 while (count > 2)
695 {
696 count = count*10/13;
697 if (count - 9 < 2) /* 9, 10 -> 11 */
698 count = 11;
699 for (p = top, q = p - count; q >= map; p--, q--)
700 if (p->hash < q->hash)
701 swap(*p, *q);
702 }
703 /* Garden variety bubble sort */
704 do {
705 more = 0;
706 q = top;
707 while (q-- > map)
708 {
709 if (q[1].hash >= q[0].hash)
710 continue;
711 swap(*(q+1), *q);
712 more = 1;
713 }
714 } while(more);
715}
716
717static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
718{
719 struct dx_entry *entries = frame->entries;
720 struct dx_entry *old = frame->at, *new = old + 1;
721 int count = dx_get_count(entries);
722
723 assert(count < dx_get_limit(entries));
724 assert(old < entries + count);
725 memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
726 dx_set_hash(new, hash);
727 dx_set_block(new, block);
728 dx_set_count(entries, count + 1);
729}
730#endif
731
732
733static void ext3_update_dx_flag(struct inode *inode)
734{
735 if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
736 EXT3_FEATURE_COMPAT_DIR_INDEX))
737 EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
738}
739
740/*
741 * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
742 *
743 * `len <= EXT3_NAME_LEN' is guaranteed by caller.
744 * `de != NULL' is guaranteed by caller.
745 */
746static inline int ext3_match (int len, const char * const name,
747 struct ext3_dir_entry_2 * de)
748{
749 if (len != de->name_len)
750 return 0;
751 if (!de->inode)
752 return 0;
753 return !memcmp(name, de->name, len);
754}
755
756/*
757 * Returns 0 if not found, -1 on failure, and 1 on success
758 */
759static inline int search_dirblock(struct buffer_head * bh,
760 struct inode *dir,
761 struct dentry *dentry,
762 unsigned long offset,
763 struct ext3_dir_entry_2 ** res_dir)
764{
765 struct ext3_dir_entry_2 * de;
766 char * dlimit;
767 int de_len;
768 const char *name = dentry->d_name.name;
769 int namelen = dentry->d_name.len;
770
771 de = (struct ext3_dir_entry_2 *) bh->b_data;
772 dlimit = bh->b_data + dir->i_sb->s_blocksize;
773 while ((char *) de < dlimit) {
774 /* this code is executed quadratically often */
775 /* do minimal checking `by hand' */
776
777 if ((char *) de + namelen <= dlimit &&
778 ext3_match (namelen, name, de)) {
779 /* found a match - just to be sure, do a full check */
780 if (!ext3_check_dir_entry("ext3_find_entry",
781 dir, de, bh, offset))
782 return -1;
783 *res_dir = de;
784 return 1;
785 }
786 /* prevent looping on a bad block */
787 de_len = le16_to_cpu(de->rec_len);
788 if (de_len <= 0)
789 return -1;
790 offset += de_len;
791 de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
792 }
793 return 0;
794}
795
796
797/*
798 * ext3_find_entry()
799 *
800 * finds an entry in the specified directory with the wanted name. It
801 * returns the cache buffer in which the entry was found, and the entry
802 * itself (as a parameter - res_dir). It does NOT read the inode of the
803 * entry - you'll have to do that yourself if you want to.
804 *
805 * The returned buffer_head has ->b_count elevated. The caller is expected
806 * to brelse() it when appropriate.
807 */
808static struct buffer_head * ext3_find_entry (struct dentry *dentry,
809 struct ext3_dir_entry_2 ** res_dir)
810{
811 struct super_block * sb;
812 struct buffer_head * bh_use[NAMEI_RA_SIZE];
813 struct buffer_head * bh, *ret = NULL;
814 unsigned long start, block, b;
815 int ra_max = 0; /* Number of bh's in the readahead
816 buffer, bh_use[] */
817 int ra_ptr = 0; /* Current index into readahead
818 buffer */
819 int num = 0;
820 int nblocks, i, err;
821 struct inode *dir = dentry->d_parent->d_inode;
822 int namelen;
823 const u8 *name;
824 unsigned blocksize;
825
826 *res_dir = NULL;
827 sb = dir->i_sb;
828 blocksize = sb->s_blocksize;
829 namelen = dentry->d_name.len;
830 name = dentry->d_name.name;
831 if (namelen > EXT3_NAME_LEN)
832 return NULL;
833#ifdef CONFIG_EXT3_INDEX
834 if (is_dx(dir)) {
835 bh = ext3_dx_find_entry(dentry, res_dir, &err);
836 /*
837 * On success, or if the error was file not found,
838 * return. Otherwise, fall back to doing a search the
839 * old fashioned way.
840 */
841 if (bh || (err != ERR_BAD_DX_DIR))
842 return bh;
843 dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
844 }
845#endif
846 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
847 start = EXT3_I(dir)->i_dir_start_lookup;
848 if (start >= nblocks)
849 start = 0;
850 block = start;
851restart:
852 do {
853 /*
854 * We deal with the read-ahead logic here.
855 */
856 if (ra_ptr >= ra_max) {
857 /* Refill the readahead buffer */
858 ra_ptr = 0;
859 b = block;
860 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
861 /*
862 * Terminate if we reach the end of the
863 * directory and must wrap, or if our
864 * search has finished at this block.
865 */
866 if (b >= nblocks || (num && block == start)) {
867 bh_use[ra_max] = NULL;
868 break;
869 }
870 num++;
871 bh = ext3_getblk(NULL, dir, b++, 0, &err);
872 bh_use[ra_max] = bh;
873 if (bh)
874 ll_rw_block(READ_META, 1, &bh);
875 }
876 }
877 if ((bh = bh_use[ra_ptr++]) == NULL)
878 goto next;
879 wait_on_buffer(bh);
880 if (!buffer_uptodate(bh)) {
881 /* read error, skip block & hope for the best */
882 ext3_error(sb, __FUNCTION__, "reading directory #%lu "
883 "offset %lu", dir->i_ino, block);
884 brelse(bh);
885 goto next;
886 }
887 i = search_dirblock(bh, dir, dentry,
888 block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
889 if (i == 1) {
890 EXT3_I(dir)->i_dir_start_lookup = block;
891 ret = bh;
892 goto cleanup_and_exit;
893 } else {
894 brelse(bh);
895 if (i < 0)
896 goto cleanup_and_exit;
897 }
898 next:
899 if (++block >= nblocks)
900 block = 0;
901 } while (block != start);
902
903 /*
904 * If the directory has grown while we were searching, then
905 * search the last part of the directory before giving up.
906 */
907 block = nblocks;
908 nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
909 if (block < nblocks) {
910 start = 0;
911 goto restart;
912 }
913
914cleanup_and_exit:
915 /* Clean up the read-ahead blocks */
916 for (; ra_ptr < ra_max; ra_ptr++)
917 brelse (bh_use[ra_ptr]);
918 return ret;
919}
920
921#ifdef CONFIG_EXT3_INDEX
922static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
923 struct ext3_dir_entry_2 **res_dir, int *err)
924{
925 struct super_block * sb;
926 struct dx_hash_info hinfo;
927 u32 hash;
928 struct dx_frame frames[2], *frame;
929 struct ext3_dir_entry_2 *de, *top;
930 struct buffer_head *bh;
931 unsigned long block;
932 int retval;
933 int namelen = dentry->d_name.len;
934 const u8 *name = dentry->d_name.name;
935 struct inode *dir = dentry->d_parent->d_inode;
936
937 sb = dir->i_sb;
938 /* NFS may look up ".." - look at dx_root directory block */
939 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
940 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
941 return NULL;
942 } else {
943 frame = frames;
944 frame->bh = NULL; /* for dx_release() */
945 frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
946 dx_set_block(frame->at, 0); /* dx_root block is 0 */
947 }
948 hash = hinfo.hash;
949 do {
950 block = dx_get_block(frame->at);
951 if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
952 goto errout;
953 de = (struct ext3_dir_entry_2 *) bh->b_data;
954 top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
955 EXT3_DIR_REC_LEN(0));
956 for (; de < top; de = ext3_next_entry(de))
957 if (ext3_match (namelen, name, de)) {
958 if (!ext3_check_dir_entry("ext3_find_entry",
959 dir, de, bh,
960 (block<<EXT3_BLOCK_SIZE_BITS(sb))
961 +((char *)de - bh->b_data))) {
962 brelse (bh);
963 goto errout;
964 }
965 *res_dir = de;
966 dx_release (frames);
967 return bh;
968 }
969 brelse (bh);
970 /* Check to see if we should continue to search */
971 retval = ext3_htree_next_block(dir, hash, frame,
972 frames, NULL);
973 if (retval < 0) {
974 ext3_warning(sb, __FUNCTION__,
975 "error reading index page in directory #%lu",
976 dir->i_ino);
977 *err = retval;
978 goto errout;
979 }
980 } while (retval == 1);
981
982 *err = -ENOENT;
983errout:
984 dxtrace(printk("%s not found\n", name));
985 dx_release (frames);
986 return NULL;
987}
988#endif
989
990static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
991{
992 struct inode * inode;
993 struct ext3_dir_entry_2 * de;
994 struct buffer_head * bh;
995
996 if (dentry->d_name.len > EXT3_NAME_LEN)
997 return ERR_PTR(-ENAMETOOLONG);
998
999 bh = ext3_find_entry(dentry, &de);
1000 inode = NULL;
1001 if (bh) {
1002 unsigned long ino = le32_to_cpu(de->inode);
1003 brelse (bh);
1004 if (!ext3_valid_inum(dir->i_sb, ino)) {
1005 ext3_error(dir->i_sb, "ext3_lookup",
1006 "bad inode number: %lu", ino);
1007 inode = NULL;
1008 } else
1009 inode = iget(dir->i_sb, ino);
1010
1011 if (!inode)
1012 return ERR_PTR(-EACCES);
1013 }
1014 return d_splice_alias(inode, dentry);
1015}
1016
1017
1018struct dentry *ext3_get_parent(struct dentry *child)
1019{
1020 unsigned long ino;
1021 struct dentry *parent;
1022 struct inode *inode;
1023 struct dentry dotdot;
1024 struct ext3_dir_entry_2 * de;
1025 struct buffer_head *bh;
1026
1027 dotdot.d_name.name = "..";
1028 dotdot.d_name.len = 2;
1029 dotdot.d_parent = child; /* confusing, isn't it! */
1030
1031 bh = ext3_find_entry(&dotdot, &de);
1032 inode = NULL;
1033 if (!bh)
1034 return ERR_PTR(-ENOENT);
1035 ino = le32_to_cpu(de->inode);
1036 brelse(bh);
1037
1038 if (!ext3_valid_inum(child->d_inode->i_sb, ino)) {
1039 ext3_error(child->d_inode->i_sb, "ext3_get_parent",
1040 "bad inode number: %lu", ino);
1041 inode = NULL;
1042 } else
1043 inode = iget(child->d_inode->i_sb, ino);
1044
1045 if (!inode)
1046 return ERR_PTR(-EACCES);
1047
1048 parent = d_alloc_anon(inode);
1049 if (!parent) {
1050 iput(inode);
1051 parent = ERR_PTR(-ENOMEM);
1052 }
1053 return parent;
1054}
1055
1056#define S_SHIFT 12
1057static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
1058 [S_IFREG >> S_SHIFT] = EXT3_FT_REG_FILE,
1059 [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR,
1060 [S_IFCHR >> S_SHIFT] = EXT3_FT_CHRDEV,
1061 [S_IFBLK >> S_SHIFT] = EXT3_FT_BLKDEV,
1062 [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO,
1063 [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK,
1064 [S_IFLNK >> S_SHIFT] = EXT3_FT_SYMLINK,
1065};
1066
1067static inline void ext3_set_de_type(struct super_block *sb,
1068 struct ext3_dir_entry_2 *de,
1069 umode_t mode) {
1070 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
1071 de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1072}
1073
1074#ifdef CONFIG_EXT3_INDEX
1075static struct ext3_dir_entry_2 *
1076dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1077{
1078 unsigned rec_len = 0;
1079
1080 while (count--) {
1081 struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
1082 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1083 memcpy (to, de, rec_len);
1084 ((struct ext3_dir_entry_2 *) to)->rec_len =
1085 cpu_to_le16(rec_len);
1086 de->inode = 0;
1087 map++;
1088 to += rec_len;
1089 }
1090 return (struct ext3_dir_entry_2 *) (to - rec_len);
1091}
1092
1093static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
1094{
1095 struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
1096 unsigned rec_len = 0;
1097
1098 prev = to = de;
1099 while ((char*)de < base + size) {
1100 next = (struct ext3_dir_entry_2 *) ((char *) de +
1101 le16_to_cpu(de->rec_len));
1102 if (de->inode && de->name_len) {
1103 rec_len = EXT3_DIR_REC_LEN(de->name_len);
1104 if (de > to)
1105 memmove(to, de, rec_len);
1106 to->rec_len = cpu_to_le16(rec_len);
1107 prev = to;
1108 to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
1109 }
1110 de = next;
1111 }
1112 return prev;
1113}
1114
1115static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1116 struct buffer_head **bh,struct dx_frame *frame,
1117 struct dx_hash_info *hinfo, int *error)
1118{
1119 unsigned blocksize = dir->i_sb->s_blocksize;
1120 unsigned count, continued;
1121 struct buffer_head *bh2;
1122 u32 newblock;
1123 u32 hash2;
1124 struct dx_map_entry *map;
1125 char *data1 = (*bh)->b_data, *data2;
1126 unsigned split;
1127 struct ext3_dir_entry_2 *de = NULL, *de2;
1128 int err;
1129
1130 bh2 = ext3_append (handle, dir, &newblock, error);
1131 if (!(bh2)) {
1132 brelse(*bh);
1133 *bh = NULL;
1134 goto errout;
1135 }
1136
1137 BUFFER_TRACE(*bh, "get_write_access");
1138 err = ext3_journal_get_write_access(handle, *bh);
1139 if (err) {
1140 journal_error:
1141 brelse(*bh);
1142 brelse(bh2);
1143 *bh = NULL;
1144 ext3_std_error(dir->i_sb, err);
1145 goto errout;
1146 }
1147 BUFFER_TRACE(frame->bh, "get_write_access");
1148 err = ext3_journal_get_write_access(handle, frame->bh);
1149 if (err)
1150 goto journal_error;
1151
1152 data2 = bh2->b_data;
1153
1154 /* create map in the end of data2 block */
1155 map = (struct dx_map_entry *) (data2 + blocksize);
1156 count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
1157 blocksize, hinfo, map);
1158 map -= count;
1159 split = count/2; // need to adjust to actual middle
1160 dx_sort_map (map, count);
1161 hash2 = map[split].hash;
1162 continued = hash2 == map[split - 1].hash;
1163 dxtrace(printk("Split block %i at %x, %i/%i\n",
1164 dx_get_block(frame->at), hash2, split, count-split));
1165
1166 /* Fancy dance to stay within two buffers */
1167 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1168 de = dx_pack_dirents(data1,blocksize);
1169 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1170 de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
1171 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
1172 dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
1173
1174 /* Which block gets the new entry? */
1175 if (hinfo->hash >= hash2)
1176 {
1177 swap(*bh, bh2);
1178 de = de2;
1179 }
1180 dx_insert_block (frame, hash2 + continued, newblock);
1181 err = ext3_journal_dirty_metadata (handle, bh2);
1182 if (err)
1183 goto journal_error;
1184 err = ext3_journal_dirty_metadata (handle, frame->bh);
1185 if (err)
1186 goto journal_error;
1187 brelse (bh2);
1188 dxtrace(dx_show_index ("frame", frame->entries));
1189errout:
1190 return de;
1191}
1192#endif
1193
1194
1195/*
1196 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1197 * it points to a directory entry which is guaranteed to be large
1198 * enough for new directory entry. If de is NULL, then
1199 * add_dirent_to_buf will attempt search the directory block for
1200 * space. It will return -ENOSPC if no space is available, and -EIO
1201 * and -EEXIST if directory entry already exists.
1202 *
1203 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1204 * all other cases bh is released.
1205 */
1206static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1207 struct inode *inode, struct ext3_dir_entry_2 *de,
1208 struct buffer_head * bh)
1209{
1210 struct inode *dir = dentry->d_parent->d_inode;
1211 const char *name = dentry->d_name.name;
1212 int namelen = dentry->d_name.len;
1213 unsigned long offset = 0;
1214 unsigned short reclen;
1215 int nlen, rlen, err;
1216 char *top;
1217
1218 reclen = EXT3_DIR_REC_LEN(namelen);
1219 if (!de) {
1220 de = (struct ext3_dir_entry_2 *)bh->b_data;
1221 top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1222 while ((char *) de <= top) {
1223 if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
1224 bh, offset)) {
1225 brelse (bh);
1226 return -EIO;
1227 }
1228 if (ext3_match (namelen, name, de)) {
1229 brelse (bh);
1230 return -EEXIST;
1231 }
1232 nlen = EXT3_DIR_REC_LEN(de->name_len);
1233 rlen = le16_to_cpu(de->rec_len);
1234 if ((de->inode? rlen - nlen: rlen) >= reclen)
1235 break;
1236 de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
1237 offset += rlen;
1238 }
1239 if ((char *) de > top)
1240 return -ENOSPC;
1241 }
1242 BUFFER_TRACE(bh, "get_write_access");
1243 err = ext3_journal_get_write_access(handle, bh);
1244 if (err) {
1245 ext3_std_error(dir->i_sb, err);
1246 brelse(bh);
1247 return err;
1248 }
1249
1250 /* By now the buffer is marked for journaling */
1251 nlen = EXT3_DIR_REC_LEN(de->name_len);
1252 rlen = le16_to_cpu(de->rec_len);
1253 if (de->inode) {
1254 struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
1255 de1->rec_len = cpu_to_le16(rlen - nlen);
1256 de->rec_len = cpu_to_le16(nlen);
1257 de = de1;
1258 }
1259 de->file_type = EXT3_FT_UNKNOWN;
1260 if (inode) {
1261 de->inode = cpu_to_le32(inode->i_ino);
1262 ext3_set_de_type(dir->i_sb, de, inode->i_mode);
1263 } else
1264 de->inode = 0;
1265 de->name_len = namelen;
1266 memcpy (de->name, name, namelen);
1267 /*
1268 * XXX shouldn't update any times until successful
1269 * completion of syscall, but too many callers depend
1270 * on this.
1271 *
1272 * XXX similarly, too many callers depend on
1273 * ext3_new_inode() setting the times, but error
1274 * recovery deletes the inode, so the worst that can
1275 * happen is that the times are slightly out of date
1276 * and/or different from the directory change time.
1277 */
1278 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1279 ext3_update_dx_flag(dir);
1280 dir->i_version++;
1281 ext3_mark_inode_dirty(handle, dir);
1282 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1283 err = ext3_journal_dirty_metadata(handle, bh);
1284 if (err)
1285 ext3_std_error(dir->i_sb, err);
1286 brelse(bh);
1287 return 0;
1288}
1289
1290#ifdef CONFIG_EXT3_INDEX
1291/*
1292 * This converts a one block unindexed directory to a 3 block indexed
1293 * directory, and adds the dentry to the indexed directory.
1294 */
1295static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1296 struct inode *inode, struct buffer_head *bh)
1297{
1298 struct inode *dir = dentry->d_parent->d_inode;
1299 const char *name = dentry->d_name.name;
1300 int namelen = dentry->d_name.len;
1301 struct buffer_head *bh2;
1302 struct dx_root *root;
1303 struct dx_frame frames[2], *frame;
1304 struct dx_entry *entries;
1305 struct ext3_dir_entry_2 *de, *de2;
1306 char *data1, *top;
1307 unsigned len;
1308 int retval;
1309 unsigned blocksize;
1310 struct dx_hash_info hinfo;
1311 u32 block;
1312 struct fake_dirent *fde;
1313
1314 blocksize = dir->i_sb->s_blocksize;
1315 dxtrace(printk("Creating index\n"));
1316 retval = ext3_journal_get_write_access(handle, bh);
1317 if (retval) {
1318 ext3_std_error(dir->i_sb, retval);
1319 brelse(bh);
1320 return retval;
1321 }
1322 root = (struct dx_root *) bh->b_data;
1323
1324 bh2 = ext3_append (handle, dir, &block, &retval);
1325 if (!(bh2)) {
1326 brelse(bh);
1327 return retval;
1328 }
1329 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1330 data1 = bh2->b_data;
1331
1332 /* The 0th block becomes the root, move the dirents out */
1333 fde = &root->dotdot;
1334 de = (struct ext3_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
1335 len = ((char *) root) + blocksize - (char *) de;
1336 memcpy (data1, de, len);
1337 de = (struct ext3_dir_entry_2 *) data1;
1338 top = data1 + len;
1339 while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
1340 de = de2;
1341 de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
1342 /* Initialize the root; the dot dirents already exist */
1343 de = (struct ext3_dir_entry_2 *) (&root->dotdot);
1344 de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
1345 memset (&root->info, 0, sizeof(root->info));
1346 root->info.info_length = sizeof(root->info);
1347 root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
1348 entries = root->entries;
1349 dx_set_block (entries, 1);
1350 dx_set_count (entries, 1);
1351 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1352
1353 /* Initialize as for dx_probe */
1354 hinfo.hash_version = root->info.hash_version;
1355 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1356 ext3fs_dirhash(name, namelen, &hinfo);
1357 frame = frames;
1358 frame->entries = entries;
1359 frame->at = entries;
1360 frame->bh = bh;
1361 bh = bh2;
1362 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1363 dx_release (frames);
1364 if (!(de))
1365 return retval;
1366
1367 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1368}
1369#endif
1370
1371/*
1372 * ext3_add_entry()
1373 *
1374 * adds a file entry to the specified directory, using the same
1375 * semantics as ext3_find_entry(). It returns NULL if it failed.
1376 *
1377 * NOTE!! The inode part of 'de' is left at 0 - which means you
1378 * may not sleep between calling this and putting something into
1379 * the entry, as someone else might have used it while you slept.
1380 */
1381static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1382 struct inode *inode)
1383{
1384 struct inode *dir = dentry->d_parent->d_inode;
1385 unsigned long offset;
1386 struct buffer_head * bh;
1387 struct ext3_dir_entry_2 *de;
1388 struct super_block * sb;
1389 int retval;
1390#ifdef CONFIG_EXT3_INDEX
1391 int dx_fallback=0;
1392#endif
1393 unsigned blocksize;
1394 u32 block, blocks;
1395
1396 sb = dir->i_sb;
1397 blocksize = sb->s_blocksize;
1398 if (!dentry->d_name.len)
1399 return -EINVAL;
1400#ifdef CONFIG_EXT3_INDEX
1401 if (is_dx(dir)) {
1402 retval = ext3_dx_add_entry(handle, dentry, inode);
1403 if (!retval || (retval != ERR_BAD_DX_DIR))
1404 return retval;
1405 EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
1406 dx_fallback++;
1407 ext3_mark_inode_dirty(handle, dir);
1408 }
1409#endif
1410 blocks = dir->i_size >> sb->s_blocksize_bits;
1411 for (block = 0, offset = 0; block < blocks; block++) {
1412 bh = ext3_bread(handle, dir, block, 0, &retval);
1413 if(!bh)
1414 return retval;
1415 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1416 if (retval != -ENOSPC)
1417 return retval;
1418
1419#ifdef CONFIG_EXT3_INDEX
1420 if (blocks == 1 && !dx_fallback &&
1421 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
1422 return make_indexed_dir(handle, dentry, inode, bh);
1423#endif
1424 brelse(bh);
1425 }
1426 bh = ext3_append(handle, dir, &block, &retval);
1427 if (!bh)
1428 return retval;
1429 de = (struct ext3_dir_entry_2 *) bh->b_data;
1430 de->inode = 0;
1431 de->rec_len = cpu_to_le16(blocksize);
1432 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1433}
1434
1435#ifdef CONFIG_EXT3_INDEX
1436/*
1437 * Returns 0 for success, or a negative error value
1438 */
1439static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1440 struct inode *inode)
1441{
1442 struct dx_frame frames[2], *frame;
1443 struct dx_entry *entries, *at;
1444 struct dx_hash_info hinfo;
1445 struct buffer_head * bh;
1446 struct inode *dir = dentry->d_parent->d_inode;
1447 struct super_block * sb = dir->i_sb;
1448 struct ext3_dir_entry_2 *de;
1449 int err;
1450
1451 frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
1452 if (!frame)
1453 return err;
1454 entries = frame->entries;
1455 at = frame->at;
1456
1457 if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
1458 goto cleanup;
1459
1460 BUFFER_TRACE(bh, "get_write_access");
1461 err = ext3_journal_get_write_access(handle, bh);
1462 if (err)
1463 goto journal_error;
1464
1465 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1466 if (err != -ENOSPC) {
1467 bh = NULL;
1468 goto cleanup;
1469 }
1470
1471 /* Block full, should compress but for now just split */
1472 dxtrace(printk("using %u of %u node entries\n",
1473 dx_get_count(entries), dx_get_limit(entries)));
1474 /* Need to split index? */
1475 if (dx_get_count(entries) == dx_get_limit(entries)) {
1476 u32 newblock;
1477 unsigned icount = dx_get_count(entries);
1478 int levels = frame - frames;
1479 struct dx_entry *entries2;
1480 struct dx_node *node2;
1481 struct buffer_head *bh2;
1482
1483 if (levels && (dx_get_count(frames->entries) ==
1484 dx_get_limit(frames->entries))) {
1485 ext3_warning(sb, __FUNCTION__,
1486 "Directory index full!");
1487 err = -ENOSPC;
1488 goto cleanup;
1489 }
1490 bh2 = ext3_append (handle, dir, &newblock, &err);
1491 if (!(bh2))
1492 goto cleanup;
1493 node2 = (struct dx_node *)(bh2->b_data);
1494 entries2 = node2->entries;
1495 node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
1496 node2->fake.inode = 0;
1497 BUFFER_TRACE(frame->bh, "get_write_access");
1498 err = ext3_journal_get_write_access(handle, frame->bh);
1499 if (err)
1500 goto journal_error;
1501 if (levels) {
1502 unsigned icount1 = icount/2, icount2 = icount - icount1;
1503 unsigned hash2 = dx_get_hash(entries + icount1);
1504 dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1505
1506 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1507 err = ext3_journal_get_write_access(handle,
1508 frames[0].bh);
1509 if (err)
1510 goto journal_error;
1511
1512 memcpy ((char *) entries2, (char *) (entries + icount1),
1513 icount2 * sizeof(struct dx_entry));
1514 dx_set_count (entries, icount1);
1515 dx_set_count (entries2, icount2);
1516 dx_set_limit (entries2, dx_node_limit(dir));
1517
1518 /* Which index block gets the new entry? */
1519 if (at - entries >= icount1) {
1520 frame->at = at = at - entries - icount1 + entries2;
1521 frame->entries = entries = entries2;
1522 swap(frame->bh, bh2);
1523 }
1524 dx_insert_block (frames + 0, hash2, newblock);
1525 dxtrace(dx_show_index ("node", frames[1].entries));
1526 dxtrace(dx_show_index ("node",
1527 ((struct dx_node *) bh2->b_data)->entries));
1528 err = ext3_journal_dirty_metadata(handle, bh2);
1529 if (err)
1530 goto journal_error;
1531 brelse (bh2);
1532 } else {
1533 dxtrace(printk("Creating second level index...\n"));
1534 memcpy((char *) entries2, (char *) entries,
1535 icount * sizeof(struct dx_entry));
1536 dx_set_limit(entries2, dx_node_limit(dir));
1537
1538 /* Set up root */
1539 dx_set_count(entries, 1);
1540 dx_set_block(entries + 0, newblock);
1541 ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1542
1543 /* Add new access path frame */
1544 frame = frames + 1;
1545 frame->at = at = at - entries + entries2;
1546 frame->entries = entries = entries2;
1547 frame->bh = bh2;
1548 err = ext3_journal_get_write_access(handle,
1549 frame->bh);
1550 if (err)
1551 goto journal_error;
1552 }
1553 ext3_journal_dirty_metadata(handle, frames[0].bh);
1554 }
1555 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1556 if (!de)
1557 goto cleanup;
1558 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1559 bh = NULL;
1560 goto cleanup;
1561
1562journal_error:
1563 ext3_std_error(dir->i_sb, err);
1564cleanup:
1565 if (bh)
1566 brelse(bh);
1567 dx_release(frames);
1568 return err;
1569}
1570#endif
1571
1572/*
1573 * ext3_delete_entry deletes a directory entry by merging it with the
1574 * previous entry
1575 */
1576static int ext3_delete_entry (handle_t *handle,
1577 struct inode * dir,
1578 struct ext3_dir_entry_2 * de_del,
1579 struct buffer_head * bh)
1580{
1581 struct ext3_dir_entry_2 * de, * pde;
1582 int i;
1583
1584 i = 0;
1585 pde = NULL;
1586 de = (struct ext3_dir_entry_2 *) bh->b_data;
1587 while (i < bh->b_size) {
1588 if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1589 return -EIO;
1590 if (de == de_del) {
1591 BUFFER_TRACE(bh, "get_write_access");
1592 ext3_journal_get_write_access(handle, bh);
1593 if (pde)
1594 pde->rec_len =
1595 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1596 le16_to_cpu(de->rec_len));
1597 else
1598 de->inode = 0;
1599 dir->i_version++;
1600 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1601 ext3_journal_dirty_metadata(handle, bh);
1602 return 0;
1603 }
1604 i += le16_to_cpu(de->rec_len);
1605 pde = de;
1606 de = (struct ext3_dir_entry_2 *)
1607 ((char *) de + le16_to_cpu(de->rec_len));
1608 }
1609 return -ENOENT;
1610}
1611
1612/*
1613 * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
1614 * do not perform it in these functions. We perform it at the call site,
1615 * if it is needed.
1616 */
1617static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
1618{
1619 inc_nlink(inode);
1620}
1621
1622static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
1623{
1624 drop_nlink(inode);
1625}
1626
1627static int ext3_add_nondir(handle_t *handle,
1628 struct dentry *dentry, struct inode *inode)
1629{
1630 int err = ext3_add_entry(handle, dentry, inode);
1631 if (!err) {
1632 ext3_mark_inode_dirty(handle, inode);
1633 d_instantiate(dentry, inode);
1634 return 0;
1635 }
1636 ext3_dec_count(handle, inode);
1637 iput(inode);
1638 return err;
1639}
1640
1641/*
1642 * By the time this is called, we already have created
1643 * the directory cache entry for the new file, but it
1644 * is so far negative - it has no inode.
1645 *
1646 * If the create succeeds, we fill in the inode information
1647 * with d_instantiate().
1648 */
1649static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1650 struct nameidata *nd)
1651{
1652 handle_t *handle;
1653 struct inode * inode;
1654 int err, retries = 0;
1655
1656retry:
1657 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1658 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1659 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
1660 if (IS_ERR(handle))
1661 return PTR_ERR(handle);
1662
1663 if (IS_DIRSYNC(dir))
1664 handle->h_sync = 1;
1665
1666 inode = ext3_new_inode (handle, dir, mode);
1667 err = PTR_ERR(inode);
1668 if (!IS_ERR(inode)) {
1669 inode->i_op = &ext3_file_inode_operations;
1670 inode->i_fop = &ext3_file_operations;
1671 ext3_set_aops(inode);
1672 err = ext3_add_nondir(handle, dentry, inode);
1673 }
1674 ext3_journal_stop(handle);
1675 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1676 goto retry;
1677 return err;
1678}
1679
1680static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1681 int mode, dev_t rdev)
1682{
1683 handle_t *handle;
1684 struct inode *inode;
1685 int err, retries = 0;
1686
1687 if (!new_valid_dev(rdev))
1688 return -EINVAL;
1689
1690retry:
1691 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1692 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1693 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
1694 if (IS_ERR(handle))
1695 return PTR_ERR(handle);
1696
1697 if (IS_DIRSYNC(dir))
1698 handle->h_sync = 1;
1699
1700 inode = ext3_new_inode (handle, dir, mode);
1701 err = PTR_ERR(inode);
1702 if (!IS_ERR(inode)) {
1703 init_special_inode(inode, inode->i_mode, rdev);
1704#ifdef CONFIG_EXT3_FS_XATTR
1705 inode->i_op = &ext3_special_inode_operations;
1706#endif
1707 err = ext3_add_nondir(handle, dentry, inode);
1708 }
1709 ext3_journal_stop(handle);
1710 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1711 goto retry;
1712 return err;
1713}
1714
1715static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
1716{
1717 handle_t *handle;
1718 struct inode * inode;
1719 struct buffer_head * dir_block;
1720 struct ext3_dir_entry_2 * de;
1721 int err, retries = 0;
1722
1723 if (dir->i_nlink >= EXT3_LINK_MAX)
1724 return -EMLINK;
1725
1726retry:
1727 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1728 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1729 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
1730 if (IS_ERR(handle))
1731 return PTR_ERR(handle);
1732
1733 if (IS_DIRSYNC(dir))
1734 handle->h_sync = 1;
1735
1736 inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
1737 err = PTR_ERR(inode);
1738 if (IS_ERR(inode))
1739 goto out_stop;
1740
1741 inode->i_op = &ext3_dir_inode_operations;
1742 inode->i_fop = &ext3_dir_operations;
1743 inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1744 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1745 if (!dir_block) {
1746 drop_nlink(inode); /* is this nlink == 0? */
1747 ext3_mark_inode_dirty(handle, inode);
1748 iput (inode);
1749 goto out_stop;
1750 }
1751 BUFFER_TRACE(dir_block, "get_write_access");
1752 ext3_journal_get_write_access(handle, dir_block);
1753 de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1754 de->inode = cpu_to_le32(inode->i_ino);
1755 de->name_len = 1;
1756 de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
1757 strcpy (de->name, ".");
1758 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1759 de = (struct ext3_dir_entry_2 *)
1760 ((char *) de + le16_to_cpu(de->rec_len));
1761 de->inode = cpu_to_le32(dir->i_ino);
1762 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
1763 de->name_len = 2;
1764 strcpy (de->name, "..");
1765 ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1766 inode->i_nlink = 2;
1767 BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1768 ext3_journal_dirty_metadata(handle, dir_block);
1769 brelse (dir_block);
1770 ext3_mark_inode_dirty(handle, inode);
1771 err = ext3_add_entry (handle, dentry, inode);
1772 if (err) {
1773 inode->i_nlink = 0;
1774 ext3_mark_inode_dirty(handle, inode);
1775 iput (inode);
1776 goto out_stop;
1777 }
1778 inc_nlink(dir);
1779 ext3_update_dx_flag(dir);
1780 ext3_mark_inode_dirty(handle, dir);
1781 d_instantiate(dentry, inode);
1782out_stop:
1783 ext3_journal_stop(handle);
1784 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1785 goto retry;
1786 return err;
1787}
1788
1789/*
1790 * routine to check that the specified directory is empty (for rmdir)
1791 */
1792static int empty_dir (struct inode * inode)
1793{
1794 unsigned long offset;
1795 struct buffer_head * bh;
1796 struct ext3_dir_entry_2 * de, * de1;
1797 struct super_block * sb;
1798 int err = 0;
1799
1800 sb = inode->i_sb;
1801 if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
1802 !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
1803 if (err)
1804 ext3_error(inode->i_sb, __FUNCTION__,
1805 "error %d reading directory #%lu offset 0",
1806 err, inode->i_ino);
1807 else
1808 ext3_warning(inode->i_sb, __FUNCTION__,
1809 "bad directory (dir #%lu) - no data block",
1810 inode->i_ino);
1811 return 1;
1812 }
1813 de = (struct ext3_dir_entry_2 *) bh->b_data;
1814 de1 = (struct ext3_dir_entry_2 *)
1815 ((char *) de + le16_to_cpu(de->rec_len));
1816 if (le32_to_cpu(de->inode) != inode->i_ino ||
1817 !le32_to_cpu(de1->inode) ||
1818 strcmp (".", de->name) ||
1819 strcmp ("..", de1->name)) {
1820 ext3_warning (inode->i_sb, "empty_dir",
1821 "bad directory (dir #%lu) - no `.' or `..'",
1822 inode->i_ino);
1823 brelse (bh);
1824 return 1;
1825 }
1826 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
1827 de = (struct ext3_dir_entry_2 *)
1828 ((char *) de1 + le16_to_cpu(de1->rec_len));
1829 while (offset < inode->i_size ) {
1830 if (!bh ||
1831 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1832 err = 0;
1833 brelse (bh);
1834 bh = ext3_bread (NULL, inode,
1835 offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
1836 if (!bh) {
1837 if (err)
1838 ext3_error(sb, __FUNCTION__,
1839 "error %d reading directory"
1840 " #%lu offset %lu",
1841 err, inode->i_ino, offset);
1842 offset += sb->s_blocksize;
1843 continue;
1844 }
1845 de = (struct ext3_dir_entry_2 *) bh->b_data;
1846 }
1847 if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1848 de = (struct ext3_dir_entry_2 *)(bh->b_data +
1849 sb->s_blocksize);
1850 offset = (offset | (sb->s_blocksize - 1)) + 1;
1851 continue;
1852 }
1853 if (le32_to_cpu(de->inode)) {
1854 brelse (bh);
1855 return 0;
1856 }
1857 offset += le16_to_cpu(de->rec_len);
1858 de = (struct ext3_dir_entry_2 *)
1859 ((char *) de + le16_to_cpu(de->rec_len));
1860 }
1861 brelse (bh);
1862 return 1;
1863}
1864
1865/* ext3_orphan_add() links an unlinked or truncated inode into a list of
1866 * such inodes, starting at the superblock, in case we crash before the
1867 * file is closed/deleted, or in case the inode truncate spans multiple
1868 * transactions and the last transaction is not recovered after a crash.
1869 *
1870 * At filesystem recovery time, we walk this list deleting unlinked
1871 * inodes and truncating linked inodes in ext3_orphan_cleanup().
1872 */
1873int ext3_orphan_add(handle_t *handle, struct inode *inode)
1874{
1875 struct super_block *sb = inode->i_sb;
1876 struct ext3_iloc iloc;
1877 int err = 0, rc;
1878
1879 lock_super(sb);
1880 if (!list_empty(&EXT3_I(inode)->i_orphan))
1881 goto out_unlock;
1882
1883 /* Orphan handling is only valid for files with data blocks
1884 * being truncated, or files being unlinked. */
1885
1886 /* @@@ FIXME: Observation from aviro:
1887 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1888 * here (on lock_super()), so race with ext3_link() which might bump
1889 * ->i_nlink. For, say it, character device. Not a regular file,
1890 * not a directory, not a symlink and ->i_nlink > 0.
1891 */
1892 J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1893 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1894
1895 BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
1896 err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
1897 if (err)
1898 goto out_unlock;
1899
1900 err = ext3_reserve_inode_write(handle, inode, &iloc);
1901 if (err)
1902 goto out_unlock;
1903
1904 /* Insert this inode at the head of the on-disk orphan list... */
1905 NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
1906 EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1907 err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1908 rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
1909 if (!err)
1910 err = rc;
1911
1912 /* Only add to the head of the in-memory list if all the
1913 * previous operations succeeded. If the orphan_add is going to
1914 * fail (possibly taking the journal offline), we can't risk
1915 * leaving the inode on the orphan list: stray orphan-list
1916 * entries can cause panics at unmount time.
1917 *
1918 * This is safe: on error we're going to ignore the orphan list
1919 * anyway on the next recovery. */
1920 if (!err)
1921 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1922
1923 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1924 jbd_debug(4, "orphan inode %lu will point to %d\n",
1925 inode->i_ino, NEXT_ORPHAN(inode));
1926out_unlock:
1927 unlock_super(sb);
1928 ext3_std_error(inode->i_sb, err);
1929 return err;
1930}
1931
1932/*
1933 * ext3_orphan_del() removes an unlinked or truncated inode from the list
1934 * of such inodes stored on disk, because it is finally being cleaned up.
1935 */
1936int ext3_orphan_del(handle_t *handle, struct inode *inode)
1937{
1938 struct list_head *prev;
1939 struct ext3_inode_info *ei = EXT3_I(inode);
1940 struct ext3_sb_info *sbi;
1941 unsigned long ino_next;
1942 struct ext3_iloc iloc;
1943 int err = 0;
1944
1945 lock_super(inode->i_sb);
1946 if (list_empty(&ei->i_orphan)) {
1947 unlock_super(inode->i_sb);
1948 return 0;
1949 }
1950
1951 ino_next = NEXT_ORPHAN(inode);
1952 prev = ei->i_orphan.prev;
1953 sbi = EXT3_SB(inode->i_sb);
1954
1955 jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
1956
1957 list_del_init(&ei->i_orphan);
1958
1959 /* If we're on an error path, we may not have a valid
1960 * transaction handle with which to update the orphan list on
1961 * disk, but we still need to remove the inode from the linked
1962 * list in memory. */
1963 if (!handle)
1964 goto out;
1965
1966 err = ext3_reserve_inode_write(handle, inode, &iloc);
1967 if (err)
1968 goto out_err;
1969
1970 if (prev == &sbi->s_orphan) {
1971 jbd_debug(4, "superblock will point to %lu\n", ino_next);
1972 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
1973 err = ext3_journal_get_write_access(handle, sbi->s_sbh);
1974 if (err)
1975 goto out_brelse;
1976 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
1977 err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
1978 } else {
1979 struct ext3_iloc iloc2;
1980 struct inode *i_prev =
1981 &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
1982
1983 jbd_debug(4, "orphan inode %lu will point to %lu\n",
1984 i_prev->i_ino, ino_next);
1985 err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
1986 if (err)
1987 goto out_brelse;
1988 NEXT_ORPHAN(i_prev) = ino_next;
1989 err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
1990 }
1991 if (err)
1992 goto out_brelse;
1993 NEXT_ORPHAN(inode) = 0;
1994 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
1995
1996out_err:
1997 ext3_std_error(inode->i_sb, err);
1998out:
1999 unlock_super(inode->i_sb);
2000 return err;
2001
2002out_brelse:
2003 brelse(iloc.bh);
2004 goto out_err;
2005}
2006
2007static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2008{
2009 int retval;
2010 struct inode * inode;
2011 struct buffer_head * bh;
2012 struct ext3_dir_entry_2 * de;
2013 handle_t *handle;
2014
2015 /* Initialize quotas before so that eventual writes go in
2016 * separate transaction */
2017 DQUOT_INIT(dentry->d_inode);
2018 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2019 if (IS_ERR(handle))
2020 return PTR_ERR(handle);
2021
2022 retval = -ENOENT;
2023 bh = ext3_find_entry (dentry, &de);
2024 if (!bh)
2025 goto end_rmdir;
2026
2027 if (IS_DIRSYNC(dir))
2028 handle->h_sync = 1;
2029
2030 inode = dentry->d_inode;
2031
2032 retval = -EIO;
2033 if (le32_to_cpu(de->inode) != inode->i_ino)
2034 goto end_rmdir;
2035
2036 retval = -ENOTEMPTY;
2037 if (!empty_dir (inode))
2038 goto end_rmdir;
2039
2040 retval = ext3_delete_entry(handle, dir, de, bh);
2041 if (retval)
2042 goto end_rmdir;
2043 if (inode->i_nlink != 2)
2044 ext3_warning (inode->i_sb, "ext3_rmdir",
2045 "empty directory has nlink!=2 (%d)",
2046 inode->i_nlink);
2047 inode->i_version++;
2048 clear_nlink(inode);
2049 /* There's no need to set i_disksize: the fact that i_nlink is
2050 * zero will ensure that the right thing happens during any
2051 * recovery. */
2052 inode->i_size = 0;
2053 ext3_orphan_add(handle, inode);
2054 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2055 ext3_mark_inode_dirty(handle, inode);
2056 drop_nlink(dir);
2057 ext3_update_dx_flag(dir);
2058 ext3_mark_inode_dirty(handle, dir);
2059
2060end_rmdir:
2061 ext3_journal_stop(handle);
2062 brelse (bh);
2063 return retval;
2064}
2065
2066static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2067{
2068 int retval;
2069 struct inode * inode;
2070 struct buffer_head * bh;
2071 struct ext3_dir_entry_2 * de;
2072 handle_t *handle;
2073
2074 /* Initialize quotas before so that eventual writes go
2075 * in separate transaction */
2076 DQUOT_INIT(dentry->d_inode);
2077 handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2078 if (IS_ERR(handle))
2079 return PTR_ERR(handle);
2080
2081 if (IS_DIRSYNC(dir))
2082 handle->h_sync = 1;
2083
2084 retval = -ENOENT;
2085 bh = ext3_find_entry (dentry, &de);
2086 if (!bh)
2087 goto end_unlink;
2088
2089 inode = dentry->d_inode;
2090
2091 retval = -EIO;
2092 if (le32_to_cpu(de->inode) != inode->i_ino)
2093 goto end_unlink;
2094
2095 if (!inode->i_nlink) {
2096 ext3_warning (inode->i_sb, "ext3_unlink",
2097 "Deleting nonexistent file (%lu), %d",
2098 inode->i_ino, inode->i_nlink);
2099 inode->i_nlink = 1;
2100 }
2101 retval = ext3_delete_entry(handle, dir, de, bh);
2102 if (retval)
2103 goto end_unlink;
2104 dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2105 ext3_update_dx_flag(dir);
2106 ext3_mark_inode_dirty(handle, dir);
2107 drop_nlink(inode);
2108 if (!inode->i_nlink)
2109 ext3_orphan_add(handle, inode);
2110 inode->i_ctime = dir->i_ctime;
2111 ext3_mark_inode_dirty(handle, inode);
2112 retval = 0;
2113
2114end_unlink:
2115 ext3_journal_stop(handle);
2116 brelse (bh);
2117 return retval;
2118}
2119
2120static int ext3_symlink (struct inode * dir,
2121 struct dentry *dentry, const char * symname)
2122{
2123 handle_t *handle;
2124 struct inode * inode;
2125 int l, err, retries = 0;
2126
2127 l = strlen(symname)+1;
2128 if (l > dir->i_sb->s_blocksize)
2129 return -ENAMETOOLONG;
2130
2131retry:
2132 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2133 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2134 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
2135 if (IS_ERR(handle))
2136 return PTR_ERR(handle);
2137
2138 if (IS_DIRSYNC(dir))
2139 handle->h_sync = 1;
2140
2141 inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
2142 err = PTR_ERR(inode);
2143 if (IS_ERR(inode))
2144 goto out_stop;
2145
2146 if (l > sizeof (EXT3_I(inode)->i_data)) {
2147 inode->i_op = &ext3_symlink_inode_operations;
2148 ext3_set_aops(inode);
2149 /*
2150 * page_symlink() calls into ext3_prepare/commit_write.
2151 * We have a transaction open. All is sweetness. It also sets
2152 * i_size in generic_commit_write().
2153 */
2154 err = __page_symlink(inode, symname, l,
2155 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2156 if (err) {
2157 ext3_dec_count(handle, inode);
2158 ext3_mark_inode_dirty(handle, inode);
2159 iput (inode);
2160 goto out_stop;
2161 }
2162 } else {
2163 inode->i_op = &ext3_fast_symlink_inode_operations;
2164 memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
2165 inode->i_size = l-1;
2166 }
2167 EXT3_I(inode)->i_disksize = inode->i_size;
2168 err = ext3_add_nondir(handle, dentry, inode);
2169out_stop:
2170 ext3_journal_stop(handle);
2171 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2172 goto retry;
2173 return err;
2174}
2175
2176static int ext3_link (struct dentry * old_dentry,
2177 struct inode * dir, struct dentry *dentry)
2178{
2179 handle_t *handle;
2180 struct inode *inode = old_dentry->d_inode;
2181 int err, retries = 0;
2182
2183 if (inode->i_nlink >= EXT3_LINK_MAX)
2184 return -EMLINK;
2185
2186retry:
2187 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2188 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
2189 if (IS_ERR(handle))
2190 return PTR_ERR(handle);
2191
2192 if (IS_DIRSYNC(dir))
2193 handle->h_sync = 1;
2194
2195 inode->i_ctime = CURRENT_TIME_SEC;
2196 ext3_inc_count(handle, inode);
2197 atomic_inc(&inode->i_count);
2198
2199 err = ext3_add_nondir(handle, dentry, inode);
2200 ext3_journal_stop(handle);
2201 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2202 goto retry;
2203 return err;
2204}
2205
/*
 * Given the start of a directory's first block, yield (as an lvalue) the
 * inode field of the entry that follows the first one, i.e. the entry
 * reached by stepping over the first entry's rec_len.  The argument is
 * evaluated twice; it is parenthesized so that any pointer expression can
 * be passed safely.
 */
#define PARENT_INO(buffer) \
	(((struct ext3_dir_entry_2 *) ((char *) (buffer) + \
	le16_to_cpu(((struct ext3_dir_entry_2 *) (buffer))->rec_len)))->inode)
2209
2210/*
2211 * Anybody can rename anything with this: the permission checks are left to the
2212 * higher-level routines.
2213 */
2214static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2215 struct inode * new_dir,struct dentry *new_dentry)
2216{
2217 handle_t *handle;
2218 struct inode * old_inode, * new_inode;
2219 struct buffer_head * old_bh, * new_bh, * dir_bh;
2220 struct ext3_dir_entry_2 * old_de, * new_de;
2221 int retval;
2222
2223 old_bh = new_bh = dir_bh = NULL;
2224
2225 /* Initialize quotas before so that eventual writes go
2226 * in separate transaction */
2227 if (new_dentry->d_inode)
2228 DQUOT_INIT(new_dentry->d_inode);
2229 handle = ext3_journal_start(old_dir, 2 *
2230 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2231 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
2232 if (IS_ERR(handle))
2233 return PTR_ERR(handle);
2234
2235 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2236 handle->h_sync = 1;
2237
2238 old_bh = ext3_find_entry (old_dentry, &old_de);
2239 /*
2240 * Check for inode number is _not_ due to possible IO errors.
2241 * We might rmdir the source, keep it as pwd of some process
2242 * and merrily kill the link to whatever was created under the
2243 * same name. Goodbye sticky bit ;-<
2244 */
2245 old_inode = old_dentry->d_inode;
2246 retval = -ENOENT;
2247 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2248 goto end_rename;
2249
2250 new_inode = new_dentry->d_inode;
2251 new_bh = ext3_find_entry (new_dentry, &new_de);
2252 if (new_bh) {
2253 if (!new_inode) {
2254 brelse (new_bh);
2255 new_bh = NULL;
2256 }
2257 }
2258 if (S_ISDIR(old_inode->i_mode)) {
2259 if (new_inode) {
2260 retval = -ENOTEMPTY;
2261 if (!empty_dir (new_inode))
2262 goto end_rename;
2263 }
2264 retval = -EIO;
2265 dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
2266 if (!dir_bh)
2267 goto end_rename;
2268 if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2269 goto end_rename;
2270 retval = -EMLINK;
2271 if (!new_inode && new_dir!=old_dir &&
2272 new_dir->i_nlink >= EXT3_LINK_MAX)
2273 goto end_rename;
2274 }
2275 if (!new_bh) {
2276 retval = ext3_add_entry (handle, new_dentry, old_inode);
2277 if (retval)
2278 goto end_rename;
2279 } else {
2280 BUFFER_TRACE(new_bh, "get write access");
2281 ext3_journal_get_write_access(handle, new_bh);
2282 new_de->inode = cpu_to_le32(old_inode->i_ino);
2283 if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2284 EXT3_FEATURE_INCOMPAT_FILETYPE))
2285 new_de->file_type = old_de->file_type;
2286 new_dir->i_version++;
2287 BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2288 ext3_journal_dirty_metadata(handle, new_bh);
2289 brelse(new_bh);
2290 new_bh = NULL;
2291 }
2292
2293 /*
2294 * Like most other Unix systems, set the ctime for inodes on a
2295 * rename.
2296 */
2297 old_inode->i_ctime = CURRENT_TIME_SEC;
2298 ext3_mark_inode_dirty(handle, old_inode);
2299
2300 /*
2301 * ok, that's it
2302 */
2303 if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2304 old_de->name_len != old_dentry->d_name.len ||
2305 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2306 (retval = ext3_delete_entry(handle, old_dir,
2307 old_de, old_bh)) == -ENOENT) {
2308 /* old_de could have moved from under us during htree split, so
2309 * make sure that we are deleting the right entry. We might
2310 * also be pointing to a stale entry in the unused part of
2311 * old_bh so just checking inum and the name isn't enough. */
2312 struct buffer_head *old_bh2;
2313 struct ext3_dir_entry_2 *old_de2;
2314
2315 old_bh2 = ext3_find_entry(old_dentry, &old_de2);
2316 if (old_bh2) {
2317 retval = ext3_delete_entry(handle, old_dir,
2318 old_de2, old_bh2);
2319 brelse(old_bh2);
2320 }
2321 }
2322 if (retval) {
2323 ext3_warning(old_dir->i_sb, "ext3_rename",
2324 "Deleting old file (%lu), %d, error=%d",
2325 old_dir->i_ino, old_dir->i_nlink, retval);
2326 }
2327
2328 if (new_inode) {
2329 drop_nlink(new_inode);
2330 new_inode->i_ctime = CURRENT_TIME_SEC;
2331 }
2332 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2333 ext3_update_dx_flag(old_dir);
2334 if (dir_bh) {
2335 BUFFER_TRACE(dir_bh, "get_write_access");
2336 ext3_journal_get_write_access(handle, dir_bh);
2337 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2338 BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2339 ext3_journal_dirty_metadata(handle, dir_bh);
2340 drop_nlink(old_dir);
2341 if (new_inode) {
2342 drop_nlink(new_inode);
2343 } else {
2344 inc_nlink(new_dir);
2345 ext3_update_dx_flag(new_dir);
2346 ext3_mark_inode_dirty(handle, new_dir);
2347 }
2348 }
2349 ext3_mark_inode_dirty(handle, old_dir);
2350 if (new_inode) {
2351 ext3_mark_inode_dirty(handle, new_inode);
2352 if (!new_inode->i_nlink)
2353 ext3_orphan_add(handle, new_inode);
2354 }
2355 retval = 0;
2356
2357end_rename:
2358 brelse (dir_bh);
2359 brelse (old_bh);
2360 brelse (new_bh);
2361 ext3_journal_stop(handle);
2362 return retval;
2363}
2364
2365/*
2366 * directories can handle most operations...
2367 */
2368struct inode_operations ext3_dir_inode_operations = {
2369 .create = ext3_create,
2370 .lookup = ext3_lookup,
2371 .link = ext3_link,
2372 .unlink = ext3_unlink,
2373 .symlink = ext3_symlink,
2374 .mkdir = ext3_mkdir,
2375 .rmdir = ext3_rmdir,
2376 .mknod = ext3_mknod,
2377 .rename = ext3_rename,
2378 .setattr = ext3_setattr,
2379#ifdef CONFIG_EXT3_FS_XATTR
2380 .setxattr = generic_setxattr,
2381 .getxattr = generic_getxattr,
2382 .listxattr = ext3_listxattr,
2383 .removexattr = generic_removexattr,
2384#endif
2385 .permission = ext3_permission,
2386};
2387
2388struct inode_operations ext3_special_inode_operations = {
2389 .setattr = ext3_setattr,
2390#ifdef CONFIG_EXT3_FS_XATTR
2391 .setxattr = generic_setxattr,
2392 .getxattr = generic_getxattr,
2393 .listxattr = ext3_listxattr,
2394 .removexattr = generic_removexattr,
2395#endif
2396 .permission = ext3_permission,
2397};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
new file mode 100644
index 000000000000..f2ce2b0065c9
--- /dev/null
+++ b/fs/ext4/namei.h
@@ -0,0 +1,8 @@
/*  linux/fs/ext3/namei.h
 *
 * Copyright (C) 2005 Simtec Electronics
 *	Ben Dooks <ben@simtec.co.uk>
 *
 * Declarations exported by namei.c.
 */

#ifndef EXT3_NAMEI_H
#define EXT3_NAMEI_H

/* Only pointers are used here, so a forward declaration is sufficient. */
struct dentry;

/* Takes the dentry of a child and returns a dentry for its parent. */
extern struct dentry *ext3_get_parent(struct dentry *child);

#endif /* EXT3_NAMEI_H */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 000000000000..b73cba12f79c
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1042 @@
1/*
2 * linux/fs/ext3/resize.c
3 *
4 * Support for resizing an ext3 filesystem while it is mounted.
5 *
6 * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
7 *
8 * This could probably be made into a module, because it is not often in use.
9 */
10
11
12#define EXT3FS_DEBUG
13
14#include <linux/sched.h>
15#include <linux/smp_lock.h>
16#include <linux/ext3_jbd.h>
17
18#include <linux/errno.h>
19#include <linux/slab.h>
20
21
/*
 * Membership tests for the half-open block range [first, last).
 * Note that each argument may be evaluated more than once.
 */
#define inside(b, first, last)	((b) >= (first) && (b) < (last))
#define outside(b, first, last)	(!((b) >= (first) && (b) < (last)))
24
25static int verify_group_input(struct super_block *sb,
26 struct ext3_new_group_data *input)
27{
28 struct ext3_sb_info *sbi = EXT3_SB(sb);
29 struct ext3_super_block *es = sbi->s_es;
30 ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
31 ext3_fsblk_t end = start + input->blocks_count;
32 unsigned group = input->group;
33 ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
34 unsigned overhead = ext3_bg_has_super(sb, group) ?
35 (1 + ext3_bg_num_gdb(sb, group) +
36 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
37 ext3_fsblk_t metaend = start + overhead;
38 struct buffer_head *bh = NULL;
39 ext3_grpblk_t free_blocks_count;
40 int err = -EINVAL;
41
42 input->free_blocks_count = free_blocks_count =
43 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
44
45 if (test_opt(sb, DEBUG))
46 printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks "
47 "(%d free, %u reserved)\n",
48 ext3_bg_has_super(sb, input->group) ? "normal" :
49 "no-super", input->group, input->blocks_count,
50 free_blocks_count, input->reserved_blocks);
51
52 if (group != sbi->s_groups_count)
53 ext3_warning(sb, __FUNCTION__,
54 "Cannot add at group %u (only %lu groups)",
55 input->group, sbi->s_groups_count);
56 else if ((start - le32_to_cpu(es->s_first_data_block)) %
57 EXT3_BLOCKS_PER_GROUP(sb))
58 ext3_warning(sb, __FUNCTION__, "Last group not full");
59 else if (input->reserved_blocks > input->blocks_count / 5)
60 ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
61 input->reserved_blocks);
62 else if (free_blocks_count < 0)
63 ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
64 input->blocks_count);
65 else if (!(bh = sb_bread(sb, end - 1)))
66 ext3_warning(sb, __FUNCTION__,
67 "Cannot read last block ("E3FSBLK")",
68 end - 1);
69 else if (outside(input->block_bitmap, start, end))
70 ext3_warning(sb, __FUNCTION__,
71 "Block bitmap not in group (block %u)",
72 input->block_bitmap);
73 else if (outside(input->inode_bitmap, start, end))
74 ext3_warning(sb, __FUNCTION__,
75 "Inode bitmap not in group (block %u)",
76 input->inode_bitmap);
77 else if (outside(input->inode_table, start, end) ||
78 outside(itend - 1, start, end))
79 ext3_warning(sb, __FUNCTION__,
80 "Inode table not in group (blocks %u-"E3FSBLK")",
81 input->inode_table, itend - 1);
82 else if (input->inode_bitmap == input->block_bitmap)
83 ext3_warning(sb, __FUNCTION__,
84 "Block bitmap same as inode bitmap (%u)",
85 input->block_bitmap);
86 else if (inside(input->block_bitmap, input->inode_table, itend))
87 ext3_warning(sb, __FUNCTION__,
88 "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
89 input->block_bitmap, input->inode_table, itend-1);
90 else if (inside(input->inode_bitmap, input->inode_table, itend))
91 ext3_warning(sb, __FUNCTION__,
92 "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
93 input->inode_bitmap, input->inode_table, itend-1);
94 else if (inside(input->block_bitmap, start, metaend))
95 ext3_warning(sb, __FUNCTION__,
96 "Block bitmap (%u) in GDT table"
97 " ("E3FSBLK"-"E3FSBLK")",
98 input->block_bitmap, start, metaend - 1);
99 else if (inside(input->inode_bitmap, start, metaend))
100 ext3_warning(sb, __FUNCTION__,
101 "Inode bitmap (%u) in GDT table"
102 " ("E3FSBLK"-"E3FSBLK")",
103 input->inode_bitmap, start, metaend - 1);
104 else if (inside(input->inode_table, start, metaend) ||
105 inside(itend - 1, start, metaend))
106 ext3_warning(sb, __FUNCTION__,
107 "Inode table (%u-"E3FSBLK") overlaps"
108 "GDT table ("E3FSBLK"-"E3FSBLK")",
109 input->inode_table, itend - 1, start, metaend - 1);
110 else
111 err = 0;
112 brelse(bh);
113
114 return err;
115}
116
117static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
118 ext3_fsblk_t blk)
119{
120 struct buffer_head *bh;
121 int err;
122
123 bh = sb_getblk(sb, blk);
124 if (!bh)
125 return ERR_PTR(-EIO);
126 if ((err = ext3_journal_get_write_access(handle, bh))) {
127 brelse(bh);
128 bh = ERR_PTR(err);
129 } else {
130 lock_buffer(bh);
131 memset(bh->b_data, 0, sb->s_blocksize);
132 set_buffer_uptodate(bh);
133 unlock_buffer(bh);
134 }
135
136 return bh;
137}
138
/*
 * Mark the bits from @start_bit up to @end_bit as in use in @bitmap.
 *
 * The per-bit helper is only used up to the next byte boundary (so that
 * endianness is handled correctly); the rest is filled a byte at a time with
 * memset, which is fine because nobody else is using this bitmap yet.
 * Does nothing when @start_bit >= @end_bit.
 */
static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
	unsigned long byte_boundary;
	int bit;

	if (start_bit >= end_bit)
		return;

	ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);

	/* Individual bits up to the next multiple of eight... */
	byte_boundary = (start_bit + 7) & ~7UL;
	bit = start_bit;
	while (bit < byte_boundary) {
		ext3_set_bit(bit, bitmap);
		bit++;
	}

	/* ...then whole bytes for whatever remains. */
	if (bit < end_bit)
		memset(bitmap + (bit >> 3), 0xff, (end_bit - bit) >> 3);
}
157
158/*
159 * Set up the block and inode bitmaps, and the inode table for the new group.
160 * This doesn't need to be part of the main transaction, since we are only
161 * changing blocks outside the actual filesystem. We still do journaling to
162 * ensure the recovery is correct in case of a failure just after resize.
163 * If any part of this fails, we simply abort the resize.
164 */
165static int setup_new_group_blocks(struct super_block *sb,
166 struct ext3_new_group_data *input)
167{
168 struct ext3_sb_info *sbi = EXT3_SB(sb);
169 ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
170 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
171 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
172 unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
173 struct buffer_head *bh;
174 handle_t *handle;
175 ext3_fsblk_t block;
176 ext3_grpblk_t bit;
177 int i;
178 int err = 0, err2;
179
180 handle = ext3_journal_start_sb(sb, reserved_gdb + gdblocks +
181 2 + sbi->s_itb_per_group);
182 if (IS_ERR(handle))
183 return PTR_ERR(handle);
184
185 lock_super(sb);
186 if (input->group != sbi->s_groups_count) {
187 err = -EBUSY;
188 goto exit_journal;
189 }
190
191 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
192 err = PTR_ERR(bh);
193 goto exit_journal;
194 }
195
196 if (ext3_bg_has_super(sb, input->group)) {
197 ext3_debug("mark backup superblock %#04lx (+0)\n", start);
198 ext3_set_bit(0, bh->b_data);
199 }
200
201 /* Copy all of the GDT blocks into the backup in this group */
202 for (i = 0, bit = 1, block = start + 1;
203 i < gdblocks; i++, block++, bit++) {
204 struct buffer_head *gdb;
205
206 ext3_debug("update backup group %#04lx (+%d)\n", block, bit);
207
208 gdb = sb_getblk(sb, block);
209 if (!gdb) {
210 err = -EIO;
211 goto exit_bh;
212 }
213 if ((err = ext3_journal_get_write_access(handle, gdb))) {
214 brelse(gdb);
215 goto exit_bh;
216 }
217 lock_buffer(bh);
218 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
219 set_buffer_uptodate(gdb);
220 unlock_buffer(bh);
221 ext3_journal_dirty_metadata(handle, gdb);
222 ext3_set_bit(bit, bh->b_data);
223 brelse(gdb);
224 }
225
226 /* Zero out all of the reserved backup group descriptor table blocks */
227 for (i = 0, bit = gdblocks + 1, block = start + bit;
228 i < reserved_gdb; i++, block++, bit++) {
229 struct buffer_head *gdb;
230
231 ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit);
232
233 if (IS_ERR(gdb = bclean(handle, sb, block))) {
234 err = PTR_ERR(bh);
235 goto exit_bh;
236 }
237 ext3_journal_dirty_metadata(handle, gdb);
238 ext3_set_bit(bit, bh->b_data);
239 brelse(gdb);
240 }
241 ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
242 input->block_bitmap - start);
243 ext3_set_bit(input->block_bitmap - start, bh->b_data);
244 ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
245 input->inode_bitmap - start);
246 ext3_set_bit(input->inode_bitmap - start, bh->b_data);
247
248 /* Zero out all of the inode table blocks */
249 for (i = 0, block = input->inode_table, bit = block - start;
250 i < sbi->s_itb_per_group; i++, bit++, block++) {
251 struct buffer_head *it;
252
253 ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
254 if (IS_ERR(it = bclean(handle, sb, block))) {
255 err = PTR_ERR(it);
256 goto exit_bh;
257 }
258 ext3_journal_dirty_metadata(handle, it);
259 brelse(it);
260 ext3_set_bit(bit, bh->b_data);
261 }
262 mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb),
263 bh->b_data);
264 ext3_journal_dirty_metadata(handle, bh);
265 brelse(bh);
266
267 /* Mark unused entries in inode bitmap used */
268 ext3_debug("clear inode bitmap %#04x (+%ld)\n",
269 input->inode_bitmap, input->inode_bitmap - start);
270 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
271 err = PTR_ERR(bh);
272 goto exit_journal;
273 }
274
275 mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
276 bh->b_data);
277 ext3_journal_dirty_metadata(handle, bh);
278exit_bh:
279 brelse(bh);
280
281exit_journal:
282 unlock_super(sb);
283 if ((err2 = ext3_journal_stop(handle)) && !err)
284 err = err2;
285
286 return err;
287}
288
289/*
290 * Iterate through the groups which hold BACKUP superblock/GDT copies in an
291 * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before
292 * calling this for the first time. In a sparse filesystem it will be the
293 * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
294 * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
295 */
296static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
297 unsigned *five, unsigned *seven)
298{
299 unsigned *min = three;
300 int mult = 3;
301 unsigned ret;
302
303 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
304 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
305 ret = *min;
306 *min += 1;
307 return ret;
308 }
309
310 if (*five < *min) {
311 min = five;
312 mult = 5;
313 }
314 if (*seven < *min) {
315 min = seven;
316 mult = 7;
317 }
318
319 ret = *min;
320 *min *= mult;
321
322 return ret;
323}
324
325/*
326 * Check that all of the backup GDT blocks are held in the primary GDT block.
327 * It is assumed that they are stored in group order. Returns the number of
328 * groups in current filesystem that have BACKUPS, or -ve error code.
329 */
330static int verify_reserved_gdb(struct super_block *sb,
331 struct buffer_head *primary)
332{
333 const ext3_fsblk_t blk = primary->b_blocknr;
334 const unsigned long end = EXT3_SB(sb)->s_groups_count;
335 unsigned three = 1;
336 unsigned five = 5;
337 unsigned seven = 7;
338 unsigned grp;
339 __le32 *p = (__le32 *)primary->b_data;
340 int gdbackups = 0;
341
342 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
343 if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
344 ext3_warning(sb, __FUNCTION__,
345 "reserved GDT "E3FSBLK
346 " missing grp %d ("E3FSBLK")",
347 blk, grp,
348 grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
349 return -EINVAL;
350 }
351 if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb))
352 return -EFBIG;
353 }
354
355 return gdbackups;
356}
357
358/*
359 * Called when we need to bring a reserved group descriptor table block into
360 * use from the resize inode. The primary copy of the new GDT block currently
361 * is an indirect block (under the double indirect block in the resize inode).
362 * The new backup GDT blocks will be stored as leaf blocks in this indirect
363 * block, in group order. Even though we know all the block numbers we need,
364 * we check to ensure that the resize inode has actually reserved these blocks.
365 *
366 * Don't need to update the block bitmaps because the blocks are still in use.
367 *
368 * We get all of the error cases out of the way, so that we are sure to not
369 * fail once we start modifying the data on disk, because JBD has no rollback.
370 */
371static int add_new_gdb(handle_t *handle, struct inode *inode,
372 struct ext3_new_group_data *input,
373 struct buffer_head **primary)
374{
375 struct super_block *sb = inode->i_sb;
376 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
377 unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
378 ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
379 struct buffer_head **o_group_desc, **n_group_desc;
380 struct buffer_head *dind;
381 int gdbackups;
382 struct ext3_iloc iloc;
383 __le32 *data;
384 int err;
385
386 if (test_opt(sb, DEBUG))
387 printk(KERN_DEBUG
388 "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n",
389 gdb_num);
390
391 /*
392 * If we are not using the primary superblock/GDT copy don't resize,
393 * because the user tools have no way of handling this. Probably a
394 * bad time to do it anyways.
395 */
396 if (EXT3_SB(sb)->s_sbh->b_blocknr !=
397 le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) {
398 ext3_warning(sb, __FUNCTION__,
399 "won't resize using backup superblock at %llu",
400 (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr);
401 return -EPERM;
402 }
403
404 *primary = sb_bread(sb, gdblock);
405 if (!*primary)
406 return -EIO;
407
408 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
409 err = gdbackups;
410 goto exit_bh;
411 }
412
413 data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
414 dind = sb_bread(sb, le32_to_cpu(*data));
415 if (!dind) {
416 err = -EIO;
417 goto exit_bh;
418 }
419
420 data = (__le32 *)dind->b_data;
421 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
422 ext3_warning(sb, __FUNCTION__,
423 "new group %u GDT block "E3FSBLK" not reserved",
424 input->group, gdblock);
425 err = -EINVAL;
426 goto exit_dind;
427 }
428
429 if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh)))
430 goto exit_dind;
431
432 if ((err = ext3_journal_get_write_access(handle, *primary)))
433 goto exit_sbh;
434
435 if ((err = ext3_journal_get_write_access(handle, dind)))
436 goto exit_primary;
437
438 /* ext3_reserve_inode_write() gets a reference on the iloc */
439 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
440 goto exit_dindj;
441
442 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
443 GFP_KERNEL);
444 if (!n_group_desc) {
445 err = -ENOMEM;
446 ext3_warning (sb, __FUNCTION__,
447 "not enough memory for %lu groups", gdb_num + 1);
448 goto exit_inode;
449 }
450
451 /*
452 * Finally, we have all of the possible failures behind us...
453 *
454 * Remove new GDT block from inode double-indirect block and clear out
455 * the new GDT block for use (which also "frees" the backup GDT blocks
456 * from the reserved inode). We don't need to change the bitmaps for
457 * these blocks, because they are marked as in-use from being in the
458 * reserved inode, and will become GDT blocks (primary and backup).
459 */
460 data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0;
461 ext3_journal_dirty_metadata(handle, dind);
462 brelse(dind);
463 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
464 ext3_mark_iloc_dirty(handle, inode, &iloc);
465 memset((*primary)->b_data, 0, sb->s_blocksize);
466 ext3_journal_dirty_metadata(handle, *primary);
467
468 o_group_desc = EXT3_SB(sb)->s_group_desc;
469 memcpy(n_group_desc, o_group_desc,
470 EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
471 n_group_desc[gdb_num] = *primary;
472 EXT3_SB(sb)->s_group_desc = n_group_desc;
473 EXT3_SB(sb)->s_gdb_count++;
474 kfree(o_group_desc);
475
476 es->s_reserved_gdt_blocks =
477 cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
478 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
479
480 return 0;
481
482exit_inode:
483 //ext3_journal_release_buffer(handle, iloc.bh);
484 brelse(iloc.bh);
485exit_dindj:
486 //ext3_journal_release_buffer(handle, dind);
487exit_primary:
488 //ext3_journal_release_buffer(handle, *primary);
489exit_sbh:
490 //ext3_journal_release_buffer(handle, *primary);
491exit_dind:
492 brelse(dind);
493exit_bh:
494 brelse(*primary);
495
496 ext3_debug("leaving with error %d\n", err);
497 return err;
498}
499
500/*
501 * Called when we are adding a new group which has a backup copy of each of
502 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
503 * We need to add these reserved backup GDT blocks to the resize inode, so
504 * that they are kept for future resizing and not allocated to files.
505 *
506 * Each reserved backup GDT block will go into a different indirect block.
507 * The indirect blocks are actually the primary reserved GDT blocks,
508 * so we know in advance what their block numbers are. We only get the
509 * double-indirect block to verify it is pointing to the primary reserved
510 * GDT blocks so we don't overwrite a data block by accident. The reserved
511 * backup GDT blocks are stored in their reserved primary GDT block.
512 */
513static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
514 struct ext3_new_group_data *input)
515{
516 struct super_block *sb = inode->i_sb;
517 int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks);
518 struct buffer_head **primary;
519 struct buffer_head *dind;
520 struct ext3_iloc iloc;
521 ext3_fsblk_t blk;
522 __le32 *data, *end;
523 int gdbackups = 0;
524 int res, i;
525 int err;
526
527 primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
528 if (!primary)
529 return -ENOMEM;
530
531 data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK;
532 dind = sb_bread(sb, le32_to_cpu(*data));
533 if (!dind) {
534 err = -EIO;
535 goto exit_free;
536 }
537
538 blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
539 data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
540 end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
541
542 /* Get each reserved primary GDT block and verify it holds backups */
543 for (res = 0; res < reserved_gdb; res++, blk++) {
544 if (le32_to_cpu(*data) != blk) {
545 ext3_warning(sb, __FUNCTION__,
546 "reserved block "E3FSBLK
547 " not at offset %ld",
548 blk,
549 (long)(data - (__le32 *)dind->b_data));
550 err = -EINVAL;
551 goto exit_bh;
552 }
553 primary[res] = sb_bread(sb, blk);
554 if (!primary[res]) {
555 err = -EIO;
556 goto exit_bh;
557 }
558 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
559 brelse(primary[res]);
560 err = gdbackups;
561 goto exit_bh;
562 }
563 if (++data >= end)
564 data = (__le32 *)dind->b_data;
565 }
566
567 for (i = 0; i < reserved_gdb; i++) {
568 if ((err = ext3_journal_get_write_access(handle, primary[i]))) {
569 /*
570 int j;
571 for (j = 0; j < i; j++)
572 ext3_journal_release_buffer(handle, primary[j]);
573 */
574 goto exit_bh;
575 }
576 }
577
578 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
579 goto exit_bh;
580
581 /*
582 * Finally we can add each of the reserved backup GDT blocks from
583 * the new group to its reserved primary GDT block.
584 */
585 blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
586 for (i = 0; i < reserved_gdb; i++) {
587 int err2;
588 data = (__le32 *)primary[i]->b_data;
589 /* printk("reserving backup %lu[%u] = %lu\n",
590 primary[i]->b_blocknr, gdbackups,
591 blk + primary[i]->b_blocknr); */
592 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
593 err2 = ext3_journal_dirty_metadata(handle, primary[i]);
594 if (!err)
595 err = err2;
596 }
597 inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
598 ext3_mark_iloc_dirty(handle, inode, &iloc);
599
600exit_bh:
601 while (--res >= 0)
602 brelse(primary[res]);
603 brelse(dind);
604
605exit_free:
606 kfree(primary);
607
608 return err;
609}
610
611/*
612 * Update the backup copies of the ext3 metadata. These don't need to be part
613 * of the main resize transaction, because e2fsck will re-write them if there
614 * is a problem (basically only OOM will cause a problem). However, we
615 * _should_ update the backups if possible, in case the primary gets trashed
616 * for some reason and we need to run e2fsck from a backup superblock. The
617 * important part is that the new block and inode counts are in the backup
618 * superblocks, and the location of the new group metadata in the GDT backups.
619 *
620 * We do not need lock_super() for this, because these blocks are not
621 * otherwise touched by the filesystem code when it is mounted. We don't
622 * need to worry about last changing from sbi->s_groups_count, because the
623 * worst that can happen is that we do not copy the full number of backups
624 * at this time. The resize which changed s_groups_count will backup again.
625 */
626static void update_backups(struct super_block *sb,
627 int blk_off, char *data, int size)
628{
629 struct ext3_sb_info *sbi = EXT3_SB(sb);
630 const unsigned long last = sbi->s_groups_count;
631 const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
632 unsigned three = 1;
633 unsigned five = 5;
634 unsigned seven = 7;
635 unsigned group;
636 int rest = sb->s_blocksize - size;
637 handle_t *handle;
638 int err = 0, err2;
639
640 handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA);
641 if (IS_ERR(handle)) {
642 group = 1;
643 err = PTR_ERR(handle);
644 goto exit_err;
645 }
646
647 while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) {
648 struct buffer_head *bh;
649
650 /* Out of journal space, and can't get more - abort - so sad */
651 if (handle->h_buffer_credits == 0 &&
652 ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) &&
653 (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA)))
654 break;
655
656 bh = sb_getblk(sb, group * bpg + blk_off);
657 if (!bh) {
658 err = -EIO;
659 break;
660 }
661 ext3_debug("update metadata backup %#04lx\n",
662 (unsigned long)bh->b_blocknr);
663 if ((err = ext3_journal_get_write_access(handle, bh)))
664 break;
665 lock_buffer(bh);
666 memcpy(bh->b_data, data, size);
667 if (rest)
668 memset(bh->b_data + size, 0, rest);
669 set_buffer_uptodate(bh);
670 unlock_buffer(bh);
671 ext3_journal_dirty_metadata(handle, bh);
672 brelse(bh);
673 }
674 if ((err2 = ext3_journal_stop(handle)) && !err)
675 err = err2;
676
677 /*
678 * Ugh! Need to have e2fsck write the backup copies. It is too
679 * late to revert the resize, we shouldn't fail just because of
680 * the backup copies (they are only needed in case of corruption).
681 *
682 * However, if we got here we have a journal problem too, so we
683 * can't really start a transaction to mark the superblock.
684 * Chicken out and just set the flag on the hope it will be written
685 * to disk, and if not - we will simply wait until next fsck.
686 */
687exit_err:
688 if (err) {
689 ext3_warning(sb, __FUNCTION__,
690 "can't update backup for group %d (err %d), "
691 "forcing fsck on next reboot", group, err);
692 sbi->s_mount_state &= ~EXT3_VALID_FS;
693 sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
694 mark_buffer_dirty(sbi->s_sbh);
695 }
696}
697
698/* Add group descriptor data to an existing or new group descriptor block.
699 * Ensure we handle all possible error conditions _before_ we start modifying
700 * the filesystem, because we cannot abort the transaction and not have it
701 * write the data to disk.
702 *
703 * If we are on a GDT block boundary, we need to get the reserved GDT block.
704 * Otherwise, we may need to add backup GDT blocks for a sparse group.
705 *
706 * We only need to hold the superblock lock while we are actually adding
707 * in the new group's counts to the superblock. Prior to that we have
708 * not really "added" the group at all. We re-check that we are still
709 * adding in the last group in case things have changed since verifying.
710 */
711int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
712{
713 struct ext3_sb_info *sbi = EXT3_SB(sb);
714 struct ext3_super_block *es = sbi->s_es;
715 int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
716 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
717 struct buffer_head *primary = NULL;
718 struct ext3_group_desc *gdp;
719 struct inode *inode = NULL;
720 handle_t *handle;
721 int gdb_off, gdb_num;
722 int err, err2;
723
724 gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
725 gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb);
726
727 if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb,
728 EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
729 ext3_warning(sb, __FUNCTION__,
730 "Can't resize non-sparse filesystem further");
731 return -EPERM;
732 }
733
734 if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
735 le32_to_cpu(es->s_blocks_count)) {
736 ext3_warning(sb, __FUNCTION__, "blocks_count overflow\n");
737 return -EINVAL;
738 }
739
740 if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
741 le32_to_cpu(es->s_inodes_count)) {
742 ext3_warning(sb, __FUNCTION__, "inodes_count overflow\n");
743 return -EINVAL;
744 }
745
746 if (reserved_gdb || gdb_off == 0) {
747 if (!EXT3_HAS_COMPAT_FEATURE(sb,
748 EXT3_FEATURE_COMPAT_RESIZE_INODE)){
749 ext3_warning(sb, __FUNCTION__,
750 "No reserved GDT blocks, can't resize");
751 return -EPERM;
752 }
753 inode = iget(sb, EXT3_RESIZE_INO);
754 if (!inode || is_bad_inode(inode)) {
755 ext3_warning(sb, __FUNCTION__,
756 "Error opening resize inode");
757 iput(inode);
758 return -ENOENT;
759 }
760 }
761
762 if ((err = verify_group_input(sb, input)))
763 goto exit_put;
764
765 if ((err = setup_new_group_blocks(sb, input)))
766 goto exit_put;
767
768 /*
769 * We will always be modifying at least the superblock and a GDT
770 * block. If we are adding a group past the last current GDT block,
771 * we will also modify the inode and the dindirect block. If we
772 * are adding a group with superblock/GDT backups we will also
773 * modify each of the reserved GDT dindirect blocks.
774 */
775 handle = ext3_journal_start_sb(sb,
776 ext3_bg_has_super(sb, input->group) ?
777 3 + reserved_gdb : 4);
778 if (IS_ERR(handle)) {
779 err = PTR_ERR(handle);
780 goto exit_put;
781 }
782
783 lock_super(sb);
784 if (input->group != sbi->s_groups_count) {
785 ext3_warning(sb, __FUNCTION__,
786 "multiple resizers run on filesystem!");
787 err = -EBUSY;
788 goto exit_journal;
789 }
790
791 if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh)))
792 goto exit_journal;
793
794 /*
795 * We will only either add reserved group blocks to a backup group
796 * or remove reserved blocks for the first group in a new group block.
797 * Doing both would be mean more complex code, and sane people don't
798 * use non-sparse filesystems anymore. This is already checked above.
799 */
800 if (gdb_off) {
801 primary = sbi->s_group_desc[gdb_num];
802 if ((err = ext3_journal_get_write_access(handle, primary)))
803 goto exit_journal;
804
805 if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) &&
806 (err = reserve_backup_gdb(handle, inode, input)))
807 goto exit_journal;
808 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
809 goto exit_journal;
810
811 /*
812 * OK, now we've set up the new group. Time to make it active.
813 *
814 * Current kernels don't lock all allocations via lock_super(),
815 * so we have to be safe wrt. concurrent accesses the group
816 * data. So we need to be careful to set all of the relevant
817 * group descriptor data etc. *before* we enable the group.
818 *
819 * The key field here is sbi->s_groups_count: as long as
820 * that retains its old value, nobody is going to access the new
821 * group.
822 *
823 * So first we update all the descriptor metadata for the new
824 * group; then we update the total disk blocks count; then we
825 * update the groups count to enable the group; then finally we
826 * update the free space counts so that the system can start
827 * using the new disk blocks.
828 */
829
830 /* Update group descriptor block for new group */
831 gdp = (struct ext3_group_desc *)primary->b_data + gdb_off;
832
833 gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap);
834 gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap);
835 gdp->bg_inode_table = cpu_to_le32(input->inode_table);
836 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
837 gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
838
839 /*
840 * Make the new blocks and inodes valid next. We do this before
841 * increasing the group count so that once the group is enabled,
842 * all of its blocks and inodes are already valid.
843 *
844 * We always allocate group-by-group, then block-by-block or
845 * inode-by-inode within a group, so enabling these
846 * blocks/inodes before the group is live won't actually let us
847 * allocate the new space yet.
848 */
849 es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) +
850 input->blocks_count);
851 es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
852 EXT3_INODES_PER_GROUP(sb));
853
854 /*
855 * We need to protect s_groups_count against other CPUs seeing
856 * inconsistent state in the superblock.
857 *
858 * The precise rules we use are:
859 *
860 * * Writers of s_groups_count *must* hold lock_super
861 * AND
862 * * Writers must perform a smp_wmb() after updating all dependent
863 * data and before modifying the groups count
864 *
865 * * Readers must hold lock_super() over the access
866 * OR
867 * * Readers must perform an smp_rmb() after reading the groups count
868 * and before reading any dependent data.
869 *
870 * NB. These rules can be relaxed when checking the group count
871 * while freeing data, as we can only allocate from a block
872 * group after serialising against the group count, and we can
873 * only then free after serialising in turn against that
874 * allocation.
875 */
876 smp_wmb();
877
878 /* Update the global fs size fields */
879 sbi->s_groups_count++;
880
881 ext3_journal_dirty_metadata(handle, primary);
882
883 /* Update the reserved block counts only once the new group is
884 * active. */
885 es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) +
886 input->reserved_blocks);
887
888 /* Update the free space counts */
889 percpu_counter_mod(&sbi->s_freeblocks_counter,
890 input->free_blocks_count);
891 percpu_counter_mod(&sbi->s_freeinodes_counter,
892 EXT3_INODES_PER_GROUP(sb));
893
894 ext3_journal_dirty_metadata(handle, sbi->s_sbh);
895 sb->s_dirt = 1;
896
897exit_journal:
898 unlock_super(sb);
899 if ((err2 = ext3_journal_stop(handle)) && !err)
900 err = err2;
901 if (!err) {
902 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
903 sizeof(struct ext3_super_block));
904 update_backups(sb, primary->b_blocknr, primary->b_data,
905 primary->b_size);
906 }
907exit_put:
908 iput(inode);
909 return err;
910} /* ext3_group_add */
911
912/* Extend the filesystem to the new number of blocks specified. This entry
913 * point is only used to extend the current filesystem to the end of the last
914 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
915 * for emergencies (because it has no dependencies on reserved blocks).
916 *
917 * If we _really_ wanted, we could use default values to call ext3_group_add()
918 * allow the "remount" trick to work for arbitrary resizing, assuming enough
919 * GDT blocks are reserved to grow to the desired size.
920 */
921int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
922 ext3_fsblk_t n_blocks_count)
923{
924 ext3_fsblk_t o_blocks_count;
925 unsigned long o_groups_count;
926 ext3_grpblk_t last;
927 ext3_grpblk_t add;
928 struct buffer_head * bh;
929 handle_t *handle;
930 int err;
931 unsigned long freed_blocks;
932
933 /* We don't need to worry about locking wrt other resizers just
934 * yet: we're going to revalidate es->s_blocks_count after
935 * taking lock_super() below. */
936 o_blocks_count = le32_to_cpu(es->s_blocks_count);
937 o_groups_count = EXT3_SB(sb)->s_groups_count;
938
939 if (test_opt(sb, DEBUG))
940 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
941 o_blocks_count, n_blocks_count);
942
943 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
944 return 0;
945
946 if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
947 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
948 " too large to resize to %lu blocks safely\n",
949 sb->s_id, n_blocks_count);
950 if (sizeof(sector_t) < 8)
951 ext3_warning(sb, __FUNCTION__,
952 "CONFIG_LBD not enabled\n");
953 return -EINVAL;
954 }
955
956 if (n_blocks_count < o_blocks_count) {
957 ext3_warning(sb, __FUNCTION__,
958 "can't shrink FS - resize aborted");
959 return -EBUSY;
960 }
961
962 /* Handle the remaining blocks in the last group only. */
963 last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) %
964 EXT3_BLOCKS_PER_GROUP(sb);
965
966 if (last == 0) {
967 ext3_warning(sb, __FUNCTION__,
968 "need to use ext2online to resize further");
969 return -EPERM;
970 }
971
972 add = EXT3_BLOCKS_PER_GROUP(sb) - last;
973
974 if (o_blocks_count + add < o_blocks_count) {
975 ext3_warning(sb, __FUNCTION__, "blocks_count overflow");
976 return -EINVAL;
977 }
978
979 if (o_blocks_count + add > n_blocks_count)
980 add = n_blocks_count - o_blocks_count;
981
982 if (o_blocks_count + add < n_blocks_count)
983 ext3_warning(sb, __FUNCTION__,
984 "will only finish group ("E3FSBLK
985 " blocks, %u new)",
986 o_blocks_count + add, add);
987
988 /* See if the device is actually as big as what was requested */
989 bh = sb_bread(sb, o_blocks_count + add -1);
990 if (!bh) {
991 ext3_warning(sb, __FUNCTION__,
992 "can't read last block, resize aborted");
993 return -ENOSPC;
994 }
995 brelse(bh);
996
997 /* We will update the superblock, one block bitmap, and
998 * one group descriptor via ext3_free_blocks().
999 */
1000 handle = ext3_journal_start_sb(sb, 3);
1001 if (IS_ERR(handle)) {
1002 err = PTR_ERR(handle);
1003 ext3_warning(sb, __FUNCTION__, "error %d on journal start",err);
1004 goto exit_put;
1005 }
1006
1007 lock_super(sb);
1008 if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
1009 ext3_warning(sb, __FUNCTION__,
1010 "multiple resizers run on filesystem!");
1011 unlock_super(sb);
1012 err = -EBUSY;
1013 goto exit_put;
1014 }
1015
1016 if ((err = ext3_journal_get_write_access(handle,
1017 EXT3_SB(sb)->s_sbh))) {
1018 ext3_warning(sb, __FUNCTION__,
1019 "error %d on journal write access", err);
1020 unlock_super(sb);
1021 ext3_journal_stop(handle);
1022 goto exit_put;
1023 }
1024 es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
1025 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1026 sb->s_dirt = 1;
1027 unlock_super(sb);
1028 ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
1029 o_blocks_count + add);
1030 ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
1031 ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
1032 o_blocks_count + add);
1033 if ((err = ext3_journal_stop(handle)))
1034 goto exit_put;
1035 if (test_opt(sb, DEBUG))
1036 printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n",
1037 le32_to_cpu(es->s_blocks_count));
1038 update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es,
1039 sizeof(struct ext3_super_block));
1040exit_put:
1041 return err;
1042} /* ext3_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 000000000000..8bfd56ef18ca
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,2754 @@
1/*
2 * linux/fs/ext3/super.c
3 *
4 * Copyright (C) 1992, 1993, 1994, 1995
5 * Remy Card (card@masi.ibp.fr)
6 * Laboratoire MASI - Institut Blaise Pascal
7 * Universite Pierre et Marie Curie (Paris VI)
8 *
9 * from
10 *
11 * linux/fs/minix/inode.c
12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds
14 *
15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 */
18
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/fs.h>
22#include <linux/time.h>
23#include <linux/jbd.h>
24#include <linux/ext3_fs.h>
25#include <linux/ext3_jbd.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/blkdev.h>
29#include <linux/parser.h>
30#include <linux/smp_lock.h>
31#include <linux/buffer_head.h>
32#include <linux/vfs.h>
33#include <linux/random.h>
34#include <linux/mount.h>
35#include <linux/namei.h>
36#include <linux/quotaops.h>
37#include <linux/seq_file.h>
38
39#include <asm/uaccess.h>
40
41#include "xattr.h"
42#include "acl.h"
43#include "namei.h"
44
45static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
46 unsigned long journal_devnum);
47static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
48 unsigned int);
49static void ext3_commit_super (struct super_block * sb,
50 struct ext3_super_block * es,
51 int sync);
52static void ext3_mark_recovery_complete(struct super_block * sb,
53 struct ext3_super_block * es);
54static void ext3_clear_journal_err(struct super_block * sb,
55 struct ext3_super_block * es);
56static int ext3_sync_fs(struct super_block *sb, int wait);
57static const char *ext3_decode_error(struct super_block * sb, int errno,
58 char nbuf[16]);
59static int ext3_remount (struct super_block * sb, int * flags, char * data);
60static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
61static void ext3_unlockfs(struct super_block *sb);
62static void ext3_write_super (struct super_block * sb);
63static void ext3_write_super_lockfs(struct super_block *sb);
64
65/*
66 * Wrappers for journal_start/end.
67 *
68 * The only special thing we need to do here is to make sure that all
69 * journal_end calls result in the superblock being marked dirty, so
70 * that sync() will call the filesystem's write_super callback if
71 * appropriate.
72 */
73handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
74{
75 journal_t *journal;
76
77 if (sb->s_flags & MS_RDONLY)
78 return ERR_PTR(-EROFS);
79
80 /* Special case here: if the journal has aborted behind our
81 * backs (eg. EIO in the commit thread), then we still need to
82 * take the FS itself readonly cleanly. */
83 journal = EXT3_SB(sb)->s_journal;
84 if (is_journal_aborted(journal)) {
85 ext3_abort(sb, __FUNCTION__,
86 "Detected aborted journal");
87 return ERR_PTR(-EROFS);
88 }
89
90 return journal_start(journal, nblocks);
91}
92
93/*
94 * The only special thing we need to do here is to make sure that all
95 * journal_stop calls result in the superblock being marked dirty, so
96 * that sync() will call the filesystem's write_super callback if
97 * appropriate.
98 */
99int __ext3_journal_stop(const char *where, handle_t *handle)
100{
101 struct super_block *sb;
102 int err;
103 int rc;
104
105 sb = handle->h_transaction->t_journal->j_private;
106 err = handle->h_err;
107 rc = journal_stop(handle);
108
109 if (!err)
110 err = rc;
111 if (err)
112 __ext3_std_error(sb, where, err);
113 return err;
114}
115
116void ext3_journal_abort_handle(const char *caller, const char *err_fn,
117 struct buffer_head *bh, handle_t *handle, int err)
118{
119 char nbuf[16];
120 const char *errstr = ext3_decode_error(NULL, err, nbuf);
121
122 if (bh)
123 BUFFER_TRACE(bh, "abort");
124
125 if (!handle->h_err)
126 handle->h_err = err;
127
128 if (is_handle_aborted(handle))
129 return;
130
131 printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
132 caller, errstr, err_fn);
133
134 journal_abort_handle(handle);
135}
136
137/* Deal with the reporting of failure conditions on a filesystem such as
138 * inconsistencies detected or read IO failures.
139 *
140 * On ext2, we can store the error state of the filesystem in the
141 * superblock. That is not possible on ext3, because we may have other
142 * write ordering constraints on the superblock which prevent us from
143 * writing it out straight away; and given that the journal is about to
144 * be aborted, we can't rely on the current, or future, transactions to
145 * write out the superblock safely.
146 *
147 * We'll just use the journal_abort() error code to record an error in
148 * the journal instead. On recovery, the journal will complain about
149 * that error until we've noted it down and cleared it.
150 */
151
152static void ext3_handle_error(struct super_block *sb)
153{
154 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
155
156 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
157 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
158
159 if (sb->s_flags & MS_RDONLY)
160 return;
161
162 if (!test_opt (sb, ERRORS_CONT)) {
163 journal_t *journal = EXT3_SB(sb)->s_journal;
164
165 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
166 if (journal)
167 journal_abort(journal, -EIO);
168 }
169 if (test_opt (sb, ERRORS_RO)) {
170 printk (KERN_CRIT "Remounting filesystem read-only\n");
171 sb->s_flags |= MS_RDONLY;
172 }
173 ext3_commit_super(sb, es, 1);
174 if (test_opt(sb, ERRORS_PANIC))
175 panic("EXT3-fs (device %s): panic forced after error\n",
176 sb->s_id);
177}
178
179void ext3_error (struct super_block * sb, const char * function,
180 const char * fmt, ...)
181{
182 va_list args;
183
184 va_start(args, fmt);
185 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
186 vprintk(fmt, args);
187 printk("\n");
188 va_end(args);
189
190 ext3_handle_error(sb);
191}
192
193static const char *ext3_decode_error(struct super_block * sb, int errno,
194 char nbuf[16])
195{
196 char *errstr = NULL;
197
198 switch (errno) {
199 case -EIO:
200 errstr = "IO failure";
201 break;
202 case -ENOMEM:
203 errstr = "Out of memory";
204 break;
205 case -EROFS:
206 if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
207 errstr = "Journal has aborted";
208 else
209 errstr = "Readonly filesystem";
210 break;
211 default:
212 /* If the caller passed in an extra buffer for unknown
213 * errors, textualise them now. Else we just return
214 * NULL. */
215 if (nbuf) {
216 /* Check for truncated error codes... */
217 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
218 errstr = nbuf;
219 }
220 break;
221 }
222
223 return errstr;
224}
225
226/* __ext3_std_error decodes expected errors from journaling functions
227 * automatically and invokes the appropriate error response. */
228
229void __ext3_std_error (struct super_block * sb, const char * function,
230 int errno)
231{
232 char nbuf[16];
233 const char *errstr;
234
235 /* Special case: if the error is EROFS, and we're not already
236 * inside a transaction, then there's really no point in logging
237 * an error. */
238 if (errno == -EROFS && journal_current_handle() == NULL &&
239 (sb->s_flags & MS_RDONLY))
240 return;
241
242 errstr = ext3_decode_error(sb, errno, nbuf);
243 printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
244 sb->s_id, function, errstr);
245
246 ext3_handle_error(sb);
247}
248
249/*
250 * ext3_abort is a much stronger failure handler than ext3_error. The
251 * abort function may be used to deal with unrecoverable failures such
252 * as journal IO errors or ENOMEM at a critical moment in log management.
253 *
254 * We unconditionally force the filesystem into an ABORT|READONLY state,
255 * unless the error response on the fs has been set to panic in which
256 * case we take the easy way out and panic immediately.
257 */
258
259void ext3_abort (struct super_block * sb, const char * function,
260 const char * fmt, ...)
261{
262 va_list args;
263
264 printk (KERN_CRIT "ext3_abort called.\n");
265
266 va_start(args, fmt);
267 printk(KERN_CRIT "EXT3-fs error (device %s): %s: ",sb->s_id, function);
268 vprintk(fmt, args);
269 printk("\n");
270 va_end(args);
271
272 if (test_opt(sb, ERRORS_PANIC))
273 panic("EXT3-fs panic from previous error\n");
274
275 if (sb->s_flags & MS_RDONLY)
276 return;
277
278 printk(KERN_CRIT "Remounting filesystem read-only\n");
279 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
280 sb->s_flags |= MS_RDONLY;
281 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
282 journal_abort(EXT3_SB(sb)->s_journal, -EIO);
283}
284
285void ext3_warning (struct super_block * sb, const char * function,
286 const char * fmt, ...)
287{
288 va_list args;
289
290 va_start(args, fmt);
291 printk(KERN_WARNING "EXT3-fs warning (device %s): %s: ",
292 sb->s_id, function);
293 vprintk(fmt, args);
294 printk("\n");
295 va_end(args);
296}
297
298void ext3_update_dynamic_rev(struct super_block *sb)
299{
300 struct ext3_super_block *es = EXT3_SB(sb)->s_es;
301
302 if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
303 return;
304
305 ext3_warning(sb, __FUNCTION__,
306 "updating to rev %d because of new feature flag, "
307 "running e2fsck is recommended",
308 EXT3_DYNAMIC_REV);
309
310 es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
311 es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
312 es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
313 /* leave es->s_feature_*compat flags alone */
314 /* es->s_uuid will be set by e2fsck if empty */
315
316 /*
317 * The rest of the superblock fields should be zero, and if not it
318 * means they are likely already in use, so leave them alone. We
319 * can leave it up to e2fsck to clean up any inconsistencies there.
320 */
321}
322
323/*
324 * Open the external journal device
325 */
326static struct block_device *ext3_blkdev_get(dev_t dev)
327{
328 struct block_device *bdev;
329 char b[BDEVNAME_SIZE];
330
331 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
332 if (IS_ERR(bdev))
333 goto fail;
334 return bdev;
335
336fail:
337 printk(KERN_ERR "EXT3: failed to open journal device %s: %ld\n",
338 __bdevname(dev, b), PTR_ERR(bdev));
339 return NULL;
340}
341
/*
 * Release the journal device: calls bd_release() on it and then
 * blkdev_put(), whose result is returned to the caller.
 */
static int ext3_blkdev_put(struct block_device *bdev)
{
	int rc;

	bd_release(bdev);
	rc = blkdev_put(bdev);

	return rc;
}
350
351static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
352{
353 struct block_device *bdev;
354 int ret = -ENODEV;
355
356 bdev = sbi->journal_bdev;
357 if (bdev) {
358 ret = ext3_blkdev_put(bdev);
359 sbi->journal_bdev = NULL;
360 }
361 return ret;
362}
363
364static inline struct inode *orphan_list_entry(struct list_head *l)
365{
366 return &list_entry(l, struct ext3_inode_info, i_orphan)->vfs_inode;
367}
368
369static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
370{
371 struct list_head *l;
372
373 printk(KERN_ERR "sb orphan head is %d\n",
374 le32_to_cpu(sbi->s_es->s_last_orphan));
375
376 printk(KERN_ERR "sb_info orphan list:\n");
377 list_for_each(l, &sbi->s_orphan) {
378 struct inode *inode = orphan_list_entry(l);
379 printk(KERN_ERR " "
380 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
381 inode->i_sb->s_id, inode->i_ino, inode,
382 inode->i_mode, inode->i_nlink,
383 NEXT_ORPHAN(inode));
384 }
385}
386
387static void ext3_put_super (struct super_block * sb)
388{
389 struct ext3_sb_info *sbi = EXT3_SB(sb);
390 struct ext3_super_block *es = sbi->s_es;
391 int i;
392
393 ext3_xattr_put_super(sb);
394 journal_destroy(sbi->s_journal);
395 if (!(sb->s_flags & MS_RDONLY)) {
396 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
397 es->s_state = cpu_to_le16(sbi->s_mount_state);
398 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
399 mark_buffer_dirty(sbi->s_sbh);
400 ext3_commit_super(sb, es, 1);
401 }
402
403 for (i = 0; i < sbi->s_gdb_count; i++)
404 brelse(sbi->s_group_desc[i]);
405 kfree(sbi->s_group_desc);
406 percpu_counter_destroy(&sbi->s_freeblocks_counter);
407 percpu_counter_destroy(&sbi->s_freeinodes_counter);
408 percpu_counter_destroy(&sbi->s_dirs_counter);
409 brelse(sbi->s_sbh);
410#ifdef CONFIG_QUOTA
411 for (i = 0; i < MAXQUOTAS; i++)
412 kfree(sbi->s_qf_names[i]);
413#endif
414
415 /* Debugging code just in case the in-memory inode orphan list
416 * isn't empty. The on-disk one can be non-empty if we've
417 * detected an error and taken the fs readonly, but the
418 * in-memory list had better be clean by this point. */
419 if (!list_empty(&sbi->s_orphan))
420 dump_orphan_list(sb, sbi);
421 J_ASSERT(list_empty(&sbi->s_orphan));
422
423 invalidate_bdev(sb->s_bdev, 0);
424 if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
425 /*
426 * Invalidate the journal device's buffers. We don't want them
427 * floating about in memory - the physical journal device may
428 * hotswapped, and it breaks the `ro-after' testing code.
429 */
430 sync_blockdev(sbi->journal_bdev);
431 invalidate_bdev(sbi->journal_bdev, 0);
432 ext3_blkdev_remove(sbi);
433 }
434 sb->s_fs_info = NULL;
435 kfree(sbi);
436 return;
437}
438
/* Slab cache used for struct ext3_inode_info allocations in this file. */
439static kmem_cache_t *ext3_inode_cachep;
440
441/*
442 * Called inside transaction, so use GFP_NOFS
443 */
444static struct inode *ext3_alloc_inode(struct super_block *sb)
445{
446 struct ext3_inode_info *ei;
447
448 ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
449 if (!ei)
450 return NULL;
451#ifdef CONFIG_EXT3_FS_POSIX_ACL
452 ei->i_acl = EXT3_ACL_NOT_CACHED;
453 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
454#endif
455 ei->i_block_alloc_info = NULL;
456 ei->vfs_inode.i_version = 1;
457 return &ei->vfs_inode;
458}
459
460static void ext3_destroy_inode(struct inode *inode)
461{
462 kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
463}
464
465static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
466{
467 struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
468
469 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
470 SLAB_CTOR_CONSTRUCTOR) {
471 INIT_LIST_HEAD(&ei->i_orphan);
472#ifdef CONFIG_EXT3_FS_XATTR
473 init_rwsem(&ei->xattr_sem);
474#endif
475 mutex_init(&ei->truncate_mutex);
476 inode_init_once(&ei->vfs_inode);
477 }
478}
479
480static int init_inodecache(void)
481{
482 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
483 sizeof(struct ext3_inode_info),
484 0, (SLAB_RECLAIM_ACCOUNT|
485 SLAB_MEM_SPREAD),
486 init_once, NULL);
487 if (ext3_inode_cachep == NULL)
488 return -ENOMEM;
489 return 0;
490}
491
492static void destroy_inodecache(void)
493{
494 kmem_cache_destroy(ext3_inode_cachep);
495}
496
497static void ext3_clear_inode(struct inode *inode)
498{
499 struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
500#ifdef CONFIG_EXT3_FS_POSIX_ACL
501 if (EXT3_I(inode)->i_acl &&
502 EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
503 posix_acl_release(EXT3_I(inode)->i_acl);
504 EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
505 }
506 if (EXT3_I(inode)->i_default_acl &&
507 EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
508 posix_acl_release(EXT3_I(inode)->i_default_acl);
509 EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
510 }
511#endif
512 ext3_discard_reservation(inode);
513 EXT3_I(inode)->i_block_alloc_info = NULL;
514 if (unlikely(rsv))
515 kfree(rsv);
516}
517
/*
 * Emit the quota-related mount options of @sb into @seq: journalled
 * quota format, journalled quota file names, and the usrquota/grpquota
 * flags.  Compiles to nothing without CONFIG_QUOTA.
 */
static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext3_sb_info *sbi = EXT3_SB(sb);
	const char *usr_name = sbi->s_qf_names[USRQUOTA];
	const char *grp_name = sbi->s_qf_names[GRPQUOTA];

	if (sbi->s_jquota_fmt) {
		const char *fmt_name = "vfsv0";

		if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
			fmt_name = "vfsold";
		seq_printf(seq, ",jqfmt=%s", fmt_name);
	}

	if (usr_name)
		seq_printf(seq, ",usrjquota=%s", usr_name);

	if (grp_name)
		seq_printf(seq, ",grpjquota=%s", grp_name);

	if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
		seq_puts(seq, ",usrquota");

	if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
		seq_puts(seq, ",grpquota");
#endif
}
540
541static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
542{
543 struct super_block *sb = vfs->mnt_sb;
544
545 if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
546 seq_puts(seq, ",data=journal");
547 else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
548 seq_puts(seq, ",data=ordered");
549 else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
550 seq_puts(seq, ",data=writeback");
551
552 ext3_show_quota_options(seq, sb);
553
554 return 0;
555}
556
557
558static struct dentry *ext3_get_dentry(struct super_block *sb, void *vobjp)
559{
560 __u32 *objp = vobjp;
561 unsigned long ino = objp[0];
562 __u32 generation = objp[1];
563 struct inode *inode;
564 struct dentry *result;
565
566 if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
567 return ERR_PTR(-ESTALE);
568 if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
569 return ERR_PTR(-ESTALE);
570
571 /* iget isn't really right if the inode is currently unallocated!!
572 *
573 * ext3_read_inode will return a bad_inode if the inode had been
574 * deleted, so we should be safe.
575 *
576 * Currently we don't know the generation for parent directory, so
577 * a generation of 0 means "accept any"
578 */
579 inode = iget(sb, ino);
580 if (inode == NULL)
581 return ERR_PTR(-ENOMEM);
582 if (is_bad_inode(inode) ||
583 (generation && inode->i_generation != generation)) {
584 iput(inode);
585 return ERR_PTR(-ESTALE);
586 }
587 /* now to find a dentry.
588 * If possible, get a well-connected one
589 */
590 result = d_alloc_anon(inode);
591 if (!result) {
592 iput(inode);
593 return ERR_PTR(-ENOMEM);
594 }
595 return result;
596}
597
/*
 * Quota support (CONFIG_QUOTA only): helper macros, forward declarations
 * of the ext3 quota callbacks, and the two operation tables built from them.
 */
598#ifdef CONFIG_QUOTA
/* Human-readable name of a quota type: "user" for USRQUOTA, else "group". */
599#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
/* NOTE(review): "(on)##USRJQUOTA" pastes ')' onto an identifier, which does
 * not look like a valid token paste; no use of this macro is visible in this
 * part of the file - confirm before relying on it. */
600#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
601
602static int ext3_dquot_initialize(struct inode *inode, int type);
603static int ext3_dquot_drop(struct inode *inode);
604static int ext3_write_dquot(struct dquot *dquot);
605static int ext3_acquire_dquot(struct dquot *dquot);
606static int ext3_release_dquot(struct dquot *dquot);
607static int ext3_mark_dquot_dirty(struct dquot *dquot);
608static int ext3_write_info(struct super_block *sb, int type);
609static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path);
610static int ext3_quota_on_mount(struct super_block *sb, int type);
611static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
612 size_t len, loff_t off);
613static ssize_t ext3_quota_write(struct super_block *sb, int type,
614 const char *data, size_t len, loff_t off);
615
/* dquot operations: ext3-specific wrappers for initialise/drop and for the
 * dquot write/acquire/release/dirty/info paths; the space and inode
 * accounting entries point at the generic dquot_* helpers. */
616static struct dquot_operations ext3_quota_operations = {
617 .initialize = ext3_dquot_initialize,
618 .drop = ext3_dquot_drop,
619 .alloc_space = dquot_alloc_space,
620 .alloc_inode = dquot_alloc_inode,
621 .free_space = dquot_free_space,
622 .free_inode = dquot_free_inode,
623 .transfer = dquot_transfer,
624 .write_dquot = ext3_write_dquot,
625 .acquire_dquot = ext3_acquire_dquot,
626 .release_dquot = ext3_release_dquot,
627 .mark_dirty = ext3_mark_dquot_dirty,
628 .write_info = ext3_write_info
629};
630
/* quotactl operations: only quota_on is ext3-specific, everything else is
 * the generic vfs_* implementation. */
631static struct quotactl_ops ext3_qctl_operations = {
632 .quota_on = ext3_quota_on,
633 .quota_off = vfs_quota_off,
634 .quota_sync = vfs_quota_sync,
635 .get_info = vfs_get_dqinfo,
636 .set_info = vfs_set_dqinfo,
637 .get_dqblk = vfs_get_dqblk,
638 .set_dqblk = vfs_set_dqblk
639};
640#endif
641
/*
 * Superblock operations table for ext3.  The quota read/write hooks are
 * only present when CONFIG_QUOTA is enabled.
 */
642static struct super_operations ext3_sops = {
643 .alloc_inode = ext3_alloc_inode,
644 .destroy_inode = ext3_destroy_inode,
645 .read_inode = ext3_read_inode,
646 .write_inode = ext3_write_inode,
647 .dirty_inode = ext3_dirty_inode,
648 .delete_inode = ext3_delete_inode,
649 .put_super = ext3_put_super,
650 .write_super = ext3_write_super,
651 .sync_fs = ext3_sync_fs,
652 .write_super_lockfs = ext3_write_super_lockfs,
653 .unlockfs = ext3_unlockfs,
654 .statfs = ext3_statfs,
655 .remount_fs = ext3_remount,
656 .clear_inode = ext3_clear_inode,
657 .show_options = ext3_show_options,
658#ifdef CONFIG_QUOTA
659 .quota_read = ext3_quota_read,
660 .quota_write = ext3_quota_write,
661#endif
662};
663
/*
 * Export operations table: parent lookup via ext3_get_parent() and
 * handle-to-dentry lookup via ext3_get_dentry().
 */
664static struct export_operations ext3_export_ops = {
665 .get_parent = ext3_get_parent,
666 .get_dentry = ext3_get_dentry,
667};
668
/*
 * Token identifiers for mount options.  These are the values stored in the
 * match table below and switched on in parse_options().
 */
669enum {
670 Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
671 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
672 Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
673 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
674 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
675 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
676 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
677 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
678 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
679 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
680 Opt_grpquota
681};
682
683static match_table_t tokens = {
684 {Opt_bsd_df, "bsddf"},
685 {Opt_minix_df, "minixdf"},
686 {Opt_grpid, "grpid"},
687 {Opt_grpid, "bsdgroups"},
688 {Opt_nogrpid, "nogrpid"},
689 {Opt_nogrpid, "sysvgroups"},
690 {Opt_resgid, "resgid=%u"},
691 {Opt_resuid, "resuid=%u"},
692 {Opt_sb, "sb=%u"},
693 {Opt_err_cont, "errors=continue"},
694 {Opt_err_panic, "errors=panic"},
695 {Opt_err_ro, "errors=remount-ro"},
696 {Opt_nouid32, "nouid32"},
697 {Opt_nocheck, "nocheck"},
698 {Opt_nocheck, "check=none"},
699 {Opt_debug, "debug"},
700 {Opt_oldalloc, "oldalloc"},
701 {Opt_orlov, "orlov"},
702 {Opt_user_xattr, "user_xattr"},
703 {Opt_nouser_xattr, "nouser_xattr"},
704 {Opt_acl, "acl"},
705 {Opt_noacl, "noacl"},
706 {Opt_reservation, "reservation"},
707 {Opt_noreservation, "noreservation"},
708 {Opt_noload, "noload"},
709 {Opt_nobh, "nobh"},
710 {Opt_bh, "bh"},
711 {Opt_commit, "commit=%u"},
712 {Opt_journal_update, "journal=update"},
713 {Opt_journal_inum, "journal=%u"},
714 {Opt_journal_dev, "journal_dev=%u"},
715 {Opt_abort, "abort"},
716 {Opt_data_journal, "data=journal"},
717 {Opt_data_ordered, "data=ordered"},
718 {Opt_data_writeback, "data=writeback"},
719 {Opt_offusrjquota, "usrjquota="},
720 {Opt_usrjquota, "usrjquota=%s"},
721 {Opt_offgrpjquota, "grpjquota="},
722 {Opt_grpjquota, "grpjquota=%s"},
723 {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
724 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
725 {Opt_grpquota, "grpquota"},
726 {Opt_noquota, "noquota"},
727 {Opt_quota, "quota"},
728 {Opt_usrquota, "usrquota"},
729 {Opt_barrier, "barrier=%u"},
730 {Opt_err, NULL},
731 {Opt_resize, "resize"},
732};
733
734static ext3_fsblk_t get_sb_block(void **data)
735{
736 ext3_fsblk_t sb_block;
737 char *options = (char *) *data;
738
739 if (!options || strncmp(options, "sb=", 3) != 0)
740 return 1; /* Default location */
741 options += 3;
742 /*todo: use simple_strtoll with >32bit ext3 */
743 sb_block = simple_strtoul(options, &options, 0);
744 if (*options && *options != ',') {
745 printk("EXT3-fs: Invalid sb specification: %s\n",
746 (char *) *data);
747 return 1;
748 }
749 if (*options == ',')
750 options++;
751 *data = (void *) options;
752 return sb_block;
753}
754
/*
 * Parse a comma-separated mount option string and apply it to @sb.
 *
 * @options:        option string; split in place with strsep().  NULL is
 *                  accepted and treated as "no options".
 * @sb:             superblock whose ext3_sb_info receives the settings.
 * @inum:           out: journal inode number ("journal=N").
 * @journal_devnum: out: journal device number ("journal_dev=N").
 * @n_blocks_count: out: requested block count for the resize option.
 * @is_remount:     non-zero when called for a remount; journal options are
 *                  then rejected, the data mode may not change, and only
 *                  then is the resize option accepted.
 *
 * Returns 1 on success and 0 on any failure (unrecognised option, bad
 * value, or an inconsistent quota configuration).  Options already applied
 * to the sb_info before the failing one are not rolled back here.
 */
755static int parse_options (char *options, struct super_block *sb,
756 unsigned int *inum, unsigned long *journal_devnum,
757 ext3_fsblk_t *n_blocks_count, int is_remount)
758{
759 struct ext3_sb_info *sbi = EXT3_SB(sb);
760 char * p;
761 substring_t args[MAX_OPT_ARGS];
762 int data_opt = 0;
763 int option;
764#ifdef CONFIG_QUOTA
765 int qtype;
766 char *qname;
767#endif
768
	/* A missing option string is not an error. */
769 if (!options)
770 return 1;
771
	/* One iteration per comma-separated token; empty tokens are skipped. */
772 while ((p = strsep (&options, ",")) != NULL) {
773 int token;
774 if (!*p)
775 continue;
776
777 token = match_token(p, tokens, args);
778 switch (token) {
779 case Opt_bsd_df:
780 clear_opt (sbi->s_mount_opt, MINIX_DF);
781 break;
782 case Opt_minix_df:
783 set_opt (sbi->s_mount_opt, MINIX_DF);
784 break;
785 case Opt_grpid:
786 set_opt (sbi->s_mount_opt, GRPID);
787 break;
788 case Opt_nogrpid:
789 clear_opt (sbi->s_mount_opt, GRPID);
790 break;
791 case Opt_resuid:
792 if (match_int(&args[0], &option))
793 return 0;
794 sbi->s_resuid = option;
795 break;
796 case Opt_resgid:
797 if (match_int(&args[0], &option))
798 return 0;
799 sbi->s_resgid = option;
800 break;
801 case Opt_sb:
802 /* handled by get_sb_block() instead of here */
803 /* *sb_block = match_int(&args[0]); */
804 break;
	/* The three errors= policies are mutually exclusive: each case
	 * clears the other two flags before setting its own. */
805 case Opt_err_panic:
806 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
807 clear_opt (sbi->s_mount_opt, ERRORS_RO);
808 set_opt (sbi->s_mount_opt, ERRORS_PANIC);
809 break;
810 case Opt_err_ro:
811 clear_opt (sbi->s_mount_opt, ERRORS_CONT);
812 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
813 set_opt (sbi->s_mount_opt, ERRORS_RO);
814 break;
815 case Opt_err_cont:
816 clear_opt (sbi->s_mount_opt, ERRORS_RO);
817 clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
818 set_opt (sbi->s_mount_opt, ERRORS_CONT);
819 break;
820 case Opt_nouid32:
821 set_opt (sbi->s_mount_opt, NO_UID32);
822 break;
823 case Opt_nocheck:
824 clear_opt (sbi->s_mount_opt, CHECK);
825 break;
826 case Opt_debug:
827 set_opt (sbi->s_mount_opt, DEBUG);
828 break;
829 case Opt_oldalloc:
830 set_opt (sbi->s_mount_opt, OLDALLOC);
831 break;
832 case Opt_orlov:
833 clear_opt (sbi->s_mount_opt, OLDALLOC);
834 break;
835#ifdef CONFIG_EXT3_FS_XATTR
836 case Opt_user_xattr:
837 set_opt (sbi->s_mount_opt, XATTR_USER);
838 break;
839 case Opt_nouser_xattr:
840 clear_opt (sbi->s_mount_opt, XATTR_USER);
841 break;
842#else
	/* Without xattr support the options are accepted but only logged. */
843 case Opt_user_xattr:
844 case Opt_nouser_xattr:
845 printk("EXT3 (no)user_xattr options not supported\n");
846 break;
847#endif
848#ifdef CONFIG_EXT3_FS_POSIX_ACL
849 case Opt_acl:
850 set_opt(sbi->s_mount_opt, POSIX_ACL);
851 break;
852 case Opt_noacl:
853 clear_opt(sbi->s_mount_opt, POSIX_ACL);
854 break;
855#else
	/* Without ACL support the options are accepted but only logged. */
856 case Opt_acl:
857 case Opt_noacl:
858 printk("EXT3 (no)acl options not supported\n");
859 break;
860#endif
861 case Opt_reservation:
862 set_opt(sbi->s_mount_opt, RESERVATION);
863 break;
864 case Opt_noreservation:
865 clear_opt(sbi->s_mount_opt, RESERVATION);
866 break;
867 case Opt_journal_update:
868 /* @@@ FIXME */
869 /* Eventually we will want to be able to create
870 a journal file here. For now, only allow the
871 user to specify an existing inode to be the
872 journal file. */
873 if (is_remount) {
874 printk(KERN_ERR "EXT3-fs: cannot specify "
875 "journal on remount\n");
876 return 0;
877 }
878 set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
879 break;
880 case Opt_journal_inum:
881 if (is_remount) {
882 printk(KERN_ERR "EXT3-fs: cannot specify "
883 "journal on remount\n");
884 return 0;
885 }
886 if (match_int(&args[0], &option))
887 return 0;
888 *inum = option;
889 break;
890 case Opt_journal_dev:
891 if (is_remount) {
892 printk(KERN_ERR "EXT3-fs: cannot specify "
893 "journal on remount\n");
894 return 0;
895 }
896 if (match_int(&args[0], &option))
897 return 0;
898 *journal_devnum = option;
899 break;
900 case Opt_noload:
901 set_opt (sbi->s_mount_opt, NOLOAD);
902 break;
	/* commit=0 selects JBD_DEFAULT_MAX_COMMIT_AGE; the value is stored
	 * multiplied by HZ. */
903 case Opt_commit:
904 if (match_int(&args[0], &option))
905 return 0;
906 if (option < 0)
907 return 0;
908 if (option == 0)
909 option = JBD_DEFAULT_MAX_COMMIT_AGE;
910 sbi->s_commit_interval = HZ * option;
911 break;
	/* All three data= cases funnel into the shared datacheck label. */
912 case Opt_data_journal:
913 data_opt = EXT3_MOUNT_JOURNAL_DATA;
914 goto datacheck;
915 case Opt_data_ordered:
916 data_opt = EXT3_MOUNT_ORDERED_DATA;
917 goto datacheck;
918 case Opt_data_writeback:
919 data_opt = EXT3_MOUNT_WRITEBACK_DATA;
920 datacheck:
	/* On remount the data mode may be restated but not changed. */
921 if (is_remount) {
922 if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
923 != data_opt) {
924 printk(KERN_ERR
925 "EXT3-fs: cannot change data "
926 "mode on remount\n");
927 return 0;
928 }
929 } else {
930 sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
931 sbi->s_mount_opt |= data_opt;
932 }
933 break;
934#ifdef CONFIG_QUOTA
935 case Opt_usrjquota:
936 qtype = USRQUOTA;
937 goto set_qf_name;
938 case Opt_grpjquota:
939 qtype = GRPQUOTA;
940set_qf_name:
941 if (sb_any_quota_enabled(sb)) {
942 printk(KERN_ERR
943 "EXT3-fs: Cannot change journalled "
944 "quota options when quota turned on.\n");
945 return 0;
946 }
	/* match_strdup() allocates a copy of the name; it is stored in
	 * s_qf_names[] below or freed on the error paths. */
947 qname = match_strdup(&args[0]);
948 if (!qname) {
949 printk(KERN_ERR
950 "EXT3-fs: not enough memory for "
951 "storing quotafile name.\n");
952 return 0;
953 }
954 if (sbi->s_qf_names[qtype] &&
955 strcmp(sbi->s_qf_names[qtype], qname)) {
956 printk(KERN_ERR
957 "EXT3-fs: %s quota file already "
958 "specified.\n", QTYPE2NAME(qtype));
959 kfree(qname);
960 return 0;
961 }
	/* NOTE(review): when an identical name was already stored, the old
	 * pointer is overwritten here without being freed in this function -
	 * confirm the caller keeps and releases it. */
962 sbi->s_qf_names[qtype] = qname;
963 if (strchr(sbi->s_qf_names[qtype], '/')) {
964 printk(KERN_ERR
965 "EXT3-fs: quotafile must be on "
966 "filesystem root.\n");
967 kfree(sbi->s_qf_names[qtype]);
968 sbi->s_qf_names[qtype] = NULL;
969 return 0;
970 }
971 set_opt(sbi->s_mount_opt, QUOTA);
972 break;
973 case Opt_offusrjquota:
974 qtype = USRQUOTA;
975 goto clear_qf_name;
976 case Opt_offgrpjquota:
977 qtype = GRPQUOTA;
978clear_qf_name:
979 if (sb_any_quota_enabled(sb)) {
980 printk(KERN_ERR "EXT3-fs: Cannot change "
981 "journalled quota options when "
982 "quota turned on.\n");
983 return 0;
984 }
985 /*
986 * The space will be released later when all options
987 * are confirmed to be correct
988 */
989 sbi->s_qf_names[qtype] = NULL;
990 break;
991 case Opt_jqfmt_vfsold:
992 sbi->s_jquota_fmt = QFMT_VFS_OLD;
993 break;
994 case Opt_jqfmt_vfsv0:
995 sbi->s_jquota_fmt = QFMT_VFS_V0;
996 break;
	/* Plain "quota" is treated exactly like "usrquota". */
997 case Opt_quota:
998 case Opt_usrquota:
999 set_opt(sbi->s_mount_opt, QUOTA);
1000 set_opt(sbi->s_mount_opt, USRQUOTA);
1001 break;
1002 case Opt_grpquota:
1003 set_opt(sbi->s_mount_opt, QUOTA);
1004 set_opt(sbi->s_mount_opt, GRPQUOTA);
1005 break;
1006 case Opt_noquota:
1007 if (sb_any_quota_enabled(sb)) {
1008 printk(KERN_ERR "EXT3-fs: Cannot change quota "
1009 "options when quota turned on.\n");
1010 return 0;
1011 }
1012 clear_opt(sbi->s_mount_opt, QUOTA);
1013 clear_opt(sbi->s_mount_opt, USRQUOTA);
1014 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1015 break;
1016#else
	/* Without CONFIG_QUOTA the quota options are logged and ignored. */
1017 case Opt_quota:
1018 case Opt_usrquota:
1019 case Opt_grpquota:
1020 case Opt_usrjquota:
1021 case Opt_grpjquota:
1022 case Opt_offusrjquota:
1023 case Opt_offgrpjquota:
1024 case Opt_jqfmt_vfsold:
1025 case Opt_jqfmt_vfsv0:
1026 printk(KERN_ERR
1027 "EXT3-fs: journalled quota options not "
1028 "supported.\n");
1029 break;
1030 case Opt_noquota:
1031 break;
1032#endif
1033 case Opt_abort:
1034 set_opt(sbi->s_mount_opt, ABORT);
1035 break;
	/* barrier=0 clears the flag, any other value sets it. */
1036 case Opt_barrier:
1037 if (match_int(&args[0], &option))
1038 return 0;
1039 if (option)
1040 set_opt(sbi->s_mount_opt, BARRIER);
1041 else
1042 clear_opt(sbi->s_mount_opt, BARRIER);
1043 break;
1044 case Opt_ignore:
1045 break;
1046 case Opt_resize:
1047 if (!is_remount) {
1048 printk("EXT3-fs: resize option only available "
1049 "for remount\n");
1050 return 0;
1051 }
1052 if (match_int(&args[0], &option) != 0)
1053 return 0;
1054 *n_blocks_count = option;
1055 break;
1056 case Opt_nobh:
1057 set_opt(sbi->s_mount_opt, NOBH);
1058 break;
1059 case Opt_bh:
1060 clear_opt(sbi->s_mount_opt, NOBH);
1061 break;
1062 default:
1063 printk (KERN_ERR
1064 "EXT3-fs: Unrecognized mount option \"%s\" "
1065 "or missing value\n", p);
1066 return 0;
1067 }
1068 }
1069#ifdef CONFIG_QUOTA
	/* Cross-check the quota settings once every option has been seen:
	 * a journalled quota file overrides the matching usrquota/grpquota
	 * flag, old and journalled styles may not be mixed, and jqfmt= must
	 * be given exactly when a journalled quota file is named. */
1070 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
1071 if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
1072 sbi->s_qf_names[USRQUOTA])
1073 clear_opt(sbi->s_mount_opt, USRQUOTA);
1074
1075 if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
1076 sbi->s_qf_names[GRPQUOTA])
1077 clear_opt(sbi->s_mount_opt, GRPQUOTA);
1078
1079 if ((sbi->s_qf_names[USRQUOTA] &&
1080 (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
1081 (sbi->s_qf_names[GRPQUOTA] &&
1082 (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
1083 printk(KERN_ERR "EXT3-fs: old and new quota "
1084 "format mixing.\n");
1085 return 0;
1086 }
1087
1088 if (!sbi->s_jquota_fmt) {
1089 printk(KERN_ERR "EXT3-fs: journalled quota format "
1090 "not specified.\n");
1091 return 0;
1092 }
1093 } else {
1094 if (sbi->s_jquota_fmt) {
1095 printk(KERN_ERR "EXT3-fs: journalled quota format "
1096 "specified with no journalling "
1097 "enabled.\n");
1098 return 0;
1099 }
1100 }
1101#endif
1102 return 1;
1103}
1104
1105static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1106 int read_only)
1107{
1108 struct ext3_sb_info *sbi = EXT3_SB(sb);
1109 int res = 0;
1110
1111 if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
1112 printk (KERN_ERR "EXT3-fs warning: revision level too high, "
1113 "forcing read-only mode\n");
1114 res = MS_RDONLY;
1115 }
1116 if (read_only)
1117 return res;
1118 if (!(sbi->s_mount_state & EXT3_VALID_FS))
1119 printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
1120 "running e2fsck is recommended\n");
1121 else if ((sbi->s_mount_state & EXT3_ERROR_FS))
1122 printk (KERN_WARNING
1123 "EXT3-fs warning: mounting fs with errors, "
1124 "running e2fsck is recommended\n");
1125 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
1126 le16_to_cpu(es->s_mnt_count) >=
1127 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1128 printk (KERN_WARNING
1129 "EXT3-fs warning: maximal mount count reached, "
1130 "running e2fsck is recommended\n");
1131 else if (le32_to_cpu(es->s_checkinterval) &&
1132 (le32_to_cpu(es->s_lastcheck) +
1133 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1134 printk (KERN_WARNING
1135 "EXT3-fs warning: checktime reached, "
1136 "running e2fsck is recommended\n");
1137#if 0
1138 /* @@@ We _will_ want to clear the valid bit if we find
1139 inconsistencies, to force a fsck at reboot. But for
1140 a plain journaled filesystem we can keep it set as
1141 valid forever! :) */
1142 es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
1143#endif
1144 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1145 es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
1146 es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
1147 es->s_mtime = cpu_to_le32(get_seconds());
1148 ext3_update_dynamic_rev(sb);
1149 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
1150
1151 ext3_commit_super(sb, es, 1);
1152 if (test_opt(sb, DEBUG))
1153 printk(KERN_INFO "[EXT3 FS bs=%lu, gc=%lu, "
1154 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1155 sb->s_blocksize,
1156 sbi->s_groups_count,
1157 EXT3_BLOCKS_PER_GROUP(sb),
1158 EXT3_INODES_PER_GROUP(sb),
1159 sbi->s_mount_opt);
1160
1161 printk(KERN_INFO "EXT3 FS on %s, ", sb->s_id);
1162 if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
1163 char b[BDEVNAME_SIZE];
1164
1165 printk("external journal on %s\n",
1166 bdevname(EXT3_SB(sb)->s_journal->j_dev, b));
1167 } else {
1168 printk("internal journal\n");
1169 }
1170 return res;
1171}
1172
1173/* Called at mount-time, super-block is locked */
1174static int ext3_check_descriptors (struct super_block * sb)
1175{
1176 struct ext3_sb_info *sbi = EXT3_SB(sb);
1177 ext3_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1178 ext3_fsblk_t last_block;
1179 struct ext3_group_desc * gdp = NULL;
1180 int desc_block = 0;
1181 int i;
1182
1183 ext3_debug ("Checking group descriptors");
1184
1185 for (i = 0; i < sbi->s_groups_count; i++)
1186 {
1187 if (i == sbi->s_groups_count - 1)
1188 last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
1189 else
1190 last_block = first_block +
1191 (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1192
1193 if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
1194 gdp = (struct ext3_group_desc *)
1195 sbi->s_group_desc[desc_block++]->b_data;
1196 if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
1197 le32_to_cpu(gdp->bg_block_bitmap) > last_block)
1198 {
1199 ext3_error (sb, "ext3_check_descriptors",
1200 "Block bitmap for group %d"
1201 " not in group (block %lu)!",
1202 i, (unsigned long)
1203 le32_to_cpu(gdp->bg_block_bitmap));
1204 return 0;
1205 }
1206 if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
1207 le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
1208 {
1209 ext3_error (sb, "ext3_check_descriptors",
1210 "Inode bitmap for group %d"
1211 " not in group (block %lu)!",
1212 i, (unsigned long)
1213 le32_to_cpu(gdp->bg_inode_bitmap));
1214 return 0;
1215 }
1216 if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
1217 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >
1218 last_block)
1219 {
1220 ext3_error (sb, "ext3_check_descriptors",
1221 "Inode table for group %d"
1222 " not in group (block %lu)!",
1223 i, (unsigned long)
1224 le32_to_cpu(gdp->bg_inode_table));
1225 return 0;
1226 }
1227 first_block += EXT3_BLOCKS_PER_GROUP(sb);
1228 gdp++;
1229 }
1230
1231 sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb));
1232 sbi->s_es->s_free_inodes_count=cpu_to_le32(ext3_count_free_inodes(sb));
1233 return 1;
1234}
1235
1236
1237/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
1238 * the superblock) which were deleted from all directories, but held open by
1239 * a process at the time of a crash. We walk the list and try to delete these
1240 * inodes at recovery time (only with a read-write filesystem).
1241 *
1242 * In order to keep the orphan inode chain consistent during traversal (in
1243 * case of crash during recovery), we link each inode into the superblock
1244 * orphan list_head and handle it the same way as an inode deletion during
1245 * normal operation (which journals the operations for us).
1246 *
1247 * We only do an iget() and an iput() on each inode, which is very safe if we
1248 * accidentally point at an in-use or already deleted inode. The worst that
1249 * can happen in this case is that we get a "bit already cleared" message from
1250 * ext3_free_inode(). The only reason we would point at a wrong inode is if
1251 * e2fsck was run on this filesystem, and it must have already done the orphan
1252 * inode cleanup for us, so we can safely abort without any further action.
1253 */
1254static void ext3_orphan_cleanup (struct super_block * sb,
1255 struct ext3_super_block * es)
1256{
1257 unsigned int s_flags = sb->s_flags;
1258 int nr_orphans = 0, nr_truncates = 0;
1259#ifdef CONFIG_QUOTA
1260 int i;
1261#endif
1262 if (!es->s_last_orphan) {
1263 jbd_debug(4, "no orphan inodes to clean up\n");
1264 return;
1265 }
1266
1267 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1268 if (es->s_last_orphan)
1269 jbd_debug(1, "Errors on filesystem, "
1270 "clearing orphan list.\n");
1271 es->s_last_orphan = 0;
1272 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
1273 return;
1274 }
1275
1276 if (s_flags & MS_RDONLY) {
1277 printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
1278 sb->s_id);
1279 sb->s_flags &= ~MS_RDONLY;
1280 }
1281#ifdef CONFIG_QUOTA
1282 /* Needed for iput() to work correctly and not trash data */
1283 sb->s_flags |= MS_ACTIVE;
1284 /* Turn on quotas so that they are updated correctly */
1285 for (i = 0; i < MAXQUOTAS; i++) {
1286 if (EXT3_SB(sb)->s_qf_names[i]) {
1287 int ret = ext3_quota_on_mount(sb, i);
1288 if (ret < 0)
1289 printk(KERN_ERR
1290 "EXT3-fs: Cannot turn on journalled "
1291 "quota: error %d\n", ret);
1292 }
1293 }
1294#endif
1295
1296 while (es->s_last_orphan) {
1297 struct inode *inode;
1298
1299 if (!(inode =
1300 ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
1301 es->s_last_orphan = 0;
1302 break;
1303 }
1304
1305 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1306 DQUOT_INIT(inode);
1307 if (inode->i_nlink) {
1308 printk(KERN_DEBUG
1309 "%s: truncating inode %lu to %Ld bytes\n",
1310 __FUNCTION__, inode->i_ino, inode->i_size);
1311 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1312 inode->i_ino, inode->i_size);
1313 ext3_truncate(inode);
1314 nr_truncates++;
1315 } else {
1316 printk(KERN_DEBUG
1317 "%s: deleting unreferenced inode %lu\n",
1318 __FUNCTION__, inode->i_ino);
1319 jbd_debug(2, "deleting unreferenced inode %lu\n",
1320 inode->i_ino);
1321 nr_orphans++;
1322 }
1323 iput(inode); /* The delete magic happens here! */
1324 }
1325
1326#define PLURAL(x) (x), ((x)==1) ? "" : "s"
1327
1328 if (nr_orphans)
1329 printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
1330 sb->s_id, PLURAL(nr_orphans));
1331 if (nr_truncates)
1332 printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
1333 sb->s_id, PLURAL(nr_truncates));
1334#ifdef CONFIG_QUOTA
1335 /* Turn quotas off */
1336 for (i = 0; i < MAXQUOTAS; i++) {
1337 if (sb_dqopt(sb)->files[i])
1338 vfs_quota_off(sb, i);
1339 }
1340#endif
1341 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
1342}
1343
1344#define log2(n) ffz(~(n))
1345
1346/*
1347 * Maximal file size. There is a direct, and {,double-,triple-}indirect
1348 * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
1349 * We need to be 1 filesystem block less than the 2^32 sector limit.
1350 */
1351static loff_t ext3_max_size(int bits)
1352{
1353 loff_t res = EXT3_NDIR_BLOCKS;
1354 /* This constant is calculated to be the largest file size for a
1355 * dense, 4k-blocksize file such that the total number of
1356 * sectors in the file, including data and all indirect blocks,
1357 * does not exceed 2^32. */
1358 const loff_t upper_limit = 0x1ff7fffd000LL;
1359
1360 res += 1LL << (bits-2);
1361 res += 1LL << (2*(bits-2));
1362 res += 1LL << (3*(bits-2));
1363 res <<= bits;
1364 if (res > upper_limit)
1365 res = upper_limit;
1366 return res;
1367}
1368
1369static ext3_fsblk_t descriptor_loc(struct super_block *sb,
1370 ext3_fsblk_t logic_sb_block,
1371 int nr)
1372{
1373 struct ext3_sb_info *sbi = EXT3_SB(sb);
1374 unsigned long bg, first_meta_bg;
1375 int has_super = 0;
1376
1377 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
1378
1379 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
1380 nr < first_meta_bg)
1381 return (logic_sb_block + nr + 1);
1382 bg = sbi->s_desc_per_block * nr;
1383 if (ext3_bg_has_super(sb, bg))
1384 has_super = 1;
1385 return (has_super + ext3_group_first_block_no(sb, bg));
1386}
1387
1388
1389static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1390{
1391 struct buffer_head * bh;
1392 struct ext3_super_block *es = NULL;
1393 struct ext3_sb_info *sbi;
1394 ext3_fsblk_t block;
1395 ext3_fsblk_t sb_block = get_sb_block(&data);
1396 ext3_fsblk_t logic_sb_block;
1397 unsigned long offset = 0;
1398 unsigned int journal_inum = 0;
1399 unsigned long journal_devnum = 0;
1400 unsigned long def_mount_opts;
1401 struct inode *root;
1402 int blocksize;
1403 int hblock;
1404 int db_count;
1405 int i;
1406 int needs_recovery;
1407 __le32 features;
1408
1409 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1410 if (!sbi)
1411 return -ENOMEM;
1412 sb->s_fs_info = sbi;
1413 sbi->s_mount_opt = 0;
1414 sbi->s_resuid = EXT3_DEF_RESUID;
1415 sbi->s_resgid = EXT3_DEF_RESGID;
1416
1417 unlock_kernel();
1418
1419 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
1420 if (!blocksize) {
1421 printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
1422 goto out_fail;
1423 }
1424
1425 /*
1426 * The ext3 superblock will not be buffer aligned for other than 1kB
1427 * block sizes. We need to calculate the offset from buffer start.
1428 */
1429 if (blocksize != EXT3_MIN_BLOCK_SIZE) {
1430 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1431 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1432 } else {
1433 logic_sb_block = sb_block;
1434 }
1435
1436 if (!(bh = sb_bread(sb, logic_sb_block))) {
1437 printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
1438 goto out_fail;
1439 }
1440 /*
1441 * Note: s_es must be initialized as soon as possible because
1442 * some ext3 macro-instructions depend on its value
1443 */
1444 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
1445 sbi->s_es = es;
1446 sb->s_magic = le16_to_cpu(es->s_magic);
1447 if (sb->s_magic != EXT3_SUPER_MAGIC)
1448 goto cantfind_ext3;
1449
1450 /* Set defaults before we parse the mount options */
1451 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1452 if (def_mount_opts & EXT3_DEFM_DEBUG)
1453 set_opt(sbi->s_mount_opt, DEBUG);
1454 if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
1455 set_opt(sbi->s_mount_opt, GRPID);
1456 if (def_mount_opts & EXT3_DEFM_UID16)
1457 set_opt(sbi->s_mount_opt, NO_UID32);
1458 if (def_mount_opts & EXT3_DEFM_XATTR_USER)
1459 set_opt(sbi->s_mount_opt, XATTR_USER);
1460 if (def_mount_opts & EXT3_DEFM_ACL)
1461 set_opt(sbi->s_mount_opt, POSIX_ACL);
1462 if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
1463 sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
1464 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
1465 sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
1466 else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
1467 sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
1468
1469 if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
1470 set_opt(sbi->s_mount_opt, ERRORS_PANIC);
1471 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
1472 set_opt(sbi->s_mount_opt, ERRORS_RO);
1473
1474 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1475 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
1476
1477 set_opt(sbi->s_mount_opt, RESERVATION);
1478
1479 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1480 NULL, 0))
1481 goto failed_mount;
1482
1483 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1484 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1485
1486 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
1487 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
1488 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1489 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1490 printk(KERN_WARNING
1491 "EXT3-fs warning: feature flags set on rev 0 fs, "
1492 "running e2fsck is recommended\n");
1493 /*
1494 * Check feature flags regardless of the revision level, since we
1495 * previously didn't change the revision level when setting the flags,
1496 * so there is a chance incompat flags are set on a rev 0 filesystem.
1497 */
1498 features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
1499 if (features) {
1500 printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
1501 "unsupported optional features (%x).\n",
1502 sb->s_id, le32_to_cpu(features));
1503 goto failed_mount;
1504 }
1505 features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
1506 if (!(sb->s_flags & MS_RDONLY) && features) {
1507 printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
1508 "unsupported optional features (%x).\n",
1509 sb->s_id, le32_to_cpu(features));
1510 goto failed_mount;
1511 }
1512 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
1513
1514 if (blocksize < EXT3_MIN_BLOCK_SIZE ||
1515 blocksize > EXT3_MAX_BLOCK_SIZE) {
1516 printk(KERN_ERR
1517 "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
1518 blocksize, sb->s_id);
1519 goto failed_mount;
1520 }
1521
1522 hblock = bdev_hardsect_size(sb->s_bdev);
1523 if (sb->s_blocksize != blocksize) {
1524 /*
1525 * Make sure the blocksize for the filesystem is larger
1526 * than the hardware sectorsize for the machine.
1527 */
1528 if (blocksize < hblock) {
1529 printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
1530 "device blocksize %d.\n", blocksize, hblock);
1531 goto failed_mount;
1532 }
1533
1534 brelse (bh);
1535 sb_set_blocksize(sb, blocksize);
1536 logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
1537 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1538 bh = sb_bread(sb, logic_sb_block);
1539 if (!bh) {
1540 printk(KERN_ERR
1541 "EXT3-fs: Can't read superblock on 2nd try.\n");
1542 goto failed_mount;
1543 }
1544 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
1545 sbi->s_es = es;
1546 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1547 printk (KERN_ERR
1548 "EXT3-fs: Magic mismatch, very weird !\n");
1549 goto failed_mount;
1550 }
1551 }
1552
1553 sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
1554
1555 if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
1556 sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
1557 sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
1558 } else {
1559 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
1560 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
1561 if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
1562 (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
1563 (sbi->s_inode_size > blocksize)) {
1564 printk (KERN_ERR
1565 "EXT3-fs: unsupported inode size: %d\n",
1566 sbi->s_inode_size);
1567 goto failed_mount;
1568 }
1569 }
1570 sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
1571 le32_to_cpu(es->s_log_frag_size);
1572 if (blocksize != sbi->s_frag_size) {
1573 printk(KERN_ERR
1574 "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
1575 sbi->s_frag_size, blocksize);
1576 goto failed_mount;
1577 }
1578 sbi->s_frags_per_block = 1;
1579 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
1580 sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
1581 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
1582 if (EXT3_INODE_SIZE(sb) == 0)
1583 goto cantfind_ext3;
1584 sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
1585 if (sbi->s_inodes_per_block == 0)
1586 goto cantfind_ext3;
1587 sbi->s_itb_per_group = sbi->s_inodes_per_group /
1588 sbi->s_inodes_per_block;
1589 sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
1590 sbi->s_sbh = bh;
1591 sbi->s_mount_state = le16_to_cpu(es->s_state);
1592 sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
1593 sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
1594 for (i=0; i < 4; i++)
1595 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1596 sbi->s_def_hash_version = es->s_def_hash_version;
1597
1598 if (sbi->s_blocks_per_group > blocksize * 8) {
1599 printk (KERN_ERR
1600 "EXT3-fs: #blocks per group too big: %lu\n",
1601 sbi->s_blocks_per_group);
1602 goto failed_mount;
1603 }
1604 if (sbi->s_frags_per_group > blocksize * 8) {
1605 printk (KERN_ERR
1606 "EXT3-fs: #fragments per group too big: %lu\n",
1607 sbi->s_frags_per_group);
1608 goto failed_mount;
1609 }
1610 if (sbi->s_inodes_per_group > blocksize * 8) {
1611 printk (KERN_ERR
1612 "EXT3-fs: #inodes per group too big: %lu\n",
1613 sbi->s_inodes_per_group);
1614 goto failed_mount;
1615 }
1616
1617 if (le32_to_cpu(es->s_blocks_count) >
1618 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
1619 printk(KERN_ERR "EXT3-fs: filesystem on %s:"
1620 " too large to mount safely\n", sb->s_id);
1621 if (sizeof(sector_t) < 8)
1622 printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
1623 "enabled\n");
1624 goto failed_mount;
1625 }
1626
1627 if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
1628 goto cantfind_ext3;
1629 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1630 le32_to_cpu(es->s_first_data_block) - 1)
1631 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1632 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
1633 EXT3_DESC_PER_BLOCK(sb);
1634 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
1635 GFP_KERNEL);
1636 if (sbi->s_group_desc == NULL) {
1637 printk (KERN_ERR "EXT3-fs: not enough memory\n");
1638 goto failed_mount;
1639 }
1640
1641 bgl_lock_init(&sbi->s_blockgroup_lock);
1642
1643 for (i = 0; i < db_count; i++) {
1644 block = descriptor_loc(sb, logic_sb_block, i);
1645 sbi->s_group_desc[i] = sb_bread(sb, block);
1646 if (!sbi->s_group_desc[i]) {
1647 printk (KERN_ERR "EXT3-fs: "
1648 "can't read group descriptor %d\n", i);
1649 db_count = i;
1650 goto failed_mount2;
1651 }
1652 }
1653 if (!ext3_check_descriptors (sb)) {
1654 printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
1655 goto failed_mount2;
1656 }
1657 sbi->s_gdb_count = db_count;
1658 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
1659 spin_lock_init(&sbi->s_next_gen_lock);
1660
1661 percpu_counter_init(&sbi->s_freeblocks_counter,
1662 ext3_count_free_blocks(sb));
1663 percpu_counter_init(&sbi->s_freeinodes_counter,
1664 ext3_count_free_inodes(sb));
1665 percpu_counter_init(&sbi->s_dirs_counter,
1666 ext3_count_dirs(sb));
1667
1668 /* per fileystem reservation list head & lock */
1669 spin_lock_init(&sbi->s_rsv_window_lock);
1670 sbi->s_rsv_window_root = RB_ROOT;
1671 /* Add a single, static dummy reservation to the start of the
1672 * reservation window list --- it gives us a placeholder for
1673 * append-at-start-of-list which makes the allocation logic
1674 * _much_ simpler. */
1675 sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
1676 sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
1677 sbi->s_rsv_window_head.rsv_alloc_hit = 0;
1678 sbi->s_rsv_window_head.rsv_goal_size = 0;
1679 ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
1680
1681 /*
1682 * set up enough so that it can read an inode
1683 */
1684 sb->s_op = &ext3_sops;
1685 sb->s_export_op = &ext3_export_ops;
1686 sb->s_xattr = ext3_xattr_handlers;
1687#ifdef CONFIG_QUOTA
1688 sb->s_qcop = &ext3_qctl_operations;
1689 sb->dq_op = &ext3_quota_operations;
1690#endif
1691 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1692
1693 sb->s_root = NULL;
1694
1695 needs_recovery = (es->s_last_orphan != 0 ||
1696 EXT3_HAS_INCOMPAT_FEATURE(sb,
1697 EXT3_FEATURE_INCOMPAT_RECOVER));
1698
1699 /*
1700 * The first inode we look at is the journal inode. Don't try
1701 * root first: it may be modified in the journal!
1702 */
1703 if (!test_opt(sb, NOLOAD) &&
1704 EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
1705 if (ext3_load_journal(sb, es, journal_devnum))
1706 goto failed_mount3;
1707 } else if (journal_inum) {
1708 if (ext3_create_journal(sb, es, journal_inum))
1709 goto failed_mount3;
1710 } else {
1711 if (!silent)
1712 printk (KERN_ERR
1713 "ext3: No journal on filesystem on %s\n",
1714 sb->s_id);
1715 goto failed_mount3;
1716 }
1717
1718 /* We have now updated the journal if required, so we can
1719 * validate the data journaling mode. */
1720 switch (test_opt(sb, DATA_FLAGS)) {
1721 case 0:
1722 /* No mode set, assume a default based on the journal
1723 capabilities: ORDERED_DATA if the journal can
1724 cope, else JOURNAL_DATA */
1725 if (journal_check_available_features
1726 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
1727 set_opt(sbi->s_mount_opt, ORDERED_DATA);
1728 else
1729 set_opt(sbi->s_mount_opt, JOURNAL_DATA);
1730 break;
1731
1732 case EXT3_MOUNT_ORDERED_DATA:
1733 case EXT3_MOUNT_WRITEBACK_DATA:
1734 if (!journal_check_available_features
1735 (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
1736 printk(KERN_ERR "EXT3-fs: Journal does not support "
1737 "requested data journaling mode\n");
1738 goto failed_mount4;
1739 }
1740 default:
1741 break;
1742 }
1743
1744 if (test_opt(sb, NOBH)) {
1745 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
1746 printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
1747 "its supported only with writeback mode\n");
1748 clear_opt(sbi->s_mount_opt, NOBH);
1749 }
1750 }
1751 /*
1752 * The journal_load will have done any necessary log recovery,
1753 * so we can safely mount the rest of the filesystem now.
1754 */
1755
1756 root = iget(sb, EXT3_ROOT_INO);
1757 sb->s_root = d_alloc_root(root);
1758 if (!sb->s_root) {
1759 printk(KERN_ERR "EXT3-fs: get root inode failed\n");
1760 iput(root);
1761 goto failed_mount4;
1762 }
1763 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1764 dput(sb->s_root);
1765 sb->s_root = NULL;
1766 printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
1767 goto failed_mount4;
1768 }
1769
1770 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
1771 /*
1772 * akpm: core read_super() calls in here with the superblock locked.
1773 * That deadlocks, because orphan cleanup needs to lock the superblock
1774 * in numerous places. Here we just pop the lock - it's relatively
1775 * harmless, because we are now ready to accept write_super() requests,
1776 * and aviro says that's the only reason for hanging onto the
1777 * superblock lock.
1778 */
1779 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
1780 ext3_orphan_cleanup(sb, es);
1781 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
1782 if (needs_recovery)
1783 printk (KERN_INFO "EXT3-fs: recovery complete.\n");
1784 ext3_mark_recovery_complete(sb, es);
1785 printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
1786 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
1787 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
1788 "writeback");
1789
1790 lock_kernel();
1791 return 0;
1792
1793cantfind_ext3:
1794 if (!silent)
1795 printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
1796 sb->s_id);
1797 goto failed_mount;
1798
1799failed_mount4:
1800 journal_destroy(sbi->s_journal);
1801failed_mount3:
1802 percpu_counter_destroy(&sbi->s_freeblocks_counter);
1803 percpu_counter_destroy(&sbi->s_freeinodes_counter);
1804 percpu_counter_destroy(&sbi->s_dirs_counter);
1805failed_mount2:
1806 for (i = 0; i < db_count; i++)
1807 brelse(sbi->s_group_desc[i]);
1808 kfree(sbi->s_group_desc);
1809failed_mount:
1810#ifdef CONFIG_QUOTA
1811 for (i = 0; i < MAXQUOTAS; i++)
1812 kfree(sbi->s_qf_names[i]);
1813#endif
1814 ext3_blkdev_remove(sbi);
1815 brelse(bh);
1816out_fail:
1817 sb->s_fs_info = NULL;
1818 kfree(sbi);
1819 lock_kernel();
1820 return -EINVAL;
1821}
1822
1823/*
1824 * Setup any per-fs journal parameters now. We'll do this both on
1825 * initial mount, once the journal has been initialised but before we've
1826 * done any recovery; and again on any subsequent remount.
1827 */
1828static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
1829{
1830 struct ext3_sb_info *sbi = EXT3_SB(sb);
1831
1832 if (sbi->s_commit_interval)
1833 journal->j_commit_interval = sbi->s_commit_interval;
1834 /* We could also set up an ext3-specific default for the commit
1835 * interval here, but for now we'll just fall back to the jbd
1836 * default. */
1837
1838 spin_lock(&journal->j_state_lock);
1839 if (test_opt(sb, BARRIER))
1840 journal->j_flags |= JFS_BARRIER;
1841 else
1842 journal->j_flags &= ~JFS_BARRIER;
1843 spin_unlock(&journal->j_state_lock);
1844}
1845
1846static journal_t *ext3_get_journal(struct super_block *sb,
1847 unsigned int journal_inum)
1848{
1849 struct inode *journal_inode;
1850 journal_t *journal;
1851
1852 /* First, test for the existence of a valid inode on disk. Bad
1853 * things happen if we iget() an unused inode, as the subsequent
1854 * iput() will try to delete it. */
1855
1856 journal_inode = iget(sb, journal_inum);
1857 if (!journal_inode) {
1858 printk(KERN_ERR "EXT3-fs: no journal found.\n");
1859 return NULL;
1860 }
1861 if (!journal_inode->i_nlink) {
1862 make_bad_inode(journal_inode);
1863 iput(journal_inode);
1864 printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
1865 return NULL;
1866 }
1867
1868 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
1869 journal_inode, journal_inode->i_size);
1870 if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
1871 printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
1872 iput(journal_inode);
1873 return NULL;
1874 }
1875
1876 journal = journal_init_inode(journal_inode);
1877 if (!journal) {
1878 printk(KERN_ERR "EXT3-fs: Could not load journal inode\n");
1879 iput(journal_inode);
1880 return NULL;
1881 }
1882 journal->j_private = sb;
1883 ext3_init_journal_params(sb, journal);
1884 return journal;
1885}
1886
1887static journal_t *ext3_get_dev_journal(struct super_block *sb,
1888 dev_t j_dev)
1889{
1890 struct buffer_head * bh;
1891 journal_t *journal;
1892 ext3_fsblk_t start;
1893 ext3_fsblk_t len;
1894 int hblock, blocksize;
1895 ext3_fsblk_t sb_block;
1896 unsigned long offset;
1897 struct ext3_super_block * es;
1898 struct block_device *bdev;
1899
1900 bdev = ext3_blkdev_get(j_dev);
1901 if (bdev == NULL)
1902 return NULL;
1903
1904 if (bd_claim(bdev, sb)) {
1905 printk(KERN_ERR
1906 "EXT3: failed to claim external journal device.\n");
1907 blkdev_put(bdev);
1908 return NULL;
1909 }
1910
1911 blocksize = sb->s_blocksize;
1912 hblock = bdev_hardsect_size(bdev);
1913 if (blocksize < hblock) {
1914 printk(KERN_ERR
1915 "EXT3-fs: blocksize too small for journal device.\n");
1916 goto out_bdev;
1917 }
1918
1919 sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
1920 offset = EXT3_MIN_BLOCK_SIZE % blocksize;
1921 set_blocksize(bdev, blocksize);
1922 if (!(bh = __bread(bdev, sb_block, blocksize))) {
1923 printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
1924 "external journal\n");
1925 goto out_bdev;
1926 }
1927
1928 es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
1929 if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
1930 !(le32_to_cpu(es->s_feature_incompat) &
1931 EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
1932 printk(KERN_ERR "EXT3-fs: external journal has "
1933 "bad superblock\n");
1934 brelse(bh);
1935 goto out_bdev;
1936 }
1937
1938 if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
1939 printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
1940 brelse(bh);
1941 goto out_bdev;
1942 }
1943
1944 len = le32_to_cpu(es->s_blocks_count);
1945 start = sb_block + 1;
1946 brelse(bh); /* we're done with the superblock */
1947
1948 journal = journal_init_dev(bdev, sb->s_bdev,
1949 start, len, blocksize);
1950 if (!journal) {
1951 printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
1952 goto out_bdev;
1953 }
1954 journal->j_private = sb;
1955 ll_rw_block(READ, 1, &journal->j_sb_buffer);
1956 wait_on_buffer(journal->j_sb_buffer);
1957 if (!buffer_uptodate(journal->j_sb_buffer)) {
1958 printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
1959 goto out_journal;
1960 }
1961 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
1962 printk(KERN_ERR "EXT3-fs: External journal has more than one "
1963 "user (unsupported) - %d\n",
1964 be32_to_cpu(journal->j_superblock->s_nr_users));
1965 goto out_journal;
1966 }
1967 EXT3_SB(sb)->journal_bdev = bdev;
1968 ext3_init_journal_params(sb, journal);
1969 return journal;
1970out_journal:
1971 journal_destroy(journal);
1972out_bdev:
1973 ext3_blkdev_put(bdev);
1974 return NULL;
1975}
1976
1977static int ext3_load_journal(struct super_block *sb,
1978 struct ext3_super_block *es,
1979 unsigned long journal_devnum)
1980{
1981 journal_t *journal;
1982 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
1983 dev_t journal_dev;
1984 int err = 0;
1985 int really_read_only;
1986
1987 if (journal_devnum &&
1988 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
1989 printk(KERN_INFO "EXT3-fs: external journal device major/minor "
1990 "numbers have changed\n");
1991 journal_dev = new_decode_dev(journal_devnum);
1992 } else
1993 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
1994
1995 really_read_only = bdev_read_only(sb->s_bdev);
1996
1997 /*
1998 * Are we loading a blank journal or performing recovery after a
1999 * crash? For recovery, we need to check in advance whether we
2000 * can get read-write access to the device.
2001 */
2002
2003 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
2004 if (sb->s_flags & MS_RDONLY) {
2005 printk(KERN_INFO "EXT3-fs: INFO: recovery "
2006 "required on readonly filesystem.\n");
2007 if (really_read_only) {
2008 printk(KERN_ERR "EXT3-fs: write access "
2009 "unavailable, cannot proceed.\n");
2010 return -EROFS;
2011 }
2012 printk (KERN_INFO "EXT3-fs: write access will "
2013 "be enabled during recovery.\n");
2014 }
2015 }
2016
2017 if (journal_inum && journal_dev) {
2018 printk(KERN_ERR "EXT3-fs: filesystem has both journal "
2019 "and inode journals!\n");
2020 return -EINVAL;
2021 }
2022
2023 if (journal_inum) {
2024 if (!(journal = ext3_get_journal(sb, journal_inum)))
2025 return -EINVAL;
2026 } else {
2027 if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
2028 return -EINVAL;
2029 }
2030
2031 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2032 err = journal_update_format(journal);
2033 if (err) {
2034 printk(KERN_ERR "EXT3-fs: error updating journal.\n");
2035 journal_destroy(journal);
2036 return err;
2037 }
2038 }
2039
2040 if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
2041 err = journal_wipe(journal, !really_read_only);
2042 if (!err)
2043 err = journal_load(journal);
2044
2045 if (err) {
2046 printk(KERN_ERR "EXT3-fs: error loading journal.\n");
2047 journal_destroy(journal);
2048 return err;
2049 }
2050
2051 EXT3_SB(sb)->s_journal = journal;
2052 ext3_clear_journal_err(sb, es);
2053
2054 if (journal_devnum &&
2055 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2056 es->s_journal_dev = cpu_to_le32(journal_devnum);
2057 sb->s_dirt = 1;
2058
2059 /* Make sure we flush the recovery flag to disk. */
2060 ext3_commit_super(sb, es, 1);
2061 }
2062
2063 return 0;
2064}
2065
2066static int ext3_create_journal(struct super_block * sb,
2067 struct ext3_super_block * es,
2068 unsigned int journal_inum)
2069{
2070 journal_t *journal;
2071
2072 if (sb->s_flags & MS_RDONLY) {
2073 printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
2074 "create journal.\n");
2075 return -EROFS;
2076 }
2077
2078 if (!(journal = ext3_get_journal(sb, journal_inum)))
2079 return -EINVAL;
2080
2081 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
2082 journal_inum);
2083
2084 if (journal_create(journal)) {
2085 printk(KERN_ERR "EXT3-fs: error creating journal.\n");
2086 journal_destroy(journal);
2087 return -EIO;
2088 }
2089
2090 EXT3_SB(sb)->s_journal = journal;
2091
2092 ext3_update_dynamic_rev(sb);
2093 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2094 EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
2095
2096 es->s_journal_inum = cpu_to_le32(journal_inum);
2097 sb->s_dirt = 1;
2098
2099 /* Make sure we flush the recovery flag to disk. */
2100 ext3_commit_super(sb, es, 1);
2101
2102 return 0;
2103}
2104
2105static void ext3_commit_super (struct super_block * sb,
2106 struct ext3_super_block * es,
2107 int sync)
2108{
2109 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
2110
2111 if (!sbh)
2112 return;
2113 es->s_wtime = cpu_to_le32(get_seconds());
2114 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2115 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2116 BUFFER_TRACE(sbh, "marking dirty");
2117 mark_buffer_dirty(sbh);
2118 if (sync)
2119 sync_dirty_buffer(sbh);
2120}
2121
2122
2123/*
2124 * Have we just finished recovery? If so, and if we are mounting (or
2125 * remounting) the filesystem readonly, then we will end up with a
2126 * consistent fs on disk. Record that fact.
2127 */
2128static void ext3_mark_recovery_complete(struct super_block * sb,
2129 struct ext3_super_block * es)
2130{
2131 journal_t *journal = EXT3_SB(sb)->s_journal;
2132
2133 journal_lock_updates(journal);
2134 journal_flush(journal);
2135 if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
2136 sb->s_flags & MS_RDONLY) {
2137 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2138 sb->s_dirt = 0;
2139 ext3_commit_super(sb, es, 1);
2140 }
2141 journal_unlock_updates(journal);
2142}
2143
2144/*
2145 * If we are mounting (or read-write remounting) a filesystem whose journal
2146 * has recorded an error from a previous lifetime, move that error to the
2147 * main filesystem now.
2148 */
2149static void ext3_clear_journal_err(struct super_block * sb,
2150 struct ext3_super_block * es)
2151{
2152 journal_t *journal;
2153 int j_errno;
2154 const char *errstr;
2155
2156 journal = EXT3_SB(sb)->s_journal;
2157
2158 /*
2159 * Now check for any error status which may have been recorded in the
2160 * journal by a prior ext3_error() or ext3_abort()
2161 */
2162
2163 j_errno = journal_errno(journal);
2164 if (j_errno) {
2165 char nbuf[16];
2166
2167 errstr = ext3_decode_error(sb, j_errno, nbuf);
2168 ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
2169 "from previous mount: %s", errstr);
2170 ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
2171 "filesystem check.");
2172
2173 EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
2174 es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
2175 ext3_commit_super (sb, es, 1);
2176
2177 journal_clear_err(journal);
2178 }
2179}
2180
2181/*
2182 * Force the running and committing transactions to commit,
2183 * and wait on the commit.
2184 */
2185int ext3_force_commit(struct super_block *sb)
2186{
2187 journal_t *journal;
2188 int ret;
2189
2190 if (sb->s_flags & MS_RDONLY)
2191 return 0;
2192
2193 journal = EXT3_SB(sb)->s_journal;
2194 sb->s_dirt = 0;
2195 ret = ext3_journal_force_commit(journal);
2196 return ret;
2197}
2198
2199/*
2200 * Ext3 always journals updates to the superblock itself, so we don't
2201 * have to propagate any other updates to the superblock on disk at this
2202 * point. Just start an async writeback to get the buffers on their way
2203 * to the disk.
2204 *
2205 * This implicitly triggers the writebehind on sync().
2206 */
2207
2208static void ext3_write_super (struct super_block * sb)
2209{
2210 if (mutex_trylock(&sb->s_lock) != 0)
2211 BUG();
2212 sb->s_dirt = 0;
2213}
2214
2215static int ext3_sync_fs(struct super_block *sb, int wait)
2216{
2217 tid_t target;
2218
2219 sb->s_dirt = 0;
2220 if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
2221 if (wait)
2222 log_wait_commit(EXT3_SB(sb)->s_journal, target);
2223 }
2224 return 0;
2225}
2226
2227/*
2228 * LVM calls this function before a (read-only) snapshot is created. This
2229 * gives us a chance to flush the journal completely and mark the fs clean.
2230 */
2231static void ext3_write_super_lockfs(struct super_block *sb)
2232{
2233 sb->s_dirt = 0;
2234
2235 if (!(sb->s_flags & MS_RDONLY)) {
2236 journal_t *journal = EXT3_SB(sb)->s_journal;
2237
2238 /* Now we set up the journal barrier. */
2239 journal_lock_updates(journal);
2240 journal_flush(journal);
2241
2242 /* Journal blocked and flushed, clear needs_recovery flag. */
2243 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2244 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2245 }
2246}
2247
2248/*
2249 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2250 * flag here, even though the filesystem is not technically dirty yet.
2251 */
2252static void ext3_unlockfs(struct super_block *sb)
2253{
2254 if (!(sb->s_flags & MS_RDONLY)) {
2255 lock_super(sb);
2256 /* Reser the needs_recovery flag before the fs is unlocked. */
2257 EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2258 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2259 unlock_super(sb);
2260 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2261 }
2262}
2263
2264static int ext3_remount (struct super_block * sb, int * flags, char * data)
2265{
2266 struct ext3_super_block * es;
2267 struct ext3_sb_info *sbi = EXT3_SB(sb);
2268 ext3_fsblk_t n_blocks_count = 0;
2269 unsigned long old_sb_flags;
2270 struct ext3_mount_options old_opts;
2271 int err;
2272#ifdef CONFIG_QUOTA
2273 int i;
2274#endif
2275
2276 /* Store the original options */
2277 old_sb_flags = sb->s_flags;
2278 old_opts.s_mount_opt = sbi->s_mount_opt;
2279 old_opts.s_resuid = sbi->s_resuid;
2280 old_opts.s_resgid = sbi->s_resgid;
2281 old_opts.s_commit_interval = sbi->s_commit_interval;
2282#ifdef CONFIG_QUOTA
2283 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2284 for (i = 0; i < MAXQUOTAS; i++)
2285 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2286#endif
2287
2288 /*
2289 * Allow the "check" option to be passed as a remount option.
2290 */
2291 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
2292 err = -EINVAL;
2293 goto restore_opts;
2294 }
2295
2296 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
2297 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
2298
2299 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
2300 ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
2301
2302 es = sbi->s_es;
2303
2304 ext3_init_journal_params(sb, sbi->s_journal);
2305
2306 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2307 n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
2308 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
2309 err = -EROFS;
2310 goto restore_opts;
2311 }
2312
2313 if (*flags & MS_RDONLY) {
2314 /*
2315 * First of all, the unconditional stuff we have to do
2316 * to disable replay of the journal when we next remount
2317 */
2318 sb->s_flags |= MS_RDONLY;
2319
2320 /*
2321 * OK, test if we are remounting a valid rw partition
2322 * readonly, and if so set the rdonly flag and then
2323 * mark the partition as valid again.
2324 */
2325 if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
2326 (sbi->s_mount_state & EXT3_VALID_FS))
2327 es->s_state = cpu_to_le16(sbi->s_mount_state);
2328
2329 ext3_mark_recovery_complete(sb, es);
2330 } else {
2331 __le32 ret;
2332 if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
2333 ~EXT3_FEATURE_RO_COMPAT_SUPP))) {
2334 printk(KERN_WARNING "EXT3-fs: %s: couldn't "
2335 "remount RDWR because of unsupported "
2336 "optional features (%x).\n",
2337 sb->s_id, le32_to_cpu(ret));
2338 err = -EROFS;
2339 goto restore_opts;
2340 }
2341 /*
2342 * Mounting a RDONLY partition read-write, so reread
2343 * and store the current valid flag. (It may have
2344 * been changed by e2fsck since we originally mounted
2345 * the partition.)
2346 */
2347 ext3_clear_journal_err(sb, es);
2348 sbi->s_mount_state = le16_to_cpu(es->s_state);
2349 if ((err = ext3_group_extend(sb, es, n_blocks_count)))
2350 goto restore_opts;
2351 if (!ext3_setup_super (sb, es, 0))
2352 sb->s_flags &= ~MS_RDONLY;
2353 }
2354 }
2355#ifdef CONFIG_QUOTA
2356 /* Release old quota file names */
2357 for (i = 0; i < MAXQUOTAS; i++)
2358 if (old_opts.s_qf_names[i] &&
2359 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2360 kfree(old_opts.s_qf_names[i]);
2361#endif
2362 return 0;
2363restore_opts:
2364 sb->s_flags = old_sb_flags;
2365 sbi->s_mount_opt = old_opts.s_mount_opt;
2366 sbi->s_resuid = old_opts.s_resuid;
2367 sbi->s_resgid = old_opts.s_resgid;
2368 sbi->s_commit_interval = old_opts.s_commit_interval;
2369#ifdef CONFIG_QUOTA
2370 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2371 for (i = 0; i < MAXQUOTAS; i++) {
2372 if (sbi->s_qf_names[i] &&
2373 old_opts.s_qf_names[i] != sbi->s_qf_names[i])
2374 kfree(sbi->s_qf_names[i]);
2375 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2376 }
2377#endif
2378 return err;
2379}
2380
2381static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2382{
2383 struct super_block *sb = dentry->d_sb;
2384 struct ext3_sb_info *sbi = EXT3_SB(sb);
2385 struct ext3_super_block *es = sbi->s_es;
2386 ext3_fsblk_t overhead;
2387 int i;
2388
2389 if (test_opt (sb, MINIX_DF))
2390 overhead = 0;
2391 else {
2392 unsigned long ngroups;
2393 ngroups = EXT3_SB(sb)->s_groups_count;
2394 smp_rmb();
2395
2396 /*
2397 * Compute the overhead (FS structures)
2398 */
2399
2400 /*
2401 * All of the blocks before first_data_block are
2402 * overhead
2403 */
2404 overhead = le32_to_cpu(es->s_first_data_block);
2405
2406 /*
2407 * Add the overhead attributed to the superblock and
2408 * block group descriptors. If the sparse superblocks
2409 * feature is turned on, then not all groups have this.
2410 */
2411 for (i = 0; i < ngroups; i++) {
2412 overhead += ext3_bg_has_super(sb, i) +
2413 ext3_bg_num_gdb(sb, i);
2414 cond_resched();
2415 }
2416
2417 /*
2418 * Every block group has an inode bitmap, a block
2419 * bitmap, and an inode table.
2420 */
2421 overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group));
2422 }
2423
2424 buf->f_type = EXT3_SUPER_MAGIC;
2425 buf->f_bsize = sb->s_blocksize;
2426 buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
2427 buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
2428 buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
2429 if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
2430 buf->f_bavail = 0;
2431 buf->f_files = le32_to_cpu(es->s_inodes_count);
2432 buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
2433 buf->f_namelen = EXT3_NAME_LEN;
2434 return 0;
2435}
2436
2437/* Helper function for writing quotas on sync - we need to start transaction before quota file
2438 * is locked for write. Otherwise there are possible deadlocks:
2439 * Process 1 Process 2
2440 * ext3_create() quota_sync()
2441 * journal_start() write_dquot()
2442 * DQUOT_INIT() down(dqio_mutex)
2443 * down(dqio_mutex) journal_start()
2444 *
2445 */
2446
2447#ifdef CONFIG_QUOTA
2448
2449static inline struct inode *dquot_to_inode(struct dquot *dquot)
2450{
2451 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
2452}
2453
2454static int ext3_dquot_initialize(struct inode *inode, int type)
2455{
2456 handle_t *handle;
2457 int ret, err;
2458
2459 /* We may create quota structure so we need to reserve enough blocks */
2460 handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb));
2461 if (IS_ERR(handle))
2462 return PTR_ERR(handle);
2463 ret = dquot_initialize(inode, type);
2464 err = ext3_journal_stop(handle);
2465 if (!ret)
2466 ret = err;
2467 return ret;
2468}
2469
2470static int ext3_dquot_drop(struct inode *inode)
2471{
2472 handle_t *handle;
2473 int ret, err;
2474
2475 /* We may delete quota structure so we need to reserve enough blocks */
2476 handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb));
2477 if (IS_ERR(handle))
2478 return PTR_ERR(handle);
2479 ret = dquot_drop(inode);
2480 err = ext3_journal_stop(handle);
2481 if (!ret)
2482 ret = err;
2483 return ret;
2484}
2485
2486static int ext3_write_dquot(struct dquot *dquot)
2487{
2488 int ret, err;
2489 handle_t *handle;
2490 struct inode *inode;
2491
2492 inode = dquot_to_inode(dquot);
2493 handle = ext3_journal_start(inode,
2494 EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
2495 if (IS_ERR(handle))
2496 return PTR_ERR(handle);
2497 ret = dquot_commit(dquot);
2498 err = ext3_journal_stop(handle);
2499 if (!ret)
2500 ret = err;
2501 return ret;
2502}
2503
2504static int ext3_acquire_dquot(struct dquot *dquot)
2505{
2506 int ret, err;
2507 handle_t *handle;
2508
2509 handle = ext3_journal_start(dquot_to_inode(dquot),
2510 EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb));
2511 if (IS_ERR(handle))
2512 return PTR_ERR(handle);
2513 ret = dquot_acquire(dquot);
2514 err = ext3_journal_stop(handle);
2515 if (!ret)
2516 ret = err;
2517 return ret;
2518}
2519
2520static int ext3_release_dquot(struct dquot *dquot)
2521{
2522 int ret, err;
2523 handle_t *handle;
2524
2525 handle = ext3_journal_start(dquot_to_inode(dquot),
2526 EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
2527 if (IS_ERR(handle))
2528 return PTR_ERR(handle);
2529 ret = dquot_release(dquot);
2530 err = ext3_journal_stop(handle);
2531 if (!ret)
2532 ret = err;
2533 return ret;
2534}
2535
2536static int ext3_mark_dquot_dirty(struct dquot *dquot)
2537{
2538 /* Are we journalling quotas? */
2539 if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
2540 EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
2541 dquot_mark_dquot_dirty(dquot);
2542 return ext3_write_dquot(dquot);
2543 } else {
2544 return dquot_mark_dquot_dirty(dquot);
2545 }
2546}
2547
2548static int ext3_write_info(struct super_block *sb, int type)
2549{
2550 int ret, err;
2551 handle_t *handle;
2552
2553 /* Data block + inode block */
2554 handle = ext3_journal_start(sb->s_root->d_inode, 2);
2555 if (IS_ERR(handle))
2556 return PTR_ERR(handle);
2557 ret = dquot_commit_info(sb, type);
2558 err = ext3_journal_stop(handle);
2559 if (!ret)
2560 ret = err;
2561 return ret;
2562}
2563
2564/*
2565 * Turn on quotas during mount time - we need to find
2566 * the quota file and such...
2567 */
2568static int ext3_quota_on_mount(struct super_block *sb, int type)
2569{
2570 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2571 EXT3_SB(sb)->s_jquota_fmt, type);
2572}
2573
2574/*
2575 * Standard function to be called on quota_on
2576 */
2577static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2578 char *path)
2579{
2580 int err;
2581 struct nameidata nd;
2582
2583 if (!test_opt(sb, QUOTA))
2584 return -EINVAL;
2585 /* Not journalling quota? */
2586 if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] &&
2587 !EXT3_SB(sb)->s_qf_names[GRPQUOTA])
2588 return vfs_quota_on(sb, type, format_id, path);
2589 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
2590 if (err)
2591 return err;
2592 /* Quotafile not on the same filesystem? */
2593 if (nd.mnt->mnt_sb != sb) {
2594 path_release(&nd);
2595 return -EXDEV;
2596 }
2597 /* Quotafile not of fs root? */
2598 if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
2599 printk(KERN_WARNING
2600 "EXT3-fs: Quota file not on filesystem root. "
2601 "Journalled quota will not work.\n");
2602 path_release(&nd);
2603 return vfs_quota_on(sb, type, format_id, path);
2604}
2605
2606/* Read data from quotafile - avoid pagecache and such because we cannot afford
2607 * acquiring the locks... As quota files are never truncated and quota code
2608 * itself serializes the operations (and noone else should touch the files)
2609 * we don't have to be afraid of races */
2610static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
2611 size_t len, loff_t off)
2612{
2613 struct inode *inode = sb_dqopt(sb)->files[type];
2614 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
2615 int err = 0;
2616 int offset = off & (sb->s_blocksize - 1);
2617 int tocopy;
2618 size_t toread;
2619 struct buffer_head *bh;
2620 loff_t i_size = i_size_read(inode);
2621
2622 if (off > i_size)
2623 return 0;
2624 if (off+len > i_size)
2625 len = i_size-off;
2626 toread = len;
2627 while (toread > 0) {
2628 tocopy = sb->s_blocksize - offset < toread ?
2629 sb->s_blocksize - offset : toread;
2630 bh = ext3_bread(NULL, inode, blk, 0, &err);
2631 if (err)
2632 return err;
2633 if (!bh) /* A hole? */
2634 memset(data, 0, tocopy);
2635 else
2636 memcpy(data, bh->b_data+offset, tocopy);
2637 brelse(bh);
2638 offset = 0;
2639 toread -= tocopy;
2640 data += tocopy;
2641 blk++;
2642 }
2643 return len;
2644}
2645
2646/* Write to quotafile (we know the transaction is already started and has
2647 * enough credits) */
2648static ssize_t ext3_quota_write(struct super_block *sb, int type,
2649 const char *data, size_t len, loff_t off)
2650{
2651 struct inode *inode = sb_dqopt(sb)->files[type];
2652 sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb);
2653 int err = 0;
2654 int offset = off & (sb->s_blocksize - 1);
2655 int tocopy;
2656 int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL;
2657 size_t towrite = len;
2658 struct buffer_head *bh;
2659 handle_t *handle = journal_current_handle();
2660
2661 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
2662 while (towrite > 0) {
2663 tocopy = sb->s_blocksize - offset < towrite ?
2664 sb->s_blocksize - offset : towrite;
2665 bh = ext3_bread(handle, inode, blk, 1, &err);
2666 if (!bh)
2667 goto out;
2668 if (journal_quota) {
2669 err = ext3_journal_get_write_access(handle, bh);
2670 if (err) {
2671 brelse(bh);
2672 goto out;
2673 }
2674 }
2675 lock_buffer(bh);
2676 memcpy(bh->b_data+offset, data, tocopy);
2677 flush_dcache_page(bh->b_page);
2678 unlock_buffer(bh);
2679 if (journal_quota)
2680 err = ext3_journal_dirty_metadata(handle, bh);
2681 else {
2682 /* Always do at least ordered writes for quotas */
2683 err = ext3_journal_dirty_data(handle, bh);
2684 mark_buffer_dirty(bh);
2685 }
2686 brelse(bh);
2687 if (err)
2688 goto out;
2689 offset = 0;
2690 towrite -= tocopy;
2691 data += tocopy;
2692 blk++;
2693 }
2694out:
2695 if (len == towrite)
2696 return err;
2697 if (inode->i_size < off+len-towrite) {
2698 i_size_write(inode, off+len-towrite);
2699 EXT3_I(inode)->i_disksize = inode->i_size;
2700 }
2701 inode->i_version++;
2702 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2703 ext3_mark_inode_dirty(handle, inode);
2704 mutex_unlock(&inode->i_mutex);
2705 return len - towrite;
2706}
2707
2708#endif
2709
2710static int ext3_get_sb(struct file_system_type *fs_type,
2711 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2712{
2713 return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
2714}
2715
2716static struct file_system_type ext3_fs_type = {
2717 .owner = THIS_MODULE,
2718 .name = "ext3",
2719 .get_sb = ext3_get_sb,
2720 .kill_sb = kill_block_super,
2721 .fs_flags = FS_REQUIRES_DEV,
2722};
2723
2724static int __init init_ext3_fs(void)
2725{
2726 int err = init_ext3_xattr();
2727 if (err)
2728 return err;
2729 err = init_inodecache();
2730 if (err)
2731 goto out1;
2732 err = register_filesystem(&ext3_fs_type);
2733 if (err)
2734 goto out;
2735 return 0;
2736out:
2737 destroy_inodecache();
2738out1:
2739 exit_ext3_xattr();
2740 return err;
2741}
2742
2743static void __exit exit_ext3_fs(void)
2744{
2745 unregister_filesystem(&ext3_fs_type);
2746 destroy_inodecache();
2747 exit_ext3_xattr();
2748}
2749
2750MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
2751MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
2752MODULE_LICENSE("GPL");
2753module_init(init_ext3_fs)
2754module_exit(exit_ext3_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 000000000000..4f79122cde67
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,54 @@
1/*
2 * linux/fs/ext3/symlink.c
3 *
4 * Only fast symlinks left here - the rest is done by generic code. AV, 1999
5 *
6 * Copyright (C) 1992, 1993, 1994, 1995
7 * Remy Card (card@masi.ibp.fr)
8 * Laboratoire MASI - Institut Blaise Pascal
9 * Universite Pierre et Marie Curie (Paris VI)
10 *
11 * from
12 *
13 * linux/fs/minix/symlink.c
14 *
15 * Copyright (C) 1991, 1992 Linus Torvalds
16 *
17 * ext3 symlink handling code
18 */
19
20#include <linux/fs.h>
21#include <linux/jbd.h>
22#include <linux/ext3_fs.h>
23#include <linux/namei.h>
24#include "xattr.h"
25
26static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
27{
28 struct ext3_inode_info *ei = EXT3_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data);
30 return NULL;
31}
32
33struct inode_operations ext3_symlink_inode_operations = {
34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link,
37#ifdef CONFIG_EXT3_FS_XATTR
38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr,
40 .listxattr = ext3_listxattr,
41 .removexattr = generic_removexattr,
42#endif
43};
44
45struct inode_operations ext3_fast_symlink_inode_operations = {
46 .readlink = generic_readlink,
47 .follow_link = ext3_follow_link,
48#ifdef CONFIG_EXT3_FS_XATTR
49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr,
51 .listxattr = ext3_listxattr,
52 .removexattr = generic_removexattr,
53#endif
54};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 000000000000..f86f2482f01d
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1317 @@
1/*
2 * linux/fs/ext3/xattr.c
3 *
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 *
6 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
7 * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
8 * Extended attributes for symlinks and special files added per
9 * suggestion of Luka Renko <luka.renko@hermes.si>.
10 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
11 * Red Hat Inc.
12 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
13 * and Andreas Gruenbacher <agruen@suse.de>.
14 */
15
16/*
17 * Extended attributes are stored directly in inodes (on file systems with
18 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
19 * field contains the block number if an inode uses an additional block. All
20 * attributes must fit in the inode and one additional block. Blocks that
21 * contain the identical set of attributes may be shared among several inodes.
22 * Identical blocks are detected by keeping a cache of blocks that have
23 * recently been accessed.
24 *
25 * The attributes in inodes and on blocks have a different header; the entries
26 * are stored in the same format:
27 *
28 * +------------------+
29 * | header |
30 * | entry 1 | |
31 * | entry 2 | | growing downwards
32 * | entry 3 | v
33 * | four null bytes |
34 * | . . . |
35 * | value 1 | ^
36 * | value 3 | | growing upwards
37 * | value 2 | |
38 * +------------------+
39 *
40 * The header is followed by multiple entry descriptors. In disk blocks, the
41 * entry descriptors are kept sorted. In inodes, they are unsorted. The
42 * attribute values are aligned to the end of the block in no specific order.
43 *
44 * Locking strategy
45 * ----------------
46 * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
47 * EA blocks are only changed if they are exclusive to an inode, so
48 * holding xattr_sem also means that nothing but the EA block's reference
49 * count can change. Multiple writers to the same block are synchronized
50 * by the buffer lock.
51 */
52
53#include <linux/init.h>
54#include <linux/fs.h>
55#include <linux/slab.h>
56#include <linux/ext3_jbd.h>
57#include <linux/ext3_fs.h>
58#include <linux/mbcache.h>
59#include <linux/quotaops.h>
60#include <linux/rwsem.h>
61#include "xattr.h"
62#include "acl.h"
63
/* View the start of a buffer's data as an xattr block header. */
#define BHDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
/* View an arbitrary pointer as an xattr entry. */
#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
/* First entry of an xattr block: immediately after the block header. */
#define BFIRST(bh) ENTRY(BHDR(bh)+1)
/* The entry list is terminated by a 32-bit zero word. */
#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)

/*
 * In-inode xattr header: located after the good-old inode size plus the
 * inode's i_extra_isize bytes.
 */
#define IHDR(inode, raw_inode) \
	((struct ext3_xattr_ibody_header *) \
		((void *)raw_inode + \
		 EXT3_GOOD_OLD_INODE_SIZE + \
		 EXT3_I(inode)->i_extra_isize))
/* First in-inode entry: immediately after the in-inode header. */
#define IFIRST(hdr) ((struct ext3_xattr_entry *)((hdr)+1))

/*
 * Debug printing helpers; they expand to nothing unless EXT3_XATTR_DEBUG
 * is defined.
 */
#ifdef EXT3_XATTR_DEBUG
# define ea_idebug(inode, f...) do { \
		printk(KERN_DEBUG "inode %s:%lu: ", \
			inode->i_sb->s_id, inode->i_ino); \
		printk(f); \
		printk("\n"); \
	} while (0)
# define ea_bdebug(bh, f...) do { \
		char b[BDEVNAME_SIZE]; \
		printk(KERN_DEBUG "block %s:%lu: ", \
			bdevname(bh->b_bdev, b), \
			(unsigned long) bh->b_blocknr); \
		printk(f); \
		printk("\n"); \
	} while (0)
#else
# define ea_idebug(f...)
# define ea_bdebug(f...)
#endif
95
/* Forward declarations of the xattr block cache helpers. */
static void ext3_xattr_cache_insert(struct buffer_head *);
static struct buffer_head *ext3_xattr_cache_find(struct inode *,
						 struct ext3_xattr_header *,
						 struct mb_cache_entry **);
static void ext3_xattr_rehash(struct ext3_xattr_header *,
			      struct ext3_xattr_entry *);

/* mbcache instance used for xattr blocks. */
static struct mb_cache *ext3_xattr_cache;
104
105static struct xattr_handler *ext3_xattr_handler_map[] = {
106 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
107#ifdef CONFIG_EXT3_FS_POSIX_ACL
108 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
109 [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
110#endif
111 [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
112#ifdef CONFIG_EXT3_FS_SECURITY
113 [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler,
114#endif
115};
116
117struct xattr_handler *ext3_xattr_handlers[] = {
118 &ext3_xattr_user_handler,
119 &ext3_xattr_trusted_handler,
120#ifdef CONFIG_EXT3_FS_POSIX_ACL
121 &ext3_xattr_acl_access_handler,
122 &ext3_xattr_acl_default_handler,
123#endif
124#ifdef CONFIG_EXT3_FS_SECURITY
125 &ext3_xattr_security_handler,
126#endif
127 NULL
128};
129
130static inline struct xattr_handler *
131ext3_xattr_handler(int name_index)
132{
133 struct xattr_handler *handler = NULL;
134
135 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
136 handler = ext3_xattr_handler_map[name_index];
137 return handler;
138}
139
140/*
141 * Inode operation listxattr()
142 *
143 * dentry->d_inode->i_mutex: don't care
144 */
145ssize_t
146ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 return ext3_xattr_list(dentry->d_inode, buffer, size);
149}
150
151static int
152ext3_xattr_check_names(struct ext3_xattr_entry *entry, void *end)
153{
154 while (!IS_LAST_ENTRY(entry)) {
155 struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
156 if ((void *)next >= end)
157 return -EIO;
158 entry = next;
159 }
160 return 0;
161}
162
163static inline int
164ext3_xattr_check_block(struct buffer_head *bh)
165{
166 int error;
167
168 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
169 BHDR(bh)->h_blocks != cpu_to_le32(1))
170 return -EIO;
171 error = ext3_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
172 return error;
173}
174
175static inline int
176ext3_xattr_check_entry(struct ext3_xattr_entry *entry, size_t size)
177{
178 size_t value_size = le32_to_cpu(entry->e_value_size);
179
180 if (entry->e_value_block != 0 || value_size > size ||
181 le16_to_cpu(entry->e_value_offs) + value_size > size)
182 return -EIO;
183 return 0;
184}
185
186static int
187ext3_xattr_find_entry(struct ext3_xattr_entry **pentry, int name_index,
188 const char *name, size_t size, int sorted)
189{
190 struct ext3_xattr_entry *entry;
191 size_t name_len;
192 int cmp = 1;
193
194 if (name == NULL)
195 return -EINVAL;
196 name_len = strlen(name);
197 entry = *pentry;
198 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
199 cmp = name_index - entry->e_name_index;
200 if (!cmp)
201 cmp = name_len - entry->e_name_len;
202 if (!cmp)
203 cmp = memcmp(name, entry->e_name, name_len);
204 if (cmp <= 0 && (sorted || cmp == 0))
205 break;
206 }
207 *pentry = entry;
208 if (!cmp && ext3_xattr_check_entry(entry, size))
209 return -EIO;
210 return cmp ? -ENODATA : 0;
211}
212
213static int
214ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
215 void *buffer, size_t buffer_size)
216{
217 struct buffer_head *bh = NULL;
218 struct ext3_xattr_entry *entry;
219 size_t size;
220 int error;
221
222 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
223 name_index, name, buffer, (long)buffer_size);
224
225 error = -ENODATA;
226 if (!EXT3_I(inode)->i_file_acl)
227 goto cleanup;
228 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
229 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
230 if (!bh)
231 goto cleanup;
232 ea_bdebug(bh, "b_count=%d, refcount=%d",
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext3_xattr_check_block(bh)) {
235bad_block: ext3_error(inode->i_sb, __FUNCTION__,
236 "inode %lu: bad block "E3FSBLK, inode->i_ino,
237 EXT3_I(inode)->i_file_acl);
238 error = -EIO;
239 goto cleanup;
240 }
241 ext3_xattr_cache_insert(bh);
242 entry = BFIRST(bh);
243 error = ext3_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
244 if (error == -EIO)
245 goto bad_block;
246 if (error)
247 goto cleanup;
248 size = le32_to_cpu(entry->e_value_size);
249 if (buffer) {
250 error = -ERANGE;
251 if (size > buffer_size)
252 goto cleanup;
253 memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
254 size);
255 }
256 error = size;
257
258cleanup:
259 brelse(bh);
260 return error;
261}
262
263static int
264ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
265 void *buffer, size_t buffer_size)
266{
267 struct ext3_xattr_ibody_header *header;
268 struct ext3_xattr_entry *entry;
269 struct ext3_inode *raw_inode;
270 struct ext3_iloc iloc;
271 size_t size;
272 void *end;
273 int error;
274
275 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
276 return -ENODATA;
277 error = ext3_get_inode_loc(inode, &iloc);
278 if (error)
279 return error;
280 raw_inode = ext3_raw_inode(&iloc);
281 header = IHDR(inode, raw_inode);
282 entry = IFIRST(header);
283 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
284 error = ext3_xattr_check_names(entry, end);
285 if (error)
286 goto cleanup;
287 error = ext3_xattr_find_entry(&entry, name_index, name,
288 end - (void *)entry, 0);
289 if (error)
290 goto cleanup;
291 size = le32_to_cpu(entry->e_value_size);
292 if (buffer) {
293 error = -ERANGE;
294 if (size > buffer_size)
295 goto cleanup;
296 memcpy(buffer, (void *)IFIRST(header) +
297 le16_to_cpu(entry->e_value_offs), size);
298 }
299 error = size;
300
301cleanup:
302 brelse(iloc.bh);
303 return error;
304}
305
306/*
307 * ext3_xattr_get()
308 *
309 * Copy an extended attribute into the buffer
310 * provided, or compute the buffer size required.
311 * Buffer is NULL to compute the size of the buffer required.
312 *
313 * Returns a negative error number on failure, or the number of bytes
314 * used / required on success.
315 */
316int
317ext3_xattr_get(struct inode *inode, int name_index, const char *name,
318 void *buffer, size_t buffer_size)
319{
320 int error;
321
322 down_read(&EXT3_I(inode)->xattr_sem);
323 error = ext3_xattr_ibody_get(inode, name_index, name, buffer,
324 buffer_size);
325 if (error == -ENODATA)
326 error = ext3_xattr_block_get(inode, name_index, name, buffer,
327 buffer_size);
328 up_read(&EXT3_I(inode)->xattr_sem);
329 return error;
330}
331
332static int
333ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
334 char *buffer, size_t buffer_size)
335{
336 size_t rest = buffer_size;
337
338 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
339 struct xattr_handler *handler =
340 ext3_xattr_handler(entry->e_name_index);
341
342 if (handler) {
343 size_t size = handler->list(inode, buffer, rest,
344 entry->e_name,
345 entry->e_name_len);
346 if (buffer) {
347 if (size > rest)
348 return -ERANGE;
349 buffer += size;
350 }
351 rest -= size;
352 }
353 }
354 return buffer_size - rest;
355}
356
357static int
358ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
359{
360 struct buffer_head *bh = NULL;
361 int error;
362
363 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
364 buffer, (long)buffer_size);
365
366 error = 0;
367 if (!EXT3_I(inode)->i_file_acl)
368 goto cleanup;
369 ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
370 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
371 error = -EIO;
372 if (!bh)
373 goto cleanup;
374 ea_bdebug(bh, "b_count=%d, refcount=%d",
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext3_xattr_check_block(bh)) {
377 ext3_error(inode->i_sb, __FUNCTION__,
378 "inode %lu: bad block "E3FSBLK, inode->i_ino,
379 EXT3_I(inode)->i_file_acl);
380 error = -EIO;
381 goto cleanup;
382 }
383 ext3_xattr_cache_insert(bh);
384 error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
385
386cleanup:
387 brelse(bh);
388
389 return error;
390}
391
392static int
393ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
394{
395 struct ext3_xattr_ibody_header *header;
396 struct ext3_inode *raw_inode;
397 struct ext3_iloc iloc;
398 void *end;
399 int error;
400
401 if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR))
402 return 0;
403 error = ext3_get_inode_loc(inode, &iloc);
404 if (error)
405 return error;
406 raw_inode = ext3_raw_inode(&iloc);
407 header = IHDR(inode, raw_inode);
408 end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
409 error = ext3_xattr_check_names(IFIRST(header), end);
410 if (error)
411 goto cleanup;
412 error = ext3_xattr_list_entries(inode, IFIRST(header),
413 buffer, buffer_size);
414
415cleanup:
416 brelse(iloc.bh);
417 return error;
418}
419
420/*
421 * ext3_xattr_list()
422 *
423 * Copy a list of attribute names into the buffer
424 * provided, or compute the buffer size required.
425 * Buffer is NULL to compute the size of the buffer required.
426 *
427 * Returns a negative error number on failure, or the number of bytes
428 * used / required on success.
429 */
430int
431ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
432{
433 int i_error, b_error;
434
435 down_read(&EXT3_I(inode)->xattr_sem);
436 i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size);
437 if (i_error < 0) {
438 b_error = 0;
439 } else {
440 if (buffer) {
441 buffer += i_error;
442 buffer_size -= i_error;
443 }
444 b_error = ext3_xattr_block_list(inode, buffer, buffer_size);
445 if (b_error < 0)
446 i_error = 0;
447 }
448 up_read(&EXT3_I(inode)->xattr_sem);
449 return i_error + b_error;
450}
451
452/*
453 * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
454 * not set, set it.
455 */
456static void ext3_xattr_update_super_block(handle_t *handle,
457 struct super_block *sb)
458{
459 if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
460 return;
461
462 lock_super(sb);
463 if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
464 EXT3_SB(sb)->s_es->s_feature_compat |=
465 cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
466 sb->s_dirt = 1;
467 ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
468 }
469 unlock_super(sb);
470}
471
/*
 * Release the xattr block BH: If the reference count is > 1, decrement
 * it; otherwise free the block.
 *
 * The mb_cache entry for the block (if one exists) is freed together with
 * the block, or merely released when the block stays in use.
 */
static void
ext3_xattr_release_block(handle_t *handle, struct inode *inode,
			 struct buffer_head *bh)
{
	struct mb_cache_entry *ce = NULL;

	ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, bh->b_blocknr);
	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
		/* This inode held the only reference: give the block back. */
		ea_bdebug(bh, "refcount now=0; freeing");
		if (ce)
			mb_cache_entry_free(ce);
		ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
		/*
		 * NOTE(review): the extra get_bh() presumably balances a
		 * reference dropped inside ext3_forget(), so the caller's
		 * own brelse() stays valid - confirm against ext3_forget().
		 */
		get_bh(bh);
		ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
	} else {
		/*
		 * The block is shared: drop one reference.  If journal write
		 * access cannot be obtained the refcount is left untouched
		 * and the failure is not reported to the caller.
		 */
		if (ext3_journal_get_write_access(handle, bh) == 0) {
			lock_buffer(bh);
			BHDR(bh)->h_refcount = cpu_to_le32(
				le32_to_cpu(BHDR(bh)->h_refcount) - 1);
			ext3_journal_dirty_metadata(handle, bh);
			if (IS_SYNC(inode))
				handle->h_sync = 1;
			DQUOT_FREE_BLOCK(inode, 1);
			unlock_buffer(bh);
			ea_bdebug(bh, "refcount now=%d; releasing",
				  le32_to_cpu(BHDR(bh)->h_refcount));
		}
		if (ce)
			mb_cache_entry_release(ce);
	}
}
507
/* Describes the attribute a set operation is asked to create/replace/remove. */
struct ext3_xattr_info {
	int name_index;		/* name index stored in e_name_index */
	const char *name;	/* NUL-terminated attribute name */
	const void *value;	/* new value; NULL requests removal */
	size_t value_len;	/* length of value in bytes */
};
514
/*
 * Position of a lookup inside one attribute area (the in-inode area or an
 * attribute block).
 */
struct ext3_xattr_search {
	struct ext3_xattr_entry *first;	/* first entry of the area */
	void *base;			/* e_value_offs is relative to this */
	void *end;			/* one past the last byte of the area */
	struct ext3_xattr_entry *here;	/* entry found, or insertion point */
	int not_found;			/* 0 if here matches, else -ENODATA */
};
522
/*
 * Apply the change described by @i to the attribute area described by @s,
 * in place: insert a new entry, replace the value of the entry at s->here,
 * or (when i->value is NULL) remove that entry.
 *
 * Entry descriptors are stored from s->first on; the values they refer to
 * sit at s->base + e_value_offs, with the lowest value offset tracked in
 * min_offs.
 *
 * Returns 0 on success or -ENOSPC if the new name/value does not fit.
 */
static int
ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
{
	struct ext3_xattr_entry *last;
	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);

	/* Compute min_offs and last. */
	last = s->first;
	for (; !IS_LAST_ENTRY(last); last = EXT3_XATTR_NEXT(last)) {
		if (!last->e_value_block && last->e_value_size) {
			size_t offs = le16_to_cpu(last->e_value_offs);
			if (offs < min_offs)
				min_offs = offs;
		}
	}
	/*
	 * Space between the end of the entry list and the lowest value.
	 * The sizeof(__u32) is presumably the end-of-list marker tested by
	 * IS_LAST_ENTRY() - confirm against its definition.
	 */
	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
	if (!s->not_found) {
		/* Replacing: the old value and name give their space back. */
		if (!s->here->e_value_block && s->here->e_value_size) {
			size_t size = le32_to_cpu(s->here->e_value_size);
			free += EXT3_XATTR_SIZE(size);
		}
		free += EXT3_XATTR_LEN(name_len);
	}
	if (i->value) {
		/*
		 * The value size is tested on its own first, then together
		 * with the entry size.
		 */
		if (free < EXT3_XATTR_SIZE(i->value_len) ||
		    free < EXT3_XATTR_LEN(name_len) +
			   EXT3_XATTR_SIZE(i->value_len))
			return -ENOSPC;
	}

	if (i->value && s->not_found) {
		/* Insert the new name. */
		size_t size = EXT3_XATTR_LEN(name_len);
		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
		/* Shift the following entries up to open a slot at here. */
		memmove((void *)s->here + size, s->here, rest);
		memset(s->here, 0, size);
		s->here->e_name_index = i->name_index;
		s->here->e_name_len = name_len;
		memcpy(s->here->e_name, i->name, name_len);
	} else {
		if (!s->here->e_value_block && s->here->e_value_size) {
			void *first_val = s->base + min_offs;
			size_t offs = le16_to_cpu(s->here->e_value_offs);
			void *val = s->base + offs;
			size_t size = EXT3_XATTR_SIZE(
				le32_to_cpu(s->here->e_value_size));

			if (i->value && size == EXT3_XATTR_SIZE(i->value_len)) {
				/* The old and the new value have the same
				   size. Just replace. */
				s->here->e_value_size =
					cpu_to_le32(i->value_len);
				memset(val + size - EXT3_XATTR_PAD, 0,
				       EXT3_XATTR_PAD); /* Clear pad bytes. */
				memcpy(val, i->value, i->value_len);
				return 0;
			}

			/* Remove the old value. */
			/* Values below the old one move up by its size. */
			memmove(first_val + size, first_val, val - first_val);
			memset(first_val, 0, size);
			s->here->e_value_size = 0;
			s->here->e_value_offs = 0;
			min_offs += size;

			/* Adjust all value offsets. */
			last = s->first;
			while (!IS_LAST_ENTRY(last)) {
				size_t o = le16_to_cpu(last->e_value_offs);
				if (!last->e_value_block &&
				    last->e_value_size && o < offs)
					last->e_value_offs =
						cpu_to_le16(o + size);
				last = EXT3_XATTR_NEXT(last);
			}
		}
		if (!i->value) {
			/* Remove the old name. */
			size_t size = EXT3_XATTR_LEN(name_len);
			/* last becomes the end of the list after removal. */
			last = ENTRY((void *)last - size);
			memmove(s->here, (void *)s->here + size,
				(void *)last - (void *)s->here + sizeof(__u32));
			memset(last, 0, size);
		}
	}

	if (i->value) {
		/* Insert the new value. */
		s->here->e_value_size = cpu_to_le32(i->value_len);
		if (i->value_len) {
			size_t size = EXT3_XATTR_SIZE(i->value_len);
			/* New values go directly below the lowest one. */
			void *val = s->base + min_offs - size;
			s->here->e_value_offs = cpu_to_le16(min_offs - size);
			memset(val + size - EXT3_XATTR_PAD, 0,
			       EXT3_XATTR_PAD); /* Clear the pad bytes. */
			memcpy(val, i->value, i->value_len);
		}
	}
	return 0;
}
623
/* Lookup state for the attribute block of an inode. */
struct ext3_xattr_block_find {
	struct ext3_xattr_search s;	/* search position inside the block */
	struct buffer_head *bh;		/* the attribute block, NULL if none */
};
628
629static int
630ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
631 struct ext3_xattr_block_find *bs)
632{
633 struct super_block *sb = inode->i_sb;
634 int error;
635
636 ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
637 i->name_index, i->name, i->value, (long)i->value_len);
638
639 if (EXT3_I(inode)->i_file_acl) {
640 /* The inode already has an extended attribute block. */
641 bs->bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
642 error = -EIO;
643 if (!bs->bh)
644 goto cleanup;
645 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
646 atomic_read(&(bs->bh->b_count)),
647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext3_xattr_check_block(bs->bh)) {
649 ext3_error(sb, __FUNCTION__,
650 "inode %lu: bad block "E3FSBLK, inode->i_ino,
651 EXT3_I(inode)->i_file_acl);
652 error = -EIO;
653 goto cleanup;
654 }
655 /* Find the named attribute. */
656 bs->s.base = BHDR(bs->bh);
657 bs->s.first = BFIRST(bs->bh);
658 bs->s.end = bs->bh->b_data + bs->bh->b_size;
659 bs->s.here = bs->s.first;
660 error = ext3_xattr_find_entry(&bs->s.here, i->name_index,
661 i->name, bs->bh->b_size, 1);
662 if (error && error != -ENODATA)
663 goto cleanup;
664 bs->s.not_found = error;
665 }
666 error = 0;
667
668cleanup:
669 return error;
670}
671
/*
 * Apply the change described by @i to the inode's attribute block.
 *
 * Three starting points are handled:
 *  - the inode has a block with refcount 1: it is modified in place;
 *  - the inode has a block with another refcount: a private copy is made;
 *  - the inode has no block: a fresh, empty block image is built.
 * Afterwards the result is either matched against an identical block found
 * via the cache (whose refcount is then raised), kept as is, or written to
 * a newly allocated block.  Finally i_file_acl is updated and the previous
 * block, if different, is released.
 *
 * Returns 0 on success or a negative error number.
 */
static int
ext3_xattr_block_set(handle_t *handle, struct inode *inode,
		     struct ext3_xattr_info *i,
		     struct ext3_xattr_block_find *bs)
{
	struct super_block *sb = inode->i_sb;
	struct buffer_head *new_bh = NULL;
	struct ext3_xattr_search *s = &bs->s;
	struct mb_cache_entry *ce = NULL;
	int error;

#define header(x) ((struct ext3_xattr_header *)(x))

	if (i->value && i->value_len > sb->s_blocksize)
		return -ENOSPC;
	if (s->base) {
		ce = mb_cache_entry_get(ext3_xattr_cache, bs->bh->b_bdev,
					bs->bh->b_blocknr);
		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
			/*
			 * The contents are about to change, so the cache
			 * entry for the old contents is dropped first.
			 */
			if (ce) {
				mb_cache_entry_free(ce);
				ce = NULL;
			}
			ea_bdebug(bs->bh, "modifying in-place");
			error = ext3_journal_get_write_access(handle, bs->bh);
			if (error)
				goto cleanup;
			lock_buffer(bs->bh);
			error = ext3_xattr_set_entry(i, s);
			if (!error) {
				if (!IS_LAST_ENTRY(s->first))
					ext3_xattr_rehash(header(s->base),
							  s->here);
				ext3_xattr_cache_insert(bs->bh);
			}
			unlock_buffer(bs->bh);
			if (error == -EIO)
				goto bad_block;
			if (!error)
				error = ext3_journal_dirty_metadata(handle,
								    bs->bh);
			if (error)
				goto cleanup;
			goto inserted;
		} else {
			/* Remember where here was, relative to the block. */
			int offset = (char *)s->here - bs->bh->b_data;

			if (ce) {
				mb_cache_entry_release(ce);
				ce = NULL;
			}
			ea_bdebug(bs->bh, "cloning");
			s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
			error = -ENOMEM;
			if (s->base == NULL)
				goto cleanup;
			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
			s->first = ENTRY(header(s->base)+1);
			header(s->base)->h_refcount = cpu_to_le32(1);
			s->here = ENTRY(s->base + offset);
			s->end = s->base + bs->bh->b_size;
		}
	} else {
		/* Allocate a buffer where we construct the new block. */
		s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
		/* assert(header == s->base) */
		error = -ENOMEM;
		if (s->base == NULL)
			goto cleanup;
		memset(s->base, 0, sb->s_blocksize);
		header(s->base)->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
		header(s->base)->h_blocks = cpu_to_le32(1);
		header(s->base)->h_refcount = cpu_to_le32(1);
		s->first = ENTRY(header(s->base)+1);
		s->here = ENTRY(header(s->base)+1);
		s->end = s->base + sb->s_blocksize;
	}

	/* Here s describes a private copy or a freshly built block image. */
	error = ext3_xattr_set_entry(i, s);
	if (error == -EIO)
		goto bad_block;
	if (error)
		goto cleanup;
	if (!IS_LAST_ENTRY(s->first))
		ext3_xattr_rehash(header(s->base), s->here);

inserted:
	/* An image without entries needs no block; new_bh stays NULL. */
	if (!IS_LAST_ENTRY(s->first)) {
		new_bh = ext3_xattr_cache_find(inode, header(s->base), &ce);
		if (new_bh) {
			/* We found an identical block in the cache. */
			if (new_bh == bs->bh)
				ea_bdebug(new_bh, "keeping");
			else {
				/* The old block is released after updating
				   the inode. */
				error = -EDQUOT;
				if (DQUOT_ALLOC_BLOCK(inode, 1))
					goto cleanup;
				error = ext3_journal_get_write_access(handle,
								      new_bh);
				if (error)
					goto cleanup_dquot;
				lock_buffer(new_bh);
				BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
					le32_to_cpu(BHDR(new_bh)->h_refcount));
				ea_bdebug(new_bh, "reusing; refcount now=%d",
					le32_to_cpu(BHDR(new_bh)->h_refcount));
				unlock_buffer(new_bh);
				error = ext3_journal_dirty_metadata(handle,
								    new_bh);
				if (error)
					goto cleanup_dquot;
			}
			mb_cache_entry_release(ce);
			ce = NULL;
		} else if (bs->bh && s->base == bs->bh->b_data) {
			/* We were modifying this block in-place. */
			ea_bdebug(bs->bh, "keeping this block");
			new_bh = bs->bh;
			/* Extra reference, dropped by brelse(new_bh) below. */
			get_bh(new_bh);
		} else {
			/* We need to allocate a new block */
			ext3_fsblk_t goal = le32_to_cpu(
					EXT3_SB(sb)->s_es->s_first_data_block) +
				(ext3_fsblk_t)EXT3_I(inode)->i_block_group *
				EXT3_BLOCKS_PER_GROUP(sb);
			ext3_fsblk_t block = ext3_new_block(handle, inode,
							goal, &error);
			if (error)
				goto cleanup;
			ea_idebug(inode, "creating block %d", block);

			new_bh = sb_getblk(sb, block);
			if (!new_bh) {
/* Also reached by goto when create access below fails. */
getblk_failed:
				ext3_free_blocks(handle, inode, block, 1);
				error = -EIO;
				goto cleanup;
			}
			lock_buffer(new_bh);
			error = ext3_journal_get_create_access(handle, new_bh);
			if (error) {
				unlock_buffer(new_bh);
				goto getblk_failed;
			}
			memcpy(new_bh->b_data, s->base, new_bh->b_size);
			set_buffer_uptodate(new_bh);
			unlock_buffer(new_bh);
			ext3_xattr_cache_insert(new_bh);
			error = ext3_journal_dirty_metadata(handle, new_bh);
			if (error)
				goto cleanup;
		}
	}

	/* Update the inode. */
	EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;

	/* Drop the previous xattr block. */
	if (bs->bh && bs->bh != new_bh)
		ext3_xattr_release_block(handle, inode, bs->bh);
	error = 0;

cleanup:
	if (ce)
		mb_cache_entry_release(ce);
	brelse(new_bh);
	/* s->base is kmalloc'ed unless it aliases the buffer's own data. */
	if (!(bs->bh && s->base == bs->bh->b_data))
		kfree(s->base);

	return error;

cleanup_dquot:
	/* Undo the quota charge taken for sharing an existing block. */
	DQUOT_FREE_BLOCK(inode, 1);
	goto cleanup;

bad_block:
	ext3_error(inode->i_sb, __FUNCTION__,
		   "inode %lu: bad block "E3FSBLK, inode->i_ino,
		   EXT3_I(inode)->i_file_acl);
	goto cleanup;

#undef header
}
857
/* Lookup state for the attribute area inside the inode. */
struct ext3_xattr_ibody_find {
	struct ext3_xattr_search s;	/* search position inside the inode */
	struct ext3_iloc iloc;		/* location of the on-disk inode */
};
862
863static int
864ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i,
865 struct ext3_xattr_ibody_find *is)
866{
867 struct ext3_xattr_ibody_header *header;
868 struct ext3_inode *raw_inode;
869 int error;
870
871 if (EXT3_I(inode)->i_extra_isize == 0)
872 return 0;
873 raw_inode = ext3_raw_inode(&is->iloc);
874 header = IHDR(inode, raw_inode);
875 is->s.base = is->s.first = IFIRST(header);
876 is->s.here = is->s.first;
877 is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size;
878 if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) {
879 error = ext3_xattr_check_names(IFIRST(header), is->s.end);
880 if (error)
881 return error;
882 /* Find the named attribute. */
883 error = ext3_xattr_find_entry(&is->s.here, i->name_index,
884 i->name, is->s.end -
885 (void *)is->s.base, 0);
886 if (error && error != -ENODATA)
887 return error;
888 is->s.not_found = error;
889 }
890 return 0;
891}
892
893static int
894ext3_xattr_ibody_set(handle_t *handle, struct inode *inode,
895 struct ext3_xattr_info *i,
896 struct ext3_xattr_ibody_find *is)
897{
898 struct ext3_xattr_ibody_header *header;
899 struct ext3_xattr_search *s = &is->s;
900 int error;
901
902 if (EXT3_I(inode)->i_extra_isize == 0)
903 return -ENOSPC;
904 error = ext3_xattr_set_entry(i, s);
905 if (error)
906 return error;
907 header = IHDR(inode, ext3_raw_inode(&is->iloc));
908 if (!IS_LAST_ENTRY(s->first)) {
909 header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
910 EXT3_I(inode)->i_state |= EXT3_STATE_XATTR;
911 } else {
912 header->h_magic = cpu_to_le32(0);
913 EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR;
914 }
915 return 0;
916}
917
/*
 * ext3_xattr_set_handle()
 *
 * Create, replace or remove an extended attribute for this inode. Buffer
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
 * specify that an extended attribute must exist and must not exist
 * previous to the call, respectively.
 *
 * The attribute is placed inside the inode when it fits there and in the
 * attribute block otherwise; a copy left in the other location is removed.
 *
 * Returns 0, or a negative error number on failure.
 */
int
ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
		      const char *name, const void *value, size_t value_len,
		      int flags)
{
	struct ext3_xattr_info i = {
		.name_index = name_index,
		.name = name,
		.value = value,
		.value_len = value_len,

	};
	/* Both searches start out as "not found". */
	struct ext3_xattr_ibody_find is = {
		.s = { .not_found = -ENODATA, },
	};
	struct ext3_xattr_block_find bs = {
		.s = { .not_found = -ENODATA, },
	};
	int error;

	if (!name)
		return -EINVAL;
	/* The on-disk name length (e_name_len) is a single byte. */
	if (strlen(name) > 255)
		return -ERANGE;
	down_write(&EXT3_I(inode)->xattr_sem);
	error = ext3_get_inode_loc(inode, &is.iloc);
	if (error)
		goto cleanup;

	if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) {
		/* First touch of a new inode: start from a zeroed image. */
		struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc);
		memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
		EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW;
	}

	error = ext3_xattr_ibody_find(inode, &i, &is);
	if (error)
		goto cleanup;
	/* The block is only searched if the inode body had no match. */
	if (is.s.not_found)
		error = ext3_xattr_block_find(inode, &i, &bs);
	if (error)
		goto cleanup;
	if (is.s.not_found && bs.s.not_found) {
		error = -ENODATA;
		if (flags & XATTR_REPLACE)
			goto cleanup;
		/* Removing an attribute that does not exist is a no-op. */
		error = 0;
		if (!value)
			goto cleanup;
	} else {
		error = -EEXIST;
		if (flags & XATTR_CREATE)
			goto cleanup;
	}
	error = ext3_journal_get_write_access(handle, is.iloc.bh);
	if (error)
		goto cleanup;
	if (!value) {
		/* Remove from wherever the attribute was found. */
		if (!is.s.not_found)
			error = ext3_xattr_ibody_set(handle, inode, &i, &is);
		else if (!bs.s.not_found)
			error = ext3_xattr_block_set(handle, inode, &i, &bs);
	} else {
		/* Try the inode body first. */
		error = ext3_xattr_ibody_set(handle, inode, &i, &is);
		if (!error && !bs.s.not_found) {
			/* Stored in the inode: drop the copy in the block. */
			i.value = NULL;
			error = ext3_xattr_block_set(handle, inode, &i, &bs);
		} else if (error == -ENOSPC) {
			/* No room in the inode: fall back to the block. */
			error = ext3_xattr_block_set(handle, inode, &i, &bs);
			if (error)
				goto cleanup;
			if (!is.s.not_found) {
				/* Stored in the block: drop the inode copy. */
				i.value = NULL;
				error = ext3_xattr_ibody_set(handle, inode, &i,
							     &is);
			}
		}
	}
	if (!error) {
		ext3_xattr_update_super_block(handle, inode->i_sb);
		inode->i_ctime = CURRENT_TIME_SEC;
		error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
		/*
		 * The bh is consumed by ext3_mark_iloc_dirty, even with
		 * error != 0.
		 */
		is.iloc.bh = NULL;
		if (IS_SYNC(inode))
			handle->h_sync = 1;
	}

cleanup:
	brelse(is.iloc.bh);
	brelse(bs.bh);
	up_write(&EXT3_I(inode)->xattr_sem);
	return error;
}
1027
1028/*
1029 * ext3_xattr_set()
1030 *
1031 * Like ext3_xattr_set_handle, but start from an inode. This extended
1032 * attribute modification is a filesystem transaction by itself.
1033 *
1034 * Returns 0, or a negative error number on failure.
1035 */
1036int
1037ext3_xattr_set(struct inode *inode, int name_index, const char *name,
1038 const void *value, size_t value_len, int flags)
1039{
1040 handle_t *handle;
1041 int error, retries = 0;
1042
1043retry:
1044 handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
1045 if (IS_ERR(handle)) {
1046 error = PTR_ERR(handle);
1047 } else {
1048 int error2;
1049
1050 error = ext3_xattr_set_handle(handle, inode, name_index, name,
1051 value, value_len, flags);
1052 error2 = ext3_journal_stop(handle);
1053 if (error == -ENOSPC &&
1054 ext3_should_retry_alloc(inode->i_sb, &retries))
1055 goto retry;
1056 if (error == 0)
1057 error = error2;
1058 }
1059
1060 return error;
1061}
1062
1063/*
1064 * ext3_xattr_delete_inode()
1065 *
1066 * Free extended attribute resources associated with this inode. This
1067 * is called immediately before an inode is freed. We have exclusive
1068 * access to the inode.
1069 */
1070void
1071ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
1072{
1073 struct buffer_head *bh = NULL;
1074
1075 if (!EXT3_I(inode)->i_file_acl)
1076 goto cleanup;
1077 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
1078 if (!bh) {
1079 ext3_error(inode->i_sb, __FUNCTION__,
1080 "inode %lu: block "E3FSBLK" read error", inode->i_ino,
1081 EXT3_I(inode)->i_file_acl);
1082 goto cleanup;
1083 }
1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1086 ext3_error(inode->i_sb, __FUNCTION__,
1087 "inode %lu: bad block "E3FSBLK, inode->i_ino,
1088 EXT3_I(inode)->i_file_acl);
1089 goto cleanup;
1090 }
1091 ext3_xattr_release_block(handle, inode, bh);
1092 EXT3_I(inode)->i_file_acl = 0;
1093
1094cleanup:
1095 brelse(bh);
1096}
1097
/*
 * ext3_xattr_put_super()
 *
 * This is called when a file system is unmounted.  It hands the file
 * system's block device to mb_cache_shrink().
 */
void
ext3_xattr_put_super(struct super_block *sb)
{
	mb_cache_shrink(sb->s_bdev);
}
1108
1109/*
1110 * ext3_xattr_cache_insert()
1111 *
1112 * Create a new entry in the extended attribute cache, and insert
1113 * it unless such an entry is already in the cache.
1114 *
1115 * Returns 0, or a negative error number on failure.
1116 */
1117static void
1118ext3_xattr_cache_insert(struct buffer_head *bh)
1119{
1120 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1121 struct mb_cache_entry *ce;
1122 int error;
1123
1124 ce = mb_cache_entry_alloc(ext3_xattr_cache);
1125 if (!ce) {
1126 ea_bdebug(bh, "out of memory");
1127 return;
1128 }
1129 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
1130 if (error) {
1131 mb_cache_entry_free(ce);
1132 if (error == -EBUSY) {
1133 ea_bdebug(bh, "already in cache");
1134 error = 0;
1135 }
1136 } else {
1137 ea_bdebug(bh, "inserting [%x]", (int)hash);
1138 mb_cache_entry_release(ce);
1139 }
1140}
1141
1142/*
1143 * ext3_xattr_cmp()
1144 *
1145 * Compare two extended attribute blocks for equality.
1146 *
1147 * Returns 0 if the blocks are equal, 1 if they differ, and
1148 * a negative error number on errors.
1149 */
1150static int
1151ext3_xattr_cmp(struct ext3_xattr_header *header1,
1152 struct ext3_xattr_header *header2)
1153{
1154 struct ext3_xattr_entry *entry1, *entry2;
1155
1156 entry1 = ENTRY(header1+1);
1157 entry2 = ENTRY(header2+1);
1158 while (!IS_LAST_ENTRY(entry1)) {
1159 if (IS_LAST_ENTRY(entry2))
1160 return 1;
1161 if (entry1->e_hash != entry2->e_hash ||
1162 entry1->e_name_index != entry2->e_name_index ||
1163 entry1->e_name_len != entry2->e_name_len ||
1164 entry1->e_value_size != entry2->e_value_size ||
1165 memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
1166 return 1;
1167 if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
1168 return -EIO;
1169 if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
1170 (char *)header2 + le16_to_cpu(entry2->e_value_offs),
1171 le32_to_cpu(entry1->e_value_size)))
1172 return 1;
1173
1174 entry1 = EXT3_XATTR_NEXT(entry1);
1175 entry2 = EXT3_XATTR_NEXT(entry2);
1176 }
1177 if (!IS_LAST_ENTRY(entry2))
1178 return 1;
1179 return 0;
1180}
1181
/*
 * ext3_xattr_cache_find()
 *
 * Find an identical extended attribute block.
 *
 * The cache is searched by the block hash in @header; every candidate is
 * read from disk and compared with ext3_xattr_cmp().  On a match the cache
 * entry is handed back through @pce.
 *
 * Returns a pointer to the block found, or NULL if such a block was
 * not found or an error occurred.
 */
static struct buffer_head *
ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header,
		      struct mb_cache_entry **pce)
{
	__u32 hash = le32_to_cpu(header->h_hash);
	struct mb_cache_entry *ce;

	if (!header->h_hash)
		return NULL;  /* never share */
	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
	ce = mb_cache_entry_find_first(ext3_xattr_cache, 0,
				       inode->i_sb->s_bdev, hash);
	while (ce) {
		struct buffer_head *bh;

		if (IS_ERR(ce)) {
			/* -EAGAIN restarts the search from the beginning. */
			if (PTR_ERR(ce) == -EAGAIN)
				goto again;
			break;
		}
		bh = sb_bread(inode->i_sb, ce->e_block);
		if (!bh) {
			ext3_error(inode->i_sb, __FUNCTION__,
				"inode %lu: block %lu read error",
				inode->i_ino, (unsigned long) ce->e_block);
		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
				EXT3_XATTR_REFCOUNT_MAX) {
			/* Already shared as widely as allowed; skip it. */
			ea_idebug(inode, "block %lu refcount %d>=%d",
				  (unsigned long) ce->e_block,
				  le32_to_cpu(BHDR(bh)->h_refcount),
					  EXT3_XATTR_REFCOUNT_MAX);
		} else if (ext3_xattr_cmp(header, BHDR(bh)) == 0) {
			/* Match: the caller takes over both ce and bh. */
			*pce = ce;
			return bh;
		}
		brelse(bh);
		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
	}
	return NULL;
}
1231
/* Rotation distances used when mixing name bytes and value words. */
#define NAME_HASH_SHIFT 5
#define VALUE_HASH_SHIFT 16

/*
 * ext3_xattr_hash_entry()
 *
 * Compute the hash of an extended attribute.
 *
 * The hash is built from the name bytes and, for a value stored in this
 * block (e_value_block == 0, non-zero size), from the value read as 32-bit
 * little-endian words including its padding.  The result is stored in
 * entry->e_hash.
 */
static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
					 struct ext3_xattr_entry *entry)
{
	__u32 hash = 0;
	char *name = entry->e_name;
	int n;

	/*
	 * NOTE(review): name is plain char, so bytes >= 0x80 are
	 * sign-extended where char is signed - confirm whether the hash is
	 * expected to be identical across architectures.
	 */
	for (n=0; n < entry->e_name_len; n++) {
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
		       *name++;
	}

	if (entry->e_value_block == 0 && entry->e_value_size != 0) {
		__le32 *value = (__le32 *)((char *)header +
			le16_to_cpu(entry->e_value_offs));
		/* n counts the 32-bit words of the padded value. */
		for (n = (le32_to_cpu(entry->e_value_size) +
		     EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
			hash = (hash << VALUE_HASH_SHIFT) ^
			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
			       le32_to_cpu(*value++);
		}
	}
	entry->e_hash = cpu_to_le32(hash);
}

#undef NAME_HASH_SHIFT
#undef VALUE_HASH_SHIFT
1268
1269#define BLOCK_HASH_SHIFT 16
1270
1271/*
1272 * ext3_xattr_rehash()
1273 *
1274 * Re-compute the extended attribute hash value after an entry has changed.
1275 */
1276static void ext3_xattr_rehash(struct ext3_xattr_header *header,
1277 struct ext3_xattr_entry *entry)
1278{
1279 struct ext3_xattr_entry *here;
1280 __u32 hash = 0;
1281
1282 ext3_xattr_hash_entry(header, entry);
1283 here = ENTRY(header+1);
1284 while (!IS_LAST_ENTRY(here)) {
1285 if (!here->e_hash) {
1286 /* Block is not shared if an entry's hash value == 0 */
1287 hash = 0;
1288 break;
1289 }
1290 hash = (hash << BLOCK_HASH_SHIFT) ^
1291 (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
1292 le32_to_cpu(here->e_hash);
1293 here = EXT3_XATTR_NEXT(here);
1294 }
1295 header->h_hash = cpu_to_le32(hash);
1296}
1297
1298#undef BLOCK_HASH_SHIFT
1299
1300int __init
1301init_ext3_xattr(void)
1302{
1303 ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
1304 sizeof(struct mb_cache_entry) +
1305 sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
1306 if (!ext3_xattr_cache)
1307 return -ENOMEM;
1308 return 0;
1309}
1310
1311void
1312exit_ext3_xattr(void)
1313{
1314 if (ext3_xattr_cache)
1315 mb_cache_destroy(ext3_xattr_cache);
1316 ext3_xattr_cache = NULL;
1317}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 000000000000..6b1ae1c6182c
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,145 @@
1/*
2 File: fs/ext3/xattr.h
3
4 On-disk format of extended attributes for the ext3 filesystem.
5
6 (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
7*/
8
9#include <linux/xattr.h>
10
/* Magic value in attribute blocks */
#define EXT3_XATTR_MAGIC		0xEA020000

/* Maximum number of references to one attribute block */
#define EXT3_XATTR_REFCOUNT_MAX		1024

/* Name indexes (stored in the e_name_index field of an entry) */
#define EXT3_XATTR_INDEX_USER			1
#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS	2
#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT	3
#define EXT3_XATTR_INDEX_TRUSTED		4
#define	EXT3_XATTR_INDEX_LUSTRE			5
#define EXT3_XATTR_INDEX_SECURITY	        6
24
/* Header at the start of an attribute block; all counters little-endian. */
struct ext3_xattr_header {
	__le32	h_magic;	/* magic number for identification */
	__le32	h_refcount;	/* reference count */
	__le32	h_blocks;	/* number of disk blocks used */
	__le32	h_hash;		/* hash value of all attributes */
	__u32	h_reserved[4];	/* zero right now */
};
32
/* Header of the attribute area stored inside the inode. */
struct ext3_xattr_ibody_header {
	__le32	h_magic;	/* magic number for identification */
};
36
/*
 * Descriptor of one attribute.  The name follows the fixed part directly;
 * EXT3_XATTR_LEN() gives the padded size of descriptor plus name.
 */
struct ext3_xattr_entry {
	__u8	e_name_len;	/* length of name */
	__u8	e_name_index;	/* attribute name index */
	__le16	e_value_offs;	/* offset in disk block of value */
	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
	__le32	e_value_size;	/* size of attribute value */
	__le32	e_hash;		/* hash value of name and value */
	char	e_name[0];	/* attribute name */
};
46
/* Entries and values are padded to multiples of EXT3_XATTR_PAD bytes. */
#define EXT3_XATTR_PAD_BITS		2
#define EXT3_XATTR_PAD		(1<<EXT3_XATTR_PAD_BITS)
#define EXT3_XATTR_ROUND		(EXT3_XATTR_PAD-1)
/* Padded size of an entry descriptor with a name of name_len bytes. */
#define EXT3_XATTR_LEN(name_len) \
	(((name_len) + EXT3_XATTR_ROUND + \
	sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
/* The entry descriptor that follows entry. */
#define EXT3_XATTR_NEXT(entry) \
	( (struct ext3_xattr_entry *)( \
	  (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
/* Padded size of a value of size bytes. */
#define EXT3_XATTR_SIZE(size) \
	(((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
58
59# ifdef CONFIG_EXT3_FS_XATTR
60
/* One handler per attribute name class. */
extern struct xattr_handler ext3_xattr_user_handler;
extern struct xattr_handler ext3_xattr_trusted_handler;
extern struct xattr_handler ext3_xattr_acl_access_handler;
extern struct xattr_handler ext3_xattr_acl_default_handler;
extern struct xattr_handler ext3_xattr_security_handler;

extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);

/* Get, list and set attributes by (name index, name). */
extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext3_xattr_list(struct inode *, char *, size_t);
extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);

/* Hooks for inode deletion and unmount. */
extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
extern void ext3_xattr_put_super(struct super_block *);

/* Set up and tear down the attribute block cache. */
extern int init_ext3_xattr(void);
extern void exit_ext3_xattr(void);

extern struct xattr_handler *ext3_xattr_handlers[];
81
82# else /* CONFIG_EXT3_FS_XATTR */
83
/*
 * Stub used when CONFIG_EXT3_FS_XATTR is not set.  Takes the same
 * arguments as the real ext3_xattr_get() and always fails with
 * -EOPNOTSUPP.
 */
static inline int
ext3_xattr_get(struct inode *inode, int name_index, const char *name,
	       void *buffer, size_t size)
{
	return -EOPNOTSUPP;
}
90
/*
 * Stub used when CONFIG_EXT3_FS_XATTR is not set.  Takes the same
 * arguments as the real ext3_xattr_list() and always fails with
 * -EOPNOTSUPP.
 */
static inline int
ext3_xattr_list(struct inode *inode, char *buffer, size_t size)
{
	return -EOPNOTSUPP;
}
96
/* Stub for !CONFIG_EXT3_FS_XATTR: always fails with -EOPNOTSUPP. */
static inline int
ext3_xattr_set(struct inode *inode, int name_index, const char *name,
	       const void *value, size_t size, int flags)
{
	return -EOPNOTSUPP;
}
103
/* Stub for !CONFIG_EXT3_FS_XATTR: always fails with -EOPNOTSUPP. */
static inline int
ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
	       const char *name, const void *value, size_t size, int flags)
{
	return -EOPNOTSUPP;
}
110
/* Stub for !CONFIG_EXT3_FS_XATTR: nothing to release. */
static inline void
ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
{
}
115
/* Stub for !CONFIG_EXT3_FS_XATTR: does nothing. */
static inline void
ext3_xattr_put_super(struct super_block *sb)
{
}
120
/* Stub for !CONFIG_EXT3_FS_XATTR: nothing to set up, reports success. */
static inline int
init_ext3_xattr(void)
{
	return 0;
}
126
/* Stub for !CONFIG_EXT3_FS_XATTR: nothing to tear down. */
static inline void
exit_ext3_xattr(void)
{
}
131
/* Without xattr support there is no handler table. */
#define ext3_xattr_handlers	NULL
133
134# endif /* CONFIG_EXT3_FS_XATTR */
135
/*
 * ext3_init_security() is only built with CONFIG_EXT3_FS_SECURITY;
 * otherwise an inline stub that reports success takes its place.
 */
#ifdef CONFIG_EXT3_FS_SECURITY
extern int ext3_init_security(handle_t *handle, struct inode *inode,
				struct inode *dir);
#else
static inline int ext3_init_security(handle_t *handle, struct inode *inode,
				struct inode *dir)
{
	return 0;
}
#endif
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 000000000000..b9c40c15647b
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,77 @@
1/*
2 * linux/fs/ext3/xattr_security.c
3 * Handler for storing security labels as extended attributes.
4 */
5
6#include <linux/module.h>
7#include <linux/string.h>
8#include <linux/fs.h>
9#include <linux/smp_lock.h>
10#include <linux/ext3_jbd.h>
11#include <linux/ext3_fs.h>
12#include <linux/security.h>
13#include "xattr.h"
14
15static size_t
16ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
17 const char *name, size_t name_len)
18{
19 const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
20 const size_t total_len = prefix_len + name_len + 1;
21
22
23 if (list && total_len <= list_size) {
24 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
25 memcpy(list+prefix_len, name, name_len);
26 list[prefix_len + name_len] = '\0';
27 }
28 return total_len;
29}
30
31static int
32ext3_xattr_security_get(struct inode *inode, const char *name,
33 void *buffer, size_t size)
34{
35 if (strcmp(name, "") == 0)
36 return -EINVAL;
37 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name,
38 buffer, size);
39}
40
41static int
42ext3_xattr_security_set(struct inode *inode, const char *name,
43 const void *value, size_t size, int flags)
44{
45 if (strcmp(name, "") == 0)
46 return -EINVAL;
47 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name,
48 value, size, flags);
49}
50
51int
52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
53{
54 int err;
55 size_t len;
56 void *value;
57 char *name;
58
59 err = security_inode_init_security(inode, dir, &name, &value, &len);
60 if (err) {
61 if (err == -EOPNOTSUPP)
62 return 0;
63 return err;
64 }
65 err = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_SECURITY,
66 name, value, len, 0);
67 kfree(name);
68 kfree(value);
69 return err;
70}
71
72struct xattr_handler ext3_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext3_xattr_security_list,
75 .get = ext3_xattr_security_get,
76 .set = ext3_xattr_security_set,
77};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000000..86d91f1186dc
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,62 @@
1/*
2 * linux/fs/ext3/xattr_trusted.c
3 * Handler for trusted extended attributes.
4 *
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include <linux/smp_lock.h>
13#include <linux/ext3_jbd.h>
14#include <linux/ext3_fs.h>
15#include "xattr.h"
16
17#define XATTR_TRUSTED_PREFIX "trusted."
18
19static size_t
20ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
21 const char *name, size_t name_len)
22{
23 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
24 const size_t total_len = prefix_len + name_len + 1;
25
26 if (!capable(CAP_SYS_ADMIN))
27 return 0;
28
29 if (list && total_len <= list_size) {
30 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
31 memcpy(list+prefix_len, name, name_len);
32 list[prefix_len + name_len] = '\0';
33 }
34 return total_len;
35}
36
37static int
38ext3_xattr_trusted_get(struct inode *inode, const char *name,
39 void *buffer, size_t size)
40{
41 if (strcmp(name, "") == 0)
42 return -EINVAL;
43 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
44 buffer, size);
45}
46
47static int
48ext3_xattr_trusted_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name,
54 value, size, flags);
55}
56
57struct xattr_handler ext3_xattr_trusted_handler = {
58 .prefix = XATTR_TRUSTED_PREFIX,
59 .list = ext3_xattr_trusted_list,
60 .get = ext3_xattr_trusted_get,
61 .set = ext3_xattr_trusted_set,
62};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 000000000000..a85a0a17c4fd
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,64 @@
1/*
2 * linux/fs/ext3/xattr_user.c
3 * Handler for extended user attributes.
4 *
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/fs.h>
11#include <linux/smp_lock.h>
12#include <linux/ext3_jbd.h>
13#include <linux/ext3_fs.h>
14#include "xattr.h"
15
16#define XATTR_USER_PREFIX "user."
17
18static size_t
19ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len)
21{
22 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
23 const size_t total_len = prefix_len + name_len + 1;
24
25 if (!test_opt(inode->i_sb, XATTR_USER))
26 return 0;
27
28 if (list && total_len <= list_size) {
29 memcpy(list, XATTR_USER_PREFIX, prefix_len);
30 memcpy(list+prefix_len, name, name_len);
31 list[prefix_len + name_len] = '\0';
32 }
33 return total_len;
34}
35
36static int
37ext3_xattr_user_get(struct inode *inode, const char *name,
38 void *buffer, size_t size)
39{
40 if (strcmp(name, "") == 0)
41 return -EINVAL;
42 if (!test_opt(inode->i_sb, XATTR_USER))
43 return -EOPNOTSUPP;
44 return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
45}
46
47static int
48ext3_xattr_user_set(struct inode *inode, const char *name,
49 const void *value, size_t size, int flags)
50{
51 if (strcmp(name, "") == 0)
52 return -EINVAL;
53 if (!test_opt(inode->i_sb, XATTR_USER))
54 return -EOPNOTSUPP;
55 return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name,
56 value, size, flags);
57}
58
59struct xattr_handler ext3_xattr_user_handler = {
60 .prefix = XATTR_USER_PREFIX,
61 .list = ext3_xattr_user_list,
62 .get = ext3_xattr_user_get,
63 .set = ext3_xattr_user_set,
64};